From 494215fbf298787e4ead16e4c68634d241336b02 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 21 Feb 2021 03:08:20 -0500 Subject: [PATCH 01/77] lib: test_bitmap: clearly separate ERANGE from EINVAL tests. This block of tests was meant to find/flag incorrect use of the ":" and "/" separators (syntax errors) and invalid (zero) group len. However they were specified with an 8 bit width and 32 bit operations, so they really contained two errors (EINVAL and ERANGE). Promote them to 32 bit so it is clear what they are meant to target. Then we can add tests specific for ERANGE (no syntax errors, just doing 32bit op on 8 bit width, plus a typical 9-on-8 fencepost error). Cc: Yury Norov Cc: Rasmus Villemoes Cc: Andy Shevchenko Acked-by: Yury Norov Reviewed-by: Andy Shevchenko Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- lib/test_bitmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 0ea0e8258f14a5..853a3a6ff59ca2 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -337,12 +337,12 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = { {-EINVAL, "-1", NULL, 8, 0}, {-EINVAL, "-0", NULL, 8, 0}, {-EINVAL, "10-1", NULL, 8, 0}, - {-EINVAL, "0-31:", NULL, 8, 0}, - {-EINVAL, "0-31:0", NULL, 8, 0}, - {-EINVAL, "0-31:0/", NULL, 8, 0}, - {-EINVAL, "0-31:0/0", NULL, 8, 0}, - {-EINVAL, "0-31:1/0", NULL, 8, 0}, - {-EINVAL, "0-31:10/1", NULL, 8, 0}, + {-EINVAL, "0-31:", NULL, 32, 0}, + {-EINVAL, "0-31:0", NULL, 32, 0}, + {-EINVAL, "0-31:0/", NULL, 32, 0}, + {-EINVAL, "0-31:0/0", NULL, 32, 0}, + {-EINVAL, "0-31:1/0", NULL, 32, 0}, + {-EINVAL, "0-31:10/1", NULL, 32, 0}, {-EOVERFLOW, "0-98765432123456789:10/1", NULL, 8, 0}, {-EINVAL, "a-31", NULL, 8, 0}, From 6fef5905fbd691aeb91093056b27d5ee7b106097 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 21 Feb 2021 03:08:21 -0500 Subject: [PATCH 02/77] lib: test_bitmap: add tests to trigger ERANGE case. 
Add tests that specify a valid range, but one that is outside the width of the bitmap for which it is to be applied to. These should trigger an -ERANGE response from the code. Cc: Yury Norov Cc: Rasmus Villemoes Cc: Andy Shevchenko Acked-by: Yury Norov Reviewed-by: Andy Shevchenko Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- lib/test_bitmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 853a3a6ff59ca2..0f2e91d0a84ce8 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -337,6 +337,8 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = { {-EINVAL, "-1", NULL, 8, 0}, {-EINVAL, "-0", NULL, 8, 0}, {-EINVAL, "10-1", NULL, 8, 0}, + {-ERANGE, "8-8", NULL, 8, 0}, + {-ERANGE, "0-31", NULL, 8, 0}, {-EINVAL, "0-31:", NULL, 32, 0}, {-EINVAL, "0-31:0", NULL, 32, 0}, {-EINVAL, "0-31:0/", NULL, 32, 0}, From 97330db3af9a41302d1ccb0f495fcb5b5da2cc44 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 21 Feb 2021 03:08:22 -0500 Subject: [PATCH 03/77] lib: test_bitmap: add more start-end:offset/len tests There are inputs to bitmap_parselist() that would probably never be entered manually by a person, but might result from some kind of automated input generator. Things like ranges of length 1, or group lengths longer than nbits, overlaps, or offsets of zero. Adding these tests serve two purposes: 1) document what might seem odd but nonetheless valid input. 2) don't regress from what we currently accept as valid. Cc: Yury Norov Cc: Rasmus Villemoes Cc: Andy Shevchenko Acked-by: Yury Norov Reviewed-by: Andy Shevchenko Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. 
McKenney --- lib/test_bitmap.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 0f2e91d0a84ce8..3c1c46deb8fed1 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -34,6 +34,8 @@ static const unsigned long exp1[] __initconst = { BITMAP_FROM_U64(0x3333333311111111ULL), BITMAP_FROM_U64(0xffffffff77777777ULL), BITMAP_FROM_U64(0), + BITMAP_FROM_U64(0x00008000), + BITMAP_FROM_U64(0x80000000), }; static const unsigned long exp2[] __initconst = { @@ -334,6 +336,26 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = { {0, " , ,, , , ", &exp1[12 * step], 8, 0}, {0, " , ,, , , \n", &exp1[12 * step], 8, 0}, + {0, "0-0", &exp1[0], 32, 0}, + {0, "1-1", &exp1[1 * step], 32, 0}, + {0, "15-15", &exp1[13 * step], 32, 0}, + {0, "31-31", &exp1[14 * step], 32, 0}, + + {0, "0-0:0/1", &exp1[12 * step], 32, 0}, + {0, "0-0:1/1", &exp1[0], 32, 0}, + {0, "0-0:1/31", &exp1[0], 32, 0}, + {0, "0-0:31/31", &exp1[0], 32, 0}, + {0, "1-1:1/1", &exp1[1 * step], 32, 0}, + {0, "0-15:16/31", &exp1[2 * step], 32, 0}, + {0, "15-15:1/2", &exp1[13 * step], 32, 0}, + {0, "15-15:31/31", &exp1[13 * step], 32, 0}, + {0, "15-31:1/31", &exp1[13 * step], 32, 0}, + {0, "16-31:16/31", &exp1[3 * step], 32, 0}, + {0, "31-31:31/31", &exp1[14 * step], 32, 0}, + + {0, "0-31:1/3,1-31:1/3,2-31:1/3", &exp1[8 * step], 32, 0}, + {0, "1-10:8/12,8-31:24/29,0-31:0/3", &exp1[9 * step], 32, 0}, + {-EINVAL, "-1", NULL, 8, 0}, {-EINVAL, "-0", NULL, 8, 0}, {-EINVAL, "10-1", NULL, 8, 0}, From 9d7a3366b7028ae8dd16a0d7585cbf11b03b42a0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 21 Feb 2021 03:08:23 -0500 Subject: [PATCH 04/77] lib: bitmap: fold nbits into region struct This will reduce parameter passing and enable using nbits as part of future dynamic region parameter parsing. 
Cc: Yury Norov Cc: Rasmus Villemoes Cc: Andy Shevchenko Suggested-by: Yury Norov Acked-by: Yury Norov Reviewed-by: Andy Shevchenko Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- lib/bitmap.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index 75006c4036e9e8..162e2850c62216 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -487,24 +487,24 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf); /* * Region 9-38:4/10 describes the following bitmap structure: - * 0 9 12 18 38 - * .........****......****......****...... - * ^ ^ ^ ^ - * start off group_len end + * 0 9 12 18 38 N + * .........****......****......****.................. + * ^ ^ ^ ^ ^ + * start off group_len end nbits */ struct region { unsigned int start; unsigned int off; unsigned int group_len; unsigned int end; + unsigned int nbits; }; -static int bitmap_set_region(const struct region *r, - unsigned long *bitmap, int nbits) +static int bitmap_set_region(const struct region *r, unsigned long *bitmap) { unsigned int start; - if (r->end >= nbits) + if (r->end >= r->nbits) return -ERANGE; for (start = r->start; start <= r->end; start += r->group_len) @@ -640,7 +640,8 @@ int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) struct region r; long ret; - bitmap_zero(maskp, nmaskbits); + r.nbits = nmaskbits; + bitmap_zero(maskp, r.nbits); while (buf) { buf = bitmap_find_region(buf); @@ -655,7 +656,7 @@ int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) if (ret) return ret; - ret = bitmap_set_region(&r, maskp, nmaskbits); + ret = bitmap_set_region(&r, maskp); if (ret) return ret; } From f3c869caef648c541a7445f2a6ba2196d343f542 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 21 Feb 2021 03:08:24 -0500 Subject: [PATCH 05/77] lib: bitmap: move ERANGE check from set_region to check_region It makes sense to do all the checks in check_region() and not 1/2 in check_region and 1/2 in set_region. 
Since set_region is called immediately after check_region, the net effect on runtime is zero, but it gets rid of an if (...) return... Cc: Yury Norov Cc: Rasmus Villemoes Cc: Andy Shevchenko Acked-by: Yury Norov Reviewed-by: Andy Shevchenko Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- lib/bitmap.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index 162e2850c62216..833f152a2c431c 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -500,17 +500,12 @@ struct region { unsigned int nbits; }; -static int bitmap_set_region(const struct region *r, unsigned long *bitmap) +static void bitmap_set_region(const struct region *r, unsigned long *bitmap) { unsigned int start; - if (r->end >= r->nbits) - return -ERANGE; - for (start = r->start; start <= r->end; start += r->group_len) bitmap_set(bitmap, start, min(r->end - start + 1, r->off)); - - return 0; } static int bitmap_check_region(const struct region *r) @@ -518,6 +513,9 @@ static int bitmap_check_region(const struct region *r) if (r->start > r->end || r->group_len == 0 || r->off > r->group_len) return -EINVAL; + if (r->end >= r->nbits) + return -ERANGE; + return 0; } @@ -656,9 +654,7 @@ int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits) if (ret) return ret; - ret = bitmap_set_region(&r, maskp); - if (ret) - return ret; + bitmap_set_region(&r, maskp); } return 0; From 2c4885d24e64941702a8f81c8e83289823ba35d0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 21 Feb 2021 03:08:25 -0500 Subject: [PATCH 06/77] lib: bitmap: support "N" as an alias for size of bitmap While this is done for all bitmaps, the original use case in mind was for CPU masks and cpulist_parse() as described below. It seems that a common configuration is to use the 1st couple cores for housekeeping tasks. This tends to leave the remaining ones to form a pool of similarly configured cores to take on the real workload of interest to the user. 
So on machine A - with 32 cores, it could be 0-3 for "system" and then 4-31 being used in boot args like nohz_full=, or rcu_nocbs= as part of setting up the worker pool of CPUs. But then newer machine B is added, and it has 48 cores, and so while the 0-3 part remains unchanged, the pool setup cpu list becomes 4-47. Multiple deployment becomes easier when we can just simply replace 31 and 47 with "N" and let the system substitute in the actual number at boot; a number that it knows better than we do. Cc: Yury Norov Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Rasmus Villemoes Cc: Andy Shevchenko Suggested-by: Yury Norov # move it from CPU code Acked-by: Yury Norov Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- .../admin-guide/kernel-parameters.rst | 7 ++++++ lib/bitmap.c | 22 ++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 1132796a8d96e7..d6e3f67953a73f 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -68,6 +68,13 @@ For example one can add to the command line following parameter: where the final item represents CPUs 100,101,125,126,150,151,... +The value "N" can be used to represent the numerically last CPU on the system, +i.e "foo_cpus=16-N" would be equivalent to "16-31" on a 32 core system. + +Keep in mind that "N" is dynamic, so if system changes cause the bitmap width +to change, such as less cores in the CPU list, then N and any ranges using N +will also change. Use the same on a small 4 core system, and "16-N" becomes +"16-3" and now the same boot input will be flagged as invalid (start > end). This document may not be entirely up to date and comprehensive. 
The command diff --git a/lib/bitmap.c b/lib/bitmap.c index 833f152a2c431c..9f4626a4c95f12 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -519,11 +519,17 @@ static int bitmap_check_region(const struct region *r) return 0; } -static const char *bitmap_getnum(const char *str, unsigned int *num) +static const char *bitmap_getnum(const char *str, unsigned int *num, + unsigned int lastbit) { unsigned long long n; unsigned int len; + if (str[0] == 'N') { + *num = lastbit; + return str + 1; + } + len = _parse_integer(str, 10, &n); if (!len) return ERR_PTR(-EINVAL); @@ -571,7 +577,9 @@ static const char *bitmap_find_region_reverse(const char *start, const char *end static const char *bitmap_parse_region(const char *str, struct region *r) { - str = bitmap_getnum(str, &r->start); + unsigned int lastbit = r->nbits - 1; + + str = bitmap_getnum(str, &r->start, lastbit); if (IS_ERR(str)) return str; @@ -581,7 +589,7 @@ static const char *bitmap_parse_region(const char *str, struct region *r) if (*str != '-') return ERR_PTR(-EINVAL); - str = bitmap_getnum(str + 1, &r->end); + str = bitmap_getnum(str + 1, &r->end, lastbit); if (IS_ERR(str)) return str; @@ -591,14 +599,14 @@ static const char *bitmap_parse_region(const char *str, struct region *r) if (*str != ':') return ERR_PTR(-EINVAL); - str = bitmap_getnum(str + 1, &r->off); + str = bitmap_getnum(str + 1, &r->off, lastbit); if (IS_ERR(str)) return str; if (*str != '/') return ERR_PTR(-EINVAL); - return bitmap_getnum(str + 1, &r->group_len); + return bitmap_getnum(str + 1, &r->group_len, lastbit); no_end: r->end = r->start; @@ -625,6 +633,10 @@ static const char *bitmap_parse_region(const char *str, struct region *r) * From each group will be used only defined amount of bits. * Syntax: range:used_size/group_size * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 + * The value 'N' can be used as a dynamically substituted token for the + * maximum allowed value; i.e (nmaskbits - 1). 
Keep in mind that it is + * dynamic, so if system changes cause the bitmap width to change, such + * as more cores in a CPU list, then any ranges using N will also change. * * Returns: 0 on success, -errno on invalid input strings. Error values: * From 99c58d1adbca25fb3ee2469bf0904e1e3e021f7e Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 21 Feb 2021 03:08:26 -0500 Subject: [PATCH 07/77] lib: test_bitmap: add tests for "N" alias These are copies of existing tests, with just 31 --> N. This ensures the recently added "N" alias transparently works in any normally numeric fields of a region specification. Cc: Yury Norov Cc: Rasmus Villemoes Cc: Andy Shevchenko Acked-by: Yury Norov Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- lib/test_bitmap.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index 3c1c46deb8fed1..9cd57558318097 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -353,6 +353,16 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = { {0, "16-31:16/31", &exp1[3 * step], 32, 0}, {0, "31-31:31/31", &exp1[14 * step], 32, 0}, + {0, "N-N", &exp1[14 * step], 32, 0}, + {0, "0-0:1/N", &exp1[0], 32, 0}, + {0, "0-0:N/N", &exp1[0], 32, 0}, + {0, "0-15:16/N", &exp1[2 * step], 32, 0}, + {0, "15-15:N/N", &exp1[13 * step], 32, 0}, + {0, "15-N:1/N", &exp1[13 * step], 32, 0}, + {0, "16-N:16/N", &exp1[3 * step], 32, 0}, + {0, "N-N:N/N", &exp1[14 * step], 32, 0}, + + {0, "0-N:1/3,1-N:1/3,2-N:1/3", &exp1[8 * step], 32, 0}, {0, "0-31:1/3,1-31:1/3,2-31:1/3", &exp1[8 * step], 32, 0}, {0, "1-10:8/12,8-31:24/29,0-31:0/3", &exp1[9 * step], 32, 0}, From 3e70df91f961b9df7ab3c0ae1934bdf15454c536 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 21 Feb 2021 03:08:27 -0500 Subject: [PATCH 08/77] rcu: deprecate "all" option to rcu_nocbs= With the core bitmap support now accepting "N" as a placeholder for the end of the bitmap, "all" can be represented as "0-N" and has the 
advantage of not being specific to RCU (or any other subsystem). So deprecate the use of "all" by removing documentation references to it. The support itself needs to remain for now, since we don't know how many people out there are using it currently, but since it is in an __init area anyway, it isn't worth losing sleep over. Cc: Yury Norov Cc: Peter Zijlstra Cc: "Paul E. McKenney" Cc: Josh Triplett Acked-by: Yury Norov Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 +--- kernel/rcu/tree_plugin.h | 6 ++---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 04545725f187ff..83e2ef192de934 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4068,9 +4068,7 @@ see CONFIG_RAS_CEC help text. rcu_nocbs= [KNL] - The argument is a cpu list, as described above, - except that the string "all" can be used to - specify every CPU on the system. + The argument is a cpu list, as described above. In kernels built with CONFIG_RCU_NOCB_CPU=y, set the specified list of CPUs to be no-callback CPUs. diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2d603771c7dce8..0b955627d60990 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1464,14 +1464,12 @@ static void rcu_cleanup_after_idle(void) /* * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. - * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a - * comma-separated list of CPUs and/or CPU ranges. If an invalid list is - * given, a warning is emitted and all CPUs are offloaded. + * If the list is invalid, a warning is emitted and all CPUs are offloaded. 
*/ static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (!strcasecmp(str, "all")) + if (!strcasecmp(str, "all")) /* legacy: use "0-N" instead */ cpumask_setall(rcu_nocb_mask); else if (cpulist_parse(str, rcu_nocb_mask)) { From c71c39b344f7eec9d4492913f22126b03bb7b746 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jan 2021 15:56:53 -0800 Subject: [PATCH 09/77] rcutorture: Use "all" and "N" in "nohz_full" and "rcu_nocbs" This commit uses the shiny new "all" and "N" cpumask options to decouple the "nohz_full" and "rcu_nocbs" kernel boot parameters in the TREE04.boot and TREE08.boot files from the CONFIG_NR_CPUS options in the TREE04 and TREE08 files. Reported-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot | 2 +- tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot index 5adc6756792a0a..a8d94caf7d2fdf 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot @@ -1 +1 @@ -rcutree.rcu_fanout_leaf=4 nohz_full=1-7 +rcutree.rcu_fanout_leaf=4 nohz_full=1-N diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot index 22478fd3a86588..94d38445d39372 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot @@ -1,3 +1,3 @@ rcupdate.rcu_self_test=1 rcutree.rcu_fanout_exact=1 -rcu_nocbs=0-7 +rcu_nocbs=all From d3ad5bbc4da70c25ad6b386e038e711d0755767b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 6 Jan 2021 23:07:15 +0100 Subject: [PATCH 10/77] rcu: Remove superfluous rdp fetch Cc: Rafael J. 
Wysocki Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da6f5213fb74cb..cdf091f351817c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -648,7 +648,6 @@ static noinstr void rcu_eqs_enter(bool user) instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rdp = this_cpu_ptr(&rcu_data); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); From 47fcbc8dd62f15dc75916225ebacdc3bca9c12b2 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 11 Jan 2021 17:15:58 +0530 Subject: [PATCH 11/77] rcu: Fix CPU-offline trace in rcutree_dying_cpu The condition in the trace_rcu_grace_period() in rcutree_dying_cpu() is backwards, so that it uses the string "cpuofl" when the offline CPU is blocking the current grace period and "cpuofl-bgp" otherwise. Given that the "-bgp" stands for "blocking grace period", this is at best misleading. This commit therefore switches these strings in order to correctly trace whether the outgoing cpu blocks the current grace period. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cdf091f351817c..e62c2defae987b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2413,7 +2413,7 @@ int rcutree_dying_cpu(unsigned int cpu) blkd = !!(rnp->qsmask & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); + blkd ? 
TPS("cpuofl-bgp") : TPS("cpuofl")); return 0; } From 6494ccb93271bee596a12db32ff44867d5be2321 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Mon, 11 Jan 2021 09:08:59 +0800 Subject: [PATCH 12/77] rcu: Remove spurious instrumentation_end() in rcu_nmi_enter() In rcu_nmi_enter(), there is an erroneous instrumentation_end() in the second branch of the "if" statement. Oddly enough, "objtool check -f vmlinux.o" fails to complain because it is unable to correctly cover all cases. Instead, objtool visits the third branch first, which marks following trace_rcu_dyntick() as visited. This commit therefore removes the spurious instrumentation_end(). Fixes: 04b25a495bd6 ("rcu: Mark rcu_nmi_enter() call to rcu_cleanup_after_idle() noinstr") Reported-by: Neeraj Upadhyay Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e62c2defae987b..4d90f202ef4a6a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1076,7 +1076,6 @@ noinstr void rcu_nmi_enter(void) } else if (!in_nmi()) { instrumentation_begin(); rcu_irq_enter_check_tick(); - instrumentation_end(); } else { instrumentation_begin(); } From 5bb1369d4bea078dd1298dfc2c6ce781d9e34dde Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Sat, 16 Jan 2021 00:11:45 +0900 Subject: [PATCH 13/77] rculist: Replace reference to atomic_ops.rst The hlist_nulls_for_each_entry_rcu() docbook header references the atomic_ops.rst file, which was removed in commit f0400a77ebdc ("atomic: Delete obsolete documentation"). This commit therefore substitutes a section in memory-barriers.txt discussing the use of barrier() in loops. Cc: Peter Zijlstra Signed-off-by: Akira Yokosawa Signed-off-by: Paul E.
McKenney --- include/linux/rculist_nulls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index ff3e94779e73c4..d8afdb8784c1c9 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -161,7 +161,7 @@ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) * * The barrier() is needed to make sure compiler doesn't cache first element [1], * as this loop can be restarted [2] - * [1] Documentation/core-api/atomic_ops.rst around line 114 + * [1] Documentation/memory-barriers.txt around line 1533 * [2] Documentation/RCU/rculist_nulls.rst around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ From e75956bd00cf4246067c6aee7751faf313233435 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 14 Jan 2021 08:22:02 +0100 Subject: [PATCH 14/77] rcu: Fix kfree_rcu() docbook errors After commit 5130b8fd0690 ("rcu: Introduce kfree_rcu() single-argument macro"), kernel-doc now emits two warnings: ./include/linux/rcupdate.h:884: warning: Excess function parameter 'ptr' description in 'kfree_rcu' ./include/linux/rcupdate.h:884: warning: Excess function parameter 'rhf' description in 'kfree_rcu' This commit added some macro magic in order to call two different versions of kfree_rcu(), the first having just one argument and the second having two arguments. That makes it difficult to document the kfree_rcu() arguments in the docbook header. In order to make clearer that this macro accepts optional arguments, this commit uses macro concatenation so that this macro changes from: #define kfree_rcu kvfree_rcu to: #define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf) That not only helps kernel-doc understand the macro arguments, but also provides a better C definition that makes clearer that the first argument is mandatory and the second one is optional.
Fixes: 5130b8fd0690 ("rcu: Introduce kfree_rcu() single-argument macro") Tested-by: Uladzislau Rezki (Sony) Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index bd04f722714f65..5cc6deaa5df2bb 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -881,7 +881,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ -#define kfree_rcu kvfree_rcu +#define kfree_rcu(ptr, rhf...) kvfree_rcu(ptr, ## rhf) /** * kvfree_rcu() - kvfree an object after a grace period. From 148e3731d124079a036b3acf780f3d35c1b9c0aa Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 20 Jan 2021 17:21:46 +0100 Subject: [PATCH 15/77] kvfree_rcu: Directly allocate page for single-argument case Single-argument kvfree_rcu() must be invoked from sleepable contexts, so we can directly allocate pages. Furthermore, the fallback in case of page-allocation failure is the high-latency synchronize_rcu(), so it makes sense to do these page allocations from the fastpath, and even to permit limited sleeping within the allocator. This commit therefore allocates if needed on the fastpath using GFP_KERNEL|__GFP_RETRY_MAYFAIL. This also has the beneficial effect of leaving kvfree_rcu()'s per-CPU caches to the double-argument variant of kvfree_rcu(), given that the double-argument variant cannot directly invoke the allocator. [ paulmck: Add add_ptr_to_bulk_krc_lock header comment per Michal Hocko. ] Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da6f5213fb74cb..1f8c980f41908a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3493,37 +3493,50 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp) } } +// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() +// state specified by flags. If can_alloc is true, the caller must +// be schedulable and not be holding any locks or mutexes that might be +// acquired by the memory allocator or anything that it might invoke. +// Returns true if ptr was successfully recorded, else the caller must +// use a fallback. static inline bool -kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) +add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, + unsigned long *flags, void *ptr, bool can_alloc) { struct kvfree_rcu_bulk_data *bnode; int idx; - if (unlikely(!krcp->initialized)) + *krcp = krc_this_cpu_lock(flags); + if (unlikely(!(*krcp)->initialized)) return false; - lockdep_assert_held(&krcp->lock); idx = !!is_vmalloc_addr(ptr); /* Check if a new block is required. */ - if (!krcp->bkvhead[idx] || - krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { - bnode = get_cached_bnode(krcp); - /* Switch to emergency path. */ + if (!(*krcp)->bkvhead[idx] || + (*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { + bnode = get_cached_bnode(*krcp); + if (!bnode && can_alloc) { + krc_this_cpu_unlock(*krcp, *flags); + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); + *krcp = krc_this_cpu_lock(flags); + } + if (!bnode) return false; /* Initialize the new block. */ bnode->nr_records = 0; - bnode->next = krcp->bkvhead[idx]; + bnode->next = (*krcp)->bkvhead[idx]; /* Attach it to the head. */ - krcp->bkvhead[idx] = bnode; + (*krcp)->bkvhead[idx] = bnode; } /* Finally insert. 
*/ - krcp->bkvhead[idx]->records - [krcp->bkvhead[idx]->nr_records++] = ptr; + (*krcp)->bkvhead[idx]->records + [(*krcp)->bkvhead[idx]->nr_records++] = ptr; return true; } @@ -3561,8 +3574,6 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) ptr = (unsigned long *) func; } - krcp = krc_this_cpu_lock(&flags); - // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. @@ -3570,12 +3581,11 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) __func__, head); // Mark as success and leave. - success = true; - goto unlock_return; + return; } kasan_record_aux_stack(ptr); - success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); + success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); From b01b405092b7940bd366053a27ed54a87c84e96a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Jan 2021 17:21:47 +0100 Subject: [PATCH 16/77] kvfree_rcu: Use __GFP_NOMEMALLOC for single-argument kvfree_rcu() This commit applies the __GFP_NOMEMALLOC gfp flag to memory allocations carried out by the single-argument variant of kvfree_rcu(), thus avoiding this can-sleep code path from dipping into the emergency reserves. Acked-by: Michal Hocko Suggested-by: Michal Hocko Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1f8c980f41908a..08b50441ebe8c6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3519,7 +3519,7 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, if (!bnode && can_alloc) { krc_this_cpu_unlock(*krcp, *flags); bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN); + __get_free_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOMEMALLOC | __GFP_NOWARN); *krcp = krc_this_cpu_lock(flags); } From 7ffc9ec8eac196cbd85669a4d7920cd80f186a51 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Jan 2021 13:38:08 -0800 Subject: [PATCH 17/77] kvfree_rcu: Make krc_this_cpu_unlock() use raw_spin_unlock_irqrestore() The krc_this_cpu_unlock() function does a raw_spin_unlock() immediately followed by a local_irq_restore(). This commit saves a line of code by merging them into a raw_spin_unlock_irqrestore(). This transformation also reduces scheduling latency because raw_spin_unlock_irqrestore() responds immediately to a reschedule request. In contrast, local_irq_restore() does a scheduling-oblivious enabling of interrupts. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 08b50441ebe8c6..7ee83f3a15eab9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3229,8 +3229,7 @@ krc_this_cpu_lock(unsigned long *flags) static inline void krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { - raw_spin_unlock(&krcp->lock); - local_irq_restore(flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); } static inline struct kvfree_rcu_bulk_data * From 3e7ce7a187fc6aaa9fda1310a2b8da8770342ff7 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 29 Jan 2021 17:16:03 +0100 Subject: [PATCH 18/77] kvfree_rcu: Replace __GFP_RETRY_MAYFAIL by __GFP_NORETRY __GFP_RETRY_MAYFAIL can spend quite a bit of time reclaiming, and this can be wasted effort given that there is a fallback code path in case memory allocation fails. __GFP_NORETRY does perform some light-weight reclaim, but it will fail under OOM conditions, allowing the fallback to be taken as an alternative to hard-OOMing the system. There is a four-way tradeoff that must be balanced: 1) Minimize use of the fallback path; 2) Avoid full-up OOM; 3) Do a light-weight allocation request; 4) Avoid dipping into the emergency reserves. Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7ee83f3a15eab9..0ecc1fb81ac322 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3517,8 +3517,20 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, bnode = get_cached_bnode(*krcp); if (!bnode && can_alloc) { krc_this_cpu_unlock(*krcp, *flags); + + // __GFP_NORETRY - allows a light-weight direct reclaim + // what is OK from minimizing of fallback hitting point of + // view.
Apart of that it forbids any OOM invoking what is + // also beneficial since we are about to release memory soon. + // + // __GFP_NOMEMALLOC - prevents from consuming of all the + // memory reserves. Please note we have a fallback path. + // + // __GFP_NOWARN - it is supposed that an allocation can + // be failed under low memory or high memory pressure + // scenarios. bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOMEMALLOC | __GFP_NOWARN); + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); *krcp = krc_this_cpu_lock(flags); } From ee6ddf58475cce8a3d3697614679cd8cb4a6f583 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 29 Jan 2021 21:05:05 +0100 Subject: [PATCH 19/77] kvfree_rcu: Use same set of GFP flags as does single-argument Running an rcuscale stress-suite can lead to "Out of memory" of a system. This can happen under high memory pressure with a small amount of physical memory. For example, a KVM test configuration with 64 CPUs and 512 megabytes can result in OOM when running rcuscale with below parameters: ../kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig CONFIG_NR_CPUS=64 \ --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 \ rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" --trust-make [ 12.054448] kworker/1:1H invoked oom-killer: gfp_mask=0x2cc0(GFP_KERNEL|__GFP_NOWARN), order=0, oom_score_adj=0 [ 12.055303] CPU: 1 PID: 377 Comm: kworker/1:1H Not tainted 5.11.0-rc3+ #510 [ 12.055416] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-1 04/01/2014 [ 12.056485] Workqueue: events_highpri fill_page_cache_func [ 12.056485] Call Trace: [ 12.056485] dump_stack+0x57/0x6a [ 12.056485] dump_header+0x4c/0x30a [ 12.056485] ? 
del_timer_sync+0x20/0x30 [ 12.056485] out_of_memory.cold.47+0xa/0x7e [ 12.056485] __alloc_pages_slowpath.constprop.123+0x82f/0xc00 [ 12.056485] __alloc_pages_nodemask+0x289/0x2c0 [ 12.056485] __get_free_pages+0x8/0x30 [ 12.056485] fill_page_cache_func+0x39/0xb0 [ 12.056485] process_one_work+0x1ed/0x3b0 [ 12.056485] ? process_one_work+0x3b0/0x3b0 [ 12.060485] worker_thread+0x28/0x3c0 [ 12.060485] ? process_one_work+0x3b0/0x3b0 [ 12.060485] kthread+0x138/0x160 [ 12.060485] ? kthread_park+0x80/0x80 [ 12.060485] ret_from_fork+0x22/0x30 [ 12.062156] Mem-Info: [ 12.062350] active_anon:0 inactive_anon:0 isolated_anon:0 [ 12.062350] active_file:0 inactive_file:0 isolated_file:0 [ 12.062350] unevictable:0 dirty:0 writeback:0 [ 12.062350] slab_reclaimable:2797 slab_unreclaimable:80920 [ 12.062350] mapped:1 shmem:2 pagetables:8 bounce:0 [ 12.062350] free:10488 free_pcp:1227 free_cma:0 ... [ 12.101610] Out of memory and no killable processes... [ 12.102042] Kernel panic - not syncing: System is deadlocked on memory [ 12.102583] CPU: 1 PID: 377 Comm: kworker/1:1H Not tainted 5.11.0-rc3+ #510 [ 12.102600] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-1 04/01/2014 Because kvfree_rcu() has a fallback path, memory allocation failure is not the end of the world. Furthermore, the added overhead of aggressive GFP settings must be balanced against the overhead of the fallback path, which is a cache miss for double-argument kvfree_rcu() and a call to synchronize_rcu() for single-argument kvfree_rcu(). The current choice of GFP_KERNEL|__GFP_NOWARN can result in longer latencies than a call to synchronize_rcu(), so less-tenacious GFP flags would be helpful. Here is the tradeoff that must be balanced: a) Minimize use of the fallback path, b) Avoid pushing the system into OOM, c) Bound allocation latency to that of synchronize_rcu(), and d) Leave the emergency reserves to use cases lacking fallbacks. 
This commit therefore changes GFP flags from GFP_KERNEL|__GFP_NOWARN to GFP_KERNEL|__GFP_NORETRY|__GFP_NOMEMALLOC|__GFP_NOWARN. This combination leaves the emergency reserves alone and can initiate reclaim, but will not invoke the OOM killer. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0ecc1fb81ac322..4120d4bb3d61ed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3463,7 +3463,7 @@ static void fill_page_cache_func(struct work_struct *work) for (i = 0; i < rcu_min_cached_objs; i++) { bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_KERNEL | __GFP_NOWARN); + __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); if (bnode) { raw_spin_lock_irqsave(&krcp->lock, flags); From 686fe1bf6bcce3ce9fc03c9d9035c643c320ca46 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 17 Feb 2021 19:51:10 +0100 Subject: [PATCH 20/77] rcuscale: Add kfree_rcu() single-argument scale test The single-argument variant of kfree_rcu() is currently not tested by any member of the rcutorture test suite. This commit therefore adds rcuscale code to test it. This testing is controlled by two new boolean module parameters, kfree_rcu_test_single and kfree_rcu_test_double. If one is set and the other not, only the corresponding variant is tested, otherwise both are tested, with the variant to be tested determined randomly on each invocation. Both of these module parameters are initialized to false, so setting either to true will test only that variant. Suggested-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. 
McKenney --- Documentation/admin-guide/kernel-parameters.txt | 12 ++++++++++++ kernel/rcu/rcuscale.c | 15 ++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 04545725f187ff..84fce4157506fd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4259,6 +4259,18 @@ rcuscale.kfree_rcu_test= [KNL] Set to measure performance of kfree_rcu() flooding. + rcuscale.kfree_rcu_test_double= [KNL] + Test the double-argument variant of kfree_rcu(). + If this parameter has the same value as + rcuscale.kfree_rcu_test_single, both the single- + and double-argument variants are tested. + + rcuscale.kfree_rcu_test_single= [KNL] + Test the single-argument variant of kfree_rcu(). + If this parameter has the same value as + rcuscale.kfree_rcu_test_double, both the single- + and double-argument variants are tested. + rcuscale.kfree_nthreads= [KNL] The number of threads running loops of kfree_rcu(). 
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 06491d5530dbba..dca51fe9c73f26 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -625,6 +625,8 @@ rcu_scale_shutdown(void *arg) torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); +torture_param(bool, kfree_rcu_test_double, false, "Do we run a kfree_rcu() double-argument scale test?"); +torture_param(bool, kfree_rcu_test_single, false, "Do we run a kfree_rcu() single-argument scale test?"); static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; @@ -644,10 +646,13 @@ kfree_scale_thread(void *arg) struct kfree_obj *alloc_ptr; u64 start_time, end_time; long long mem_begin, mem_during = 0; + bool kfree_rcu_test_both; + DEFINE_TORTURE_RANDOM(tr); VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); + kfree_rcu_test_both = (kfree_rcu_test_single == kfree_rcu_test_double); start_time = ktime_get_mono_fast_ns(); @@ -670,7 +675,15 @@ kfree_scale_thread(void *arg) if (!alloc_ptr) return -ENOMEM; - kfree_rcu(alloc_ptr, rh); + // By default kfree_rcu_test_single and kfree_rcu_test_double are + // initialized to false. If both have the same value (false or true) + // both are randomly tested, otherwise only the one with value true + // is tested. + if ((kfree_rcu_test_single && !kfree_rcu_test_double) || + (kfree_rcu_test_both && torture_random(&tr) & 0x800)) + kfree_rcu(alloc_ptr); + else + kfree_rcu(alloc_ptr, rh); } cond_resched(); From 5bb1bb353cfe343fc3c84faf06f72ba309fde541 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 7 Jan 2021 13:46:11 -0800 Subject: [PATCH 21/77] mm: Don't build mem_dump_obj() on CONFIG_PRINTK=n kernels The mem_dump_obj() functionality adds a few hundred bytes, which is a small price to pay. Except on kernels built with CONFIG_PRINTK=n, in which mem_dump_obj() messages will be suppressed. This commit therefore makes mem_dump_obj() be a static inline empty function on kernels built with CONFIG_PRINTK=n and excludes all of its support functions as well. This avoids kernel bloat on systems that cannot use mem_dump_obj(). Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Suggested-by: Andrew Morton Signed-off-by: Paul E. McKenney --- include/linux/mm.h | 4 ++++ include/linux/slab.h | 2 ++ include/linux/vmalloc.h | 2 +- mm/slab.c | 2 ++ mm/slab.h | 2 ++ mm/slab_common.c | 2 ++ mm/slob.c | 2 ++ mm/slub.c | 2 ++ mm/util.c | 2 ++ mm/vmalloc.c | 2 ++ 10 files changed, 21 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 77e64e3eac80bd..89fca443e6f195 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3135,7 +3135,11 @@ unsigned long wp_shared_mapping_range(struct address_space *mapping, extern int sysctl_nr_trim_pages; +#ifdef CONFIG_PRINTK void mem_dump_obj(void *object); +#else +static inline void mem_dump_obj(void *object) {} +#endif #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 7ae60407676703..0c97d788762cf4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -186,8 +186,10 @@ void kfree(const void *); void kfree_sensitive(const void *); size_t __ksize(const void *); size_t ksize(const void *); +#ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); +#endif #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR void __check_heap_object(const void *ptr, unsigned long n, struct page *page, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 
df92211cf7718f..3de7be6dd17cd9 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -241,7 +241,7 @@ pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) int register_vmap_purge_notifier(struct notifier_block *nb); int unregister_vmap_purge_notifier(struct notifier_block *nb); -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && defined(CONFIG_PRINTK) bool vmalloc_dump_obj(void *object); #else static inline bool vmalloc_dump_obj(void *object) { return false; } diff --git a/mm/slab.c b/mm/slab.c index 51fd424e0d6d03..2e64efeb99a16c 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3651,6 +3651,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { struct kmem_cache *cachep; @@ -3670,6 +3671,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) if (DEBUG && cachep->flags & SLAB_STORE_USER) kpp->kp_ret = *dbg_userword(cachep, objp); } +#endif /** * __do_kmalloc - allocate memory diff --git a/mm/slab.h b/mm/slab.h index 076582f58f6875..120b1d0dfb6d27 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -619,6 +619,7 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c) return false; } +#ifdef CONFIG_PRINTK #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; @@ -630,5 +631,6 @@ struct kmem_obj_info { void *kp_stack[KS_ADDRS_COUNT]; }; void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +#endif #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 88e833986332e0..cec95363e62138 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -526,6 +526,7 @@ bool slab_is_available(void) return slab_state >= UP; } +#ifdef CONFIG_PRINTK /** * kmem_valid_obj - does the pointer reference a valid slab object? * @object: pointer to query. 
@@ -600,6 +601,7 @@ void kmem_dump_obj(void *object) pr_info(" %pS\n", kp.kp_stack[i]); } } +#endif #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ diff --git a/mm/slob.c b/mm/slob.c index 0578429b991b4b..74d3f6e60666e0 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -461,11 +461,13 @@ static void slob_free(void *block, int size) spin_unlock_irqrestore(&slob_lock, flags); } +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { kpp->kp_ptr = object; kpp->kp_page = page; } +#endif /* * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. diff --git a/mm/slub.c b/mm/slub.c index e26c274b4657f2..077a019e4d7a5a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3963,6 +3963,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) return 0; } +#ifdef CONFIG_PRINTK void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { void *base; @@ -4002,6 +4003,7 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) #endif #endif } +#endif /******************************************************************** * Kmalloc subsystem diff --git a/mm/util.c b/mm/util.c index 54870226cea64a..2d497fe0f17d41 100644 --- a/mm/util.c +++ b/mm/util.c @@ -983,6 +983,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) return ret; } +#ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information * @object: object for which to find provenance information. 
@@ -1013,3 +1014,4 @@ void mem_dump_obj(void *object) } pr_cont(" non-slab/vmalloc memory.\n"); } +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4f5f8c907897ae..d5f2a84e488ad8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3450,6 +3450,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) } #endif /* CONFIG_SMP */ +#ifdef CONFIG_PRINTK bool vmalloc_dump_obj(void *object) { struct vm_struct *vm; @@ -3462,6 +3463,7 @@ bool vmalloc_dump_obj(void *object) vm->nr_pages, (unsigned long)vm->addr, vm->caller); return true; } +#endif #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) From 0d3dd2c8eadb7d4404b8788f552fb2b824fe2c7e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 7 Dec 2020 21:23:36 -0800 Subject: [PATCH 22/77] rcutorture: Add crude tests for mem_dump_obj() This commit adds a few crude tests for mem_dump_obj() to rcutorture runs. Just to prevent bitrot, you understand! Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 39 +++++++++++++++++++++++++++++++++++++++ mm/slab_common.c | 2 ++ mm/util.c | 1 + 3 files changed, 42 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 99657ffa66887a..8e93f2e5da79d9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1861,6 +1861,45 @@ rcu_torture_stats(void *arg) torture_shutdown_absorb("rcu_torture_stats"); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_stats"); + + { + struct rcu_head *rhp; + struct kmem_cache *kcp; + static int z; + + kcp = kmem_cache_create("rcuscale", 136, 8, SLAB_STORE_USER, NULL); + rhp = kmem_cache_alloc(kcp, GFP_KERNEL); + pr_alert("mem_dump_obj() slab test: rcu_torture_stats = %px, &rhp = %px, rhp = %px, &z = %px\n", stats_task, &rhp, rhp, &z); + pr_alert("mem_dump_obj(ZERO_SIZE_PTR):"); + mem_dump_obj(ZERO_SIZE_PTR); + pr_alert("mem_dump_obj(NULL):"); + mem_dump_obj(NULL); + pr_alert("mem_dump_obj(%px):", &rhp); + mem_dump_obj(&rhp); + 
pr_alert("mem_dump_obj(%px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(%px):", &rhp->func); + mem_dump_obj(&rhp->func); + pr_alert("mem_dump_obj(%px):", &z); + mem_dump_obj(&z); + kmem_cache_free(kcp, rhp); + kmem_cache_destroy(kcp); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + pr_alert("mem_dump_obj() kmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(kmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(kmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + kfree(rhp); + rhp = vmalloc(4096); + pr_alert("mem_dump_obj() vmalloc test: rcu_torture_stats = %px, &rhp = %px, rhp = %px\n", stats_task, &rhp, rhp); + pr_alert("mem_dump_obj(vmalloc %px):", rhp); + mem_dump_obj(rhp); + pr_alert("mem_dump_obj(vmalloc %px):", &rhp->func); + mem_dump_obj(&rhp->func); + vfree(rhp); + } + return 0; } diff --git a/mm/slab_common.c b/mm/slab_common.c index cec95363e62138..4c6107e39f9a99 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -545,6 +545,7 @@ bool kmem_valid_obj(void *object) page = virt_to_head_page(object); return PageSlab(page); } +EXPORT_SYMBOL_GPL(kmem_valid_obj); /** * kmem_dump_obj - Print available slab provenance information @@ -601,6 +602,7 @@ void kmem_dump_obj(void *object) pr_info(" %pS\n", kp.kp_stack[i]); } } +EXPORT_SYMBOL_GPL(kmem_dump_obj); #endif #ifndef CONFIG_SLOB diff --git a/mm/util.c b/mm/util.c index 2d497fe0f17d41..c37e24d5fa43e2 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1014,4 +1014,5 @@ void mem_dump_obj(void *object) } pr_cont(" non-slab/vmalloc memory.\n"); } +EXPORT_SYMBOL_GPL(mem_dump_obj); #endif From 3820b513a2e33d6dee1caa3b4815f92079cb9890 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 12 Nov 2020 01:51:21 +0100 Subject: [PATCH 23/77] rcu/nocb: Detect unsafe checks for offloaded rdp Provide CONFIG_PROVE_RCU sanity checks to ensure we are always reading the offloaded state of an rdp in a safe and stable way and prevent from 
its value to be changed under us. We must either hold the barrier mutex, the cpu-hotplug lock (read or write) or the nocb lock. Local non-preemptible reads are also safe. NOCB kthreads and timers have their own means of synchronization against the offloaded state updaters. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 +++++----- kernel/rcu/tree_plugin.h | 90 +++++++++++++++++++++++++++++++++------- 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da6f5213fb74cb..03503e295b5121 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -156,6 +156,7 @@ static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); +static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; @@ -1672,7 +1673,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; bool need_qs; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); raw_lockdep_assert_held_rcu_node(rnp); @@ -2128,7 +2129,7 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. 
*/ - offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + offloaded = rcu_rdp_is_offloaded(rdp); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); @@ -2327,7 +2328,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2497,7 +2498,7 @@ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @@ -3066,7 +3067,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); /* Go handle any RCU core processing required. */ - if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { + if (unlikely(rcu_rdp_is_offloaded(rdp))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ } else { __call_rcu_core(rdp, head, flags); @@ -3843,13 +3844,13 @@ static int rcu_pending(int user) return 1; /* Does this CPU have callbacks ready to invoke? */ - if (!rcu_segcblist_is_offloaded(&rdp->cblist) && + if (!rcu_rdp_is_offloaded(rdp) && rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; /* Has RCU gone idle with this CPU needing another grace period? 
*/ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && - !rcu_segcblist_is_offloaded(&rdp->cblist) && + !rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; @@ -3968,7 +3969,7 @@ void rcu_barrier(void) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); if (cpu_is_offline(cpu) && - !rcu_segcblist_is_offloaded(&rdp->cblist)) + !rcu_rdp_is_offloaded(rdp)) continue; if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { rcu_barrier_trace(TPS("OnlineQ"), cpu, @@ -4291,7 +4292,7 @@ void rcutree_migrate_callbacks(int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); bool needwake; - if (rcu_segcblist_is_offloaded(&rdp->cblist) || + if (rcu_rdp_is_offloaded(rdp) || rcu_segcblist_empty(&rdp->cblist)) return; /* No callbacks to migrate. */ @@ -4309,7 +4310,7 @@ void rcutree_migrate_callbacks(int cpu) rcu_segcblist_disable(&rdp->cblist); WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist)); - if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { + if (rcu_rdp_is_offloaded(my_rdp)) { raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ __call_rcu_nocb_wake(my_rdp, true, flags); } else { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2d603771c7dce8..cd513ea7b0f945 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -16,8 +16,70 @@ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. 
*/ +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return lockdep_is_held(&rdp->nocb_lock); +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + /* Race on early boot between thread creation and assignment */ + if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread) + return true; + + if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread) + if (in_task()) + return true; + return false; +} + +static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) +{ + return (timer_curr_running(&rdp->nocb_timer) && !in_irq()); +} +#else +static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp) +{ + return 0; +} + +static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) +{ + return false; +} + +static inline bool rcu_running_nocb_timer(struct rcu_data *rdp) +{ + return false; +} + #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ +static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) +{ + /* + * In order to read the offloaded state of an rdp is a safe + * and stable way and prevent from its value to be changed + * under us, we must either hold the barrier mutex, the cpu + * hotplug lock (read or write) or the nocb lock. Local + * non-preemptible reads are also safe. NOCB kthreads and + * timers have their own means of synchronization against the + * offloaded state updaters. + */ + RCU_LOCKDEP_WARN( + !(lockdep_is_held(&rcu_state.barrier_mutex) || + (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) || + rcu_lockdep_is_held_nocb(rdp) || + (rdp == this_cpu_ptr(&rcu_data) && + !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) || + rcu_current_is_nocb_kthread(rdp) || + rcu_running_nocb_timer(rdp)), + "Unsafe read of RCU_NOCB offloaded state" + ); + + return rcu_segcblist_is_offloaded(&rdp->cblist); +} + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. 
@@ -1257,7 +1319,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && - !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist); + !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); } /* @@ -1352,7 +1414,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ if (rcu_segcblist_empty(&rdp->cblist) || - rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) { + rcu_rdp_is_offloaded(rdp)) { *nextevt = KTIME_MAX; return 0; } @@ -1388,7 +1450,7 @@ static void rcu_prepare_for_idle(void) int tne; lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) return; /* Handle nohz enablement switches conservatively. */ @@ -1429,7 +1491,7 @@ static void rcu_cleanup_after_idle(void) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); @@ -1560,7 +1622,7 @@ static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) static void rcu_nocb_lock(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + if (!rcu_rdp_is_offloaded(rdp)) return; raw_spin_lock(&rdp->nocb_lock); } @@ -1571,7 +1633,7 @@ static void rcu_nocb_lock(struct rcu_data *rdp) */ static void rcu_nocb_unlock(struct rcu_data *rdp) { - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (rcu_rdp_is_offloaded(rdp)) { lockdep_assert_irqs_disabled(); raw_spin_unlock(&rdp->nocb_lock); } @@ -1584,7 +1646,7 @@ static void rcu_nocb_unlock(struct rcu_data *rdp) static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, unsigned long flags) { - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (rcu_rdp_is_offloaded(rdp)) { lockdep_assert_irqs_disabled(); raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } 
else { @@ -1596,7 +1658,7 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) { lockdep_assert_irqs_disabled(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) + if (rcu_rdp_is_offloaded(rdp)) lockdep_assert_held(&rdp->nocb_lock); } @@ -1690,7 +1752,7 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, { struct rcu_cblist rcl; - WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist)); + WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); rcu_lockdep_assert_cblist_protected(rdp); lockdep_assert_held(&rdp->nocb_bypass_lock); if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { @@ -1718,7 +1780,7 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j) { - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + if (!rcu_rdp_is_offloaded(rdp)) return true; rcu_lockdep_assert_cblist_protected(rdp); rcu_nocb_bypass_lock(rdp); @@ -1732,7 +1794,7 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) { rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_segcblist_is_offloaded(&rdp->cblist) || + if (!rcu_rdp_is_offloaded(rdp) || !rcu_nocb_bypass_trylock(rdp)) return; WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); @@ -1764,7 +1826,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j = jiffies; long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (!rcu_rdp_is_offloaded(rdp)) { *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); return false; /* Not offloaded, no bypassing. 
*/ } @@ -2397,7 +2459,7 @@ int rcu_nocb_cpu_deoffload(int cpu) } mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); else @@ -2472,7 +2534,7 @@ int rcu_nocb_cpu_offload(int cpu) mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); - if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (!rcu_rdp_is_offloaded(rdp)) { if (cpu_online(cpu)) ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); else From 5de2e5bb80aeef82f75fff76120874cdc86f935d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jan 2021 18:12:08 +0100 Subject: [PATCH 24/77] rcu/nocb: Comment the reason behind BH disablement on batch processing This commit explains why softirqs need to be disabled while invoking callbacks, even when callback processing has been offloaded. After all, invoking callbacks concurrently is one thing, but concurrently invoking the same callback is quite another. Reported-by: Boqun Feng Reported-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index cd513ea7b0f945..013142d1ef7656 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2235,6 +2235,12 @@ static void nocb_cb_wait(struct rcu_data *rdp) local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); + /* + * Disable BH to provide the expected environment. Also, when + * transitioning to/from NOCB mode, a self-requeuing callback might + * be invoked from softirq. A short grace period could cause both + * instances of this callback would execute concurrently. 
+ */ local_bh_disable(); rcu_do_batch(rdp); local_bh_enable(); From 64305db2856b969a5d48e8f3a5b0d06b5594591c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jan 2021 18:12:09 +0100 Subject: [PATCH 25/77] rcu/nocb: Forbid NOCB toggling on offline CPUs It makes no sense to de-offload an offline CPU because that CPU will never invoke any remaining callbacks. It also makes little sense to offload an offline CPU because any pending RCU callbacks were migrated when that CPU went offline. Yes, it is in theory possible to use a number of tricks to permit offloading and deoffloading offline CPUs in certain cases, but in practice it is far better to have the simple and deterministic rule "Toggling the offload state of an offline CPU is forbidden". For but one example, consider that an offloaded offline CPU might have millions of callbacks queued. Best to just say "no". This commit therefore forbids toggling of the offloaded state of offline CPUs. Reported-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- kernel/rcu/tree_plugin.h | 57 +++++++++++++++------------------------- 2 files changed, 22 insertions(+), 38 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 03503e295b5121..ee77858403f8e3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4086,8 +4086,7 @@ int rcutree_prepare_cpu(unsigned int cpu) raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* * Lock in case the CB/GP kthreads are still around handling - * old callbacks (longer term we should flush all callbacks - * before completing CPU offline) + * old callbacks. */ rcu_nocb_lock(rdp); if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 013142d1ef7656..9fd8588bba1477 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2399,23 +2399,18 @@ static int rdp_offload_toggle(struct rcu_data *rdp, return 0; } -static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +static long rcu_nocb_rdp_deoffload(void *arg) { + struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int ret; + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + pr_info("De-offloading %d\n", rdp->cpu); rcu_nocb_lock_irqsave(rdp, flags); - /* - * If there are still pending work offloaded, the offline - * CPU won't help much handling them. - */ - if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - return -EBUSY; - } ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, @@ -2446,14 +2441,6 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) return ret; } -static long rcu_nocb_rdp_deoffload(void *arg) -{ - struct rcu_data *rdp = arg; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - return __rcu_nocb_rdp_deoffload(rdp); -} - int rcu_nocb_cpu_deoffload(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); @@ -2466,12 +2453,14 @@ int rcu_nocb_cpu_deoffload(int cpu) mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); if (rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) + if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); - else - ret = __rcu_nocb_rdp_deoffload(rdp); - if (!ret) - cpumask_clear_cpu(cpu, rcu_nocb_mask); + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); + ret = -EINVAL; + } } cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); @@ -2480,12 +2469,14 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); -static int __rcu_nocb_rdp_offload(struct 
rcu_data *rdp) +static long rcu_nocb_rdp_offload(void *arg) { + struct rcu_data *rdp = arg; struct rcu_segcblist *cblist = &rdp->cblist; unsigned long flags; int ret; + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); /* * For now we only support re-offload, ie: the rdp must have been * offloaded on boot first. @@ -2525,14 +2516,6 @@ static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) return ret; } -static long rcu_nocb_rdp_offload(void *arg) -{ - struct rcu_data *rdp = arg; - - WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); - return __rcu_nocb_rdp_offload(rdp); -} - int rcu_nocb_cpu_offload(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); @@ -2541,12 +2524,14 @@ int rcu_nocb_cpu_offload(int cpu) mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); if (!rcu_rdp_is_offloaded(rdp)) { - if (cpu_online(cpu)) + if (cpu_online(cpu)) { ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); - else - ret = __rcu_nocb_rdp_offload(rdp); - if (!ret) - cpumask_set_cpu(cpu, rcu_nocb_mask); + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } else { + pr_info("NOCB: Can't CB-offload an offline CPU\n"); + ret = -EINVAL; + } } cpus_read_unlock(); mutex_unlock(&rcu_state.barrier_mutex); From 8a682b3974c36853b52fc8ede14dee966e96e19f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jan 2021 18:12:12 +0100 Subject: [PATCH 26/77] rcu/nocb: Avoid confusing double write of rdp->nocb_cb_sleep The nocb_cb_wait() function first sets the rdp->nocb_cb_sleep flag to true after invoking the callbacks, and then sets it back to false if it finds more callbacks that are ready to invoke. This is confusing and will become unsafe if this flag is ever read locklessly. This commit therefore writes it only once, based on the state after both callback invocation and checking. Reported-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree_plugin.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9fd8588bba1477..6a7f77d90fb0aa 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2230,6 +2230,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) unsigned long flags; bool needwake_state = false; bool needwake_gp = false; + bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); @@ -2253,8 +2254,6 @@ static void nocb_cb_wait(struct rcu_data *rdp) raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } - WRITE_ONCE(rdp->nocb_cb_sleep, true); - if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); @@ -2262,7 +2261,7 @@ static void nocb_cb_wait(struct rcu_data *rdp) needwake_state = true; } if (rcu_segcblist_ready_cbs(cblist)) - WRITE_ONCE(rdp->nocb_cb_sleep, false); + can_sleep = false; } else { /* * De-offloading. Clear our flag and notify the de-offload worker. @@ -2275,6 +2274,8 @@ static void nocb_cb_wait(struct rcu_data *rdp) needwake_state = true; } + WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); + if (rdp->nocb_cb_sleep) trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); From ec711bc12c777b1165585f59f7a6c35a89e04cc3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jan 2021 18:12:10 +0100 Subject: [PATCH 27/77] rcu/nocb: Only (re-)initialize segcblist when needed on CPU up At the start of a CPU-hotplug operation, the incoming CPU's callback list can be in a number of states: 1. Disabled and empty. This is the case when the boot CPU has not invoked call_rcu(), when a non-boot CPU first comes online, and when a non-offloaded CPU comes back online. In this case, it is both necessary and permissible to initialize ->cblist. 
Because either the CPU is currently running with interrupts disabled (boot CPU) or is not yet running at all (other CPUs), it is not necessary to acquire ->nocb_lock. In this case, initialization is required. 2. Disabled and non-empty. This cannot occur, because early boot call_rcu() invocations enable the callback list before enqueuing their callback. 3. Enabled, whether empty or not. In this case, the callback list has already been initialized. This case occurs when the boot CPU has executed an early boot call_rcu() and also when an offloaded CPU comes back online. In both cases, there is no need to initialize the callback list: In the boot-CPU case, the CPU has not (yet) gone offline, and in the offloaded case, the rcuo kthreads are taking care of business. Because it is not necessary to initialize the callback list, it is also not necessary to acquire ->nocb_lock. Therefore, checking if the segcblist is enabled suffices. This commit therefore initializes the callback list at rcutree_prepare_cpu() time only if that list is disabled. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ee77858403f8e3..402ea365e17c32 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4084,14 +4084,13 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + /* - * Lock in case the CB/GP kthreads are still around handling - * old callbacks. + * Only non-NOCB CPUs that didn't have early-boot callbacks need to be + * (re-)initialized. */ - rcu_nocb_lock(rdp); - if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? 
*/ + if (!rcu_segcblist_is_enabled(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rcu_nocb_unlock(rdp); /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed From 55adc3e1c82a25e99e9efef4f2b14b8b4806918a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jan 2021 18:12:13 +0100 Subject: [PATCH 28/77] rcu/nocb: Rename nocb_gp_update_state to nocb_gp_update_state_deoffloading The name nocb_gp_update_state() is unenlightening, so this commit changes it to nocb_gp_update_state_deoffloading(). This function now does what its name says, updates state and returns true if the CPU corresponding to the specified rcu_data structure is in the process of being de-offloaded. Reported-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6a7f77d90fb0aa..93d393831adcbd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2016,7 +2016,8 @@ static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) return rcu_segcblist_test_flags(&rdp->cblist, flags); } -static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state) +static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, + bool *needwake_state) { struct rcu_segcblist *cblist = &rdp->cblist; @@ -2026,7 +2027,7 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) *needwake_state = true; } - return true; + return false; } /* @@ -2037,7 +2038,7 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) 
*needwake_state = true; - return false; + return true; } @@ -2075,7 +2076,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); - if (!nocb_gp_update_state(rdp, &needwake_state)) { + if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_state) swake_up_one(&rdp->nocb_state_wq); From 39bbfc62cc90d33f8f5f940464d08075e0275f8a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Jan 2021 10:39:31 -0800 Subject: [PATCH 29/77] rcu: Expedite deboost in case of deferred quiescent state Historically, a task that has been subjected to RCU priority boosting is deboosted at rcu_read_unlock() time. However, with the advent of deferred quiescent states, if the outermost rcu_read_unlock() was invoked with either bottom halves, interrupts, or preemption disabled, the deboosting will be delayed for some time. During this time, a low-priority process might be incorrectly running at a high real-time priority level. Fortunately, rcu_read_unlock_special() already provides mechanisms for forcing a minimal deferral of quiescent states, at least for kernels built with CONFIG_IRQ_WORK=y. These mechanisms are currently used when expedited grace periods are pending that might be blocked by the current task. This commit therefore causes those mechanisms to also be used in cases where the current task has been or might soon be subjected to RCU priority boosting. Note that this applies to all kernels built with CONFIG_RCU_BOOST=y, regardless of whether or not they are also built with CONFIG_PREEMPT_RT=y. This approach assumes that kernels built for use with aggressive real-time applications are built with CONFIG_IRQ_WORK=y. It is likely to be far simpler to enable CONFIG_IRQ_WORK=y than to implement a fast-deboosting scheme that works correctly in its absence.
While in the area, alphabetize the rcu_preempt_deferred_qs_handler() function's local variables. Cc: Sebastian Andrzej Siewior Cc: Scott Wood Cc: Lai Jiangshan Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2d603771c7dce8..e17cb233bfc909 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -598,9 +598,9 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) static void rcu_read_unlock_special(struct task_struct *t) { unsigned long flags; + bool irqs_were_disabled; bool preempt_bh_were_disabled = !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); - bool irqs_were_disabled; /* NMI handlers cannot block and cannot safely manipulate state. */ if (in_nmi()) @@ -609,30 +609,32 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - bool exp; + bool expboost; // Expedited GP in flight or possible boosting. struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - exp = (t->rcu_blocked_node && - READ_ONCE(t->rcu_blocked_node->exp_tasks)) || - (rdp->grpmask & READ_ONCE(rnp->expmask)); + expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || + (rdp->grpmask & READ_ONCE(rnp->expmask)) || + (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && + t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_irq() || (exp && !irqs_were_disabled))) { + if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the - // wakeup is free or there is an expedited GP. + // wakeup is free or there is either an expedited + // GP in flight or a potential need to deboost. 
raise_softirq_irqoff(RCU_SOFTIRQ); } else { // Enabling BH or preempt does reschedule, so... - // Also if no expediting, slow is OK. - // Plus nohz_full CPUs eventually get tick enabled. + // Also if no expediting and no possible deboosting, + // slow is OK. Plus nohz_full CPUs eventually get + // tick enabled. set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) { + expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); + init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = true; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } From e2b949d54392ad890bb10fb8954d967e2fcd7503 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Jan 2021 16:11:04 -0800 Subject: [PATCH 30/77] rcutorture: Make TREE03 use real-time tree.use_softirq setting TREE03 tests RCU priority boosting, which is a real-time feature. It would also be good if it tested something closer to what is actually used by the real-time folks. This commit therefore adds tree.use_softirq=0 to the TREE03 kernel boot parameters in TREE03.boot. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot index 1c218944b1e9d8..64f864f1f361fd 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot @@ -4,3 +4,4 @@ rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcutree.kthread_prio=2 threadirqs +tree.use_softirq=0 From 5e59fba573e64cffc3a7a3113fff2336d652f45a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Jan 2021 13:30:38 -0800 Subject: [PATCH 31/77] rcutorture: Fix testing of RCU priority boosting Currently, rcutorture refuses to test RCU priority boosting in CONFIG_HOTPLUG_CPU=y kernels, which are the only kind normally built on x86 these days. This commit therefore updates rcutorture's tests of RCU priority boosting to make them safe for CPU hotplug. However, these tests will fail unless TIMER_SOFTIRQ runs at realtime priority, which does not happen in current mainline. This commit therefore also refuses to test RCU priority boosting except in kernels built with CONFIG_PREEMPT_RT=y. While in the area, this commit adds some debug output at boost-fail time that helps diagnose the cause of the failure, for example, failing to run TIMER_SOFTIRQ at realtime priority. Cc: Sebastian Andrzej Siewior Cc: Scott Wood Cc: Thomas Gleixner Signed-off-by: Paul E.
McKenney --- kernel/rcu/rcutorture.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 99657ffa66887a..af64bd8a70b98b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -245,11 +245,11 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#if defined(CONFIG_RCU_BOOST) && defined(CONFIG_PREEMPT_RT) +# define rcu_can_boost() 1 +#else +# define rcu_can_boost() 0 +#endif #ifdef CONFIG_RCU_TRACE static u64 notrace rcu_trace_clock_local(void) @@ -923,9 +923,13 @@ static void rcu_torture_enable_rt_throttle(void) static bool rcu_torture_boost_failed(unsigned long start, unsigned long end) { + static int dbg_done; + if (end - start > test_boost_duration * HZ - HZ / 2) { VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; + if (!xchg(&dbg_done, 1) && cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); return true; /* failed */ } @@ -948,8 +952,8 @@ static int rcu_torture_boost(void *arg) init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ do { - /* Track if the test failed already in this test interval? */ - bool failed = false; + bool failed = false; // Test failed already in this test interval + bool firsttime = true; /* Increment n_rcu_torture_boosts once per boost-test */ while (!kthread_should_stop()) { @@ -975,18 +979,17 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. 
*/ endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ smp_store_release(&rbi.inflight, 1); - call_rcu(&rbi.rcu, rcu_torture_boost_cb); + cur_ops->call(&rbi.rcu, rcu_torture_boost_cb); /* Check if the boost test failed */ - failed = failed || - rcu_torture_boost_failed(call_rcu_time, - jiffies); + if (!firsttime && !failed) + failed = rcu_torture_boost_failed(call_rcu_time, jiffies); call_rcu_time = jiffies; + firsttime = false; } if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); @@ -999,7 +1002,7 @@ static int rcu_torture_boost(void *arg) * this case the boost check would never happen in the above * loop so do another one here. */ - if (!failed && smp_load_acquire(&rbi.inflight)) + if (!firsttime && !failed && smp_load_acquire(&rbi.inflight)) rcu_torture_boost_failed(call_rcu_time, jiffies); /* @@ -1025,6 +1028,9 @@ checkwait: if (stutter_wait("rcu_torture_boost")) sched_set_fifo_low(current); } while (!torture_must_stop()); + while (smp_load_acquire(&rbi.inflight)) + schedule_timeout_uninterruptible(1); // rcu_barrier() deadlocks. + /* Clean up and exit. */ while (!kthread_should_stop() || smp_load_acquire(&rbi.inflight)) { torture_shutdown_absorb("rcu_torture_boost"); @@ -1797,7 +1803,7 @@ rcu_torture_stats_print(void) WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio - WARN_ON_ONCE(n_rcu_torture_boost_failure); // RCU boost failed + WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?) 
WARN_ON_ONCE(i > 1); // Too-short grace period } pr_cont("Reader Pipe: "); @@ -2595,6 +2601,8 @@ static bool rcu_torture_can_boost(void) if (!(test_boost == 1 && cur_ops->can_boost) && test_boost != 2) return false; + if (!cur_ops->call) + return false; prio = rcu_get_gp_kthreads_prio(); if (!prio) From 7308e0240410d3644c9d7cc6263079a58e3effeb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Jan 2021 13:57:16 -0800 Subject: [PATCH 32/77] rcu: Make rcu_read_unlock_special() expedite strict grace periods In kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, every grace period is an expedited grace period. However, rcu_read_unlock_special() does not treat them that way, instead allowing the deferred quiescent state to be reported whenever. This commit therefore adds a check of this Kconfig option that causes rcu_read_unlock_special() to treat all grace periods as expedited for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e17cb233bfc909..a21c41cc86ad73 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -615,6 +615,7 @@ static void rcu_read_unlock_special(struct task_struct *t) expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || (rdp->grpmask & READ_ONCE(rnp->expmask)) || + IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. From 8126c57f00cea3502a017b7c76df1fac58f89e88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Feb 2021 13:25:58 -0800 Subject: [PATCH 33/77] torture: Make jitter.sh handle large systems The current jitter.sh script expects cpumask bits to fit into whatever the awk interpreter uses for an integer, which clearly does not hold for even medium-sized systems these days. 
This means that on a large system, only the first 32 or 64 CPUs (depending) are subjected to jitter.sh CPU-time perturbations. This commit therefore computes a given CPU's cpumask using text manipulation rather than arithmetic shifts. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 188b864bc4bf61..3a856ec2e92af8 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -67,10 +67,10 @@ do srand(n + me + systime()); ncpus = split(cpus, ca); curcpu = ca[int(rand() * ncpus + 1)]; - mask = lshift(1, curcpu); - if (mask + 0 <= 0) - mask = 1; - printf("%#x\n", mask); + z = ""; + for (i = 1; 4 * i <= curcpu; i++) + z = z "0"; + print "0x" 2 ^ (curcpu % 4) z; }' < /dev/null` n=$(($n+1)) if ! taskset -p $cpumask $$ > /dev/null 2>&1 From 85b86994284820ec070182ec269e6e79735f523a Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 25 Jan 2021 08:41:05 +0100 Subject: [PATCH 34/77] rcu-tasks: Rectify kernel-doc for struct rcu_tasks The command 'find ./kernel/rcu/ | xargs ./scripts/kernel-doc -none' reported an issue with the kernel-doc of struct rcu_tasks. This commit rectifies the kernel-doc, such that no issues remain for ./kernel/rcu/. Signed-off-by: Lukas Bulwahn Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index af7c19439f4ec9..17c8ebe131af85 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -20,7 +20,7 @@ typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** - * Definition for a Tasks-RCU-like mechanism. 
+ * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. * @cbs_head: Head of callback list. * @cbs_tail: Tail pointer for callback list. * @cbs_wq: Wait queue allowning new callback to get kthread's attention. @@ -38,7 +38,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). - * @holdout_func: This flavor's holdout-list scan function (optional). + * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. * @name: This flavor's textual name. From a434dd10cd843c7348e7c54c77eb0fac27beceb4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Feb 2021 10:26:00 -0800 Subject: [PATCH 35/77] rcu-tasks: Add block comment laying out RCU Tasks Trace design This commit adds a block comment that gives a high-level overview of how RCU tasks trace grace periods progress. It also adds a note about how exiting tasks are handled, plus it gives an overview of the memory ordering. Reported-by: Peter Zijlstra Reported-by: Mathieu Desnoyers [ paulmck: Fix commit log per Mathieu Desnoyers feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 17c8ebe131af85..350ebf5051f974 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -726,6 +726,42 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // flavors, rcu_preempt and rcu_sched. The fact that RCU Tasks Trace // readers can operate from idle, offline, and exception entry/exit in no // way allows rcu_preempt and rcu_sched readers to also do so. 
+// +// The implementation uses rcu_tasks_wait_gp(), which relies on function +// pointers in the rcu_tasks structure. The rcu_spawn_tasks_trace_kthread() +// function sets these function pointers up so that rcu_tasks_wait_gp() +// invokes these functions in this order: +// +// rcu_tasks_trace_pregp_step(): +// Initialize the count of readers and block CPU-hotplug operations. +// rcu_tasks_trace_pertask(), invoked on every non-idle task: +// Initialize per-task state and attempt to identify an immediate +// quiescent state for that task, or, failing that, attempt to +// set that task's .need_qs flag so that task's next outermost +// rcu_read_unlock_trace() will report the quiescent state (in which +// case the count of readers is incremented). If both attempts fail, +// the task is added to a "holdout" list. +// rcu_tasks_trace_postscan(): +// Initialize state and attempt to identify an immediate quiescent +// state as above (but only for idle tasks), unblock CPU-hotplug +// operations, and wait for an RCU grace period to avoid races with +// tasks that are in the process of exiting. +// check_all_holdout_tasks_trace(), repeatedly until holdout list is empty: +// Scans the holdout list, attempting to identify a quiescent state +// for each task on the list. If there is a quiescent state, the +// corresponding task is removed from the holdout list. +// rcu_tasks_trace_postgp(): +// Wait for the count of readers do drop to zero, reporting any stalls. +// Also execute full memory barriers to maintain ordering with code +// executing after the grace period. +// +// The exit_tasks_rcu_finish_trace() synchronizes with exiting tasks. +// +// Pre-grace-period update-side code is ordered before the grace +// period via the ->cbs_lock and barriers in rcu_tasks_kthread(). 
+// Pre-grace-period read-side code is ordered before the grace period by +// atomic_dec_and_test() of the count of readers (for IPIed readers) and by +// scheduler context-switch ordering (for locked-down non-running readers). // The lockdep state must be outside of #ifdef to be useful. #ifdef CONFIG_DEBUG_LOCK_ALLOC From 4ac9de07b24f93a87ad38c497ad00fe2451203e7 Mon Sep 17 00:00:00 2001 From: Stephen Zhang Date: Sat, 23 Jan 2021 16:34:01 +0800 Subject: [PATCH 36/77] torture: Replace torture_init_begin string with %s This commit replaces a hard-coded "torture_init_begin" string in a pr_alert() format with "%s" and __func__. Signed-off-by: Stephen Zhang Signed-off-by: Paul E. McKenney --- kernel/torture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/torture.c b/kernel/torture.c index 01e336f1e5b20d..0a315c387bedb2 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -816,9 +816,9 @@ bool torture_init_begin(char *ttype, int v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { - pr_alert("torture_init_begin: Refusing %s init: %s running.\n", - ttype, torture_type); - pr_alert("torture_init_begin: One torture test at a time!\n"); + pr_alert("%s: Refusing %s init: %s running.\n", + __func__, ttype, torture_type); + pr_alert("%s: One torture test at a time!\n", __func__); mutex_unlock(&fullstop_mutex); return false; } From 0a27fff30a5e561dc77e9cb1bf9cf462e1735179 Mon Sep 17 00:00:00 2001 From: Stephen Zhang Date: Sat, 23 Jan 2021 17:54:17 +0800 Subject: [PATCH 37/77] rcutorture: Replace rcu_torture_stall string with %s This commit replaces a hard-coded "rcu_torture_stall" string in a pr_alert() format with "%s" and __func__. Signed-off-by: Stephen Zhang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 99657ffa66887a..271726e13c8875 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1971,8 +1971,8 @@ static int rcu_torture_stall(void *args) local_irq_disable(); else if (!stall_cpu_block) preempt_disable(); - pr_alert("rcu_torture_stall start on CPU %d.\n", - raw_smp_processor_id()); + pr_alert("%s start on CPU %d.\n", + __func__, raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) if (stall_cpu_block) @@ -1983,7 +1983,7 @@ static int rcu_torture_stall(void *args) preempt_enable(); cur_ops->readunlock(idx); } - pr_alert("rcu_torture_stall end.\n"); + pr_alert("%s end.\n", __func__); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); From a519d21480d330918bd522499a323432c31b6ec2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Jan 2021 10:50:32 -0800 Subject: [PATCH 38/77] torturescript: Don't rerun failed rcutorture builds If the build fails when running multiple instances of a given rcutorture scenario, for example, using the kvm.sh --configs "8*RUDE01" argument, the build will be rerun an additional seven times. This is in some sense correct, but it can waste significant time. This commit therefore checks for a prior failed build and simply copies over that build's output. Signed-off-by: Paul E. 
McKenney --- .../selftests/rcutorture/bin/kvm-test-1-run.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 536d103ef1667f..9d8a82cac80844 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -73,7 +73,7 @@ config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG" cp $T/KcList $resdir/ConfigFragment base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` -if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux +if test "$base_resdir" != "$resdir" && test -f $base_resdir/bzImage && test -f $base_resdir/vmlinux then # Rerunning previous test, so use that test's kernel. QEMU="`identify_qemu $base_resdir/vmlinux`" @@ -83,6 +83,17 @@ then ln -s $base_resdir/.config $resdir # for kvm-recheck.sh # Arch-independent indicator touch $resdir/builtkernel +elif test "$base_resdir" != "$resdir" +then + # Rerunning previous test for which build failed + ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh + ln -s $base_resdir/.config $resdir # for kvm-recheck.sh + echo Initial build failed, not running KVM, see $resdir. + if test -f $builddir.wait + then + mv $builddir.wait $builddir.ready + fi + exit 1 elif kvm-build.sh $T/KcList $resdir then # Had to build a kernel for this test. From 3d4977b68101b38c3f9d3be3d89e17ef1fdfc1d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2021 16:38:19 -0800 Subject: [PATCH 39/77] torture: Allow 1G of memory for torture.sh kvfree testing Yes, I do recall a time when 512MB of memory was a lot of mass storage, much less main memory, but the rcuscale kvfree_rcu() testing invoked by torture.sh can sometimes exceed it on large systems, resulting in OOM. 
This commit therefore causes torture.sh to pass the "--memory 1G" argument to kvm.sh to reserve a full gigabyte for this purpose. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index ad7525b7ac2978..56e2e1a4256990 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -374,7 +374,7 @@ done if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make fi echo " --- " $scriptname $args From a8dafbf3a5465bea6d9b45a4f011ba9b56d8b267 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Feb 2021 15:44:29 -0800 Subject: [PATCH 40/77] torture: Provide bare-metal modprobe-based advice In some environments, the torture-testing use of virtualization is inconvenient. In such cases, the modprobe and rmmod commands may be used to do torture testing, but significant setup is required to build, boot, and modprobe a kernel so as to match a given torture-test scenario. This commit therefore creates a "bare-metal" file in each results directory containing steps to run the corresponding scenario using the modprobe command on bare metal. For example, the contents of this file after using kvm.sh to build an rcutorture TREE01 kernel, perhaps with the --buildonly argument, is as follows: To run this scenario on bare metal: 1. 
Set your bare-metal build tree to the state shown in this file: /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19/testid.txt 2. Update your bare-metal build tree's .config based on this file: /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19/TREE01/ConfigFragment 3. Make the bare-metal kernel's build system aware of your .config updates: $ yes "" | make oldconfig 4. Build your bare-metal kernel. 5. Boot your bare-metal kernel with the following parameters: maxcpus=8 nr_cpus=43 rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcu_nocbs=0-1,3-7 6. Start the test with the following command: $ modprobe rcutorture nocbs_nthreads=8 nocbs_toggle=1000 fwd_progress=0 onoff_interval=1000 onoff_holdoff=30 n_barrier_cbs=4 stat_interval=15 shutdown_secs=120 test_no_idle_hz=1 verbose=1 7. After some time, end the test with the following command: $ rmmod rcutorture 8. Copy your bare-metal kernel's .config file, overwriting this file: /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19/TREE01/.config 9. Copy the console output from just before the modprobe to just after the rmmod into this file: /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19/TREE01/console.log 10. Check for runtime errors using the following command: $ tools/testing/selftests/rcutorture/bin/kvm-recheck.sh /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2021.02.04-17.10.19 Signed-off-by: Paul E. 
McKenney --- .../rcutorture/bin/kvm-test-1-run.sh | 44 ++++++++++++++++--- tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 9d8a82cac80844..03c04108175441 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -7,15 +7,15 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args +# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args_in # # qemu-args defaults to "-enable-kvm -nographic", along with arguments # specifying the number of CPUs and other options # generated from the underlying CPU architecture. -# boot_args defaults to value returned by the per_version_boot_params +# boot_args_in defaults to value returned by the per_version_boot_params # shell function. # -# Anything you specify for either qemu-args or boot_args is appended to +# Anything you specify for either qemu-args or boot_args_in is appended to # the default values. The "-smp" value is deduced from the contents of # the config fragment. # @@ -134,7 +134,7 @@ do done seconds=$4 qemu_args=$5 -boot_args=$6 +boot_args_in=$6 if test -z "$TORTURE_BUILDONLY" then @@ -144,7 +144,7 @@ fi # Generate -smp qemu argument. 
qemu_args="-enable-kvm -nographic $qemu_args" cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment` -cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` +cpu_count=`configfrag_boot_cpus "$boot_args_in" "$config_template" "$cpu_count"` if test "$cpu_count" -gt "$TORTURE_ALLOTED_CPUS" then echo CPU count limited from $cpu_count to $TORTURE_ALLOTED_CPUS | tee -a $resdir/Warnings @@ -160,13 +160,45 @@ qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`" qemu_append="`identify_qemu_append "$QEMU"`" # Pull in Kconfig-fragment boot parameters -boot_args="`configfrag_boot_params "$boot_args" "$config_template"`" +boot_args="`configfrag_boot_params "$boot_args_in" "$config_template"`" # Generate kernel-version-specific boot parameters boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`" if test -n "$TORTURE_BOOT_GDB_ARG" then boot_args="$boot_args $TORTURE_BOOT_GDB_ARG" fi + +# Give bare-metal advice +modprobe_args="`echo $boot_args | tr -s ' ' '\012' | grep "^$TORTURE_MOD\." | sed -e "s/$TORTURE_MOD\.//g"`" +kboot_args="`echo $boot_args | tr -s ' ' '\012' | grep -v "^$TORTURE_MOD\."`" +testid_txt="`dirname $resdir`/testid.txt" +touch $resdir/bare-metal +echo To run this scenario on bare metal: >> $resdir/bare-metal +echo >> $resdir/bare-metal +echo " 1." Set your bare-metal build tree to the state shown in this file: >> $resdir/bare-metal +echo " " $testid_txt >> $resdir/bare-metal +echo " 2." Update your bare-metal build tree"'"s .config based on this file: >> $resdir/bare-metal +echo " " $resdir/ConfigFragment >> $resdir/bare-metal +echo " 3." Make the bare-metal kernel"'"s build system aware of your .config updates: >> $resdir/bare-metal +echo " " $ 'yes "" | make oldconfig' >> $resdir/bare-metal +echo " 4." Build your bare-metal kernel. >> $resdir/bare-metal +echo " 5." 
Boot your bare-metal kernel with the following parameters: >> $resdir/bare-metal +echo " " $kboot_args >> $resdir/bare-metal +echo " 6." Start the test with the following command: >> $resdir/bare-metal +echo " " $ modprobe $TORTURE_MOD $modprobe_args >> $resdir/bare-metal +echo " 7." After some time, end the test with the following command: >> $resdir/bare-metal +echo " " $ rmmod $TORTURE_MOD >> $resdir/bare-metal +echo " 8." Copy your bare-metal kernel"'"s .config file, overwriting this file: >> $resdir/bare-metal +echo " " $resdir/.config >> $resdir/bare-metal +echo " 9." Copy the console output from just before the modprobe to just after >> $resdir/bare-metal +echo " " the rmmod into this file: >> $resdir/bare-metal +echo " " $resdir/console.log >> $resdir/bare-metal +echo "10." Check for runtime errors using the following command: >> $resdir/bare-metal +echo " " $ tools/testing/selftests/rcutorture/bin/kvm-recheck.sh `dirname $resdir` >> $resdir/bare-metal +echo >> $resdir/bare-metal +echo Some of the above steps may be skipped if you build your bare-metal >> $resdir/bare-metal +echo kernel here: `head -n 1 $testid_txt | sed -e 's/^Build directory: //'` >> $resdir/bare-metal + echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" $TORTURE_QEMU_GDB_ARG > $resdir/qemu-cmd echo "# TORTURE_SHUTDOWN_GRACE=$TORTURE_SHUTDOWN_GRACE" >> $resdir/qemu-cmd echo "# seconds=$seconds" >> $resdir/qemu-cmd diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 8d3c99b35e0603..35a2132a84611a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -29,6 +29,7 @@ PATH=${KVM}/bin:$PATH; export PATH TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`" TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" +TORTURE_BUILDONLY= TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD TORTURE_KCONFIG_ARG="" TORTURE_KCONFIG_GDB_ARG="" @@ -40,6 +41,7 
@@ TORTURE_KMAKE_ARG="" TORTURE_QEMU_MEM=512 TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu +TORTURE_MOD=rcutorture TORTURE_TRUST_MAKE="" resdir="" configs="" @@ -215,6 +217,7 @@ do --torture) checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\)$' '^--' TORTURE_SUITE=$2 + TORTURE_MOD="`echo $TORTURE_SUITE | sed -e 's/^\(lock\|rcu\|scf\)$/\1torture/'`" shift if test "$TORTURE_SUITE" = rcuscale || test "$TORTURE_SUITE" = refscale then @@ -381,6 +384,7 @@ TORTURE_QEMU_GDB_ARG="$TORTURE_QEMU_GDB_ARG"; export TORTURE_QEMU_GDB_ARG TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_KASAN_ARG TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG +TORTURE_MOD="$TORTURE_MOD"; export TORTURE_MOD TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC From f9d2f1e2c426ad6c4d7661cc7d90be4de2c4f7a4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 4 Feb 2021 17:20:45 -0800 Subject: [PATCH 41/77] torture: Improve readability of the testid.txt file The testid.txt file was intended for occasional in extremis use, but now that the new "bare-metal" file references it, it might see more use. This commit therefore labels sections of output and adds spacing to make it easier to see what needs to be done to make a bare-metal build tree match an rcutorture build tree. Of course, you can avoid this whole issue by building your bare-metal kernel in the same directory in which you ran rcutorture, but that might not always be an option. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 35a2132a84611a..1de198d6f999b2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -404,11 +404,16 @@ echo $scriptname $args touch $resdir/$ds/log echo $scriptname $args >> $resdir/$ds/log echo ${TORTURE_SUITE} > $resdir/$ds/TORTURE_SUITE -pwd > $resdir/$ds/testid.txt +echo Build directory: `pwd` > $resdir/$ds/testid.txt if test -d .git then + echo Current commit: `git rev-parse HEAD` >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo ' ---' Output of "'"git status"'": >> $resdir/$ds/testid.txt git status >> $resdir/$ds/testid.txt - git rev-parse HEAD >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo >> $resdir/$ds/testid.txt + echo ' ---' Output of "'"git diff HEAD"'": >> $resdir/$ds/testid.txt git diff HEAD >> $resdir/$ds/testid.txt fi ___EOF___ From 0e7457b550233314394574c6bdc890de9131daf5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2021 10:15:02 -0800 Subject: [PATCH 42/77] rcuscale: Disable verbose torture-test output Given large numbers of threads, the quantity of torture-test output is sufficient to sometimes result in RCU CPU stall warnings. The probability of these stall warnings was greatly reduced by batching the output, but the warnings were not eliminated. However, the actual test only depends on console output that is printed even when rcuscale.verbose=0. This commit therefore causes this test to run with rcuscale.verbose=0. Signed-off-by: Paul E. 
McKenney --- .../selftests/rcutorture/configs/rcuscale/ver_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh index 0333e9b1852201..ffbe15109f0db1 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh @@ -12,5 +12,5 @@ # Adds per-version torture-module parameters to kernels supporting them. per_version_boot_params () { echo $1 rcuscale.shutdown=1 \ - rcuscale.verbose=1 + rcuscale.verbose=0 } From aebf8c7bf6d508dfb4255db8f7355ca819d9e6c9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jan 2021 10:17:26 -0800 Subject: [PATCH 43/77] refscale: Disable verbose torture-test output Given large numbers of threads, the quantity of torture-test output is sufficient to sometimes result in RCU CPU stall warnings. The probability of these stall warnings was greatly reduced by batching the output, but the warnings were not eliminated. However, the actual test only depends on console output that is printed even when refscale.verbose=0. This commit therefore causes this test to run with refscale.verbose=0. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/configs/refscale/ver_functions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh index 321e82641287ee..f81fa2c541a640 100644 --- a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh +++ b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh @@ -12,5 +12,5 @@ # Adds per-version torture-module parameters to kernels supporting them. 
per_version_boot_params () { echo $1 refscale.shutdown=1 \ - refscale.verbose=1 + refscale.verbose=0 } From 3c43ce53fdb39921f4ee71c65dc100296e15640f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Feb 2021 15:15:13 -0800 Subject: [PATCH 44/77] torture: Move build/run synchronization files into scenario directories Currently the bN.ready and bN.wait files are placed in the rcutorture directory, which really is not at all a good place for run-specific files. This commit therefore renames these files to build.ready and build.wait and then moves them into the scenario directories within the "res" directory, for example, into tools/testing/selftests/rcutorture/res/2021.02.10-15.08.23/TINY01. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-test-1-run.sh | 25 +++++++++---------- tools/testing/selftests/rcutorture/bin/kvm.sh | 10 +++----- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 03c04108175441..91578d3af21993 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -7,7 +7,7 @@ # Execute this in the source tree. Do not run it as a background task # because qemu does not seem to like that much. # -# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args_in +# Usage: kvm-test-1-run.sh config resdir seconds qemu-args boot_args_in # # qemu-args defaults to "-enable-kvm -nographic", along with arguments # specifying the number of CPUs and other options @@ -35,8 +35,7 @@ mkdir $T config_template=${1} config_dir=`echo $config_template | sed -e 's,/[^/]*$,,'` title=`echo $config_template | sed -e 's/^.*\///'` -builddir=${2} -resdir=${3} +resdir=${2} if test -z "$resdir" -o ! -d "$resdir" -o ! 
-w "$resdir" then echo "kvm-test-1-run.sh :$resdir: Not a writable directory, cannot store results into it" @@ -89,9 +88,9 @@ then ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh ln -s $base_resdir/.config $resdir # for kvm-recheck.sh echo Initial build failed, not running KVM, see $resdir. - if test -f $builddir.wait + if test -f $resdir/build.wait then - mv $builddir.wait $builddir.ready + mv $resdir/build.wait $resdir/build.ready fi exit 1 elif kvm-build.sh $T/KcList $resdir @@ -118,23 +117,23 @@ else # Build failed. cp .config $resdir || : echo Build failed, not running KVM, see $resdir. - if test -f $builddir.wait + if test -f $resdir/build.wait then - mv $builddir.wait $builddir.ready + mv $resdir/build.wait $resdir/build.ready fi exit 1 fi -if test -f $builddir.wait +if test -f $resdir/build.wait then - mv $builddir.wait $builddir.ready + mv $resdir/build.wait $resdir/build.ready fi -while test -f $builddir.ready +while test -f $resdir/build.ready do sleep 1 done -seconds=$4 -qemu_args=$5 -boot_args_in=$6 +seconds=$3 +qemu_args=$4 +boot_args_in=$5 if test -z "$TORTURE_BUILDONLY" then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 1de198d6f999b2..7944510f8c24be 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -444,7 +444,6 @@ function dump(first, pastlast, batchnum) print "needqemurun=" jn=1 for (j = first; j < pastlast; j++) { - builddir=KVM "/b" j - first + 1 cpusr[jn] = cpus[j]; if (cfrep[cf[j]] == "") { cfr[jn] = cf[j]; @@ -453,15 +452,15 @@ function dump(first, pastlast, batchnum) cfrep[cf[j]]++; cfr[jn] = cf[j] "." cfrep[cf[j]]; } + builddir=rd cfr[jn] "/build"; if (cpusr[jn] > ncpus && ncpus != 0) ovf = "-ovf"; else ovf = ""; print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. 
`date` | tee -a " rd "log"; - print "rm -f " builddir ".*"; - print "touch " builddir ".wait"; print "mkdir " rd cfr[jn] " || :"; - print "kvm-test-1-run.sh " CONFIGDIR cf[j], builddir, rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" + print "touch " builddir ".wait"; + print "kvm-test-1-run.sh " CONFIGDIR cf[j], rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &" print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date` | tee -a " rd "log"; print "while test -f " builddir ".wait" print "do" @@ -471,7 +470,7 @@ function dump(first, pastlast, batchnum) jn++; } for (j = 1; j < jn; j++) { - builddir=KVM "/b" j + builddir=rd cfr[j] "/build"; print "rm -f " builddir ".ready" print "if test -f \"" rd cfr[j] "/builtkernel\"" print "then" @@ -509,7 +508,6 @@ function dump(first, pastlast, batchnum) print "\techo ---- No kernel runs. `date` | tee -a " rd "log"; print "fi" for (j = 1; j < jn; j++) { - builddir=KVM "/b" j print "echo ----", cfr[j], cpusr[j] ovf ": Build/run results: | tee -a " rd "log"; print "cat " rd cfr[j] "/kvm-test-1-run.sh.out | tee -a " rd "log"; } From b674100e630bf9211d7edce06b5d734b125a74ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Feb 2021 16:28:44 -0800 Subject: [PATCH 45/77] torture: Use file-based protocol to mark batch's runs complete Currently, the script generated by kvm.sh does a "wait" to wait on both the current batch's guest OSes and any jitter.sh scripts. This works, but makes it hard to abstract the jittering so that common code can be used for both local and distributed runs. This commit therefore uses "build.run" files in scenario directories, and these files are removed after the corresponding scenario's guest OS has completed. Note that --build-only runs do not create build.run files because they also do not create guest OSes and do not run any jitter.sh scripts. 
Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-test-1-run.sh | 3 +++ tools/testing/selftests/rcutorture/bin/kvm.sh | 13 +++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 91578d3af21993..fed6f10a7b6080 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -345,4 +345,7 @@ then echo Unknown PID, cannot kill qemu command fi +# Tell the script that this run is done. +rm -f $resdir/build.run + parse-console.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 7944510f8c24be..1f5f8720cacc73 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -469,9 +469,15 @@ function dump(first, pastlast, batchnum) print "echo ", cfr[jn], cpusr[jn] ovf ": Build complete. `date` | tee -a " rd "log"; jn++; } + print "runfiles=" for (j = 1; j < jn; j++) { builddir=rd cfr[j] "/build"; - print "rm -f " builddir ".ready" + if (TORTURE_BUILDONLY) + print "rm -f " builddir ".ready" + else + print "mv " builddir ".ready " builddir ".run" + print "runfiles=\"$runfiles " builddir ".run\"" + fi print "if test -f \"" rd cfr[j] "/builtkernel\"" print "then" print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date` | tee -a " rd "log"; @@ -501,7 +507,10 @@ function dump(first, pastlast, batchnum) print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" print "\techo $! >> " rd "jitter_pids" } - print "\twait" + print "\twhile ls $runfiles > /dev/null 2>&1" + print "\tdo" + print "\t\t:" + print "\tdone" print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" From 37812c9429722824859788cf754dd3e33f546908 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 11 Feb 2021 10:39:28 -0800 Subject: [PATCH 46/77] torture: Use "jittering" file to control jitter.sh execution Currently, jitter.sh execution is controlled by a time limit and by the "kill" command. The former allowed jitter.sh to run uselessly past the end of a set of runs that panicked during boot, and the latter is vulnerable to PID reuse. This commit therefore introduces a "jittering" file in the date-stamp directory within "res" that must be present for the jitter.sh scripts to continue executing. The time limit is still in place in order to avoid disturbing runs featuring large trace dumps, but the removal of the "jittering" file handles the panic-during-boot scenario without relying on PIDs. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 10 ++++++---- tools/testing/selftests/rcutorture/bin/kvm.sh | 5 ++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index 188b864bc4bf61..ed0ea86ddf5d37 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -5,10 +5,11 @@ # of this script is to inflict random OS jitter on a concurrently running # test. # -# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ] +# Usage: jitter.sh me duration jittering-path [ sleepmax [ spinmax ] ] # # me: Random-number-generator seed salt. # duration: Time to run in seconds. +# jittering-path: Path to file whose removal will stop this script. # sleepmax: Maximum microseconds to sleep, defaults to one second. # spinmax: Maximum microseconds to spin, defaults to one millisecond. # @@ -18,8 +19,9 @@ me=$(($1 * 1000)) duration=$2 -sleepmax=${3-1000000} -spinmax=${4-1000} +jittering=$3 +sleepmax=${4-1000000} +spinmax=${5-1000} n=1 @@ -47,7 +49,7 @@ do fi # Check for stop request. - if test -f "$TORTURE_STOPFILE" + if ! 
test -f "$jittering" then exit 1; fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 1f5f8720cacc73..48da4cdb29d8e6 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -503,14 +503,17 @@ function dump(first, pastlast, batchnum) print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; print "\techo > " rd "jitter_pids" + print "\ttouch " rd "jittering" for (j = 0; j < njitter; j++) { - print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" + print "\tjitter.sh " j " " dur " " rd "jittering " ja[2] " " ja[3] "&" print "\techo $! >> " rd "jitter_pids" } print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" print "\tdone" + print "\trm -f " rd "jittering" + print "\twait" print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" From 1f922db8eef015f261480347aaf79fa9a25728f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 10:56:42 -0800 Subject: [PATCH 47/77] torture: Eliminate jitter_pids file Now that there is a reliable way to convince the jitter.sh scripts to stop, the jitter_pids file is not needed, nor is the code that kills all the PIDs contained in this file. This commit therefore eliminates this file and the code using it. Signed-off-by: Paul E. 
McKenney --- .../selftests/rcutorture/bin/kvm-test-1-run.sh | 14 -------------- tools/testing/selftests/rcutorture/bin/kvm.sh | 5 +---- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index fed6f10a7b6080..eb5346b457d4e2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -270,20 +270,6 @@ do echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 ps -fp $killpid >> $resdir/Warnings 2>&1 fi - # Reduce probability of PID reuse by allowing a one-minute buffer - if test $((kruntime + 60)) -lt $seconds && test -s "$resdir/../jitter_pids" - then - awk < "$resdir/../jitter_pids" ' - NF > 0 { - pidlist = pidlist " " $1; - n++; - } - END { - if (n > 0) { - print "kill " pidlist; - } - }' | sh - fi else echo ' ---' `date`: "Kernel done" fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 48da4cdb29d8e6..de93802c3d00fd 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -502,12 +502,9 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - print "\techo > " rd "jitter_pids" print "\ttouch " rd "jittering" - for (j = 0; j < njitter; j++) { + for (j = 0; j < njitter; j++) print "\tjitter.sh " j " " dur " " rd "jittering " ja[2] " " ja[3] "&" - print "\techo $! >> " rd "jitter_pids" - } print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" From 4cd54518c3d8afadd11ebd6ad4f03b00859f5e85 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 11 Feb 2021 11:54:43 -0800 Subject: [PATCH 48/77] torture: Reverse jittering and duration parameters for jitter.sh Remote rcutorture testing requires that jitter.sh continue to be invoked from the generated script for local runs, but that it instead be invoked on the remote system for distributed runs. This argues for common jitterstart and jitterstop scripts. But it would be good for jitterstart and jitterstop to control the name and location of the "jittering" file, while continuing to have the duration controlled by the caller of these new scripts. This commit therefore reverses the order of the jittering and duration parameters for jitter.sh, so that the jittering parameter precedes the duration parameter. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/jitter.sh | 6 +++--- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh index ed0ea86ddf5d37..ff1d3e468e5391 100755 --- a/tools/testing/selftests/rcutorture/bin/jitter.sh +++ b/tools/testing/selftests/rcutorture/bin/jitter.sh @@ -5,7 +5,7 @@ # of this script is to inflict random OS jitter on a concurrently running # test. # -# Usage: jitter.sh me duration jittering-path [ sleepmax [ spinmax ] ] +# Usage: jitter.sh me jittering-path duration [ sleepmax [ spinmax ] ] # # me: Random-number-generator seed salt. # duration: Time to run in seconds. @@ -18,8 +18,8 @@ # Authors: Paul E. 
McKenney me=$(($1 * 1000)) -duration=$2 -jittering=$3 +jittering=$2 +duration=$3 sleepmax=${4-1000000} spinmax=${5-1000} diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index de93802c3d00fd..a2ee3f2fff3c76 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -504,7 +504,7 @@ function dump(first, pastlast, batchnum) print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; print "\ttouch " rd "jittering" for (j = 0; j < njitter; j++) - print "\tjitter.sh " j " " dur " " rd "jittering " ja[2] " " ja[3] "&" + print "\tjitter.sh " j " " rd "jittering " dur " " ja[2] " " ja[3] "&" print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" From 1c0c4bc1ceb580851b2d76fdef9712b3bdae134b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Feb 2021 16:20:40 -0800 Subject: [PATCH 49/77] softirq: Don't try waking ksoftirqd before it has been spawned If there is heavy softirq activity, the softirq system will attempt to awaken ksoftirqd and will stop the traditional back-of-interrupt softirq processing. This is all well and good, but only if the ksoftirqd kthreads already exist, which is not the case during early boot, in which case the system hangs. One reproducer is as follows: tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 2 --configs "TREE03" --kconfig "CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y CONFIG_NO_HZ_IDLE=y CONFIG_HZ_PERIODIC=n" --bootargs "threadirqs=1" --trust-make This commit therefore adds a couple of existence checks for ksoftirqd and forces back-of-interrupt softirq processing when ksoftirqd does not yet exist. With this change, the above test passes. Reported-by: Sebastian Andrzej Siewior Reported-by: Uladzislau Rezki Cc: Peter Zijlstra Cc: Thomas Gleixner [ paulmck: Remove unneeded check per Sebastian Siewior feedback. ] Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 9908ec4a9bfed9..bad14ca2b5200e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -211,7 +211,7 @@ static inline void invoke_softirq(void) if (ksoftirqd_running(local_softirq_pending())) return; - if (!force_irqthreads) { + if (!force_irqthreads || !__this_cpu_read(ksoftirqd)) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* * We can safely execute softirq on the current stack if From e589c7c72315f7e52ebb5cffc19615dc18d0cc50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Feb 2021 10:07:09 -0800 Subject: [PATCH 50/77] docs: Correctly spell Stephen Hemminger's name This commit replaces "Steve" with his real name, which is "Stephen". Reported-by: Stephen Hemminger Signed-off-by: Paul E. McKenney --- Documentation/RCU/RTFP.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt index 3b0876c773552f..588d97366a463b 100644 --- a/Documentation/RCU/RTFP.txt +++ b/Documentation/RCU/RTFP.txt @@ -847,7 +847,7 @@ Symposium on Distributed Computing} 'It's entirely possible that the current user could be replaced by RCU and/or seqlocks, and we could get rid of brlocks entirely.' . - Steve Hemminger responds by replacing them with RCU. + Stephen Hemminger responds by replacing them with RCU. } } From 7e937220afa3eada0d4611b31e4e3c60770e39b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 Feb 2021 11:25:29 -0800 Subject: [PATCH 51/77] rcu: Add explicit barrier() to __rcu_read_unlock() Because preemptible RCU's __rcu_read_unlock() is an external function, the rough equivalent of an implicit barrier() is inserted by the compiler. Except that there is a direct call to __rcu_read_unlock() in that same file, and compilers are getting to the point where they might choose to inline the fastpath of the __rcu_read_unlock() function.
This commit therefore adds an explicit barrier() to the very beginning of __rcu_read_unlock(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 2d603771c7dce8..a32494c4b6f69a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -393,8 +393,9 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; + barrier(); // critical section before exit code. if (rcu_preempt_read_exit() == 0) { - barrier(); /* critical section before exit code. */ + barrier(); // critical-section exit before .s check. if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) rcu_read_unlock_special(t); } From 565cfb9e64dac1aadf7e2130fcda19a1c018df66 Mon Sep 17 00:00:00 2001 From: Sangmoon Kim Date: Tue, 2 Mar 2021 20:55:15 +0900 Subject: [PATCH 52/77] rcu/tree: Add a trace event for RCU CPU stall warnings This commit adds a trace event which allows tracing the beginnings of RCU CPU stall warnings on systems where sysctl_panic_on_rcu_stall is disabled. The first parameter is the name of RCU flavor like other trace events. The second parameter indicates whether this is a stall of an expedited grace period, a self-detected stall of a normal grace period, or a stall of a normal grace period detected by some CPU other than the one that is stalled. RCU CPU stall warnings are often caused by external-to-RCU issues, for example, in interrupt handling or task scheduling. Therefore, this event uses TRACE_EVENT, not TRACE_EVENT_RCU, to avoid requiring those interested in tracing RCU CPU stalls to rebuild their kernels with CONFIG_RCU_TRACE=y. Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Neeraj Upadhyay Signed-off-by: Sangmoon Kim Signed-off-by: Paul E. 
McKenney --- include/trace/events/rcu.h | 28 ++++++++++++++++++++++++++++ kernel/rcu/tree_exp.h | 1 + kernel/rcu/tree_stall.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5fc29400e1a2de..c7711e9b690054 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -432,6 +432,34 @@ TRACE_EVENT_RCU(rcu_fqs, __entry->cpu, __entry->qsevent) ); +/* + * Tracepoint for RCU stall events. Takes a string identifying the RCU flavor + * and a string identifying which function detected the RCU stall as follows: + * + * "StallDetected": Scheduler-tick detects other CPU's stalls. + * "SelfDetected": Scheduler-tick detects a current CPU's stall. + * "ExpeditedStall": Expedited grace period detects stalls. + */ +TRACE_EVENT(rcu_stall_warning, + + TP_PROTO(const char *rcuname, const char *msg), + + TP_ARGS(rcuname, msg), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(const char *, msg) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->msg = msg; + ), + + TP_printk("%s %s", + __entry->rcuname, __entry->msg) +); + #endif /* #if defined(CONFIG_TREE_RCU) */ /* diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6c6ff06d4ae653..2796084ef85a5d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -521,6 +521,7 @@ static void synchronize_rcu_expedited_wait(void) if (rcu_stall_is_suppressed()) continue; panic_on_rcu_stall(); + trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); ndetected = 0; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 475b26171b20ff..59b95cc5cbdf11 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -536,6 +536,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. 
*/ + trace_rcu_stall_warning(rcu_state.name, TPS("StallDetected")); pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -606,6 +607,7 @@ static void print_cpu_stall(unsigned long gps) * See Documentation/RCU/stallwarn.rst for info on how to debug * RCU CPU stall warnings. */ + trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected")); pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); print_cpu_stall_info(smp_processor_id()); From 9640dcab974fb7fba086d30fd9f0ec08b8876d12 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 24 Feb 2021 16:30:29 +0800 Subject: [PATCH 53/77] rcu: Make nocb_nobypass_lim_per_jiffy static RCU triggers the following sparse warning: kernel/rcu/tree_plugin.h:1497:5: warning: symbol 'nocb_nobypass_lim_per_jiffy' was not declared. Should it be static? This commit therefore makes this variable static. Reported-by: Abaci Robot Reported-by: Frederic Weisbecker Signed-off-by: Jiapeng Chong Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 93d393831adcbd..a1a17adeae543b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1556,7 +1556,7 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * After all, the main point of bypassing is to avoid lock contention * on ->nocb_lock, which only can happen at high call_rcu() rates. 
*/ -int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; +static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; module_param(nocb_nobypass_lim_per_jiffy, int, 0); /* From b2fcf2102049f6e56981e0ab3d9b633b8e2741da Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:09:59 +0100 Subject: [PATCH 54/77] rcu/nocb: Fix missed nocb_timer requeue This sequence of events can lead to a failure to requeue a CPU's ->nocb_timer: 1. There are no callbacks queued for any CPU covered by CPU 0-2's ->nocb_gp_kthread. Note that ->nocb_gp_kthread is associated with CPU 0. 2. CPU 1 enqueues its first callback with interrupts disabled, and thus must defer awakening its ->nocb_gp_kthread. It therefore queues its rcu_data structure's ->nocb_timer. At this point, CPU 1's rdp->nocb_defer_wakeup is RCU_NOCB_WAKE. 3. CPU 2, which shares the same ->nocb_gp_kthread, also enqueues a callback, but with interrupts enabled, allowing it to directly awaken the ->nocb_gp_kthread. 4. The newly awakened ->nocb_gp_kthread associates both CPU 1's and CPU 2's callbacks with a future grace period and arranges for that grace period to be started. 5. This ->nocb_gp_kthread goes to sleep waiting for the end of this future grace period. 6. This grace period elapses before CPU 1's timer fires. This is normally improbable given that the timer is set for only one jiffy, but timers can be delayed. Besides, it is possible that the kernel was built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. 7. The grace period ends, so rcu_gp_kthread awakens the ->nocb_gp_kthread, which in turn awakens both CPU 1's and CPU 2's ->nocb_cb_kthread. Then ->nocb_gp_kthread sleeps waiting for more newly queued callbacks. 8. CPU 1's ->nocb_cb_kthread invokes its callback, then sleeps waiting for more invocable callbacks. 9. Note that neither kthread updated any ->nocb_timer state, so CPU 1's ->nocb_defer_wakeup is still set to RCU_NOCB_WAKE. 10. 
CPU 1 enqueues its second callback, this time with interrupts enabled so it can directly wake ->nocb_gp_kthread. It does so by calling wake_nocb_gp() which also cancels the pending timer that got queued in step 2. But that doesn't reset CPU 1's ->nocb_defer_wakeup which is still set to RCU_NOCB_WAKE. So CPU 1's ->nocb_defer_wakeup and its ->nocb_timer are now desynchronized. 11. ->nocb_gp_kthread associates the callback queued in 10 with a new grace period, arranges for that grace period to start and sleeps waiting for it to complete. 12. The grace period ends, rcu_gp_kthread awakens ->nocb_gp_kthread, which in turn wakes up CPU 1's ->nocb_cb_kthread which then invokes the callback queued in 10. 13. CPU 1 enqueues its third callback, this time with interrupts disabled so it must queue a timer for a deferred wakeup. However the value of its ->nocb_defer_wakeup is RCU_NOCB_WAKE which incorrectly indicates that a timer is already queued. Instead, CPU 1's ->nocb_timer was cancelled in 10. CPU 1 therefore fails to queue the ->nocb_timer. 14. CPU 1 has its pending callback and it may go unnoticed until some other CPU ever wakes up ->nocb_gp_kthread or CPU 1 ever calls an explicit deferred wakeup, for example, during idle entry. This commit fixes this bug by resetting rdp->nocb_defer_wakeup every time we delete the ->nocb_timer. It is quite possible that there is a similar scenario involving ->nocb_bypass_timer and ->nocb_defer_wakeup. However, despite some effort from several people, a failure scenario has not yet been located. However, that by no means guarantees that no such scenario exists. Finding a failure scenario is left as an exercise for the reader, and the "Fixes:" tag below relates to ->nocb_bypass_timer instead of ->nocb_timer. Fixes: d1b222c6be1f (rcu/nocb: Add bypass callback queueing) Cc: Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Boqun Feng Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a1a17adeae543b..e392bd12931619 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1708,7 +1708,11 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force, rcu_nocb_unlock_irqrestore(rdp, flags); return false; } - del_timer(&rdp->nocb_timer); + + if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) { + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&rdp->nocb_timer); + } rcu_nocb_unlock_irqrestore(rdp, flags); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { @@ -2335,7 +2339,6 @@ static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp) return false; } ndw = READ_ONCE(rdp->nocb_defer_wakeup); - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); From 76d00b494d7962e88d4bbd4135f34aba9019c67f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:00 +0100 Subject: [PATCH 55/77] rcu/nocb: Disable bypass when CPU isn't completely offloaded Currently, the bypass is flushed at the very last moment in the deoffloading procedure. However, this approach leads to a larger state space than would be preferred. This commit therefore disables the bypass as soon as the deoffloading procedure begins, then flushes it. This guarantees that the bypass remains empty and thus out of the way of the deoffloading procedure. Symmetrically, this commit waits to enable the bypass until the offloading procedure has completed. Reported-by: Paul E. McKenney Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- include/linux/rcu_segcblist.h | 7 ++++--- kernel/rcu/tree_plugin.h | 38 ++++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 8afe886e85f101..3db96c4f45fd44 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -109,7 +109,7 @@ struct rcu_cblist { * | SEGCBLIST_KTHREAD_GP | * | | * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops | - * | handling callbacks. | + * | handling callbacks. Enable bypass queueing. | * ---------------------------------------------------------------------------- */ @@ -125,7 +125,7 @@ struct rcu_cblist { * | SEGCBLIST_KTHREAD_GP | * | | * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | - * | ignores callbacks. | + * | ignores callbacks. Bypass enqueue is enabled. | * ---------------------------------------------------------------------------- * | * v @@ -134,7 +134,8 @@ struct rcu_cblist { * | SEGCBLIST_KTHREAD_GP | * | | * | CB/GP kthreads and local rcu_core() handle callbacks concurrently | - * | holding nocb_lock. Wake up CB and GP kthreads if necessary. | + * | holding nocb_lock. Wake up CB and GP kthreads if necessary. Disable | + * | bypass enqueue. | * ---------------------------------------------------------------------------- * | * v diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e392bd12931619..b08564b2bcf7c8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1830,11 +1830,22 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j = jiffies; long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); + lockdep_assert_irqs_disabled(); + + // Pure softirq/rcuc based processing: no bypassing, no + // locking. 
if (!rcu_rdp_is_offloaded(rdp)) { + *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); + return false; + } + + // In the process of (de-)offloading: no bypassing, but + // locking. + if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); return false; /* Not offloaded, no bypassing. */ } - lockdep_assert_irqs_disabled(); // Don't use ->nocb_bypass during early boot. if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { @@ -2416,7 +2427,16 @@ static long rcu_nocb_rdp_deoffload(void *arg) pr_info("De-offloading %d\n", rdp->cpu); rcu_nocb_lock_irqsave(rdp, flags); - + /* + * Flush once and for all now. This suffices because we are + * running on the target CPU holding ->nocb_lock (thus having + * interrupts disabled), and because rdp_offload_toggle() + * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. + * Thus future calls to rcu_segcblist_completely_offloaded() will + * return false, which means that future calls to rcu_nocb_try_bypass() + * will refuse to put anything into the bypass. + */ + WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | @@ -2428,21 +2448,21 @@ static long rcu_nocb_rdp_deoffload(void *arg) del_timer_sync(&rdp->nocb_timer); /* - * Flush bypass. While IRQs are disabled and once we set - * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be - * enqueued on bypass. + * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked + * and IRQs disabled but let's be paranoid. */ rcu_nocb_lock_irqsave(rdp, flags); - rcu_nocb_flush_bypass(rdp, NULL, jiffies); rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); /* * With SEGCBLIST_SOFTIRQ_ONLY, we can't use - * rcu_nocb_unlock_irqrestore() anymore. 
Theoretically we - * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs - * disabled now, but let's be paranoid. + * rcu_nocb_unlock_irqrestore() anymore. */ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); + /* Sanity check */ + WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); + + return ret; } From 0efdf14a9f83618335a0849df3586808bff36cfb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:01 +0100 Subject: [PATCH 56/77] rcu/nocb: Remove stale comment above rcu_segcblist_offload() This commit removes a stale comment claiming that the cblist must be empty before changing the offloading state. This claim was correct back when the offloaded state was defined exclusively at boot. Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7f181c9675f761..aaa111237b6029 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -261,8 +261,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded. This - * structure must be empty. + * Mark the specified rcu_segcblist structure as offloaded. */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { From e02691b7ef51c5fac0eee5a6ebde45ce92958fae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Feb 2021 01:10:02 +0100 Subject: [PATCH 57/77] rcu/nocb: Move trace_rcu_nocb_wake() calls outside nocb_lock when possible Those tracing calls don't need to be under ->nocb_lock. This commit therefore moves them outside of that lock. Signed-off-by: Frederic Weisbecker Cc: Josh Triplett Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Boqun Feng Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b08564b2bcf7c8..9846c8aecbb8aa 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1703,9 +1703,9 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force, lockdep_assert_held(&rdp->nocb_lock); if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { + rcu_nocb_unlock_irqrestore(rdp, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("AlreadyAwake")); - rcu_nocb_unlock_irqrestore(rdp, flags); return false; } @@ -1955,9 +1955,9 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { + rcu_nocb_unlock_irqrestore(rdp, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); - rcu_nocb_unlock_irqrestore(rdp, flags); return; } // Need to actually to a wakeup. @@ -1992,8 +1992,8 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, TPS("WakeOvfIsDeferred")); rcu_nocb_unlock_irqrestore(rdp, flags); } else { - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); rcu_nocb_unlock_irqrestore(rdp, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } return; } From 7abb18bd7567480e34f46d3512369ec49499064e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Feb 2021 16:10:38 -0800 Subject: [PATCH 58/77] rcu: Provide polling interfaces for Tree RCU grace periods There is a need for a non-blocking polling interface for RCU grace periods, so this commit supplies start_poll_synchronize_rcu() and poll_state_synchronize_rcu() for this purpose. Note that the existing get_state_synchronize_rcu() may be used if future grace periods are inevitable (perhaps due to a later call_rcu() invocation). 
The new start_poll_synchronize_rcu() is to be used if future grace periods might not otherwise happen. Finally, poll_state_synchronize_rcu() provides a lockless check for a grace period having elapsed since the corresponding call to either of the get_state_synchronize_rcu() or start_poll_synchronize_rcu(). As with get_state_synchronize_rcu(), the return value from either get_state_synchronize_rcu() or start_poll_synchronize_rcu() is passed in to a later call to either poll_state_synchronize_rcu() or the existing (might_sleep) cond_synchronize_rcu(). [ paulmck: Remove redundant smp_mb() per Frederic Weisbecker feedback. ] [ Update poll_state_synchronize_rcu() docbook per Frederic Weisbecker feedback. ] Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 75 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 70 insertions(+), 7 deletions(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index df578b73960f96..b89b54130f4963 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -41,6 +41,8 @@ void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); unsigned long get_state_synchronize_rcu(void); +unsigned long start_poll_synchronize_rcu(void); +bool poll_state_synchronize_rcu(unsigned long oldstate); void cond_synchronize_rcu(unsigned long oldstate); void rcu_idle_enter(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da6f5213fb74cb..07e8122614747a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3774,8 +3774,8 @@ EXPORT_SYMBOL_GPL(synchronize_rcu); * get_state_synchronize_rcu - Snapshot current RCU state * * Returns a cookie that is used by a later call to cond_synchronize_rcu() - * to determine whether or not a full grace period has elapsed in the - * meantime. + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. 
*/ unsigned long get_state_synchronize_rcu(void) { @@ -3788,14 +3788,77 @@ unsigned long get_state_synchronize_rcu(void) } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); +/** + * start_poll_synchronize_rcu - Snapshot and start RCU grace period + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * or poll_state_synchronize_rcu() to determine whether or not a full + * grace period has elapsed in the meantime. If the needed grace period + * is not already slated to start, notifies RCU core of the need for that + * grace period. + * + * Interrupts must be enabled for the case where it is necessary to awaken + * the grace-period kthread. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long flags; + unsigned long gp_seq = get_state_synchronize_rcu(); + bool needwake; + struct rcu_data *rdp; + struct rcu_node *rnp; + + lockdep_assert_irqs_enabled(); + local_irq_save(flags); + rdp = this_cpu_ptr(&rcu_data); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); // irqs already disabled. + needwake = rcu_start_this_gp(rnp, rdp, gp_seq); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (needwake) + rcu_gp_kthread_wake(); + return gp_seq; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); + +/** + * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period + * + * @oldstate: return from call to get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * + * If a full RCU grace period has elapsed since the earlier call from + * which oldstate was obtained, return @true, otherwise return @false. + * If @false is returned, it is the caller's responsibilty to invoke this + * function later on until it does return @true. Alternatively, the caller + * can explicitly wait for a grace period, for example, by passing @oldstate + * to cond_synchronize_rcu() or by directly invoking synchronize_rcu(). + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. 
If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!). + * Those needing to keep oldstate values for very long time periods + * (many hours even on 32-bit systems) should check them occasionally + * and either refresh them or set a flag indicating that the grace period + * has completed. + */ +bool poll_state_synchronize_rcu(unsigned long oldstate) +{ + if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) { + smp_mb(); /* Ensure GP ends before subsequent accesses. */ + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); + /** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * * @oldstate: return value from earlier call to get_state_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call to - * get_state_synchronize_rcu(), just return. Otherwise, invoke - * synchronize_rcu() to wait for a full grace period. + * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. + * Otherwise, invoke synchronize_rcu() to wait for a full grace period. * * Yes, this function does not take counter wrap into account. But * counter wrap is harmless. If the counter wraps, we have waited for @@ -3804,10 +3867,8 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); */ void cond_synchronize_rcu(unsigned long oldstate) { - if (!rcu_seq_done(&rcu_state.gp_seq, oldstate)) + if (!poll_state_synchronize_rcu(oldstate)) synchronize_rcu(); - else - smp_mb(); /* Ensure GP ends before subsequent accesses. */ } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); From 040accb3cd4ac4a8d151413f569b7ba6d918a19c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Feb 2021 12:37:46 -0800 Subject: [PATCH 59/77] torture: Abstract jitter.sh start/stop into scripts This commit creates jitterstart.sh and jitterstop.sh scripts that handle the starting and stopping of the jitter.sh scripts. These must be sourced using the bash "." 
command to allow the generated script to wait on the backgrounded jitter.sh scripts. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/jitterstart.sh | 37 +++++++++++++++++++ .../selftests/rcutorture/bin/jitterstop.sh | 23 ++++++++++++ tools/testing/selftests/rcutorture/bin/kvm.sh | 7 +--- 3 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 tools/testing/selftests/rcutorture/bin/jitterstart.sh create mode 100644 tools/testing/selftests/rcutorture/bin/jitterstop.sh diff --git a/tools/testing/selftests/rcutorture/bin/jitterstart.sh b/tools/testing/selftests/rcutorture/bin/jitterstart.sh new file mode 100644 index 00000000000000..3d710ad291c382 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitterstart.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Start up the specified number of jitter.sh scripts in the background. +# +# Usage: . jitterstart.sh n jittering-dir duration [ sleepmax [ spinmax ] ] +# +# n: Number of jitter.sh scripts to start up. +# jittering-dir: Directory in which to put "jittering" file. +# duration: Time to run in seconds. +# sleepmax: Maximum microseconds to sleep, defaults to one second. +# spinmax: Maximum microseconds to spin, defaults to one millisecond. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +jitter_n=$1 +if test -z "$jitter_n" +then + echo jitterstart.sh: Missing count of jitter.sh scripts to start. + exit 33 +fi +jittering_dir=$2 +if test -z "$jittering_dir" +then + echo jitterstart.sh: Missing directory in which to place jittering file. 
+ exit 34 +fi +shift +shift + +touch ${jittering_dir}/jittering +for ((jitter_i = 1; jitter_i <= $jitter_n; jitter_i++)) +do + jitter.sh $jitter_i "${jittering_dir}/jittering" "$@" & +done diff --git a/tools/testing/selftests/rcutorture/bin/jitterstop.sh b/tools/testing/selftests/rcutorture/bin/jitterstop.sh new file mode 100644 index 00000000000000..576a4cf4b79a57 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/jitterstop.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Remove the "jittering" file, signaling the jitter.sh scripts to stop, +# then wait for them to terminate. +# +# Usage: . jitterstop.sh jittering-dir +# +# jittering-dir: Directory containing "jittering" file. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +jittering_dir=$1 +if test -z "$jittering_dir" +then + echo jitterstop.sh: Missing directory in which to place jittering file. + exit 34 +fi + +rm -f ${jittering_dir}/jittering +wait diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index a2ee3f2fff3c76..d6973e4a2ecfac 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -502,15 +502,12 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - print "\ttouch " rd "jittering" - for (j = 0; j < njitter; j++) - print "\tjitter.sh " j " " rd "jittering " dur " " ja[2] " " ja[3] "&" + print "\t. jitterstart.sh " njitter " " rd " " dur " " ja[2] " " ja[3] print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" print "\tdone" - print "\trm -f " rd "jittering" - print "\twait" + print "\t. jitterstop.sh " rd print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" From cc45716e07a41233b7c0b2183b0a3e60b85192e0 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 11 Feb 2021 16:19:29 -0800 Subject: [PATCH 60/77] torture: Record TORTURE_KCONFIG_GDB_ARG in qemu-cmd When re-running old rcutorture builds, if the original run involved gdb, the re-run also needs to do so. This commit therefore records the TORTURE_KCONFIG_GDB_ARG environment variable into the qemu-cmd file so that the re-run can access it. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index eb5346b457d4e2..5d9ac90c2cfb11 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -201,6 +201,7 @@ echo kernel here: `head -n 1 $testid_txt | sed -e 's/^Build directory: //'` >> echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" $TORTURE_QEMU_GDB_ARG > $resdir/qemu-cmd echo "# TORTURE_SHUTDOWN_GRACE=$TORTURE_SHUTDOWN_GRACE" >> $resdir/qemu-cmd echo "# seconds=$seconds" >> $resdir/qemu-cmd +echo "# TORTURE_KCONFIG_GDB_ARG=\"$TORTURE_KCONFIG_GDB_ARG\"" >> $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then From d53f52d6fc220ba2074338ce6a91f837c7a7cba0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Feb 2021 14:00:05 -0800 Subject: [PATCH 61/77] torture: Extract kvm-test-1-run-qemu.sh from kvm-test-1-run.sh Currently, kvm-test-1-run.sh both builds and runs an rcutorture kernel, which is inconvenient when it is necessary to re-run an old run or to carry out a run on a remote system. This commit therefore extracts the portion of kvm-test-1-run.sh that invoke qemu to actually run rcutorture and places it in kvm-test-1-run-qemu.sh. Signed-off-by: Paul E. 
McKenney --- .../rcutorture/bin/kvm-test-1-run-qemu.sh | 170 ++++++++++++++++++ .../rcutorture/bin/kvm-test-1-run.sh | 127 +------------ 2 files changed, 171 insertions(+), 126 deletions(-) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh new file mode 100755 index 00000000000000..6b0d71b325eb7e --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Carry out a kvm-based run for the specified qemu-cmd file, which might +# have been generated by --build-only kvm.sh run. +# +# Usage: kvm-test-1-run-qemu.sh qemu-cmd-dir +# +# qemu-cmd-dir provides the directory containing qemu-cmd file. +# This is assumed to be of the form prefix/ds/scenario, where +# "ds" is the top-level date-stamped directory and "scenario" +# is the scenario name. Any required adjustments to this file +# must have been made by the caller. The shell-command comments +# at the end of the qemu-cmd file are not optional. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +T=${TMPDIR-/tmp}/kvm-test-1-run-qemu.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +resdir="$1" +if ! test -d "$resdir" +then + echo $0: Nonexistent directory: $resdir + exit 1 +fi +if ! test -f "$resdir/qemu-cmd" +then + echo $0: Nonexistent qemu-cmd file: $resdir/qemu-cmd + exit 1 +fi + +# Obtain settings from the qemu-cmd file. +grep '^#' $resdir/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + +# Decorate qemu-cmd with redirection, backgrounding, and PID capture +sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd +echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd + +# In case qemu refuses to run... 
+echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log + +# Attempt to run qemu +kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` +( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & +commandcompleted=0 +if test -z "$TORTURE_KCONFIG_GDB_ARG" +then + sleep 10 # Give qemu's pid a chance to reach the file + if test -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + echo Monitoring qemu job at pid $qemu_pid + else + qemu_pid="" + echo Monitoring qemu job at yet-as-unknown pid + fi +fi +if test -n "$TORTURE_KCONFIG_GDB_ARG" +then + base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` + if ! test -f $base_resdir/vmlinux + then + base_resdir=/path/to + fi + echo Waiting for you to attach a debug session, for example: > /dev/tty + echo " gdb $base_resdir/vmlinux" > /dev/tty + echo 'After symbols load and the "(gdb)" prompt appears:' > /dev/tty + echo " target remote :1234" > /dev/tty + echo " continue" > /dev/tty + kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` +fi +while : +do + if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" + then + qemu_pid=`cat "$resdir/qemu_pid"` + fi + kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` + if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 + then + if test -n "$TORTURE_KCONFIG_GDB_ARG" + then + : + elif test $kruntime -ge $seconds || test -f "$resdir/../STOP.1" + then + break; + fi + sleep 1 + else + commandcompleted=1 + if test $kruntime -lt $seconds + then + echo Completed in $kruntime vs. 
$seconds >> $resdir/Warnings 2>&1 + grep "^(qemu) qemu:" $resdir/kvm-test-1-run.sh.out >> $resdir/Warnings 2>&1 + killpid="`sed -n "s/^(qemu) qemu: terminating on signal [0-9]* from pid \([0-9]*\).*$/\1/p" $resdir/Warnings`" + if test -n "$killpid" + then + echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 + ps -fp $killpid >> $resdir/Warnings 2>&1 + fi + else + echo ' ---' `date`: "Kernel done" + fi + break + fi +done +if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" +then + qemu_pid=`cat "$resdir/qemu_pid"` +fi +if test $commandcompleted -eq 0 -a -n "$qemu_pid" +then + if ! test -f "$resdir/../STOP.1" + then + echo Grace period for qemu job at pid $qemu_pid + fi + oldline="`tail $resdir/console.log`" + while : + do + if test -f "$resdir/../STOP.1" + then + echo "PID $qemu_pid killed due to run STOP.1 request" >> $resdir/Warnings 2>&1 + kill -KILL $qemu_pid + break + fi + kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` + if kill -0 $qemu_pid > /dev/null 2>&1 + then + : + else + break + fi + must_continue=no + newline="`tail $resdir/console.log`" + if test "$newline" != "$oldline" && echo $newline | grep -q ' [0-9]\+us : ' + then + must_continue=yes + fi + last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`" + if test -z "$last_ts" + then + last_ts=0 + fi + if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) + then + must_continue=yes + fi + if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) + then + echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 + kill -KILL $qemu_pid + break + fi + oldline=$newline + sleep 10 + done +elif test -z "$qemu_pid" +then + echo Unknown PID, cannot kill qemu command +fi + +# Tell the script that this run is done. 
+rm -f $resdir/build.run + +parse-console.sh $resdir/console.log $title diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 5d9ac90c2cfb11..f3d2ded0c8cf61 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -210,129 +210,4 @@ then exit 0 fi -# Decorate qemu-cmd with redirection, backgrounding, and PID capture -sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd -echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd - -# In case qemu refuses to run... -echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log - -# Attempt to run qemu -kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` -( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) & -commandcompleted=0 -if test -z "$TORTURE_KCONFIG_GDB_ARG" -then - sleep 10 # Give qemu's pid a chance to reach the file - if test -s "$resdir/qemu_pid" - then - qemu_pid=`cat "$resdir/qemu_pid"` - echo Monitoring qemu job at pid $qemu_pid - else - qemu_pid="" - echo Monitoring qemu job at yet-as-unknown pid - fi -fi -if test -n "$TORTURE_KCONFIG_GDB_ARG" -then - echo Waiting for you to attach a debug session, for example: > /dev/tty - echo " gdb $base_resdir/vmlinux" > /dev/tty - echo 'After symbols load and the "(gdb)" prompt appears:' > /dev/tty - echo " target remote :1234" > /dev/tty - echo " continue" > /dev/tty - kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null` -fi -while : -do - if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" - then - qemu_pid=`cat "$resdir/qemu_pid"` - fi - kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 - then - if test -n "$TORTURE_KCONFIG_GDB_ARG" - then - : - elif test $kruntime -ge $seconds || test -f "$resdir/../STOP.1" - then - break; - fi - sleep 1 - else - commandcompleted=1 - if 
test $kruntime -lt $seconds - then - echo Completed in $kruntime vs. $seconds >> $resdir/Warnings 2>&1 - grep "^(qemu) qemu:" $resdir/kvm-test-1-run.sh.out >> $resdir/Warnings 2>&1 - killpid="`sed -n "s/^(qemu) qemu: terminating on signal [0-9]* from pid \([0-9]*\).*$/\1/p" $resdir/Warnings`" - if test -n "$killpid" - then - echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 - ps -fp $killpid >> $resdir/Warnings 2>&1 - fi - else - echo ' ---' `date`: "Kernel done" - fi - break - fi -done -if test -z "$qemu_pid" -a -s "$resdir/qemu_pid" -then - qemu_pid=`cat "$resdir/qemu_pid"` -fi -if test $commandcompleted -eq 0 -a -n "$qemu_pid" -then - if ! test -f "$resdir/../STOP.1" - then - echo Grace period for qemu job at pid $qemu_pid - fi - oldline="`tail $resdir/console.log`" - while : - do - if test -f "$resdir/../STOP.1" - then - echo "PID $qemu_pid killed due to run STOP.1 request" >> $resdir/Warnings 2>&1 - kill -KILL $qemu_pid - break - fi - kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` - if kill -0 $qemu_pid > /dev/null 2>&1 - then - : - else - break - fi - must_continue=no - newline="`tail $resdir/console.log`" - if test "$newline" != "$oldline" && echo $newline | grep -q ' [0-9]\+us : ' - then - must_continue=yes - fi - last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`" - if test -z "$last_ts" - then - last_ts=0 - fi - if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE)) - then - must_continue=yes - fi - if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE)) - then - echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1 - kill -KILL $qemu_pid - break - fi - oldline=$newline - sleep 10 - done -elif test -z "$qemu_pid" -then - echo Unknown PID, cannot kill qemu command -fi - -# Tell the script that this run is done. 
-rm -f $resdir/build.run - -parse-console.sh $resdir/console.log $title +kvm-test-1-run-qemu.sh $resdir From 7831b391fbf86d19ae92e2984a9274b1d2b4eb06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Feb 2021 15:32:23 -0800 Subject: [PATCH 62/77] torture: Record jitter start/stop commands Distributed runs of rcutorture will need to start and stop jittering on the remote hosts, which means that the commands must be communicated to those hosts. The commit therefore causes kvm.sh to place these commands in new TORTURE_JITTER_START and TORTURE_JITTER_STOP environment variables to communicate them to the scripts that will set this up. In addition, this commit causes kvm-test-1-run.sh to append these commands to each generated qemu-cmd file, which allows any remotely executing script to extract the needed commands from this file. Signed-off-by: Paul E. McKenney --- .../rcutorture/bin/kvm-test-1-run.sh | 2 ++ tools/testing/selftests/rcutorture/bin/kvm.sh | 24 +++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index f3d2ded0c8cf61..a69f8ae3eea528 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -202,6 +202,8 @@ echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_appen echo "# TORTURE_SHUTDOWN_GRACE=$TORTURE_SHUTDOWN_GRACE" >> $resdir/qemu-cmd echo "# seconds=$seconds" >> $resdir/qemu-cmd echo "# TORTURE_KCONFIG_GDB_ARG=\"$TORTURE_KCONFIG_GDB_ARG\"" >> $resdir/qemu-cmd +echo "# TORTURE_JITTER_START=\"$TORTURE_JITTER_START\"" >> $resdir/qemu-cmd +echo "# TORTURE_JITTER_STOP=\"$TORTURE_JITTER_STOP\"" >> $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index d6973e4a2ecfac..efcbd12d5ddb78 100755 --- 
a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -35,6 +35,8 @@ TORTURE_KCONFIG_ARG="" TORTURE_KCONFIG_GDB_ARG="" TORTURE_BOOT_GDB_ARG="" TORTURE_QEMU_GDB_ARG="" +TORTURE_JITTER_START="" +TORTURE_JITTER_STOP="" TORTURE_KCONFIG_KASAN_ARG="" TORTURE_KCONFIG_KCSAN_ARG="" TORTURE_KMAKE_ARG="" @@ -443,6 +445,16 @@ function dump(first, pastlast, batchnum) print "echo ----Start batch " batchnum ": `date` | tee -a " rd "log"; print "needqemurun=" jn=1 + njitter = 0; + split(jitter, ja); + if (ja[1] == -1 && ncpus == 0) + njitter = 1; + else if (ja[1] == -1) + njitter = ncpus; + else + njitter = ja[1]; + print "TORTURE_JITTER_START=\". jitterstart.sh " njitter " " rd " " dur " " ja[2] " " ja[3] "\"; export TORTURE_JITTER_START"; + print "TORTURE_JITTER_STOP=\". jitterstop.sh " rd " \"; export TORTURE_JITTER_STOP" for (j = first; j < pastlast; j++) { cpusr[jn] = cpus[j]; if (cfrep[cf[j]] == "") { @@ -484,14 +496,6 @@ function dump(first, pastlast, batchnum) print "\tneedqemurun=1" print "fi" } - njitter = 0; - split(jitter, ja); - if (ja[1] == -1 && ncpus == 0) - njitter = 1; - else if (ja[1] == -1) - njitter = ncpus; - else - njitter = ja[1]; if (TORTURE_BUILDONLY && njitter != 0) { njitter = 0; print "echo Build-only run, so suppressing jitter | tee -a " rd "log" @@ -502,12 +506,12 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - print "\t. jitterstart.sh " njitter " " rd " " dur " " ja[2] " " ja[3] + print "\t$TORTURE_JITTER_START"; print "\twhile ls $runfiles > /dev/null 2>&1" print "\tdo" print "\t\t:" print "\tdone" - print "\t. jitterstop.sh " rd + print "\t$TORTURE_JITTER_STOP"; print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" print "\twait" From cb1fa863a00ba0e8faf69d2ebb960b75129bccd6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 16 Feb 2021 16:55:04 -0800 Subject: [PATCH 63/77] torture: Record kvm-test-1-run.sh and kvm-test-1-run-qemu.sh PIDs This commit records the process IDs of the kvm-test-1-run.sh and kvm-test-1-run-qemu.sh scripts to ease monitoring of remotely running instances of these scripts. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh | 2 ++ tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh index 6b0d71b325eb7e..576a9b761b415f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh @@ -33,6 +33,8 @@ then exit 1 fi +echo ' ---' `date`: Starting kernel, PID $$ + # Obtain settings from the qemu-cmd file. grep '^#' $resdir/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings . $T/qemu-cmd-settings diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index a69f8ae3eea528..a386ca8dd690bb 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -41,7 +41,7 @@ then echo "kvm-test-1-run.sh :$resdir: Not a writable directory, cannot store results into it" exit 1 fi -echo ' ---' `date`: Starting build +echo ' ---' `date`: Starting build, PID $$ echo ' ---' Kconfig fragment at: $config_template >> $resdir/log touch $resdir/ConfigFragment.input From 996a042e0a0684b7a666b9d745784623a3531b27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 16 Feb 2021 20:17:44 -0800 Subject: [PATCH 64/77] torture: Remove no-mpstat error message The cpus2use.sh script complains if the mpstat command is not available, and instead uses all available CPUs. 
Unfortunately, this complaint goes to stdout, where it confuses invokers who expect a single number. This commit removes this error message in order to avoid this confusion. The tendency of late has been to give rcutorture a full system, so this should not cause issues. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/cpus2use.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh index 1dbfb62567d2ff..6bb9930016804d 100755 --- a/tools/testing/selftests/rcutorture/bin/cpus2use.sh +++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh @@ -21,7 +21,6 @@ then awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'` else # No mpstat command, so use all available CPUs. - echo The mpstat command is not available, so greedily using all CPUs. idlecpus=$ncpus fi awk -v ncpus=$ncpus -v idlecpus=$idlecpus < /dev/null ' From 00a447fabb5252d01035e78ae7f2943e5b4fff64 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 20 Feb 2021 10:13:52 -0800 Subject: [PATCH 65/77] torture: Rename SRCU-t and SRCU-u to avoid lowercase characters The convention that scenario names are all uppercase has two exceptions, SRCU-t and SRCU-u. This commit therefore renames them to SRCU-T and SRCU-U, respectively, to bring them in line with this convention. This in turn permits tighter argument checking in the torture-test scripting. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/configs/rcu/CFLIST | 4 ++-- .../selftests/rcutorture/configs/rcu/{SRCU-t => SRCU-T} | 0 .../rcutorture/configs/rcu/{SRCU-t.boot => SRCU-T.boot} | 0 .../selftests/rcutorture/configs/rcu/{SRCU-u => SRCU-U} | 0 .../rcutorture/configs/rcu/{SRCU-u.boot => SRCU-U.boot} | 0 5 files changed, 2 insertions(+), 2 deletions(-) rename tools/testing/selftests/rcutorture/configs/rcu/{SRCU-t => SRCU-T} (100%) rename tools/testing/selftests/rcutorture/configs/rcu/{SRCU-t.boot => SRCU-T.boot} (100%) rename tools/testing/selftests/rcutorture/configs/rcu/{SRCU-u => SRCU-U} (100%) rename tools/testing/selftests/rcutorture/configs/rcu/{SRCU-u.boot => SRCU-U.boot} (100%) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST index f2b20db9e296b3..98b6175e5aa09f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST @@ -7,8 +7,8 @@ TREE07 TREE09 SRCU-N SRCU-P -SRCU-t -SRCU-u +SRCU-T +SRCU-U TINY01 TINY02 TASKS01 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T similarity index 100% rename from tools/testing/selftests/rcutorture/configs/rcu/SRCU-t rename to tools/testing/selftests/rcutorture/configs/rcu/SRCU-T diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot similarity index 100% rename from tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot rename to tools/testing/selftests/rcutorture/configs/rcu/SRCU-T.boot diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U similarity index 100% rename from tools/testing/selftests/rcutorture/configs/rcu/SRCU-u rename to tools/testing/selftests/rcutorture/configs/rcu/SRCU-U diff --git 
a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot similarity index 100% rename from tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot rename to tools/testing/selftests/rcutorture/configs/rcu/SRCU-U.boot From e633e63aa907feff98c654c1919101f3d53ebd5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Feb 2021 07:15:41 -0800 Subject: [PATCH 66/77] torture: Make upper-case-only no-dot no-slash scenario names official This commit enforces the de facto restriction on scenario names, which is that they contain neither "/", ".", nor lowercase alphabetic characters. This restriction avoids collisions between scenario names and the torture scripting's files and directories. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index efcbd12d5ddb78..03364f4072bcd2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -104,7 +104,7 @@ do TORTURE_BUILDONLY=1 ;; --configs|--config) - checkarg --configs "(list of config files)" "$#" "$2" '^[^/]\+$' '^--' + checkarg --configs "(list of config files)" "$#" "$2" '^[^/.a-z]\+$' '^--' configs="$configs $2" shift ;; From 7ef0d5a33c81cfb1993f2947c361784b1b02adc8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Feb 2021 14:04:01 -0800 Subject: [PATCH 67/77] torture: De-capitalize TORTURE_SUITE Although it might be unlikely that someone would name a scenario "TORTURE_SUITE", they are within their rights to do so. This commit therefore renames the "TORTURE_SUITE" file in the top-level date-stamped directory within "res" to "torture_suite" to avoid this name collision. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 2 +- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh index 47cf4db10896ca..e01b31b8704410 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh @@ -30,7 +30,7 @@ do resdir=`echo $i | sed -e 's,/$,,' -e 's,/[^/]*$,,'` head -1 $resdir/log fi - TORTURE_SUITE="`cat $i/../TORTURE_SUITE`" + TORTURE_SUITE="`cat $i/../torture_suite`" configfile=`echo $i | sed -e 's,^.*/,,'` rm -f $i/console.log.*.diags kvm-recheck-${TORTURE_SUITE}.sh $i diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 03364f4072bcd2..a1cd05c9ddc479 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -405,7 +405,7 @@ echo Results directory: $resdir/$ds echo $scriptname $args touch $resdir/$ds/log echo $scriptname $args >> $resdir/$ds/log -echo ${TORTURE_SUITE} > $resdir/$ds/TORTURE_SUITE +echo ${TORTURE_SUITE} > $resdir/$ds/torture_suite echo Build directory: `pwd` > $resdir/$ds/testid.txt if test -d .git then From d6100d764cc47100ecabdc704bde5ad0448c87cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 17 Feb 2021 14:40:03 -0800 Subject: [PATCH 68/77] torture: Create a "batches" file for build reuse This commit creates a "batches" file in the res/$ds directory, where $ds is the datestamp. 
This file contains the batches and the number of CPUs, for example: 1 TREE03 16 1 SRCU-P 8 2 TREE07 16 2 TREE01 8 3 TREE02 8 3 TREE04 8 3 TREE05 8 4 SRCU-N 4 4 TRACE01 4 4 TRACE02 4 4 RUDE01 2 4 RUDE01.2 2 4 TASKS01 2 4 TASKS03 2 4 SRCU-t 1 4 SRCU-u 1 4 TASKS02 1 4 TINY01 1 5 TINY02 1 5 TREE09 1 The first column is the batch number, the second is the scenario name (possibly suffixed by a repetition number, as in "RUDE01.2"), and the third is the number of CPUs required by that scenario. The last line of the file (a comment of the form "# cpus=N", omitted from the example above) shows the number of CPUs expected by this batch file, which allows the run to be re-batched if a different number of CPUs is available. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index a1cd05c9ddc479..0add1636789959 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -565,6 +565,18 @@ echo 'ret=$?' >> $T/script echo "cat $T/kvm-recheck.sh.out | tee -a $resdir/$ds/log" >> $T/script echo 'exit $ret' >> $T/script +# Extract the tests and their batches from the script. +egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | + sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' | + awk ' + /^----Start/ { + batchno = $3; + next; + } + { + print batchno, $1, $2 + }' > $T/batches + if test "$dryrun" = script then cat $T/script @@ -583,21 +595,14 @@ then exit 0 elif test "$dryrun" = batches then - # Extract the tests and their batches from the script. - egrep 'Start batch|Starting build\.' $T/script | grep -v ">>" | - sed -e 's/:.*$//' -e 's/^echo //' -e 's/-ovf//' | - awk ' - /^----Start/ { - batchno = $3; - next; - } - { - print batchno, $1, $2 - }' + cat $T/batches + exit 0 else - # Not a dryrun, so run the script. + # Not a dryrun. 
Record the batches and the number of CPUs, then run the script. bash $T/script ret=$? + cp $T/batches $resdir/$ds/batches + echo '#' cpus=$cpus >> $resdir/$ds/batches echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a $resdir/$ds/log exit $ret fi From 7cf86c0b6279d9d12bb697e58c7e8b2184a8f3db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 19 Feb 2021 17:49:58 -0800 Subject: [PATCH 69/77] torture: Add kvm-again.sh to rerun a previous torture-test This commit adds a kvm-again.sh script that, given the results directory of a torture-test run, re-runs that test. This means that the kernels need not be rebuilt, but it also is a step towards running torture tests on remote systems. This commit also adds a kvm-test-1-run-batch.sh script that runs one batch out of the torture test. The idea is to copy a results directory tree to remote systems, then use kvm-test-1-run-batch.sh to run batches on these systems. Signed-off-by: Paul E. McKenney --- .../selftests/rcutorture/bin/kvm-again.sh | 171 ++++++++++++++++++ .../rcutorture/bin/kvm-test-1-run-batch.sh | 67 +++++++ 2 files changed, 238 insertions(+) create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-again.sh create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh new file mode 100755 index 00000000000000..413744093994c7 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Rerun a series of tests under KVM. +# +# Usage: kvm-again.sh /path/to/old/run [ options ] +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +scriptname=$0 +args="$*" + +T=${TMPDIR-/tmp}/kvm-again.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if ! 
test -d tools/testing/selftests/rcutorture/bin +then + echo $scriptname must be run from top-level directory of kernel source tree. + exit 1 +fi + +oldrun=$1 +shift +if ! test -d "$oldrun" +then + echo "Usage: $scriptname /path/to/old/run [ options ]" + exit 1 +fi +if ! cp "$oldrun/batches" $T/batches.oldrun +then + # Later on, can reconstitute this from console.log files. + echo Prior run batches file does not exist: $oldrun/batches + exit 1 +fi + +if test -f "$oldrun/torture_suite" +then + torture_suite="`cat $oldrun/torture_suite`" +elif test -f "$oldrun/TORTURE_SUITE" +then + torture_suite="`cat $oldrun/TORTURE_SUITE`" +else + echo "Prior run torture_suite file does not exist: $oldrun/{torture_suite,TORTURE_SUITE}" + exit 1 +fi + +KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM +PATH=${KVM}/bin:$PATH; export PATH +. functions.sh + +dryrun= +default_link="cp -R" +rundir="`pwd`/tools/testing/selftests/rcutorture/res/`date +%Y.%m.%d-%H.%M.%S-again`" + +startdate="`date`" +starttime="`get_starttime`" + +usage () { + echo "Usage: $scriptname $oldrun [ arguments ]:" + echo " --dryrun" + echo " --link hard|soft|copy" + echo " --remote" + echo " --rundir /new/res/path" + exit 1 +} + +while test $# -gt 0 +do + case "$1" in + --dryrun) + dryrun=1 + ;; + --link) + checkarg --link "hard|soft|copy" "$#" "$2" 'hard\|soft\|copy' '^--' + case "$2" in + copy) + arg_link="cp -R" + ;; + hard) + arg_link="cp -Rl" + ;; + soft) + arg_link="cp -Rs" + ;; + esac + shift + ;; + --remote) + arg_remote=1 + default_link="cp -as" + ;; + --rundir) + checkarg --rundir "(absolute pathname)" "$#" "$2" '^/' '^error' + rundir=$2 + if test -e "$rundir" + then + echo "--rundir $2: Already exists." + usage + fi + shift + ;; + *) + echo Unknown argument $1 + usage + ;; + esac + shift +done +if test -z "$arg_link" +then + arg_link="$default_link" +fi + +echo ---- Re-run results directory: $rundir + +# Copy old run directory tree over and adjust. +mkdir -p "`dirname "$rundir"`" +if ! 
$arg_link "$oldrun" "$rundir" +then + echo "Cannot copy from $oldrun to $rundir." + usage +fi +rm -f "$rundir"/*/{console.log,console.log.diags,qemu_pid,qemu-retval,Warnings,kvm-test-1-run.sh.out,kvm-test-1-run-qemu.sh.out,vmlinux} "$rundir"/log +echo $oldrun > "$rundir/re-run" +if ! test -d "$rundir/../../bin" +then + $arg_link "$oldrun/../../bin" "$rundir/../.." +fi +for i in $rundir/*/qemu-cmd +do + cp "$i" $T + qemu_cmd_dir="`dirname "$i"`" + kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" + kvm-transform.sh $kernel_dir/bzImage $qemu_cmd_dir/console.log < $T/qemu-cmd > $i + echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i +done +grep -v '^#' $T/batches.oldrun | awk ' +BEGIN { + oldbatch = 1; +} + +{ + if (oldbatch != $1) { + print "kvm-test-1-run-batch.sh" curbatch; + curbatch = ""; + oldbatch = $1; + } + curbatch = curbatch " " $2; +} + +END { + print "kvm-test-1-run-batch.sh" curbatch +}' > $T/runbatches.sh + +if test -n "$dryrun" +then + echo ---- Dryrun complete, directory: $rundir | tee -a "$rundir/log" +else + ( cd "$rundir"; sh $T/runbatches.sh ) + kcsan-collapse.sh "$rundir" | tee -a "$rundir/log" + echo | tee -a "$rundir/log" + echo ---- Results directory: $rundir | tee -a "$rundir/log" + kvm-recheck.sh "$rundir" > $T/kvm-recheck.sh.out 2>&1 + ret=$? + cat $T/kvm-recheck.sh.out | tee -a "$rundir/log" + echo " --- Done at `date` (`get_starttime_duration $starttime`) exitcode $ret" | tee -a "$rundir/log" + exit $ret +fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh new file mode 100755 index 00000000000000..7ea0809e229e93 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-batch.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Carry out a kvm-based run for the specified batch of scenarios, which +# might have been built by --build-only kvm.sh run. +# +# Usage: kvm-test-1-run-batch.sh SCENARIO [ SCENARIO ... 
] +# +# Each SCENARIO is the name of a directory in the current directory +# containing a ready-to-run qemu-cmd file. +# +# Copyright (C) 2021 Facebook, Inc. +# +# Authors: Paul E. McKenney + +T=${TMPDIR-/tmp}/kvm-test-1-run-batch.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +echo ---- Running batch $* +# Check arguments +runfiles= +for i in "$@" +do + if ! echo $i | grep -q '^[^/.a-z]\+\(\.[0-9]\+\)\?$' + then + echo Bad scenario name: \"$i\" 1>&2 + exit 1 + fi + if ! test -d "$i" + then + echo Scenario name not a directory: \"$i\" 1>&2 + exit 2 + fi + if ! test -f "$i/qemu-cmd" + then + echo Scenario lacks a command file: \"$i/qemu-cmd\" 1>&2 + exit 3 + fi + rm -f $i/build.* + touch $i/build.run + runfiles="$runfiles $i/build.run" +done + +# Extract settings from the qemu-cmd file. +grep '^#' $1/qemu-cmd | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + +# Start up jitter, start each scenario, wait, end jitter. +echo ---- System running test: `uname -a` +echo ---- Starting kernels. `date` | tee -a log +$TORTURE_JITTER_START +for i in "$@" +do + echo ---- System running test: `uname -a` > $i/kvm-test-1-run-qemu.sh.out + echo > $i/kvm-test-1-run-qemu.sh.out + kvm-test-1-run-qemu.sh $i >> $i/kvm-test-1-run-qemu.sh.out 2>&1 & +done +for i in $runfiles +do + while ls $i > /dev/null 2>&1 + do + : + done +done +echo ---- All kernel runs complete. `date` | tee -a log +$TORTURE_JITTER_STOP From 00505165cf4484dffc488259d59689845ba77939 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Feb 2021 14:12:58 -0800 Subject: [PATCH 70/77] torture: Add --duration argument to kvm-again.sh This commit adds a --duration argument to kvm-again.sh to allow the user to override the --duration specified for the original kvm.sh run. Signed-off-by: Paul E. 
McKenney --- .../selftests/rcutorture/bin/kvm-again.sh | 25 +++++++++++++++- .../selftests/rcutorture/bin/kvm-transform.sh | 29 +++++++++++++++---- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 413744093994c7..e7e54581d23e2f 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -52,6 +52,7 @@ PATH=${KVM}/bin:$PATH; export PATH . functions.sh dryrun= +dur= default_link="cp -R" rundir="`pwd`/tools/testing/selftests/rcutorture/res/`date +%Y.%m.%d-%H.%M.%S-again`" @@ -61,6 +62,7 @@ starttime="`get_starttime`" usage () { echo "Usage: $scriptname $oldrun [ arguments ]:" echo " --dryrun" + echo " --duration minutes | s | h | d" echo " --link hard|soft|copy" echo " --remote" echo " --rundir /new/res/path" @@ -73,6 +75,23 @@ do --dryrun) dryrun=1 ;; + --duration) + checkarg --duration "(minutes)" $# "$2" '^[0-9][0-9]*\(s\|m\|h\|d\|\)$' '^error' + mult=60 + if echo "$2" | grep -q 's$' + then + mult=1 + elif echo "$2" | grep -q 'h$' + then + mult=3600 + elif echo "$2" | grep -q 'd$' + then + mult=86400 + fi + ts=`echo $2 | sed -e 's/[smhd]$//'` + dur=$(($ts*mult)) + shift + ;; --link) checkarg --link "hard|soft|copy" "$#" "$2" 'hard\|soft\|copy' '^--' case "$2" in @@ -134,7 +153,11 @@ do cp "$i" $T qemu_cmd_dir="`dirname "$i"`" kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" - kvm-transform.sh $kernel_dir/bzImage $qemu_cmd_dir/console.log < $T/qemu-cmd > $i + kvm-transform.sh $kernel_dir/bzImage $qemu_cmd_dir/console.log $dur < $T/qemu-cmd > $i + if test -n "$dur" + then + echo "# seconds=$dur" >> $i + fi echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i done grep -v '^#' $T/batches.oldrun | awk ' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh index c45a953ef39319..162dddbcde00b5 100755 --- 
a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -3,7 +3,7 @@ # # Transform a qemu-cmd file to allow reuse. # -# Usage: kvm-transform.sh bzImage console.log < qemu-cmd-in > qemu-cmd-out +# Usage: kvm-transform.sh bzImage console.log [ seconds ] < qemu-cmd-in > qemu-cmd-out # # bzImage: Kernel and initrd from the same prior kvm.sh run. # console.log: File into which to place console output. @@ -29,20 +29,37 @@ then echo "Need console log file name." exit 1 fi +seconds=$3 +if test -n "$seconds" && echo $seconds | grep -q '[^0-9]' +then + echo "Invalid duration, should be numeric in seconds: '$seconds'" + exit 1 +fi + +awk -v image="$image" -v consolelog="$consolelog" -v seconds="$seconds" ' +/^#/ { + print $0; + next; +} -awk -v image="$image" -v consolelog="$consolelog" ' { line = ""; for (i = 1; i <= NF; i++) { - if (line == "") + if ("" seconds != "" && $i ~ /\.shutdown_secs=[0-9]*$/) { + sub(/[0-9]*$/, seconds, $i); + if (line == "") + line = $i; + else + line = line " " $i; + } else if (line == "") { line = $i; - else + } else { line = line " " $i; + } if ($i == "-serial") { i++; line = line " file:" consolelog; - } - if ($i == "-kernel") { + } else if ($i == "-kernel") { i++; line = line " " image; } From 018629e909ffcabfc657388094371f20ba90649f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Feb 2021 14:58:41 -0800 Subject: [PATCH 71/77] torture: Make kvm-transform.sh update jitter commands When rerunning an old run using kvm-again.sh, the jitter commands will re-use the original "res" directory. This works, but is clearly an accident waiting to happen. And this accident will happen with remote runs, where the original directory lives on some other system. This commit therefore updates the qemu-cmd commands to use the new res directory created for this specific run. Signed-off-by: Paul E. 
McKenney --- .../selftests/rcutorture/bin/kvm-again.sh | 3 ++- .../selftests/rcutorture/bin/kvm-transform.sh | 23 ++++++++++++++++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index e7e54581d23e2f..3fb57ce5aafca1 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -153,7 +153,8 @@ do cp "$i" $T qemu_cmd_dir="`dirname "$i"`" kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" - kvm-transform.sh $kernel_dir/bzImage $qemu_cmd_dir/console.log $dur < $T/qemu-cmd > $i + jitter_dir="`dirname "$kernel_dir"`" + kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" $dur < $T/qemu-cmd > $i if test -n "$dur" then echo "# seconds=$dur" >> $i diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh index 162dddbcde00b5..e9dcbce17bbdfc 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -3,7 +3,7 @@ # # Transform a qemu-cmd file to allow reuse. # -# Usage: kvm-transform.sh bzImage console.log [ seconds ] < qemu-cmd-in > qemu-cmd-out +# Usage: kvm-transform.sh bzImage console.log jitter_dir [ seconds ] < qemu-cmd-in > qemu-cmd-out # # bzImage: Kernel and initrd from the same prior kvm.sh run. # console.log: File into which to place console output. @@ -29,14 +29,31 @@ then echo "Need console log file name." exit 1 fi -seconds=$3 +jitter_dir="$3" +if test -z "$jitter_dir" || ! 
test -d "$jitter_dir" +then + echo "Need valid jitter directory: '$jitter_dir'" + exit 1 +fi +seconds="$4" if test -n "$seconds" && echo $seconds | grep -q '[^0-9]' then echo "Invalid duration, should be numeric in seconds: '$seconds'" exit 1 fi -awk -v image="$image" -v consolelog="$consolelog" -v seconds="$seconds" ' +awk -v image="$image" -v consolelog="$consolelog" -v jitter_dir="$jitter_dir" \ + -v seconds="$seconds" ' +/^# TORTURE_JITTER_START=/ { + print "# TORTURE_JITTER_START=\". jitterstart.sh " $4 " " jitter_dir " " $6 " " $7; + next; +} + +/^# TORTURE_JITTER_STOP=/ { + print "# TORTURE_JITTER_STOP=\". jitterstop.sh " " " jitter_dir " " $5; + next; +} + /^#/ { print $0; next; From a5dbe2524f553a1283b3364ff91e96bfb618ceab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Feb 2021 12:07:39 -0800 Subject: [PATCH 72/77] torture: Make TORTURE_TRUST_MAKE available in kvm-again.sh environment Because the TORTURE_TRUST_MAKE environment variable is not recorded, kvm-again.sh runs can result in the parse-build.sh script emitting false-positive "BUG: TREE03 no build" messages. These messages are intended to complain about any lack of compiler invocations when the --trust-make flag is not given to kvm.sh. However, when this flag is given to kvm.sh (and thus when TORTURE_TRUST_MAKE=y), lack of compiler invocations is expected behavior when rebuilding from identical source code. This commit therefore makes kvm-test-1-run.sh record the value of the TORTURE_TRUST_MAKE environment variable as an additional comment in the qemu-cmd file, and also makes kvm-again.sh reconstitute that variable from that comment. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-again.sh | 5 +++++ tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 1 + 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 3fb57ce5aafca1..f1c80b02af58dc 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -161,6 +161,11 @@ do fi echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i done + +# Extract settings from the last qemu-cmd file transformed above. +grep '^#' $i | sed -e 's/^# //' > $T/qemu-cmd-settings +. $T/qemu-cmd-settings + grep -v '^#' $T/batches.oldrun | awk ' BEGIN { oldbatch = 1; diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index a386ca8dd690bb..420ed5ce9d32f5 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -204,6 +204,7 @@ echo "# seconds=$seconds" >> $resdir/qemu-cmd echo "# TORTURE_KCONFIG_GDB_ARG=\"$TORTURE_KCONFIG_GDB_ARG\"" >> $resdir/qemu-cmd echo "# TORTURE_JITTER_START=\"$TORTURE_JITTER_START\"" >> $resdir/qemu-cmd echo "# TORTURE_JITTER_STOP=\"$TORTURE_JITTER_STOP\"" >> $resdir/qemu-cmd +echo "# TORTURE_TRUST_MAKE=\"$TORTURE_TRUST_MAKE\"; export TORTURE_TRUST_MAKE" >> $resdir/qemu-cmd if test -n "$TORTURE_BUILDONLY" then From 03edf700db335b9375c18310d59d0a0ab6c850df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Feb 2021 13:12:41 -0800 Subject: [PATCH 73/77] torture: Print proper vmlinux path for kvm-again.sh runs The kvm-again.sh script does not copy over the vmlinux files due to their large size. This means that a gdb run must use the vmlinux file from the original "res" directory. This commit therefore finds that directory and prints it out so that the user can copy and paste the gdb command just as for the initial run. 
Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm-again.sh | 5 ++++- .../testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index f1c80b02af58dc..668636ee3dafbe 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -159,7 +159,10 @@ do then echo "# seconds=$dur" >> $i fi - echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i + if test -n "$arg_remote" + then + echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i + fi done # Extract settings from the last qemu-cmd file transformed above. diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh index 576a9b761b415f..5b1aa2a4f3f695 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run-qemu.sh @@ -67,7 +67,11 @@ then base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` if ! test -f $base_resdir/vmlinux then - base_resdir=/path/to + base_resdir="`cat re-run`/$resdir" + if ! test -f $base_resdir/vmlinux + then + base_resdir=/path/to + fi fi echo Waiting for you to attach a debug session, for example: > /dev/tty echo " gdb $base_resdir/vmlinux" > /dev/tty From a1ab2e89f36d678512a50cbebf6afc4201f41a31 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Feb 2021 14:33:03 -0800 Subject: [PATCH 74/77] torture: Consolidate qemu-cmd duration editing into kvm-transform.sh Currently, kvm-again.sh updates the duration in the "seconds=" comment in the qemu-cmd file, but kvm-transform.sh updates the duration in the actual qemu command arguments. This is an accident waiting to happen. This commit therefore consolidates these updates into kvm-transform.sh. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm-again.sh | 4 ---- tools/testing/selftests/rcutorture/bin/kvm-transform.sh | 8 ++++++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-again.sh b/tools/testing/selftests/rcutorture/bin/kvm-again.sh index 668636ee3dafbe..46e47a00a7db4d 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-again.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-again.sh @@ -155,10 +155,6 @@ do kernel_dir="`echo $qemu_cmd_dir | sed -e 's/\.[0-9]\+$//'`" jitter_dir="`dirname "$kernel_dir"`" kvm-transform.sh "$kernel_dir/bzImage" "$qemu_cmd_dir/console.log" "$jitter_dir" $dur < $T/qemu-cmd > $i - if test -n "$dur" - then - echo "# seconds=$dur" >> $i - fi if test -n "$arg_remote" then echo "# TORTURE_KCONFIG_GDB_ARG=''" >> $i diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh index e9dcbce17bbdfc..d40b4e60a50cbb 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh @@ -44,6 +44,14 @@ fi awk -v image="$image" -v consolelog="$consolelog" -v jitter_dir="$jitter_dir" \ -v seconds="$seconds" ' +/^# seconds=/ { + if (seconds == "") + print $0; + else + print "# seconds=" seconds; + next; +} + /^# TORTURE_JITTER_START=/ { print "# TORTURE_JITTER_START=\". jitterstart.sh " $4 " " jitter_dir " " $6 " " $7; next; From 114e4a4b4884c14ebd35874cbe3e1ca0d38efa5d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 27 Feb 2021 20:55:57 -0800 Subject: [PATCH 75/77] torture: Fix kvm.sh --datestamp regex check Some versions of grep are happy to interpret a nonsensically placed "-" within a "[]" pattern as a dash, while others give an error message. This commit therefore places the "-" at the end of the expression where it was supposed to be in the first place. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 0add1636789959..6bf00a003d3d1e 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -120,7 +120,7 @@ do shift ;; --datestamp) - checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._-/]*$' '^--' + checkarg --datestamp "(relative pathname)" "$#" "$2" '^[a-zA-Z0-9._/-]*$' '^--' ds=$2 shift ;; From 0909fc2b2c41aae50a18a36ac2858d156f521871 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Feb 2021 17:36:06 -0800 Subject: [PATCH 76/77] rcu: Provide polling interfaces for Tiny RCU grace periods There is a need for a non-blocking polling interface for RCU grace periods, so this commit supplies start_poll_synchronize_rcu() and poll_state_synchronize_rcu() for this purpose. Note that the existing get_state_synchronize_rcu() may be used if future grace periods are inevitable (perhaps due to a later call_rcu() invocation). The new start_poll_synchronize_rcu() is to be used if future grace periods might not otherwise happen. Finally, poll_state_synchronize_rcu() provides a lockless check for a grace period having elapsed since the corresponding call to either of the get_state_synchronize_rcu() or start_poll_synchronize_rcu(). As with get_state_synchronize_rcu(), the return value from either get_state_synchronize_rcu() or start_poll_synchronize_rcu() is passed in to a later call to either poll_state_synchronize_rcu() or the existing (might_sleep) cond_synchronize_rcu(). [ paulmck: Revert cond_synchronize_rcu() to might_sleep() per Frederic Weisbecker feedback. ] Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 7 +++---- kernel/rcu/tiny.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 2a97334eb7865f..35e0be326ffc73 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -17,10 +17,9 @@ /* Never flag non-existent other CPUs! */ static inline bool rcu_eqs_special_set(int cpu) { return false; } -static inline unsigned long get_state_synchronize_rcu(void) -{ - return 0; -} +unsigned long get_state_synchronize_rcu(void); +unsigned long start_poll_synchronize_rcu(void); +bool poll_state_synchronize_rcu(unsigned long oldstate); static inline void cond_synchronize_rcu(unsigned long oldstate) { diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index aa897c3f2e92c6..c8a029fbb11434 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -32,12 +32,14 @@ struct rcu_ctrlblk { struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ + unsigned long gp_seq; /* Grace-period counter. */ }; /* Definition for rcupdate control block. */ static struct rcu_ctrlblk rcu_ctrlblk = { .donetail = &rcu_ctrlblk.rcucblist, .curtail = &rcu_ctrlblk.rcucblist, + .gp_seq = 0 - 300UL, }; void rcu_barrier(void) @@ -56,6 +58,7 @@ void rcu_qs(void) rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; raise_softirq_irqoff(RCU_SOFTIRQ); } + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 1); local_irq_restore(flags); } @@ -177,6 +180,43 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); +/* + * Return a grace-period-counter "cookie". For more information, + * see the Tree RCU header comment. 
+ */ +unsigned long get_state_synchronize_rcu(void) +{ + return READ_ONCE(rcu_ctrlblk.gp_seq); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); + +/* + * Return a grace-period-counter "cookie" and ensure that a future grace + * period completes. For more information, see the Tree RCU header comment. + */ +unsigned long start_poll_synchronize_rcu(void) +{ + unsigned long gp_seq = get_state_synchronize_rcu(); + + if (unlikely(is_idle_task(current))) { + /* force scheduling for rcu_qs() */ + resched_cpu(0); + } + return gp_seq; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); + +/* + * Return true if the grace period corresponding to oldstate has completed + * and false otherwise. For more information, see the Tree RCU header + * comment. + */ +bool poll_state_synchronize_rcu(unsigned long oldstate) +{ + return READ_ONCE(rcu_ctrlblk.gp_seq) != oldstate; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); + void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); From 7ac3fdf099bf784794eb944e0ba5bb69867ca06d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 25 Feb 2021 20:56:10 -0800 Subject: [PATCH 77/77] rcutorture: Test start_poll_synchronize_rcu() and poll_state_synchronize_rcu() This commit causes rcutorture to test the new start_poll_synchronize_rcu() and poll_state_synchronize_rcu() functions. Because of the difficulty of determining the nature of a synchronous RCU grace period (expedited or not), the test that insisted that poll_state_synchronize_rcu() detect an intervening synchronize_rcu() had to be dropped. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 99657ffa66887a..956e6bfd7e7745 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -494,6 +494,8 @@ static struct rcu_torture_ops rcu_ops = { .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, .get_gp_state = get_state_synchronize_rcu, + .start_gp_poll = start_poll_synchronize_rcu, + .poll_gp_state = poll_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, .call = call_rcu, .cb_barrier = rcu_barrier, @@ -1223,14 +1225,6 @@ rcu_torture_writer(void *arg) WARN_ON_ONCE(1); break; } - if (cur_ops->get_gp_state && cur_ops->poll_gp_state) - WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && - !cur_ops->poll_gp_state(cookie), - "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", - __func__, - rcu_torture_writer_state_getname(), - rcu_torture_writer_state, - cookie, cur_ops->get_gp_state()); } WRITE_ONCE(rcu_torture_current_version, rcu_torture_current_version + 1); @@ -1589,7 +1583,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) preempt_enable(); if (cur_ops->get_gp_state && cur_ops->poll_gp_state) WARN_ONCE(cur_ops->poll_gp_state(cookie), - "%s: Cookie check 3 failed %s(%d) %lu->%lu\n", + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", __func__, rcu_torture_writer_state_getname(), rcu_torture_writer_state,