From 752be5c5c910a1a270e97b5b39c7a7d06a39e7c6 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Mon, 1 Aug 2022 11:32:25 +0900
Subject: [PATCH 01/44] tracing/eprobe: Add eprobe filter support

Add the filter option to the event probe. This is useful if user wants
to derive a new event based on the condition of the original event.

E.g.
 echo 'e:egroup/stat_runtime_4core sched/sched_stat_runtime \
        runtime=$runtime:u32 if cpu < 4' >> ../dynamic_events

Then it can filter the events only on first 4 cores.
Note that the fields used for 'if' must be the fields in the original
events, not eprobe events.

Link: https://lkml.kernel.org/r/165932114513.2850673.2592206685744598080.stgit@devnote2

Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_eprobe.c | 104 +++++++++++++++++++++++++++++++++---
 kernel/trace/trace_probe.h  |   3 +-
 2 files changed, 98 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 1783e347891249..78299d3724a294 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -26,6 +26,9 @@ struct trace_eprobe {
 	/* tracepoint event */
 	const char *event_name;
 
+	/* filter string for the tracepoint */
+	char *filter_str;
+
 	struct trace_event_call *event;
 
 	struct dyn_event	devent;
@@ -664,14 +667,15 @@ static struct event_trigger_data *
 new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
 {
 	struct event_trigger_data *trigger;
+	struct event_filter *filter = NULL;
 	struct eprobe_data *edata;
+	int ret;
 
 	edata = kzalloc(sizeof(*edata), GFP_KERNEL);
 	trigger = kzalloc(sizeof(*trigger), GFP_KERNEL);
 	if (!trigger || !edata) {
-		kfree(edata);
-		kfree(trigger);
-		return ERR_PTR(-ENOMEM);
+		ret = -ENOMEM;
+		goto error;
 	}
 
 	trigger->flags = EVENT_TRIGGER_FL_PROBE;
@@ -686,13 +690,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file)
 	trigger->cmd_ops = &event_trigger_cmd;
 
 	INIT_LIST_HEAD(&trigger->list);
-	RCU_INIT_POINTER(trigger->filter, NULL);
+
+	if (ep->filter_str) {
+		ret = create_event_filter(file->tr, file->event_call,
+					ep->filter_str, false, &filter);
+		if (ret)
+			goto error;
+	}
+	RCU_INIT_POINTER(trigger->filter, filter);
 
 	edata->file = file;
 	edata->ep = ep;
 	trigger->private_data = edata;
 
 	return trigger;
+error:
+	free_event_filter(filter);
+	kfree(edata);
+	kfree(trigger);
+	return ERR_PTR(ret);
 }
 
 static int enable_eprobe(struct trace_eprobe *ep,
@@ -726,6 +742,7 @@ static int disable_eprobe(struct trace_eprobe *ep,
 {
 	struct event_trigger_data *trigger = NULL, *iter;
 	struct trace_event_file *file;
+	struct event_filter *filter;
 	struct eprobe_data *edata;
 
 	file = find_event_file(tr, ep->event_system, ep->event_name);
@@ -752,6 +769,10 @@ static int disable_eprobe(struct trace_eprobe *ep,
 	/* Make sure nothing is using the edata or trigger */
 	tracepoint_synchronize_unregister();
 
+	filter = rcu_access_pointer(trigger->filter);
+
+	if (filter)
+		free_event_filter(filter);
 	kfree(edata);
 	kfree(trigger);
 
@@ -927,12 +948,62 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[
 	return ret;
 }
 
+static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[])
+{
+	struct event_filter *dummy;
+	int i, ret, len = 0;
+	char *p;
+
+	if (argc == 0) {
+		trace_probe_log_err(0, NO_EP_FILTER);
+		return -EINVAL;
+	}
+
+	/* Recover the filter string */
+	for (i = 0; i < argc; i++)
+		len += strlen(argv[i]) + 1;
+
+	ep->filter_str = kzalloc(len, GFP_KERNEL);
+	if (!ep->filter_str)
+		return -ENOMEM;
+
+	p = ep->filter_str;
+	for (i = 0; i < argc; i++) {
+		ret = snprintf(p, len, "%s ", argv[i]);
+		if (ret < 0)
+			goto error;
+		if (ret > len) {
+			ret = -E2BIG;
+			goto error;
+		}
+		p += ret;
+		len -= ret;
+	}
+	p[-1] = '\0';
+
+	/*
+	 * Ensure the filter string can be parsed correctly. Note, this
+	 * filter string is for the original event, not for the eprobe.
+	 */
+	ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str,
+				  true, &dummy);
+	free_event_filter(dummy);
+	if (ret)
+		goto error;
+
+	return 0;
+error:
+	kfree(ep->filter_str);
+	ep->filter_str = NULL;
+	return ret;
+}
+
 static int __trace_eprobe_create(int argc, const char *argv[])
 {
 	/*
 	 * Argument syntax:
-	 *      e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS]
-	 * Fetch args:
+	 *      e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER]
+	 * Fetch args (no space):
 	 *  <name>=$<field>[:TYPE]
 	 */
 	const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
@@ -942,8 +1013,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
 	char buf1[MAX_EVENT_NAME_LEN];
 	char buf2[MAX_EVENT_NAME_LEN];
 	char gbuf[MAX_EVENT_NAME_LEN];
-	int ret = 0;
-	int i;
+	int ret = 0, filter_idx = 0;
+	int i, filter_cnt;
 
 	if (argc < 2 || argv[0][0] != 'e')
 		return -ECANCELED;
@@ -973,6 +1044,15 @@ static int __trace_eprobe_create(int argc, const char *argv[])
 		event = buf1;
 	}
 
+	for (i = 2; i < argc; i++) {
+		if (!strcmp(argv[i], "if")) {
+			filter_idx = i + 1;
+			filter_cnt = argc - filter_idx;
+			argc = i;
+			break;
+		}
+	}
+
 	mutex_lock(&event_mutex);
 	event_call = find_and_get_event(sys_name, sys_event);
 	ep = alloc_event_probe(group, event, event_call, argc - 2);
@@ -988,6 +1068,14 @@ static int __trace_eprobe_create(int argc, const char *argv[])
 		goto error;
 	}
 
+	if (filter_idx) {
+		trace_probe_log_set_index(filter_idx);
+		ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx);
+		if (ret)
+			goto parse_error;
+	} else
+		ep->filter_str = NULL;
+
 	argc -= 2; argv += 2;
 	/* parse arguments */
 	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 3b3869ae8cfd15..de38f1c037762e 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -445,7 +445,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 	C(SAME_PROBE,		"There is already the exact same probe event"),\
 	C(NO_EVENT_INFO,	"This requires both group and event name to attach"),\
 	C(BAD_ATTACH_EVENT,	"Attached event does not exist"),\
-	C(BAD_ATTACH_ARG,	"Attached event does not have this field"),
+	C(BAD_ATTACH_ARG,	"Attached event does not have this field"),\
+	C(NO_EP_FILTER,		"No filter rule after 'if'"),
 
 #undef C
 #define C(a, b)		TP_ERR_##a

From 9e14bae7d049cfdd5ab22cb200bc7ea847cfa8c9 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Mon, 1 Aug 2022 11:32:34 +0900
Subject: [PATCH 02/44] selftests/ftrace: Add eprobe syntax error testcase

Add a syntax error test case for eprobe as same as kprobes.

Link: https://lkml.kernel.org/r/165932115471.2850673.8014722990775242727.stgit@devnote2

Cc: Tzvetomir Stoyanov <tz.stoyanov@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 .../test.d/dynevent/eprobes_syntax_errors.tc  | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc

diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
new file mode 100644
index 00000000000000..fc1daac7f06682
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
@@ -0,0 +1,27 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Event probe event parser error log check
+# requires: dynamic_events events/syscalls/sys_enter_openat "<attached-group>.<attached-event> [<args>]":README error_log
+
+check_error() { # command-with-error-pos-by-^
+    ftrace_errlog_check 'event_probe' "$1" 'dynamic_events'
+}
+
+check_error 'e ^a.'			# NO_EVENT_INFO
+check_error 'e ^.b'			# NO_EVENT_INFO
+check_error 'e ^a.b'			# BAD_ATTACH_EVENT
+check_error 'e syscalls/sys_enter_openat ^foo'	# BAD_ATTACH_ARG
+check_error 'e:^/bar syscalls/sys_enter_openat'	# NO_GROUP_NAME
+check_error 'e:^12345678901234567890123456789012345678901234567890123456789012345/bar syscalls/sys_enter_openat'	# GROUP_TOO_LONG
+
+check_error 'e:^foo.1/bar syscalls/sys_enter_openat'	# BAD_GROUP_NAME
+check_error 'e:^ syscalls/sys_enter_openat'		# NO_EVENT_NAME
+check_error 'e:foo/^12345678901234567890123456789012345678901234567890123456789012345 syscalls/sys_enter_openat'	# EVENT_TOO_LONG
+check_error 'e:foo/^bar.1 syscalls/sys_enter_openat'	# BAD_EVENT_NAME
+
+check_error 'e:foo/bar syscalls/sys_enter_openat arg=^dfd'	# BAD_FETCH_ARG
+check_error 'e:foo/bar syscalls/sys_enter_openat ^arg=$foo'	# BAD_ATTACH_ARG
+
+check_error 'e:foo/bar syscalls/sys_enter_openat if ^'	# NO_EP_FILTER
+
+exit 0

From 01c44bf8337ac79f31aa090e842bf3b9dd762aca Mon Sep 17 00:00:00 2001
From: Zeng Heng <zengheng4@huawei.com>
Date: Wed, 24 Aug 2022 11:43:56 +0800
Subject: [PATCH 03/44] rv/monitors: add 'static' qualifier for local symbols

The sparse tool complains as follows:

kernel/trace/rv/monitors/wwnr/wwnr.c:18:19:
warning: symbol 'rv_wwnr' was not declared. Should it be static?

The `rv_wwnr` symbol is not dereferenced by other extern files,
so add static qualifier for it.

So does wip module.

Link: https://lkml.kernel.org/r/20220824034357.2014202-2-zengheng4@huawei.com

Cc: <mingo@redhat.com>
Fixes:	ccc319dcb450 ("rv/monitor: Add the wwnr monitor")
Fixes:	8812d21219b9 ("rv/monitor: Add the wip monitor skeleton created by dot2k")
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/rv/monitors/wip/wip.c   | 4 ++--
 kernel/trace/rv/monitors/wwnr/wwnr.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index 83cace53b9fafe..1a989bc142f346 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -16,7 +16,7 @@
 
 #include "wip.h"
 
-struct rv_monitor rv_wip;
+static struct rv_monitor rv_wip;
 DECLARE_DA_MON_PER_CPU(wip, unsigned char);
 
 static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
@@ -60,7 +60,7 @@ static void disable_wip(void)
 	da_monitor_destroy_wip();
 }
 
-struct rv_monitor rv_wip = {
+static struct rv_monitor rv_wip = {
 	.name = "wip",
 	.description = "wakeup in preemptive per-cpu testing monitor.",
 	.enable = enable_wip,
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 599225d9cf3824..a063b93c6a1d37 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -15,7 +15,7 @@
 
 #include "wwnr.h"
 
-struct rv_monitor rv_wwnr;
+static struct rv_monitor rv_wwnr;
 DECLARE_DA_MON_PER_TASK(wwnr, unsigned char);
 
 static void handle_switch(void *data, bool preempt, struct task_struct *p,
@@ -59,7 +59,7 @@ static void disable_wwnr(void)
 	da_monitor_destroy_wwnr();
 }
 
-struct rv_monitor rv_wwnr = {
+static struct rv_monitor rv_wwnr = {
 	.name = "wwnr",
 	.description = "wakeup while not running per-task testing model.",
 	.enable = enable_wwnr,

From 4359a011e259a4608afc7fb3635370c9d4ba5943 Mon Sep 17 00:00:00 2001
From: Zeng Heng <zengheng4@huawei.com>
Date: Wed, 24 Aug 2022 11:43:57 +0800
Subject: [PATCH 04/44] rv/dot2K: add 'static' qualifier for local variable

Following Daniel's suggestion, fix similar warning
in template files, which would prevent new monitors
from such warning.

Link: https://lkml.kernel.org/r/20220824034357.2014202-3-zengheng4@huawei.com

Cc: <mingo@redhat.com>
Fixes:	24bce201d798 ("tools/rv: Add dot2k")
Suggested-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Zeng Heng <zengheng4@huawei.com>
Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 tools/verification/dot2/dot2k_templates/main_global.c   | 4 ++--
 tools/verification/dot2/dot2k_templates/main_per_cpu.c  | 4 ++--
 tools/verification/dot2/dot2k_templates/main_per_task.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/verification/dot2/dot2k_templates/main_global.c b/tools/verification/dot2/dot2k_templates/main_global.c
index f4b712dbc92e07..dcd1162dced8b2 100644
--- a/tools/verification/dot2/dot2k_templates/main_global.c
+++ b/tools/verification/dot2/dot2k_templates/main_global.c
@@ -27,7 +27,7 @@
  *
  * The rv monitor reference is needed for the monitor declaration.
  */
-struct rv_monitor rv_MODEL_NAME;
+static struct rv_monitor rv_MODEL_NAME;
 DECLARE_DA_MON_GLOBAL(MODEL_NAME, MIN_TYPE);
 
 /*
@@ -63,7 +63,7 @@ TRACEPOINT_DETACH
 /*
  * This is the monitor register section.
  */
-struct rv_monitor rv_MODEL_NAME = {
+static struct rv_monitor rv_MODEL_NAME = {
 	.name = "MODEL_NAME",
 	.description = "auto-generated MODEL_NAME",
 	.enable = enable_MODEL_NAME,
diff --git a/tools/verification/dot2/dot2k_templates/main_per_cpu.c b/tools/verification/dot2/dot2k_templates/main_per_cpu.c
index 4080d1ca3354a7..8f877e86a22fc5 100644
--- a/tools/verification/dot2/dot2k_templates/main_per_cpu.c
+++ b/tools/verification/dot2/dot2k_templates/main_per_cpu.c
@@ -27,7 +27,7 @@
  *
  * The rv monitor reference is needed for the monitor declaration.
  */
-struct rv_monitor rv_MODEL_NAME;
+static struct rv_monitor rv_MODEL_NAME;
 DECLARE_DA_MON_PER_CPU(MODEL_NAME, MIN_TYPE);
 
 /*
@@ -63,7 +63,7 @@ TRACEPOINT_DETACH
 /*
  * This is the monitor register section.
  */
-struct rv_monitor rv_MODEL_NAME = {
+static struct rv_monitor rv_MODEL_NAME = {
 	.name = "MODEL_NAME",
 	.description = "auto-generated MODEL_NAME",
 	.enable = enable_MODEL_NAME,
diff --git a/tools/verification/dot2/dot2k_templates/main_per_task.c b/tools/verification/dot2/dot2k_templates/main_per_task.c
index 89197175384f7e..8c2fdb82463446 100644
--- a/tools/verification/dot2/dot2k_templates/main_per_task.c
+++ b/tools/verification/dot2/dot2k_templates/main_per_task.c
@@ -27,7 +27,7 @@
  *
  * The rv monitor reference is needed for the monitor declaration.
  */
-struct rv_monitor rv_MODEL_NAME;
+static struct rv_monitor rv_MODEL_NAME;
 DECLARE_DA_MON_PER_TASK(MODEL_NAME, MIN_TYPE);
 
 /*
@@ -63,7 +63,7 @@ TRACEPOINT_DETACH
 /*
  * This is the monitor register section.
  */
-struct rv_monitor rv_MODEL_NAME = {
+static struct rv_monitor rv_MODEL_NAME = {
 	.name = "MODEL_NAME",
 	.description = "auto-generated MODEL_NAME",
 	.enable = enable_MODEL_NAME,

From b7b037eb5f548cc947e743d456d66eb110316f1e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 6 Sep 2022 18:53:14 -0400
Subject: [PATCH 05/44] tracing: Add numeric delta time to the trace event
 benchmark

In order to testing filtering and histograms via the trace event
benchmark, record the delta time of the last event as a numeric value
(currently, it just saves it within the string) so that filters and
histograms can use it.

Link: https://lkml.kernel.org/r/20220906225529.213677569@goodmis.org

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_benchmark.c | 2 +-
 kernel/trace/trace_benchmark.h | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 801c2a7f76054d..54d5fa35c90abf 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -51,7 +51,7 @@ static void trace_do_benchmark(void)
 
 	local_irq_disable();
 	start = trace_clock_local();
-	trace_benchmark_event(bm_str);
+	trace_benchmark_event(bm_str, bm_last);
 	stop = trace_clock_local();
 	local_irq_enable();
 
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
index 79e6fbe5b3658d..c3e91060dc949d 100644
--- a/kernel/trace/trace_benchmark.h
+++ b/kernel/trace/trace_benchmark.h
@@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void);
 
 TRACE_EVENT_FN(benchmark_event,
 
-	TP_PROTO(const char *str),
+	TP_PROTO(const char *str, u64 delta),
 
-	TP_ARGS(str),
+	TP_ARGS(str, delta),
 
 	TP_STRUCT__entry(
 		__array(	char,	str,	BENCHMARK_EVENT_STRLEN	)
+		__field(	u64,	delta)
 	),
 
 	TP_fast_assign(
 		memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+		__entry->delta = delta;
 	),
 
-	TP_printk("%s", __entry->str),
+	TP_printk("%s delta=%llu", __entry->str, __entry->delta),
 
 	trace_benchmark_reg, trace_benchmark_unreg
 );

From 86087383ec0a7ac2bcc3284e13e0e9966f7e3bfa Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 6 Sep 2022 18:53:15 -0400
Subject: [PATCH 06/44] tracing/hist: Call hist functions directly via a switch
 statement

Due to retpolines, indirect calls are much more expensive than direct
calls. The histograms have a select set of functions it uses for the
histograms, instead of using function pointers to call them, create a
hist_fn_call() function that uses a switch statement to call the histogram
functions directly. This gives a 13% speedup to the histogram logic.

Using the histogram benchmark:

Before:

 # event histogram
 #
 # trigger info: hist:keys=delta:vals=hitcount:sort=delta:size=2048 if delta > 0 [active]
 #

{ delta:        129 } hitcount:       2213
{ delta:        130 } hitcount:     285965
{ delta:        131 } hitcount:    1146545
{ delta:        132 } hitcount:    5185432
{ delta:        133 } hitcount:   19896215
{ delta:        134 } hitcount:   53118616
{ delta:        135 } hitcount:   83816709
{ delta:        136 } hitcount:   68329562
{ delta:        137 } hitcount:   41859349
{ delta:        138 } hitcount:   46257797
{ delta:        139 } hitcount:   54400831
{ delta:        140 } hitcount:   72875007
{ delta:        141 } hitcount:   76193272
{ delta:        142 } hitcount:   49504263
{ delta:        143 } hitcount:   38821072
{ delta:        144 } hitcount:   47702679
{ delta:        145 } hitcount:   41357297
{ delta:        146 } hitcount:   22058238
{ delta:        147 } hitcount:    9720002
{ delta:        148 } hitcount:    3193542
{ delta:        149 } hitcount:     927030
{ delta:        150 } hitcount:     850772
{ delta:        151 } hitcount:    1477380
{ delta:        152 } hitcount:    2687977
{ delta:        153 } hitcount:    2865985
{ delta:        154 } hitcount:    1977492
{ delta:        155 } hitcount:    2475607
{ delta:        156 } hitcount:    3403612

After:

 # event histogram
 #
 # trigger info: hist:keys=delta:vals=hitcount:sort=delta:size=2048 if delta > 0 [active]
 #

{ delta:        113 } hitcount:        272
{ delta:        114 } hitcount:        840
{ delta:        118 } hitcount:        344
{ delta:        119 } hitcount:      25428
{ delta:        120 } hitcount:     350590
{ delta:        121 } hitcount:    1892484
{ delta:        122 } hitcount:    6205004
{ delta:        123 } hitcount:   11583521
{ delta:        124 } hitcount:   37590979
{ delta:        125 } hitcount:  108308504
{ delta:        126 } hitcount:  131672461
{ delta:        127 } hitcount:   88700598
{ delta:        128 } hitcount:   65939870
{ delta:        129 } hitcount:   45055004
{ delta:        130 } hitcount:   33174464
{ delta:        131 } hitcount:   31813493
{ delta:        132 } hitcount:   29011676
{ delta:        133 } hitcount:   22798782
{ delta:        134 } hitcount:   22072486
{ delta:        135 } hitcount:   17034113
{ delta:        136 } hitcount:    8982490
{ delta:        137 } hitcount:    2865908
{ delta:        138 } hitcount:     980382
{ delta:        139 } hitcount:    1651944
{ delta:        140 } hitcount:    4112073
{ delta:        141 } hitcount:    3963269
{ delta:        142 } hitcount:    1712508
{ delta:        143 } hitcount:     575941
{ delta:        144 } hitcount:     351427
{ delta:        145 } hitcount:     218077
{ delta:        146 } hitcount:     167297
{ delta:        147 } hitcount:     146198
{ delta:        148 } hitcount:     116122
{ delta:        149 } hitcount:      58993
{ delta:        150 } hitcount:      40228

The delta above is in nanoseconds. It brings the fastest time down from
129ns to 113ns, and the peak from 141ns to 126ns.

Link: https://lkml.kernel.org/r/20220906225529.411545333@goodmis.org

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 246 +++++++++++++++++++++----------
 1 file changed, 169 insertions(+), 77 deletions(-)

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index fdf784620c283a..48465f7e97b42b 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -104,6 +104,38 @@ enum field_op_id {
 	FIELD_OP_MULT,
 };
 
+enum hist_field_fn {
+	HIST_FIELD_FN_NOP,
+	HIST_FIELD_FN_VAR_REF,
+	HIST_FIELD_FN_COUNTER,
+	HIST_FIELD_FN_CONST,
+	HIST_FIELD_FN_LOG2,
+	HIST_FIELD_FN_BUCKET,
+	HIST_FIELD_FN_TIMESTAMP,
+	HIST_FIELD_FN_CPU,
+	HIST_FIELD_FN_STRING,
+	HIST_FIELD_FN_DYNSTRING,
+	HIST_FIELD_FN_RELDYNSTRING,
+	HIST_FIELD_FN_PSTRING,
+	HIST_FIELD_FN_S64,
+	HIST_FIELD_FN_U64,
+	HIST_FIELD_FN_S32,
+	HIST_FIELD_FN_U32,
+	HIST_FIELD_FN_S16,
+	HIST_FIELD_FN_U16,
+	HIST_FIELD_FN_S8,
+	HIST_FIELD_FN_U8,
+	HIST_FIELD_FN_UMINUS,
+	HIST_FIELD_FN_MINUS,
+	HIST_FIELD_FN_PLUS,
+	HIST_FIELD_FN_DIV,
+	HIST_FIELD_FN_MULT,
+	HIST_FIELD_FN_DIV_POWER2,
+	HIST_FIELD_FN_DIV_NOT_POWER2,
+	HIST_FIELD_FN_DIV_MULT_SHIFT,
+	HIST_FIELD_FN_EXECNAME,
+};
+
 /*
  * A hist_var (histogram variable) contains variable information for
  * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF
@@ -123,15 +155,15 @@ struct hist_var {
 struct hist_field {
 	struct ftrace_event_field	*field;
 	unsigned long			flags;
-	hist_field_fn_t			fn;
-	unsigned int			ref;
-	unsigned int			size;
-	unsigned int			offset;
-	unsigned int                    is_signed;
 	unsigned long			buckets;
 	const char			*type;
 	struct hist_field		*operands[HIST_FIELD_OPERANDS_MAX];
 	struct hist_trigger_data	*hist_data;
+	enum hist_field_fn		fn_num;
+	unsigned int			ref;
+	unsigned int			size;
+	unsigned int			offset;
+	unsigned int                    is_signed;
 
 	/*
 	 * Variable fields contain variable-specific info in var.
@@ -166,14 +198,11 @@ struct hist_field {
 	u64				div_multiplier;
 };
 
-static u64 hist_field_none(struct hist_field *field,
-			   struct tracing_map_elt *elt,
-			   struct trace_buffer *buffer,
-			   struct ring_buffer_event *rbe,
-			   void *event)
-{
-	return 0;
-}
+static u64 hist_fn_call(struct hist_field *hist_field,
+			struct tracing_map_elt *elt,
+			struct trace_buffer *buffer,
+			struct ring_buffer_event *rbe,
+			void *event);
 
 static u64 hist_field_const(struct hist_field *field,
 			   struct tracing_map_elt *elt,
@@ -250,7 +279,7 @@ static u64 hist_field_log2(struct hist_field *hist_field,
 {
 	struct hist_field *operand = hist_field->operands[0];
 
-	u64 val = operand->fn(operand, elt, buffer, rbe, event);
+	u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
 
 	return (u64) ilog2(roundup_pow_of_two(val));
 }
@@ -264,7 +293,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field,
 	struct hist_field *operand = hist_field->operands[0];
 	unsigned long buckets = hist_field->buckets;
 
-	u64 val = operand->fn(operand, elt, buffer, rbe, event);
+	u64 val = hist_fn_call(operand, elt, buffer, rbe, event);
 
 	if (WARN_ON_ONCE(!buckets))
 		return val;
@@ -285,8 +314,8 @@ static u64 hist_field_plus(struct hist_field *hist_field,
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
-	u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+	u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+	u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
 
 	return val1 + val2;
 }
@@ -300,8 +329,8 @@ static u64 hist_field_minus(struct hist_field *hist_field,
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
-	u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+	u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+	u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
 
 	return val1 - val2;
 }
@@ -315,8 +344,8 @@ static u64 hist_field_div(struct hist_field *hist_field,
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
-	u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+	u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+	u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
 
 	/* Return -1 for the undefined case */
 	if (!val2)
@@ -338,7 +367,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field,
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+	u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
 
 	return val1 >> __ffs64(operand2->constant);
 }
@@ -352,7 +381,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field,
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+	u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
 
 	return div64_u64(val1, operand2->constant);
 }
@@ -366,7 +395,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field,
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+	u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
 
 	/*
 	 * If the divisor is a constant, do a multiplication and shift instead.
@@ -400,8 +429,8 @@ static u64 hist_field_mult(struct hist_field *hist_field,
 	struct hist_field *operand1 = hist_field->operands[0];
 	struct hist_field *operand2 = hist_field->operands[1];
 
-	u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
-	u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event);
+	u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event);
+	u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event);
 
 	return val1 * val2;
 }
@@ -414,7 +443,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field,
 {
 	struct hist_field *operand = hist_field->operands[0];
 
-	s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event);
+	s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event);
 	u64 val = (u64)-sval;
 
 	return val;
@@ -657,19 +686,19 @@ struct snapshot_context {
  * Returns the specific division function to use if the divisor
  * is constant. This avoids extra branches when the trigger is hit.
  */
-static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
+static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor)
 {
 	u64 div = divisor->constant;
 
 	if (!(div & (div - 1)))
-		return div_by_power_of_two;
+		return HIST_FIELD_FN_DIV_POWER2;
 
 	/* If the divisor is too large, do a regular division */
 	if (div > (1 << HIST_DIV_SHIFT))
-		return div_by_not_power_of_two;
+		return HIST_FIELD_FN_DIV_NOT_POWER2;
 
 	divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
-	return div_by_mult_and_shift;
+	return HIST_FIELD_FN_DIV_MULT_SHIFT;
 }
 
 static void track_data_free(struct track_data *track_data)
@@ -1334,38 +1363,32 @@ static const char *hist_field_name(struct hist_field *field,
 	return field_name;
 }
 
-static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+static enum hist_field_fn select_value_fn(int field_size, int field_is_signed)
 {
-	hist_field_fn_t fn = NULL;
-
 	switch (field_size) {
 	case 8:
 		if (field_is_signed)
-			fn = hist_field_s64;
+			return HIST_FIELD_FN_S64;
 		else
-			fn = hist_field_u64;
-		break;
+			return HIST_FIELD_FN_U64;
 	case 4:
 		if (field_is_signed)
-			fn = hist_field_s32;
+			return HIST_FIELD_FN_S32;
 		else
-			fn = hist_field_u32;
-		break;
+			return HIST_FIELD_FN_U32;
 	case 2:
 		if (field_is_signed)
-			fn = hist_field_s16;
+			return HIST_FIELD_FN_S16;
 		else
-			fn = hist_field_u16;
-		break;
+			return HIST_FIELD_FN_U16;
 	case 1:
 		if (field_is_signed)
-			fn = hist_field_s8;
+			return HIST_FIELD_FN_S8;
 		else
-			fn = hist_field_u8;
-		break;
+			return HIST_FIELD_FN_U8;
 	}
 
-	return fn;
+	return HIST_FIELD_FN_NOP;
 }
 
 static int parse_map_size(char *str)
@@ -1922,19 +1945,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 		goto out; /* caller will populate */
 
 	if (flags & HIST_FIELD_FL_VAR_REF) {
-		hist_field->fn = hist_field_var_ref;
+		hist_field->fn_num = HIST_FIELD_FN_VAR_REF;
 		goto out;
 	}
 
 	if (flags & HIST_FIELD_FL_HITCOUNT) {
-		hist_field->fn = hist_field_counter;
+		hist_field->fn_num = HIST_FIELD_FN_COUNTER;
 		hist_field->size = sizeof(u64);
 		hist_field->type = "u64";
 		goto out;
 	}
 
 	if (flags & HIST_FIELD_FL_CONST) {
-		hist_field->fn = hist_field_const;
+		hist_field->fn_num = HIST_FIELD_FN_CONST;
 		hist_field->size = sizeof(u64);
 		hist_field->type = kstrdup("u64", GFP_KERNEL);
 		if (!hist_field->type)
@@ -1943,14 +1966,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 	}
 
 	if (flags & HIST_FIELD_FL_STACKTRACE) {
-		hist_field->fn = hist_field_none;
+		hist_field->fn_num = HIST_FIELD_FN_NOP;
 		goto out;
 	}
 
 	if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) {
 		unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET);
-		hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 :
-			hist_field_bucket;
+		hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 :
+			HIST_FIELD_FN_BUCKET;
 		hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
 		hist_field->size = hist_field->operands[0]->size;
 		hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL);
@@ -1960,14 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 	}
 
 	if (flags & HIST_FIELD_FL_TIMESTAMP) {
-		hist_field->fn = hist_field_timestamp;
+		hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP;
 		hist_field->size = sizeof(u64);
 		hist_field->type = "u64";
 		goto out;
 	}
 
 	if (flags & HIST_FIELD_FL_CPU) {
-		hist_field->fn = hist_field_cpu;
+		hist_field->fn_num = HIST_FIELD_FN_CPU;
 		hist_field->size = sizeof(int);
 		hist_field->type = "unsigned int";
 		goto out;
@@ -1987,14 +2010,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 			goto free;
 
 		if (field->filter_type == FILTER_STATIC_STRING) {
-			hist_field->fn = hist_field_string;
+			hist_field->fn_num = HIST_FIELD_FN_STRING;
 			hist_field->size = field->size;
 		} else if (field->filter_type == FILTER_DYN_STRING) {
-			hist_field->fn = hist_field_dynstring;
+			hist_field->fn_num = HIST_FIELD_FN_DYNSTRING;
 		} else if (field->filter_type == FILTER_RDYN_STRING)
-			hist_field->fn = hist_field_reldynstring;
+			hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING;
 		else
-			hist_field->fn = hist_field_pstring;
+			hist_field->fn_num = HIST_FIELD_FN_PSTRING;
 	} else {
 		hist_field->size = field->size;
 		hist_field->is_signed = field->is_signed;
@@ -2002,9 +2025,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
 		if (!hist_field->type)
 			goto free;
 
-		hist_field->fn = select_value_fn(field->size,
-						 field->is_signed);
-		if (!hist_field->fn) {
+		hist_field->fn_num = select_value_fn(field->size,
+						     field->is_signed);
+		if (hist_field->fn_num == HIST_FIELD_FN_NOP) {
 			destroy_hist_field(hist_field, 0);
 			return NULL;
 		}
@@ -2340,7 +2363,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
 	if (!alias)
 		return NULL;
 
-	alias->fn = var_ref->fn;
+	alias->fn_num = var_ref->fn_num;
 	alias->operands[0] = var_ref;
 
 	if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
@@ -2523,7 +2546,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
 
 	expr->flags |= operand1->flags &
 		(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
-	expr->fn = hist_field_unary_minus;
+	expr->fn_num = HIST_FIELD_FN_UMINUS;
 	expr->operands[0] = operand1;
 	expr->size = operand1->size;
 	expr->is_signed = operand1->is_signed;
@@ -2595,7 +2618,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 	unsigned long operand_flags, operand2_flags;
 	int field_op, ret = -EINVAL;
 	char *sep, *operand1_str;
-	hist_field_fn_t op_fn;
+	enum hist_field_fn op_fn;
 	bool combine_consts;
 
 	if (*n_subexprs > 3) {
@@ -2654,16 +2677,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 
 	switch (field_op) {
 	case FIELD_OP_MINUS:
-		op_fn = hist_field_minus;
+		op_fn = HIST_FIELD_FN_MINUS;
 		break;
 	case FIELD_OP_PLUS:
-		op_fn = hist_field_plus;
+		op_fn = HIST_FIELD_FN_PLUS;
 		break;
 	case FIELD_OP_DIV:
-		op_fn = hist_field_div;
+		op_fn = HIST_FIELD_FN_DIV;
 		break;
 	case FIELD_OP_MULT:
-		op_fn = hist_field_mult;
+		op_fn = HIST_FIELD_FN_MULT;
 		break;
 	default:
 		ret = -EINVAL;
@@ -2719,13 +2742,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 		op_fn = hist_field_get_div_fn(operand2);
 	}
 
+	expr->fn_num = op_fn;
+
 	if (combine_consts) {
 		if (var1)
 			expr->operands[0] = var1;
 		if (var2)
 			expr->operands[1] = var2;
 
-		expr->constant = op_fn(expr, NULL, NULL, NULL, NULL);
+		expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL);
+		expr->fn_num = HIST_FIELD_FN_CONST;
 
 		expr->operands[0] = NULL;
 		expr->operands[1] = NULL;
@@ -2739,8 +2765,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
 
 		expr->name = expr_str(expr, 0);
 	} else {
-		expr->fn = op_fn;
-
 		/* The operand sizes should be the same, so just pick one */
 		expr->size = operand1->size;
 		expr->is_signed = operand1->is_signed;
@@ -3065,7 +3089,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
 		struct hist_field *var = field_var->var;
 		struct hist_field *val = field_var->val;
 
-		var_val = val->fn(val, elt, buffer, rbe, rec);
+		var_val = hist_fn_call(val, elt, buffer, rbe, rec);
 		var_idx = var->var.idx;
 
 		if (val->flags & HIST_FIELD_FL_STRING) {
@@ -4186,6 +4210,74 @@ static u64 hist_field_execname(struct hist_field *hist_field,
 	return (u64)(unsigned long)(elt_data->comm);
 }
 
+static u64 hist_fn_call(struct hist_field *hist_field,
+			struct tracing_map_elt *elt,
+			struct trace_buffer *buffer,
+			struct ring_buffer_event *rbe,
+			void *event)
+{
+	switch (hist_field->fn_num) {
+	case HIST_FIELD_FN_VAR_REF:
+		return hist_field_var_ref(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_COUNTER:
+		return hist_field_counter(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_CONST:
+		return hist_field_const(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_LOG2:
+		return hist_field_log2(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_BUCKET:
+		return hist_field_bucket(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_TIMESTAMP:
+		return hist_field_timestamp(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_CPU:
+		return hist_field_cpu(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_STRING:
+		return hist_field_string(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_DYNSTRING:
+		return hist_field_dynstring(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_RELDYNSTRING:
+		return hist_field_reldynstring(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_PSTRING:
+		return hist_field_pstring(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_S64:
+		return hist_field_s64(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_U64:
+		return hist_field_u64(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_S32:
+		return hist_field_s32(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_U32:
+		return hist_field_u32(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_S16:
+		return hist_field_s16(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_U16:
+		return hist_field_u16(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_S8:
+		return hist_field_s8(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_U8:
+		return hist_field_u8(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_UMINUS:
+		return hist_field_unary_minus(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_MINUS:
+		return hist_field_minus(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_PLUS:
+		return hist_field_plus(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_DIV:
+		return hist_field_div(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_MULT:
+		return hist_field_mult(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_DIV_POWER2:
+		return div_by_power_of_two(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_DIV_NOT_POWER2:
+		return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_DIV_MULT_SHIFT:
+		return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event);
+	case HIST_FIELD_FN_EXECNAME:
+		return hist_field_execname(hist_field, elt, buffer, rbe, event);
+	default:
+		return 0;
+	}
+}
+
 /* Convert a var that points to common_pid.execname to a string */
 static void update_var_execname(struct hist_field *hist_field)
 {
@@ -4197,7 +4289,7 @@ static void update_var_execname(struct hist_field *hist_field)
 	kfree_const(hist_field->type);
 	hist_field->type = "char[]";
 
-	hist_field->fn = hist_field_execname;
+	hist_field->fn_num = HIST_FIELD_FN_EXECNAME;
 }
 
 static int create_var_field(struct hist_trigger_data *hist_data,
@@ -4956,7 +5048,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 
 	for_each_hist_val_field(i, hist_data) {
 		hist_field = hist_data->fields[i];
-		hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
+		hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
 		if (hist_field->flags & HIST_FIELD_FL_VAR) {
 			var_idx = hist_field->var.idx;
 
@@ -4987,7 +5079,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 	for_each_hist_key_field(i, hist_data) {
 		hist_field = hist_data->fields[i];
 		if (hist_field->flags & HIST_FIELD_FL_VAR) {
-			hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec);
+			hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec);
 			var_idx = hist_field->var.idx;
 			tracing_map_set_var(elt, var_idx, hist_val);
 		}
@@ -5062,7 +5154,7 @@ static void event_hist_trigger(struct event_trigger_data *data,
 					 HIST_STACKTRACE_SKIP);
 			key = entries;
 		} else {
-			field_contents = key_field->fn(key_field, elt, buffer, rbe, rec);
+			field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec);
 			if (key_field->flags & HIST_FIELD_FL_STRING) {
 				key = (void *)(unsigned long)field_contents;
 				use_compound_key = true;

From 26c4e3d10ad0566f8604e6c378e939f941707f36 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 6 Sep 2022 18:53:16 -0400
Subject: [PATCH 07/44] tracing: Move struct filter_pred into
 trace_events_filter.c

The structure filter_pred and the typedef of the function used are only
referenced by trace_events_filter.c. There's no reason to have it in an
external header file. Move them into the only file they are used in.

Link: https://lkml.kernel.org/r/20220906225529.598047132@goodmis.org

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.h               | 13 -------------
 kernel/trace/trace_events_filter.c | 13 +++++++++++++
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 900e75d96c8466..54ee5711c72995 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1435,8 +1435,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
 struct filter_pred;
 struct regex;
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
-
 typedef int (*regex_match_func)(char *str, struct regex *r, int len);
 
 enum regex_type {
@@ -1455,17 +1453,6 @@ struct regex {
 	regex_match_func	match;
 };
 
-struct filter_pred {
-	filter_pred_fn_t 	fn;
-	u64 			val;
-	struct regex		regex;
-	unsigned short		*ops;
-	struct ftrace_event_field *field;
-	int 			offset;
-	int			not;
-	int 			op;
-};
-
 static inline bool is_string_field(struct ftrace_event_field *field)
 {
 	return field->filter_type == FILTER_DYN_STRING ||
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4b1057ab9d9682..c49c689ce4ad96 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -43,6 +43,19 @@ enum filter_op_ids { OPS };
 
 static const char * ops[] = { OPS };
 
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+
+struct filter_pred {
+	filter_pred_fn_t 	fn;
+	u64 			val;
+	struct regex		regex;
+	unsigned short		*ops;
+	struct ftrace_event_field *field;
+	int 			offset;
+	int			not;
+	int 			op;
+};
+
 /*
  * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
  * pred_funcs_##type below must match the order of them above.

From fde59ab1614942a16dd0ce6f053ae96718f0ee50 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 6 Sep 2022 18:53:17 -0400
Subject: [PATCH 08/44] tracing/filter: Call filter predicate functions
 directly via a switch statement

Due to retpolines, indirect calls are much more expensive than direct
calls. The filters have a select set of functions it uses for the
predicates. Instead of using function pointers to call them, create a
filter_pred_fn_call() function that uses a switch statement to call the
predicate functions directly. This gives almost a 10% speedup to the
filter logic.

Using the histogram benchmark:

Before:

 # event histogram
 #
 # trigger info: hist:keys=delta:vals=hitcount:sort=delta:size=2048 if delta > 0 [active]
 #

{ delta:        113 } hitcount:        272
{ delta:        114 } hitcount:        840
{ delta:        118 } hitcount:        344
{ delta:        119 } hitcount:      25428
{ delta:        120 } hitcount:     350590
{ delta:        121 } hitcount:    1892484
{ delta:        122 } hitcount:    6205004
{ delta:        123 } hitcount:   11583521
{ delta:        124 } hitcount:   37590979
{ delta:        125 } hitcount:  108308504
{ delta:        126 } hitcount:  131672461
{ delta:        127 } hitcount:   88700598
{ delta:        128 } hitcount:   65939870
{ delta:        129 } hitcount:   45055004
{ delta:        130 } hitcount:   33174464
{ delta:        131 } hitcount:   31813493
{ delta:        132 } hitcount:   29011676
{ delta:        133 } hitcount:   22798782
{ delta:        134 } hitcount:   22072486
{ delta:        135 } hitcount:   17034113
{ delta:        136 } hitcount:    8982490
{ delta:        137 } hitcount:    2865908
{ delta:        138 } hitcount:     980382
{ delta:        139 } hitcount:    1651944
{ delta:        140 } hitcount:    4112073
{ delta:        141 } hitcount:    3963269
{ delta:        142 } hitcount:    1712508
{ delta:        143 } hitcount:     575941

After:

 # event histogram
 #
 # trigger info: hist:keys=delta:vals=hitcount:sort=delta:size=2048 if delta > 0 [active]
 #

{ delta:        103 } hitcount:         60
{ delta:        104 } hitcount:      16966
{ delta:        105 } hitcount:     396625
{ delta:        106 } hitcount:    3223400
{ delta:        107 } hitcount:   12053754
{ delta:        108 } hitcount:   20241711
{ delta:        109 } hitcount:   14850200
{ delta:        110 } hitcount:    4946599
{ delta:        111 } hitcount:    3479315
{ delta:        112 } hitcount:   18698299
{ delta:        113 } hitcount:   62388733
{ delta:        114 } hitcount:   95803834
{ delta:        115 } hitcount:   58278130
{ delta:        116 } hitcount:   15364800
{ delta:        117 } hitcount:    5586866
{ delta:        118 } hitcount:    2346880
{ delta:        119 } hitcount:    1131091
{ delta:        120 } hitcount:     620896
{ delta:        121 } hitcount:     236652
{ delta:        122 } hitcount:     105957
{ delta:        123 } hitcount:     119107
{ delta:        124 } hitcount:      54494
{ delta:        125 } hitcount:      63856
{ delta:        126 } hitcount:      64454
{ delta:        127 } hitcount:      34818
{ delta:        128 } hitcount:      41446
{ delta:        129 } hitcount:      51242
{ delta:        130 } hitcount:      28361
{ delta:        131 } hitcount:      23926

The peak before was 126ns per event, after the peak is 114ns, and the
fastest time went from 113ns to 103ns.

Link: https://lkml.kernel.org/r/20220906225529.781407172@goodmis.org

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Tom Zanussi <zanussi@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 230 ++++++++++++++++++++---------
 1 file changed, 157 insertions(+), 73 deletions(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c49c689ce4ad96..96acc2b71ac747 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -43,10 +43,33 @@ enum filter_op_ids { OPS };
 
 static const char * ops[] = { OPS };
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+enum filter_pred_fn {
+	FILTER_PRED_FN_NOP,
+	FILTER_PRED_FN_64,
+	FILTER_PRED_FN_S64,
+	FILTER_PRED_FN_U64,
+	FILTER_PRED_FN_32,
+	FILTER_PRED_FN_S32,
+	FILTER_PRED_FN_U32,
+	FILTER_PRED_FN_16,
+	FILTER_PRED_FN_S16,
+	FILTER_PRED_FN_U16,
+	FILTER_PRED_FN_8,
+	FILTER_PRED_FN_S8,
+	FILTER_PRED_FN_U8,
+	FILTER_PRED_FN_COMM,
+	FILTER_PRED_FN_STRING,
+	FILTER_PRED_FN_STRLOC,
+	FILTER_PRED_FN_STRRELLOC,
+	FILTER_PRED_FN_PCHAR_USER,
+	FILTER_PRED_FN_PCHAR,
+	FILTER_PRED_FN_CPU,
+	FILTER_PRED_FN_,
+	FILTER_PRED_TEST_VISITED,
+};
 
 struct filter_pred {
-	filter_pred_fn_t 	fn;
+	enum filter_pred_fn 	fn_num;
 	u64 			val;
 	struct regex		regex;
 	unsigned short		*ops;
@@ -603,44 +626,48 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
 	return ERR_PTR(ret);
 }
 
+enum pred_cmp_types {
+	PRED_CMP_TYPE_NOP,
+	PRED_CMP_TYPE_LT,
+	PRED_CMP_TYPE_LE,
+	PRED_CMP_TYPE_GT,
+	PRED_CMP_TYPE_GE,
+	PRED_CMP_TYPE_BAND,
+};
+
 #define DEFINE_COMPARISON_PRED(type)					\
-static int filter_pred_LT_##type(struct filter_pred *pred, void *event)	\
-{									\
-	type *addr = (type *)(event + pred->offset);			\
-	type val = (type)pred->val;					\
-	return *addr < val;						\
-}									\
-static int filter_pred_LE_##type(struct filter_pred *pred, void *event)	\
+static int filter_pred_##type(struct filter_pred *pred, void *event)	\
 {									\
-	type *addr = (type *)(event + pred->offset);			\
-	type val = (type)pred->val;					\
-	return *addr <= val;						\
-}									\
-static int filter_pred_GT_##type(struct filter_pred *pred, void *event)	\
-{									\
-	type *addr = (type *)(event + pred->offset);			\
-	type val = (type)pred->val;					\
-	return *addr > val;					\
-}									\
-static int filter_pred_GE_##type(struct filter_pred *pred, void *event)	\
-{									\
-	type *addr = (type *)(event + pred->offset);			\
-	type val = (type)pred->val;					\
-	return *addr >= val;						\
-}									\
-static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
-{									\
-	type *addr = (type *)(event + pred->offset);			\
-	type val = (type)pred->val;					\
-	return !!(*addr & val);						\
-}									\
-static const filter_pred_fn_t pred_funcs_##type[] = {			\
-	filter_pred_LE_##type,						\
-	filter_pred_LT_##type,						\
-	filter_pred_GE_##type,						\
-	filter_pred_GT_##type,						\
-	filter_pred_BAND_##type,					\
-};
+	switch (pred->op) {						\
+	case OP_LT: {							\
+		type *addr = (type *)(event + pred->offset);		\
+		type val = (type)pred->val;				\
+		return *addr < val;					\
+	}								\
+	case OP_LE: {					\
+		type *addr = (type *)(event + pred->offset);		\
+		type val = (type)pred->val;				\
+		return *addr <= val;					\
+	}								\
+	case OP_GT: {					\
+		type *addr = (type *)(event + pred->offset);		\
+		type val = (type)pred->val;				\
+		return *addr > val;					\
+	}								\
+	case OP_GE: {					\
+		type *addr = (type *)(event + pred->offset);		\
+		type val = (type)pred->val;				\
+		return *addr >= val;					\
+	}								\
+	case OP_BAND: {					\
+		type *addr = (type *)(event + pred->offset);		\
+		type val = (type)pred->val;				\
+		return !!(*addr & val);					\
+	}								\
+	default:							\
+		return 0;						\
+	}								\
+}
 
 #define DEFINE_EQUALITY_PRED(size)					\
 static int filter_pred_##size(struct filter_pred *pred, void *event)	\
@@ -849,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event)
 	return cmp ^ pred->not;
 }
 
-static int filter_pred_none(struct filter_pred *pred, void *event)
-{
-	return 0;
-}
-
 /*
  * regex_match_foo - Basic regex callbacks
  *
@@ -999,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred)
 	}
 }
 
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int test_pred_visited_fn(struct filter_pred *pred, void *event);
+#else
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+	return 0;
+}
+#endif
+
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event);
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
@@ -1016,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec)
 
 	for (i = 0; prog[i].pred; i++) {
 		struct filter_pred *pred = prog[i].pred;
-		int match = pred->fn(pred, rec);
+		int match = filter_pred_fn_call(pred, rec);
 		if (match == prog[i].when_to_branch)
 			i = prog[i].target;
 	}
@@ -1202,10 +1237,10 @@ int filter_assign_type(const char *type)
 	return FILTER_OTHER;
 }
 
-static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
-					    int field_size, int field_is_signed)
+static enum filter_pred_fn select_comparison_fn(enum filter_op_ids op,
+						int field_size, int field_is_signed)
 {
-	filter_pred_fn_t fn = NULL;
+	enum filter_pred_fn fn = FILTER_PRED_FN_NOP;
 	int pred_func_index = -1;
 
 	switch (op) {
@@ -1214,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
 		break;
 	default:
 		if (WARN_ON_ONCE(op < PRED_FUNC_START))
-			return NULL;
+			return fn;
 		pred_func_index = op - PRED_FUNC_START;
 		if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
-			return NULL;
+			return fn;
 	}
 
 	switch (field_size) {
 	case 8:
 		if (pred_func_index < 0)
-			fn = filter_pred_64;
+			fn = FILTER_PRED_FN_64;
 		else if (field_is_signed)
-			fn = pred_funcs_s64[pred_func_index];
+			fn = FILTER_PRED_FN_S64;
 		else
-			fn = pred_funcs_u64[pred_func_index];
+			fn = FILTER_PRED_FN_U64;
 		break;
 	case 4:
 		if (pred_func_index < 0)
-			fn = filter_pred_32;
+			fn = FILTER_PRED_FN_32;
 		else if (field_is_signed)
-			fn = pred_funcs_s32[pred_func_index];
+			fn = FILTER_PRED_FN_S32;
 		else
-			fn = pred_funcs_u32[pred_func_index];
+			fn = FILTER_PRED_FN_U32;
 		break;
 	case 2:
 		if (pred_func_index < 0)
-			fn = filter_pred_16;
+			fn = FILTER_PRED_FN_16;
 		else if (field_is_signed)
-			fn = pred_funcs_s16[pred_func_index];
+			fn = FILTER_PRED_FN_S16;
 		else
-			fn = pred_funcs_u16[pred_func_index];
+			fn = FILTER_PRED_FN_U16;
 		break;
 	case 1:
 		if (pred_func_index < 0)
-			fn = filter_pred_8;
+			fn = FILTER_PRED_FN_8;
 		else if (field_is_signed)
-			fn = pred_funcs_s8[pred_func_index];
+			fn = FILTER_PRED_FN_S8;
 		else
-			fn = pred_funcs_u8[pred_func_index];
+			fn = FILTER_PRED_FN_U8;
 		break;
 	}
 
 	return fn;
 }
 
+
+static int filter_pred_fn_call(struct filter_pred *pred, void *event)
+{
+	switch (pred->fn_num) {
+	case FILTER_PRED_FN_64:
+		return filter_pred_64(pred, event);
+	case FILTER_PRED_FN_S64:
+		return filter_pred_s64(pred, event);
+	case FILTER_PRED_FN_U64:
+		return filter_pred_u64(pred, event);
+	case FILTER_PRED_FN_32:
+		return filter_pred_32(pred, event);
+	case FILTER_PRED_FN_S32:
+		return filter_pred_s32(pred, event);
+	case FILTER_PRED_FN_U32:
+		return filter_pred_u32(pred, event);
+	case FILTER_PRED_FN_16:
+		return filter_pred_16(pred, event);
+	case FILTER_PRED_FN_S16:
+		return filter_pred_s16(pred, event);
+	case FILTER_PRED_FN_U16:
+		return filter_pred_u16(pred, event);
+	case FILTER_PRED_FN_8:
+		return filter_pred_8(pred, event);
+	case FILTER_PRED_FN_S8:
+		return filter_pred_s8(pred, event);
+	case FILTER_PRED_FN_U8:
+		return filter_pred_u8(pred, event);
+	case FILTER_PRED_FN_COMM:
+		return filter_pred_comm(pred, event);
+	case FILTER_PRED_FN_STRING:
+		return filter_pred_string(pred, event);
+	case FILTER_PRED_FN_STRLOC:
+		return filter_pred_strloc(pred, event);
+	case FILTER_PRED_FN_STRRELLOC:
+		return filter_pred_strrelloc(pred, event);
+	case FILTER_PRED_FN_PCHAR_USER:
+		return filter_pred_pchar_user(pred, event);
+	case FILTER_PRED_FN_PCHAR:
+		return filter_pred_pchar(pred, event);
+	case FILTER_PRED_FN_CPU:
+		return filter_pred_cpu(pred, event);
+	case FILTER_PRED_TEST_VISITED:
+		return test_pred_visited_fn(pred, event);
+	default:
+		return 0;
+	}
+}
+
 /* Called when a predicate is encountered by predicate_parse() */
 static int parse_pred(const char *str, void *data,
 		      int pos, struct filter_parse_error *pe,
@@ -1351,7 +1435,7 @@ static int parse_pred(const char *str, void *data,
 			parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
 			goto err_free;
 		}
-		pred->fn = filter_pred_none;
+		pred->fn_num = FILTER_PRED_FN_NOP;
 
 		/*
 		 * Quotes are not required, but if they exist then we need
@@ -1429,16 +1513,16 @@ static int parse_pred(const char *str, void *data,
 		filter_build_regex(pred);
 
 		if (field->filter_type == FILTER_COMM) {
-			pred->fn = filter_pred_comm;
+			pred->fn_num = FILTER_PRED_FN_COMM;
 
 		} else if (field->filter_type == FILTER_STATIC_STRING) {
-			pred->fn = filter_pred_string;
+			pred->fn_num = FILTER_PRED_FN_STRING;
 			pred->regex.field_len = field->size;
 
 		} else if (field->filter_type == FILTER_DYN_STRING) {
-			pred->fn = filter_pred_strloc;
+			pred->fn_num = FILTER_PRED_FN_STRLOC;
 		} else if (field->filter_type == FILTER_RDYN_STRING)
-			pred->fn = filter_pred_strrelloc;
+			pred->fn_num = FILTER_PRED_FN_STRRELLOC;
 		else {
 
 			if (!ustring_per_cpu) {
@@ -1449,9 +1533,9 @@ static int parse_pred(const char *str, void *data,
 			}
 
 			if (ustring)
-				pred->fn = filter_pred_pchar_user;
+				pred->fn_num = FILTER_PRED_FN_PCHAR_USER;
 			else
-				pred->fn = filter_pred_pchar;
+				pred->fn_num = FILTER_PRED_FN_PCHAR;
 		}
 		/* go past the last quote */
 		i++;
@@ -1499,10 +1583,10 @@ static int parse_pred(const char *str, void *data,
 		pred->val = val;
 
 		if (field->filter_type == FILTER_CPU)
-			pred->fn = filter_pred_cpu;
+			pred->fn_num = FILTER_PRED_FN_CPU;
 		else {
-			pred->fn = select_comparison_fn(pred->op, field->size,
-							field->is_signed);
+			pred->fn_num = select_comparison_fn(pred->op, field->size,
+							    field->is_signed);
 			if (pred->op == OP_NE)
 				pred->not = 1;
 		}
@@ -2309,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
 		struct filter_pred *pred = prog[i].pred;
 		struct ftrace_event_field *field = pred->field;
 
-		WARN_ON_ONCE(!pred->fn);
+		WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP);
 
 		if (!field) {
 			WARN_ONCE(1, "all leafs should have field defined %d", i);
@@ -2319,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields)
 		if (!strchr(fields, *field->name))
 			continue;
 
-		pred->fn = test_pred_visited_fn;
+		pred->fn_num = FILTER_PRED_TEST_VISITED;
 	}
 }
 

From 51714678eacc91fcfb8b235ced57ea70ade81cad Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Wed, 14 Sep 2022 14:14:16 +0800
Subject: [PATCH 09/44] tracepoint: Optimize the critical region of mutex_lock
 in tracepoint_module_coming()

The memory allocation of 'tp_mod' does not require mutex_lock()
protection, move it out.

Link: https://lkml.kernel.org/r/20220914061416.1630-1-thunder.leizhen@huawei.com

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/tracepoint.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ef42c1a1192053..f23144af574304 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -640,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv)
 static int tracepoint_module_coming(struct module *mod)
 {
 	struct tp_module *tp_mod;
-	int ret = 0;
 
 	if (!mod->num_tracepoints)
 		return 0;
@@ -652,19 +651,18 @@ static int tracepoint_module_coming(struct module *mod)
 	 */
 	if (trace_module_has_bad_taint(mod))
 		return 0;
-	mutex_lock(&tracepoint_module_list_mutex);
+
 	tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
-	if (!tp_mod) {
-		ret = -ENOMEM;
-		goto end;
-	}
+	if (!tp_mod)
+		return -ENOMEM;
 	tp_mod->mod = mod;
+
+	mutex_lock(&tracepoint_module_list_mutex);
 	list_add_tail(&tp_mod->list, &tracepoint_module_list);
 	blocking_notifier_call_chain(&tracepoint_notify_list,
 			MODULE_STATE_COMING, tp_mod);
-end:
 	mutex_unlock(&tracepoint_module_list_mutex);
-	return ret;
+	return 0;
 }
 
 static void tracepoint_module_going(struct module *mod)

From 40d81137f186f188f9aa9081ff12018be6ee19d0 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Wed, 14 Sep 2022 19:04:36 +0800
Subject: [PATCH 10/44] x86/ftrace: Remove unused modifying_ftrace_code
 declaration

All uses of modifying_ftrace_code have been removed by
commit 768ae4406a5c ("x86/ftrace: Use text_poke()"),
so remove the declaration, too.

Link: https://lkml.kernel.org/r/20220914110437.1436353-2-cuigaosheng1@huawei.com

Cc: <mingo@redhat.com>
Cc: <tglx@linutronix.de>
Cc: <bp@alien8.de>
Cc: <dave.hansen@linux.intel.com>
Cc: <x86@kernel.org>
Cc: <hpa@zytor.com>
Cc: <mhiramat@kernel.org>
Cc: <peterz@infradead.org>
Cc: <ast@kernel.org>
Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/x86/include/asm/ftrace.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index b5ef474be858d4..908d99b127d31b 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -23,7 +23,6 @@
 #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
 
 #ifndef __ASSEMBLY__
-extern atomic_t modifying_ftrace_code;
 extern void __fentry__(void);
 
 static inline unsigned long ftrace_call_adjust(unsigned long addr)

From d4940b84da4fbba7f7ed6e6287e1bef48d1293e9 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Wed, 14 Sep 2022 19:04:37 +0800
Subject: [PATCH 11/44] x86/kprobes: Remove unused
 arch_kprobe_override_function() declaration

All uses of arch_kprobe_override_function() have been removed by
commit 540adea3809f ("error-injection: Separate error-injection
from kprobe"), so remove the declaration, too.

Link: https://lkml.kernel.org/r/20220914110437.1436353-3-cuigaosheng1@huawei.com

Cc: <mingo@redhat.com>
Cc: <tglx@linutronix.de>
Cc: <bp@alien8.de>
Cc: <dave.hansen@linux.intel.com>
Cc: <x86@kernel.org>
Cc: <hpa@zytor.com>
Cc: <mhiramat@kernel.org>
Cc: <peterz@infradead.org>
Cc: <ast@kernel.org>
Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/x86/include/asm/kprobes.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 71ea2eab43d510..a2e9317aad4955 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -50,8 +50,6 @@ extern const int kretprobe_blacklist_size;
 
 void arch_remove_kprobe(struct kprobe *p);
 
-extern void arch_kprobe_override_function(struct pt_regs *regs);
-
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */

From ac48e189527fae87253ef2bf58892e782fb36874 Mon Sep 17 00:00:00 2001
From: Yipeng Zou <zouyipeng@huawei.com>
Date: Mon, 19 Sep 2022 20:56:28 +0800
Subject: [PATCH 12/44] tracing: kprobe: Fix kprobe event gen test module on
 exit

Correct gen_kretprobe_test clr event para on module exit.
This will make it can't to delete.

Link: https://lkml.kernel.org/r/20220919125629.238242-2-zouyipeng@huawei.com

Cc: <linux-riscv@lists.infradead.org>
Cc: <mingo@redhat.com>
Cc: <paul.walmsley@sifive.com>
Cc: <palmer@dabbelt.com>
Cc: <aou@eecs.berkeley.edu>
Cc: <zanussi@kernel.org>
Cc: <liaochang1@huawei.com>
Cc: <chris.zjh@huawei.com>
Fixes: 64836248dda2 ("tracing: Add kprobe event command generation test module")
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/kprobe_event_gen_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index 18b0f1cbb947f0..e023154be0f856 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -206,7 +206,7 @@ static void __exit kprobe_event_gen_test_exit(void)
 	WARN_ON(kprobe_event_delete("gen_kprobe_test"));
 
 	/* Disable the event or you can't remove it */
-	WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr,
+	WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr,
 					  "kprobes",
 					  "gen_kretprobe_test", false));
 

From d8ef45d66c01425ff748e13ef7dd1da7a91cc93c Mon Sep 17 00:00:00 2001
From: Yipeng Zou <zouyipeng@huawei.com>
Date: Mon, 19 Sep 2022 20:56:29 +0800
Subject: [PATCH 13/44] tracing: kprobe: Make gen test module work in arm and
 riscv

For now, this selftest module can only work in x86 because of the
kprobe cmd was fixed use of x86 registers.
This patch adapted to register names under arm and riscv, So that
this module can be worked on those platform.

Link: https://lkml.kernel.org/r/20220919125629.238242-3-zouyipeng@huawei.com

Cc: <linux-riscv@lists.infradead.org>
Cc: <mingo@redhat.com>
Cc: <paul.walmsley@sifive.com>
Cc: <palmer@dabbelt.com>
Cc: <aou@eecs.berkeley.edu>
Cc: <zanussi@kernel.org>
Cc: <liaochang1@huawei.com>
Cc: <chris.zjh@huawei.com>
Fixes: 64836248dda2 ("tracing: Add kprobe event command generation test module")
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/kprobe_event_gen_test.c | 47 +++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index e023154be0f856..80e04a1e19772a 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -35,6 +35,45 @@
 static struct trace_event_file *gen_kprobe_test;
 static struct trace_event_file *gen_kretprobe_test;
 
+#define KPROBE_GEN_TEST_FUNC	"do_sys_open"
+
+/* X86 */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32)
+#define KPROBE_GEN_TEST_ARG0	"dfd=%ax"
+#define KPROBE_GEN_TEST_ARG1	"filename=%dx"
+#define KPROBE_GEN_TEST_ARG2	"flags=%cx"
+#define KPROBE_GEN_TEST_ARG3	"mode=+4($stack)"
+
+/* ARM64 */
+#elif defined(CONFIG_ARM64)
+#define KPROBE_GEN_TEST_ARG0	"dfd=%x0"
+#define KPROBE_GEN_TEST_ARG1	"filename=%x1"
+#define KPROBE_GEN_TEST_ARG2	"flags=%x2"
+#define KPROBE_GEN_TEST_ARG3	"mode=%x3"
+
+/* ARM */
+#elif defined(CONFIG_ARM)
+#define KPROBE_GEN_TEST_ARG0	"dfd=%r0"
+#define KPROBE_GEN_TEST_ARG1	"filename=%r1"
+#define KPROBE_GEN_TEST_ARG2	"flags=%r2"
+#define KPROBE_GEN_TEST_ARG3	"mode=%r3"
+
+/* RISCV */
+#elif defined(CONFIG_RISCV)
+#define KPROBE_GEN_TEST_ARG0	"dfd=%a0"
+#define KPROBE_GEN_TEST_ARG1	"filename=%a1"
+#define KPROBE_GEN_TEST_ARG2	"flags=%a2"
+#define KPROBE_GEN_TEST_ARG3	"mode=%a3"
+
+/* others */
+#else
+#define KPROBE_GEN_TEST_ARG0	NULL
+#define KPROBE_GEN_TEST_ARG1	NULL
+#define KPROBE_GEN_TEST_ARG2	NULL
+#define KPROBE_GEN_TEST_ARG3	NULL
+#endif
+
+
 /*
  * Test to make sure we can create a kprobe event, then add more
  * fields.
@@ -58,14 +97,14 @@ static int __init test_gen_kprobe_cmd(void)
 	 * fields.
 	 */
 	ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test",
-					 "do_sys_open",
-					 "dfd=%ax", "filename=%dx");
+					 KPROBE_GEN_TEST_FUNC,
+					 KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1);
 	if (ret)
 		goto free;
 
 	/* Use kprobe_event_add_fields to add the rest of the fields */
 
-	ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)");
+	ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3);
 	if (ret)
 		goto free;
 
@@ -128,7 +167,7 @@ static int __init test_gen_kretprobe_cmd(void)
 	 * Define the kretprobe event.
 	 */
 	ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test",
-					    "do_sys_open",
+					    KPROBE_GEN_TEST_FUNC,
 					    "$retval");
 	if (ret)
 		goto free;

From 99ee9317a1305cd5626736785c8cb38b0e47686c Mon Sep 17 00:00:00 2001
From: Nico Pache <npache@redhat.com>
Date: Mon, 19 Sep 2022 08:49:32 -0600
Subject: [PATCH 14/44] tracing/osnoise: Fix possible recursive locking in
 stop_per_cpu_kthreads

There is a recursive lock on the cpu_hotplug_lock.

In kernel/trace/trace_osnoise.c:<start/stop>_per_cpu_kthreads:
    - start_per_cpu_kthreads calls cpus_read_lock() and if
	start_kthreads returns a error it will call stop_per_cpu_kthreads.
    - stop_per_cpu_kthreads then calls cpus_read_lock() again causing
      deadlock.

Fix this by calling cpus_read_unlock() before calling
stop_per_cpu_kthreads. This behavior can also be seen in commit
f46b16520a08 ("trace/hwlat: Implement the per-cpu mode").

This error was noticed during the LTP ftrace-stress-test:

WARNING: possible recursive locking detected
--------------------------------------------
sh/275006 is trying to acquire lock:
ffffffffb02f5400 (cpu_hotplug_lock){++++}-{0:0}, at: stop_per_cpu_kthreads

but task is already holding lock:
ffffffffb02f5400 (cpu_hotplug_lock){++++}-{0:0}, at: start_per_cpu_kthreads

other info that might help us debug this:
 Possible unsafe locking scenario:

      CPU0
      ----
 lock(cpu_hotplug_lock);
 lock(cpu_hotplug_lock);

 *** DEADLOCK ***

May be due to missing lock nesting notation

3 locks held by sh/275006:
 #0: ffff8881023f0470 (sb_writers#24){.+.+}-{0:0}, at: ksys_write
 #1: ffffffffb084f430 (trace_types_lock){+.+.}-{3:3}, at: rb_simple_write
 #2: ffffffffb02f5400 (cpu_hotplug_lock){++++}-{0:0}, at: start_per_cpu_kthreads

Link: https://lkml.kernel.org/r/20220919144932.3064014-1-npache@redhat.com

Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations")
Signed-off-by: Nico Pache <npache@redhat.com>
Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_osnoise.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 313439920a8ce9..78d536d3ff3dbc 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1786,8 +1786,9 @@ static int start_per_cpu_kthreads(void)
 	for_each_cpu(cpu, current_mask) {
 		retval = start_kthread(cpu);
 		if (retval) {
+			cpus_read_unlock();
 			stop_per_cpu_kthreads();
-			break;
+			return retval;
 		}
 	}
 

From 834168fb2ce57681dee86a405ec560f54417830c Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Thu, 22 Sep 2022 18:32:08 +0800
Subject: [PATCH 15/44] rv/monitor: Add __init/__exit annotations to module
 init/exit funcs

Add missing __init/__exit annotations to module init/exit funcs.

Link: https://lkml.kernel.org/r/20220922103208.162869-1-xiujianfeng@huawei.com

Fixes: 24bce201d798 ("tools/rv: Add dot2k")
Fixes: 8812d21219b9 ("rv/monitor: Add the wip monitor skeleton created by dot2k")
Fixes: ccc319dcb450 ("rv/monitor: Add the wwnr monitor")
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Acked-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/rv/monitors/wip/wip.c                      | 4 ++--
 kernel/trace/rv/monitors/wwnr/wwnr.c                    | 4 ++--
 tools/verification/dot2/dot2k_templates/main_global.c   | 4 ++--
 tools/verification/dot2/dot2k_templates/main_per_cpu.c  | 4 ++--
 tools/verification/dot2/dot2k_templates/main_per_task.c | 4 ++--
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index 1a989bc142f346..b2b49a27e8863b 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -69,13 +69,13 @@ static struct rv_monitor rv_wip = {
 	.enabled = 0,
 };
 
-static int register_wip(void)
+static int __init register_wip(void)
 {
 	rv_register_monitor(&rv_wip);
 	return 0;
 }
 
-static void unregister_wip(void)
+static void __exit unregister_wip(void)
 {
 	rv_unregister_monitor(&rv_wip);
 }
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index a063b93c6a1d37..0e43dd2db685d0 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -68,13 +68,13 @@ static struct rv_monitor rv_wwnr = {
 	.enabled = 0,
 };
 
-static int register_wwnr(void)
+static int __init register_wwnr(void)
 {
 	rv_register_monitor(&rv_wwnr);
 	return 0;
 }
 
-static void unregister_wwnr(void)
+static void __exit unregister_wwnr(void)
 {
 	rv_unregister_monitor(&rv_wwnr);
 }
diff --git a/tools/verification/dot2/dot2k_templates/main_global.c b/tools/verification/dot2/dot2k_templates/main_global.c
index dcd1162dced8b2..a5658bfb904455 100644
--- a/tools/verification/dot2/dot2k_templates/main_global.c
+++ b/tools/verification/dot2/dot2k_templates/main_global.c
@@ -72,13 +72,13 @@ static struct rv_monitor rv_MODEL_NAME = {
 	.enabled = 0,
 };
 
-static int register_MODEL_NAME(void)
+static int __init register_MODEL_NAME(void)
 {
 	rv_register_monitor(&rv_MODEL_NAME);
 	return 0;
 }
 
-static void unregister_MODEL_NAME(void)
+static void __exit unregister_MODEL_NAME(void)
 {
 	rv_unregister_monitor(&rv_MODEL_NAME);
 }
diff --git a/tools/verification/dot2/dot2k_templates/main_per_cpu.c b/tools/verification/dot2/dot2k_templates/main_per_cpu.c
index 8f877e86a22fc5..03539a97633f14 100644
--- a/tools/verification/dot2/dot2k_templates/main_per_cpu.c
+++ b/tools/verification/dot2/dot2k_templates/main_per_cpu.c
@@ -72,13 +72,13 @@ static struct rv_monitor rv_MODEL_NAME = {
 	.enabled = 0,
 };
 
-static int register_MODEL_NAME(void)
+static int __init register_MODEL_NAME(void)
 {
 	rv_register_monitor(&rv_MODEL_NAME);
 	return 0;
 }
 
-static void unregister_MODEL_NAME(void)
+static void __exit unregister_MODEL_NAME(void)
 {
 	rv_unregister_monitor(&rv_MODEL_NAME);
 }
diff --git a/tools/verification/dot2/dot2k_templates/main_per_task.c b/tools/verification/dot2/dot2k_templates/main_per_task.c
index 8c2fdb82463446..ffd92af87a8694 100644
--- a/tools/verification/dot2/dot2k_templates/main_per_task.c
+++ b/tools/verification/dot2/dot2k_templates/main_per_task.c
@@ -72,13 +72,13 @@ static struct rv_monitor rv_MODEL_NAME = {
 	.enabled = 0,
 };
 
-static int register_MODEL_NAME(void)
+static int __init register_MODEL_NAME(void)
 {
 	rv_register_monitor(&rv_MODEL_NAME);
 	return 0;
 }
 
-static void unregister_MODEL_NAME(void)
+static void __exit unregister_MODEL_NAME(void)
 {
 	rv_unregister_monitor(&rv_MODEL_NAME);
 }

From c0a581d7126c0bbc96163276f585fd7b4e4d8d0e Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 22 Sep 2022 10:56:22 -0400
Subject: [PATCH 16/44] tracing: Disable interrupt or preemption before
 acquiring arch_spinlock_t

It was found that some tracing functions in kernel/trace/trace.c acquire
an arch_spinlock_t with preemption and irqs enabled. An example is the
tracing_saved_cmdlines_size_read() function which intermittently causes
a "BUG: using smp_processor_id() in preemptible" warning when the LTP
read_all_proc test is run.

That can be problematic in case preemption happens after acquiring the
lock. Add the necessary preemption or interrupt disabling code in the
appropriate places before acquiring an arch_spinlock_t.

The convention here is to disable preemption for trace_cmdline_lock and
interupt for max_lock.

Link: https://lkml.kernel.org/r/20220922145622.1744826-1-longman@redhat.com

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: stable@vger.kernel.org
Fixes: a35873a0993b ("tracing: Add conditional snapshot")
Fixes: 939c7a4f04fc ("tracing: Introduce saved_cmdlines_size file")
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d3005279165d9c..aed7ea6e6045c3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1193,12 +1193,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr)
 {
 	void *cond_data = NULL;
 
+	local_irq_disable();
 	arch_spin_lock(&tr->max_lock);
 
 	if (tr->cond_snapshot)
 		cond_data = tr->cond_snapshot->cond_data;
 
 	arch_spin_unlock(&tr->max_lock);
+	local_irq_enable();
 
 	return cond_data;
 }
@@ -1334,9 +1336,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
 		goto fail_unlock;
 	}
 
+	local_irq_disable();
 	arch_spin_lock(&tr->max_lock);
 	tr->cond_snapshot = cond_snapshot;
 	arch_spin_unlock(&tr->max_lock);
+	local_irq_enable();
 
 	mutex_unlock(&trace_types_lock);
 
@@ -1363,6 +1367,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
 {
 	int ret = 0;
 
+	local_irq_disable();
 	arch_spin_lock(&tr->max_lock);
 
 	if (!tr->cond_snapshot)
@@ -1373,6 +1378,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr)
 	}
 
 	arch_spin_unlock(&tr->max_lock);
+	local_irq_enable();
 
 	return ret;
 }
@@ -2200,6 +2206,11 @@ static size_t tgid_map_max;
 
 #define SAVED_CMDLINES_DEFAULT 128
 #define NO_CMDLINE_MAP UINT_MAX
+/*
+ * Preemption must be disabled before acquiring trace_cmdline_lock.
+ * The various trace_arrays' max_lock must be acquired in a context
+ * where interrupt is disabled.
+ */
 static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 struct saved_cmdlines_buffer {
 	unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -2412,7 +2423,11 @@ static int trace_save_cmdline(struct task_struct *tsk)
 	 * the lock, but we also don't want to spin
 	 * nor do we want to disable interrupts,
 	 * so if we miss here, then better luck next time.
+	 *
+	 * This is called within the scheduler and wake up, so interrupts
+	 * had better been disabled and run queue lock been held.
 	 */
+	lockdep_assert_preemption_disabled();
 	if (!arch_spin_trylock(&trace_cmdline_lock))
 		return 0;
 
@@ -5890,9 +5905,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
 	char buf[64];
 	int r;
 
+	preempt_disable();
 	arch_spin_lock(&trace_cmdline_lock);
 	r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
 	arch_spin_unlock(&trace_cmdline_lock);
+	preempt_enable();
 
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
@@ -5917,10 +5934,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
 		return -ENOMEM;
 	}
 
+	preempt_disable();
 	arch_spin_lock(&trace_cmdline_lock);
 	savedcmd_temp = savedcmd;
 	savedcmd = s;
 	arch_spin_unlock(&trace_cmdline_lock);
+	preempt_enable();
 	free_saved_cmdlines_buffer(savedcmd_temp);
 
 	return 0;
@@ -6373,10 +6392,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
 
 #ifdef CONFIG_TRACER_SNAPSHOT
 	if (t->use_max_tr) {
+		local_irq_disable();
 		arch_spin_lock(&tr->max_lock);
 		if (tr->cond_snapshot)
 			ret = -EBUSY;
 		arch_spin_unlock(&tr->max_lock);
+		local_irq_enable();
 		if (ret)
 			goto out;
 	}
@@ -7436,10 +7457,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		goto out;
 	}
 
+	local_irq_disable();
 	arch_spin_lock(&tr->max_lock);
 	if (tr->cond_snapshot)
 		ret = -EBUSY;
 	arch_spin_unlock(&tr->max_lock);
+	local_irq_enable();
 	if (ret)
 		goto out;
 

From 3008119a3dd8052b7e5c07488e20a7abb0d287f2 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Fri, 23 Sep 2022 17:00:12 +0800
Subject: [PATCH 17/44] ftrace: Remove obsoleted code from ftrace and
 task_struct

The trace of "struct task_struct" was no longer used since
commit 345ddcc882d8 ("ftrace: Have set_ftrace_pid use the
bitmap like events do"), and the functions about flags for
current->trace is useless, so remove them.

Link: https://lkml.kernel.org/r/20220923090012.505990-1-cuigaosheng1@huawei.com

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 41 -----------------------------------------
 include/linux/sched.h  |  3 ---
 2 files changed, 44 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0b61371e287b94..62557d4bffc2b6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1122,47 +1122,6 @@ static inline void unpause_graph_tracing(void) { }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 #ifdef CONFIG_TRACING
-
-/* flags for current->trace */
-enum {
-	TSK_TRACE_FL_TRACE_BIT	= 0,
-	TSK_TRACE_FL_GRAPH_BIT	= 1,
-};
-enum {
-	TSK_TRACE_FL_TRACE	= 1 << TSK_TRACE_FL_TRACE_BIT,
-	TSK_TRACE_FL_GRAPH	= 1 << TSK_TRACE_FL_GRAPH_BIT,
-};
-
-static inline void set_tsk_trace_trace(struct task_struct *tsk)
-{
-	set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
-}
-
-static inline void clear_tsk_trace_trace(struct task_struct *tsk)
-{
-	clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
-}
-
-static inline int test_tsk_trace_trace(struct task_struct *tsk)
-{
-	return tsk->trace & TSK_TRACE_FL_TRACE;
-}
-
-static inline void set_tsk_trace_graph(struct task_struct *tsk)
-{
-	set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
-}
-
-static inline void clear_tsk_trace_graph(struct task_struct *tsk)
-{
-	clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
-}
-
-static inline int test_tsk_trace_graph(struct task_struct *tsk)
-{
-	return tsk->trace & TSK_TRACE_FL_GRAPH;
-}
-
 enum ftrace_dump_mode;
 
 extern enum ftrace_dump_mode ftrace_dump_on_oops;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a5c711c1..c7ee04e9147a0f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1381,9 +1381,6 @@ struct task_struct {
 #endif
 
 #ifdef CONFIG_TRACING
-	/* State flags for use by tracers: */
-	unsigned long			trace;
-
 	/* Bitmask and counter of trace recursion: */
 	unsigned long			trace_recursion;
 #endif /* CONFIG_TRACING */

From ae398ad89456847aab9f12b0e63c51443af5da48 Mon Sep 17 00:00:00 2001
From: Chen Zhongjin <chenzhongjin@huawei.com>
Date: Sat, 24 Sep 2022 15:26:29 +0800
Subject: [PATCH 18/44] x86: kprobes: Remove unused macro stack_addr

An unused macro reported by [-Wunused-macros].

This macro is used to access the sp in pt_regs because at that time
x86_32 can only get sp by kernel_stack_pointer(regs).

'3c88c692c287 ("x86/stackframe/32: Provide consistent pt_regs")'
This commit have unified the pt_regs and from them we can get sp from
pt_regs with regs->sp easily. Nowhere is using this macro anymore.

Refrencing pt_regs directly is more clear. Remove this macro for
code cleaning.

Link: https://lkml.kernel.org/r/20220924072629.104759-1-chenzhongjin@huawei.com

Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 arch/x86/kernel/kprobes/core.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4c3c27b6aea3b5..eb8bc82846b997 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -59,8 +59,6 @@
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
-#define stack_addr(regs) ((unsigned long *)regs->sp)
-
 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
 	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
 	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \

From dc399adecd4e2826868e5d116a58e33071b18346 Mon Sep 17 00:00:00 2001
From: Tao Chen <chentao.kernel@linux.alibaba.com>
Date: Sat, 24 Sep 2022 22:13:34 +0800
Subject: [PATCH 19/44] tracing/eprobe: Fix alloc event dir failed when event
 name no set

The event dir will alloc failed when event name no set, using the
command:
"echo "e:esys/ syscalls/sys_enter_openat file=\$filename:string"
>> dynamic_events"
It seems that dir name="syscalls/sys_enter_openat" is not allowed
in debugfs. So just use the "sys_enter_openat" as the event name.

Link: https://lkml.kernel.org/r/1664028814-45923-1-git-send-email-chentao.kernel@linux.alibaba.com

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Tom Zanussi <zanussi@kernel.org>
Cc: Linyu Yuan <quic_linyyuan@quicinc.com>
Cc: Tao Chen <chentao.kernel@linux.alibaba.com
Cc: stable@vger.kernel.org
Fixes: 95c104c378dc ("tracing: Auto generate event name when creating a group of events")
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Tao Chen <chentao.kernel@linux.alibaba.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_eprobe.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 78299d3724a294..c08bde9871ec52 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -1039,8 +1039,7 @@ static int __trace_eprobe_create(int argc, const char *argv[])
 	}
 
 	if (!event) {
-		strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN);
-		sanitize_event_name(buf1);
+		strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
 		event = buf1;
 	}
 

From 0ce0638edf5ec83343302b884fa208179580700a Mon Sep 17 00:00:00 2001
From: Zheng Yejian <zhengyejian1@huawei.com>
Date: Mon, 26 Sep 2022 15:20:08 +0000
Subject: [PATCH 20/44] ftrace: Properly unset FTRACE_HASH_FL_MOD

When executing following commands like what document said, but the log
"#### all functions enabled ####" was not shown as expect:
  1. Set a 'mod' filter:
    $ echo 'write*:mod:ext3' > /sys/kernel/tracing/set_ftrace_filter
  2. Invert above filter:
    $ echo '!write*:mod:ext3' >> /sys/kernel/tracing/set_ftrace_filter
  3. Read the file:
    $ cat /sys/kernel/tracing/set_ftrace_filter

By some debugging, I found that flag FTRACE_HASH_FL_MOD was not unset
after inversion like above step 2 and then result of ftrace_hash_empty()
is incorrect.

Link: https://lkml.kernel.org/r/20220926152008.2239274-1-zhengyejian1@huawei.com

Cc: <mingo@redhat.com>
Cc: stable@vger.kernel.org
Fixes: 8c08f0d5c6fb ("ftrace: Have cached module filters be an active filter")
Signed-off-by: Zheng Yejian <zhengyejian1@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 439e2ab6905ee1..5a1ec7e1af33d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6081,8 +6081,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
 
 		if (filter_hash) {
 			orig_hash = &iter->ops->func_hash->filter_hash;
-			if (iter->tr && !list_empty(&iter->tr->mod_trace))
-				iter->hash->flags |= FTRACE_HASH_FL_MOD;
+			if (iter->tr) {
+				if (list_empty(&iter->tr->mod_trace))
+					iter->hash->flags &= ~FTRACE_HASH_FL_MOD;
+				else
+					iter->hash->flags |= FTRACE_HASH_FL_MOD;
+			}
 		} else
 			orig_hash = &iter->ops->func_hash->notrace_hash;
 

From 9d2ce78ddcee159eb6a97449e9c68b6d60b9cec4 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 26 Sep 2022 17:41:46 -0700
Subject: [PATCH 21/44] ftrace: Fix recursive locking direct_mutex in
 ftrace_modify_direct_caller

Naveen reported recursive locking of direct_mutex with sample
ftrace-direct-modify.ko:

[   74.762406] WARNING: possible recursive locking detected
[   74.762887] 6.0.0-rc6+ #33 Not tainted
[   74.763216] --------------------------------------------
[   74.763672] event-sample-fn/1084 is trying to acquire lock:
[   74.764152] ffffffff86c9d6b0 (direct_mutex){+.+.}-{3:3}, at: \
    register_ftrace_function+0x1f/0x180
[   74.764922]
[   74.764922] but task is already holding lock:
[   74.765421] ffffffff86c9d6b0 (direct_mutex){+.+.}-{3:3}, at: \
    modify_ftrace_direct+0x34/0x1f0
[   74.766142]
[   74.766142] other info that might help us debug this:
[   74.766701]  Possible unsafe locking scenario:
[   74.766701]
[   74.767216]        CPU0
[   74.767437]        ----
[   74.767656]   lock(direct_mutex);
[   74.767952]   lock(direct_mutex);
[   74.768245]
[   74.768245]  *** DEADLOCK ***
[   74.768245]
[   74.768750]  May be due to missing lock nesting notation
[   74.768750]
[   74.769332] 1 lock held by event-sample-fn/1084:
[   74.769731]  #0: ffffffff86c9d6b0 (direct_mutex){+.+.}-{3:3}, at: \
    modify_ftrace_direct+0x34/0x1f0
[   74.770496]
[   74.770496] stack backtrace:
[   74.770884] CPU: 4 PID: 1084 Comm: event-sample-fn Not tainted ...
[   74.771498] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
[   74.772474] Call Trace:
[   74.772696]  <TASK>
[   74.772896]  dump_stack_lvl+0x44/0x5b
[   74.773223]  __lock_acquire.cold.74+0xac/0x2b7
[   74.773616]  lock_acquire+0xd2/0x310
[   74.773936]  ? register_ftrace_function+0x1f/0x180
[   74.774357]  ? lock_is_held_type+0xd8/0x130
[   74.774744]  ? my_tramp2+0x11/0x11 [ftrace_direct_modify]
[   74.775213]  __mutex_lock+0x99/0x1010
[   74.775536]  ? register_ftrace_function+0x1f/0x180
[   74.775954]  ? slab_free_freelist_hook.isra.43+0x115/0x160
[   74.776424]  ? ftrace_set_hash+0x195/0x220
[   74.776779]  ? register_ftrace_function+0x1f/0x180
[   74.777194]  ? kfree+0x3e1/0x440
[   74.777482]  ? my_tramp2+0x11/0x11 [ftrace_direct_modify]
[   74.777941]  ? __schedule+0xb40/0xb40
[   74.778258]  ? register_ftrace_function+0x1f/0x180
[   74.778672]  ? my_tramp1+0xf/0xf [ftrace_direct_modify]
[   74.779128]  register_ftrace_function+0x1f/0x180
[   74.779527]  ? ftrace_set_filter_ip+0x33/0x70
[   74.779910]  ? __schedule+0xb40/0xb40
[   74.780231]  ? my_tramp1+0xf/0xf [ftrace_direct_modify]
[   74.780678]  ? my_tramp2+0x11/0x11 [ftrace_direct_modify]
[   74.781147]  ftrace_modify_direct_caller+0x5b/0x90
[   74.781563]  ? 0xffffffffa0201000
[   74.781859]  ? my_tramp1+0xf/0xf [ftrace_direct_modify]
[   74.782309]  modify_ftrace_direct+0x1b2/0x1f0
[   74.782690]  ? __schedule+0xb40/0xb40
[   74.783014]  ? simple_thread+0x2a/0xb0 [ftrace_direct_modify]
[   74.783508]  ? __schedule+0xb40/0xb40
[   74.783832]  ? my_tramp2+0x11/0x11 [ftrace_direct_modify]
[   74.784294]  simple_thread+0x76/0xb0 [ftrace_direct_modify]
[   74.784766]  kthread+0xf5/0x120
[   74.785052]  ? kthread_complete_and_exit+0x20/0x20
[   74.785464]  ret_from_fork+0x22/0x30
[   74.785781]  </TASK>

Fix this by using register_ftrace_function_nolock in
ftrace_modify_direct_caller.

Link: https://lkml.kernel.org/r/20220927004146.1215303-1-song@kernel.org

Fixes: 53cd885bc5c3 ("ftrace: Allow IPMODIFY and DIRECT ops on the same function")
Reported-and-tested-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5a1ec7e1af33d3..406d0597c409a5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5427,6 +5427,8 @@ static struct ftrace_ops stub_ops = {
  * it is safe to modify the ftrace record, where it should be
  * currently calling @old_addr directly, to call @new_addr.
  *
+ * This is called with direct_mutex locked.
+ *
  * Safety checks should be made to make sure that the code at
  * @rec->ip is currently calling @old_addr. And this must
  * also update entry->direct to @new_addr.
@@ -5439,6 +5441,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
 	unsigned long ip = rec->ip;
 	int ret;
 
+	lockdep_assert_held(&direct_mutex);
+
 	/*
 	 * The ftrace_lock was used to determine if the record
 	 * had more than one registered user to it. If it did,
@@ -5461,7 +5465,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
 	if (ret)
 		goto out_lock;
 
-	ret = register_ftrace_function(&stub_ops);
+	ret = register_ftrace_function_nolock(&stub_ops);
 	if (ret) {
 		ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
 		goto out_lock;

From fa8f4a89736b654125fb254b0db753ac68a5fced Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 27 Sep 2022 14:43:17 -0400
Subject: [PATCH 22/44] ring-buffer: Allow splice to read previous partially
 read pages

If a page is partially read, and then the splice system call is run
against the ring buffer, it will always fail to read, no matter how much
is in the ring buffer. That's because the code path for a partial read of
the page does will fail if the "full" flag is set.

The splice system call wants full pages, so if the read of the ring buffer
is not yet full, it should return zero, and the splice will block. But if
a previous read was done, where the beginning has been consumed, it should
still be given to the splice caller if the rest of the page has been
written to.

This caused the splice command to never consume data in this scenario, and
let the ring buffer just fill up and lose events.

Link: https://lkml.kernel.org/r/20220927144317.46be6b80@gandalf.local.home

Cc: stable@vger.kernel.org
Fixes: 8789a9e7df6bf ("ring-buffer: read page interface")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d59b6a328b7fec..6b145d48dfd1d9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5616,7 +5616,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 		unsigned int pos = 0;
 		unsigned int size;
 
-		if (full)
+		/*
+		 * If a full page is expected, this can still be returned
+		 * if there's been a previous partial read and the
+		 * rest of the page can be read and the commit page is off
+		 * the reader page.
+		 */
+		if (full &&
+		    (!read || (len < (commit - read)) ||
+		     cpu_buffer->reader_page == cpu_buffer->commit_page))
 			goto out_unlock;
 
 		if (len > (commit - read))

From 3b19d614b61b93a131f463817e08219c9ce1fee3 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 27 Sep 2022 19:15:24 -0400
Subject: [PATCH 23/44] ring-buffer: Have the shortest_full queue be the
 shortest not longest

The logic to know when the shortest waiters on the ring buffer should be
woken up or not has uses a less than instead of a greater than compare,
which causes the shortest_full to actually be the longest.

Link: https://lkml.kernel.org/r/20220927231823.718039222@goodmis.org

Cc: stable@vger.kernel.org
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 2c2b0a78b3739 ("ring-buffer: Add percentage of ring buffer full to wake up reader")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6b145d48dfd1d9..02db92c9eb1ba7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1011,7 +1011,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
 			nr_pages = cpu_buffer->nr_pages;
 			dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
 			if (!cpu_buffer->shortest_full ||
-			    cpu_buffer->shortest_full < full)
+			    cpu_buffer->shortest_full > full)
 				cpu_buffer->shortest_full = full;
 			raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 			if (!pagebusy &&

From ec0bbc5ec5664dcee344f79373852117dc672c86 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 27 Sep 2022 19:15:25 -0400
Subject: [PATCH 24/44] ring-buffer: Check pending waiters when doing wake ups
 as well

The wake up waiters only checks the "wakeup_full" variable and not the
"full_waiters_pending". The full_waiters_pending is set when a waiter is
added to the wait queue. The wakeup_full is only set when an event is
triggered, and it clears the full_waiters_pending to avoid multiple calls
to irq_work_queue().

The irq_work callback really needs to check both wakeup_full as well as
full_waiters_pending such that this code can be used to wake up waiters
when a file is closed that represents the ring buffer and the waiters need
to be woken up.

Link: https://lkml.kernel.org/r/20220927231824.209460321@goodmis.org

Cc: stable@vger.kernel.org
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 02db92c9eb1ba7..5a7d818ca3ea38 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -917,8 +917,9 @@ static void rb_wake_up_waiters(struct irq_work *work)
 	struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
 
 	wake_up_all(&rbwork->waiters);
-	if (rbwork->wakeup_full) {
+	if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
 		rbwork->wakeup_full = false;
+		rbwork->full_waiters_pending = false;
 		wake_up_all(&rbwork->full_waiters);
 	}
 }

From 7e9fbbb1b776d8d7969551565bc246f74ec53b27 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 28 Sep 2022 13:39:38 -0400
Subject: [PATCH 25/44] ring-buffer: Add ring_buffer_wake_waiters()

On closing of a file that represents a ring buffer or flushing the file,
there may be waiters on the ring buffer that needs to be woken up and exit
the ring_buffer_wait() function.

Add ring_buffer_wake_waiters() to wake up the waiters on the ring buffer
and allow them to exit the wait loop.

Link: https://lkml.kernel.org/r/20220928133938.28dc2c27@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: 15693458c4bc0 ("tracing/ring-buffer: Move poll wake ups into ring buffer code")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  2 +-
 kernel/trace/ring_buffer.c  | 39 +++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index dac53fd3afea33..2504df9a0453e6 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -101,7 +101,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
 int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full);
 __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
 			  struct file *filp, poll_table *poll_table);
-
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu);
 
 #define RING_BUFFER_ALL_CPUS -1
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5a7d818ca3ea38..3046deacf7b332 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -413,6 +413,7 @@ struct rb_irq_work {
 	struct irq_work			work;
 	wait_queue_head_t		waiters;
 	wait_queue_head_t		full_waiters;
+	long				wait_index;
 	bool				waiters_pending;
 	bool				full_waiters_pending;
 	bool				wakeup_full;
@@ -924,6 +925,37 @@ static void rb_wake_up_waiters(struct irq_work *work)
 	}
 }
 
+/**
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
+ * @buffer: The ring buffer to wake waiters on
+ *
+ * In the case of a file that represents a ring buffer is closing,
+ * it is prudent to wake up any waiters that are on this.
+ */
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct rb_irq_work *rbwork;
+
+	if (cpu == RING_BUFFER_ALL_CPUS) {
+
+		/* Wake up individual ones too. One level recursion */
+		for_each_buffer_cpu(buffer, cpu)
+			ring_buffer_wake_waiters(buffer, cpu);
+
+		rbwork = &buffer->irq_work;
+	} else {
+		cpu_buffer = buffer->buffers[cpu];
+		rbwork = &cpu_buffer->irq_work;
+	}
+
+	rbwork->wait_index++;
+	/* make sure the waiters see the new index */
+	smp_wmb();
+
+	rb_wake_up_waiters(&rbwork->work);
+}
+
 /**
  * ring_buffer_wait - wait for input to the ring buffer
  * @buffer: buffer to wait on
@@ -939,6 +971,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	DEFINE_WAIT(wait);
 	struct rb_irq_work *work;
+	long wait_index;
 	int ret = 0;
 
 	/*
@@ -957,6 +990,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
 		work = &cpu_buffer->irq_work;
 	}
 
+	wait_index = READ_ONCE(work->wait_index);
 
 	while (true) {
 		if (full)
@@ -1021,6 +1055,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
 		}
 
 		schedule();
+
+		/* Make sure to see the new wait index */
+		smp_rmb();
+		if (wait_index != work->wait_index)
+			break;
 	}
 
 	if (full)

From f3ddb74ad0790030c9592229fb14d8c451f4e9a8 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Tue, 27 Sep 2022 19:15:27 -0400
Subject: [PATCH 26/44] tracing: Wake up ring buffer waiters on closing of the
 file

When the file that represents the ring buffer is closed, there may be
waiters waiting on more input from the ring buffer. Call
ring_buffer_wake_waiters() to wake up any waiters when the file is
closed.

Link: https://lkml.kernel.org/r/20220927231825.182416969@goodmis.org

Cc: stable@vger.kernel.org
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  1 +
 kernel/trace/trace.c         | 15 +++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8401dec93c155e..20749bd9db718c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -92,6 +92,7 @@ struct trace_iterator {
 	unsigned int		temp_size;
 	char			*fmt;	/* modified format holder */
 	unsigned int		fmt_size;
+	long			wait_index;
 
 	/* trace_seq for __print_flags() and __print_symbolic() etc. */
 	struct trace_seq	tmp_seq;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aed7ea6e6045c3..e101b0764b398b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8160,6 +8160,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 
 	__trace_array_put(iter->tr);
 
+	iter->wait_index++;
+	/* Make sure the waiters see the new wait_index */
+	smp_wmb();
+
+	ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
 	if (info->spare)
 		ring_buffer_free_read_page(iter->array_buffer->buffer,
 					   info->spare_cpu, info->spare);
@@ -8313,6 +8319,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 
 	/* did we read anything? */
 	if (!spd.nr_pages) {
+		long wait_index;
+
 		if (ret)
 			goto out;
 
@@ -8320,10 +8328,17 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
 			goto out;
 
+		wait_index = READ_ONCE(iter->wait_index);
+
 		ret = wait_on_pipe(iter, iter->tr->buffer_percent);
 		if (ret)
 			goto out;
 
+		/* Make sure we see the new wait_index */
+		smp_rmb();
+		if (wait_index != iter->wait_index)
+			goto out;
+
 		goto again;
 	}
 

From 01b2a52171735c6eea80ee2f355f32bea6c41418 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 29 Sep 2022 09:50:29 -0400
Subject: [PATCH 27/44] tracing: Add ioctl() to force ring buffer waiters to
 wake up

If a process is waiting on the ring buffer for data, there currently isn't
a clean way to force it to wake up. Add an ioctl call that will force any
tasks that are waiting on the trace_pipe_raw file to wake up.

Link: https://lkml.kernel.org/r/20220929095029.117f913f@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e101b0764b398b..58afc83afc9d5a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8349,12 +8349,34 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 	return ret;
 }
 
+/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
+static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct ftrace_buffer_info *info = file->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	if (cmd)
+		return -ENOIOCTLCMD;
+
+	mutex_lock(&trace_types_lock);
+
+	iter->wait_index++;
+	/* Make sure the waiters see the new wait_index */
+	smp_wmb();
+
+	ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file);
+
+	mutex_unlock(&trace_types_lock);
+	return 0;
+}
+
 static const struct file_operations tracing_buffers_fops = {
 	.open		= tracing_buffers_open,
 	.read		= tracing_buffers_read,
 	.poll		= tracing_buffers_poll,
 	.release	= tracing_buffers_release,
 	.splice_read	= tracing_buffers_splice_read,
+	.unlocked_ioctl = tracing_buffers_ioctl,
 	.llseek		= no_llseek,
 };
 

From 2b0fd9a59b7990c161fa1cb7b79edb22847c87c2 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 28 Sep 2022 18:22:20 -0400
Subject: [PATCH 28/44] tracing: Wake up waiters when tracing is disabled

When tracing is disabled, there's no reason that waiters should stay
waiting, wake them up, otherwise tasks get stuck when they should be
flushing the buffers.

Cc: stable@vger.kernel.org
Fixes: e30f53aad2202 ("tracing: Do not busy wait in buffer splice")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 58afc83afc9d5a..bb5597c6bfc122 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8334,6 +8334,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		if (ret)
 			goto out;
 
+		/* No need to wait after waking up when tracing is off */
+		if (!tracer_tracing_is_on(iter->tr))
+			goto out;
+
 		/* Make sure we see the new wait_index */
 		smp_rmb();
 		if (wait_index != iter->wait_index)
@@ -9065,6 +9069,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
 			tracer_tracing_off(tr);
 			if (tr->current_trace->stop)
 				tr->current_trace->stop(tr);
+			/* Wake up any waiters */
+			ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS);
 		}
 		mutex_unlock(&trace_types_lock);
 	}

From e841e8bfac490957fed3157326b96342dc76798a Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Wed, 28 Sep 2022 22:58:28 +0100
Subject: [PATCH 29/44] tracing: Fix spelling mistake "preapre" -> "prepare"

There is a spelling mistake in the trace text. Fix it.

Link: https://lkml.kernel.org/r/20220928215828.66325-1-colin.i.king@gmail.com

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb5597c6bfc122..def721de68a090 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -10157,7 +10157,7 @@ __init static int tracer_alloc_buffers(void)
 	 * buffer. The memory will be removed once the "instance" is removed.
 	 */
 	ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE,
-				      "trace/RB:preapre", trace_rb_cpu_prepare,
+				      "trace/RB:prepare", trace_rb_cpu_prepare,
 				      NULL);
 	if (ret < 0)
 		goto out_free_cpumask;

From 9cbf12343d595a1a5ecc84c4471b1fe52e2be19b Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Thu, 28 Jul 2022 16:33:04 -0700
Subject: [PATCH 30/44] tracing/user_events: Use NULL for strstr checks

Trivial fix to ensure strstr checks use NULL instead of 0.

Link: https://lkml.kernel.org/r/20220728233309.1896-2-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index a6621c52ce454f..075d694d20e3d7 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -277,7 +277,7 @@ static int user_event_add_field(struct user_event *user, const char *type,
 	goto add_field;
 
 add_validator:
-	if (strstr(type, "char") != 0)
+	if (strstr(type, "char") != NULL)
 		validator_flags |= VALIDATOR_ENSURE_NULL;
 
 	validator = kmalloc(sizeof(*validator), GFP_KERNEL);
@@ -458,7 +458,7 @@ static const char *user_field_format(const char *type)
 		return "%d";
 	if (strcmp(type, "unsigned char") == 0)
 		return "%u";
-	if (strstr(type, "char[") != 0)
+	if (strstr(type, "char[") != NULL)
 		return "%s";
 
 	/* Unknown, likely struct, allowed treat as 64-bit */
@@ -479,7 +479,7 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func)
 
 	return false;
 check:
-	return strstr(type, "char") != 0;
+	return strstr(type, "char") != NULL;
 }
 
 #define LEN_OR_ZERO (len ? len - pos : 0)

From 95f187603dbff69bef19b2ab3bb54d2c060f556d Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Thu, 28 Jul 2022 16:33:05 -0700
Subject: [PATCH 31/44] tracing/user_events: Use WRITE instead of READ for io
 vector import

import_single_range expects the direction/rw to be where it came from,
not the protection/limit. Since the import is in a write path use WRITE.

Link: https://lkml.kernel.org/r/20220728233309.1896-3-beaub@linux.microsoft.com
Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 075d694d20e3d7..15edbf6b1e2ed6 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -1245,7 +1245,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf,
 	if (unlikely(*ppos != 0))
 		return -EFAULT;
 
-	if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+	if (unlikely(import_single_range(WRITE, (char __user *)ubuf,
+					 count, &iov, &i)))
 		return -EFAULT;
 
 	return user_events_write_core(file, &i);

From e6f89a149872ab0e03cfded97983df74dfb0ef21 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Thu, 28 Jul 2022 16:33:06 -0700
Subject: [PATCH 32/44] tracing/user_events: Ensure user provided strings are
 safely formatted

User processes can provide bad strings that may cause issues or leak
kernel details back out. Don't trust the content of these strings
when formatting strings for matching.

This also moves to a consistent dynamic length string creation model.

Link: https://lkml.kernel.org/r/20220728233309.1896-4-beaub@linux.microsoft.com
Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 91 +++++++++++++++++++++-----------
 1 file changed, 59 insertions(+), 32 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 15edbf6b1e2ed6..f9bb7d37d76ff1 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -45,7 +45,6 @@
 #define MAX_EVENT_DESC 512
 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
-#define MAX_FIELD_ARG_NAME 256
 
 static char *register_page_data;
 
@@ -483,6 +482,48 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func)
 }
 
 #define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_dyn_field_set_string(int argc, const char **argv, int *iout,
+				     char *buf, int len, bool *colon)
+{
+	int pos = 0, i = *iout;
+
+	*colon = false;
+
+	for (; i < argc; ++i) {
+		if (i != *iout)
+			pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]);
+
+		if (strchr(argv[i], ';')) {
+			++i;
+			*colon = true;
+			break;
+		}
+	}
+
+	/* Actual set, advance i */
+	if (len != 0)
+		*iout = i;
+
+	return pos + 1;
+}
+
+static int user_field_set_string(struct ftrace_event_field *field,
+				 char *buf, int len, bool colon)
+{
+	int pos = 0;
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type);
+	pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name);
+
+	if (colon)
+		pos += snprintf(buf + pos, LEN_OR_ZERO, ";");
+
+	return pos + 1;
+}
+
 static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
 {
 	struct ftrace_event_field *field, *next;
@@ -926,49 +967,35 @@ static int user_event_free(struct dyn_event *ev)
 static bool user_field_match(struct ftrace_event_field *field, int argc,
 			     const char **argv, int *iout)
 {
-	char *field_name, *arg_name;
-	int len, pos, i = *iout;
+	char *field_name = NULL, *dyn_field_name = NULL;
 	bool colon = false, match = false;
+	int dyn_len, len;
 
-	if (i >= argc)
+	if (*iout >= argc)
 		return false;
 
-	len = MAX_FIELD_ARG_NAME;
-	field_name = kmalloc(len, GFP_KERNEL);
-	arg_name = kmalloc(len, GFP_KERNEL);
+	dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+					    0, &colon);
 
-	if (!arg_name || !field_name)
-		goto out;
-
-	pos = 0;
-
-	for (; i < argc; ++i) {
-		if (i != *iout)
-			pos += snprintf(arg_name + pos, len - pos, " ");
+	len = user_field_set_string(field, field_name, 0, colon);
 
-		pos += snprintf(arg_name + pos, len - pos, argv[i]);
-
-		if (strchr(argv[i], ';')) {
-			++i;
-			colon = true;
-			break;
-		}
-	}
+	if (dyn_len != len)
+		return false;
 
-	pos = 0;
+	dyn_field_name = kmalloc(dyn_len, GFP_KERNEL);
+	field_name = kmalloc(len, GFP_KERNEL);
 
-	pos += snprintf(field_name + pos, len - pos, field->type);
-	pos += snprintf(field_name + pos, len - pos, " ");
-	pos += snprintf(field_name + pos, len - pos, field->name);
+	if (!dyn_field_name || !field_name)
+		goto out;
 
-	if (colon)
-		pos += snprintf(field_name + pos, len - pos, ";");
+	user_dyn_field_set_string(argc, argv, iout, dyn_field_name,
+				  dyn_len, &colon);
 
-	*iout = i;
+	user_field_set_string(field, field_name, len, colon);
 
-	match = strcmp(arg_name, field_name) == 0;
+	match = strcmp(dyn_field_name, field_name) == 0;
 out:
-	kfree(arg_name);
+	kfree(dyn_field_name);
 	kfree(field_name);
 
 	return match;

From d401b72458562c2f2a81dad162de5c1b8e191e17 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Thu, 28 Jul 2022 16:33:07 -0700
Subject: [PATCH 33/44] tracing/user_events: Use refcount instead of atomic for
 ref tracking

User processes could open up enough event references to cause rollovers.
These could cause use after free scenarios, which we do not want.
Switching to refcount APIs prevent this, but will leak memory once
saturated.

Once saturated, user processes can still use the events. This prevents
a bad user process from stopping existing telemetry from being emitted.

Link: https://lkml.kernel.org/r/20220728233309.1896-5-beaub@linux.microsoft.com
Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/

Reported-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 53 +++++++++++++++-----------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index f9bb7d37d76ff1..2bcae7abfa81e5 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -14,6 +14,7 @@
 #include <linux/uio.h>
 #include <linux/ioctl.h>
 #include <linux/jhash.h>
+#include <linux/refcount.h>
 #include <linux/trace_events.h>
 #include <linux/tracefs.h>
 #include <linux/types.h>
@@ -57,7 +58,7 @@ static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
  * within a file a user_event might be created if it does not
  * already exist. These are globally used and their lifetime
  * is tied to the refcnt member. These cannot go away until the
- * refcnt reaches zero.
+ * refcnt reaches one.
  */
 struct user_event {
 	struct tracepoint tracepoint;
@@ -67,7 +68,7 @@ struct user_event {
 	struct hlist_node node;
 	struct list_head fields;
 	struct list_head validators;
-	atomic_t refcnt;
+	refcount_t refcnt;
 	int index;
 	int flags;
 	int min_size;
@@ -105,6 +106,12 @@ static u32 user_event_key(char *name)
 	return jhash(name, strlen(name), 0);
 }
 
+static __always_inline __must_check
+bool user_event_last_ref(struct user_event *user)
+{
+	return refcount_read(&user->refcnt) == 1;
+}
+
 static __always_inline __must_check
 size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
 {
@@ -662,7 +669,7 @@ static struct user_event *find_user_event(char *name, u32 *outkey)
 
 	hash_for_each_possible(register_table, user, node, key)
 		if (!strcmp(EVENT_NAME(user), name)) {
-			atomic_inc(&user->refcnt);
+			refcount_inc(&user->refcnt);
 			return user;
 		}
 
@@ -876,12 +883,12 @@ static int user_event_reg(struct trace_event_call *call,
 
 	return ret;
 inc:
-	atomic_inc(&user->refcnt);
+	refcount_inc(&user->refcnt);
 	update_reg_page_for(user);
 	return 0;
 dec:
 	update_reg_page_for(user);
-	atomic_dec(&user->refcnt);
+	refcount_dec(&user->refcnt);
 	return 0;
 }
 
@@ -907,7 +914,7 @@ static int user_event_create(const char *raw_command)
 	ret = user_event_parse_cmd(name, &user);
 
 	if (!ret)
-		atomic_dec(&user->refcnt);
+		refcount_dec(&user->refcnt);
 
 	mutex_unlock(&reg_mutex);
 
@@ -951,14 +958,14 @@ static bool user_event_is_busy(struct dyn_event *ev)
 {
 	struct user_event *user = container_of(ev, struct user_event, devent);
 
-	return atomic_read(&user->refcnt) != 0;
+	return !user_event_last_ref(user);
 }
 
 static int user_event_free(struct dyn_event *ev)
 {
 	struct user_event *user = container_of(ev, struct user_event, devent);
 
-	if (atomic_read(&user->refcnt) != 0)
+	if (!user_event_last_ref(user))
 		return -EBUSY;
 
 	return destroy_user_event(user);
@@ -1137,8 +1144,8 @@ static int user_event_parse(char *name, char *args, char *flags,
 
 	user->index = index;
 
-	/* Ensure we track ref */
-	atomic_inc(&user->refcnt);
+	/* Ensure we track self ref and caller ref (2) */
+	refcount_set(&user->refcnt, 2);
 
 	dyn_event_init(&user->devent, &user_event_dops);
 	dyn_event_add(&user->devent, &user->call);
@@ -1164,29 +1171,17 @@ static int user_event_parse(char *name, char *args, char *flags,
 static int delete_user_event(char *name)
 {
 	u32 key;
-	int ret;
 	struct user_event *user = find_user_event(name, &key);
 
 	if (!user)
 		return -ENOENT;
 
-	/* Ensure we are the last ref */
-	if (atomic_read(&user->refcnt) != 1) {
-		ret = -EBUSY;
-		goto put_ref;
-	}
-
-	ret = destroy_user_event(user);
+	refcount_dec(&user->refcnt);
 
-	if (ret)
-		goto put_ref;
-
-	return ret;
-put_ref:
-	/* No longer have this ref */
-	atomic_dec(&user->refcnt);
+	if (!user_event_last_ref(user))
+		return -EBUSY;
 
-	return ret;
+	return destroy_user_event(user);
 }
 
 /*
@@ -1314,7 +1309,7 @@ static int user_events_ref_add(struct file *file, struct user_event *user)
 
 	new_refs->events[i] = user;
 
-	atomic_inc(&user->refcnt);
+	refcount_inc(&user->refcnt);
 
 	rcu_assign_pointer(file->private_data, new_refs);
 
@@ -1374,7 +1369,7 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
 	ret = user_events_ref_add(file, user);
 
 	/* No longer need parse ref, ref_add either worked or not */
-	atomic_dec(&user->refcnt);
+	refcount_dec(&user->refcnt);
 
 	/* Positive number is index and valid */
 	if (ret < 0)
@@ -1464,7 +1459,7 @@ static int user_events_release(struct inode *node, struct file *file)
 		user = refs->events[i];
 
 		if (user)
-			atomic_dec(&user->refcnt);
+			refcount_dec(&user->refcnt);
 	}
 out:
 	file->private_data = NULL;

From 39d6d08b2edf99c4b39a689a70bf0adee065b357 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Thu, 28 Jul 2022 16:33:08 -0700
Subject: [PATCH 34/44] tracing/user_events: Use bits vs bytes for enabled
 status page data

User processes may require many events and when they do the cache
performance of a byte index status check is less ideal than a bit index.
The previous event limit per-page was 4096, the new limit is 32,768.

This change adds a bitwise index to the user_reg struct. Programs check
that the bit at status_bit has a bit set within the status page(s).

Link: https://lkml.kernel.org/r/20220728233309.1896-6-beaub@linux.microsoft.com
Link: https://lore.kernel.org/all/2059213643.196683.1648499088753.JavaMail.zimbra@efficios.com/

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/user_events.h                   | 15 +---
 kernel/trace/trace_events_user.c              | 75 +++++++++++++++++--
 samples/user_events/example.c                 | 25 +++++--
 .../selftests/user_events/ftrace_test.c       | 47 ++++++++++--
 .../testing/selftests/user_events/perf_test.c | 11 ++-
 5 files changed, 135 insertions(+), 38 deletions(-)

diff --git a/include/linux/user_events.h b/include/linux/user_events.h
index 736e0560346386..592a3fbed98e40 100644
--- a/include/linux/user_events.h
+++ b/include/linux/user_events.h
@@ -20,15 +20,6 @@
 #define USER_EVENTS_SYSTEM "user_events"
 #define USER_EVENTS_PREFIX "u:"
 
-/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */
-#define EVENT_BIT_FTRACE 0
-#define EVENT_BIT_PERF 1
-#define EVENT_BIT_OTHER 7
-
-#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE)
-#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF)
-#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER)
-
 /* Create dynamic location entry within a 32-bit value */
 #define DYN_LOC(offset, size) ((size) << 16 | (offset))
 
@@ -45,12 +36,12 @@ struct user_reg {
 	/* Input: Pointer to string with event name, description and flags */
 	__u64 name_args;
 
-	/* Output: Byte index of the event within the status page */
-	__u32 status_index;
+	/* Output: Bitwise index of the event within the status page */
+	__u32 status_bit;
 
 	/* Output: Index of the event to use when writing data */
 	__u32 write_index;
-};
+} __attribute__((__packed__));
 
 #define DIAG_IOC_MAGIC '*'
 
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 2bcae7abfa81e5..2c0a6ec755489b 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -40,17 +40,44 @@
  */
 #define MAX_PAGE_ORDER 0
 #define MAX_PAGES (1 << MAX_PAGE_ORDER)
-#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
+#define MAX_EVENTS (MAX_BYTES * 8)
 
 /* Limit how long of an event name plus args within the subsystem. */
 #define MAX_EVENT_DESC 512
 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
 #define MAX_FIELD_ARRAY_SIZE 1024
 
+/*
+ * The MAP_STATUS_* macros are used for taking a index and determining the
+ * appropriate byte and the bit in the byte to set/reset for an event.
+ *
+ * The lower 3 bits of the index decide which bit to set.
+ * The remaining upper bits of the index decide which byte to use for the bit.
+ *
+ * This is used when an event has a probe attached/removed to reflect live
+ * status of the event wanting tracing or not to user-programs via shared
+ * memory maps.
+ */
+#define MAP_STATUS_BYTE(index) ((index) >> 3)
+#define MAP_STATUS_MASK(index) BIT((index) & 7)
+
+/*
+ * Internal bits (kernel side only) to keep track of connected probes:
+ * These are used when status is requested in text form about an event. These
+ * bits are compared against an internal byte on the event to determine which
+ * probes to print out to the user.
+ *
+ * These do not reflect the mapped bytes between the user and kernel space.
+ */
+#define EVENT_STATUS_FTRACE BIT(0)
+#define EVENT_STATUS_PERF BIT(1)
+#define EVENT_STATUS_OTHER BIT(7)
+
 static char *register_page_data;
 
 static DEFINE_MUTEX(reg_mutex);
-static DEFINE_HASHTABLE(register_table, 4);
+static DEFINE_HASHTABLE(register_table, 8);
 static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
 
 /*
@@ -72,6 +99,7 @@ struct user_event {
 	int index;
 	int flags;
 	int min_size;
+	char status;
 };
 
 /*
@@ -106,6 +134,22 @@ static u32 user_event_key(char *name)
 	return jhash(name, strlen(name), 0);
 }
 
+static __always_inline
+void user_event_register_set(struct user_event *user)
+{
+	int i = user->index;
+
+	register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+}
+
+static __always_inline
+void user_event_register_clear(struct user_event *user)
+{
+	int i = user->index;
+
+	register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+}
+
 static __always_inline __must_check
 bool user_event_last_ref(struct user_event *user)
 {
@@ -648,7 +692,7 @@ static int destroy_user_event(struct user_event *user)
 
 	dyn_event_remove(&user->devent);
 
-	register_page_data[user->index] = 0;
+	user_event_register_clear(user);
 	clear_bit(user->index, page_bitmap);
 	hash_del(&user->node);
 
@@ -827,7 +871,12 @@ static void update_reg_page_for(struct user_event *user)
 		rcu_read_unlock_sched();
 	}
 
-	register_page_data[user->index] = status;
+	if (status)
+		user_event_register_set(user);
+	else
+		user_event_register_clear(user);
+
+	user->status = status;
 }
 
 /*
@@ -1332,7 +1381,17 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
 	if (size > PAGE_SIZE)
 		return -E2BIG;
 
-	return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+	if (size < offsetofend(struct user_reg, write_index))
+		return -EINVAL;
+
+	ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+	if (ret)
+		return ret;
+
+	kreg->size = size;
+
+	return 0;
 }
 
 /*
@@ -1376,7 +1435,7 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
 		return ret;
 
 	put_user((u32)ret, &ureg->write_index);
-	put_user(user->index, &ureg->status_index);
+	put_user(user->index, &ureg->status_bit);
 
 	return 0;
 }
@@ -1485,7 +1544,7 @@ static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	unsigned long size = vma->vm_end - vma->vm_start;
 
-	if (size != MAX_EVENTS)
+	if (size != MAX_BYTES)
 		return -EINVAL;
 
 	return remap_pfn_range(vma, vma->vm_start,
@@ -1520,7 +1579,7 @@ static int user_seq_show(struct seq_file *m, void *p)
 	mutex_lock(&reg_mutex);
 
 	hash_for_each(register_table, i, user, node) {
-		status = register_page_data[user->index];
+		status = user->status;
 		flags = user->flags;
 
 		seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
diff --git a/samples/user_events/example.c b/samples/user_events/example.c
index 4f5778e441c0b5..d06dc24156ec03 100644
--- a/samples/user_events/example.c
+++ b/samples/user_events/example.c
@@ -12,13 +12,21 @@
 #include <fcntl.h>
 #include <stdio.h>
 #include <unistd.h>
+#include <asm/bitsperlong.h>
+#include <endian.h>
 #include <linux/user_events.h>
 
+#if __BITS_PER_LONG == 64
+#define endian_swap(x) htole64(x)
+#else
+#define endian_swap(x) htole32(x)
+#endif
+
 /* Assumes debugfs is mounted */
 const char *data_file = "/sys/kernel/debug/tracing/user_events_data";
 const char *status_file = "/sys/kernel/debug/tracing/user_events_status";
 
-static int event_status(char **status)
+static int event_status(long **status)
 {
 	int fd = open(status_file, O_RDONLY);
 
@@ -33,7 +41,8 @@ static int event_status(char **status)
 	return 0;
 }
 
-static int event_reg(int fd, const char *command, int *status, int *write)
+static int event_reg(int fd, const char *command, long *index, long *mask,
+		     int *write)
 {
 	struct user_reg reg = {0};
 
@@ -43,7 +52,8 @@ static int event_reg(int fd, const char *command, int *status, int *write)
 	if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
 		return -1;
 
-	*status = reg.status_index;
+	*index = reg.status_bit / __BITS_PER_LONG;
+	*mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG));
 	*write = reg.write_index;
 
 	return 0;
@@ -51,8 +61,9 @@ static int event_reg(int fd, const char *command, int *status, int *write)
 
 int main(int argc, char **argv)
 {
-	int data_fd, status, write;
-	char *status_page;
+	int data_fd, write;
+	long index, mask;
+	long *status_page;
 	struct iovec io[2];
 	__u32 count = 0;
 
@@ -61,7 +72,7 @@ int main(int argc, char **argv)
 
 	data_fd = open(data_file, O_RDWR);
 
-	if (event_reg(data_fd, "test u32 count", &status, &write) == -1)
+	if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1)
 		return errno;
 
 	/* Setup iovec */
@@ -75,7 +86,7 @@ int main(int argc, char **argv)
 	getchar();
 
 	/* Check if anyone is listening */
-	if (status_page[status]) {
+	if (status_page[index] & mask) {
 		/* Yep, trace out our data */
 		writev(data_fd, (const struct iovec *)io, 2);
 
diff --git a/tools/testing/selftests/user_events/ftrace_test.c b/tools/testing/selftests/user_events/ftrace_test.c
index a80fb5ef61d504..404a2713dcae8c 100644
--- a/tools/testing/selftests/user_events/ftrace_test.c
+++ b/tools/testing/selftests/user_events/ftrace_test.c
@@ -22,6 +22,11 @@ const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_e
 const char *trace_file = "/sys/kernel/debug/tracing/trace";
 const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format";
 
+static inline int status_check(char *status_page, int status_bit)
+{
+	return status_page[status_bit >> 3] & (1 << (status_bit & 7));
+}
+
 static int trace_bytes(void)
 {
 	int fd = open(trace_file, O_RDONLY);
@@ -197,12 +202,12 @@ TEST_F(user, register_events) {
 	/* Register should work */
 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
 	ASSERT_EQ(0, reg.write_index);
-	ASSERT_NE(0, reg.status_index);
+	ASSERT_NE(0, reg.status_bit);
 
 	/* Multiple registers should result in same index */
 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
 	ASSERT_EQ(0, reg.write_index);
-	ASSERT_NE(0, reg.status_index);
+	ASSERT_NE(0, reg.status_bit);
 
 	/* Ensure disabled */
 	self->enable_fd = open(enable_file, O_RDWR);
@@ -212,15 +217,15 @@ TEST_F(user, register_events) {
 	/* MMAP should work and be zero'd */
 	ASSERT_NE(MAP_FAILED, status_page);
 	ASSERT_NE(NULL, status_page);
-	ASSERT_EQ(0, status_page[reg.status_index]);
+	ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
 	/* Enable event and ensure bits updated in status */
 	ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
-	ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]);
+	ASSERT_NE(0, status_check(status_page, reg.status_bit));
 
 	/* Disable event and ensure bits updated in status */
 	ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0")))
-	ASSERT_EQ(0, status_page[reg.status_index]);
+	ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
 	/* File still open should return -EBUSY for delete */
 	ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event"));
@@ -240,6 +245,8 @@ TEST_F(user, write_events) {
 	struct iovec io[3];
 	__u32 field1, field2;
 	int before = 0, after = 0;
+	int page_size = sysconf(_SC_PAGESIZE);
+	char *status_page;
 
 	reg.size = sizeof(reg);
 	reg.name_args = (__u64)"__test_event u32 field1; u32 field2";
@@ -254,10 +261,18 @@ TEST_F(user, write_events) {
 	io[2].iov_base = &field2;
 	io[2].iov_len = sizeof(field2);
 
+	status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
+			   self->status_fd, 0);
+
 	/* Register should work */
 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
 	ASSERT_EQ(0, reg.write_index);
-	ASSERT_NE(0, reg.status_index);
+	ASSERT_NE(0, reg.status_bit);
+
+	/* MMAP should work and be zero'd */
+	ASSERT_NE(MAP_FAILED, status_page);
+	ASSERT_NE(NULL, status_page);
+	ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
 	/* Write should fail on invalid slot with ENOENT */
 	io[0].iov_base = &field2;
@@ -271,6 +286,9 @@ TEST_F(user, write_events) {
 	self->enable_fd = open(enable_file, O_RDWR);
 	ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
 
+	/* Event should now be enabled */
+	ASSERT_NE(0, status_check(status_page, reg.status_bit));
+
 	/* Write should make it out to ftrace buffers */
 	before = trace_bytes();
 	ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3));
@@ -298,7 +316,7 @@ TEST_F(user, write_fault) {
 	/* Register should work */
 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
 	ASSERT_EQ(0, reg.write_index);
-	ASSERT_NE(0, reg.status_index);
+	ASSERT_NE(0, reg.status_bit);
 
 	/* Write should work normally */
 	ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2));
@@ -315,6 +333,11 @@ TEST_F(user, write_validator) {
 	int loc, bytes;
 	char data[8];
 	int before = 0, after = 0;
+	int page_size = sysconf(_SC_PAGESIZE);
+	char *status_page;
+
+	status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
+			   self->status_fd, 0);
 
 	reg.size = sizeof(reg);
 	reg.name_args = (__u64)"__test_event __rel_loc char[] data";
@@ -322,7 +345,12 @@ TEST_F(user, write_validator) {
 	/* Register should work */
 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
 	ASSERT_EQ(0, reg.write_index);
-	ASSERT_NE(0, reg.status_index);
+	ASSERT_NE(0, reg.status_bit);
+
+	/* MMAP should work and be zero'd */
+	ASSERT_NE(MAP_FAILED, status_page);
+	ASSERT_NE(NULL, status_page);
+	ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
 	io[0].iov_base = &reg.write_index;
 	io[0].iov_len = sizeof(reg.write_index);
@@ -340,6 +368,9 @@ TEST_F(user, write_validator) {
 	self->enable_fd = open(enable_file, O_RDWR);
 	ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
 
+	/* Event should now be enabled */
+	ASSERT_NE(0, status_check(status_page, reg.status_bit));
+
 	/* Full in-bounds write should work */
 	before = trace_bytes();
 	loc = DYN_LOC(0, bytes);
diff --git a/tools/testing/selftests/user_events/perf_test.c b/tools/testing/selftests/user_events/perf_test.c
index 26851d51d6bbe4..8b4c7879d5a729 100644
--- a/tools/testing/selftests/user_events/perf_test.c
+++ b/tools/testing/selftests/user_events/perf_test.c
@@ -35,6 +35,11 @@ static long perf_event_open(struct perf_event_attr *pe, pid_t pid,
 	return syscall(__NR_perf_event_open, pe, pid, cpu, group_fd, flags);
 }
 
+static inline int status_check(char *status_page, int status_bit)
+{
+	return status_page[status_bit >> 3] & (1 << (status_bit & 7));
+}
+
 static int get_id(void)
 {
 	FILE *fp = fopen(id_file, "r");
@@ -120,8 +125,8 @@ TEST_F(user, perf_write) {
 	/* Register should work */
 	ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
 	ASSERT_EQ(0, reg.write_index);
-	ASSERT_NE(0, reg.status_index);
-	ASSERT_EQ(0, status_page[reg.status_index]);
+	ASSERT_NE(0, reg.status_bit);
+	ASSERT_EQ(0, status_check(status_page, reg.status_bit));
 
 	/* Id should be there */
 	id = get_id();
@@ -144,7 +149,7 @@ TEST_F(user, perf_write) {
 	ASSERT_NE(MAP_FAILED, perf_page);
 
 	/* Status should be updated */
-	ASSERT_EQ(EVENT_STATUS_PERF, status_page[reg.status_index]);
+	ASSERT_NE(0, status_check(status_page, reg.status_bit));
 
 	event.index = reg.write_index;
 	event.field1 = 0xc001;

From 933678b6183bbe7afa332e70132065db3305ee44 Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Thu, 28 Jul 2022 16:33:09 -0700
Subject: [PATCH 35/44] tracing/user_events: Update ABI documentation to align
 to bits vs bytes

Update the documentation to reflect the new ABI requirements and how to
use the byte index with the mask properly to check event status.

Link: https://lkml.kernel.org/r/20220728233309.1896-7-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 Documentation/trace/user_events.rst | 86 +++++++++++++++++++----------
 1 file changed, 58 insertions(+), 28 deletions(-)

diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst
index c180936f49fc8b..9f181f342a7031 100644
--- a/Documentation/trace/user_events.rst
+++ b/Documentation/trace/user_events.rst
@@ -20,14 +20,14 @@ dynamic_events is the same as the ioctl with the u: prefix applied.
 
 Typically programs will register a set of events that they wish to expose to
 tools that can read trace_events (such as ftrace and perf). The registration
-process gives back two ints to the program for each event. The first int is the
-status index. This index describes which byte in the
+process gives back two ints to the program for each event. The first int is
+the status bit. This describes which bit in little-endian format in the
 /sys/kernel/debug/tracing/user_events_status file represents this event. The
-second int is the write index. This index describes the data when a write() or
+second int is the write index which describes the data when a write() or
 writev() is called on the /sys/kernel/debug/tracing/user_events_data file.
 
-The structures referenced in this document are contained with the
-/include/uap/linux/user_events.h file in the source tree.
+The structures referenced in this document are contained within the
+/include/uapi/linux/user_events.h file in the source tree.
 
 **NOTE:** *Both user_events_status and user_events_data are under the tracefs
 filesystem and may be mounted at different paths than above.*
@@ -38,18 +38,18 @@ Registering within a user process is done via ioctl() out to the
 /sys/kernel/debug/tracing/user_events_data file. The command to issue is
 DIAG_IOCSREG.
 
-This command takes a struct user_reg as an argument::
+This command takes a packed struct user_reg as an argument::
 
   struct user_reg {
         u32 size;
         u64 name_args;
-        u32 status_index;
+        u32 status_bit;
         u32 write_index;
   };
 
 The struct user_reg requires two inputs, the first is the size of the structure
 to ensure forward and backward compatibility. The second is the command string
-to issue for registering. Upon success two outputs are set, the status index
+to issue for registering. Upon success two outputs are set, the status bit
 and the write index.
 
 User based events show up under tracefs like any other event under the
@@ -111,15 +111,56 @@ in realtime. This allows user programs to only incur the cost of the write() or
 writev() calls when something is actively attached to the event.
 
 User programs call mmap() on /sys/kernel/debug/tracing/user_events_status to
-check the status for each event that is registered. The byte to check in the
-file is given back after the register ioctl() via user_reg.status_index.
+check the status for each event that is registered. The bit to check in the
+file is given back after the register ioctl() via user_reg.status_bit. The bit
+is always in little-endian format. Programs can check if the bit is set either
+using a byte-wise index with a mask or a long-wise index with a little-endian
+mask.
+
 Currently the size of user_events_status is a single page, however, custom
 kernel configurations can change this size to allow more user based events. In
 all cases the size of the file is a multiple of a page size.
 
-For example, if the register ioctl() gives back a status_index of 3 you would
-check byte 3 of the returned mmap data to see if anything is attached to that
-event.
+For example, if the register ioctl() gives back a status_bit of 3 you would
+check byte 0 (3 / 8) of the returned mmap data and then AND the result with 8
+(1 << (3 % 8)) to see if anything is attached to that event.
+
+A byte-wise index check is performed as follows::
+
+  int index, mask;
+  char *status_page;
+
+  index = status_bit / 8;
+  mask = 1 << (status_bit % 8);
+
+  ...
+
+  if (status_page[index] & mask) {
+        /* Enabled */
+  }
+
+A long-wise index check is performed as follows::
+
+  #include <asm/bitsperlong.h>
+  #include <endian.h>
+
+  #if __BITS_PER_LONG == 64
+  #define endian_swap(x) htole64(x)
+  #else
+  #define endian_swap(x) htole32(x)
+  #endif
+
+  long index, mask, *status_page;
+
+  index = status_bit / __BITS_PER_LONG;
+  mask = 1L << (status_bit % __BITS_PER_LONG);
+  mask = endian_swap(mask);
+
+  ...
+
+  if (status_page[index] & mask) {
+        /* Enabled */
+  }
 
 Administrators can easily check the status of all registered events by reading
 the user_events_status file directly via a terminal. The output is as follows::
@@ -137,7 +178,7 @@ For example, on a system that has a single event the output looks like this::
 
   Active: 1
   Busy: 0
-  Max: 4096
+  Max: 32768
 
 If a user enables the user event via ftrace, the output would change to this::
 
@@ -145,21 +186,10 @@ If a user enables the user event via ftrace, the output would change to this::
 
   Active: 1
   Busy: 1
-  Max: 4096
-
-**NOTE:** *A status index of 0 will never be returned. This allows user
-programs to have an index that can be used on error cases.*
-
-Status Bits
-^^^^^^^^^^^
-The byte being checked will be non-zero if anything is attached. Programs can
-check specific bits in the byte to see what mechanism has been attached.
-
-The following values are defined to aid in checking what has been attached:
-
-**EVENT_STATUS_FTRACE** - Bit set if ftrace has been attached (Bit 0).
+  Max: 32768
 
-**EVENT_STATUS_PERF** - Bit set if perf has been attached (Bit 1).
+**NOTE:** *A status bit of 0 will never be returned. This allows user programs
+to have a bit that can be used on error cases.*
 
 Writing Data
 ------------

From a0fcaaed0c46cf9399d3a2d6e0c87ddb3df0e044 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 29 Sep 2022 10:49:09 -0400
Subject: [PATCH 36/44] ring-buffer: Fix race between reset page and reading
 page

The ring buffer is broken up into sub buffers (currently of page size).
Each sub buffer has a pointer to its "tail" (the last event written to the
sub buffer). When a new event is requested, the tail is locally
incremented to cover the size of the new event. This is done in a way that
there is no need for locking.

If the tail goes past the end of the sub buffer, the process of moving to
the next sub buffer takes place. After setting the current sub buffer to
the next one, the previous one that had the tail go passed the end of the
sub buffer needs to be reset back to the original tail location (before
the new event was requested) and the rest of the sub buffer needs to be
"padded".

The race happens when a reader takes control of the sub buffer. As readers
do a "swap" of sub buffers from the ring buffer to get exclusive access to
the sub buffer, it replaces the "head" sub buffer with an empty sub buffer
that goes back into the writable portion of the ring buffer. This swap can
happen as soon as the writer moves to the next sub buffer and before it
updates the last sub buffer with padding.

Because the sub buffer can be released to the reader while the writer is
still updating the padding, it is possible for the reader to see the event
that goes past the end of the sub buffer. This can cause obvious issues.

To fix this, add a few memory barriers so that the reader definitely sees
the updates to the sub buffer, and also waits until the writer has put
back the "tail" of the sub buffer back to the last event that was written
on it.

To be paranoid, it will only spin for 1 second, otherwise it will
warn and shutdown the ring buffer code. 1 second should be enough as
the writer does have preemption disabled. If the writer doesn't move
within 1 second (with preemption disabled) something is horribly
wrong. No interrupt should last 1 second!

Link: https://lore.kernel.org/all/20220830120854.7545-1-jiazi.li@transsion.com/
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216369
Link: https://lkml.kernel.org/r/20220929104909.0650a36c@gandalf.local.home

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: stable@vger.kernel.org
Fixes: c7b0930857e22 ("ring-buffer: prevent adding write in discarded area")
Reported-by: Jiazi.Li <jiazi.li@transsion.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3046deacf7b332..c3f354cfc5ba1d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2648,6 +2648,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Mark the rest of the page with padding */
 		rb_event_set_padding(event);
 
+		/* Make sure the padding is visible before the write update */
+		smp_wmb();
+
 		/* Set the write back to the previous setting */
 		local_sub(length, &tail_page->write);
 		return;
@@ -2659,6 +2662,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	/* time delta must be non zero */
 	event->time_delta = 1;
 
+	/* Make sure the padding is visible before the tail_page->write update */
+	smp_wmb();
+
 	/* Set write to end of buffer */
 	length = (tail + length) - BUF_PAGE_SIZE;
 	local_sub(length, &tail_page->write);
@@ -4627,6 +4633,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	arch_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 
+	/*
+	 * The writer has preempt disable, wait for it. But not forever
+	 * Although, 1 second is pretty much "forever"
+	 */
+#define USECS_WAIT	1000000
+        for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
+		/* If the write is past the end of page, a writer is still updating it */
+		if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+			break;
+
+		udelay(1);
+
+		/* Get the latest version of the reader write value */
+		smp_rmb();
+	}
+
+	/* The writer is not moving forward? Something is wrong */
+	if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
+		reader = NULL;
+
+	/*
+	 * Make sure we see any padding after the write update
+	 * (see rb_reset_tail())
+	 */
+	smp_rmb();
+
+
 	return reader;
 }
 

From 0e0f0b74f895942afaf2f9213b97a6cb021d05df Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 28 Sep 2022 12:46:21 +0100
Subject: [PATCH 37/44] MAINTAINERS: add myself as a tracing reviewer

Since I'm actively involved in a number of arch bits that intersect
ftrace (e.g. the actual arch implementation on arm64, stacktracing,
entry management, and general instrumentation safety), add myself as a
reviewer of the core ftrace code so that I have the change to catch any
potential problems early.

I spoke with Steven about this at LPC, and it seemed to make sense to
add me as a reviewer.

Link: https://lkml.kernel.org/r/20220928114621.248038-1-mark.rutland@arm.com

Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f5ca4aefd184c3..ef2fc5e581b405 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20615,6 +20615,7 @@ F:	drivers/char/tpm/
 TRACING
 M:	Steven Rostedt <rostedt@goodmis.org>
 M:	Ingo Molnar <mingo@redhat.com>
+R:	Mark Rutland <mark.rutland@arm.com>
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
 F:	Documentation/trace/ftrace.rst

From ed87277f122674a943239c6e60d352c8d56deb50 Mon Sep 17 00:00:00 2001
From: Chen Zhongjin <chenzhongjin@huawei.com>
Date: Fri, 30 Sep 2022 18:32:36 +0800
Subject: [PATCH 38/44] tracing: Remove unused variable 'dups'

Reported by Clang [-Wunused-but-set-variable]

'commit c193707dde77 ("tracing: Remove code which merges duplicates")'
This commit removed the code which merges duplicates in detect_dups(),
but forgot to delete the variable 'dups' which used to merge
duplicates in the loop.

Now only 'total_dups' is needed, remove 'dups' for clean code.

Link: https://lkml.kernel.org/r/20220930103236.253985-1-chenzhongjin@huawei.com

Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 9901708ce6b8a5..c774e560f2f957 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
 static void detect_dups(struct tracing_map_sort_entry **sort_entries,
 		      int n_entries, unsigned int key_size)
 {
-	unsigned int dups = 0, total_dups = 0;
+	unsigned int total_dups = 0;
 	int i;
 	void *key;
 
@@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries,
 	key = sort_entries[0]->key;
 	for (i = 1; i < n_entries; i++) {
 		if (!memcmp(sort_entries[i]->key, key, key_size)) {
-			dups++; total_dups++;
+			total_dups++;
 			continue;
 		}
 		key = sort_entries[i]->key;
-		dups = 0;
 	}
 
 	WARN_ONCE(total_dups > 0,

From 5d8d2bb946dd24d00c99384471ad8148571a9fbd Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Fri, 30 Sep 2022 12:41:31 -0400
Subject: [PATCH 39/44] tracing: Add Masami Hiramatsu as co-maintainer

Masami has been maintaining kprobes for a while now and that code has
been an integral part of tracing. He has also been an excellent reviewer
of all the tracing code and contributor as well.

The tracing subsystem needs another active maintainer to keep it running
smoothly, and I do not know anyone more qualified for the job than Masami.

Ingo has also told me that he has not been active in the tracing code for
some time and said he could be removed from the TRACING portion of the
MAINTAINERS file.

Link: https://lkml.kernel.org/r/20220930124131.7b6432dd@gandalf.local.home

Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ef2fc5e581b405..749558b0946469 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20614,7 +20614,7 @@ F:	drivers/char/tpm/
 
 TRACING
 M:	Steven Rostedt <rostedt@goodmis.org>
-M:	Ingo Molnar <mingo@redhat.com>
+M:	Masami Hiramatsu <mhiramat@kernel.org>
 R:	Mark Rutland <mark.rutland@arm.com>
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
@@ -20631,7 +20631,7 @@ F:	tools/testing/selftests/ftrace/
 
 TRACING MMIO ACCESSES (MMIOTRACE)
 M:	Steven Rostedt <rostedt@goodmis.org>
-M:	Ingo Molnar <mingo@kernel.org>
+M:	Masami Hiramatsu <mhiramat@kernel.org>
 R:	Karol Herbst <karolherbst@gmail.com>
 R:	Pekka Paalanen <ppaalanen@gmail.com>
 L:	linux-kernel@vger.kernel.org

From e5d271812e7a4d527e65b0228b4a16795c0e0c6c Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Fri, 30 Sep 2022 17:10:16 -0700
Subject: [PATCH 40/44] tracing/user_events: Move pages/locks into groups to
 prepare for namespaces

In order to enable namespaces or any sort of isolation within
user_events the register lock and pages need to be broken up into
groups. Each event and file now has a group pointer which stores the
actual pages to map, lookup data and synchronization objects.

This only enables a single group that maps to init_user_ns, as IMA
namespace has done. This enables user_events to start the work of
supporting namespaces by walking the namespaces up to the init_user_ns.
Future patches will address other user namespaces and will align to the
approaches the IMA namespace uses.

Link: https://lore.kernel.org/linux-kernel/20220915193221.1728029-15-stefanb@linux.ibm.com/#t
Link: https://lkml.kernel.org/r/20221001001016.2832-2-beaub@linux.microsoft.com

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_user.c | 346 ++++++++++++++++++++++++-------
 1 file changed, 274 insertions(+), 72 deletions(-)

diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 2c0a6ec755489b..ae78c2d53c8ad6 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -74,11 +74,25 @@
 #define EVENT_STATUS_PERF BIT(1)
 #define EVENT_STATUS_OTHER BIT(7)
 
-static char *register_page_data;
+/*
+ * Stores the pages, tables, and locks for a group of events.
+ * Each logical grouping of events has its own group, with a
+ * matching page for status checks within user programs. This
+ * allows for isolation of events to user programs by various
+ * means.
+ */
+struct user_event_group {
+	struct page *pages;
+	char *register_page_data;
+	char *system_name;
+	struct hlist_node node;
+	struct mutex reg_mutex;
+	DECLARE_HASHTABLE(register_table, 8);
+	DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+};
 
-static DEFINE_MUTEX(reg_mutex);
-static DEFINE_HASHTABLE(register_table, 8);
-static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+/* Group for init_user_ns mapping, top-most group */
+static struct user_event_group *init_group;
 
 /*
  * Stores per-event properties, as users register events
@@ -88,6 +102,7 @@ static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
  * refcnt reaches one.
  */
 struct user_event {
+	struct user_event_group *group;
 	struct tracepoint tracepoint;
 	struct trace_event_call call;
 	struct trace_event_class class;
@@ -114,6 +129,11 @@ struct user_event_refs {
 	struct user_event *events[];
 };
 
+struct user_event_file_info {
+	struct user_event_group *group;
+	struct user_event_refs *refs;
+};
+
 #define VALIDATOR_ENSURE_NULL (1 << 0)
 #define VALIDATOR_REL (1 << 1)
 
@@ -126,7 +146,8 @@ struct user_event_validator {
 typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
 				   void *tpdata, bool *faulted);
 
-static int user_event_parse(char *name, char *args, char *flags,
+static int user_event_parse(struct user_event_group *group, char *name,
+			    char *args, char *flags,
 			    struct user_event **newuser);
 
 static u32 user_event_key(char *name)
@@ -134,12 +155,128 @@ static u32 user_event_key(char *name)
 	return jhash(name, strlen(name), 0);
 }
 
+static void set_page_reservations(char *pages, bool set)
+{
+	int page;
+
+	for (page = 0; page < MAX_PAGES; ++page) {
+		void *addr = pages + (PAGE_SIZE * page);
+
+		if (set)
+			SetPageReserved(virt_to_page(addr));
+		else
+			ClearPageReserved(virt_to_page(addr));
+	}
+}
+
+static void user_event_group_destroy(struct user_event_group *group)
+{
+	if (group->register_page_data)
+		set_page_reservations(group->register_page_data, false);
+
+	if (group->pages)
+		__free_pages(group->pages, MAX_PAGE_ORDER);
+
+	kfree(group->system_name);
+	kfree(group);
+}
+
+static char *user_event_group_system_name(struct user_namespace *user_ns)
+{
+	char *system_name;
+	int len = sizeof(USER_EVENTS_SYSTEM) + 1;
+
+	if (user_ns != &init_user_ns) {
+		/*
+		 * Unexpected at this point:
+		 * We only currently support init_user_ns.
+		 * When we enable more, this will trigger a failure so log.
+		 */
+		pr_warn("user_events: Namespace other than init_user_ns!\n");
+		return NULL;
+	}
+
+	system_name = kmalloc(len, GFP_KERNEL);
+
+	if (!system_name)
+		return NULL;
+
+	snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM);
+
+	return system_name;
+}
+
+static inline struct user_event_group
+*user_event_group_from_user_ns(struct user_namespace *user_ns)
+{
+	if (user_ns == &init_user_ns)
+		return init_group;
+
+	return NULL;
+}
+
+static struct user_event_group *current_user_event_group(void)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	struct user_event_group *group = NULL;
+
+	while (user_ns) {
+		group = user_event_group_from_user_ns(user_ns);
+
+		if (group)
+			break;
+
+		user_ns = user_ns->parent;
+	}
+
+	return group;
+}
+
+static struct user_event_group
+*user_event_group_create(struct user_namespace *user_ns)
+{
+	struct user_event_group *group;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+
+	if (!group)
+		return NULL;
+
+	group->system_name = user_event_group_system_name(user_ns);
+
+	if (!group->system_name)
+		goto error;
+
+	group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+
+	if (!group->pages)
+		goto error;
+
+	group->register_page_data = page_address(group->pages);
+
+	set_page_reservations(group->register_page_data, true);
+
+	/* Zero all bits beside 0 (which is reserved for failures) */
+	bitmap_zero(group->page_bitmap, MAX_EVENTS);
+	set_bit(0, group->page_bitmap);
+
+	mutex_init(&group->reg_mutex);
+	hash_init(group->register_table);
+
+	return group;
+error:
+	if (group)
+		user_event_group_destroy(group);
+
+	return NULL;
+};
+
 static __always_inline
 void user_event_register_set(struct user_event *user)
 {
 	int i = user->index;
 
-	register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+	user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
 }
 
 static __always_inline
@@ -147,7 +284,7 @@ void user_event_register_clear(struct user_event *user)
 {
 	int i = user->index;
 
-	register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+	user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
 }
 
 static __always_inline __must_check
@@ -191,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call)
  *
  * Upon success user_event has its ref count increased by 1.
  */
-static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+static int user_event_parse_cmd(struct user_event_group *group,
+				char *raw_command, struct user_event **newuser)
 {
 	char *name = raw_command;
 	char *args = strpbrk(name, " ");
@@ -205,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
 	if (flags)
 		*flags++ = '\0';
 
-	return user_event_parse(name, args, flags, newuser);
+	return user_event_parse(group, name, args, flags, newuser);
 }
 
 static int user_field_array_size(const char *type)
@@ -693,7 +831,7 @@ static int destroy_user_event(struct user_event *user)
 	dyn_event_remove(&user->devent);
 
 	user_event_register_clear(user);
-	clear_bit(user->index, page_bitmap);
+	clear_bit(user->index, user->group->page_bitmap);
 	hash_del(&user->node);
 
 	user_event_destroy_validators(user);
@@ -704,14 +842,15 @@ static int destroy_user_event(struct user_event *user)
 	return ret;
 }
 
-static struct user_event *find_user_event(char *name, u32 *outkey)
+static struct user_event *find_user_event(struct user_event_group *group,
+					  char *name, u32 *outkey)
 {
 	struct user_event *user;
 	u32 key = user_event_key(name);
 
 	*outkey = key;
 
-	hash_for_each_possible(register_table, user, node, key)
+	hash_for_each_possible(group->register_table, user, node, key)
 		if (!strcmp(EVENT_NAME(user), name)) {
 			refcount_inc(&user->refcnt);
 			return user;
@@ -943,6 +1082,7 @@ static int user_event_reg(struct trace_event_call *call,
 
 static int user_event_create(const char *raw_command)
 {
+	struct user_event_group *group;
 	struct user_event *user;
 	char *name;
 	int ret;
@@ -958,14 +1098,19 @@ static int user_event_create(const char *raw_command)
 	if (!name)
 		return -ENOMEM;
 
-	mutex_lock(&reg_mutex);
+	group = current_user_event_group();
+
+	if (!group)
+		return -ENOENT;
+
+	mutex_lock(&group->reg_mutex);
 
-	ret = user_event_parse_cmd(name, &user);
+	ret = user_event_parse_cmd(group, name, &user);
 
 	if (!ret)
 		refcount_dec(&user->refcnt);
 
-	mutex_unlock(&reg_mutex);
+	mutex_unlock(&group->reg_mutex);
 
 	if (ret)
 		kfree(name);
@@ -1119,7 +1264,8 @@ static int user_event_trace_register(struct user_event *user)
  * The name buffer lifetime is owned by this method for success cases only.
  * Upon success the returned user_event has its ref count increased by 1.
  */
-static int user_event_parse(char *name, char *args, char *flags,
+static int user_event_parse(struct user_event_group *group, char *name,
+			    char *args, char *flags,
 			    struct user_event **newuser)
 {
 	int ret;
@@ -1129,7 +1275,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 
 	/* Prevent dyn_event from racing */
 	mutex_lock(&event_mutex);
-	user = find_user_event(name, &key);
+	user = find_user_event(group, name, &key);
 	mutex_unlock(&event_mutex);
 
 	if (user) {
@@ -1142,7 +1288,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 		return 0;
 	}
 
-	index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+	index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
 
 	if (index == MAX_EVENTS)
 		return -EMFILE;
@@ -1156,6 +1302,7 @@ static int user_event_parse(char *name, char *args, char *flags,
 	INIT_LIST_HEAD(&user->fields);
 	INIT_LIST_HEAD(&user->validators);
 
+	user->group = group;
 	user->tracepoint.name = name;
 
 	ret = user_event_parse_fields(user, args);
@@ -1174,8 +1321,8 @@ static int user_event_parse(char *name, char *args, char *flags,
 	user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
 	user->call.tp = &user->tracepoint;
 	user->call.event.funcs = &user_event_funcs;
+	user->class.system = group->system_name;
 
-	user->class.system = USER_EVENTS_SYSTEM;
 	user->class.fields_array = user_event_fields_array;
 	user->class.get_fields = user_event_get_fields;
 	user->class.reg = user_event_reg;
@@ -1198,8 +1345,8 @@ static int user_event_parse(char *name, char *args, char *flags,
 
 	dyn_event_init(&user->devent, &user_event_dops);
 	dyn_event_add(&user->devent, &user->call);
-	set_bit(user->index, page_bitmap);
-	hash_add(register_table, &user->node, key);
+	set_bit(user->index, group->page_bitmap);
+	hash_add(group->register_table, &user->node, key);
 
 	mutex_unlock(&event_mutex);
 
@@ -1217,10 +1364,10 @@ static int user_event_parse(char *name, char *args, char *flags,
 /*
  * Deletes a previously created event if it is no longer being used.
  */
-static int delete_user_event(char *name)
+static int delete_user_event(struct user_event_group *group, char *name)
 {
 	u32 key;
-	struct user_event *user = find_user_event(name, &key);
+	struct user_event *user = find_user_event(group, name, &key);
 
 	if (!user)
 		return -ENOENT;
@@ -1238,6 +1385,7 @@ static int delete_user_event(char *name)
  */
 static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 {
+	struct user_event_file_info *info = file->private_data;
 	struct user_event_refs *refs;
 	struct user_event *user = NULL;
 	struct tracepoint *tp;
@@ -1249,7 +1397,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 
 	rcu_read_lock_sched();
 
-	refs = rcu_dereference_sched(file->private_data);
+	refs = rcu_dereference_sched(info->refs);
 
 	/*
 	 * The refs->events array is protected by RCU, and new items may be
@@ -1307,6 +1455,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
 	return ret;
 }
 
+static int user_events_open(struct inode *node, struct file *file)
+{
+	struct user_event_group *group;
+	struct user_event_file_info *info;
+
+	group = current_user_event_group();
+
+	if (!group)
+		return -ENOENT;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return -ENOMEM;
+
+	info->group = group;
+
+	file->private_data = info;
+
+	return 0;
+}
+
 static ssize_t user_events_write(struct file *file, const char __user *ubuf,
 				 size_t count, loff_t *ppos)
 {
@@ -1328,13 +1498,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
 	return user_events_write_core(kp->ki_filp, i);
 }
 
-static int user_events_ref_add(struct file *file, struct user_event *user)
+static int user_events_ref_add(struct user_event_file_info *info,
+			       struct user_event *user)
 {
+	struct user_event_group *group = info->group;
 	struct user_event_refs *refs, *new_refs;
 	int i, size, count = 0;
 
-	refs = rcu_dereference_protected(file->private_data,
-					 lockdep_is_held(&reg_mutex));
+	refs = rcu_dereference_protected(info->refs,
+					 lockdep_is_held(&group->reg_mutex));
 
 	if (refs) {
 		count = refs->count;
@@ -1360,7 +1532,7 @@ static int user_events_ref_add(struct file *file, struct user_event *user)
 
 	refcount_inc(&user->refcnt);
 
-	rcu_assign_pointer(file->private_data, new_refs);
+	rcu_assign_pointer(info->refs, new_refs);
 
 	if (refs)
 		kfree_rcu(refs, rcu);
@@ -1397,7 +1569,8 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
 /*
  * Registers a user_event on behalf of a user process.
  */
-static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+static long user_events_ioctl_reg(struct user_event_file_info *info,
+				  unsigned long uarg)
 {
 	struct user_reg __user *ureg = (struct user_reg __user *)uarg;
 	struct user_reg reg;
@@ -1418,14 +1591,14 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
 		return ret;
 	}
 
-	ret = user_event_parse_cmd(name, &user);
+	ret = user_event_parse_cmd(info->group, name, &user);
 
 	if (ret) {
 		kfree(name);
 		return ret;
 	}
 
-	ret = user_events_ref_add(file, user);
+	ret = user_events_ref_add(info, user);
 
 	/* No longer need parse ref, ref_add either worked or not */
 	refcount_dec(&user->refcnt);
@@ -1443,7 +1616,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
 /*
  * Deletes a user_event on behalf of a user process.
  */
-static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+static long user_events_ioctl_del(struct user_event_file_info *info,
+				  unsigned long uarg)
 {
 	void __user *ubuf = (void __user *)uarg;
 	char *name;
@@ -1456,7 +1630,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
 
 	/* event_mutex prevents dyn_event from racing */
 	mutex_lock(&event_mutex);
-	ret = delete_user_event(name);
+	ret = delete_user_event(info->group, name);
 	mutex_unlock(&event_mutex);
 
 	kfree(name);
@@ -1470,19 +1644,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg)
 static long user_events_ioctl(struct file *file, unsigned int cmd,
 			      unsigned long uarg)
 {
+	struct user_event_file_info *info = file->private_data;
+	struct user_event_group *group = info->group;
 	long ret = -ENOTTY;
 
 	switch (cmd) {
 	case DIAG_IOCSREG:
-		mutex_lock(&reg_mutex);
-		ret = user_events_ioctl_reg(file, uarg);
-		mutex_unlock(&reg_mutex);
+		mutex_lock(&group->reg_mutex);
+		ret = user_events_ioctl_reg(info, uarg);
+		mutex_unlock(&group->reg_mutex);
 		break;
 
 	case DIAG_IOCSDEL:
-		mutex_lock(&reg_mutex);
-		ret = user_events_ioctl_del(file, uarg);
-		mutex_unlock(&reg_mutex);
+		mutex_lock(&group->reg_mutex);
+		ret = user_events_ioctl_del(info, uarg);
+		mutex_unlock(&group->reg_mutex);
 		break;
 	}
 
@@ -1494,17 +1670,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
  */
 static int user_events_release(struct inode *node, struct file *file)
 {
+	struct user_event_file_info *info = file->private_data;
+	struct user_event_group *group;
 	struct user_event_refs *refs;
 	struct user_event *user;
 	int i;
 
+	if (!info)
+		return -EINVAL;
+
+	group = info->group;
+
 	/*
 	 * Ensure refs cannot change under any situation by taking the
 	 * register mutex during the final freeing of the references.
 	 */
-	mutex_lock(&reg_mutex);
+	mutex_lock(&group->reg_mutex);
 
-	refs = file->private_data;
+	refs = info->refs;
 
 	if (!refs)
 		goto out;
@@ -1523,32 +1706,51 @@ static int user_events_release(struct inode *node, struct file *file)
 out:
 	file->private_data = NULL;
 
-	mutex_unlock(&reg_mutex);
+	mutex_unlock(&group->reg_mutex);
 
 	kfree(refs);
+	kfree(info);
 
 	return 0;
 }
 
 static const struct file_operations user_data_fops = {
+	.open = user_events_open,
 	.write = user_events_write,
 	.write_iter = user_events_write_iter,
 	.unlocked_ioctl	= user_events_ioctl,
 	.release = user_events_release,
 };
 
+static struct user_event_group *user_status_group(struct file *file)
+{
+	struct seq_file *m = file->private_data;
+
+	if (!m)
+		return NULL;
+
+	return m->private;
+}
+
 /*
  * Maps the shared page into the user process for checking if event is enabled.
  */
 static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	char *pages;
+	struct user_event_group *group = user_status_group(file);
 	unsigned long size = vma->vm_end - vma->vm_start;
 
 	if (size != MAX_BYTES)
 		return -EINVAL;
 
+	if (!group)
+		return -EINVAL;
+
+	pages = group->register_page_data;
+
 	return remap_pfn_range(vma, vma->vm_start,
-			       virt_to_phys(register_page_data) >> PAGE_SHIFT,
+			       virt_to_phys(pages) >> PAGE_SHIFT,
 			       size, vm_get_page_prot(VM_READ));
 }
 
@@ -1572,13 +1774,17 @@ static void user_seq_stop(struct seq_file *m, void *p)
 
 static int user_seq_show(struct seq_file *m, void *p)
 {
+	struct user_event_group *group = m->private;
 	struct user_event *user;
 	char status;
 	int i, active = 0, busy = 0, flags;
 
-	mutex_lock(&reg_mutex);
+	if (!group)
+		return -EINVAL;
+
+	mutex_lock(&group->reg_mutex);
 
-	hash_for_each(register_table, i, user, node) {
+	hash_for_each(group->register_table, i, user, node) {
 		status = user->status;
 		flags = user->flags;
 
@@ -1602,7 +1808,7 @@ static int user_seq_show(struct seq_file *m, void *p)
 		active++;
 	}
 
-	mutex_unlock(&reg_mutex);
+	mutex_unlock(&group->reg_mutex);
 
 	seq_puts(m, "\n");
 	seq_printf(m, "Active: %d\n", active);
@@ -1621,7 +1827,24 @@ static const struct seq_operations user_seq_ops = {
 
 static int user_status_open(struct inode *node, struct file *file)
 {
-	return seq_open(file, &user_seq_ops);
+	struct user_event_group *group;
+	int ret;
+
+	group = current_user_event_group();
+
+	if (!group)
+		return -ENOENT;
+
+	ret = seq_open(file, &user_seq_ops);
+
+	if (!ret) {
+		/* Chain group to seq_file */
+		struct seq_file *m = file->private_data;
+
+		m->private = group;
+	}
+
+	return ret;
 }
 
 static const struct file_operations user_status_fops = {
@@ -1662,42 +1885,21 @@ static int create_user_tracefs(void)
 	return -ENODEV;
 }
 
-static void set_page_reservations(bool set)
-{
-	int page;
-
-	for (page = 0; page < MAX_PAGES; ++page) {
-		void *addr = register_page_data + (PAGE_SIZE * page);
-
-		if (set)
-			SetPageReserved(virt_to_page(addr));
-		else
-			ClearPageReserved(virt_to_page(addr));
-	}
-}
-
 static int __init trace_events_user_init(void)
 {
-	struct page *pages;
 	int ret;
 
-	/* Zero all bits beside 0 (which is reserved for failures) */
-	bitmap_zero(page_bitmap, MAX_EVENTS);
-	set_bit(0, page_bitmap);
+	init_group = user_event_group_create(&init_user_ns);
 
-	pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
-	if (!pages)
+	if (!init_group)
 		return -ENOMEM;
-	register_page_data = page_address(pages);
-
-	set_page_reservations(true);
 
 	ret = create_user_tracefs();
 
 	if (ret) {
 		pr_warn("user_events could not register with tracefs\n");
-		set_page_reservations(false);
-		__free_pages(pages, MAX_PAGE_ORDER);
+		user_event_group_destroy(init_group);
+		init_group = NULL;
 		return ret;
 	}
 

From cf04f2d5df0037741207382ac8fe289e8bf84ced Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 5 Oct 2022 00:38:09 -0400
Subject: [PATCH 41/44] ftrace: Still disable enabled records marked as
 disabled

Weak functions started causing havoc as they showed up in the
"available_filter_functions" and this confused people as to why some
functions marked as "notrace" were listed, but when enabled they did
nothing. This was because weak functions can still have fentry calls, and
these addresses get added to the "available_filter_functions" file.
kallsyms is what converts those addresses to names, and since the weak
functions are not listed in kallsyms, it would just pick the function
before that.

To solve this, there was a trick to detect weak functions listed, and
these records would be marked as DISABLED so that they do not get enabled
and are mostly ignored. As the processing of the list of all functions to
figure out what is weak or not can take a long time, this process is put
off into a kernel thread and run in parallel with the rest of start up.

Now the issue happens whet function tracing is enabled via the kernel
command line. As it starts very early in boot up, it can be enabled before
the records that are weak are marked to be disabled. This causes an issue
in the accounting, as the weak records are enabled by the command line
function tracing, but after boot up, they are not disabled.

The ftrace records have several accounting flags and a ref count. The
DISABLED flag is just one. If the record is enabled before it is marked
DISABLED it will get an ENABLED flag and also have its ref counter
incremented. After it is marked for DISABLED, neither the ENABLED flag nor
the ref counter is cleared. There's sanity checks on the records that are
performed after an ftrace function is registered or unregistered, and this
detected that there were records marked as ENABLED with ref counter that
should not have been.

Note, the module loading code uses the DISABLED flag as well to keep its
functions from being modified while its being loaded and some of these
flags may get set in this process. So changing the verification code to
ignore DISABLED records is a no go, as it still needs to verify that the
module records are working too.

Also, the weak functions still are calling a trampoline. Even though they
should never be called, it is dangerous to leave these weak functions
calling a trampoline that is freed, so they should still be set back to
nops.

There's two places that need to not skip records that have the ENABLED
and the DISABLED flags set. That is where the ftrace_ops is processed and
sets the records ref counts, and then later when the function itself is to
be updated, and the ENABLED flag gets removed. Add a helper function
"skip_record()" that returns true if the record has the DISABLED flag set
but not the ENABLED flag.

Link: https://lkml.kernel.org/r/20221005003809.27d2b97b@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: stable@vger.kernel.org
Fixes: b39181f7c6907 ("ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 406d0597c409a5..83362a15579161 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1644,6 +1644,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex
 static struct ftrace_ops *
 ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
 
+static bool skip_record(struct dyn_ftrace *rec)
+{
+	/*
+	 * At boot up, weak functions are set to disable. Function tracing
+	 * can be enabled before they are, and they still need to be disabled now.
+	 * If the record is disabled, still continue if it is marked as already
+	 * enabled (this is needed to keep the accounting working).
+	 */
+	return rec->flags & FTRACE_FL_DISABLED &&
+		!(rec->flags & FTRACE_FL_ENABLED);
+}
+
 static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
 				     int filter_hash,
 				     bool inc)
@@ -1693,7 +1705,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
 		int in_hash = 0;
 		int match = 0;
 
-		if (rec->flags & FTRACE_FL_DISABLED)
+		if (skip_record(rec))
 			continue;
 
 		if (all) {
@@ -2126,7 +2138,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 
 	ftrace_bug_type = FTRACE_BUG_UNKNOWN;
 
-	if (rec->flags & FTRACE_FL_DISABLED)
+	if (skip_record(rec))
 		return FTRACE_UPDATE_IGNORE;
 
 	/*
@@ -2241,7 +2253,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
 	if (update) {
 		/* If there's no more users, clear all flags */
 		if (!ftrace_rec_count(rec))
-			rec->flags = 0;
+			rec->flags &= FTRACE_FL_DISABLED;
 		else
 			/*
 			 * Just disable the record, but keep the ops TRAMP
@@ -2634,7 +2646,7 @@ void __weak ftrace_replace_code(int mod_flags)
 
 	do_for_each_ftrace_rec(pg, rec) {
 
-		if (rec->flags & FTRACE_FL_DISABLED)
+		if (skip_record(rec))
 			continue;
 
 		failed = __ftrace_replace_code(rec, enable);

From a541a9559bb0a8ecc434de01d3e4826c32e8bb53 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Wed, 5 Oct 2022 11:37:57 -0400
Subject: [PATCH 42/44] tracing: Do not free snapshot if tracer is on cmdline

The ftrace_boot_snapshot and alloc_snapshot cmdline options allocate the
snapshot buffer at boot up for use later. The ftrace_boot_snapshot in
particular requires the snapshot to be allocated because it will take a
snapshot at the end of boot up allowing to see the traces that happened
during boot so that it's not lost when user space takes over.

When a tracer is registered (started) there's a path that checks if it
requires the snapshot buffer or not, and if it does not and it was
allocated it will do a synchronization and free the snapshot buffer.

This is only required if the previous tracer was using it for "max
latency" snapshots, as it needs to make sure all max snapshots are
complete before freeing. But this is only needed if the previous tracer
was using the snapshot buffer for latency (like irqoff tracer and
friends). But it does not make sense to free it, if the previous tracer
was not using it, and the snapshot was allocated by the cmdline
parameters. This basically takes away the point of allocating it in the
first place!

Note, the allocated snapshot worked fine for just trace events, but fails
when a tracer is enabled on the cmdline.

Further investigation, this goes back even further and it does not require
a tracer on the cmdline to fail. Simply enable snapshots and then enable a
tracer, and it will remove the snapshot.

Link: https://lkml.kernel.org/r/20221005113757.041df7fe@gandalf.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: stable@vger.kernel.org
Fixes: 45ad21ca5530 ("tracing: Have trace_array keep track if snapshot buffer is allocated")
Reported-by: Ross Zwisler <zwisler@kernel.org>
Tested-by: Ross Zwisler <zwisler@kernel.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index def721de68a090..47a44b055a1d48 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6428,12 +6428,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
 	if (tr->current_trace->reset)
 		tr->current_trace->reset(tr);
 
+#ifdef CONFIG_TRACER_MAX_TRACE
+	had_max_tr = tr->current_trace->use_max_tr;
+
 	/* Current trace needs to be nop_trace before synchronize_rcu */
 	tr->current_trace = &nop_trace;
 
-#ifdef CONFIG_TRACER_MAX_TRACE
-	had_max_tr = tr->allocated_snapshot;
-
 	if (had_max_tr && !t->use_max_tr) {
 		/*
 		 * We need to make sure that the update_max_tr sees that
@@ -6446,11 +6446,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
 		free_snapshot(tr);
 	}
 
-	if (t->use_max_tr && !had_max_tr) {
+	if (t->use_max_tr && !tr->allocated_snapshot) {
 		ret = tracing_alloc_snapshot_instance(tr);
 		if (ret < 0)
 			goto out;
 	}
+#else
+	tr->current_trace = &nop_trace;
 #endif
 
 	if (t->init) {

From fb17b268396046ee8a10c0f5d30c0a7afeef77e2 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 6 Oct 2022 10:43:47 -0400
Subject: [PATCH 43/44] tracing: Update MAINTAINERS to reflect new tracing git
 repo

The tracing git repo will no longer be housed in my personal git repo,
but instead live in trace/linux-trace.git.

Update the MAINTAINERS file appropriately.

Link: https://lkml.kernel.org/r/20221006144439.282193367@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 749558b0946469..86b8aa4c11cbf5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11380,7 +11380,7 @@ M:	Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 M:	"David S. Miller" <davem@davemloft.net>
 M:	Masami Hiramatsu <mhiramat@kernel.org>
 S:	Maintained
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
 F:	Documentation/trace/kprobes.rst
 F:	include/asm-generic/kprobes.h
 F:	include/linux/kprobes.h
@@ -20617,7 +20617,7 @@ M:	Steven Rostedt <rostedt@goodmis.org>
 M:	Masami Hiramatsu <mhiramat@kernel.org>
 R:	Mark Rutland <mark.rutland@arm.com>
 S:	Maintained
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
 F:	Documentation/trace/ftrace.rst
 F:	arch/*/*/*/*ftrace*
 F:	arch/*/*/*ftrace*

From 4f881a696484a7d31a4d1b12547615b1a3ee5771 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 6 Oct 2022 10:43:48 -0400
Subject: [PATCH 44/44] ftrace: Create separate entry in MAINTAINERS for
 function hooks

The function hooks (ftrace) is a completely different subsystem from the
general tracing. It manages how to attach callbacks to most functions in
the kernel. It is also used by live kernel patching. It really is not part
of tracing, although tracing uses it.

Create a separate entry for FUNCTION HOOKS (FTRACE) to be separate from
tracing itself in the MAINTAINERS file.

Perhaps it should be moved out of the kernel/trace directory, but that's
for another time.

Link: https://lkml.kernel.org/r/20221006144439.459272364@goodmis.org

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 MAINTAINERS | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 86b8aa4c11cbf5..d95f5d3b4d37a4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8411,6 +8411,19 @@ L:	platform-driver-x86@vger.kernel.org
 S:	Maintained
 F:	drivers/platform/x86/fujitsu-tablet.c
 
+FUNCTION HOOKS (FTRACE)
+M:	Steven Rostedt <rostedt@goodmis.org>
+M:	Masami Hiramatsu <mhiramat@kernel.org>
+R:	Mark Rutland <mark.rutland@arm.com>
+S:	Maintained
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
+F:	Documentation/trace/ftrace*
+F:	kernel/trace/ftrace*
+F:	kernel/trace/fgraph.c
+F:	arch/*/*/*/*ftrace*
+F:	arch/*/*/*ftrace*
+F:	include/*/ftrace.h
+
 FUNGIBLE ETHERNET DRIVERS
 M:	Dimitris Michailidis <dmichail@fungible.com>
 L:	netdev@vger.kernel.org
@@ -20615,14 +20628,10 @@ F:	drivers/char/tpm/
 TRACING
 M:	Steven Rostedt <rostedt@goodmis.org>
 M:	Masami Hiramatsu <mhiramat@kernel.org>
-R:	Mark Rutland <mark.rutland@arm.com>
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
-F:	Documentation/trace/ftrace.rst
-F:	arch/*/*/*/*ftrace*
-F:	arch/*/*/*ftrace*
+F:	Documentation/trace/*
 F:	fs/tracefs/
-F:	include/*/ftrace.h
 F:	include/linux/trace*.h
 F:	include/trace/
 F:	kernel/trace/