From f2b20c66274dafd57f1a9221aae84640319685a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 8 Nov 2021 20:15:13 +0000 Subject: [PATCH 01/50] tracing: Fix spelling mistake "aritmethic" -> "arithmetic" There is a spelling mistake in the tracing mini-HOWTO text. Fix it. Link: https://lkml.kernel.org/r/20211108201513.42876-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88de94da596b13..4821fe6a40a51d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5635,7 +5635,7 @@ static const char readme_msg[] = "\t - a numeric literal: e.g. ms_per_sec=1000,\n" "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n" "\n" - "\t hist trigger aritmethic expressions support addition(+), subtraction(-),\n" + "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" "\t multiplication(*) and division(/) operators. An operand can be either a\n" "\t variable reference, field or numeric literal.\n" "\n" From 05770dd0ad110854c7157d95700d7c89979cdb3e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Nov 2021 18:30:12 +0900 Subject: [PATCH 02/50] tracing: Support __rel_loc relative dynamic data location attribute Add '__rel_loc' new dynamic data location attribute which encodes the data location from the next to the field itself. The '__data_loc' is used for encoding the dynamic data location on the trace event record. But '__data_loc' is not useful if the writer doesn't know the event header (e.g. user event), because it records the dynamic data offset from the entry of the record, not the field itself. This new '__rel_loc' attribute encodes the data location relatively from the next of the field. For example, when there is a record like below (the number in the parentheses is the size of fields) |header(N)|common(M)|fields(K)|__data_loc(4)|fields(L)|data(G)| In this case, '__data_loc' field will be __data_loc = (G << 16) | (N+M+K+4+L) If '__rel_loc' is used, this will be |header(N)|common(M)|fields(K)|__rel_loc(4)|fields(L)|data(G)| where __rel_loc = (G << 16) | (L) This case shows L bytes after the '__rel_loc' attribute field, if there is no fields after the __rel_loc field, L must be 0. This is relatively easy (and no need to consider the kernel header change) when the event data fields are composed by user who doesn't know header and common fields. Link: https://lkml.kernel.org/r/163757341258.510314.4214431827833229956.stgit@devnote2 Cc: Beau Belgrave Cc: Namhyung Kim Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_events_filter.c | 32 ++++++++++++++++++++++++++++-- kernel/trace/trace_events_hist.c | 21 ++++++++++++++++++-- kernel/trace/trace_events_inject.c | 11 ++++++++-- 5 files changed, 60 insertions(+), 6 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2d167ac3452c58..3900404aa0636d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -782,6 +782,7 @@ enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, FILTER_DYN_STRING, + FILTER_RDYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, FILTER_COMM, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 38715aa6cfdfb7..5db2bec8ca7e6f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1465,6 +1465,7 @@ struct filter_pred { static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING || field->filter_type == FILTER_STATIC_STRING || field->filter_type == FILTER_PTR_STRING || field->filter_type == FILTER_COMM; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c9124038b140f7..996920ed1812d3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -706,6 +706,29 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* + * Filter predicate for relative dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry as same as dynamic string. + * The difference is that the relative one records the location offset + * from the field itself, not the event entry. + */ +static int filter_pred_strrelloc(struct filter_pred *pred, void *event) +{ + u32 *item = (u32 *)(event + pred->offset); + u32 str_item = *item; + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; + char *addr = (char *)(&item[1]) + str_loc; + int cmp, match; + + cmp = pred->regex.match(addr, &pred->regex, str_len); + + match = cmp ^ pred->not; + + return match; +} + /* Filter predicate for CPUs. */ static int filter_pred_cpu(struct filter_pred *pred, void *event) { @@ -756,7 +779,7 @@ static int filter_pred_none(struct filter_pred *pred, void *event) * * Note: * - @str might not be NULL-terminated if it's of type DYN_STRING - * or STATIC_STRING, unless @len is zero. + * RDYN_STRING, or STATIC_STRING, unless @len is zero. */ static int regex_match_full(char *str, struct regex *r, int len) @@ -1083,6 +1106,9 @@ int filter_assign_type(const char *type) if (strstr(type, "__data_loc") && strstr(type, "char")) return FILTER_DYN_STRING; + if (strstr(type, "__rel_loc") && strstr(type, "char")) + return FILTER_RDYN_STRING; + if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; @@ -1318,8 +1344,10 @@ static int parse_pred(const char *str, void *data, pred->fn = filter_pred_string; pred->regex.field_len = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) + } else if (field->filter_type == FILTER_DYN_STRING) { pred->fn = filter_pred_strloc; + } else if (field->filter_type == FILTER_RDYN_STRING) + pred->fn = filter_pred_strrelloc; else pred->fn = filter_pred_pchar; /* go past the last quote */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 319f9c8ca7e7d8..9b8da439149ca9 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -217,6 +217,20 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, return (u64)(unsigned long)addr; } +static u64 hist_field_reldynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 *item = event + hist_field->field->offset; + u32 str_item = *item; + int str_loc = str_item & 0xffff; + char *addr = (char *)&item[1] + str_loc; + + return (u64)(unsigned long)addr; +} + static u64 hist_field_pstring(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -1956,8 +1970,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (field->filter_type == FILTER_STATIC_STRING) { hist_field->fn = hist_field_string; hist_field->size = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) + } else if (field->filter_type == FILTER_DYN_STRING) { hist_field->fn = hist_field_dynstring; + } else if (field->filter_type == FILTER_RDYN_STRING) + hist_field->fn = hist_field_reldynstring; else hist_field->fn = hist_field_pstring; } else { @@ -4961,7 +4977,8 @@ static inline void add_to_key(char *compound_key, void *key, struct ftrace_event_field *field; field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING) + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) size = *(u32 *)(rec + field->offset) >> 16; else if (field->filter_type == FILTER_STATIC_STRING) size = field->size; diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index c188045c5f976c..d6b4935a78c08f 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -168,10 +168,14 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size) continue; if (field->filter_type == FILTER_STATIC_STRING) continue; - if (field->filter_type == FILTER_DYN_STRING) { + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { u32 *str_item; int str_loc = entry_size & 0xffff; + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; + str_item = (u32 *)(entry + field->offset); *str_item = str_loc; /* string length is 0. */ } else { @@ -214,7 +218,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) if (field->filter_type == FILTER_STATIC_STRING) { strlcpy(entry + field->offset, addr, field->size); - } else if (field->filter_type == FILTER_DYN_STRING) { + } else if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; int str_loc = entry_size & 0xffff; u32 *str_item; @@ -229,6 +234,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) strlcpy(entry + (entry_size - str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; *str_item = (str_len << 16) | str_loc; } else { char **paddr; From 55de2c0b5610cba5a5a93c0788031133c457e689 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Nov 2021 18:30:21 +0900 Subject: [PATCH 03/50] tracing: Add '__rel_loc' using trace event macros Add '__rel_loc' using trace event macros. These macros are usually not used in the kernel, except for testing purpose. This also add "rel_" variant of macros for dynamic_array string, and bitmask. Link: https://lkml.kernel.org/r/163757342119.510314.816029622439099016.stgit@devnote2 Cc: Beau Belgrave Cc: Namhyung Kim Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/trace/bpf_probe.h | 16 +++++ include/trace/perf.h | 16 +++++ include/trace/trace_events.h | 120 ++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 3 + 4 files changed, 153 insertions(+), 2 deletions(-) diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index a8e97f84b65273..7660a7846586cc 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -21,6 +21,22 @@ #undef __get_bitmask #define __get_bitmask(field) (char *)__get_dynamic_array(field) +#undef __get_rel_dynamic_array +#define __get_rel_dynamic_array(field) \ + ((void *)(&__entry->__rel_loc_##field) + \ + sizeof(__entry->__rel_loc_##field) + \ + (__entry->__rel_loc_##field & 0xffff)) + +#undef __get_rel_dynamic_array_len +#define __get_rel_dynamic_array_len(field) \ + ((__entry->__rel_loc_##field >> 16) & 0xffff) + +#undef __get_rel_str +#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) + +#undef __get_rel_bitmask +#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) + #undef __perf_count #define __perf_count(c) (c) diff --git a/include/trace/perf.h b/include/trace/perf.h index dbc6c74defc382..ea4405de175a3e 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -21,6 +21,22 @@ #undef __get_bitmask #define __get_bitmask(field) (char *)__get_dynamic_array(field) +#undef __get_rel_dynamic_array +#define __get_rel_dynamic_array(field) \ + ((void *)(&__entry->__rel_loc_##field) + \ + sizeof(__entry->__rel_loc_##field) + \ + (__entry->__rel_loc_##field & 0xffff)) + +#undef __get_rel_dynamic_array_len +#define __get_rel_dynamic_array_len(field) \ + ((__entry->__rel_loc_##field >> 16) & 0xffff) + +#undef __get_rel_str +#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) + +#undef __get_rel_bitmask +#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) + #undef __perf_count #define __perf_count(c) (__count = (c)) diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 08810a4638805d..8c6f7c43351857 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -108,6 +108,18 @@ TRACE_MAKE_SYSTEM_STR(); #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item, len) u32 __rel_loc_##item; + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, -1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) + +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(char, item, -1) + #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -200,11 +212,23 @@ TRACE_MAKE_SYSTEM_STR(); #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + #undef __string_len #define __string_len(item, src, len) __dynamic_array(char, item, -1) -#undef __bitmask -#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item, len) u32 item; + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, -1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) + +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ @@ -293,6 +317,19 @@ TRACE_MAKE_SYSTEM_STR(); #undef __get_str #define __get_str(field) ((char *)__get_dynamic_array(field)) +#undef __get_rel_dynamic_array +#define __get_rel_dynamic_array(field) \ + ((void *)(&__entry->__rel_loc_##field) + \ + sizeof(__entry->__rel_loc_##field) + \ + (__entry->__rel_loc_##field & 0xffff)) + +#undef __get_rel_dynamic_array_len +#define __get_rel_dynamic_array_len(field) \ + ((__entry->__rel_loc_##field >> 16) & 0xffff) + +#undef __get_rel_str +#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) + #undef __get_bitmask #define __get_bitmask(field) \ ({ \ @@ -302,6 +339,15 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ }) +#undef __get_rel_bitmask +#define __get_rel_bitmask(field) \ + ({ \ + void *__bitmask = __get_rel_dynamic_array(field); \ + unsigned int __bitmask_size; \ + __bitmask_size = __get_rel_dynamic_array_len(field); \ + trace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ + }) + #undef __print_flags #define __print_flags(flag, delim, flag_array...) \ ({ \ @@ -471,6 +517,21 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) +#undef __rel_dynamic_array +#define __rel_dynamic_array(_type, _item, _len) { \ + .type = "__rel_loc " #_type "[]", .name = #_item, \ + .size = 4, .align = 4, \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }, + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, -1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) + +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ static struct trace_event_fields trace_event_fields_##call[] = { \ @@ -519,6 +580,22 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ #undef __string_len #define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1) +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item, len) \ + __item_length = (len) * sizeof(type); \ + __data_offsets->item = __data_size + \ + offsetof(typeof(*entry), __data) - \ + offsetof(typeof(*entry), __rel_loc_##item) - \ + sizeof(u32); \ + __data_offsets->item |= __item_length << 16; \ + __data_size += __item_length; + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, \ + strlen((src) ? (const char *)(src) : "(null)") + 1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, (len) + 1) /* * __bitmask_size_in_bytes_raw is the number of bytes needed to hold * num_possible_cpus(). @@ -542,6 +619,10 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, \ __bitmask_size_in_longs(nr_bits)) +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, \ + __bitmask_size_in_longs(nr_bits)) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static inline notrace int trace_event_get_offsets_##call( \ @@ -706,6 +787,37 @@ static inline notrace int trace_event_get_offsets_##call( \ #define __assign_bitmask(dst, src, nr_bits) \ memcpy(__get_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item, len) \ + __entry->__rel_loc_##item = __data_offsets.item; + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, -1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) + +#undef __assign_rel_str +#define __assign_rel_str(dst, src) \ + strcpy(__get_rel_str(dst), (src) ? (const char *)(src) : "(null)"); + +#undef __assign_rel_str_len +#define __assign_rel_str_len(dst, src, len) \ + do { \ + memcpy(__get_rel_str(dst), (src), (len)); \ + __get_rel_str(dst)[len] = '\0'; \ + } while (0) + +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) + +#undef __get_rel_bitmask +#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) + +#undef __assign_rel_bitmask +#define __assign_rel_bitmask(dst, src, nr_bits) \ + memcpy(__get_rel_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) + #undef TP_fast_assign #define TP_fast_assign(args...) args @@ -770,6 +882,10 @@ static inline void ftrace_test_probe_##call(void) \ #undef __get_dynamic_array_len #undef __get_str #undef __get_bitmask +#undef __get_rel_dynamic_array +#undef __get_rel_dynamic_array_len +#undef __get_rel_str +#undef __get_rel_bitmask #undef __print_array #undef __print_hex_dump diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5db2bec8ca7e6f..7162157b970bfd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -83,6 +83,9 @@ enum trace_type { #undef __dynamic_array #define __dynamic_array(type, item) type item[]; +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item) type item[]; + #undef F_STRUCT #define F_STRUCT(args...) args From b466b13321648d3bd7bfceb9afcba5302b585cf8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Nov 2021 18:30:30 +0900 Subject: [PATCH 04/50] samples/trace_event: Add '__rel_loc' using sample event Add '__rel_loc' using sample event for testing. User can use this for testing purpose. There is no reason to use this macro from the kernel. Link: https://lkml.kernel.org/r/163757343050.510314.2876529802471645178.stgit@devnote2 Cc: Beau Belgrave Cc: Namhyung Kim Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- samples/trace_events/trace-events-sample.c | 3 ++ samples/trace_events/trace-events-sample.h | 33 ++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 1a72b7d95cdc74..4d34dc0b0fee7d 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -21,6 +21,7 @@ static const char *random_strings[] = { static void simple_thread_func(int cnt) { + unsigned long bitmask[1] = {0xdeadbeefUL}; int array[6]; int len = cnt % 5; int i; @@ -43,6 +44,8 @@ static void simple_thread_func(int cnt) trace_foo_with_template_cond("prints other times", cnt); trace_foo_with_template_print("I have to be different", cnt); + + trace_foo_rel_loc("Hello __rel_loc", cnt, bitmask); } static int simple_thread(void *arg) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index e61471ab7d1418..5ab74fc9a2dfe8 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -506,6 +506,39 @@ DEFINE_EVENT_PRINT(foo_template, foo_with_template_print, TP_ARGS(foo, bar), TP_printk("bar %s %d", __get_str(foo), __entry->bar)); +/* + * There are yet another __rel_loc dynamic data attribute. If you + * use __rel_dynamic_array() and __rel_string() etc. macros, you + * can use this attribute. There is no difference from the viewpoint + * of functionality with/without 'rel' but the encoding is a bit + * different. This is expected to be used with user-space event, + * there is no reason that the kernel event use this, but only for + * testing. + */ + +TRACE_EVENT(foo_rel_loc, + + TP_PROTO(const char *foo, int bar, unsigned long *mask), + + TP_ARGS(foo, bar, mask), + + TP_STRUCT__entry( + __rel_string( foo, foo ) + __field( int, bar ) + __rel_bitmask( bitmask, + BITS_PER_BYTE * sizeof(unsigned long) ) + ), + + TP_fast_assign( + __assign_rel_str(foo, foo); + __entry->bar = bar; + __assign_rel_bitmask(bitmask, mask, + BITS_PER_BYTE * sizeof(unsigned long)); + ), + + TP_printk("foo_rel_loc %s, %d, %s", __get_rel_str(foo), __entry->bar, + __get_rel_bitmask(bitmask)) +); #endif /***** NOTICE! The #if protection ends here. *****/ From cd7729043b31b32b34daaa6775e255b76de7f976 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Nov 2021 18:30:40 +0900 Subject: [PATCH 05/50] libtraceevent: Add __rel_loc relative location attribute support Add '__rel_loc' new dynamic data location attribute which encodes the data location from the next to the field itself. This is similar to the '__data_loc' but the location offset is not from the event entry but from the next of the field. This patch adds '__rel_loc' decoding support in the libtraceevent. Link: https://lkml.kernel.org/r/163757343994.510314.13241077597729303802.stgit@devnote2 Cc: Beau Belgrave Cc: Namhyung Kim Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/lib/traceevent/event-parse.c | 59 +++++++++++++++++++---------- tools/lib/traceevent/event-parse.h | 5 ++- tools/lib/traceevent/parse-filter.c | 5 ++- 3 files changed, 47 insertions(+), 22 deletions(-) diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index fe58843d047ca3..8e24c4c78c7f0b 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -1367,6 +1367,14 @@ static int field_is_dynamic(struct tep_format_field *field) return 0; } +static int field_is_relative_dynamic(struct tep_format_field *field) +{ + if (strncmp(field->type, "__rel_loc", 9) == 0) + return 1; + + return 0; +} + static int field_is_long(struct tep_format_field *field) { /* includes long long */ @@ -1622,6 +1630,8 @@ static int event_read_fields(struct tep_event *event, struct tep_format_field ** field->flags |= TEP_FIELD_IS_STRING; if (field_is_dynamic(field)) field->flags |= TEP_FIELD_IS_DYNAMIC; + if (field_is_relative_dynamic(field)) + field->flags |= TEP_FIELD_IS_DYNAMIC | TEP_FIELD_IS_RELATIVE; if (field_is_long(field)) field->flags |= TEP_FIELD_IS_LONG; @@ -2928,7 +2938,7 @@ process_str(struct tep_event *event __maybe_unused, struct tep_print_arg *arg, arg->type = TEP_PRINT_STRING; arg->string.string = token; - arg->string.offset = -1; + arg->string.field = NULL; if (read_expected(TEP_EVENT_DELIM, ")") < 0) goto out_err; @@ -2957,7 +2967,7 @@ process_bitmask(struct tep_event *event __maybe_unused, struct tep_print_arg *ar arg->type = TEP_PRINT_BITMASK; arg->bitmask.bitmask = token; - arg->bitmask.offset = -1; + arg->bitmask.field = NULL; if (read_expected(TEP_EVENT_DELIM, ")") < 0) goto out_err; @@ -3123,19 +3133,23 @@ process_function(struct tep_event *event, struct tep_print_arg *arg, free_token(token); return process_int_array(event, arg, tok); } - if (strcmp(token, "__get_str") == 0) { + if (strcmp(token, "__get_str") == 0 || + strcmp(token, "__get_rel_str") == 0) { free_token(token); return process_str(event, arg, tok); } - if (strcmp(token, "__get_bitmask") == 0) { + if (strcmp(token, "__get_bitmask") == 0 || + strcmp(token, "__get_rel_bitmask") == 0) { free_token(token); return process_bitmask(event, arg, tok); } - if (strcmp(token, "__get_dynamic_array") == 0) { + if (strcmp(token, "__get_dynamic_array") == 0 || + strcmp(token, "__get_rel_dynamic_array") == 0) { free_token(token); return process_dynamic_array(event, arg, tok); } - if (strcmp(token, "__get_dynamic_array_len") == 0) { + if (strcmp(token, "__get_dynamic_array_len") == 0 || + strcmp(token, "__get_rel_dynamic_array_len") == 0) { free_token(token); return process_dynamic_array_len(event, arg, tok); } @@ -4163,14 +4177,16 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, case TEP_PRINT_STRING: { int str_offset; - if (arg->string.offset == -1) { - struct tep_format_field *f; + if (!arg->string.field) + arg->string.field = tep_find_any_field(event, arg->string.string); + if (!arg->string.field) + break; - f = tep_find_any_field(event, arg->string.string); - arg->string.offset = f->offset; - } - str_offset = data2host4(tep, *(unsigned int *)(data + arg->string.offset)); + str_offset = data2host4(tep, + *(unsigned int *)(data + arg->string.field->offset)); str_offset &= 0xffff; + if (arg->string.field->flags & TEP_FIELD_IS_RELATIVE) + str_offset += arg->string.field->offset + arg->string.field->size; print_str_to_seq(s, format, len_arg, ((char *)data) + str_offset); break; } @@ -4181,15 +4197,16 @@ static void print_str_arg(struct trace_seq *s, void *data, int size, int bitmask_offset; int bitmask_size; - if (arg->bitmask.offset == -1) { - struct tep_format_field *f; - - f = tep_find_any_field(event, arg->bitmask.bitmask); - arg->bitmask.offset = f->offset; - } - bitmask_offset = data2host4(tep, *(unsigned int *)(data + arg->bitmask.offset)); + if (!arg->bitmask.field) + arg->bitmask.field = tep_find_any_field(event, arg->bitmask.bitmask); + if (!arg->bitmask.field) + break; + bitmask_offset = data2host4(tep, + *(unsigned int *)(data + arg->bitmask.field->offset)); bitmask_size = bitmask_offset >> 16; bitmask_offset &= 0xffff; + if (arg->bitmask.field->flags & TEP_FIELD_IS_RELATIVE) + bitmask_offset += arg->bitmask.field->offset + arg->bitmask.field->size; print_bitmask_to_seq(tep, s, format, len_arg, data + bitmask_offset, bitmask_size); break; @@ -5109,6 +5126,8 @@ void tep_print_field(struct trace_seq *s, void *data, offset = val; len = offset >> 16; offset &= 0xffff; + if (field->flags & TEP_FIELD_IS_RELATIVE) + offset += field->offset + field->size; } if (field->flags & TEP_FIELD_IS_STRING && is_printable_array(data + offset, len)) { @@ -6987,6 +7006,8 @@ void *tep_get_field_raw(struct trace_seq *s, struct tep_event *event, data + offset, field->size); *len = offset >> 16; offset &= 0xffff; + if (field->flags & TEP_FIELD_IS_RELATIVE) + offset += field->offset + field->size; } else *len = field->size; diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index a67ad9a5b835a2..41d4f9f6a84367 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -125,6 +125,7 @@ enum tep_format_flags { TEP_FIELD_IS_LONG = 32, TEP_FIELD_IS_FLAG = 64, TEP_FIELD_IS_SYMBOLIC = 128, + TEP_FIELD_IS_RELATIVE = 256, }; struct tep_format_field { @@ -153,12 +154,12 @@ struct tep_print_arg_atom { struct tep_print_arg_string { char *string; - int offset; + struct tep_format_field *field; }; struct tep_print_arg_bitmask { char *bitmask; - int offset; + struct tep_format_field *field; }; struct tep_print_arg_field { diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c index 368826bb5a5729..5df177070d53a0 100644 --- a/tools/lib/traceevent/parse-filter.c +++ b/tools/lib/traceevent/parse-filter.c @@ -1712,8 +1712,11 @@ static const char *get_field_str(struct tep_filter_arg *arg, struct tep_record * if (arg->str.field->flags & TEP_FIELD_IS_DYNAMIC) { addr = *(unsigned int *)val; - val = record->data + (addr & 0xffff); size = addr >> 16; + addr &= 0xffff; + if (arg->str.field->flags & TEP_FIELD_IS_RELATIVE) + addr += arg->str.field->offset + arg->str.field->size; + val = record->data + addr; } /* From 7c689c839734a23eda855e69a56ed4795533bf71 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Nov 2021 18:30:48 +0900 Subject: [PATCH 06/50] tools/perf: Add '__rel_loc' event field parsing support Add new '__rel_loc' dynamic data location attribute support. This type attribute is similar to the '__data_loc' but records the offset from the field itself. The libtraceevent adds TEP_FIELD_IS_RELATIVE to the 'tep_format_field::flags' with TEP_FIELD_IS_DYNAMIC for'__rel_loc'. Link: https://lkml.kernel.org/r/163757344810.510314.12449413842136229871.stgit@devnote2 Cc: Beau Belgrave Cc: Namhyung Kim Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/perf/builtin-trace.c | 2 ++ tools/perf/util/data-convert-bt.c | 2 ++ tools/perf/util/evsel.c | 2 ++ tools/perf/util/python.c | 2 ++ tools/perf/util/scripting-engines/trace-event-perl.c | 2 ++ tools/perf/util/scripting-engines/trace-event-python.c | 2 ++ tools/perf/util/sort.c | 2 ++ 7 files changed, 14 insertions(+) diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 0b52e08e558e3c..ea517762cb0c94 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2726,6 +2726,8 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, offset = format_field__intval(field, sample, evsel->needs_swap); syscall_arg.len = offset >> 16; offset &= 0xffff; + if (field->flags & TEP_FIELD_IS_RELATIVE) + offset += field->offset + field->size; } val = (uintptr_t)(sample->raw_data + offset); diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 8f7705bbc2da8e..9e0aee276df8e1 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -318,6 +318,8 @@ static int add_tracepoint_field_value(struct ctf_writer *cw, offset = tmp_val; len = offset >> 16; offset &= 0xffff; + if (flags & TEP_FIELD_IS_RELATIVE) + offset += fmtf->offset + fmtf->size; } if (flags & TEP_FIELD_IS_ARRAY) { diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ac0127be045935..f29d37004f55c1 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2706,6 +2706,8 @@ void *evsel__rawptr(struct evsel *evsel, struct perf_sample *sample, const char if (field->flags & TEP_FIELD_IS_DYNAMIC) { offset = *(int *)(sample->raw_data + field->offset); offset &= 0xffff; + if (field->flags & TEP_FIELD_IS_RELATIVE) + offset += field->offset + field->size; } return sample->raw_data + offset; diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 563a9ba8954f31..3e7538db91070e 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -428,6 +428,8 @@ tracepoint_field(struct pyrf_event *pe, struct tep_format_field *field) offset = val; len = offset >> 16; offset &= 0xffff; + if (field->flags & TEP_FIELD_IS_RELATIVE) + offset += field->offset + field->size; } if (field->flags & TEP_FIELD_IS_STRING && is_printable_array(data + offset, len)) { diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 32a721b3e9a56a..a5d945415bbc11 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -392,6 +392,8 @@ static void perl_process_tracepoint(struct perf_sample *sample, if (field->flags & TEP_FIELD_IS_DYNAMIC) { offset = *(int *)(data + field->offset); offset &= 0xffff; + if (field->flags & TEP_FIELD_IS_RELATIVE) + offset += field->offset + field->size; } else offset = field->offset; XPUSHs(sv_2mortal(newSVpv((char *)data + offset, 0))); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index c0c010350bc2a1..d1f1501ce7fc33 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -942,6 +942,8 @@ static void python_process_tracepoint(struct perf_sample *sample, offset = val; len = offset >> 16; offset &= 0xffff; + if (field->flags & TEP_FIELD_IS_RELATIVE) + offset += field->offset + field->size; } if (field->flags & TEP_FIELD_IS_STRING && is_printable_array(data + offset, len)) { diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index a111065b484ef7..d9a106f0edb212 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2365,6 +2365,8 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, tep_read_number_field(field, a->raw_data, &dyn); offset = dyn & 0xffff; size = (dyn >> 16) & 0xffff; + if (field->flags & TEP_FIELD_IS_RELATIVE) + offset += field->offset + field->size; /* record max width for output */ if (size > hde->dynamic_len) From e07a1d576239cf836070e740d4bd7c5e8a64868f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Nov 2021 21:39:46 -0500 Subject: [PATCH 07/50] tracing: Use __this_cpu_read() in trace_event_buffer_lock_reserver() The value read by this_cpu_read() is used later and its use is expected to stay on the same CPU as being read. But this_cpu_read() does not warn if it is called without preemption disabled, where as __this_cpu_read() will check if preemption is disabled on CONFIG_DEBUG_PREEMPT Currently all callers have preemption disabled, but there may be new callers in the future that may not. Link: https://lkml.kernel.org/r/20211130024318.698165354@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4821fe6a40a51d..2e87b7bf2ba7f5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2746,7 +2746,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, if (!tr->no_filter_buffering_ref && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && - (entry = this_cpu_read(trace_buffered_event))) { + (entry = __this_cpu_read(trace_buffered_event))) { /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, From 6c536d76cfe63b79e9e468ef0876315420a19074 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Nov 2021 21:39:47 -0500 Subject: [PATCH 08/50] tracing: Disable preemption when using the filter buffer In case trace_event_buffer_lock_reserve() is called with preemption enabled, the algorithm that defines the usage of the per cpu filter buffer may fail if the task schedules to another CPU after determining which buffer it will use. Disable preemption when using the filter buffer. And because that same buffer must be used throughout the call, keep preemption disabled until the filter buffer is released. This will also keep the semantics between the use case of when the filter buffer is used, and when the ring buffer itself is used, as that case also disables preemption until the ring buffer is released. Link: https://lkml.kernel.org/r/20211130024318.880190623@goodmis.org [ Fixed warning of assignment in if statement Reported-by: kernel test robot ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 59 +++++++++++++++++++++++++------------------- kernel/trace/trace.h | 4 ++- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e87b7bf2ba7f5..e3b8c906b7b4dc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -980,6 +980,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev ring_buffer_write(buffer, event->array[0], &event->array[1]); /* Release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); + /* ring_buffer_unlock_commit() enables preemption */ + preempt_enable_notrace(); } else ring_buffer_unlock_commit(buffer, event); } @@ -2745,8 +2747,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, *current_rb = tr->array_buffer.buffer; if (!tr->no_filter_buffering_ref && - (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && - (entry = __this_cpu_read(trace_buffered_event))) { + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { + preempt_disable_notrace(); /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, @@ -2764,33 +2766,38 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, * is still quicker than no copy on match, but having * to discard out of the ring buffer on a failed match. */ - int max_len = PAGE_SIZE - struct_size(entry, array, 1); + if ((entry = __this_cpu_read(trace_buffered_event))) { + int max_len = PAGE_SIZE - struct_size(entry, array, 1); - val = this_cpu_inc_return(trace_buffered_event_cnt); + val = this_cpu_inc_return(trace_buffered_event_cnt); - /* - * Preemption is disabled, but interrupts and NMIs - * can still come in now. If that happens after - * the above increment, then it will have to go - * back to the old method of allocating the event - * on the ring buffer, and if the filter fails, it - * will have to call ring_buffer_discard_commit() - * to remove it. - * - * Need to also check the unlikely case that the - * length is bigger than the temp buffer size. - * If that happens, then the reserve is pretty much - * guaranteed to fail, as the ring buffer currently - * only allows events less than a page. But that may - * change in the future, so let the ring buffer reserve - * handle the failure in that case. - */ - if (val == 1 && likely(len <= max_len)) { - trace_event_setup(entry, type, trace_ctx); - entry->array[0] = len; - return entry; + /* + * Preemption is disabled, but interrupts and NMIs + * can still come in now. If that happens after + * the above increment, then it will have to go + * back to the old method of allocating the event + * on the ring buffer, and if the filter fails, it + * will have to call ring_buffer_discard_commit() + * to remove it. + * + * Need to also check the unlikely case that the + * length is bigger than the temp buffer size. + * If that happens, then the reserve is pretty much + * guaranteed to fail, as the ring buffer currently + * only allows events less than a page. But that may + * change in the future, so let the ring buffer reserve + * handle the failure in that case. + */ + if (val == 1 && likely(len <= max_len)) { + trace_event_setup(entry, type, trace_ctx); + entry->array[0] = len; + /* Return with preemption disabled */ + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); } - this_cpu_dec(trace_buffered_event_cnt); + /* __trace_buffer_lock_reserve() disables preemption */ + preempt_enable_notrace(); } entry = __trace_buffer_lock_reserve(*current_rb, type, len, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7162157b970bfd..8bd1a815ce90b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1337,10 +1337,12 @@ __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { if (this_cpu_read(trace_buffered_event) == event) { - /* Simply release the temp buffer */ + /* Simply release the temp buffer and enable preemption */ this_cpu_dec(trace_buffered_event_cnt); + preempt_enable_notrace(); return; } + /* ring_buffer_discard_commit() enables preemption */ ring_buffer_discard_commit(buffer, event); } From 3e8b1a29a0e8d300466cf2a23d2f6d41971c5a0c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Nov 2021 21:39:48 -0500 Subject: [PATCH 09/50] tracing: Have eprobes use filtering logic of trace events The eprobes open code the reserving of the event on the ring buffer for ftrace instead of using the ftrace event wrappers, which means that it doesn't get affected by the filters, breaking the filtering logic on user space. Link: https://lkml.kernel.org/r/20211130024319.068451680@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_eprobe.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 928867f527e70a..88487752d307cd 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -489,18 +489,12 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) if (trace_trigger_soft_disabled(edata->file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = edata->file; - dsize = get_eprobe_size(&edata->ep->tp, rec); - fbuffer.regs = NULL; - - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file, - call->event.type, - sizeof(*entry) + edata->ep->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + + entry = trace_event_buffer_reserve(&fbuffer, edata->file, + sizeof(*entry) + edata->ep->tp.size + dsize); + + if (!entry) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); From 5e6cd84e2f8bd3619b5d8f3dd4b44c0086a6ce1d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Nov 2021 21:39:49 -0500 Subject: [PATCH 10/50] tracing/kprobes: Do not open code event reserve logic As kprobe events use trace_event_buffer_commit() to commit the event to the ftrace ring buffer, for consistency, it should use trace_event_buffer_reserve() to allocate it, as the two functions are related. Link: https://lkml.kernel.org/r/20211130024319.257430762@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 33272a7b691292..d10c01948e68a6 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1383,17 +1383,11 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, if (trace_trigger_soft_disabled(trace_file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = trace_file; - dsize = __get_data_size(&tk->tp, regs); - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, - call->event.type, - sizeof(*entry) + tk->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) return; fbuffer.regs = regs; @@ -1430,16 +1424,11 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (trace_trigger_soft_disabled(trace_file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = trace_file; - dsize = __get_data_size(&tk->tp, regs); - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, - call->event.type, - sizeof(*entry) + tk->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) return; fbuffer.regs = regs; From b7d5eb267f8c234d6eda40e21c0105a1f6231d14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 6 Dec 2021 16:24:40 -0500 Subject: [PATCH 11/50] tracing/uprobes: Use trace_event_buffer_reserve() helper To be consistent with kprobes and eprobes, use trace_event_buffer_reserver() and trace_event_buffer_commit(). This will ensure that any updates to trace events will also be implemented on uprobe events. Link: https://lkml.kernel.org/r/20211206162440.69fbf96c@gandalf.local.home Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f5f0039d31e5aa..a4d5c624fe79c5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -949,8 +949,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; - struct trace_buffer *buffer; - struct ring_buffer_event *event; + struct trace_event_buffer fbuffer; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); @@ -965,12 +964,10 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - call->event.type, size, 0); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) return; - entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { entry->vaddr[0] = func; entry->vaddr[1] = instruction_pointer(regs); @@ -982,7 +979,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(trace_file, buffer, event, entry, 0); + trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ From 1d83c3a20b0c5708b51c16a021ab76305dbb9943 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sun, 16 May 2021 02:24:10 +0000 Subject: [PATCH 12/50] tracing: Fix synth_event_add_val() kernel-doc comment It's named field here. Link: https://lkml.kernel.org/r/20210516022410.64271-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 22db3ce95e74f2..98e0026489949a 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1978,7 +1978,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_next_val); /** * synth_event_add_val - Add a named field's value to an open synth trace * @field_name: The name of the synthetic event field value to set - * @val: The value to set the next field to + * @val: The value to set the named field to * @trace_state: A pointer to object tracking the piecewise trace state * * Set the value of the named field in an event that's been opened by From a6ed2aee54644cfa2d04ca86308767f5c3a087e8 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 24 Nov 2021 12:03:08 +0100 Subject: [PATCH 13/50] tracing: Switch to kvfree_rcu() API Instead of invoking a synchronize_rcu() to free a pointer after a grace period we can directly make use of new API that does the same but in more efficient way. Link: https://lkml.kernel.org/r/20211124110308.2053-10-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 3 +-- kernel/trace/trace_probe.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 7520d43aed5540..4719a848bf1740 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -138,8 +138,7 @@ static void osnoise_unregister_instance(struct trace_array *tr) if (!found) return; - synchronize_rcu(); - kfree(inst); + kvfree_rcu(inst); } /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 3ed2a3f372972c..8a3822818bf88e 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1138,8 +1138,7 @@ int trace_probe_remove_file(struct trace_probe *tp, return -ENOENT; list_del_rcu(&link->list); - synchronize_rcu(); - kfree(link); + kvfree_rcu(link); if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); From 2972e3050e3517a85ca1813b227d4c302e804343 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Tue, 7 Dec 2021 14:25:58 +0000 Subject: [PATCH 14/50] tracing: Make trace_marker{,_raw} stream-like The tracing marker files are write-only streams with no meaningful concept of file position. Using stream_open() to mark them as stream-link indicates this and has the added advantage that a single file descriptor can now be used from multiple threads without contention thanks to clearing FMODE_ATOMIC_POS. Note that this has the potential to break existing userspace by since both lseek(2) and pwrite(2) will now return ESPIPE when previously lseek would have updated the stored offset and pwrite would have appended to the trace. A survey of libtracefs and several other projects found to use trace_marker(_raw) [1][2][3] suggests that everyone limits themselves to calling write(2) and close(2) on these file descriptors so there is a good chance this will go unnoticed and the benefits of reduced overhead and lock contention seem worth the risk. [1] https://github.com/google/perfetto [2] https://github.com/intel/media-driver/ [3] https://w1.fi/cgit/hostap/ Link: https://lkml.kernel.org/r/20211207142558.347029-1-john@metanate.com Signed-off-by: John Keeping Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3b8c906b7b4dc..588de6df473f4f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4841,6 +4841,12 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + stream_open(inode, filp); + return tracing_open_generic_tr(inode, filp); +} + static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -7117,9 +7123,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tt) event_triggers_post_call(tr->trace_marker_file, tt); - if (written > 0) - *fpos += written; - return written; } @@ -7178,9 +7181,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); - if (written > 0) - *fpos += written; - return written; } @@ -7580,16 +7580,14 @@ static const struct file_operations tracing_free_buffer_fops = { }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_write, - .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_mark_raw_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_raw_write, - .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; From c8a7ff13f5fc6e6b313de2a8e02304c2a90a3906 Mon Sep 17 00:00:00 2001 From: Yinan Liu Date: Tue, 7 Dec 2021 23:13:47 +0800 Subject: [PATCH 15/50] script/sorttable: Code style improvements Modified the code style issue of if() {}, keep the code style consistent. Link: https://lkml.kernel.org/r/20211207151348.54921-3-yinan@linux.alibaba.com Signed-off-by: Yinan Liu Signed-off-by: Steven Rostedt (VMware) --- scripts/sorttable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/sorttable.h b/scripts/sorttable.h index a2baa2fefb1379..7b9745cf8c7091 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -364,11 +364,11 @@ static int do_sort(Elf_Ehdr *ehdr, void *retval = NULL; /* wait for ORC tables sort done */ rc = pthread_join(orc_sort_thread, &retval); - if (rc) + if (rc) { fprintf(stderr, "pthread_join failed '%s': %s\n", strerror(errno), fname); - else if (retval) { + } else if (retval) { rc = -1; fprintf(stderr, "failed to sort ORC tables '%s': %s\n", From ee34c52c7115b60a7c96b2034dd1737765dd5493 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 8 Dec 2021 10:27:31 -0500 Subject: [PATCH 16/50] tracefs: Use d_inode() helper function to get the dentry inode Instead of referencing the inode from a dentry via dentry->d_inode, use the helper function d_inode(dentry) instead. This is the considered the correct way to access it. Reported-by: Christian Brauner Reported: https://lore.kernel.org/all/20211208104454.nhxyvmmn6d2qhpwl@wittgenstein/ Signed-off-by: Steven Rostedt (VMware) --- fs/tracefs/inode.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 925a621b432e3c..9899c6078c952d 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -109,12 +109,12 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * also the directory that is being deleted. */ inode_unlock(inode); - inode_unlock(dentry->d_inode); + inode_unlock(d_inode(dentry)); ret = tracefs_ops.rmdir(name); inode_lock_nested(inode, I_MUTEX_PARENT); - inode_lock(dentry->d_inode); + inode_lock(d_inode(dentry)); kfree(name); @@ -212,7 +212,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) static int tracefs_apply_options(struct super_block *sb) { struct tracefs_fs_info *fsi = sb->s_fs_info; - struct inode *inode = sb->s_root->d_inode; + struct inode *inode = d_inode(sb->s_root); struct tracefs_mount_opts *opts = &fsi->mount_opts; inode->i_mode &= ~S_IALLUGO; @@ -331,18 +331,18 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - inode_lock(parent->d_inode); - if (unlikely(IS_DEADDIR(parent->d_inode))) + inode_lock(d_inode(parent)); + if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry) && dentry->d_inode) { + if (!IS_ERR(dentry) && d_inode(dentry)) { dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) { - inode_unlock(parent->d_inode); + inode_unlock(d_inode(parent)); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - inode_unlock(dentry->d_parent->d_inode); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - inode_unlock(dentry->d_parent->d_inode); + inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -415,7 +415,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); + fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } @@ -440,8 +440,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } From e161c6bf3955d737f755f8eaa3b92de4bc6bd0e7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 25 Nov 2021 21:28:52 +0100 Subject: [PATCH 17/50] tracing: Iterate trace_[ku]probe objects directly As suggested by Linus [1] using list_for_each_entry to iterate directly trace_[ku]probe objects so we can skip another call to container_of in these loops. [1] https://lore.kernel.org/r/CAHk-=wjakjw6-rDzDDBsuMoDCqd+9ogifR_EE1F0K-jYek1CdA@mail.gmail.com Link: https://lkml.kernel.org/r/20211125202852.406405-1-jolsa@kernel.org Suggested-by: Linus Torvalds Signed-off-by: Jiri Olsa Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 13 ++++--------- kernel/trace/trace_uprobe.c | 23 ++++++++--------------- 2 files changed, 12 insertions(+), 24 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d10c01948e68a6..f8c26ee72de3d0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -327,11 +327,9 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) static void __disable_trace_kprobe(struct trace_probe *tp) { - struct trace_probe *pos; struct trace_kprobe *tk; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tk = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { if (!trace_kprobe_is_registered(tk)) continue; if (trace_kprobe_is_return(tk)) @@ -348,7 +346,7 @@ static void __disable_trace_kprobe(struct trace_probe *tp) static int enable_trace_kprobe(struct trace_event_call *call, struct trace_event_file *file) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_kprobe *tk; bool enabled; int ret = 0; @@ -369,8 +367,7 @@ static int enable_trace_kprobe(struct trace_event_call *call, if (enabled) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tk = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { if (trace_kprobe_has_gone(tk)) continue; ret = __enable_trace_kprobe(tk); @@ -559,11 +556,9 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, struct trace_kprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; - struct trace_probe *pos; int i; - list_for_each_entry(pos, &tpe->probes, list) { - orig = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(orig, &tpe->probes, tp.list) { if (strcmp(trace_kprobe_symbol(orig), trace_kprobe_symbol(comp)) || trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a4d5c624fe79c5..3bd09d61213729 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -409,12 +409,10 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, struct trace_uprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; - struct trace_probe *pos; struct inode *comp_inode = d_real_inode(comp->path.dentry); int i; - list_for_each_entry(pos, &tpe->probes, list) { - orig = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(orig, &tpe->probes, tp.list) { if (comp_inode != d_real_inode(orig->path.dentry) || comp->offset != orig->offset) continue; @@ -1072,14 +1070,12 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) static void __probe_event_disable(struct trace_probe *tp) { - struct trace_probe *pos; struct trace_uprobe *tu; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { if (!tu->inode) continue; @@ -1091,7 +1087,7 @@ static void __probe_event_disable(struct trace_probe *tp) static int probe_event_enable(struct trace_event_call *call, struct trace_event_file *file, filter_func_t filter) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; bool enabled; int ret; @@ -1126,8 +1122,7 @@ static int probe_event_enable(struct trace_event_call *call, if (ret) goto err_flags; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = trace_uprobe_enable(tu, filter); if (ret) { __probe_event_disable(tp); @@ -1272,7 +1267,7 @@ static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, static int uprobe_perf_close(struct trace_event_call *call, struct perf_event *event) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; int ret = 0; @@ -1284,8 +1279,7 @@ static int uprobe_perf_close(struct trace_event_call *call, if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); if (ret) break; @@ -1297,7 +1291,7 @@ static int uprobe_perf_close(struct trace_event_call *call, static int uprobe_perf_open(struct trace_event_call *call, struct perf_event *event) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; int err = 0; @@ -1309,8 +1303,7 @@ static int uprobe_perf_open(struct trace_event_call *call, if (trace_uprobe_filter_add(tu->tp.event->filter, event)) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); From 4f67cca70c0f615e9cfe6ac42244f3416ec60877 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 30 Sep 2021 15:38:21 -0700 Subject: [PATCH 18/50] tracing: Do not let synth_events block other dyn_event systems during create synth_events is returning -EINVAL if the dyn_event create command does not contain ' \t'. This prevents other systems from getting called back. synth_events needs to return -ECANCELED in these cases when the command is not targeting the synth_event system. Link: https://lore.kernel.org/linux-trace-devel/20210930223821.11025-1-beaub@linux.microsoft.com Fixes: c9e759b1e8456 ("tracing: Rework synthetic event command parsing") Reviewed-by: Masami Hiramatsu Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 98e0026489949a..149011e34ad9f7 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -2053,6 +2053,13 @@ static int create_synth_event(const char *raw_command) last_cmd_set(raw_command); + name = raw_command; + + /* Don't try to process if not our system */ + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + p = strpbrk(raw_command, " \t"); if (!p) { synth_err(SYNTH_ERR_INVALID_CMD, 0); @@ -2061,12 +2068,6 @@ static int create_synth_event(const char *raw_command) fields = skip_spaces(p); - name = raw_command; - - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; - /* This interface accepts group name prefix */ if (strchr(name, '/')) { len = str_has_prefix(name, SYNTH_SYSTEM "/"); From dba879672258699223b0ce61f9e5c079b0476d92 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 10 Dec 2021 09:22:45 +0800 Subject: [PATCH 19/50] tracing: Use memset_startat helper in trace_iterator_reset() Make use of memset_startat helper to simplify the code, there should be no functional change as a result of this patch. Link: https://lkml.kernel.org/r/20211210012245.207489-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8bd1a815ce90b7..64a7ec44a63545 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1932,14 +1932,7 @@ extern struct trace_iterator *tracepoint_print_iter; */ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) { - const size_t offset = offsetof(struct trace_iterator, seq); - - /* - * Keep gcc from complaining about overwriting more than just one - * member in the structure. - */ - memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); - + memset_startat(iter, 0, seq); iter->pos = -1; } From 2768c1e7f9d7b82f9e129efe3677c783bc77b8f9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 10 Dec 2021 20:26:16 -0500 Subject: [PATCH 20/50] tracing: Use trace_iterator_reset() in tracing_read_pipe() Currently tracing_read_pipe() open codes trace_iterator_reset(). Just have it use trace_iterator_reset() instead. Link: https://lkml.kernel.org/r/20211210202616.64d432d2@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 588de6df473f4f..547d82628c2e2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6731,10 +6731,9 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - memset_startat(iter, 0, seq); + trace_iterator_reset(iter); cpumask_clear(iter->started); trace_seq_init(&iter->seq); - iter->pos = -1; trace_event_read_lock(); trace_access_lock(iter->cpu_file); From 9ec5a7d16899ed9062cc4c3dd3a13e1771411ab3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Jan 2022 08:04:11 -0600 Subject: [PATCH 21/50] tracing: Change event_command func() to parse() The name of the func() callback on event_command is too generic and is easily confused with other callbacks with that name, so change it to something that reflects its actual purpose. In this case, the main purpose of the callback is to parse an event command, so call it parse() instead. Link: https://lkml.kernel.org/r/7784e321840752ed88aac0b349c0c685fc9247b1.1641823001.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 19 +++++++++++------- kernel/trace/trace_eprobe.c | 8 ++++---- kernel/trace/trace_events_hist.c | 26 ++++++++++++------------- kernel/trace/trace_events_trigger.c | 30 ++++++++++++++--------------- 4 files changed, 44 insertions(+), 39 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 64a7ec44a63545..3b2b1bfc686ffd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1578,9 +1578,9 @@ extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); -extern int event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param); +extern int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); extern int event_enable_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, @@ -1702,7 +1702,7 @@ struct event_trigger_ops { * All the methods below, except for @set_filter() and @unreg_all(), * must be implemented. * - * @func: The callback function responsible for parsing and + * @parse: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the * user. It allocates the trigger instance and registers it with * the appropriate trace event. It makes use of the other @@ -1737,15 +1737,20 @@ struct event_trigger_ops { * * @get_trigger_ops: The callback function invoked to retrieve the * event_trigger_ops implementation associated with the command. + * This callback function allows a single event_command to + * support multiple trigger implementations via different sets of + * event_trigger_ops, depending on the value of the @param + * string. */ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; int flags; - int (*func)(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *params); + int (*parse)(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter); int (*reg)(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 88487752d307cd..84d5bfa34a9943 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -549,9 +549,9 @@ static struct event_trigger_ops eprobe_trigger_ops = { .free = eprobe_trigger_free, }; -static int eprobe_trigger_cmd_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { return -1; } @@ -580,7 +580,7 @@ static struct event_command event_trigger_cmd = { .name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, .flags = EVENT_CMD_FL_NEEDS_REC, - .func = eprobe_trigger_cmd_func, + .parse = eprobe_trigger_cmd_parse, .reg = eprobe_trigger_reg_func, .unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9b8da439149ca9..89bbbbd3a3f5d7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2761,9 +2761,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, } static struct event_command trigger_hist_cmd; -static int event_hist_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param); +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); static bool compatible_keys(struct hist_trigger_data *target_hist_data, struct hist_trigger_data *hist_data, @@ -2966,8 +2966,8 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, var_hist->hist_data = hist_data; /* Create the new histogram with our variable */ - ret = event_hist_trigger_func(&trigger_hist_cmd, file, - "", "hist", cmd); + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "", "hist", cmd); if (ret) { kfree(cmd); kfree(var_hist->cmd); @@ -5729,8 +5729,8 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_field_var_hists; i++) { file = hist_data->field_var_hists[i]->hist_data->event_file; cmd = hist_data->field_var_hists[i]->cmd; - ret = event_hist_trigger_func(&trigger_hist_cmd, file, - "!hist", "hist", cmd); + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "!hist", "hist", cmd); WARN_ON_ONCE(ret < 0); } } @@ -6146,9 +6146,9 @@ static void hist_unreg_all(struct trace_event_file *file) } } -static int event_hist_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; struct event_trigger_data *trigger_data; @@ -6331,7 +6331,7 @@ static struct event_command trigger_hist_cmd = { .name = "hist", .trigger_type = ETT_EVENT_HIST, .flags = EVENT_CMD_FL_NEEDS_REC, - .func = event_hist_trigger_func, + .parse = event_hist_trigger_parse, .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, @@ -6446,7 +6446,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file) static struct event_command trigger_hist_enable_cmd = { .name = ENABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, @@ -6457,7 +6457,7 @@ static struct event_command trigger_hist_enable_cmd = { static struct event_command trigger_hist_disable_cmd = { .name = DISABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 3d5c07239a2a8f..15aae07cbe613e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -245,7 +245,7 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) mutex_lock(&trigger_cmd_mutex); list_for_each_entry(p, &trigger_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(p, file, buff, command, next); + ret = p->parse(p, file, buff, command, next); goto out_unlock; } } @@ -622,7 +622,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, } /** - * event_trigger_callback - Generic event_command @func implementation + * event_trigger_parse - Generic event_command @parse implementation * @cmd_ops: The command ops, used for trigger registration * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger @@ -632,15 +632,15 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, * Common implementation for event command parsing and trigger * instantiation. * - * Usually used directly as the @func method in event command + * Usually used directly as the @parse method in event command * implementations. * * Return: 0 on success, errno otherwise */ static int -event_trigger_callback(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +event_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct event_trigger_data *trigger_data; struct event_trigger_ops *trigger_ops; @@ -1069,7 +1069,7 @@ onoff_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_traceon_cmd = { .name = "traceon", .trigger_type = ETT_TRACE_ONOFF, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = onoff_get_trigger_ops, @@ -1080,7 +1080,7 @@ static struct event_command trigger_traceoff_cmd = { .name = "traceoff", .trigger_type = ETT_TRACE_ONOFF, .flags = EVENT_CMD_FL_POST_TRIGGER, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = onoff_get_trigger_ops, @@ -1157,7 +1157,7 @@ snapshot_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_snapshot_cmd = { .name = "snapshot", .trigger_type = ETT_SNAPSHOT, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_snapshot_trigger, .unreg = unregister_trigger, .get_trigger_ops = snapshot_get_trigger_ops, @@ -1249,7 +1249,7 @@ static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, .flags = EVENT_CMD_FL_POST_TRIGGER, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = stacktrace_get_trigger_ops, @@ -1380,9 +1380,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { .free = event_enable_trigger_free, }; -int event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1628,7 +1628,7 @@ event_enable_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_enable_cmd = { .name = ENABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .get_trigger_ops = event_enable_get_trigger_ops, @@ -1638,7 +1638,7 @@ static struct event_command trigger_enable_cmd = { static struct event_command trigger_disable_cmd = { .name = DISABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .get_trigger_ops = event_enable_get_trigger_ops, From fb339e531bfccbd12d49b165f37636e62778b69f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Jan 2022 08:04:12 -0600 Subject: [PATCH 22/50] tracing: Change event_trigger_ops func() to trigger() The name of the func() callback on event_trigger_ops is too generic and is easily confused with other callbacks with that name, so change it to something that reflects its actual purpose. In this case, the main purpose of the callback is to implement an event trigger, so call it trigger() instead. Also add some more documentation to event_trigger_ops describing the callbacks a bit better. Link: https://lkml.kernel.org/r/36ab812e3ee74ee03ae0043fda41a858ee728c00.1641823001.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 19 ++++++++++++++---- kernel/trace/trace_eprobe.c | 2 +- kernel/trace/trace_events_hist.c | 12 ++++++------ kernel/trace/trace_events_trigger.c | 30 ++++++++++++++--------------- 4 files changed, 37 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b2b1bfc686ffd..13f23082f256ad 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1619,10 +1619,20 @@ extern int register_trigger_hist_enable_disable_cmds(void); * The methods in this structure provide per-event trigger hooks for * various trigger operations. * + * The @init and @free methods are used during trigger setup and + * teardown, typically called from an event_command's @parse() + * function implementation. + * + * The @print method is used to print the trigger spec. + * + * The @trigger method is the function that actually implements the + * trigger and is called in the context of the triggering event + * whenever that event occurs. + * * All the methods below, except for @init() and @free(), must be * implemented. * - * @func: The trigger 'probe' function called when the triggering + * @trigger: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that * registered the trigger (see struct event_command) along with @@ -1651,9 +1661,10 @@ extern int register_trigger_hist_enable_disable_cmds(void); * (see trace_event_triggers.c). */ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *rbe); + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 84d5bfa34a9943..6d363fd8a1e4c8 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -543,7 +543,7 @@ static void eprobe_trigger_func(struct event_trigger_data *data, } static struct event_trigger_ops eprobe_trigger_ops = { - .func = eprobe_trigger_func, + .trigger = eprobe_trigger_func, .print = eprobe_trigger_print, .init = eprobe_trigger_init, .free = eprobe_trigger_free, diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 89bbbbd3a3f5d7..229ce5c2dfd37b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5759,7 +5759,7 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_hist_trigger_ops = { - .func = event_hist_trigger, + .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_init, .free = event_hist_trigger_free, @@ -5793,7 +5793,7 @@ static void event_hist_trigger_named_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_hist_trigger_named_ops = { - .func = event_hist_trigger, + .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_named_init, .free = event_hist_trigger_named_free, @@ -6383,28 +6383,28 @@ hist_enable_count_trigger(struct event_trigger_data *data, } static struct event_trigger_ops hist_enable_trigger_ops = { - .func = hist_enable_trigger, + .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_enable_count_trigger_ops = { - .func = hist_enable_count_trigger, + .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_disable_trigger_ops = { - .func = hist_enable_trigger, + .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_disable_count_trigger_ops = { - .func = hist_enable_count_trigger, + .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 15aae07cbe613e..24aceeb50dc09e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -68,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, if (data->paused) continue; if (!rec) { - data->ops->func(data, buffer, rec, event); + data->ops->trigger(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -78,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, buffer, rec, event); + data->ops->trigger(data, buffer, rec, event); } return tt; } @@ -106,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, NULL, NULL, NULL); + data->ops->trigger(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -1023,28 +1023,28 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops traceon_trigger_ops = { - .func = traceon_trigger, + .trigger = traceon_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceon_count_trigger_ops = { - .func = traceon_count_trigger, + .trigger = traceon_count_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceoff_trigger_ops = { - .func = traceoff_trigger, + .trigger = traceoff_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceoff_count_trigger_ops = { - .func = traceoff_count_trigger, + .trigger = traceoff_count_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1135,14 +1135,14 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops snapshot_trigger_ops = { - .func = snapshot_trigger, + .trigger = snapshot_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops snapshot_count_trigger_ops = { - .func = snapshot_count_trigger, + .trigger = snapshot_count_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1226,14 +1226,14 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops stacktrace_trigger_ops = { - .func = stacktrace_trigger, + .trigger = stacktrace_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops stacktrace_count_trigger_ops = { - .func = stacktrace_count_trigger, + .trigger = stacktrace_count_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1353,28 +1353,28 @@ void event_enable_trigger_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_enable_trigger_ops = { - .func = event_enable_trigger, + .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_enable_count_trigger_ops = { - .func = event_enable_count_trigger, + .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_disable_trigger_ops = { - .func = event_enable_trigger, + .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_disable_count_trigger_ops = { - .func = event_enable_count_trigger, + .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, From 2378a2d6b6cf863bdd566aae495336c72bdaec99 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Jan 2022 08:04:13 -0600 Subject: [PATCH 23/50] tracing: Remove ops param from event_command reg()/unreg() callbacks The event_trigger_ops for an event_command are already accessible via event_trigger_data.ops so remove the redundant ops from the callback. Link: https://lkml.kernel.org/r/4c6f2a41820452f9cacddc7634ad442928aa2aa6.1641823001.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 ---- kernel/trace/trace_eprobe.c | 12 ++++++------ kernel/trace/trace_events_hist.c | 10 +++++----- kernel/trace/trace_events_trigger.c | 22 +++++++++------------- 4 files changed, 20 insertions(+), 28 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13f23082f256ad..22a1e8635acf54 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1582,11 +1582,9 @@ extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param); extern int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); extern void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); @@ -1763,11 +1761,9 @@ struct event_command { char *glob, char *cmd, char *param_and_filter); int (*reg)(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg)(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg_all)(struct trace_event_file *file); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 6d363fd8a1e4c8..191db32dec4698 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -556,16 +556,16 @@ static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, return -1; } -static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +static int eprobe_trigger_reg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) { return -1; } -static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +static void eprobe_trigger_unreg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) { } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 229ce5c2dfd37b..5e6a988a8a517e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5910,7 +5910,7 @@ static bool hist_trigger_match(struct event_trigger_data *data, return true; } -static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, +static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -6062,7 +6062,7 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data, return false; } -static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, +static void hist_unregister_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -6262,7 +6262,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) @@ -6271,7 +6271,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of triggers registered, * but if it didn't register any it returns zero. Consider no @@ -6314,7 +6314,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, return ret; out_unreg: - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 24aceeb50dc09e..d40b857db5723e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -540,7 +540,6 @@ void update_cond_flag(struct trace_event_file *file) /** * register_trigger - Generic event_command @reg implementation * @glob: The raw string used to register the trigger - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data to associate with the trigger * @file: The trace_event_file associated with the event * @@ -551,7 +550,7 @@ void update_cond_flag(struct trace_event_file *file) * * Return: 0 on success, errno otherwise */ -static int register_trigger(char *glob, struct event_trigger_ops *ops, +static int register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -589,7 +588,6 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, /** * unregister_trigger - Generic event_command @unreg implementation * @glob: The raw string used to register the trigger - * @ops: The trigger ops associated with the trigger * @test: Trigger-specific data used to find the trigger to remove * @file: The trace_event_file associated with the event * @@ -598,7 +596,7 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, * Usually used directly as the @unreg method in event command * implementations. */ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, +static void unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file) { @@ -673,7 +671,7 @@ event_trigger_parse(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); kfree(trigger_data); ret = 0; goto out; @@ -708,14 +706,14 @@ event_trigger_parse(struct event_command *cmd_ops, out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ event_trigger_init(trigger_ops, trigger_data); - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ if (!ret) { - cmd_ops->unreg(glob, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob, trigger_data, file); ret = -ENOENT; } else if (ret > 0) ret = 0; @@ -1116,14 +1114,14 @@ snapshot_count_trigger(struct event_trigger_data *data, } static int -register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, +register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { if (tracing_alloc_snapshot_instance(file->tr) != 0) return 0; - return register_trigger(glob, ops, data, file); + return register_trigger(glob, data, file); } static int @@ -1455,7 +1453,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, trigger_data->private_data = enable_data; if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); kfree(trigger_data); kfree(enable_data); ret = 0; @@ -1502,7 +1500,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = trace_event_enable_disable(event_enable_file, 1, 1); if (ret < 0) goto out_put; - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. @@ -1532,7 +1530,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, } int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { @@ -1574,7 +1571,6 @@ int event_enable_register_trigger(char *glob, } void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file) { From 86599dbe2c5272588f859858239d1f52321eb0f9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Jan 2022 08:04:14 -0600 Subject: [PATCH 24/50] tracing: Add helper functions to simplify event_command.parse() callback handling The event_command.parse() callback is responsible for parsing and registering triggers. The existing command implementions for this callback duplicate a lot of the same code, so to clean up and consolidate those implementations, introduce a handful of helper functions for implementors to use. This also makes it easier for new commands to be implemented and allows them to focus more on the customizations they provide rather than obscuring and complicating it with boilerplate code. Link: https://lkml.kernel.org/r/c1ff71f594d45177706571132bd3119491097221.1641823001.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 24 ++ kernel/trace/trace_events_trigger.c | 342 ++++++++++++++++++++++++++++ 2 files changed, 366 insertions(+) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 22a1e8635acf54..d038ddbf1beaa9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1610,6 +1610,30 @@ get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); +extern bool event_trigger_check_remove(const char *glob); +extern bool event_trigger_empty_param(const char *param); +extern int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required); +extern struct event_trigger_data * +event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data); +extern int event_trigger_parse_num(char *trigger, + struct event_trigger_data *trigger_data); +extern int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data); +extern void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data); +extern int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + char *cmd, + char *trigger, + struct event_trigger_data *trigger_data, + int *n_registered); /** * struct event_trigger_ops - callbacks for trace event triggers diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d40b857db5723e..d00fee705f9c57 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -619,6 +619,348 @@ static void unregister_trigger(char *glob, data->ops->free(data->ops, data); } +/* + * Event trigger parsing helper functions. + * + * These functions help make it easier to write an event trigger + * parsing function i.e. the struct event_command.parse() callback + * function responsible for parsing and registering a trigger command + * written to the 'trigger' file. + * + * A trigger command (or just 'trigger' for short) takes the form: + * [trigger] [if filter] + * + * The struct event_command.parse() callback (and other struct + * event_command functions) refer to several components of a trigger + * command. Those same components are referenced by the event trigger + * parsing helper functions defined below. These components are: + * + * cmd - the trigger command name + * glob - the trigger command name optionally prefaced with '!' + * param_and_filter - text following cmd and ':' + * param - text following cmd and ':' and stripped of filter + * filter - the optional filter text following (and including) 'if' + * + * To illustrate the use of these componenents, here are some concrete + * examples. For the following triggers: + * + * echo 'traceon:5 if pid == 0' > trigger + * - 'traceon' is both cmd and glob + * - '5 if pid == 0' is the param_and_filter + * - '5' is the param + * - 'if pid == 0' is the filter + * + * echo 'enable_event:sys:event:n' > trigger + * - 'enable_event' is both cmd and glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'hist:keys=pid if prio > 50' > trigger + * - 'hist' is both cmd and glob + * - 'keys=pid if prio > 50' is the param_and_filter + * - 'keys=pid' is the param + * - 'if prio > 50' is the filter + * + * echo '!enable_event:sys:event:n' > trigger + * - 'enable_event' the cmd + * - '!enable_event' is the glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'traceoff' > trigger + * - 'traceoff' is both cmd and glob + * - there is no param_and_filter + * - there is no param + * - there is no filter + * + * There are a few different categories of event trigger covered by + * these helpers: + * + * - triggers that don't require a parameter e.g. traceon + * - triggers that do require a parameter e.g. enable_event and hist + * - triggers that though they may not require a param may support an + * optional 'n' param (n = number of times the trigger should fire) + * e.g.: traceon:5 or enable_event:sys:event:n + * - triggers that do not support an 'n' param e.g. hist + * + * These functions can be used or ignored as necessary - it all + * depends on the complexity of the trigger, and the granularity of + * the functions supported reflects the fact that some implementations + * may need to customize certain aspects of their implementations and + * won't need certain functions. For instance, the hist trigger + * implementation doesn't use event_trigger_separate_filter() because + * it has special requirements for handling the filter. + */ + +/** + * event_trigger_check_remove - check whether an event trigger specifies remove + * @glob: The trigger command string, with optional remove(!) operator + * + * The event trigger callback implementations pass in 'glob' as a + * parameter. This is the command name either with or without a + * remove(!) operator. This function simply parses the glob and + * determines whether the command corresponds to a trigger removal or + * a trigger addition. + * + * Return: true if this is a remove command, false otherwise + */ +bool event_trigger_check_remove(const char *glob) +{ + return (glob && glob[0] == '!') ? true : false; +} + +/** + * event_trigger_empty_param - check whether the param is empty + * @param: The trigger param string + * + * The event trigger callback implementations pass in 'param' as a + * parameter. This corresponds to the string following the command + * name minus the command name. This function can be called by a + * callback implementation for any command that requires a param; a + * callback that doesn't require a param can ignore it. + * + * Return: true if this is an empty param, false otherwise + */ +bool event_trigger_empty_param(const char *param) +{ + return !param; +} + +/** + * event_trigger_separate_filter - separate an event trigger from a filter + * @param: The param string containing trigger and possibly filter + * @trigger: outparam, will be filled with a pointer to the trigger + * @filter: outparam, will be filled with a pointer to the filter + * @param_required: Specifies whether or not the param string is required + * + * Given a param string of the form '[trigger] [if filter]', this + * function separates the filter from the trigger and returns the + * trigger in *trigger and the filter in *filter. Either the *trigger + * or the *filter may be set to NULL by this function - if not set to + * NULL, they will contain strings corresponding to the trigger and + * filter. + * + * There are two cases that need to be handled with respect to the + * passed-in param: either the param is required, or it is not + * required. If @param_required is set, and there's no param, it will + * return -EINVAL. If @param_required is not set and there's a param + * that starts with a number, that corresponds to the case of a + * trigger with :n (n = number of times the trigger should fire) and + * the parsing continues normally; otherwise the function just returns + * and assumes param just contains a filter and there's nothing else + * to do. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required) +{ + int ret = 0; + + *param = *filter = NULL; + + if (!param_and_filter) { + if (param_required) + ret = -EINVAL; + goto out; + } + + /* + * Here we check for an optional param. The only legal + * optional param is :n, and if that's the case, continue + * below. Otherwise we assume what's left is a filter and + * return it as the filter string for the caller to deal with. + */ + if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { + *filter = param_and_filter; + goto out; + } + + /* + * Separate the param from the filter (param [if filter]). + * Here we have either an optional :n param or a required + * param and an optional filter. + */ + *param = strsep(¶m_and_filter, " \t"); + + /* + * Here we have a filter, though it may be empty. + */ + if (param_and_filter) { + *filter = skip_spaces(param_and_filter); + if (!**filter) + *filter = NULL; + } +out: + return ret; +} + +/** + * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * @cmd_ops: The event_command operations for the trigger + * @cmd: The cmd string + * @param: The param string + * @private_data: User data to associate with the event trigger + * + * Allocate an event_trigger_data instance and initialize it. The + * @cmd_ops are used along with the @cmd and @param to get the + * trigger_ops to assign to the event_trigger_data. @private_data can + * also be passed in and associated with the event_trigger_data. + * + * Use event_trigger_free() to free an event_trigger_data object. + * + * Return: The trigger_data object success, NULL otherwise + */ +struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, param); + + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + return NULL; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + trigger_data->private_data = private_data; + + INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + return trigger_data; +} + +/** + * event_trigger_parse_num - parse and return the number param for a trigger + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * + * Parse the :n (n = number of times the trigger should fire) param + * and set the count variable in the trigger_data to the parsed count. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_parse_num(char *param, + struct event_trigger_data *trigger_data) +{ + char *number; + int ret = 0; + + if (param) { + number = strsep(¶m, ":"); + + if (!strlen(number)) + return -EINVAL; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + } + + return ret; +} + +/** + * event_trigger_set_filter - set an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @param: The string containing the filter + * @trigger_data: The trigger_data for the trigger + * + * Set the filter for the trigger. If the filter is NULL, just return + * without error. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data) +{ + if (param && cmd_ops->set_filter) + return cmd_ops->set_filter(param, trigger_data, file); + + return 0; +} + +/** + * event_trigger_reset_filter - reset an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @trigger_data: The trigger_data for the trigger + * + * Reset the filter for the trigger to no filter. + */ +void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data) +{ + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); +} + +/** + * event_trigger_register - register an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @cmd: The cmd string + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * @n_registered: optional outparam, the number of triggers registered + * + * Register an event trigger. The @cmd_ops are used to call the + * cmd_ops->reg() function which actually does the registration. The + * cmd_ops->reg() function returns the number of triggers registered, + * which is assigned to n_registered, if n_registered is non-NULL. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + char *cmd, + char *param, + struct event_trigger_data *trigger_data, + int *n_registered) +{ + int ret; + + if (n_registered) + *n_registered = 0; + + ret = cmd_ops->reg(glob, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + cmd_ops->unreg(glob, trigger_data, file); + ret = -ENOENT; + } else if (ret > 0) { + if (n_registered) + *n_registered = ret; + /* Just return zero, not the number of enabled functions */ + ret = 0; + } + + return ret; +} + +/* + * End event trigger parsing helper functions. + */ + /** * event_trigger_parse - Generic event_command @parse implementation * @cmd_ops: The command ops, used for trigger registration From 289e7b0f7eb47b87a0441e6c81336316f301eb39 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 13 Dec 2021 11:08:53 +0100 Subject: [PATCH 25/50] tracing: Account bottom half disabled sections. Disabling only bottom halves via local_bh_disable() disables also preemption but this remains invisible to tracing. On a CONFIG_PREEMPT kernel one might wonder why there is no scheduling happening despite the N flag in the trace. The reason might be the a rcu_read_lock_bh() section. Add a 'b' to the tracing output if in task context with disabled bottom halves. Link: https://lkml.kernel.org/r/YbcbtdtC/bjCKo57@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 6 ++++-- kernel/trace/trace_output.c | 4 ++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3900404aa0636d..70c069aef02ca8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -172,6 +172,7 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, + TRACE_FLAG_BH_OFF = 0x80, }; #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 547d82628c2e2c..a73d78dcda2c90 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2603,6 +2603,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_HARDIRQ; if (in_serving_softirq()) trace_flags |= TRACE_FLAG_SOFTIRQ; + if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) + trace_flags |= TRACE_FLAG_BH_OFF; if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; @@ -4190,7 +4192,7 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" + "# / _-----=> irqs-off/BH-disabled\n" "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" @@ -4231,7 +4233,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3547e7176ff797..8aa493d25c73eb 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -445,14 +445,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int bh_off; int nmi; nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + bh_off = entry->flags & TRACE_FLAG_BH_OFF; irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + bh_off ? 'b' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; From 8c7224245557707c613f130431cafbaaa4889615 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 14 Dec 2021 09:28:02 +0800 Subject: [PATCH 26/50] tracing/uprobes: Check the return value of kstrdup() for tu->filename kstrdup() returns NULL when some internal memory errors happen, it is better to check the return value of it so to catch the memory error in time. Link: https://lkml.kernel.org/r/tencent_3C2E330722056D7891D2C83F29C802734B06@qq.com Acked-by: Masami Hiramatsu Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU") Signed-off-by: Xiaoke Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3bd09d61213729..08b0e841730268 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1609,6 +1609,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->path = path; tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); + if (!tu->filename) { + ret = -ENOMEM; + goto error; + } + init_trace_event_call(tu); ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; From 1c1857d400355e96f0fe8b32adc6fa7594d03b52 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 14 Dec 2021 10:26:46 +0800 Subject: [PATCH 27/50] tracing/probes: check the return value of kstrndup() for pbuf kstrndup() is a memory allocation-related function, it returns NULL when some internal memory errors happen. It is better to check the return value of it so to catch the memory error in time. Link: https://lkml.kernel.org/r/tencent_4D6E270731456EB88712ED7F13883C334906@qq.com Acked-by: Masami Hiramatsu Fixes: a42e3c4de964 ("tracing/probe: Add immediate string parameter support") Signed-off-by: Xiaoke Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace_probe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8a3822818bf88e..73d90179b51bd3 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -356,6 +356,8 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) return -EINVAL; } *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + if (!*pbuf) + return -ENOMEM; return 0; } From 72b3942a173c387b27860ba1069636726e208777 Mon Sep 17 00:00:00 2001 From: Yinan Liu Date: Sun, 12 Dec 2021 19:33:58 +0800 Subject: [PATCH 28/50] scripts: ftrace - move the sort-processing in ftrace_init When the kernel starts, the initialization of ftrace takes up a portion of the time (approximately 6~8ms) to sort mcount addresses. We can save this time by moving mcount-sorting to compile time. Link: https://lkml.kernel.org/r/20211212113358.34208-2-yinan@linux.alibaba.com Signed-off-by: Yinan Liu Reported-by: kernel test robot Reported-by: kernel test robot Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 11 +++- scripts/Makefile | 6 +- scripts/link-vmlinux.sh | 6 +- scripts/sorttable.c | 2 + scripts/sorttable.h | 120 +++++++++++++++++++++++++++++++++++++++- 5 files changed, 137 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30bc880c3849cb..9ca63df6553a83 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6406,8 +6406,15 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; - sort(start, count, sizeof(*start), - ftrace_cmp_ips, NULL); + /* + * Sorting mcount in vmlinux at build time depend on + * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in + * modules can not be sorted at build time. + */ + if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) { + sort(start, count, sizeof(*start), + ftrace_cmp_ips, NULL); + } start_pg = ftrace_allocate_pages(count); if (!start_pg) diff --git a/scripts/Makefile b/scripts/Makefile index 9adb6d247818f1..b082d2f9335746 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -17,6 +17,7 @@ hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert hostprogs-always-$(CONFIG_SYSTEM_REVOCATION_LIST) += extract-cert HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include +HOSTLDLIBS_sorttable = -lpthread HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include HOSTCFLAGS_sign-file.o = $(CRYPTO_CFLAGS) HOSTLDLIBS_sign-file = $(CRYPTO_LIBS) @@ -29,7 +30,10 @@ ARCH := x86 endif HOSTCFLAGS_sorttable.o += -I$(srctree)/tools/arch/x86/include HOSTCFLAGS_sorttable.o += -DUNWINDER_ORC_ENABLED -HOSTLDLIBS_sorttable = -lpthread +endif + +ifdef CONFIG_DYNAMIC_FTRACE +HOSTCFLAGS_sorttable.o += -DMCOUNT_SORT_ENABLED endif # The following programs are only built on demand diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 5cdd9bc5c3852b..dd9955f45774cc 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -400,6 +400,9 @@ if [ -n "${CONFIG_DEBUG_INFO_BTF}" -a -n "${CONFIG_BPF}" ]; then ${RESOLVE_BTFIDS} vmlinux fi +info SYSMAP System.map +mksysmap vmlinux System.map + if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then info SORTTAB vmlinux if ! sorttable vmlinux; then @@ -408,9 +411,6 @@ if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then fi fi -info SYSMAP System.map -mksysmap vmlinux System.map - # step a (see comment above) if [ -n "${CONFIG_KALLSYMS}" ]; then mksysmap ${kallsyms_vmlinux} .tmp_System.map diff --git a/scripts/sorttable.c b/scripts/sorttable.c index b7c2ad71f9cfab..70bdc787ddfb66 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 7b9745cf8c7091..1e8b77928fa476 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -19,6 +19,9 @@ #undef extable_ent_size #undef compare_extable +#undef get_mcount_loc +#undef sort_mcount_loc +#undef elf_mcount_loc #undef do_sort #undef Elf_Addr #undef Elf_Ehdr @@ -41,6 +44,9 @@ #ifdef SORTTABLE_64 # define extable_ent_size 16 # define compare_extable compare_extable_64 +# define get_mcount_loc get_mcount_loc_64 +# define sort_mcount_loc sort_mcount_loc_64 +# define elf_mcount_loc elf_mcount_loc_64 # define do_sort do_sort_64 # define Elf_Addr Elf64_Addr # define Elf_Ehdr Elf64_Ehdr @@ -62,6 +68,9 @@ #else # define extable_ent_size 8 # define compare_extable compare_extable_32 +# define get_mcount_loc get_mcount_loc_32 +# define sort_mcount_loc sort_mcount_loc_32 +# define elf_mcount_loc elf_mcount_loc_32 # define do_sort do_sort_32 # define Elf_Addr Elf32_Addr # define Elf_Ehdr Elf32_Ehdr @@ -84,8 +93,6 @@ #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) /* ORC unwinder only support X86_64 */ -#include -#include #include #define ERRSTR_MAXSZ 256 @@ -191,7 +198,64 @@ static int compare_extable(const void *a, const void *b) return 1; return 0; } +#ifdef MCOUNT_SORT_ENABLED +struct elf_mcount_loc { + Elf_Ehdr *ehdr; + Elf_Shdr *init_data_sec; + uint_t start_mcount_loc; + uint_t stop_mcount_loc; +}; + +/* Sort the addresses stored between __start_mcount_loc to __stop_mcount_loc in vmlinux */ +static void *sort_mcount_loc(void *arg) +{ + struct elf_mcount_loc *emloc = (struct elf_mcount_loc *)arg; + uint_t offset = emloc->start_mcount_loc - _r(&(emloc->init_data_sec)->sh_addr) + + _r(&(emloc->init_data_sec)->sh_offset); + uint_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc; + unsigned char *start_loc = (void *)emloc->ehdr + offset; + + qsort(start_loc, count/sizeof(uint_t), sizeof(uint_t), compare_extable); + return NULL; +} + +/* Get the address of __start_mcount_loc and __stop_mcount_loc in System.map */ +static void get_mcount_loc(uint_t *_start, uint_t *_stop) +{ + FILE *file_start, *file_stop; + char start_buff[20]; + char stop_buff[20]; + int len = 0; + + file_start = popen(" grep start_mcount System.map | awk '{print $1}' ", "r"); + if (!file_start) { + fprintf(stderr, "get start_mcount_loc error!"); + return; + } + + file_stop = popen(" grep stop_mcount System.map | awk '{print $1}' ", "r"); + if (!file_stop) { + fprintf(stderr, "get stop_mcount_loc error!"); + pclose(file_start); + return; + } + + while (fgets(start_buff, sizeof(start_buff), file_start) != NULL) { + len = strlen(start_buff); + start_buff[len - 1] = '\0'; + } + *_start = strtoul(start_buff, NULL, 16); + + while (fgets(stop_buff, sizeof(stop_buff), file_stop) != NULL) { + len = strlen(stop_buff); + stop_buff[len - 1] = '\0'; + } + *_stop = strtoul(stop_buff, NULL, 16); + pclose(file_start); + pclose(file_stop); +} +#endif static int do_sort(Elf_Ehdr *ehdr, char const *const fname, table_sort_t custom_sort) @@ -217,6 +281,12 @@ static int do_sort(Elf_Ehdr *ehdr, int idx; unsigned int shnum; unsigned int shstrndx; +#ifdef MCOUNT_SORT_ENABLED + struct elf_mcount_loc mstruct; + uint_t _start_mcount_loc = 0; + uint_t _stop_mcount_loc = 0; + pthread_t mcount_sort_thread; +#endif #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) unsigned int orc_ip_size = 0; unsigned int orc_size = 0; @@ -253,6 +323,17 @@ static int do_sort(Elf_Ehdr *ehdr, symtab_shndx = (Elf32_Word *)((const char *)ehdr + _r(&s->sh_offset)); +#ifdef MCOUNT_SORT_ENABLED + /* locate the .init.data section in vmlinux */ + if (!strcmp(secstrings + idx, ".init.data")) { + get_mcount_loc(&_start_mcount_loc, &_stop_mcount_loc); + mstruct.ehdr = ehdr; + mstruct.init_data_sec = s; + mstruct.start_mcount_loc = _start_mcount_loc; + mstruct.stop_mcount_loc = _stop_mcount_loc; + } +#endif + #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) /* locate the ORC unwind tables */ if (!strcmp(secstrings + idx, ".orc_unwind_ip")) { @@ -294,6 +375,23 @@ static int do_sort(Elf_Ehdr *ehdr, goto out; } #endif + +#ifdef MCOUNT_SORT_ENABLED + if (!mstruct.init_data_sec || !_start_mcount_loc || !_stop_mcount_loc) { + fprintf(stderr, + "incomplete mcount's sort in file: %s\n", + fname); + goto out; + } + + /* create thread to sort mcount_loc concurrently */ + if (pthread_create(&mcount_sort_thread, NULL, &sort_mcount_loc, &mstruct)) { + fprintf(stderr, + "pthread_create mcount_sort_thread failed '%s': %s\n", + strerror(errno), fname); + goto out; + } +#endif if (!extab_sec) { fprintf(stderr, "no __ex_table in file: %s\n", fname); goto out; @@ -376,5 +474,23 @@ static int do_sort(Elf_Ehdr *ehdr, } } #endif + +#ifdef MCOUNT_SORT_ENABLED + if (mcount_sort_thread) { + void *retval = NULL; + /* wait for mcount sort done */ + rc = pthread_join(mcount_sort_thread, &retval); + if (rc) { + fprintf(stderr, + "pthread_join failed '%s': %s\n", + strerror(errno), fname); + } else if (retval) { + rc = -1; + fprintf(stderr, + "failed to sort mcount '%s': %s\n", + (char *)retval, fname); + } + } +#endif return rc; } From 8147dc78e6e4b645f8277bdf377f2193ddfcdee1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 6 Dec 2021 15:18:58 -0500 Subject: [PATCH 29/50] ftrace: Add test to make sure compiled time sorts work Now that ftrace function pointers are sorted at compile time, add a test that makes sure they are sorted at run time. This test is only run if it is configured in. Link: https://lkml.kernel.org/r/20211206151858.4d21a24d@gandalf.local.home Cc: Yinan Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 14 ++++++++++++++ kernel/trace/ftrace.c | 23 +++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 420ff4bc67fd70..f468767bc287f1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -915,6 +915,20 @@ config EVENT_TRACE_TEST_SYSCALLS TBD - enable a way to actually call the syscalls as we test their events +config FTRACE_SORT_STARTUP_TEST + bool "Verify compile time sorting of ftrace functions" + depends on DYNAMIC_FTRACE + depends on BUILDTIME_TABLE_SORT + help + Sorting of the mcount_loc sections that is used to find the + where the ftrace knows where to patch functions for tracing + and other callbacks is done at compile time. But if the sort + is not done correctly, it will cause non-deterministic failures. + When this is set, the sorted sections will be verified that they + are in deed sorted and will warn if they are not. + + If unsure, say N + config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9ca63df6553a83..403e485bf09199 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6388,6 +6388,27 @@ static int ftrace_cmp_ips(const void *a, const void *b) return 0; } +#ifdef CONFIG_FTRACE_SORT_STARTUP_TEST +static void test_is_sorted(unsigned long *start, unsigned long count) +{ + int i; + + for (i = 1; i < count; i++) { + if (WARN(start[i - 1] > start[i], + "[%d] %pS at %lx is not sorted with %pS at %lx\n", i, + (void *)start[i - 1], start[i - 1], + (void *)start[i], start[i])) + break; + } + if (i == count) + pr_info("ftrace section at %px sorted properly\n", start); +} +#else +static void test_is_sorted(unsigned long *start, unsigned long count) +{ +} +#endif + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -6414,6 +6435,8 @@ static int ftrace_process_locs(struct module *mod, if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) { sort(start, count, sizeof(*start), ftrace_cmp_ips, NULL); + } else { + test_is_sorted(start, count); } start_pg = ftrace_allocate_pages(count); From ecbe794e777aab99de3b753110801a9cda747d2d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 22 Dec 2021 19:00:25 +0800 Subject: [PATCH 30/50] tracing: Fix mismatched comment in __string_len Here __assign_str_len() should be used for the __string_len type, instead of __assign_str() in the comment. Link: https://lkml.kernel.org/r/5c012db463392d0e6d4f0636203d778962ad060a.1640170494.git.geliang.tang@suse.com Reviewed-by: Masami Hiramatsu Fixes: 883b4aee4dec6 ("tracing: Add trace_event helper macros __string_len() and __assign_str_len()") Signed-off-by: Geliang Tang Signed-off-by: Steven Rostedt --- samples/trace_events/trace-events-sample.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 5ab74fc9a2dfe8..cbbbb83beced09 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -155,7 +155,7 @@ * * To assign this string, use the helper macro __assign_str_len(). * - * __assign_str(foo, bar, len); + * __assign_str_len(foo, bar, len); * * Then len + 1 is allocated to the ring buffer, and a nul terminating * byte is added. This is similar to: From 3e2a56e6f639492311e0a8533f0a7aed60816308 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Jan 2022 17:56:56 -0500 Subject: [PATCH 31/50] tracing: Have syscall trace events use trace_event_buffer_lock_reserve() Currently, the syscall trace events call trace_buffer_lock_reserve() directly, which means that it misses out on some of the filtering optimizations provided by the helper function trace_event_buffer_lock_reserve(). Have the syscall trace events call that instead, as it was missed when adding the update to use the temp buffer when filtering. Link: https://lkml.kernel.org/r/20220107225839.823118570@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tom Zanussi Reviewed-by: Masami Hiramatsu Fixes: 0fc1b09ff1ff4 ("tracing: Use temp buffer when filtering events") Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8bfcd3b0942262..f755bde42fd07c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -323,8 +323,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, sys_data->enter_event->event.type, size, trace_ctx); if (!event) return; @@ -367,8 +366,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, sys_data->exit_event->event.type, sizeof(*entry), trace_ctx); if (!event) From 77360f9bbc7e5e2ab7a2c8b4c0244fbbfcfc6f62 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Jan 2022 11:55:32 -0500 Subject: [PATCH 32/50] tracing: Add test for user space strings when filtering on string pointers Pingfan reported that the following causes a fault: echo "filename ~ \"cpu\"" > events/syscalls/sys_enter_openat/filter echo 1 > events/syscalls/sys_enter_at/enable The reason is that trace event filter treats the user space pointer defined by "filename" as a normal pointer to compare against the "cpu" string. The following bug happened: kvm-03-guest16 login: [72198.026181] BUG: unable to handle page fault for address: 00007fffaae8ef60 #PF: supervisor read access in kernel mode #PF: error_code(0x0001) - permissions violation PGD 80000001008b7067 P4D 80000001008b7067 PUD 2393f1067 PMD 2393ec067 PTE 8000000108f47867 Oops: 0001 [#1] PREEMPT SMP PTI CPU: 1 PID: 1 Comm: systemd Kdump: loaded Not tainted 5.14.0-32.el9.x86_64 #1 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:strlen+0x0/0x20 Code: 48 89 f9 74 09 48 83 c1 01 80 39 00 75 f7 31 d2 44 0f b6 04 16 44 88 04 11 48 83 c2 01 45 84 c0 75 ee c3 0f 1f 80 00 00 00 00 <80> 3f 00 74 10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 31 RSP: 0018:ffffb5b900013e48 EFLAGS: 00010246 RAX: 0000000000000018 RBX: ffff8fc1c49ede00 RCX: 0000000000000000 RDX: 0000000000000020 RSI: ffff8fc1c02d601c RDI: 00007fffaae8ef60 RBP: 00007fffaae8ef60 R08: 0005034f4ddb8ea4 R09: 0000000000000000 R10: ffff8fc1c02d601c R11: 0000000000000000 R12: ffff8fc1c8a6e380 R13: 0000000000000000 R14: ffff8fc1c02d6010 R15: ffff8fc1c00453c0 FS: 00007fa86123db40(0000) GS:ffff8fc2ffd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fffaae8ef60 CR3: 0000000102880001 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: filter_pred_pchar+0x18/0x40 filter_match_preds+0x31/0x70 ftrace_syscall_enter+0x27a/0x2c0 syscall_trace_enter.constprop.0+0x1aa/0x1d0 do_syscall_64+0x16/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fa861d88664 The above happened because the kernel tried to access user space directly and triggered a "supervisor read access in kernel mode" fault. Worse yet, the memory could not even be loaded yet, and a SEGFAULT could happen as well. This could be true for kernel space accessing as well. To be even more robust, test both kernel and user space strings. If the string fails to read, then simply have the filter fail. Note, TASK_SIZE is used to determine if the pointer is user or kernel space and the appropriate strncpy_from_kernel/user_nofault() function is used to copy the memory. For some architectures, the compare to TASK_SIZE may always pick user space or kernel space. If it gets it wrong, the only thing is that the filter will fail to match. In the future, this needs to be fixed to have the event denote which should be used. But failing a filter is much better than panicing the machine, and that can be solved later. Link: https://lore.kernel.org/all/20220107044951.22080-1-kernelfans@gmail.com/ Link: https://lkml.kernel.org/r/20220110115532.536088fd@gandalf.local.home Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tom Zanussi Reported-by: Pingfan Liu Tested-by: Pingfan Liu Fixes: 87a342f5db69d ("tracing/filters: Support filtering for char * strings") Signed-off-by: Steven Rostedt --- Documentation/trace/events.rst | 10 +++++ kernel/trace/trace_events_filter.c | 66 ++++++++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 8ddb9b09451c8c..45e66a60a816a9 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -230,6 +230,16 @@ Currently the caret ('^') for an error always appears at the beginning of the filter string; the error message should still be useful though even without more accurate position info. +5.2.1 Filter limitations +------------------------ + +If a filter is placed on a string pointer ``(char *)`` that does not point +to a string on the ring buffer, but instead points to kernel or user space +memory, then, for safety reasons, at most 1024 bytes of the content is +copied onto a temporary buffer to do the compare. If the copy of the memory +faults (the pointer points to memory that should not be accessed), then the +string compare will be treated as not matching. + 5.3 Clearing filters -------------------- diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 996920ed1812d3..2e9ef64e9ee9ff 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -5,6 +5,7 @@ * Copyright (C) 2009 Tom Zanussi */ +#include #include #include #include @@ -654,6 +655,47 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); +/* user space strings temp buffer */ +#define USTRING_BUF_SIZE 1024 + +struct ustring_buffer { + char buffer[USTRING_BUF_SIZE]; +}; + +static __percpu struct ustring_buffer *ustring_per_cpu; + +static __always_inline char *test_string(char *str) +{ + struct ustring_buffer *ubuf; + char __user *ustr; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* + * We use TASK_SIZE to denote user or kernel space, but this will + * not work for all architectures. If it picks the wrong one, it may + * just fail the filter (but will not bug). + * + * TODO: Have a way to properly denote which one this is for. + */ + if (likely((unsigned long)str >= TASK_SIZE)) { + /* For safety, do not trust the string pointer */ + if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + return NULL; + } else { + /* user space address? */ + ustr = (char __user *)str; + if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + return NULL; + } + return kstr; +} + /* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event) { @@ -671,10 +713,16 @@ static int filter_pred_string(struct filter_pred *pred, void *event) static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); + char *str; int cmp, match; - int len = strlen(*addr) + 1; /* including tailing '\0' */ + int len; - cmp = pred->regex.match(*addr, &pred->regex, len); + str = test_string(*addr); + if (!str) + return 0; + + len = strlen(str) + 1; /* including tailing '\0' */ + cmp = pred->regex.match(str, &pred->regex, len); match = cmp ^ pred->not; @@ -1348,8 +1396,17 @@ static int parse_pred(const char *str, void *data, pred->fn = filter_pred_strloc; } else if (field->filter_type == FILTER_RDYN_STRING) pred->fn = filter_pred_strrelloc; - else + else { + + if (!ustring_per_cpu) { + /* Once allocated, keep it around for good */ + ustring_per_cpu = alloc_percpu(struct ustring_buffer); + if (!ustring_per_cpu) + goto err_mem; + } + pred->fn = filter_pred_pchar; + } /* go past the last quote */ i++; @@ -1415,6 +1472,9 @@ static int parse_pred(const char *str, void *data, err_free: kfree(pred); return -EINVAL; +err_mem: + kfree(pred); + return -ENOMEM; } enum { From dfea08a2116fe327f79d8f4d4b2cf6e0c88be11f Mon Sep 17 00:00:00 2001 From: Xiangyang Zhang Date: Fri, 7 Jan 2022 23:02:42 +0800 Subject: [PATCH 33/50] tracing/kprobes: 'nmissed' not showed correctly for kretprobe The 'nmissed' column of the 'kprobe_profile' file for kretprobe is not showed correctly, kretprobe can be skipped by two reasons, shortage of kretprobe_instance which is counted by tk->rp.nmissed, and kprobe itself is missed by some reason, so to show the sum. Link: https://lkml.kernel.org/r/20220107150242.5019-1-xyz.sun.ok@gmail.com Cc: stable@vger.kernel.org Fixes: 4a846b443b4e ("tracing/kprobes: Cleanup kprobe tracer code") Acked-by: Masami Hiramatsu Signed-off-by: Xiangyang Zhang Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f8c26ee72de3d0..3d85323278eda3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1170,15 +1170,18 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; struct trace_kprobe *tk; + unsigned long nmissed; if (!is_trace_kprobe(ev)) return 0; tk = to_trace_kprobe(ev); + nmissed = trace_kprobe_is_return(tk) ? + tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; seq_printf(m, " %-44s %15lu %15lu\n", trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), - tk->rp.kp.nmissed); + nmissed); return 0; } From 6e1b4bd1911d814077d77e2ac6529d74ee68c0f6 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Mon, 10 Jan 2022 00:22:32 +0800 Subject: [PATCH 34/50] tracing: Remove duplicate warnings when calling trace_create_file() Since the same warning message is already printed in the trace_create_file() function, there is no need to print it again. Link: https://lkml.kernel.org/r/20220109162232.361747-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 92be9cb1d7d4bc..3147614c1812a1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3461,10 +3461,8 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); - if (!entry) { - pr_warn("Could not create tracefs 'enable' entry\n"); + if (!entry) return -ENOMEM; - } /* There are not as crucial, just warn if they are not created */ @@ -3480,17 +3478,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); /* ring buffer internal formats */ - entry = trace_create_file("header_page", TRACE_MODE_READ, d_events, + trace_create_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); - if (!entry) - pr_warn("Could not create tracefs 'header_page' entry\n"); - entry = trace_create_file("header_event", TRACE_MODE_READ, d_events, + trace_create_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); - if (!entry) - pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; From 0878355b51f5f26632e652c848a8e174bb02d22d Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sun, 9 Jan 2022 18:34:59 +0300 Subject: [PATCH 35/50] tracing/osnoise: Properly unhook events if start_per_cpu_kthreads() fails If start_per_cpu_kthreads() called from osnoise_workload_start() returns error, event hooks are left in broken state: unhook_irq_events() called but unhook_thread_events() and unhook_softirq_events() not called, and trace_osnoise_callback_enabled flag not cleared. On the next tracer enable, hooks get not installed due to trace_osnoise_callback_enabled flag. And on the further tracer disable an attempt to remove non-installed hooks happened, hitting a WARN_ON_ONCE() in tracepoint_remove_func(). Fix the error path by adding the missing part of cleanup. While at this, introduce osnoise_unhook_events() to avoid code duplication between this error path and normal tracer disable. Link: https://lkml.kernel.org/r/20220109153459.3701773-1-nikita.yushchenko@virtuozzo.com Cc: stable@vger.kernel.org Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Acked-by: Daniel Bristot de Oliveira Signed-off-by: Nikita Yushchenko Signed-off-by: Steven Rostedt --- kernel/trace/trace_osnoise.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4719a848bf1740..36d9d5be08b4ab 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2122,6 +2122,13 @@ static int osnoise_hook_events(void) return -EINVAL; } +static void osnoise_unhook_events(void) +{ + unhook_thread_events(); + unhook_softirq_events(); + unhook_irq_events(); +} + /* * osnoise_workload_start - start the workload and hook to events */ @@ -2154,7 +2161,14 @@ static int osnoise_workload_start(void) retval = start_per_cpu_kthreads(); if (retval) { - unhook_irq_events(); + trace_osnoise_callback_enabled = false; + /* + * Make sure that ftrace_nmi_enter/exit() see + * trace_osnoise_callback_enabled as false before continuing. + */ + barrier(); + + osnoise_unhook_events(); return retval; } @@ -2185,9 +2199,7 @@ static void osnoise_workload_stop(void) stop_per_cpu_kthreads(); - unhook_irq_events(); - unhook_softirq_events(); - unhook_thread_events(); + osnoise_unhook_events(); } static void osnoise_tracer_start(struct trace_array *tr) From 79ce8f43ac5ac16d6d63729e858da42b0460d87b Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:20 +0100 Subject: [PATCH 36/50] rtla: Real-Time Linux Analysis tool The rtla is a meta-tool that includes a set of commands that aims to analyze the real-time properties of Linux. But instead of testing Linux as a black box, rtla leverages kernel tracing capabilities to provide precise information about the properties and root causes of unexpected results. rtla --help works and provide information about the available options. This is just the "main" and the Makefile, no function yet. Link: https://lkml.kernel.org/r/bf9118ed43a09e6c054c9a491cbe7411ad1acd89.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- tools/tracing/rtla/Makefile | 76 +++++++++++++++++++++++++++++++++++ tools/tracing/rtla/README.txt | 36 +++++++++++++++++ tools/tracing/rtla/src/rtla.c | 72 +++++++++++++++++++++++++++++++++ 3 files changed, 184 insertions(+) create mode 100644 tools/tracing/rtla/Makefile create mode 100644 tools/tracing/rtla/README.txt create mode 100644 tools/tracing/rtla/src/rtla.c diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile new file mode 100644 index 00000000000000..56b1bdd60c9666 --- /dev/null +++ b/tools/tracing/rtla/Makefile @@ -0,0 +1,76 @@ +NAME := rtla +VERSION := 0.5 + +# From libtracefs: +# Makefiles suck: This macro sets a default value of $(2) for the +# variable named by $(1), unless the variable has been set by +# environment or command line. This is necessary for CC and AR +# because make sets default values, so the simpler ?= approach +# won't work as expected. +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix. +$(call allow-override,CC,$(CROSS_COMPILE)gcc) +$(call allow-override,AR,$(CROSS_COMPILE)ar) +$(call allow-override,STRIP,$(CROSS_COMPILE)strip) +$(call allow-override,PKG_CONFIG,pkg-config) +$(call allow-override,LD_SO_CONF_PATH,/etc/ld.so.conf.d/) +$(call allow-override,LDCONFIG,ldconfig) + +INSTALL = install +FOPTS := -flto=auto -ffat-lto-objects -fexceptions -fstack-protector-strong \ + -fasynchronous-unwind-tables -fstack-clash-protection +WOPTS := -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -Wno-maybe-uninitialized + +TRACEFS_HEADERS := $$($(PKG_CONFIG) --cflags libtracefs) + +CFLAGS := -O -g -DVERSION=\"$(VERSION)\" $(FOPTS) $(MOPTS) $(WOPTS) $(TRACEFS_HEADERS) +LDFLAGS := -ggdb +LIBS := $$($(PKG_CONFIG) --libs libtracefs) -lprocps + +SRC := $(wildcard src/*.c) +HDR := $(wildcard src/*.h) +OBJ := $(SRC:.c=.o) +DIRS := src +FILES := Makefile README.txt +CEXT := bz2 +TARBALL := $(NAME)-$(VERSION).tar.$(CEXT) +TAROPTS := -cvjf $(TARBALL) +BINDIR := /usr/bin +DATADIR := /usr/share +DOCDIR := $(DATADIR)/doc +MANDIR := $(DATADIR)/man +LICDIR := $(DATADIR)/licenses + +.PHONY: all +all: rtla + +rtla: $(OBJ) + $(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS) + +static: $(OBJ) + $(CC) -o rtla-static $(LDFLAGS) --static $(OBJ) $(LIBS) -lpthread -ldl + +.PHONY: install +install: + $(INSTALL) -d -m 755 $(DESTDIR)$(BINDIR) + $(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR) + $(STRIP) $(DESTDIR)$(BINDIR)/rtla + +.PHONY: clean tarball +clean: + @test ! -f rtla || rm rtla + @test ! -f rtla-static || rm rtla-static + @test ! -f src/rtla.o || rm src/rtla.o + @test ! -f $(TARBALL) || rm -f $(TARBALL) + @rm -rf *~ $(OBJ) *.tar.$(CEXT) + +tarball: clean + rm -rf $(NAME)-$(VERSION) && mkdir $(NAME)-$(VERSION) + cp -r $(DIRS) $(FILES) $(NAME)-$(VERSION) + tar $(TAROPTS) --exclude='*~' $(NAME)-$(VERSION) + rm -rf $(NAME)-$(VERSION) diff --git a/tools/tracing/rtla/README.txt b/tools/tracing/rtla/README.txt new file mode 100644 index 00000000000000..6c88446f7e74aa --- /dev/null +++ b/tools/tracing/rtla/README.txt @@ -0,0 +1,36 @@ +RTLA: Real-Time Linux Analysis tools + +The rtla is a meta-tool that includes a set of commands that +aims to analyze the real-time properties of Linux. But, instead of +testing Linux as a black box, rtla leverages kernel tracing +capabilities to provide precise information about the properties +and root causes of unexpected results. + +Installing RTLA + +RTLA depends on some libraries and tools. More precisely, it depends on the +following libraries: + + - libtracefs + - libtraceevent + - procps + +It also depends on python3-docutils to compile man pages. + +For development, we suggest the following steps for compiling rtla: + + $ git clone git://git.kernel.org/pub/scm/libs/libtrace/libtraceevent.git + $ cd libtraceevent/ + $ make + $ sudo make install + $ cd .. + $ git clone git://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git + $ cd libtracefs/ + $ make + $ sudo make install + $ cd .. + $ cd $rtla_src + $ make + $ sudo make install + +For further information, please refer to the rtla man page. diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c new file mode 100644 index 00000000000000..5ae2664ed47df7 --- /dev/null +++ b/tools/tracing/rtla/src/rtla.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira + */ + +#include +#include +#include +#include + +/* + * rtla_usage - print rtla usage + */ +static void rtla_usage(void) +{ + int i; + + static const char *msg[] = { + "", + "rtla version " VERSION, + "", + " usage: rtla COMMAND ...", + "", + " commands:", + "", + NULL, + }; + + for (i = 0; msg[i]; i++) + fprintf(stderr, "%s\n", msg[i]); + exit(1); +} + +/* + * run_command - try to run a rtla tool command + * + * It returns 0 if it fails. The tool's main will generally not + * return as they should call exit(). + */ +int run_command(int argc, char **argv, int start_position) +{ + return 0; +} + +int main(int argc, char *argv[]) +{ + int retval; + + /* is it an alias? */ + retval = run_command(argc, argv, 0); + if (retval) + exit(0); + + if (argc < 2) + goto usage; + + if (strcmp(argv[1], "-h") == 0) { + rtla_usage(); + exit(0); + } else if (strcmp(argv[1], "--help") == 0) { + rtla_usage(); + exit(0); + } + + retval = run_command(argc, argv, 1); + if (retval) + exit(0); + +usage: + rtla_usage(); + exit(1); +} From b1696371d865fa28bd20adb13d1820ef031f2ee2 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:21 +0100 Subject: [PATCH 37/50] rtla: Helper functions for rtla This is a set of utils and tracer helper functions. They are used by rtla mostly to parse config, display data and some trace operations that are not part of libtracefs (because they are only useful it for this case). Link: https://lkml.kernel.org/r/a94c128aba9e6e66d502b7094f2e8c7ac95b12e5.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- tools/tracing/rtla/src/trace.c | 192 +++++++++++++++ tools/tracing/rtla/src/trace.h | 27 ++ tools/tracing/rtla/src/utils.c | 433 +++++++++++++++++++++++++++++++++ tools/tracing/rtla/src/utils.h | 56 +++++ 4 files changed, 708 insertions(+) create mode 100644 tools/tracing/rtla/src/trace.c create mode 100644 tools/tracing/rtla/src/trace.h create mode 100644 tools/tracing/rtla/src/utils.c create mode 100644 tools/tracing/rtla/src/utils.h diff --git a/tools/tracing/rtla/src/trace.c b/tools/tracing/rtla/src/trace.c new file mode 100644 index 00000000000000..107a0c6387f7f1 --- /dev/null +++ b/tools/tracing/rtla/src/trace.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "utils.h" + +/* + * enable_tracer_by_name - enable a tracer on the given instance + */ +int enable_tracer_by_name(struct tracefs_instance *inst, const char *tracer_name) +{ + enum tracefs_tracers tracer; + int retval; + + tracer = TRACEFS_TRACER_CUSTOM; + + debug_msg("enabling %s tracer\n", tracer_name); + + retval = tracefs_tracer_set(inst, tracer, tracer_name); + if (retval < 0) { + if (errno == ENODEV) + err_msg("tracer %s not found!\n", tracer_name); + + err_msg("failed to enable the tracer %s\n", tracer_name); + return -1; + } + + return 0; +} + +/* + * disable_tracer - set nop tracer to the insta + */ +void disable_tracer(struct tracefs_instance *inst) +{ + enum tracefs_tracers t = TRACEFS_TRACER_NOP; + int retval; + + retval = tracefs_tracer_set(inst, t); + if (retval < 0) + err_msg("oops, error disabling tracer\n"); +} + +/* + * create_instance - create a trace instance with *instance_name + */ +struct tracefs_instance *create_instance(char *instance_name) +{ + return tracefs_instance_create(instance_name); +} + +/* + * destroy_instance - remove a trace instance and free the data + */ +void destroy_instance(struct tracefs_instance *inst) +{ + tracefs_instance_destroy(inst); + tracefs_instance_free(inst); +} + +/* + * save_trace_to_file - save the trace output of the instance to the file + */ +int save_trace_to_file(struct tracefs_instance *inst, const char *filename) +{ + const char *file = "trace"; + mode_t mode = 0644; + char buffer[4096]; + int out_fd, in_fd; + int retval = -1; + + in_fd = tracefs_instance_file_open(inst, file, O_RDONLY); + if (in_fd < 0) { + err_msg("Failed to open trace file\n"); + return -1; + } + + out_fd = creat(filename, mode); + if (out_fd < 0) { + err_msg("Failed to create output file %s\n", filename); + goto out_close_in; + } + + do { + retval = read(in_fd, buffer, sizeof(buffer)); + if (retval <= 0) + goto out_close; + + retval = write(out_fd, buffer, retval); + if (retval < 0) + goto out_close; + } while (retval > 0); + + retval = 0; +out_close: + close(out_fd); +out_close_in: + close(in_fd); + return retval; +} + +/* + * collect_registered_events - call the existing callback function for the event + * + * If an event has a registered callback function, call it. + * Otherwise, ignore the event. + */ +int +collect_registered_events(struct tep_event *event, struct tep_record *record, + int cpu, void *context) +{ + struct trace_instance *trace = context; + struct trace_seq *s = trace->seq; + + if (!event->handler) + return 0; + + event->handler(s, record, event, context); + + return 0; +} + +/* + * trace_instance_destroy - destroy and free a rtla trace instance + */ +void trace_instance_destroy(struct trace_instance *trace) +{ + if (trace->inst) { + disable_tracer(trace->inst); + destroy_instance(trace->inst); + } + + if (trace->seq) + free(trace->seq); + + if (trace->tep) + tep_free(trace->tep); +} + +/* + * trace_instance_init - create an rtla trace instance + * + * It is more than the tracefs instance, as it contains other + * things required for the tracing, such as the local events and + * a seq file. + * + * Note that the trace instance is returned disabled. This allows + * the tool to apply some other configs, like setting priority + * to the kernel threads, before starting generating trace entries. + */ +int trace_instance_init(struct trace_instance *trace, char *tool_name) +{ + trace->seq = calloc(1, sizeof(*trace->seq)); + if (!trace->seq) + goto out_err; + + trace_seq_init(trace->seq); + + trace->inst = create_instance(tool_name); + if (!trace->inst) + goto out_err; + + trace->tep = tracefs_local_events(NULL); + if (!trace->tep) + goto out_err; + + /* + * Let the main enable the record after setting some other + * things such as the priority of the tracer's threads. + */ + tracefs_trace_off(trace->inst); + + return 0; + +out_err: + trace_instance_destroy(trace); + return 1; +} + +/* + * trace_instance_start - start tracing a given rtla instance + */ +int trace_instance_start(struct trace_instance *trace) +{ + return tracefs_trace_on(trace->inst); +} diff --git a/tools/tracing/rtla/src/trace.h b/tools/tracing/rtla/src/trace.h new file mode 100644 index 00000000000000..0ea1df0ad9a77c --- /dev/null +++ b/tools/tracing/rtla/src/trace.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +struct trace_instance { + struct tracefs_instance *inst; + struct tep_handle *tep; + struct trace_seq *seq; +}; + +int trace_instance_init(struct trace_instance *trace, char *tool_name); +int trace_instance_start(struct trace_instance *trace); +void trace_instance_destroy(struct trace_instance *trace); + +struct trace_seq *get_trace_seq(void); +int enable_tracer_by_name(struct tracefs_instance *inst, const char *tracer_name); +void disable_tracer(struct tracefs_instance *inst); + +int enable_osnoise(struct trace_instance *trace); +int enable_timerlat(struct trace_instance *trace); + +struct tracefs_instance *create_instance(char *instance_name); +void destroy_instance(struct tracefs_instance *inst); + +int save_trace_to_file(struct tracefs_instance *inst, const char *filename); +int collect_registered_events(struct tep_event *tep, struct tep_record *record, + int cpu, void *context); diff --git a/tools/tracing/rtla/src/utils.c b/tools/tracing/rtla/src/utils.c new file mode 100644 index 00000000000000..1c9f0eea616699 --- /dev/null +++ b/tools/tracing/rtla/src/utils.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +#define MAX_MSG_LENGTH 1024 +int config_debug; + +/* + * err_msg - print an error message to the stderr + */ +void err_msg(const char *fmt, ...) +{ + char message[MAX_MSG_LENGTH]; + va_list ap; + + va_start(ap, fmt); + vsnprintf(message, sizeof(message), fmt, ap); + va_end(ap); + + fprintf(stderr, "%s", message); +} + +/* + * debug_msg - print a debug message to stderr if debug is set + */ +void debug_msg(const char *fmt, ...) +{ + char message[MAX_MSG_LENGTH]; + va_list ap; + + if (!config_debug) + return; + + va_start(ap, fmt); + vsnprintf(message, sizeof(message), fmt, ap); + va_end(ap); + + fprintf(stderr, "%s", message); +} + +/* + * get_llong_from_str - get a long long int from a string + */ +long long get_llong_from_str(char *start) +{ + long long value; + char *end; + + errno = 0; + value = strtoll(start, &end, 10); + if (errno || start == end) + return -1; + + return value; +} + +/* + * get_duration - fill output with a human readable duration since start_time + */ +void get_duration(time_t start_time, char *output, int output_size) +{ + time_t now = time(NULL); + struct tm *tm_info; + time_t duration; + + duration = difftime(now, start_time); + tm_info = localtime(&duration); + + snprintf(output, output_size, "%3d %02d:%02d:%02d", + tm_info->tm_yday, + tm_info->tm_hour - 1, + tm_info->tm_min, + tm_info->tm_sec); +} + +/* + * parse_cpu_list - parse a cpu_list filling a char vector with cpus set + * + * Receives a cpu list, like 1-3,5 (cpus 1, 2, 3, 5), and then set the char + * in the monitored_cpus. + * + * XXX: convert to a bitmask. + */ +int parse_cpu_list(char *cpu_list, char **monitored_cpus) +{ + char *mon_cpus; + const char *p; + int end_cpu; + int nr_cpus; + int cpu; + int i; + + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + + mon_cpus = malloc(nr_cpus * sizeof(char)); + memset(mon_cpus, 0, (nr_cpus * sizeof(char))); + + for (p = cpu_list; *p; ) { + cpu = atoi(p); + if (cpu < 0 || (!cpu && *p != '0') || cpu >= nr_cpus) + goto err; + + while (isdigit(*p)) + p++; + if (*p == '-') { + p++; + end_cpu = atoi(p); + if (end_cpu < cpu || (!end_cpu && *p != '0') || end_cpu >= nr_cpus) + goto err; + while (isdigit(*p)) + p++; + } else + end_cpu = cpu; + + if (cpu == end_cpu) { + debug_msg("cpu_list: adding cpu %d\n", cpu); + mon_cpus[cpu] = 1; + } else { + for (i = cpu; i <= end_cpu; i++) { + debug_msg("cpu_list: adding cpu %d\n", i); + mon_cpus[i] = 1; + } + } + + if (*p == ',') + p++; + } + + *monitored_cpus = mon_cpus; + + return 0; + +err: + debug_msg("Error parsing the cpu list %s", cpu_list); + return 1; +} + +/* + * parse_duration - parse duration with s/m/h/d suffix converting it to seconds + */ +long parse_seconds_duration(char *val) +{ + char *end; + long t; + + t = strtol(val, &end, 10); + + if (end) { + switch (*end) { + case 's': + case 'S': + break; + case 'm': + case 'M': + t *= 60; + break; + case 'h': + case 'H': + t *= 60 * 60; + break; + + case 'd': + case 'D': + t *= 24 * 60 * 60; + break; + } + } + + return t; +} + +/* + * parse_ns_duration - parse duration with ns/us/ms/s converting it to nanoseconds + */ +long parse_ns_duration(char *val) +{ + char *end; + long t; + + t = strtol(val, &end, 10); + + if (end) { + if (!strncmp(end, "ns", 2)) { + return t; + } else if (!strncmp(end, "us", 2)) { + t *= 1000; + return t; + } else if (!strncmp(end, "ms", 2)) { + t *= 1000 * 1000; + return t; + } else if (!strncmp(end, "s", 1)) { + t *= 1000 * 1000 * 1000; + return t; + } + return -1; + } + + return t; +} + +/* + * This is a set of helper functions to use SCHED_DEADLINE. + */ +#ifdef __x86_64__ +# define __NR_sched_setattr 314 +# define __NR_sched_getattr 315 +#elif __i386__ +# define __NR_sched_setattr 351 +# define __NR_sched_getattr 352 +#elif __arm__ +# define __NR_sched_setattr 380 +# define __NR_sched_getattr 381 +#elif __aarch64__ +# define __NR_sched_setattr 274 +# define __NR_sched_getattr 275 +#elif __powerpc__ +# define __NR_sched_setattr 355 +# define __NR_sched_getattr 356 +#elif __s390x__ +# define __NR_sched_setattr 345 +# define __NR_sched_getattr 346 +#endif + +#define SCHED_DEADLINE 6 + +static inline int sched_setattr(pid_t pid, const struct sched_attr *attr, + unsigned int flags) { + return syscall(__NR_sched_setattr, pid, attr, flags); +} + +static inline int sched_getattr(pid_t pid, struct sched_attr *attr, + unsigned int size, unsigned int flags) +{ + return syscall(__NR_sched_getattr, pid, attr, size, flags); +} + +int __set_sched_attr(int pid, struct sched_attr *attr) +{ + int flags = 0; + int retval; + + retval = sched_setattr(pid, attr, flags); + if (retval < 0) { + err_msg("boost_with_deadline failed to boost pid %d: %s\n", + pid, strerror(errno)); + return 1; + } + + return 0; +} +/* + * set_comm_sched_attr - set sched params to threads starting with char *comm + * + * This function uses procps to list the currently running threads and then + * set the sched_attr *attr to the threads that start with char *comm. It is + * mainly used to set the priority to the kernel threads created by the + * tracers. + */ +int set_comm_sched_attr(const char *comm, struct sched_attr *attr) +{ + int flags = PROC_FILLCOM | PROC_FILLSTAT; + PROCTAB *ptp; + proc_t task; + int retval; + + ptp = openproc(flags); + if (!ptp) { + err_msg("error openproc()\n"); + return -ENOENT; + } + + memset(&task, 0, sizeof(task)); + + while (readproc(ptp, &task)) { + retval = strncmp(comm, task.cmd, strlen(comm)); + if (retval) + continue; + retval = __set_sched_attr(task.tid, attr); + if (retval) + goto out_err; + } + + closeproc(ptp); + return 0; + +out_err: + closeproc(ptp); + return 1; +} + +#define INVALID_VAL (~0L) +static long get_long_ns_after_colon(char *start) +{ + long val = INVALID_VAL; + + /* find the ":" */ + start = strstr(start, ":"); + if (!start) + return -1; + + /* skip ":" */ + start++; + val = parse_ns_duration(start); + + return val; +} + +static long get_long_after_colon(char *start) +{ + long val = INVALID_VAL; + + /* find the ":" */ + start = strstr(start, ":"); + if (!start) + return -1; + + /* skip ":" */ + start++; + val = get_llong_from_str(start); + + return val; +} + +/* + * parse priority in the format: + * SCHED_OTHER: + * o: + * O: + * SCHED_RR: + * r: + * R: + * SCHED_FIFO: + * f: + * F: + * SCHED_DEADLINE: + * d:runtime:period + * D:runtime:period + */ +int parse_prio(char *arg, struct sched_attr *sched_param) +{ + long prio; + long runtime; + long period; + + memset(sched_param, 0, sizeof(*sched_param)); + sched_param->size = sizeof(*sched_param); + + switch (arg[0]) { + case 'd': + case 'D': + /* d:runtime:period */ + if (strlen(arg) < 4) + return -1; + + runtime = get_long_ns_after_colon(arg); + if (runtime == INVALID_VAL) + return -1; + + period = get_long_ns_after_colon(&arg[2]); + if (period == INVALID_VAL) + return -1; + + if (runtime > period) + return -1; + + sched_param->sched_policy = SCHED_DEADLINE; + sched_param->sched_runtime = runtime; + sched_param->sched_deadline = period; + sched_param->sched_period = period; + break; + case 'f': + case 'F': + /* f:prio */ + prio = get_long_after_colon(arg); + if (prio == INVALID_VAL) + return -1; + + if (prio < sched_get_priority_min(SCHED_FIFO)) + return -1; + if (prio > sched_get_priority_max(SCHED_FIFO)) + return -1; + + sched_param->sched_policy = SCHED_FIFO; + sched_param->sched_priority = prio; + break; + case 'r': + case 'R': + /* r:prio */ + prio = get_long_after_colon(arg); + if (prio == INVALID_VAL) + return -1; + + if (prio < sched_get_priority_min(SCHED_RR)) + return -1; + if (prio > sched_get_priority_max(SCHED_RR)) + return -1; + + sched_param->sched_policy = SCHED_RR; + sched_param->sched_priority = prio; + break; + case 'o': + case 'O': + /* o:prio */ + prio = get_long_after_colon(arg); + if (prio == INVALID_VAL) + return -1; + + if (prio < sched_get_priority_min(SCHED_OTHER)) + return -1; + if (prio > sched_get_priority_max(SCHED_OTHER)) + return -1; + + sched_param->sched_policy = SCHED_OTHER; + sched_param->sched_priority = prio; + break; + default: + return -1; + } + return 0; +} diff --git a/tools/tracing/rtla/src/utils.h b/tools/tracing/rtla/src/utils.h new file mode 100644 index 00000000000000..9aa962319ca235 --- /dev/null +++ b/tools/tracing/rtla/src/utils.h @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +/* + * '18446744073709551615\0' + */ +#define BUFF_U64_STR_SIZE 24 + +#define container_of(ptr, type, member)({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)) ; }) + +extern int config_debug; +void debug_msg(const char *fmt, ...); +void err_msg(const char *fmt, ...); + +long parse_seconds_duration(char *val); +void get_duration(time_t start_time, char *output, int output_size); + +int parse_cpu_list(char *cpu_list, char **monitored_cpus); +long long get_llong_from_str(char *start); + +static inline void +update_min(unsigned long long *a, unsigned long long *b) +{ + if (*a > *b) + *a = *b; +} + +static inline void +update_max(unsigned long long *a, unsigned long long *b) +{ + if (*a < *b) + *a = *b; +} + +static inline void +update_sum(unsigned long long *a, unsigned long long *b) +{ + *a += *b; +} + +struct sched_attr { + uint32_t size; + uint32_t sched_policy; + uint64_t sched_flags; + int32_t sched_nice; + uint32_t sched_priority; + uint64_t sched_runtime; + uint64_t sched_deadline; + uint64_t sched_period; +}; + +int parse_prio(char *arg, struct sched_attr *sched_param); +int set_comm_sched_attr(const char *comm, struct sched_attr *attr); From 0605bf009f18d558758df9edef59698bf1389b28 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:22 +0100 Subject: [PATCH 38/50] rtla: Add osnoise tool The osnoise tool is the interface for the osnoise tracer. The osnoise tool will have multiple "modes" with different outputs. At this point, no mode is included. The osnoise.c includes the osnoise_context abstraction. It serves to read-save-change-restore the default values from tracing/osnoise/ directory. When the context is deleted, the default values are restored. It also includes some other helper functions for managing osnoise tracer sessions. With these bits and pieces in place, we can start adding some functionality to rtla. Link: https://lkml.kernel.org/r/2d44c21ff561f503b4c7b1813892761818118460.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- tools/tracing/rtla/Makefile | 2 + tools/tracing/rtla/src/osnoise.c | 855 +++++++++++++++++++++++++++++++ tools/tracing/rtla/src/osnoise.h | 89 ++++ tools/tracing/rtla/src/rtla.c | 10 + 4 files changed, 956 insertions(+) create mode 100644 tools/tracing/rtla/src/osnoise.c create mode 100644 tools/tracing/rtla/src/osnoise.h diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 56b1bdd60c9666..d40b442fcd3066 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -60,6 +60,8 @@ install: $(INSTALL) -d -m 755 $(DESTDIR)$(BINDIR) $(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR) $(STRIP) $(DESTDIR)$(BINDIR)/rtla + @test ! -f $(DESTDIR)$(BINDIR)/osnoise || rm $(DESTDIR)$(BINDIR)/osnoise + ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/osnoise .PHONY: clean tarball clean: diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c new file mode 100644 index 00000000000000..e5254044e87832 --- /dev/null +++ b/tools/tracing/rtla/src/osnoise.c @@ -0,0 +1,855 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "osnoise.h" +#include "utils.h" + +/* + * osnoise_get_cpus - return the original "osnoise/cpus" content + * + * It also saves the value to be restored. + */ +char *osnoise_get_cpus(struct osnoise_context *context) +{ + if (context->curr_cpus) + return context->curr_cpus; + + if (context->orig_cpus) + return context->orig_cpus; + + context->orig_cpus = tracefs_instance_file_read(NULL, "osnoise/cpus", NULL); + + /* + * The error value (NULL) is the same for tracefs_instance_file_read() + * and this functions, so: + */ + return context->orig_cpus; +} + +/* + * osnoise_set_cpus - configure osnoise to run on *cpus + * + * "osnoise/cpus" file is used to set the cpus in which osnoise/timerlat + * will run. This function opens this file, saves the current value, + * and set the cpus passed as argument. + */ +int osnoise_set_cpus(struct osnoise_context *context, char *cpus) +{ + char *orig_cpus = osnoise_get_cpus(context); + char buffer[1024]; + int retval; + + if (!orig_cpus) + return -1; + + context->curr_cpus = strdup(cpus); + if (!context->curr_cpus) + return -1; + + snprintf(buffer, 1024, "%s\n", cpus); + + debug_msg("setting cpus to %s from %s", cpus, context->orig_cpus); + + retval = tracefs_instance_file_write(NULL, "osnoise/cpus", buffer); + if (retval < 0) { + free(context->curr_cpus); + context->curr_cpus = NULL; + return -1; + } + + return 0; +} + +/* + * osnoise_restore_cpus - restore the original "osnoise/cpus" + * + * osnoise_set_cpus() saves the original data for the "osnoise/cpus" + * file. This function restore the original config it was previously + * modified. + */ +void osnoise_restore_cpus(struct osnoise_context *context) +{ + int retval; + + if (!context->orig_cpus) + return; + + if (!context->curr_cpus) + return; + + /* nothing to do? */ + if (!strcmp(context->orig_cpus, context->curr_cpus)) + goto out_done; + + debug_msg("restoring cpus to %s", context->orig_cpus); + + retval = tracefs_instance_file_write(NULL, "osnoise/cpus", context->orig_cpus); + if (retval < 0) + err_msg("could not restore original osnoise cpus\n"); + +out_done: + free(context->curr_cpus); + context->curr_cpus = NULL; +} + +/* + * osnoise_put_cpus - restore cpus config and cleanup data + */ +void osnoise_put_cpus(struct osnoise_context *context) +{ + osnoise_restore_cpus(context); + + if (!context->orig_cpus) + return; + + free(context->orig_cpus); + context->orig_cpus = NULL; +} + +/* + * osnoise_read_ll_config - read a long long value from a config + * + * returns -1 on error. + */ +static long long osnoise_read_ll_config(char *rel_path) +{ + long long retval; + char *buffer; + + buffer = tracefs_instance_file_read(NULL, rel_path, NULL); + if (!buffer) + return -1; + + /* get_llong_from_str returns -1 on error */ + retval = get_llong_from_str(buffer); + + debug_msg("reading %s returned %lld\n", rel_path, retval); + + free(buffer); + + return retval; +} + +/* + * osnoise_write_ll_config - write a long long value to a config in rel_path + * + * returns -1 on error. + */ +static long long osnoise_write_ll_config(char *rel_path, long long value) +{ + char buffer[BUFF_U64_STR_SIZE]; + long long retval; + + snprintf(buffer, sizeof(buffer), "%lld\n", value); + + debug_msg("setting %s to %lld\n", rel_path, value); + + retval = tracefs_instance_file_write(NULL, rel_path, buffer); + return retval; +} + +/* + * osnoise_get_runtime - return the original "osnoise/runtime_us" value + * + * It also saves the value to be restored. + */ +unsigned long long osnoise_get_runtime(struct osnoise_context *context) +{ + long long runtime_us; + + if (context->runtime_us != OSNOISE_TIME_INIT_VAL) + return context->runtime_us; + + if (context->orig_runtime_us != OSNOISE_TIME_INIT_VAL) + return context->orig_runtime_us; + + runtime_us = osnoise_read_ll_config("osnoise/runtime_us"); + if (runtime_us < 0) + goto out_err; + + context->orig_runtime_us = runtime_us; + return runtime_us; + +out_err: + return OSNOISE_TIME_INIT_VAL; +} + +/* + * osnoise_get_period - return the original "osnoise/period_us" value + * + * It also saves the value to be restored. + */ +unsigned long long osnoise_get_period(struct osnoise_context *context) +{ + long long period_us; + + if (context->period_us != OSNOISE_TIME_INIT_VAL) + return context->period_us; + + if (context->orig_period_us != OSNOISE_TIME_INIT_VAL) + return context->orig_period_us; + + period_us = osnoise_read_ll_config("osnoise/period_us"); + if (period_us < 0) + goto out_err; + + context->orig_period_us = period_us; + return period_us; + +out_err: + return OSNOISE_TIME_INIT_VAL; +} + +static int __osnoise_write_runtime(struct osnoise_context *context, + unsigned long long runtime) +{ + int retval; + + if (context->orig_runtime_us == OSNOISE_TIME_INIT_VAL) + return -1; + + retval = osnoise_write_ll_config("osnoise/runtime_us", runtime); + if (retval < 0) + return -1; + + context->runtime_us = runtime; + return 0; +} + +static int __osnoise_write_period(struct osnoise_context *context, + unsigned long long period) +{ + int retval; + + if (context->orig_period_us == OSNOISE_TIME_INIT_VAL) + return -1; + + retval = osnoise_write_ll_config("osnoise/period_us", period); + if (retval < 0) + return -1; + + context->period_us = period; + return 0; +} + +/* + * osnoise_set_runtime_period - set osnoise runtime and period + * + * Osnoise's runtime and period are related as runtime <= period. + * Thus, this function saves the original values, and then tries + * to set the runtime and period if they are != 0. + */ +int osnoise_set_runtime_period(struct osnoise_context *context, + unsigned long long runtime, + unsigned long long period) +{ + unsigned long long curr_runtime_us; + unsigned long long curr_period_us; + int retval; + + if (!period && !runtime) + return 0; + + curr_runtime_us = osnoise_get_runtime(context); + curr_period_us = osnoise_get_period(context); + + /* error getting any value? */ + if (curr_period_us == OSNOISE_TIME_INIT_VAL || curr_runtime_us == OSNOISE_TIME_INIT_VAL) + return -1; + + if (!period) { + if (runtime > curr_period_us) + return -1; + return __osnoise_write_runtime(context, runtime); + } else if (!runtime) { + if (period < curr_runtime_us) + return -1; + return __osnoise_write_period(context, period); + } + + if (runtime > curr_period_us) { + retval = __osnoise_write_period(context, period); + if (retval) + return -1; + retval = __osnoise_write_runtime(context, runtime); + if (retval) + return -1; + } else { + retval = __osnoise_write_runtime(context, runtime); + if (retval) + return -1; + retval = __osnoise_write_period(context, period); + if (retval) + return -1; + } + + return 0; +} + +/* + * osnoise_restore_runtime_period - restore the original runtime and period + */ +void osnoise_restore_runtime_period(struct osnoise_context *context) +{ + unsigned long long orig_runtime = context->orig_runtime_us; + unsigned long long orig_period = context->orig_period_us; + unsigned long long curr_runtime = context->runtime_us; + unsigned long long curr_period = context->period_us; + int retval; + + if ((orig_runtime == OSNOISE_TIME_INIT_VAL) && (orig_period == OSNOISE_TIME_INIT_VAL)) + return; + + if ((orig_period == curr_period) && (orig_runtime == curr_runtime)) + goto out_done; + + retval = osnoise_set_runtime_period(context, orig_runtime, orig_period); + if (retval) + err_msg("Could not restore original osnoise runtime/period\n"); + +out_done: + context->runtime_us = OSNOISE_TIME_INIT_VAL; + context->period_us = OSNOISE_TIME_INIT_VAL; +} + +/* + * osnoise_put_runtime_period - restore original values and cleanup data + */ +void osnoise_put_runtime_period(struct osnoise_context *context) +{ + osnoise_restore_runtime_period(context); + + if (context->orig_runtime_us != OSNOISE_TIME_INIT_VAL) + context->orig_runtime_us = OSNOISE_TIME_INIT_VAL; + + if (context->orig_period_us != OSNOISE_TIME_INIT_VAL) + context->orig_period_us = OSNOISE_TIME_INIT_VAL; +} + +/* + * osnoise_get_timerlat_period_us - read and save the original "timerlat_period_us" + */ +static long long +osnoise_get_timerlat_period_us(struct osnoise_context *context) +{ + long long timerlat_period_us; + + if (context->timerlat_period_us != OSNOISE_TIME_INIT_VAL) + return context->timerlat_period_us; + + if (context->orig_timerlat_period_us != OSNOISE_TIME_INIT_VAL) + return context->orig_timerlat_period_us; + + timerlat_period_us = osnoise_read_ll_config("osnoise/timerlat_period_us"); + if (timerlat_period_us < 0) + goto out_err; + + context->orig_timerlat_period_us = timerlat_period_us; + return timerlat_period_us; + +out_err: + return OSNOISE_TIME_INIT_VAL; +} + +/* + * osnoise_set_timerlat_period_us - set "timerlat_period_us" + */ +int osnoise_set_timerlat_period_us(struct osnoise_context *context, long long timerlat_period_us) +{ + long long curr_timerlat_period_us = osnoise_get_timerlat_period_us(context); + int retval; + + if (curr_timerlat_period_us == OSNOISE_TIME_INIT_VAL) + return -1; + + retval = osnoise_write_ll_config("osnoise/timerlat_period_us", timerlat_period_us); + if (retval < 0) + return -1; + + context->timerlat_period_us = timerlat_period_us; + + return 0; +} + +/* + * osnoise_restore_timerlat_period_us - restore "timerlat_period_us" + */ +void osnoise_restore_timerlat_period_us(struct osnoise_context *context) +{ + int retval; + + if (context->orig_timerlat_period_us == OSNOISE_TIME_INIT_VAL) + return; + + if (context->orig_timerlat_period_us == context->timerlat_period_us) + goto out_done; + + retval = osnoise_write_ll_config("osnoise/timerlat_period_us", context->orig_timerlat_period_us); + if (retval < 0) + err_msg("Could not restore original osnoise timerlat_period_us\n"); + +out_done: + context->timerlat_period_us = OSNOISE_TIME_INIT_VAL; +} + +/* + * osnoise_put_timerlat_period_us - restore original values and cleanup data + */ +void osnoise_put_timerlat_period_us(struct osnoise_context *context) +{ + osnoise_restore_timerlat_period_us(context); + + if (context->orig_timerlat_period_us == OSNOISE_TIME_INIT_VAL) + return; + + context->orig_timerlat_period_us = OSNOISE_TIME_INIT_VAL; +} + +/* + * osnoise_get_stop_us - read and save the original "stop_tracing_us" + */ +static long long +osnoise_get_stop_us(struct osnoise_context *context) +{ + long long stop_us; + + if (context->stop_us != OSNOISE_OPTION_INIT_VAL) + return context->stop_us; + + if (context->orig_stop_us != OSNOISE_OPTION_INIT_VAL) + return context->orig_stop_us; + + stop_us = osnoise_read_ll_config("osnoise/stop_tracing_us"); + if (stop_us < 0) + goto out_err; + + context->orig_stop_us = stop_us; + return stop_us; + +out_err: + return OSNOISE_OPTION_INIT_VAL; +} + +/* + * osnoise_set_stop_us - set "stop_tracing_us" + */ +int osnoise_set_stop_us(struct osnoise_context *context, long long stop_us) +{ + long long curr_stop_us = osnoise_get_stop_us(context); + int retval; + + if (curr_stop_us == OSNOISE_OPTION_INIT_VAL) + return -1; + + retval = osnoise_write_ll_config("osnoise/stop_tracing_us", stop_us); + if (retval < 0) + return -1; + + context->stop_us = stop_us; + + return 0; +} + +/* + * osnoise_restore_stop_us - restore the original "stop_tracing_us" + */ +void osnoise_restore_stop_us(struct osnoise_context *context) +{ + int retval; + + if (context->orig_stop_us == OSNOISE_OPTION_INIT_VAL) + return; + + if (context->orig_stop_us == context->stop_us) + goto out_done; + + retval = osnoise_write_ll_config("osnoise/stop_tracing_us", context->orig_stop_us); + if (retval < 0) + err_msg("Could not restore original osnoise stop_us\n"); + +out_done: + context->stop_us = OSNOISE_OPTION_INIT_VAL; +} + +/* + * osnoise_put_stop_us - restore original values and cleanup data + */ +void osnoise_put_stop_us(struct osnoise_context *context) +{ + osnoise_restore_stop_us(context); + + if (context->orig_stop_us == OSNOISE_OPTION_INIT_VAL) + return; + + context->orig_stop_us = OSNOISE_OPTION_INIT_VAL; +} + +/* + * osnoise_get_stop_total_us - read and save the original "stop_tracing_total_us" + */ +static long long +osnoise_get_stop_total_us(struct osnoise_context *context) +{ + long long stop_total_us; + + if (context->stop_total_us != OSNOISE_OPTION_INIT_VAL) + return context->stop_total_us; + + if (context->orig_stop_total_us != OSNOISE_OPTION_INIT_VAL) + return context->orig_stop_total_us; + + stop_total_us = osnoise_read_ll_config("osnoise/stop_tracing_total_us"); + if (stop_total_us < 0) + goto out_err; + + context->orig_stop_total_us = stop_total_us; + return stop_total_us; + +out_err: + return OSNOISE_OPTION_INIT_VAL; +} + +/* + * osnoise_set_stop_total_us - set "stop_tracing_total_us" + */ +int osnoise_set_stop_total_us(struct osnoise_context *context, long long stop_total_us) +{ + long long curr_stop_total_us = osnoise_get_stop_total_us(context); + int retval; + + if (curr_stop_total_us == OSNOISE_OPTION_INIT_VAL) + return -1; + + retval = osnoise_write_ll_config("osnoise/stop_tracing_total_us", stop_total_us); + if (retval < 0) + return -1; + + context->stop_total_us = stop_total_us; + + return 0; +} + +/* + * osnoise_restore_stop_total_us - restore the original "stop_tracing_total_us" + */ +void osnoise_restore_stop_total_us(struct osnoise_context *context) +{ + int retval; + + if (context->orig_stop_total_us == OSNOISE_OPTION_INIT_VAL) + return; + + if (context->orig_stop_total_us == context->stop_total_us) + goto out_done; + + retval = osnoise_write_ll_config("osnoise/stop_tracing_total_us", + context->orig_stop_total_us); + if (retval < 0) + err_msg("Could not restore original osnoise stop_total_us\n"); + +out_done: + context->stop_total_us = OSNOISE_OPTION_INIT_VAL; +} + +/* + * osnoise_put_stop_total_us - restore original values and cleanup data + */ +void osnoise_put_stop_total_us(struct osnoise_context *context) +{ + osnoise_restore_stop_total_us(context); + + if (context->orig_stop_total_us == OSNOISE_OPTION_INIT_VAL) + return; + + context->orig_stop_total_us = OSNOISE_OPTION_INIT_VAL; +} + +/* + * osnoise_get_print_stack - read and save the original "print_stack" + */ +static long long +osnoise_get_print_stack(struct osnoise_context *context) +{ + long long print_stack; + + if (context->print_stack != OSNOISE_OPTION_INIT_VAL) + return context->print_stack; + + if (context->orig_print_stack != OSNOISE_OPTION_INIT_VAL) + return context->orig_print_stack; + + print_stack = osnoise_read_ll_config("osnoise/print_stack"); + if (print_stack < 0) + goto out_err; + + context->orig_print_stack = print_stack; + return print_stack; + +out_err: + return OSNOISE_OPTION_INIT_VAL; +} + +/* + * osnoise_set_print_stack - set "print_stack" + */ +int osnoise_set_print_stack(struct osnoise_context *context, long long print_stack) +{ + long long curr_print_stack = osnoise_get_print_stack(context); + int retval; + + if (curr_print_stack == OSNOISE_OPTION_INIT_VAL) + return -1; + + retval = osnoise_write_ll_config("osnoise/print_stack", print_stack); + if (retval < 0) + return -1; + + context->print_stack = print_stack; + + return 0; +} + +/* + * osnoise_restore_print_stack - restore the original "print_stack" + */ +void osnoise_restore_print_stack(struct osnoise_context *context) +{ + int retval; + + if (context->orig_print_stack == OSNOISE_OPTION_INIT_VAL) + return; + + if (context->orig_print_stack == context->print_stack) + goto out_done; + + retval = osnoise_write_ll_config("osnoise/print_stack", context->orig_print_stack); + if (retval < 0) + err_msg("Could not restore original osnoise print_stack\n"); + +out_done: + context->print_stack = OSNOISE_OPTION_INIT_VAL; +} + +/* + * osnoise_put_print_stack - restore original values and cleanup data + */ +void osnoise_put_print_stack(struct osnoise_context *context) +{ + osnoise_restore_print_stack(context); + + if (context->orig_print_stack == OSNOISE_OPTION_INIT_VAL) + return; + + context->orig_print_stack = OSNOISE_OPTION_INIT_VAL; +} + +/* + * enable_osnoise - enable osnoise tracer in the trace_instance + */ +int enable_osnoise(struct trace_instance *trace) +{ + return enable_tracer_by_name(trace->inst, "osnoise"); +} + +/* + * enable_timerlat - enable timerlat tracer in the trace_instance + */ +int enable_timerlat(struct trace_instance *trace) +{ + return enable_tracer_by_name(trace->inst, "timerlat"); +} + +enum { + FLAG_CONTEXT_NEWLY_CREATED = (1 << 0), + FLAG_CONTEXT_DELETED = (1 << 1), +}; + +/* + * osnoise_get_context - increase the usage of a context and return it + */ +int osnoise_get_context(struct osnoise_context *context) +{ + int ret; + + if (context->flags & FLAG_CONTEXT_DELETED) { + ret = -1; + } else { + context->ref++; + ret = 0; + } + + return ret; +} + +/* + * osnoise_context_alloc - alloc an osnoise_context + * + * The osnoise context contains the information of the "osnoise/" configs. + * It is used to set and restore the config. + */ +struct osnoise_context *osnoise_context_alloc(void) +{ + struct osnoise_context *context; + + context = calloc(1, sizeof(*context)); + if (!context) + return NULL; + + context->orig_stop_us = OSNOISE_OPTION_INIT_VAL; + context->stop_us = OSNOISE_OPTION_INIT_VAL; + + context->orig_stop_total_us = OSNOISE_OPTION_INIT_VAL; + context->stop_total_us = OSNOISE_OPTION_INIT_VAL; + + context->orig_print_stack = OSNOISE_OPTION_INIT_VAL; + context->print_stack = OSNOISE_OPTION_INIT_VAL; + + osnoise_get_context(context); + + return context; +} + +/* + * osnoise_put_context - put the osnoise_put_context + * + * If there is no other user for the context, the original data + * is restored. + */ +void osnoise_put_context(struct osnoise_context *context) +{ + if (--context->ref < 1) + context->flags |= FLAG_CONTEXT_DELETED; + + if (!(context->flags & FLAG_CONTEXT_DELETED)) + return; + + osnoise_put_cpus(context); + osnoise_put_runtime_period(context); + osnoise_put_stop_us(context); + osnoise_put_stop_total_us(context); + osnoise_put_timerlat_period_us(context); + osnoise_put_print_stack(context); + + free(context); +} + +/* + * osnoise_destroy_tool - disable trace, restore configs and free data + */ +void osnoise_destroy_tool(struct osnoise_tool *top) +{ + trace_instance_destroy(&top->trace); + + if (top->context) + osnoise_put_context(top->context); + + free(top); +} + +/* + * osnoise_init_tool - init an osnoise tool + * + * It allocs data, create a context to store data and + * creates a new trace instance for the tool. + */ +struct osnoise_tool *osnoise_init_tool(char *tool_name) +{ + struct osnoise_tool *top; + int retval; + + top = calloc(1, sizeof(*top)); + if (!top) + return NULL; + + top->context = osnoise_context_alloc(); + if (!top->context) + goto out_err; + + retval = trace_instance_init(&top->trace, tool_name); + if (retval) + goto out_err; + + return top; +out_err: + osnoise_destroy_tool(top); + return NULL; +} + +/* + * osnoise_init_trace_tool - init a tracer instance to trace osnoise events + */ +struct osnoise_tool *osnoise_init_trace_tool(char *tracer) +{ + struct osnoise_tool *trace; + int retval; + + trace = osnoise_init_tool("osnoise_trace"); + if (!trace) + return NULL; + + retval = tracefs_event_enable(trace->trace.inst, "osnoise", NULL); + if (retval < 0 && !errno) { + err_msg("Could not find osnoise events\n"); + goto out_err; + } + + retval = enable_tracer_by_name(trace->trace.inst, tracer); + if (retval) { + err_msg("Could not enable osnoiser tracer for tracing\n"); + goto out_err; + } + + return trace; +out_err: + osnoise_destroy_tool(trace); + return NULL; +} + +static void osnoise_usage(void) +{ + int i; + + static const char *msg[] = { + "", + "osnoise version " VERSION, + "", + " usage: [rtla] osnoise [MODE] ...", + "", + " modes:", + " top - prints the summary from osnoise tracer", + "", + "if no MODE is given, the top mode is called, passing the arguments", + NULL, + }; + + for (i = 0; msg[i]; i++) + fprintf(stderr, "%s\n", msg[i]); + exit(1); +} + +int osnoise_main(int argc, char *argv[]) +{ + if (argc == 0) + goto usage; + + if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) { + osnoise_usage(); + exit(0); + } + +usage: + osnoise_usage(); + exit(1); +} diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h new file mode 100644 index 00000000000000..1184fc66dd5063 --- /dev/null +++ b/tools/tracing/rtla/src/osnoise.h @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "trace.h" + +/* + * osnoise_context - read, store, write, restore osnoise configs. + */ +struct osnoise_context { + int flags; + int ref; + + char *curr_cpus; + char *orig_cpus; + + /* 0 as init value */ + unsigned long long orig_runtime_us; + unsigned long long runtime_us; + + /* 0 as init value */ + unsigned long long orig_period_us; + unsigned long long period_us; + + /* 0 as init value */ + long long orig_timerlat_period_us; + long long timerlat_period_us; + + /* -1 as init value because 0 is disabled */ + long long orig_stop_us; + long long stop_us; + + /* -1 as init value because 0 is disabled */ + long long orig_stop_total_us; + long long stop_total_us; + + /* -1 as init value because 0 is disabled */ + long long orig_print_stack; + long long print_stack; +}; + +/* + * *_INIT_VALs are also invalid values, they are used to + * communicate errors. + */ +#define OSNOISE_OPTION_INIT_VAL (-1) +#define OSNOISE_TIME_INIT_VAL (0) + +struct osnoise_context *osnoise_context_alloc(void); +int osnoise_get_context(struct osnoise_context *context); +void osnoise_put_context(struct osnoise_context *context); + +int osnoise_set_cpus(struct osnoise_context *context, char *cpus); +void osnoise_restore_cpus(struct osnoise_context *context); + +int osnoise_set_runtime_period(struct osnoise_context *context, + unsigned long long runtime, + unsigned long long period); +void osnoise_restore_runtime_period(struct osnoise_context *context); + +int osnoise_set_stop_us(struct osnoise_context *context, + long long stop_us); +void osnoise_restore_stop_us(struct osnoise_context *context); + +int osnoise_set_stop_total_us(struct osnoise_context *context, + long long stop_total_us); +void osnoise_restore_stop_total_us(struct osnoise_context *context); + +int osnoise_set_timerlat_period_us(struct osnoise_context *context, + long long timerlat_period_us); +void osnoise_restore_timerlat_period_us(struct osnoise_context *context); + +void osnoise_restore_print_stack(struct osnoise_context *context); +int osnoise_set_print_stack(struct osnoise_context *context, + long long print_stack); + +/* + * osnoise_tool - osnoise based tool definition. + */ +struct osnoise_tool { + struct trace_instance trace; + struct osnoise_context *context; + void *data; + void *params; + time_t start_time; +}; + +void osnoise_destroy_tool(struct osnoise_tool *top); +struct osnoise_tool *osnoise_init_tool(char *tool_name); +struct osnoise_tool *osnoise_init_trace_tool(char *tracer); + +int osnoise_main(int argc, char **argv); diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c index 5ae2664ed47df7..669b9750b3b303 100644 --- a/tools/tracing/rtla/src/rtla.c +++ b/tools/tracing/rtla/src/rtla.c @@ -8,6 +8,8 @@ #include #include +#include "osnoise.h" + /* * rtla_usage - print rtla usage */ @@ -22,6 +24,7 @@ static void rtla_usage(void) " usage: rtla COMMAND ...", "", " commands:", + " osnoise - gives information about the operating system noise (osnoise)", "", NULL, }; @@ -39,7 +42,14 @@ static void rtla_usage(void) */ int run_command(int argc, char **argv, int start_position) { + if (strcmp(argv[start_position], "osnoise") == 0) { + osnoise_main(argc-start_position, &argv[start_position]); + goto ran; + } + return 0; +ran: + return 1; } int main(int argc, char *argv[]) From 1eceb2fc2ca549a170d7ee7cd1fde2daeda646ac Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:23 +0100 Subject: [PATCH 39/50] rtla/osnoise: Add osnoise top mode The rtla osnoise tool is an interface for the osnoise tracer. The osnoise tracer dispatches a kernel thread per-cpu. These threads read the time in a loop while with preemption, softirqs and IRQs enabled, thus allowing all the sources of osnoise during its execution. The osnoise threads take note of the entry and exit point of any source of interferences, increasing a per-cpu interference counter. The osnoise tracer also saves an interference counter for each source of interference. The rtla osnoise top mode displays information about the periodic summary from the osnoise tracer. One example of rtla osnoise top output is: [root@alien ~]# rtla osnoise top -c 0-3 -d 1m -q -r 900000 -P F:1 Operating System Noise duration: 0 00:01:00 | time is in us CPU Period Runtime Noise % CPU Aval Max Noise Max Single HW NMI IRQ Softirq Thread 0 #58 52200000 1031 99.99802 91 60 0 0 52285 0 101 1 #59 53100000 5 99.99999 5 5 0 9 53122 0 18 2 #59 53100000 7 99.99998 7 7 0 8 53115 0 18 3 #59 53100000 8274 99.98441 277 23 0 9 53778 0 660 "rtla osnoise top --help" works and provide information about the available options. Link: https://lkml.kernel.org/r/0d796993abf587ae5a170bb8415c49368d4999e1.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- tools/tracing/rtla/src/osnoise.c | 16 + tools/tracing/rtla/src/osnoise.h | 1 + tools/tracing/rtla/src/osnoise_top.c | 579 +++++++++++++++++++++++++++ 3 files changed, 596 insertions(+) create mode 100644 tools/tracing/rtla/src/osnoise_top.c diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c index e5254044e87832..fbf4f9bfda023e 100644 --- a/tools/tracing/rtla/src/osnoise.c +++ b/tools/tracing/rtla/src/osnoise.c @@ -844,9 +844,25 @@ int osnoise_main(int argc, char *argv[]) if (argc == 0) goto usage; + /* + * if osnoise was called without any argument, run the + * default cmdline. + */ + if (argc == 1) { + osnoise_top_main(argc, argv); + exit(0); + } + if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) { osnoise_usage(); exit(0); + } else if (strncmp(argv[1], "-", 1) == 0) { + /* the user skipped the tool, call the default one */ + osnoise_top_main(argc, argv); + exit(0); + } else if (strcmp(argv[1], "top") == 0) { + osnoise_top_main(argc-1, &argv[1]); + exit(0); } usage: diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h index 1184fc66dd5063..a4b114ed7d7197 100644 --- a/tools/tracing/rtla/src/osnoise.h +++ b/tools/tracing/rtla/src/osnoise.h @@ -86,4 +86,5 @@ void osnoise_destroy_tool(struct osnoise_tool *top); struct osnoise_tool *osnoise_init_tool(char *tool_name); struct osnoise_tool *osnoise_init_trace_tool(char *tracer); +int osnoise_top_main(int argc, char **argv); int osnoise_main(int argc, char **argv); diff --git a/tools/tracing/rtla/src/osnoise_top.c b/tools/tracing/rtla/src/osnoise_top.c new file mode 100644 index 00000000000000..332b2ac205fc43 --- /dev/null +++ b/tools/tracing/rtla/src/osnoise_top.c @@ -0,0 +1,579 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "osnoise.h" +#include "utils.h" + +/* + * osnoise top parameters + */ +struct osnoise_top_params { + char *cpus; + char *monitored_cpus; + char *trace_output; + unsigned long long runtime; + unsigned long long period; + long long stop_us; + long long stop_total_us; + int sleep_time; + int duration; + int quiet; + int set_sched; + struct sched_attr sched_param; +}; + +struct osnoise_top_cpu { + unsigned long long sum_runtime; + unsigned long long sum_noise; + unsigned long long max_noise; + unsigned long long max_sample; + + unsigned long long hw_count; + unsigned long long nmi_count; + unsigned long long irq_count; + unsigned long long softirq_count; + unsigned long long thread_count; + + int sum_cycles; +}; + +struct osnoise_top_data { + struct osnoise_top_cpu *cpu_data; + int nr_cpus; +}; + +/* + * osnoise_free_top - free runtime data + */ +static void +osnoise_free_top(struct osnoise_top_data *data) +{ + free(data->cpu_data); + free(data); +} + +/* + * osnoise_alloc_histogram - alloc runtime data + */ +static struct osnoise_top_data *osnoise_alloc_top(int nr_cpus) +{ + struct osnoise_top_data *data; + + data = calloc(1, sizeof(*data)); + if (!data) + return NULL; + + data->nr_cpus = nr_cpus; + + /* one set of histograms per CPU */ + data->cpu_data = calloc(1, sizeof(*data->cpu_data) * nr_cpus); + if (!data->cpu_data) + goto cleanup; + + return data; + +cleanup: + osnoise_free_top(data); + return NULL; +} + +/* + * osnoise_top_handler - this is the handler for osnoise tracer events + */ +static int +osnoise_top_handler(struct trace_seq *s, struct tep_record *record, + struct tep_event *event, void *context) +{ + struct trace_instance *trace = context; + struct osnoise_tool *tool; + unsigned long long val; + struct osnoise_top_cpu *cpu_data; + struct osnoise_top_data *data; + int cpu = record->cpu; + + tool = container_of(trace, struct osnoise_tool, trace); + + data = tool->data; + cpu_data = &data->cpu_data[cpu]; + + cpu_data->sum_cycles++; + + tep_get_field_val(s, event, "runtime", record, &val, 1); + update_sum(&cpu_data->sum_runtime, &val); + + tep_get_field_val(s, event, "noise", record, &val, 1); + update_max(&cpu_data->max_noise, &val); + update_sum(&cpu_data->sum_noise, &val); + + tep_get_field_val(s, event, "max_sample", record, &val, 1); + update_max(&cpu_data->max_sample, &val); + + tep_get_field_val(s, event, "hw_count", record, &val, 1); + update_sum(&cpu_data->hw_count, &val); + + tep_get_field_val(s, event, "nmi_count", record, &val, 1); + update_sum(&cpu_data->nmi_count, &val); + + tep_get_field_val(s, event, "irq_count", record, &val, 1); + update_sum(&cpu_data->irq_count, &val); + + tep_get_field_val(s, event, "softirq_count", record, &val, 1); + update_sum(&cpu_data->softirq_count, &val); + + tep_get_field_val(s, event, "thread_count", record, &val, 1); + update_sum(&cpu_data->thread_count, &val); + + return 0; +} + +/* + * osnoise_top_header - print the header of the tool output + */ +static void osnoise_top_header(struct osnoise_tool *top) +{ + struct trace_seq *s = top->trace.seq; + char duration[26]; + + get_duration(top->start_time, duration, sizeof(duration)); + + trace_seq_printf(s, "\033[2;37;40m"); + trace_seq_printf(s, " Operating System Noise"); + trace_seq_printf(s, " "); + trace_seq_printf(s, " "); + trace_seq_printf(s, "\033[0;0;0m"); + trace_seq_printf(s, "\n"); + + trace_seq_printf(s, "duration: %9s | time is in us\n", duration); + + trace_seq_printf(s, "\033[2;30;47m"); + trace_seq_printf(s, "CPU Period Runtime "); + trace_seq_printf(s, " Noise "); + trace_seq_printf(s, " %% CPU Aval "); + trace_seq_printf(s, " Max Noise Max Single "); + trace_seq_printf(s, " HW NMI IRQ Softirq Thread"); + trace_seq_printf(s, "\033[0;0;0m"); + trace_seq_printf(s, "\n"); +} + +/* + * clear_terminal - clears the output terminal + */ +static void clear_terminal(struct trace_seq *seq) +{ + if (!config_debug) + trace_seq_printf(seq, "\033c"); +} + +/* + * osnoise_top_print - prints the output of a given CPU + */ +static void osnoise_top_print(struct osnoise_tool *tool, int cpu) +{ + struct trace_seq *s = tool->trace.seq; + struct osnoise_top_cpu *cpu_data; + struct osnoise_top_data *data; + int percentage; + int decimal; + + data = tool->data; + cpu_data = &data->cpu_data[cpu]; + + if (!cpu_data->sum_runtime) + return; + + percentage = ((cpu_data->sum_runtime - cpu_data->sum_noise) * 10000000) + / cpu_data->sum_runtime; + decimal = percentage % 100000; + percentage = percentage / 100000; + + trace_seq_printf(s, "%3d #%-6d %12llu ", cpu, cpu_data->sum_cycles, cpu_data->sum_runtime); + trace_seq_printf(s, "%12llu ", cpu_data->sum_noise); + trace_seq_printf(s, " %3d.%05d", percentage, decimal); + trace_seq_printf(s, "%12llu %12llu", cpu_data->max_noise, cpu_data->max_sample); + + trace_seq_printf(s, "%12llu ", cpu_data->hw_count); + trace_seq_printf(s, "%12llu ", cpu_data->nmi_count); + trace_seq_printf(s, "%12llu ", cpu_data->irq_count); + trace_seq_printf(s, "%12llu ", cpu_data->softirq_count); + trace_seq_printf(s, "%12llu\n", cpu_data->thread_count); +} + +/* + * osnoise_print_stats - print data for all cpus + */ +static void +osnoise_print_stats(struct osnoise_top_params *params, struct osnoise_tool *top) +{ + struct trace_instance *trace = &top->trace; + static int nr_cpus = -1; + int i; + + if (nr_cpus == -1) + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + + if (!params->quiet) + clear_terminal(trace->seq); + + osnoise_top_header(top); + + for (i = 0; i < nr_cpus; i++) { + if (params->cpus && !params->monitored_cpus[i]) + continue; + osnoise_top_print(top, i); + } + + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); +} + +/* + * osnoise_top_usage - prints osnoise top usage message + */ +void osnoise_top_usage(char *usage) +{ + int i; + + static const char * const msg[] = { + " usage: rtla osnoise [top] [-h] [-q] [-D] [-d s] [-p us] [-r us] [-s us] [-S us] [-t[=file]] \\", + " [-c cpu-list] [-P priority]", + "", + " -h/--help: print this menu", + " -p/--period us: osnoise period in us", + " -r/--runtime us: osnoise runtime in us", + " -s/--stop us: stop trace if a single sample is higher than the argument in us", + " -S/--stop-total us: stop trace if the total sample is higher than the argument in us", + " -c/--cpus cpu-list: list of cpus to run osnoise threads", + " -d/--duration time[s|m|h|d]: duration of the session", + " -D/--debug: print debug info", + " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -q/--quiet print only a summary at the end", + " -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters", + " o:prio - use SCHED_OTHER with prio", + " r:prio - use SCHED_RR with prio", + " f:prio - use SCHED_FIFO with prio", + " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", + " in nanoseconds", + NULL, + }; + + if (usage) + fprintf(stderr, "%s\n", usage); + + fprintf(stderr, "rtla osnoise top: a per-cpu summary of the OS noise (version %s)\n", + VERSION); + + for (i = 0; msg[i]; i++) + fprintf(stderr, "%s\n", msg[i]); + exit(1); +} + +/* + * osnoise_top_parse_args - allocs, parse and fill the cmd line parameters + */ +struct osnoise_top_params *osnoise_top_parse_args(int argc, char **argv) +{ + struct osnoise_top_params *params; + int retval; + int c; + + params = calloc(1, sizeof(*params)); + if (!params) + exit(1); + + while (1) { + static struct option long_options[] = { + {"cpus", required_argument, 0, 'c'}, + {"debug", no_argument, 0, 'D'}, + {"duration", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"period", required_argument, 0, 'p'}, + {"priority", required_argument, 0, 'P'}, + {"quiet", no_argument, 0, 'q'}, + {"runtime", required_argument, 0, 'r'}, + {"stop", required_argument, 0, 's'}, + {"stop-total", required_argument, 0, 'S'}, + {"trace", optional_argument, 0, 't'}, + {0, 0, 0, 0} + }; + + /* getopt_long stores the option index here. */ + int option_index = 0; + + c = getopt_long(argc, argv, "c:d:Dhp:P:qr:s:S:t::", + long_options, &option_index); + + /* Detect the end of the options. */ + if (c == -1) + break; + + switch (c) { + case 'c': + retval = parse_cpu_list(optarg, ¶ms->monitored_cpus); + if (retval) + osnoise_top_usage("\nInvalid -c cpu list\n"); + params->cpus = optarg; + break; + case 'D': + config_debug = 1; + break; + case 'd': + params->duration = parse_seconds_duration(optarg); + if (!params->duration) + osnoise_top_usage("Invalid -D duration\n"); + break; + case 'h': + case '?': + osnoise_top_usage(NULL); + break; + case 'p': + params->period = get_llong_from_str(optarg); + if (params->period > 10000000) + osnoise_top_usage("Period longer than 10 s\n"); + break; + case 'P': + retval = parse_prio(optarg, ¶ms->sched_param); + if (retval == -1) + osnoise_top_usage("Invalid -P priority"); + params->set_sched = 1; + break; + case 'q': + params->quiet = 1; + break; + case 'r': + params->runtime = get_llong_from_str(optarg); + if (params->runtime < 100) + osnoise_top_usage("Runtime shorter than 100 us\n"); + break; + case 's': + params->stop_us = get_llong_from_str(optarg); + break; + case 'S': + params->stop_total_us = get_llong_from_str(optarg); + break; + case 't': + if (optarg) + /* skip = */ + params->trace_output = &optarg[1]; + else + params->trace_output = "osnoise_trace.txt"; + break; + default: + osnoise_top_usage("Invalid option"); + } + } + + if (geteuid()) { + err_msg("osnoise needs root permission\n"); + exit(EXIT_FAILURE); + } + + return params; +} + +/* + * osnoise_top_apply_config - apply the top configs to the initialized tool + */ +static int +osnoise_top_apply_config(struct osnoise_tool *tool, struct osnoise_top_params *params) +{ + int retval; + + if (!params->sleep_time) + params->sleep_time = 1; + + if (params->cpus) { + retval = osnoise_set_cpus(tool->context, params->cpus); + if (retval) { + err_msg("Failed to apply CPUs config\n"); + goto out_err; + } + } + + if (params->runtime || params->period) { + retval = osnoise_set_runtime_period(tool->context, + params->runtime, + params->period); + if (retval) { + err_msg("Failed to set runtime and/or period\n"); + goto out_err; + } + } + + if (params->stop_us) { + retval = osnoise_set_stop_us(tool->context, params->stop_us); + if (retval) { + err_msg("Failed to set stop us\n"); + goto out_err; + } + } + + if (params->stop_total_us) { + retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us); + if (retval) { + err_msg("Failed to set stop total us\n"); + goto out_err; + } + } + + return 0; + +out_err: + return -1; +} + +/* + * osnoise_init_top - initialize a osnoise top tool with parameters + */ +struct osnoise_tool *osnoise_init_top(struct osnoise_top_params *params) +{ + struct osnoise_tool *tool; + int nr_cpus; + + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + + tool = osnoise_init_tool("osnoise_top"); + if (!tool) + return NULL; + + tool->data = osnoise_alloc_top(nr_cpus); + if (!tool->data) + goto out_err; + + tool->params = params; + + tep_register_event_handler(tool->trace.tep, -1, "ftrace", "osnoise", + osnoise_top_handler, NULL); + + return tool; + +out_err: + osnoise_free_top(tool->data); + osnoise_destroy_tool(tool); + return NULL; +} + +static int stop_tracing; +static void stop_top(int sig) +{ + stop_tracing = 1; +} + +/* + * osnoise_top_set_signals - handles the signal to stop the tool + */ +static void osnoise_top_set_signals(struct osnoise_top_params *params) +{ + signal(SIGINT, stop_top); + if (params->duration) { + signal(SIGALRM, stop_top); + alarm(params->duration); + } +} + +int osnoise_top_main(int argc, char **argv) +{ + struct osnoise_top_params *params; + struct trace_instance *trace; + struct osnoise_tool *record; + struct osnoise_tool *tool; + int return_value = 1; + int retval; + + params = osnoise_top_parse_args(argc, argv); + if (!params) + exit(1); + + tool = osnoise_init_top(params); + if (!tool) { + err_msg("Could not init osnoise top\n"); + goto out_exit; + } + + retval = osnoise_top_apply_config(tool, params); + if (retval) { + err_msg("Could not apply config\n"); + goto out_top; + } + + trace = &tool->trace; + + retval = enable_osnoise(trace); + if (retval) { + err_msg("Failed to enable osnoise tracer\n"); + goto out_top; + } + + if (params->set_sched) { + retval = set_comm_sched_attr("osnoise/", ¶ms->sched_param); + if (retval) { + err_msg("Failed to set sched parameters\n"); + goto out_top; + } + } + + trace_instance_start(trace); + + if (params->trace_output) { + record = osnoise_init_trace_tool("osnoise"); + if (!record) { + err_msg("Failed to enable the trace instance\n"); + goto out_top; + } + trace_instance_start(&record->trace); + } + + tool->start_time = time(NULL); + osnoise_top_set_signals(params); + + do { + sleep(params->sleep_time); + + retval = tracefs_iterate_raw_events(trace->tep, + trace->inst, + NULL, + 0, + collect_registered_events, + trace); + if (retval < 0) { + err_msg("Error iterating on events\n"); + goto out_top; + } + + if (!params->quiet) + osnoise_print_stats(params, tool); + + if (!tracefs_trace_is_on(trace->inst)) + break; + + } while (!stop_tracing); + + osnoise_print_stats(params, tool); + + return_value = 0; + + if (!tracefs_trace_is_on(trace->inst)) { + printf("osnoise hit stop tracing\n"); + if (params->trace_output) { + printf(" Saving trace to %s\n", params->trace_output); + save_trace_to_file(record->trace.inst, params->trace_output); + } + } + +out_top: + osnoise_free_top(tool->data); + osnoise_destroy_tool(tool); + if (params->trace_output) + osnoise_destroy_tool(record); +out_exit: + exit(return_value); +} From 829a6c0b5698c7d51aa2bcd16963fcd9de295476 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:24 +0100 Subject: [PATCH 40/50] rtla/osnoise: Add the hist mode The rtla osnoise hist tool collects all osnoise:sample_threshold occurrence in a histogram, displaying the results in a user-friendly way. The tool also allows many configurations of the osnoise tracer and the collection of the tracer output. Here is one example of the rtla osnoise hist tool output: ---------- %< ---------- [root@f34 ~]# rtla osnoise hist --bucket-size 10 --entries 100 -c 0-8 -d 1M -r 9000 -P F:1 # RTLA osnoise histogram # Time unit is microseconds (us) # Duration: 0 00:01:00 Index CPU-000 CPU-001 CPU-002 CPU-003 CPU-004 CPU-005 CPU-006 CPU-007 CPU-008 0 430 434 352 455 440 463 467 436 484 10 88 88 92 141 120 100 126 166 100 20 19 7 12 22 8 8 13 13 16 30 6 0 2 0 1 2 2 1 0 50 0 0 0 0 0 0 1 0 0 over: 0 0 0 0 0 0 0 0 0 count: 543 529 458 618 569 573 609 616 600 min: 0 0 0 0 0 0 0 0 0 avg: 0 0 0 0 0 0 0 0 0 max: 30 20 30 20 30 30 50 30 20 ---------- >% ---------- Running - rtla osnoise hist --help provides information about the available options. Link: https://lkml.kernel.org/r/c68060544de89b8b62510ed91c7369f162eb465b.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- tools/tracing/rtla/src/osnoise.c | 6 +- tools/tracing/rtla/src/osnoise.h | 1 + tools/tracing/rtla/src/osnoise_hist.c | 801 ++++++++++++++++++++++++++ 3 files changed, 807 insertions(+), 1 deletion(-) create mode 100644 tools/tracing/rtla/src/osnoise_hist.c diff --git a/tools/tracing/rtla/src/osnoise.c b/tools/tracing/rtla/src/osnoise.c index fbf4f9bfda023e..7b73d1eccd0e3d 100644 --- a/tools/tracing/rtla/src/osnoise.c +++ b/tools/tracing/rtla/src/osnoise.c @@ -828,7 +828,8 @@ static void osnoise_usage(void) " usage: [rtla] osnoise [MODE] ...", "", " modes:", - " top - prints the summary from osnoise tracer", + " top - prints the summary from osnoise tracer", + " hist - prints a histogram of osnoise samples", "", "if no MODE is given, the top mode is called, passing the arguments", NULL, @@ -863,6 +864,9 @@ int osnoise_main(int argc, char *argv[]) } else if (strcmp(argv[1], "top") == 0) { osnoise_top_main(argc-1, &argv[1]); exit(0); + } else if (strcmp(argv[1], "hist") == 0) { + osnoise_hist_main(argc-1, &argv[1]); + exit(0); } usage: diff --git a/tools/tracing/rtla/src/osnoise.h b/tools/tracing/rtla/src/osnoise.h index a4b114ed7d7197..9e4b2e2a4559f4 100644 --- a/tools/tracing/rtla/src/osnoise.h +++ b/tools/tracing/rtla/src/osnoise.h @@ -86,5 +86,6 @@ void osnoise_destroy_tool(struct osnoise_tool *top); struct osnoise_tool *osnoise_init_tool(char *tool_name); struct osnoise_tool *osnoise_init_trace_tool(char *tracer); +int osnoise_hist_main(int argc, char *argv[]); int osnoise_top_main(int argc, char **argv); int osnoise_main(int argc, char **argv); diff --git a/tools/tracing/rtla/src/osnoise_hist.c b/tools/tracing/rtla/src/osnoise_hist.c new file mode 100644 index 00000000000000..180fcbe423cd1d --- /dev/null +++ b/tools/tracing/rtla/src/osnoise_hist.c @@ -0,0 +1,801 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "osnoise.h" + +struct osnoise_hist_params { + char *cpus; + char *monitored_cpus; + char *trace_output; + unsigned long long runtime; + unsigned long long period; + long long stop_us; + long long stop_total_us; + int sleep_time; + int duration; + int set_sched; + int output_divisor; + struct sched_attr sched_param; + + char no_header; + char no_summary; + char no_index; + char with_zeros; + int bucket_size; + int entries; +}; + +struct osnoise_hist_cpu { + int *samples; + int count; + + unsigned long long min_sample; + unsigned long long sum_sample; + unsigned long long max_sample; + +}; + +struct osnoise_hist_data { + struct tracefs_hist *trace_hist; + struct osnoise_hist_cpu *hist; + int entries; + int bucket_size; + int nr_cpus; +}; + +/* + * osnoise_free_histogram - free runtime data + */ +static void +osnoise_free_histogram(struct osnoise_hist_data *data) +{ + int cpu; + + /* one histogram for IRQ and one for thread, per CPU */ + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (data->hist[cpu].samples) + free(data->hist[cpu].samples); + } + + /* one set of histograms per CPU */ + if (data->hist) + free(data->hist); + + free(data); +} + +/* + * osnoise_alloc_histogram - alloc runtime data + */ +static struct osnoise_hist_data +*osnoise_alloc_histogram(int nr_cpus, int entries, int bucket_size) +{ + struct osnoise_hist_data *data; + int cpu; + + data = calloc(1, sizeof(*data)); + if (!data) + return NULL; + + data->entries = entries; + data->bucket_size = bucket_size; + data->nr_cpus = nr_cpus; + + data->hist = calloc(1, sizeof(*data->hist) * nr_cpus); + if (!data->hist) + goto cleanup; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + data->hist[cpu].samples = calloc(1, sizeof(*data->hist->samples) * (entries + 1)); + if (!data->hist[cpu].samples) + goto cleanup; + } + + /* set the min to max */ + for (cpu = 0; cpu < nr_cpus; cpu++) + data->hist[cpu].min_sample = ~0; + + return data; + +cleanup: + osnoise_free_histogram(data); + return NULL; +} + +static void osnoise_hist_update_multiple(struct osnoise_tool *tool, int cpu, + unsigned long long duration, int count) +{ + struct osnoise_hist_params *params = tool->params; + struct osnoise_hist_data *data = tool->data; + int entries = data->entries; + int bucket; + int *hist; + + if (params->output_divisor) + duration = duration / params->output_divisor; + + if (data->bucket_size) + bucket = duration / data->bucket_size; + + hist = data->hist[cpu].samples; + data->hist[cpu].count += count; + update_min(&data->hist[cpu].min_sample, &duration); + update_sum(&data->hist[cpu].sum_sample, &duration); + update_max(&data->hist[cpu].max_sample, &duration); + + if (bucket < entries) + hist[bucket] += count; + else + hist[entries] += count; +} + +/* + * osnoise_destroy_trace_hist - disable events used to collect histogram + */ +static void osnoise_destroy_trace_hist(struct osnoise_tool *tool) +{ + struct osnoise_hist_data *data = tool->data; + + tracefs_hist_pause(tool->trace.inst, data->trace_hist); + tracefs_hist_destroy(tool->trace.inst, data->trace_hist); +} + +/* + * osnoise_init_trace_hist - enable events used to collect histogram + */ +static int osnoise_init_trace_hist(struct osnoise_tool *tool) +{ + struct osnoise_hist_params *params = tool->params; + struct osnoise_hist_data *data = tool->data; + int bucket_size; + char buff[128]; + int retval = 0; + + /* + * Set the size of the bucket. + */ + bucket_size = params->output_divisor * params->bucket_size; + snprintf(buff, sizeof(buff), "duration.buckets=%d", bucket_size); + + data->trace_hist = tracefs_hist_alloc(tool->trace.tep, "osnoise", "sample_threshold", + buff, TRACEFS_HIST_KEY_NORMAL); + if (!data->trace_hist) + return 1; + + retval = tracefs_hist_add_key(data->trace_hist, "cpu", 0); + if (retval) + goto out_err; + + retval = tracefs_hist_start(tool->trace.inst, data->trace_hist); + if (retval) + goto out_err; + + return 0; + +out_err: + osnoise_destroy_trace_hist(tool); + return 1; +} + +/* + * osnoise_read_trace_hist - parse histogram file and file osnoise histogram + */ +static void osnoise_read_trace_hist(struct osnoise_tool *tool) +{ + struct osnoise_hist_data *data = tool->data; + long long cpu, counter, duration; + char *content, *position; + + tracefs_hist_pause(tool->trace.inst, data->trace_hist); + + content = tracefs_event_file_read(tool->trace.inst, "osnoise", + "sample_threshold", + "hist", NULL); + if (!content) + return; + + position = content; + while (true) { + position = strstr(position, "duration: ~"); + if (!position) + break; + position += strlen("duration: ~"); + duration = get_llong_from_str(position); + if (duration == -1) + err_msg("error reading duration from histogram\n"); + + position = strstr(position, "cpu:"); + if (!position) + break; + position += strlen("cpu: "); + cpu = get_llong_from_str(position); + if (cpu == -1) + err_msg("error reading cpu from histogram\n"); + + position = strstr(position, "hitcount:"); + if (!position) + break; + position += strlen("hitcount: "); + counter = get_llong_from_str(position); + if (counter == -1) + err_msg("error reading counter from histogram\n"); + + osnoise_hist_update_multiple(tool, cpu, duration, counter); + } + free(content); +} + +/* + * osnoise_hist_header - print the header of the tracer to the output + */ +static void osnoise_hist_header(struct osnoise_tool *tool) +{ + struct osnoise_hist_params *params = tool->params; + struct osnoise_hist_data *data = tool->data; + struct trace_seq *s = tool->trace.seq; + char duration[26]; + int cpu; + + if (params->no_header) + return; + + get_duration(tool->start_time, duration, sizeof(duration)); + trace_seq_printf(s, "# RTLA osnoise histogram\n"); + trace_seq_printf(s, "# Time unit is %s (%s)\n", + params->output_divisor == 1 ? "nanoseconds" : "microseconds", + params->output_divisor == 1 ? "ns" : "us"); + + trace_seq_printf(s, "# Duration: %s\n", duration); + + if (!params->no_index) + trace_seq_printf(s, "Index"); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].count) + continue; + + trace_seq_printf(s, " CPU-%03d", cpu); + } + trace_seq_printf(s, "\n"); + + trace_seq_do_printf(s); + trace_seq_reset(s); +} + +/* + * osnoise_print_summary - print the summary of the hist data to the output + */ +static void +osnoise_print_summary(struct osnoise_hist_params *params, + struct trace_instance *trace, + struct osnoise_hist_data *data) +{ + int cpu; + + if (params->no_summary) + return; + + if (!params->no_index) + trace_seq_printf(trace->seq, "count:"); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].count) + continue; + + trace_seq_printf(trace->seq, "%9d ", data->hist[cpu].count); + } + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "min: "); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].count) + continue; + + trace_seq_printf(trace->seq, "%9llu ", data->hist[cpu].min_sample); + + } + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "avg: "); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].count) + continue; + + if (data->hist[cpu].count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].sum_sample / data->hist[cpu].count); + else + trace_seq_printf(trace->seq, " - "); + } + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "max: "); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].count) + continue; + + trace_seq_printf(trace->seq, "%9llu ", data->hist[cpu].max_sample); + + } + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); +} + +/* + * osnoise_print_stats - print data for all CPUs + */ +static void +osnoise_print_stats(struct osnoise_hist_params *params, struct osnoise_tool *tool) +{ + struct osnoise_hist_data *data = tool->data; + struct trace_instance *trace = &tool->trace; + int bucket, cpu; + int total; + + osnoise_hist_header(tool); + + for (bucket = 0; bucket < data->entries; bucket++) { + total = 0; + + if (!params->no_index) + trace_seq_printf(trace->seq, "%-6d", + bucket * data->bucket_size); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].count) + continue; + + total += data->hist[cpu].samples[bucket]; + trace_seq_printf(trace->seq, "%9d ", data->hist[cpu].samples[bucket]); + } + + if (total == 0 && !params->with_zeros) { + trace_seq_reset(trace->seq); + continue; + } + + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); + } + + if (!params->no_index) + trace_seq_printf(trace->seq, "over: "); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].count) + continue; + + trace_seq_printf(trace->seq, "%9d ", + data->hist[cpu].samples[data->entries]); + } + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); + + osnoise_print_summary(params, trace, data); +} + +/* + * osnoise_hist_usage - prints osnoise hist usage message + */ +static void osnoise_hist_usage(char *usage) +{ + int i; + + static const char * const msg[] = { + "", + " usage: rtla osnoise hist [-h] [-D] [-d s] [-p us] [-r us] [-s us] [-S us] [-t[=file]] \\", + " [-c cpu-list] [-P priority] [-b N] [-e N] [--no-header] [--no-summary] \\", + " [--no-index] [--with-zeros]", + "", + " -h/--help: print this menu", + " -p/--period us: osnoise period in us", + " -r/--runtime us: osnoise runtime in us", + " -s/--stop us: stop trace if a single sample is higher than the argument in us", + " -S/--stop-total us: stop trace if the total sample is higher than the argument in us", + " -c/--cpus cpu-list: list of cpus to run osnoise threads", + " -d/--duration time[s|m|h|d]: duration of the session", + " -D/--debug: print debug info", + " -t/--trace[=file]: save the stopped trace to [file|osnoise_trace.txt]", + " -b/--bucket-size N: set the histogram bucket size (default 1)", + " -e/--entries N: set the number of entries of the histogram (default 256)", + " --no-header: do not print header", + " --no-summary: do not print summary", + " --no-index: do not print index", + " --with-zeros: print zero only entries", + " -P/--priority o:prio|r:prio|f:prio|d:runtime:period: set scheduling parameters", + " o:prio - use SCHED_OTHER with prio", + " r:prio - use SCHED_RR with prio", + " f:prio - use SCHED_FIFO with prio", + " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", + " in nanoseconds", + NULL, + }; + + if (usage) + fprintf(stderr, "%s\n", usage); + + fprintf(stderr, "rtla osnoise hist: a per-cpu histogram of the OS noise (version %s)\n", + VERSION); + + for (i = 0; msg[i]; i++) + fprintf(stderr, "%s\n", msg[i]); + exit(1); +} + +/* + * osnoise_hist_parse_args - allocs, parse and fill the cmd line parameters + */ +static struct osnoise_hist_params +*osnoise_hist_parse_args(int argc, char *argv[]) +{ + struct osnoise_hist_params *params; + int retval; + int c; + + params = calloc(1, sizeof(*params)); + if (!params) + exit(1); + + /* display data in microseconds */ + params->output_divisor = 1000; + params->bucket_size = 1; + params->entries = 256; + + while (1) { + static struct option long_options[] = { + {"bucket-size", required_argument, 0, 'b'}, + {"entries", required_argument, 0, 'e'}, + {"cpus", required_argument, 0, 'c'}, + {"debug", no_argument, 0, 'D'}, + {"duration", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"period", required_argument, 0, 'p'}, + {"priority", required_argument, 0, 'P'}, + {"runtime", required_argument, 0, 'r'}, + {"stop", required_argument, 0, 's'}, + {"stop-total", required_argument, 0, 'S'}, + {"trace", optional_argument, 0, 't'}, + {"no-header", no_argument, 0, '0'}, + {"no-summary", no_argument, 0, '1'}, + {"no-index", no_argument, 0, '2'}, + {"with-zeros", no_argument, 0, '3'}, + {0, 0, 0, 0} + }; + + /* getopt_long stores the option index here. */ + int option_index = 0; + + c = getopt_long(argc, argv, "c:b:d:e:Dhp:P:r:s:S:t::0123", + long_options, &option_index); + + /* detect the end of the options. */ + if (c == -1) + break; + + switch (c) { + case 'b': + params->bucket_size = get_llong_from_str(optarg); + if ((params->bucket_size == 0) || (params->bucket_size >= 1000000)) + osnoise_hist_usage("Bucket size needs to be > 0 and <= 1000000\n"); + break; + case 'c': + retval = parse_cpu_list(optarg, ¶ms->monitored_cpus); + if (retval) + osnoise_hist_usage("\nInvalid -c cpu list\n"); + params->cpus = optarg; + break; + case 'D': + config_debug = 1; + break; + case 'd': + params->duration = parse_seconds_duration(optarg); + if (!params->duration) + osnoise_hist_usage("Invalid -D duration\n"); + break; + case 'e': + params->entries = get_llong_from_str(optarg); + if ((params->entries < 10) || (params->entries > 9999999)) + osnoise_hist_usage("Entries must be > 10 and < 9999999\n"); + break; + case 'h': + case '?': + osnoise_hist_usage(NULL); + break; + case 'p': + params->period = get_llong_from_str(optarg); + if (params->period > 10000000) + osnoise_hist_usage("Period longer than 10 s\n"); + break; + case 'P': + retval = parse_prio(optarg, ¶ms->sched_param); + if (retval == -1) + osnoise_hist_usage("Invalid -P priority"); + params->set_sched = 1; + break; + case 'r': + params->runtime = get_llong_from_str(optarg); + if (params->runtime < 100) + osnoise_hist_usage("Runtime shorter than 100 us\n"); + break; + case 's': + params->stop_us = get_llong_from_str(optarg); + break; + case 'S': + params->stop_total_us = get_llong_from_str(optarg); + break; + case 't': + if (optarg) + /* skip = */ + params->trace_output = &optarg[1]; + else + params->trace_output = "osnoise_trace.txt"; + break; + case '0': /* no header */ + params->no_header = 1; + break; + case '1': /* no summary */ + params->no_summary = 1; + break; + case '2': /* no index */ + params->no_index = 1; + break; + case '3': /* with zeros */ + params->with_zeros = 1; + break; + default: + osnoise_hist_usage("Invalid option"); + } + } + + if (geteuid()) { + err_msg("rtla needs root permission\n"); + exit(EXIT_FAILURE); + } + + if (params->no_index && !params->with_zeros) + osnoise_hist_usage("no-index set and with-zeros not set - it does not make sense"); + + return params; +} + +/* + * osnoise_hist_apply_config - apply the hist configs to the initialized tool + */ +static int +osnoise_hist_apply_config(struct osnoise_tool *tool, struct osnoise_hist_params *params) +{ + int retval; + + if (!params->sleep_time) + params->sleep_time = 1; + + if (params->cpus) { + retval = osnoise_set_cpus(tool->context, params->cpus); + if (retval) { + err_msg("Failed to apply CPUs config\n"); + goto out_err; + } + } + + if (params->runtime || params->period) { + retval = osnoise_set_runtime_period(tool->context, + params->runtime, + params->period); + if (retval) { + err_msg("Failed to set runtime and/or period\n"); + goto out_err; + } + } + + if (params->stop_us) { + retval = osnoise_set_stop_us(tool->context, params->stop_us); + if (retval) { + err_msg("Failed to set stop us\n"); + goto out_err; + } + } + + if (params->stop_total_us) { + retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us); + if (retval) { + err_msg("Failed to set stop total us\n"); + goto out_err; + } + } + + return 0; + +out_err: + return -1; +} + +/* + * osnoise_init_hist - initialize a osnoise hist tool with parameters + */ +static struct osnoise_tool +*osnoise_init_hist(struct osnoise_hist_params *params) +{ + struct osnoise_tool *tool; + int nr_cpus; + + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + + tool = osnoise_init_tool("osnoise_hist"); + if (!tool) + return NULL; + + tool->data = osnoise_alloc_histogram(nr_cpus, params->entries, params->bucket_size); + if (!tool->data) + goto out_err; + + tool->params = params; + + return tool; + +out_err: + osnoise_destroy_tool(tool); + return NULL; +} + +static int stop_tracing; +static void stop_hist(int sig) +{ + stop_tracing = 1; +} + +/* + * osnoise_hist_set_signals - handles the signal to stop the tool + */ +static void +osnoise_hist_set_signals(struct osnoise_hist_params *params) +{ + signal(SIGINT, stop_hist); + if (params->duration) { + signal(SIGALRM, stop_hist); + alarm(params->duration); + } +} + +int osnoise_hist_main(int argc, char *argv[]) +{ + struct osnoise_hist_params *params; + struct trace_instance *trace; + struct osnoise_tool *record; + struct osnoise_tool *tool; + int return_value = 1; + int retval; + + params = osnoise_hist_parse_args(argc, argv); + if (!params) + exit(1); + + tool = osnoise_init_hist(params); + if (!tool) { + err_msg("Could not init osnoise hist\n"); + goto out_exit; + } + + retval = osnoise_hist_apply_config(tool, params); + if (retval) { + err_msg("Could not apply config\n"); + goto out_destroy; + } + + trace = &tool->trace; + + retval = enable_osnoise(trace); + if (retval) { + err_msg("Failed to enable osnoise tracer\n"); + goto out_destroy; + } + + retval = osnoise_init_trace_hist(tool); + if (retval) + goto out_destroy; + + if (params->set_sched) { + retval = set_comm_sched_attr("osnoise/", ¶ms->sched_param); + if (retval) { + err_msg("Failed to set sched parameters\n"); + goto out_hist; + } + } + + trace_instance_start(trace); + + if (params->trace_output) { + record = osnoise_init_trace_tool("osnoise"); + if (!record) { + err_msg("Failed to enable the trace instance\n"); + goto out_hist; + } + trace_instance_start(&record->trace); + } + + tool->start_time = time(NULL); + osnoise_hist_set_signals(params); + + while (!stop_tracing) { + sleep(params->sleep_time); + + retval = tracefs_iterate_raw_events(trace->tep, + trace->inst, + NULL, + 0, + collect_registered_events, + trace); + if (retval < 0) { + err_msg("Error iterating on events\n"); + goto out_hist; + } + + if (!tracefs_trace_is_on(trace->inst)) + break; + }; + + osnoise_read_trace_hist(tool); + + osnoise_print_stats(params, tool); + + return_value = 0; + + if (!tracefs_trace_is_on(trace->inst)) { + printf("rtla timelat hit stop tracing\n"); + if (params->trace_output) { + printf(" Saving trace to %s\n", params->trace_output); + save_trace_to_file(record->trace.inst, params->trace_output); + } + } + +out_hist: + osnoise_free_histogram(tool->data); +out_destroy: + osnoise_destroy_tool(tool); + if (params->trace_output) + osnoise_destroy_tool(record); + free(params); +out_exit: + exit(return_value); +} From a828cd18bc4ad72ff6d64c0dd23dc814a6741fed Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:25 +0100 Subject: [PATCH 41/50] rtla: Add timerlat tool and timelart top mode The rtla timerlat tool is an interface for the timerlat tracer. The timerlat tracer dispatches a kernel thread per-cpu. These threads set a periodic timer to wake themselves up and go back to sleep. After the wakeup, they collect and generate useful information for the debugging of operating system timer latency. The timerlat tracer outputs information in two ways. It periodically prints the timer latency at the timer IRQ handler and the Thread handler. It also provides information for each noise via the osnoise tracepoints. The rtla timerlat top mode displays a summary of the periodic output from the timerlat tracer. Here is one example of the rtla timerlat tool output: ---------- %< ---------- [root@alien ~]# rtla timerlat top -c 0-3 -d 1m Timer Latency 0 00:01:00 | IRQ Timer Latency (us) | Thread Timer Latency (us) CPU COUNT | cur min avg max | cur min avg max 0 #60001 | 0 0 0 3 | 1 1 1 6 1 #60001 | 0 0 0 3 | 2 1 1 5 2 #60001 | 0 0 1 6 | 1 1 2 7 3 #60001 | 0 0 0 7 | 1 1 1 11 ---------- >% ---------- Running: # rtla timerlat --help # rtla timerlat top --help provides information about the available options. Link: https://lkml.kernel.org/r/e95032e20c2b88c962195bf7693bb53c9ebcced8.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- tools/tracing/rtla/Makefile | 2 + tools/tracing/rtla/src/rtla.c | 5 + tools/tracing/rtla/src/timerlat.c | 68 +++ tools/tracing/rtla/src/timerlat.h | 4 + tools/tracing/rtla/src/timerlat_top.c | 618 ++++++++++++++++++++++++++ 5 files changed, 697 insertions(+) create mode 100644 tools/tracing/rtla/src/timerlat.c create mode 100644 tools/tracing/rtla/src/timerlat.h create mode 100644 tools/tracing/rtla/src/timerlat_top.c diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index d40b442fcd3066..7a7348faec3258 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -62,6 +62,8 @@ install: $(STRIP) $(DESTDIR)$(BINDIR)/rtla @test ! -f $(DESTDIR)$(BINDIR)/osnoise || rm $(DESTDIR)$(BINDIR)/osnoise ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/osnoise + @test ! -f $(DESTDIR)$(BINDIR)/timerlat || rm $(DESTDIR)$(BINDIR)/timerlat + ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/timerlat .PHONY: clean tarball clean: diff --git a/tools/tracing/rtla/src/rtla.c b/tools/tracing/rtla/src/rtla.c index 669b9750b3b303..09bd21b8af810b 100644 --- a/tools/tracing/rtla/src/rtla.c +++ b/tools/tracing/rtla/src/rtla.c @@ -9,6 +9,7 @@ #include #include "osnoise.h" +#include "timerlat.h" /* * rtla_usage - print rtla usage @@ -25,6 +26,7 @@ static void rtla_usage(void) "", " commands:", " osnoise - gives information about the operating system noise (osnoise)", + " timerlat - measures the timer irq and thread latency", "", NULL, }; @@ -45,6 +47,9 @@ int run_command(int argc, char **argv, int start_position) if (strcmp(argv[start_position], "osnoise") == 0) { osnoise_main(argc-start_position, &argv[start_position]); goto ran; + } else if (strcmp(argv[start_position], "timerlat") == 0) { + timerlat_main(argc-start_position, &argv[start_position]); + goto ran; } return 0; diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c new file mode 100644 index 00000000000000..29a51c01b08432 --- /dev/null +++ b/tools/tracing/rtla/src/timerlat.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "timerlat.h" + +static void timerlat_usage(void) +{ + int i; + + static const char * const msg[] = { + "", + "timerlat version " VERSION, + "", + " usage: [rtla] timerlat [MODE] ...", + "", + " modes:", + " top - prints the summary from timerlat tracer", + "", + "if no MODE is given, the top mode is called, passing the arguments", + NULL, + }; + + for (i = 0; msg[i]; i++) + fprintf(stderr, "%s\n", msg[i]); + exit(1); +} + +int timerlat_main(int argc, char *argv[]) +{ + if (argc == 0) + goto usage; + + /* + * if timerlat was called without any argument, run the + * default cmdline. + */ + if (argc == 1) { + timerlat_top_main(argc, argv); + exit(0); + } + + if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) { + timerlat_usage(); + exit(0); + } else if (strncmp(argv[1], "-", 1) == 0) { + /* the user skipped the tool, call the default one */ + timerlat_top_main(argc, argv); + exit(0); + } else if (strcmp(argv[1], "top") == 0) { + timerlat_top_main(argc-1, &argv[1]); + exit(0); + } + +usage: + timerlat_usage(); + exit(1); +} diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h new file mode 100644 index 00000000000000..6f9d3925acd084 --- /dev/null +++ b/tools/tracing/rtla/src/timerlat.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int timerlat_top_main(int argc, char *argv[]); +int timerlat_main(int argc, char *argv[]); diff --git a/tools/tracing/rtla/src/timerlat_top.c b/tools/tracing/rtla/src/timerlat_top.c new file mode 100644 index 00000000000000..1ebd5291539cb5 --- /dev/null +++ b/tools/tracing/rtla/src/timerlat_top.c @@ -0,0 +1,618 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "osnoise.h" +#include "timerlat.h" + +struct timerlat_top_params { + char *cpus; + char *monitored_cpus; + char *trace_output; + unsigned long long runtime; + long long stop_us; + long long stop_total_us; + long long timerlat_period_us; + long long print_stack; + int sleep_time; + int output_divisor; + int duration; + int quiet; + int set_sched; + struct sched_attr sched_param; +}; + +struct timerlat_top_cpu { + int irq_count; + int thread_count; + + unsigned long long cur_irq; + unsigned long long min_irq; + unsigned long long sum_irq; + unsigned long long max_irq; + + unsigned long long cur_thread; + unsigned long long min_thread; + unsigned long long sum_thread; + unsigned long long max_thread; +}; + +struct timerlat_top_data { + struct timerlat_top_cpu *cpu_data; + int nr_cpus; +}; + +/* + * timerlat_free_top - free runtime data + */ +static void +timerlat_free_top(struct timerlat_top_data *data) +{ + free(data->cpu_data); + free(data); +} + +/* + * timerlat_alloc_histogram - alloc runtime data + */ +static struct timerlat_top_data *timerlat_alloc_top(int nr_cpus) +{ + struct timerlat_top_data *data; + int cpu; + + data = calloc(1, sizeof(*data)); + if (!data) + return NULL; + + data->nr_cpus = nr_cpus; + + /* one set of histograms per CPU */ + data->cpu_data = calloc(1, sizeof(*data->cpu_data) * nr_cpus); + if (!data->cpu_data) + goto cleanup; + + /* set the min to max */ + for (cpu = 0; cpu < nr_cpus; cpu++) { + data->cpu_data[cpu].min_irq = ~0; + data->cpu_data[cpu].min_thread = ~0; + } + + return data; + +cleanup: + timerlat_free_top(data); + return NULL; +} + +/* + * timerlat_hist_update - record a new timerlat occurent on cpu, updating data + */ +static void +timerlat_top_update(struct osnoise_tool *tool, int cpu, + unsigned long long thread, + unsigned long long latency) +{ + struct timerlat_top_data *data = tool->data; + struct timerlat_top_cpu *cpu_data = &data->cpu_data[cpu]; + + if (!thread) { + cpu_data->irq_count++; + cpu_data->cur_irq = latency; + update_min(&cpu_data->min_irq, &latency); + update_sum(&cpu_data->sum_irq, &latency); + update_max(&cpu_data->max_irq, &latency); + } else { + cpu_data->thread_count++; + cpu_data->cur_thread = latency; + update_min(&cpu_data->min_thread, &latency); + update_sum(&cpu_data->sum_thread, &latency); + update_max(&cpu_data->max_thread, &latency); + } +} + +/* + * timerlat_top_handler - this is the handler for timerlat tracer events + */ +static int +timerlat_top_handler(struct trace_seq *s, struct tep_record *record, + struct tep_event *event, void *context) +{ + struct trace_instance *trace = context; + unsigned long long latency, thread; + struct osnoise_tool *top; + int cpu = record->cpu; + + top = container_of(trace, struct osnoise_tool, trace); + + tep_get_field_val(s, event, "context", record, &thread, 1); + tep_get_field_val(s, event, "timer_latency", record, &latency, 1); + + timerlat_top_update(top, cpu, thread, latency); + + return 0; +} + +/* + * timerlat_top_header - print the header of the tool output + */ +static void timerlat_top_header(struct osnoise_tool *top) +{ + struct timerlat_top_params *params = top->params; + struct trace_seq *s = top->trace.seq; + char duration[26]; + + get_duration(top->start_time, duration, sizeof(duration)); + + trace_seq_printf(s, "\033[2;37;40m"); + trace_seq_printf(s, " Timer Latency "); + trace_seq_printf(s, "\033[0;0;0m"); + trace_seq_printf(s, "\n"); + + trace_seq_printf(s, "%-6s | IRQ Timer Latency (%s) | Thread Timer Latency (%s)\n", duration, + params->output_divisor == 1 ? "ns" : "us", + params->output_divisor == 1 ? "ns" : "us"); + + trace_seq_printf(s, "\033[2;30;47m"); + trace_seq_printf(s, "CPU COUNT | cur min avg max | cur min avg max"); + trace_seq_printf(s, "\033[0;0;0m"); + trace_seq_printf(s, "\n"); +} + +/* + * timerlat_top_print - prints the output of a given CPU + */ +static void timerlat_top_print(struct osnoise_tool *top, int cpu) +{ + + struct timerlat_top_params *params = top->params; + struct timerlat_top_data *data = top->data; + struct timerlat_top_cpu *cpu_data = &data->cpu_data[cpu]; + int divisor = params->output_divisor; + struct trace_seq *s = top->trace.seq; + + if (divisor == 0) + return; + + /* + * Skip if no data is available: is this cpu offline? + */ + if (!cpu_data->irq_count && !cpu_data->thread_count) + return; + + /* + * Unless trace is being lost, IRQ counter is always the max. + */ + trace_seq_printf(s, "%3d #%-9d |", cpu, cpu_data->irq_count); + + if (!cpu_data->irq_count) { + trace_seq_printf(s, " - "); + trace_seq_printf(s, " - "); + trace_seq_printf(s, " - "); + trace_seq_printf(s, " - |"); + } else { + trace_seq_printf(s, "%9llu ", cpu_data->cur_irq / params->output_divisor); + trace_seq_printf(s, "%9llu ", cpu_data->min_irq / params->output_divisor); + trace_seq_printf(s, "%9llu ", (cpu_data->sum_irq / cpu_data->irq_count) / divisor); + trace_seq_printf(s, "%9llu |", cpu_data->max_irq / divisor); + } + + if (!cpu_data->thread_count) { + trace_seq_printf(s, " - "); + trace_seq_printf(s, " - "); + trace_seq_printf(s, " - "); + trace_seq_printf(s, " -\n"); + } else { + trace_seq_printf(s, "%9llu ", cpu_data->cur_thread / divisor); + trace_seq_printf(s, "%9llu ", cpu_data->min_thread / divisor); + trace_seq_printf(s, "%9llu ", + (cpu_data->sum_thread / cpu_data->thread_count) / divisor); + trace_seq_printf(s, "%9llu\n", cpu_data->max_thread / divisor); + } +} + +/* + * clear_terminal - clears the output terminal + */ +static void clear_terminal(struct trace_seq *seq) +{ + if (!config_debug) + trace_seq_printf(seq, "\033c"); +} + +/* + * timerlat_print_stats - print data for all cpus + */ +static void +timerlat_print_stats(struct timerlat_top_params *params, struct osnoise_tool *top) +{ + struct trace_instance *trace = &top->trace; + static int nr_cpus = -1; + int i; + + if (nr_cpus == -1) + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + + if (!params->quiet) + clear_terminal(trace->seq); + + timerlat_top_header(top); + + for (i = 0; i < nr_cpus; i++) { + if (params->cpus && !params->monitored_cpus[i]) + continue; + timerlat_top_print(top, i); + } + + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); +} + +/* + * timerlat_top_usage - prints timerlat top usage message + */ +static void timerlat_top_usage(char *usage) +{ + int i; + + static const char *const msg[] = { + "", + " usage: rtla timerlat [top] [-h] [-q] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] [-t[=file]] \\", + " [-c cpu-list] [-P priority]", + "", + " -h/--help: print this menu", + " -p/--period us: timerlat period in us", + " -i/--irq us: stop trace if the irq latency is higher than the argument in us", + " -T/--thread us: stop trace if the thread latency is higher than the argument in us", + " -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us", + " -c/--cpus cpus: run the tracer only on the given cpus", + " -d/--duration time[m|h|d]: duration of the session in seconds", + " -D/--debug: print debug info", + " -t/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -n/--nano: display data in nanoseconds", + " -q/--quiet print only a summary at the end", + " -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters", + " o:prio - use SCHED_OTHER with prio", + " r:prio - use SCHED_RR with prio", + " f:prio - use SCHED_FIFO with prio", + " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", + " in nanoseconds", + NULL, + }; + + if (usage) + fprintf(stderr, "%s\n", usage); + + fprintf(stderr, "rtla timerlat top: a per-cpu summary of the timer latency (version %s)\n", + VERSION); + + for (i = 0; msg[i]; i++) + fprintf(stderr, "%s\n", msg[i]); + exit(1); +} + +/* + * timerlat_top_parse_args - allocs, parse and fill the cmd line parameters + */ +static struct timerlat_top_params +*timerlat_top_parse_args(int argc, char **argv) +{ + struct timerlat_top_params *params; + int retval; + int c; + + params = calloc(1, sizeof(*params)); + if (!params) + exit(1); + + /* display data in microseconds */ + params->output_divisor = 1000; + + while (1) { + static struct option long_options[] = { + {"cpus", required_argument, 0, 'c'}, + {"debug", no_argument, 0, 'D'}, + {"duration", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"irq", required_argument, 0, 'i'}, + {"nano", no_argument, 0, 'n'}, + {"period", required_argument, 0, 'p'}, + {"priority", required_argument, 0, 'P'}, + {"quiet", no_argument, 0, 'q'}, + {"stack", required_argument, 0, 's'}, + {"thread", required_argument, 0, 'T'}, + {"trace", optional_argument, 0, 't'}, + {0, 0, 0, 0} + }; + + /* getopt_long stores the option index here. */ + int option_index = 0; + + c = getopt_long(argc, argv, "c:d:Dhi:np:P:qs:t::T:", + long_options, &option_index); + + /* detect the end of the options. */ + if (c == -1) + break; + + switch (c) { + case 'c': + retval = parse_cpu_list(optarg, ¶ms->monitored_cpus); + if (retval) + timerlat_top_usage("\nInvalid -c cpu list\n"); + params->cpus = optarg; + break; + case 'D': + config_debug = 1; + break; + case 'd': + params->duration = parse_seconds_duration(optarg); + if (!params->duration) + timerlat_top_usage("Invalid -D duration\n"); + break; + case 'h': + case '?': + timerlat_top_usage(NULL); + break; + case 'i': + params->stop_us = get_llong_from_str(optarg); + break; + case 'n': + params->output_divisor = 1; + break; + case 'p': + params->timerlat_period_us = get_llong_from_str(optarg); + if (params->timerlat_period_us > 1000000) + timerlat_top_usage("Period longer than 1 s\n"); + break; + case 'P': + retval = parse_prio(optarg, ¶ms->sched_param); + if (retval == -1) + timerlat_top_usage("Invalid -P priority"); + params->set_sched = 1; + break; + case 'q': + params->quiet = 1; + break; + case 's': + params->print_stack = get_llong_from_str(optarg); + break; + case 'T': + params->stop_total_us = get_llong_from_str(optarg); + break; + case 't': + if (optarg) + /* skip = */ + params->trace_output = &optarg[1]; + else + params->trace_output = "timerlat_trace.txt"; + break; + default: + timerlat_top_usage("Invalid option"); + } + } + + if (geteuid()) { + err_msg("rtla needs root permission\n"); + exit(EXIT_FAILURE); + } + + return params; +} + +/* + * timerlat_top_apply_config - apply the top configs to the initialized tool + */ +static int +timerlat_top_apply_config(struct osnoise_tool *top, struct timerlat_top_params *params) +{ + int retval; + + if (!params->sleep_time) + params->sleep_time = 1; + + if (params->cpus) { + retval = osnoise_set_cpus(top->context, params->cpus); + if (retval) { + err_msg("Failed to apply CPUs config\n"); + goto out_err; + } + } + + if (params->stop_us) { + retval = osnoise_set_stop_us(top->context, params->stop_us); + if (retval) { + err_msg("Failed to set stop us\n"); + goto out_err; + } + } + + if (params->stop_total_us) { + retval = osnoise_set_stop_total_us(top->context, params->stop_total_us); + if (retval) { + err_msg("Failed to set stop total us\n"); + goto out_err; + } + } + + + if (params->timerlat_period_us) { + retval = osnoise_set_timerlat_period_us(top->context, params->timerlat_period_us); + if (retval) { + err_msg("Failed to set timerlat period\n"); + goto out_err; + } + } + + + if (params->print_stack) { + retval = osnoise_set_print_stack(top->context, params->print_stack); + if (retval) { + err_msg("Failed to set print stack\n"); + goto out_err; + } + } + + return 0; + +out_err: + return -1; +} + +/* + * timerlat_init_top - initialize a timerlat top tool with parameters + */ +static struct osnoise_tool +*timerlat_init_top(struct timerlat_top_params *params) +{ + struct osnoise_tool *top; + int nr_cpus; + + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + + top = osnoise_init_tool("timerlat_top"); + if (!top) + return NULL; + + top->data = timerlat_alloc_top(nr_cpus); + if (!top->data) + goto out_err; + + top->params = params; + + tep_register_event_handler(top->trace.tep, -1, "ftrace", "timerlat", + timerlat_top_handler, top); + + return top; + +out_err: + osnoise_destroy_tool(top); + return NULL; +} + +static int stop_tracing; +static void stop_top(int sig) +{ + stop_tracing = 1; +} + +/* + * timerlat_top_set_signals - handles the signal to stop the tool + */ +static void +timerlat_top_set_signals(struct timerlat_top_params *params) +{ + signal(SIGINT, stop_top); + if (params->duration) { + signal(SIGALRM, stop_top); + alarm(params->duration); + } +} + +int timerlat_top_main(int argc, char *argv[]) +{ + struct timerlat_top_params *params; + struct trace_instance *trace; + struct osnoise_tool *record; + struct osnoise_tool *top; + int return_value = 1; + int retval; + + params = timerlat_top_parse_args(argc, argv); + if (!params) + exit(1); + + top = timerlat_init_top(params); + if (!top) { + err_msg("Could not init osnoise top\n"); + goto out_exit; + } + + retval = timerlat_top_apply_config(top, params); + if (retval) { + err_msg("Could not apply config\n"); + goto out_top; + } + + trace = &top->trace; + + retval = enable_timerlat(trace); + if (retval) { + err_msg("Failed to enable timerlat tracer\n"); + goto out_top; + } + + if (params->set_sched) { + retval = set_comm_sched_attr("timerlat/", ¶ms->sched_param); + if (retval) { + err_msg("Failed to set sched parameters\n"); + goto out_top; + } + } + + trace_instance_start(trace); + + if (params->trace_output) { + record = osnoise_init_trace_tool("timerlat"); + if (!record) { + err_msg("Failed to enable the trace instance\n"); + goto out_top; + } + trace_instance_start(&record->trace); + } + + top->start_time = time(NULL); + timerlat_top_set_signals(params); + + while (!stop_tracing) { + sleep(params->sleep_time); + + retval = tracefs_iterate_raw_events(trace->tep, + trace->inst, + NULL, + 0, + collect_registered_events, + trace); + if (retval < 0) { + err_msg("Error iterating on events\n"); + goto out_top; + } + + if (!params->quiet) + timerlat_print_stats(params, top); + + if (!tracefs_trace_is_on(trace->inst)) + break; + + }; + + timerlat_print_stats(params, top); + + return_value = 0; + + if (!tracefs_trace_is_on(trace->inst)) { + printf("rtla timelat hit stop tracing\n"); + if (params->trace_output) { + printf(" Saving trace to %s\n", params->trace_output); + save_trace_to_file(record->trace.inst, params->trace_output); + } + } + +out_top: + timerlat_free_top(top->data); + osnoise_destroy_tool(top); + if (params->trace_output) + osnoise_destroy_tool(record); + free(params); +out_exit: + exit(return_value); +} From 1eeb6328e8b3a917ba0be4965f272147dbd4cf55 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:26 +0100 Subject: [PATCH 42/50] rtla/timerlat: Add timerlat hist mode The rtla hist hist mode displays a histogram of each tracer event occurrence, both for IRQ and timer latencies. The tool also allows many configurations of the timerlat tracer and the collection of the tracer output. Here is one example of the rtla timerlat hist mode output: ---------- %< ---------- [root@alien ~]# rtla timerlat hist -c 0-3 -d 1M # RTLA timerlat histogram # Time unit is microseconds (us) # Duration: 0 00:01:00 Index IRQ-000 Thr-000 IRQ-001 Thr-001 IRQ-002 Thr-002 IRQ-003 Thr-003 0 58572 0 59373 0 58691 0 58895 0 1 1422 57021 628 57241 1310 56160 1102 56805 2 6 2931 0 2695 0 3567 4 3031 3 1 40 0 53 0 260 0 142 4 0 7 0 5 0 6 0 17 5 0 2 0 5 0 7 0 4 6 0 0 0 2 0 1 0 1 8 0 0 0 0 0 0 0 1 over: 0 0 0 0 0 0 0 0 count: 60001 60001 60001 60001 60001 60001 60001 60001 min: 0 1 0 1 0 1 0 1 avg: 0 1 0 1 0 1 0 1 max: 3 5 1 6 1 6 2 8 ---------- >% ---------- Running - rtla timerlat hist --help provides information about the available options. Link: https://lkml.kernel.org/r/7049ed3c46b7d6aceab18ffe7770003dfc4ddceb.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- tools/tracing/rtla/src/timerlat.c | 6 +- tools/tracing/rtla/src/timerlat.h | 2 +- tools/tracing/rtla/src/timerlat_hist.c | 822 +++++++++++++++++++++++++ 3 files changed, 828 insertions(+), 2 deletions(-) create mode 100644 tools/tracing/rtla/src/timerlat_hist.c diff --git a/tools/tracing/rtla/src/timerlat.c b/tools/tracing/rtla/src/timerlat.c index 29a51c01b08432..97abbf494fee47 100644 --- a/tools/tracing/rtla/src/timerlat.c +++ b/tools/tracing/rtla/src/timerlat.c @@ -25,7 +25,8 @@ static void timerlat_usage(void) " usage: [rtla] timerlat [MODE] ...", "", " modes:", - " top - prints the summary from timerlat tracer", + " top - prints the summary from timerlat tracer", + " hist - prints a histogram of timer latencies", "", "if no MODE is given, the top mode is called, passing the arguments", NULL, @@ -60,6 +61,9 @@ int timerlat_main(int argc, char *argv[]) } else if (strcmp(argv[1], "top") == 0) { timerlat_top_main(argc-1, &argv[1]); exit(0); + } else if (strcmp(argv[1], "hist") == 0) { + timerlat_hist_main(argc-1, &argv[1]); + exit(0); } usage: diff --git a/tools/tracing/rtla/src/timerlat.h b/tools/tracing/rtla/src/timerlat.h index 6f9d3925acd084..88561bfd14f3a7 100644 --- a/tools/tracing/rtla/src/timerlat.h +++ b/tools/tracing/rtla/src/timerlat.h @@ -1,4 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 - +int timerlat_hist_main(int argc, char *argv[]); int timerlat_top_main(int argc, char *argv[]); int timerlat_main(int argc, char *argv[]); diff --git a/tools/tracing/rtla/src/timerlat_hist.c b/tools/tracing/rtla/src/timerlat_hist.c new file mode 100644 index 00000000000000..235f9620ef3d6c --- /dev/null +++ b/tools/tracing/rtla/src/timerlat_hist.c @@ -0,0 +1,822 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Red Hat Inc, Daniel Bristot de Oliveira + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "osnoise.h" +#include "timerlat.h" + +struct timerlat_hist_params { + char *cpus; + char *monitored_cpus; + char *trace_output; + unsigned long long runtime; + long long stop_us; + long long stop_total_us; + long long timerlat_period_us; + long long print_stack; + int sleep_time; + int output_divisor; + int duration; + int set_sched; + struct sched_attr sched_param; + + char no_irq; + char no_thread; + char no_header; + char no_summary; + char no_index; + char with_zeros; + int bucket_size; + int entries; +}; + +struct timerlat_hist_cpu { + int *irq; + int *thread; + + int irq_count; + int thread_count; + + unsigned long long min_irq; + unsigned long long sum_irq; + unsigned long long max_irq; + + unsigned long long min_thread; + unsigned long long sum_thread; + unsigned long long max_thread; +}; + +struct timerlat_hist_data { + struct timerlat_hist_cpu *hist; + int entries; + int bucket_size; + int nr_cpus; +}; + +/* + * timerlat_free_histogram - free runtime data + */ +static void +timerlat_free_histogram(struct timerlat_hist_data *data) +{ + int cpu; + + /* one histogram for IRQ and one for thread, per CPU */ + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (data->hist[cpu].irq) + free(data->hist[cpu].irq); + + if (data->hist[cpu].thread) + free(data->hist[cpu].thread); + } + + /* one set of histograms per CPU */ + if (data->hist) + free(data->hist); + + free(data); +} + +/* + * timerlat_alloc_histogram - alloc runtime data + */ +static struct timerlat_hist_data +*timerlat_alloc_histogram(int nr_cpus, int entries, int bucket_size) +{ + struct timerlat_hist_data *data; + int cpu; + + data = calloc(1, sizeof(*data)); + if (!data) + return NULL; + + data->entries = entries; + data->bucket_size = bucket_size; + data->nr_cpus = nr_cpus; + + /* one set of histograms per CPU */ + data->hist = calloc(1, sizeof(*data->hist) * nr_cpus); + if (!data->hist) + goto cleanup; + + /* one histogram for IRQ and one for thread, per cpu */ + for (cpu = 0; cpu < nr_cpus; cpu++) { + data->hist[cpu].irq = calloc(1, sizeof(*data->hist->irq) * (entries + 1)); + if (!data->hist[cpu].irq) + goto cleanup; + data->hist[cpu].thread = calloc(1, sizeof(*data->hist->thread) * (entries + 1)); + if (!data->hist[cpu].thread) + goto cleanup; + } + + /* set the min to max */ + for (cpu = 0; cpu < nr_cpus; cpu++) { + data->hist[cpu].min_irq = ~0; + data->hist[cpu].min_thread = ~0; + } + + return data; + +cleanup: + timerlat_free_histogram(data); + return NULL; +} + +/* + * timerlat_hist_update - record a new timerlat occurent on cpu, updating data + */ +static void +timerlat_hist_update(struct osnoise_tool *tool, int cpu, + unsigned long long thread, + unsigned long long latency) +{ + struct timerlat_hist_params *params = tool->params; + struct timerlat_hist_data *data = tool->data; + int entries = data->entries; + int bucket; + int *hist; + + if (params->output_divisor) + latency = latency / params->output_divisor; + + if (data->bucket_size) + bucket = latency / data->bucket_size; + + if (!thread) { + hist = data->hist[cpu].irq; + data->hist[cpu].irq_count++; + update_min(&data->hist[cpu].min_irq, &latency); + update_sum(&data->hist[cpu].sum_irq, &latency); + update_max(&data->hist[cpu].max_irq, &latency); + } else { + hist = data->hist[cpu].thread; + data->hist[cpu].thread_count++; + update_min(&data->hist[cpu].min_thread, &latency); + update_sum(&data->hist[cpu].sum_thread, &latency); + update_max(&data->hist[cpu].max_thread, &latency); + } + + if (bucket < entries) + hist[bucket]++; + else + hist[entries]++; +} + +/* + * timerlat_hist_handler - this is the handler for timerlat tracer events + */ +static int +timerlat_hist_handler(struct trace_seq *s, struct tep_record *record, + struct tep_event *event, void *data) +{ + struct trace_instance *trace = data; + unsigned long long thread, latency; + struct osnoise_tool *tool; + int cpu = record->cpu; + + tool = container_of(trace, struct osnoise_tool, trace); + + tep_get_field_val(s, event, "context", record, &thread, 1); + tep_get_field_val(s, event, "timer_latency", record, &latency, 1); + + timerlat_hist_update(tool, cpu, thread, latency); + + return 0; +} + +/* + * timerlat_hist_header - print the header of the tracer to the output + */ +static void timerlat_hist_header(struct osnoise_tool *tool) +{ + struct timerlat_hist_params *params = tool->params; + struct timerlat_hist_data *data = tool->data; + struct trace_seq *s = tool->trace.seq; + char duration[26]; + int cpu; + + if (params->no_header) + return; + + get_duration(tool->start_time, duration, sizeof(duration)); + trace_seq_printf(s, "# RTLA timerlat histogram\n"); + trace_seq_printf(s, "# Time unit is %s (%s)\n", + params->output_divisor == 1 ? "nanoseconds" : "microseconds", + params->output_divisor == 1 ? "ns" : "us"); + + trace_seq_printf(s, "# Duration: %s\n", duration); + + if (!params->no_index) + trace_seq_printf(s, "Index"); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + if (!params->no_irq) + trace_seq_printf(s, " IRQ-%03d", cpu); + + if (!params->no_thread) + trace_seq_printf(s, " Thr-%03d", cpu); + } + trace_seq_printf(s, "\n"); + + + trace_seq_do_printf(s); + trace_seq_reset(s); +} + +/* + * timerlat_print_summary - print the summary of the hist data to the output + */ +static void +timerlat_print_summary(struct timerlat_hist_params *params, + struct trace_instance *trace, + struct timerlat_hist_data *data) +{ + int cpu; + + if (params->no_summary) + return; + + if (!params->no_index) + trace_seq_printf(trace->seq, "count:"); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9d ", + data->hist[cpu].irq_count); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9d ", + data->hist[cpu].thread_count); + } + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "min: "); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].min_thread); + } + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "avg: "); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + if (!params->no_irq) { + if (data->hist[cpu].irq_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].sum_irq / data->hist[cpu].irq_count); + else + trace_seq_printf(trace->seq, " - "); + } + + if (!params->no_thread) { + if (data->hist[cpu].thread_count) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].sum_thread / data->hist[cpu].thread_count); + else + trace_seq_printf(trace->seq, " - "); + } + } + trace_seq_printf(trace->seq, "\n"); + + if (!params->no_index) + trace_seq_printf(trace->seq, "max: "); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_irq); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9llu ", + data->hist[cpu].max_thread); + } + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); +} + +/* + * timerlat_print_stats - print data for all CPUs + */ +static void +timerlat_print_stats(struct timerlat_hist_params *params, struct osnoise_tool *tool) +{ + struct timerlat_hist_data *data = tool->data; + struct trace_instance *trace = &tool->trace; + int bucket, cpu; + int total; + + timerlat_hist_header(tool); + + for (bucket = 0; bucket < data->entries; bucket++) { + total = 0; + + if (!params->no_index) + trace_seq_printf(trace->seq, "%-6d", + bucket * data->bucket_size); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + if (!params->no_irq) { + total += data->hist[cpu].irq[bucket]; + trace_seq_printf(trace->seq, "%9d ", + data->hist[cpu].irq[bucket]); + } + + if (!params->no_thread) { + total += data->hist[cpu].thread[bucket]; + trace_seq_printf(trace->seq, "%9d ", + data->hist[cpu].thread[bucket]); + } + + } + + if (total == 0 && !params->with_zeros) { + trace_seq_reset(trace->seq); + continue; + } + + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); + } + + if (!params->no_index) + trace_seq_printf(trace->seq, "over: "); + + for (cpu = 0; cpu < data->nr_cpus; cpu++) { + if (params->cpus && !params->monitored_cpus[cpu]) + continue; + + if (!data->hist[cpu].irq_count && !data->hist[cpu].thread_count) + continue; + + if (!params->no_irq) + trace_seq_printf(trace->seq, "%9d ", + data->hist[cpu].irq[data->entries]); + + if (!params->no_thread) + trace_seq_printf(trace->seq, "%9d ", + data->hist[cpu].thread[data->entries]); + } + trace_seq_printf(trace->seq, "\n"); + trace_seq_do_printf(trace->seq); + trace_seq_reset(trace->seq); + + timerlat_print_summary(params, trace, data); +} + +/* + * timerlat_hist_usage - prints timerlat top usage message + */ +static void timerlat_hist_usage(char *usage) +{ + int i; + + char *msg[] = { + "", + " usage: [rtla] timerlat hist [-h] [-q] [-d s] [-D] [-n] [-p us] [-i us] [-T us] [-s us] [-t[=file]] \\", + " [-c cpu-list] [-P priority] [-e N] [-b N] [--no-irq] [--no-thread] [--no-header] [--no-summary] \\", + " [--no-index] [--with-zeros]", + "", + " -h/--help: print this menu", + " -p/--period us: timerlat period in us", + " -i/--irq us: stop trace if the irq latency is higher than the argument in us", + " -T/--thread us: stop trace if the thread latency is higher than the argument in us", + " -s/--stack us: save the stack trace at the IRQ if a thread latency is higher than the argument in us", + " -c/--cpus cpus: run the tracer only on the given cpus", + " -d/--duration time[m|h|d]: duration of the session in seconds", + " -D/--debug: print debug info", + " -T/--trace[=file]: save the stopped trace to [file|timerlat_trace.txt]", + " -n/--nano: display data in nanoseconds", + " -b/--bucket-size N: set the histogram bucket size (default 1)", + " -e/--entries N: set the number of entries of the histogram (default 256)", + " --no-irq: ignore IRQ latencies", + " --no-thread: ignore thread latencies", + " --no-header: do not print header", + " --no-summary: do not print summary", + " --no-index: do not print index", + " --with-zeros: print zero only entries", + " -P/--priority o:prio|r:prio|f:prio|d:runtime:period : set scheduling parameters", + " o:prio - use SCHED_OTHER with prio", + " r:prio - use SCHED_RR with prio", + " f:prio - use SCHED_FIFO with prio", + " d:runtime[us|ms|s]:period[us|ms|s] - use SCHED_DEADLINE with runtime and period", + " in nanoseconds", + NULL, + }; + + if (usage) + fprintf(stderr, "%s\n", usage); + + fprintf(stderr, "rtla timerlat hist: a per-cpu histogram of the timer latency (version %s)\n", + VERSION); + + for (i = 0; msg[i]; i++) + fprintf(stderr, "%s\n", msg[i]); + exit(1); +} + +/* + * timerlat_hist_parse_args - allocs, parse and fill the cmd line parameters + */ +static struct timerlat_hist_params +*timerlat_hist_parse_args(int argc, char *argv[]) +{ + struct timerlat_hist_params *params; + int retval; + int c; + + params = calloc(1, sizeof(*params)); + if (!params) + exit(1); + + /* display data in microseconds */ + params->output_divisor = 1000; + params->bucket_size = 1; + params->entries = 256; + + while (1) { + static struct option long_options[] = { + {"cpus", required_argument, 0, 'c'}, + {"bucket-size", required_argument, 0, 'b'}, + {"debug", no_argument, 0, 'D'}, + {"entries", required_argument, 0, 'e'}, + {"duration", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + {"irq", required_argument, 0, 'i'}, + {"nano", no_argument, 0, 'n'}, + {"period", required_argument, 0, 'p'}, + {"priority", required_argument, 0, 'P'}, + {"stack", required_argument, 0, 's'}, + {"thread", required_argument, 0, 'T'}, + {"trace", optional_argument, 0, 't'}, + {"no-irq", no_argument, 0, '0'}, + {"no-thread", no_argument, 0, '1'}, + {"no-header", no_argument, 0, '2'}, + {"no-summary", no_argument, 0, '3'}, + {"no-index", no_argument, 0, '4'}, + {"with-zeros", no_argument, 0, '5'}, + {0, 0, 0, 0} + }; + + /* getopt_long stores the option index here. */ + int option_index = 0; + + c = getopt_long(argc, argv, "c:b:d:e:Dhi:np:P:s:t::T:012345", + long_options, &option_index); + + /* detect the end of the options. */ + if (c == -1) + break; + + switch (c) { + case 'c': + retval = parse_cpu_list(optarg, ¶ms->monitored_cpus); + if (retval) + timerlat_hist_usage("\nInvalid -c cpu list\n"); + params->cpus = optarg; + break; + case 'b': + params->bucket_size = get_llong_from_str(optarg); + if ((params->bucket_size == 0) || (params->bucket_size >= 1000000)) + timerlat_hist_usage("Bucket size needs to be > 0 and <= 1000000\n"); + break; + case 'D': + config_debug = 1; + break; + case 'd': + params->duration = parse_seconds_duration(optarg); + if (!params->duration) + timerlat_hist_usage("Invalid -D duration\n"); + break; + case 'e': + params->entries = get_llong_from_str(optarg); + if ((params->entries < 10) || (params->entries > 9999999)) + timerlat_hist_usage("Entries must be > 10 and < 9999999\n"); + break; + case 'h': + case '?': + timerlat_hist_usage(NULL); + break; + case 'i': + params->stop_us = get_llong_from_str(optarg); + break; + case 'n': + params->output_divisor = 1; + break; + case 'p': + params->timerlat_period_us = get_llong_from_str(optarg); + if (params->timerlat_period_us > 1000000) + timerlat_hist_usage("Period longer than 1 s\n"); + break; + case 'P': + retval = parse_prio(optarg, ¶ms->sched_param); + if (retval == -1) + timerlat_hist_usage("Invalid -P priority"); + params->set_sched = 1; + break; + case 's': + params->print_stack = get_llong_from_str(optarg); + break; + case 'T': + params->stop_total_us = get_llong_from_str(optarg); + break; + case 't': + if (optarg) + /* skip = */ + params->trace_output = &optarg[1]; + else + params->trace_output = "timerlat_trace.txt"; + break; + case '0': /* no irq */ + params->no_irq = 1; + break; + case '1': /* no thread */ + params->no_thread = 1; + break; + case '2': /* no header */ + params->no_header = 1; + break; + case '3': /* no summary */ + params->no_summary = 1; + break; + case '4': /* no index */ + params->no_index = 1; + break; + case '5': /* with zeros */ + params->with_zeros = 1; + break; + default: + timerlat_hist_usage("Invalid option"); + } + } + + if (geteuid()) { + err_msg("rtla needs root permission\n"); + exit(EXIT_FAILURE); + } + + if (params->no_irq && params->no_thread) + timerlat_hist_usage("no-irq and no-thread set, there is nothing to do here"); + + if (params->no_index && !params->with_zeros) + timerlat_hist_usage("no-index set with with-zeros is not set - it does not make sense"); + + return params; +} + +/* + * timerlat_hist_apply_config - apply the hist configs to the initialized tool + */ +static int +timerlat_hist_apply_config(struct osnoise_tool *tool, struct timerlat_hist_params *params) +{ + int retval; + + if (!params->sleep_time) + params->sleep_time = 1; + + if (params->cpus) { + retval = osnoise_set_cpus(tool->context, params->cpus); + if (retval) { + err_msg("Failed to apply CPUs config\n"); + goto out_err; + } + } + + if (params->stop_us) { + retval = osnoise_set_stop_us(tool->context, params->stop_us); + if (retval) { + err_msg("Failed to set stop us\n"); + goto out_err; + } + } + + if (params->stop_total_us) { + retval = osnoise_set_stop_total_us(tool->context, params->stop_total_us); + if (retval) { + err_msg("Failed to set stop total us\n"); + goto out_err; + } + } + + if (params->timerlat_period_us) { + retval = osnoise_set_timerlat_period_us(tool->context, params->timerlat_period_us); + if (retval) { + err_msg("Failed to set timerlat period\n"); + goto out_err; + } + } + + if (params->print_stack) { + retval = osnoise_set_print_stack(tool->context, params->print_stack); + if (retval) { + err_msg("Failed to set print stack\n"); + goto out_err; + } + } + + return 0; + +out_err: + return -1; +} + +/* + * timerlat_init_hist - initialize a timerlat hist tool with parameters + */ +static struct osnoise_tool +*timerlat_init_hist(struct timerlat_hist_params *params) +{ + struct osnoise_tool *tool; + int nr_cpus; + + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + + tool = osnoise_init_tool("timerlat_hist"); + if (!tool) + return NULL; + + tool->data = timerlat_alloc_histogram(nr_cpus, params->entries, params->bucket_size); + if (!tool->data) + goto out_err; + + tool->params = params; + + tep_register_event_handler(tool->trace.tep, -1, "ftrace", "timerlat", + timerlat_hist_handler, tool); + + return tool; + +out_err: + osnoise_destroy_tool(tool); + return NULL; +} + +static int stop_tracing; +static void stop_hist(int sig) +{ + stop_tracing = 1; +} + +/* + * timerlat_hist_set_signals - handles the signal to stop the tool + */ +static void +timerlat_hist_set_signals(struct timerlat_hist_params *params) +{ + signal(SIGINT, stop_hist); + if (params->duration) { + signal(SIGALRM, stop_hist); + alarm(params->duration); + } +} + +int timerlat_hist_main(int argc, char *argv[]) +{ + struct timerlat_hist_params *params; + struct trace_instance *trace; + struct osnoise_tool *record; + struct osnoise_tool *tool; + int return_value = 1; + int retval; + + params = timerlat_hist_parse_args(argc, argv); + if (!params) + exit(1); + + tool = timerlat_init_hist(params); + if (!tool) { + err_msg("Could not init osnoise hist\n"); + goto out_exit; + } + + retval = timerlat_hist_apply_config(tool, params); + if (retval) { + err_msg("Could not apply config\n"); + goto out_hist; + } + + trace = &tool->trace; + + retval = enable_timerlat(trace); + if (retval) { + err_msg("Failed to enable timerlat tracer\n"); + goto out_hist; + } + + if (params->set_sched) { + retval = set_comm_sched_attr("timerlat/", ¶ms->sched_param); + if (retval) { + err_msg("Failed to set sched parameters\n"); + goto out_hist; + } + } + + trace_instance_start(trace); + + if (params->trace_output) { + record = osnoise_init_trace_tool("timerlat"); + if (!record) { + err_msg("Failed to enable the trace instance\n"); + goto out_hist; + } + trace_instance_start(&record->trace); + } + + tool->start_time = time(NULL); + timerlat_hist_set_signals(params); + + while (!stop_tracing) { + sleep(params->sleep_time); + + retval = tracefs_iterate_raw_events(trace->tep, + trace->inst, + NULL, + 0, + collect_registered_events, + trace); + if (retval < 0) { + err_msg("Error iterating on events\n"); + goto out_hist; + } + + if (!tracefs_trace_is_on(trace->inst)) + break; + }; + + timerlat_print_stats(params, tool); + + return_value = 0; + + if (!tracefs_trace_is_on(trace->inst)) { + printf("rtla timelat hit stop tracing\n"); + if (params->trace_output) { + printf(" Saving trace to %s\n", params->trace_output); + save_trace_to_file(record->trace.inst, params->trace_output); + } + } + +out_hist: + timerlat_free_histogram(tool->data); + osnoise_destroy_tool(tool); + if (params->trace_output) + osnoise_destroy_tool(record); + free(params); +out_exit: + exit(return_value); +} From d40d48e1f1f25abe755521e33a67f5d2b8b9443c Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:27 +0100 Subject: [PATCH 43/50] rtla: Add Documentation Adds the basis for rtla documentation. This patch also includes the rtla(1) man page. As suggested by Jonathan Corbet, we are placing these man pages at Documentation/tools/rtla, using rst format. It is not linked to the official documentation, though. The Makefile is based on bpftool's Documentation one. Link: https://lkml.kernel.org/r/5f510f3e962fc0cd531c43f5a815544dd720c3f2.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Steven Rostedt Suggested-by: Jonathan Corbet Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- Documentation/tools/rtla/Makefile | 41 +++++++++++++++++ Documentation/tools/rtla/common_appendix.rst | 12 +++++ Documentation/tools/rtla/rtla.rst | 48 ++++++++++++++++++++ tools/tracing/rtla/Makefile | 30 ++++++++++-- 4 files changed, 127 insertions(+), 4 deletions(-) create mode 100644 Documentation/tools/rtla/Makefile create mode 100644 Documentation/tools/rtla/common_appendix.rst create mode 100644 Documentation/tools/rtla/rtla.rst diff --git a/Documentation/tools/rtla/Makefile b/Documentation/tools/rtla/Makefile new file mode 100644 index 00000000000000..9f2b84af1a6c77 --- /dev/null +++ b/Documentation/tools/rtla/Makefile @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Based on bpftool's Documentation Makefile + +INSTALL ?= install +RM ?= rm -f +RMDIR ?= rmdir --ignore-fail-on-non-empty + +PREFIX ?= /usr/share +MANDIR ?= $(PREFIX)/man +MAN1DIR = $(MANDIR)/man1 + +MAN1_RST = $(wildcard rtla*.rst) + +_DOC_MAN1 = $(patsubst %.rst,%.1,$(MAN1_RST)) +DOC_MAN1 = $(addprefix $(OUTPUT),$(_DOC_MAN1)) + +RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) +RST2MAN_OPTS += --verbose + +$(OUTPUT)%.1: %.rst +ifndef RST2MAN_DEP + $(error "rst2man not found, but required to generate man pages") +endif + rst2man $(RST2MAN_OPTS) $< > $@ + +man1: $(DOC_MAN1) +man: man1 + +clean: + $(RM) $(DOC_MAN1) + +install: man + $(INSTALL) -d -m 755 $(DESTDIR)$(MAN1DIR) + $(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(MAN1DIR) + +uninstall: + $(RM) $(addprefix $(DESTDIR)$(MAN1DIR)/,$(_DOC_MAN1)) + $(RMDIR) $(DESTDIR)$(MAN1DIR) + +.PHONY: man man1 clean install uninstall +.DEFAULT_GOAL := man diff --git a/Documentation/tools/rtla/common_appendix.rst b/Documentation/tools/rtla/common_appendix.rst new file mode 100644 index 00000000000000..b494084acccdb0 --- /dev/null +++ b/Documentation/tools/rtla/common_appendix.rst @@ -0,0 +1,12 @@ +REPORTING BUGS +============== +Report bugs to + +LICENSE +======= +**rtla** is Free Software licensed under the GNU GPLv2 + +COPYING +======= +Copyright \(C) 2021 Red Hat, Inc. Free use of this software is granted under +the terms of the GNU Public License (GPL). diff --git a/Documentation/tools/rtla/rtla.rst b/Documentation/tools/rtla/rtla.rst new file mode 100644 index 00000000000000..fc0d233efcd5df --- /dev/null +++ b/Documentation/tools/rtla/rtla.rst @@ -0,0 +1,48 @@ +========= +rtla +========= +-------------------------------- +Real-time Linux Analysis tool +-------------------------------- + +:Manual section: 1 + +SYNOPSIS +======== +**rtla** *COMMAND* [*OPTIONS*] + +DESCRIPTION +=========== +The **rtla** is a meta-tool that includes a set of commands that aims to +analyze the real-time properties of Linux. But instead of testing Linux +as a black box, **rtla** leverages kernel tracing capabilities to provide +precise information about the properties and root causes of unexpected +results. + +COMMANDS +======== +**osnoise** + + Gives information about the operating system noise (osnoise). + +**timerlat** + + Measures the IRQ and thread timer latency. + +OPTIONS +======= +**-h**, **--help** + + Display the help text. + +For other options, see the man page for the corresponding command. + +SEE ALSO +======== +**rtla-osnoise**\(1), **rtla-timerlat**\(1) + +AUTHOR +====== +Daniel Bristot de Oliveira + +.. include:: common_appendix.rst diff --git a/tools/tracing/rtla/Makefile b/tools/tracing/rtla/Makefile index 7a7348faec3258..2d52ff0bff7dad 100644 --- a/tools/tracing/rtla/Makefile +++ b/tools/tracing/rtla/Makefile @@ -45,18 +45,28 @@ DATADIR := /usr/share DOCDIR := $(DATADIR)/doc MANDIR := $(DATADIR)/man LICDIR := $(DATADIR)/licenses +SRCTREE := $(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR)) + +# If running from the tarball, man pages are stored in the Documentation +# dir. If running from the kernel source, man pages are stored in +# Documentation/tools/rtla/. +ifneq ($(wildcard Documentation/.*),) +DOCSRC = Documentation/ +else +DOCSRC = $(SRCTREE)/../../../Documentation/tools/rtla/ +endif .PHONY: all all: rtla -rtla: $(OBJ) +rtla: $(OBJ) doc $(CC) -o rtla $(LDFLAGS) $(OBJ) $(LIBS) static: $(OBJ) $(CC) -o rtla-static $(LDFLAGS) --static $(OBJ) $(LIBS) -lpthread -ldl .PHONY: install -install: +install: doc_install $(INSTALL) -d -m 755 $(DESTDIR)$(BINDIR) $(INSTALL) rtla -m 755 $(DESTDIR)$(BINDIR) $(STRIP) $(DESTDIR)$(BINDIR)/rtla @@ -66,15 +76,27 @@ install: ln -s $(DESTDIR)$(BINDIR)/rtla $(DESTDIR)$(BINDIR)/timerlat .PHONY: clean tarball -clean: +clean: doc_clean @test ! -f rtla || rm rtla @test ! -f rtla-static || rm rtla-static @test ! -f src/rtla.o || rm src/rtla.o @test ! -f $(TARBALL) || rm -f $(TARBALL) @rm -rf *~ $(OBJ) *.tar.$(CEXT) -tarball: clean +tarball: clean rm -rf $(NAME)-$(VERSION) && mkdir $(NAME)-$(VERSION) cp -r $(DIRS) $(FILES) $(NAME)-$(VERSION) + mkdir $(NAME)-$(VERSION)/Documentation/ + cp -rp $(SRCTREE)/../../../Documentation/tools/rtla/* $(NAME)-$(VERSION)/Documentation/ tar $(TAROPTS) --exclude='*~' $(NAME)-$(VERSION) rm -rf $(NAME)-$(VERSION) + +.PHONY: doc doc_clean doc_install +doc: + $(MAKE) -C $(DOCSRC) + +doc_clean: + $(MAKE) -C $(DOCSRC) clean + +doc_install: + $(MAKE) -C $(DOCSRC) install From 496082df01bb08a40f9207d918734cf908c5f107 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:28 +0100 Subject: [PATCH 44/50] rtla: Add rtla osnoise man page Man page for rtla osnoise command. Link: https://lkml.kernel.org/r/304747c602e46f6396e3bb75dfdcb42ae3656dca.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- .../tools/rtla/common_osnoise_description.rst | 8 +++ Documentation/tools/rtla/rtla-osnoise.rst | 59 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 Documentation/tools/rtla/common_osnoise_description.rst create mode 100644 Documentation/tools/rtla/rtla-osnoise.rst diff --git a/Documentation/tools/rtla/common_osnoise_description.rst b/Documentation/tools/rtla/common_osnoise_description.rst new file mode 100644 index 00000000000000..8973c5df888f63 --- /dev/null +++ b/Documentation/tools/rtla/common_osnoise_description.rst @@ -0,0 +1,8 @@ +The **rtla osnoise** tool is an interface for the *osnoise* tracer. The +*osnoise* tracer dispatches a kernel thread per-cpu. These threads read the +time in a loop while with preemption, softirq and IRQs enabled, thus +allowing all the sources of operating systme noise during its execution. +The *osnoise*'s tracer threads take note of the delta between each time +read, along with an interference counter of all sources of interference. +At the end of each period, the *osnoise* tracer displays a summary of +the results. diff --git a/Documentation/tools/rtla/rtla-osnoise.rst b/Documentation/tools/rtla/rtla-osnoise.rst new file mode 100644 index 00000000000000..c129b206ce3484 --- /dev/null +++ b/Documentation/tools/rtla/rtla-osnoise.rst @@ -0,0 +1,59 @@ +=============== +rtla-osnoise +=============== +------------------------------------------------------------------ +Measure the operating system noise +------------------------------------------------------------------ + +:Manual section: 1 + +SYNOPSIS +======== +**rtla osnoise** [*MODE*] ... + +DESCRIPTION +=========== + +.. include:: common_osnoise_description.rst + +The *osnoise* tracer outputs information in two ways. It periodically prints +a summary of the noise of the operating system, including the counters of +the occurrence of the source of interference. It also provides information +for each noise via the **osnoise:** tracepoints. The **rtla osnoise top** +mode displays information about the periodic summary from the *osnoise* tracer. +The **rtla osnoise hist** mode displays information about the noise using +the **osnoise:** tracepoints. For further details, please refer to the +respective man page. + +MODES +===== +**top** + + Prints the summary from osnoise tracer. + +**hist** + + Prints a histogram of osnoise samples. + +If no MODE is given, the top mode is called, passing the arguments. + +OPTIONS +======= + +**-h**, **--help** + + Display the help text. + +For other options, see the man page for the corresponding mode. + +SEE ALSO +======== +**rtla-osnoise-top**\(1), **rtla-osnoise-hist**\(1) + +Osnoise tracer documentation: + +AUTHOR +====== +Written by Daniel Bristot de Oliveira + +.. include:: common_appendix.rst From b1be48307de4ccb315036c58bb1363a93c27d188 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:29 +0100 Subject: [PATCH 45/50] rtla: Add rtla osnoise top documentation Man page for rtla osnoise top mode. Link: https://lkml.kernel.org/r/445aa2173ca152fc2e68719e3c1a2547dd01efd3.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- Documentation/tools/rtla/common_options.rst | 28 +++++++++ .../tools/rtla/common_osnoise_options.rst | 17 ++++++ .../tools/rtla/common_top_options.rst | 3 + Documentation/tools/rtla/rtla-osnoise-top.rst | 61 +++++++++++++++++++ 4 files changed, 109 insertions(+) create mode 100644 Documentation/tools/rtla/common_options.rst create mode 100644 Documentation/tools/rtla/common_osnoise_options.rst create mode 100644 Documentation/tools/rtla/common_top_options.rst create mode 100644 Documentation/tools/rtla/rtla-osnoise-top.rst diff --git a/Documentation/tools/rtla/common_options.rst b/Documentation/tools/rtla/common_options.rst new file mode 100644 index 00000000000000..721790ad984eb0 --- /dev/null +++ b/Documentation/tools/rtla/common_options.rst @@ -0,0 +1,28 @@ +**-c**, **--cpus** *cpu-list* + + Set the osnoise tracer to run the sample threads in the cpu-list. + +**-d**, **--duration** *time[s|m|h|d]* + + Set the duration of the session. + +**-D**, **--debug** + + Print debug info. + +**-t**, **--trace**\[*=file*] + + Save the stopped trace to [*file|osnoise_trace.txt*]. + +**-P**, **--priority** *o:prio|r:prio|f:prio|d:runtime:period* + + Set scheduling parameters to the osnoise tracer threads, the format to set the priority are: + + - *o:prio* - use SCHED_OTHER with *prio*; + - *r:prio* - use SCHED_RR with *prio*; + - *f:prio* - use SCHED_FIFO with *prio*; + - *d:runtime[us|ms|s]:period[us|ms|s]* - use SCHED_DEADLINE with *runtime* and *period* in nanoseconds. + +**-h**, **--help** + + Print help menu. diff --git a/Documentation/tools/rtla/common_osnoise_options.rst b/Documentation/tools/rtla/common_osnoise_options.rst new file mode 100644 index 00000000000000..d556883e4e2637 --- /dev/null +++ b/Documentation/tools/rtla/common_osnoise_options.rst @@ -0,0 +1,17 @@ +**-p**, **--period** *us* + + Set the *osnoise* tracer period in microseconds. + +**-r**, **--runtime** *us* + + Set the *osnoise* tracer runtime in microseconds. + +**-s**, **--stop** *us* + + Stop the trace if a single sample is higher than the argument in microseconds. + If **-T** is set, it will also save the trace to the output. + +**-S**, **--stop-total** *us* + + Stop the trace if the total sample is higher than the argument in microseconds. + If **-T** is set, it will also save the trace to the output. diff --git a/Documentation/tools/rtla/common_top_options.rst b/Documentation/tools/rtla/common_top_options.rst new file mode 100644 index 00000000000000..f48878938f842c --- /dev/null +++ b/Documentation/tools/rtla/common_top_options.rst @@ -0,0 +1,3 @@ +**-q**, **--quiet** + + Print only a summary at the end of the session. diff --git a/Documentation/tools/rtla/rtla-osnoise-top.rst b/Documentation/tools/rtla/rtla-osnoise-top.rst new file mode 100644 index 00000000000000..5d75d1394516fd --- /dev/null +++ b/Documentation/tools/rtla/rtla-osnoise-top.rst @@ -0,0 +1,61 @@ +=================== +rtla-osnoise-top +=================== +----------------------------------------------- +Display a summary of the operating system noise +----------------------------------------------- + +:Manual section: 1 + +SYNOPSIS +======== +**rtla osnoise top** [*OPTIONS*] + +DESCRIPTION +=========== +.. include:: common_osnoise_description.rst + +**rtla osnoise top** collects the periodic summary from the *osnoise* tracer, +including the counters of the occurrence of the interference source, +displaying the results in a user-friendly format. + +The tool also allows many configurations of the *osnoise* tracer and the +collection of the tracer output. + +OPTIONS +======= +.. include:: common_osnoise_options.rst + +.. include:: common_top_options.rst + +.. include:: common_options.rst + +EXAMPLE +======= +In the example below, the **rtla osnoise top** tool is set to run with a +real-time priority *FIFO:1*, on CPUs *0-3*, for *900ms* at each period +(*1s* by default). The reason for reducing the runtime is to avoid starving +the rtla tool. The tool is also set to run for *one minute* and to display +a summary of the report at the end of the session:: + + [root@f34 ~]# rtla osnoise top -P F:1 -c 0-3 -r 900000 -d 1M -q + Operating System Noise + duration: 0 00:01:00 | time is in us + CPU Period Runtime Noise % CPU Aval Max Noise Max Single HW NMI IRQ Softirq Thread + 0 #59 53100000 304896 99.42580 6978 56 549 0 53111 1590 13 + 1 #59 53100000 338339 99.36282 8092 24 399 0 53130 1448 31 + 2 #59 53100000 290842 99.45227 6582 39 855 0 53110 1406 12 + 3 #59 53100000 204935 99.61405 6251 33 290 0 53156 1460 12 + +SEE ALSO +======== + +**rtla-osnoise**\(1), **rtla-osnoise-hist**\(1) + +Osnoise tracer documentation: + +AUTHOR +====== +Written by Daniel Bristot de Oliveira + +.. include:: common_appendix.rst From e7041c6b3c123ef2160ff7a948e458f8241e4e03 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:30 +0100 Subject: [PATCH 46/50] rtla: Add rtla osnoise hist documentation Man page for rtla osnoise hist mode. Link: https://lkml.kernel.org/r/d43cc5d516b8db180575c4b66c4aa67c6a724a8d.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- .../tools/rtla/common_hist_options.rst | 23 +++++++ .../tools/rtla/rtla-osnoise-hist.rst | 66 +++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 Documentation/tools/rtla/common_hist_options.rst create mode 100644 Documentation/tools/rtla/rtla-osnoise-hist.rst diff --git a/Documentation/tools/rtla/common_hist_options.rst b/Documentation/tools/rtla/common_hist_options.rst new file mode 100644 index 00000000000000..0266cd08a6c929 --- /dev/null +++ b/Documentation/tools/rtla/common_hist_options.rst @@ -0,0 +1,23 @@ +**-b**, **--bucket-size** *N* + + Set the histogram bucket size (default *1*). + +**-e**, **--entries** *N* + + Set the number of entries of the histogram (default 256). + +**--no-header** + + Do not print header. + +**--no-summary** + + Do not print summary. + +**--no-index** + + Do not print index. + +**--with-zeros** + + Print zero only entries. diff --git a/Documentation/tools/rtla/rtla-osnoise-hist.rst b/Documentation/tools/rtla/rtla-osnoise-hist.rst new file mode 100644 index 00000000000000..52298ddd8701b2 --- /dev/null +++ b/Documentation/tools/rtla/rtla-osnoise-hist.rst @@ -0,0 +1,66 @@ +=================== +rtla-osnoise-hist +=================== +------------------------------------------------------ +Display a histogram of the osnoise tracer samples +------------------------------------------------------ + +:Manual section: 1 + +SYNOPSIS +======== +**rtla osnoise hist** [*OPTIONS*] + +DESCRIPTION +=========== +.. include:: common_osnoise_description.rst + +The **rtla osnoise hist** tool collects all **osnoise:sample_threshold** +occurrence in a histogram, displaying the results in a user-friendly way. +The tool also allows many configurations of the *osnoise* tracer and the +collection of the tracer output. + +OPTIONS +======= +.. include:: common_osnoise_options.rst + +.. include:: common_hist_options.rst + +.. include:: common_options.rst + +EXAMPLE +======= +In the example below, *osnoise* tracer threads are set to run with real-time +priority *FIFO:1*, on CPUs *0-11*, for *900ms* at each period (*1s* by +default). The reason for reducing the runtime is to avoid starving the +**rtla** tool. The tool is also set to run for *one minute*. The output +histogram is set to group outputs in buckets of *10us* and *25* entries:: + + [root@f34 ~/]# rtla osnoise hist -P F:1 -c 0-11 -r 900000 -d 1M -b 10 -e 25 + # RTLA osnoise histogram + # Time unit is microseconds (us) + # Duration: 0 00:01:00 + Index CPU-000 CPU-001 CPU-002 CPU-003 CPU-004 CPU-005 CPU-006 CPU-007 CPU-008 CPU-009 CPU-010 CPU-011 + 0 42982 46287 51779 53740 52024 44817 49898 36500 50408 50128 49523 52377 + 10 12224 8356 2912 878 2667 10155 4573 18894 4214 4836 5708 2413 + 20 8 5 12 2 13 24 20 41 29 53 39 39 + 30 1 1 0 0 10 3 6 19 15 31 30 38 + 40 0 0 0 0 0 4 2 7 2 3 8 11 + 50 0 0 0 0 0 0 0 0 0 1 1 2 + over: 0 0 0 0 0 0 0 0 0 0 0 0 + count: 55215 54649 54703 54620 54714 55003 54499 55461 54668 55052 55309 54880 + min: 0 0 0 0 0 0 0 0 0 0 0 0 + avg: 0 0 0 0 0 0 0 0 0 0 0 0 + max: 30 30 20 20 30 40 40 40 40 50 50 50 + +SEE ALSO +======== +**rtla-osnoise**\(1), **rtla-osnoise-top**\(1) + +*osnoise* tracer documentation: + +AUTHOR +====== +Written by Daniel Bristot de Oliveira + +.. include:: common_appendix.rst From 29380d4055e524de918c76a861df17d14a4b7224 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:31 +0100 Subject: [PATCH 47/50] rtla: Add rtla timerlat documentation Man page for rtla timerlat tool. Link: https://lkml.kernel.org/r/78678b8d024bf5a3a79f831ac9441b96e8d2f56e.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- .../rtla/common_timerlat_description.rst | 10 ++++ Documentation/tools/rtla/rtla-timerlat.rst | 57 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 Documentation/tools/rtla/common_timerlat_description.rst create mode 100644 Documentation/tools/rtla/rtla-timerlat.rst diff --git a/Documentation/tools/rtla/common_timerlat_description.rst b/Documentation/tools/rtla/common_timerlat_description.rst new file mode 100644 index 00000000000000..321201cb859789 --- /dev/null +++ b/Documentation/tools/rtla/common_timerlat_description.rst @@ -0,0 +1,10 @@ +The **rtla timerlat** tool is an interface for the *timerlat* tracer. The +*timerlat* tracer dispatches a kernel thread per-cpu. These threads +set a periodic timer to wake themselves up and go back to sleep. After +the wakeup, they collect and generate useful information for the +debugging of operating system timer latency. + +The *timerlat* tracer outputs information in two ways. It periodically +prints the timer latency at the timer *IRQ* handler and the *Thread* +handler. It also enable the trace of the most relevant information via +**osnoise:** tracepoints. diff --git a/Documentation/tools/rtla/rtla-timerlat.rst b/Documentation/tools/rtla/rtla-timerlat.rst new file mode 100644 index 00000000000000..44a49e6f302b8d --- /dev/null +++ b/Documentation/tools/rtla/rtla-timerlat.rst @@ -0,0 +1,57 @@ +================ +rtla-timerlat +================ +------------------------------------------- +Measures the operating system timer latency +------------------------------------------- + +:Manual section: 1 + +SYNOPSIS +======== +**rtla timerlat** [*MODE*] ... + +DESCRIPTION +=========== + +.. include:: common_timerlat_description.rst + +The *timerlat* tracer outputs information in two ways. It periodically +prints the timer latency at the timer *IRQ* handler and the *Thread* handler. +It also provides information for each noise via the **osnoise:** tracepoints. +The **rtla timerlat top** mode displays a summary of the periodic output +from the *timerlat* tracer. The **rtla hist hist** mode displays a histogram +of each tracer event occurrence. For further details, please refer to the +respective man page. + +MODES +===== +**top** + + Prints the summary from *timerlat* tracer. + +**hist** + + Prints a histogram of timerlat samples. + +If no *MODE* is given, the top mode is called, passing the arguments. + +OPTIONS +======= +**-h**, **--help** + + Display the help text. + +For other options, see the man page for the corresponding mode. + +SEE ALSO +======== +**rtla-timerlat-top**\(1), **rtla-timerlat-hist**\(1) + +*timerlat* tracer documentation: + +AUTHOR +====== +Written by Daniel Bristot de Oliveira + +.. include:: common_appendix.rst From df337d014b573fe567cd190753a4f5fa69281870 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:32 +0100 Subject: [PATCH 48/50] rtla: Add rtla timerlat top documentation Man page for rtla timerlat top mode. Link: https://lkml.kernel.org/r/58c3d6212e6c6f1f012deb2e998dd082da92075f.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- .../tools/rtla/common_timerlat_options.rst | 16 ++ .../tools/rtla/rtla-timerlat-top.rst | 145 ++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 Documentation/tools/rtla/common_timerlat_options.rst create mode 100644 Documentation/tools/rtla/rtla-timerlat-top.rst diff --git a/Documentation/tools/rtla/common_timerlat_options.rst b/Documentation/tools/rtla/common_timerlat_options.rst new file mode 100644 index 00000000000000..e9c1bfd55d48d5 --- /dev/null +++ b/Documentation/tools/rtla/common_timerlat_options.rst @@ -0,0 +1,16 @@ +**-p**, **--period** *us* + + Set the *timerlat* tracer period in microseconds. + +**-i**, **--irq** *us* + + Stop trace if the *IRQ* latency is higher than the argument in us. + +**-T**, **--thread** *us* + + Stop trace if the *Thread* latency is higher than the argument in us. + +**-s**, **--stack** *us* + + Save the stack trace at the *IRQ* if a *Thread* latency is higher than the + argument in us. diff --git a/Documentation/tools/rtla/rtla-timerlat-top.rst b/Documentation/tools/rtla/rtla-timerlat-top.rst new file mode 100644 index 00000000000000..1c321de1c171ef --- /dev/null +++ b/Documentation/tools/rtla/rtla-timerlat-top.rst @@ -0,0 +1,145 @@ +==================== +rtla-timerlat-top +==================== +------------------------------------------- +Measures the operating system timer latency +------------------------------------------- + +:Manual section: 1 + +SYNOPSIS +======== +**rtla timerlat top** [*OPTIONS*] ... + +DESCRIPTION +=========== + +.. include:: common_timerlat_description.rst + +The **rtla timerlat top** displays a summary of the periodic output +from the *timerlat* tracer. It also provides information for each +operating system noise via the **osnoise:** tracepoints that can be +seem with the option **-T**. + +OPTIONS +======= + +.. include:: common_timerlat_options.rst + +.. include:: common_top_options.rst + +.. include:: common_options.rst + +EXAMPLE +======= + +In the example below, the *timerlat* tracer is set to capture the stack trace at +the IRQ handler, printing it to the buffer if the *Thread* timer latency is +higher than *30 us*. It is also set to stop the session if a *Thread* timer +latency higher than *30 us* is hit. Finally, it is set to save the trace +buffer if the stop condition is hit:: + + [root@alien ~]# rtla timerlat top -s 30 -t 30 -T + Timer Latency + 0 00:00:59 | IRQ Timer Latency (us) | Thread Timer Latency (us) + CPU COUNT | cur min avg max | cur min avg max + 0 #58634 | 1 0 1 10 | 11 2 10 23 + 1 #58634 | 1 0 1 9 | 12 2 9 23 + 2 #58634 | 0 0 1 11 | 10 2 9 23 + 3 #58634 | 1 0 1 11 | 11 2 9 24 + 4 #58634 | 1 0 1 10 | 11 2 9 26 + 5 #58634 | 1 0 1 8 | 10 2 9 25 + 6 #58634 | 12 0 1 12 | 30 2 10 30 <--- CPU with spike + 7 #58634 | 1 0 1 9 | 11 2 9 23 + 8 #58633 | 1 0 1 9 | 11 2 9 26 + 9 #58633 | 1 0 1 9 | 10 2 9 26 + 10 #58633 | 1 0 1 13 | 11 2 9 28 + 11 #58633 | 1 0 1 13 | 12 2 9 24 + 12 #58633 | 1 0 1 8 | 10 2 9 23 + 13 #58633 | 1 0 1 10 | 10 2 9 22 + 14 #58633 | 1 0 1 18 | 12 2 9 27 + 15 #58633 | 1 0 1 10 | 11 2 9 28 + 16 #58633 | 0 0 1 11 | 7 2 9 26 + 17 #58633 | 1 0 1 13 | 10 2 9 24 + 18 #58633 | 1 0 1 9 | 13 2 9 22 + 19 #58633 | 1 0 1 10 | 11 2 9 23 + 20 #58633 | 1 0 1 12 | 11 2 9 28 + 21 #58633 | 1 0 1 14 | 11 2 9 24 + 22 #58633 | 1 0 1 8 | 11 2 9 22 + 23 #58633 | 1 0 1 10 | 11 2 9 27 + timerlat hit stop tracing + saving trace to timerlat_trace.txt + [root@alien bristot]# tail -60 timerlat_trace.txt + [...] + timerlat/5-79755 [005] ....... 426.271226: #58634 context thread timer_latency 10823 ns + sh-109404 [006] dnLh213 426.271247: #58634 context irq timer_latency 12505 ns + sh-109404 [006] dNLh313 426.271258: irq_noise: local_timer:236 start 426.271245463 duration 12553 ns + sh-109404 [006] d...313 426.271263: thread_noise: sh:109404 start 426.271245853 duration 4769 ns + timerlat/6-79756 [006] ....... 426.271264: #58634 context thread timer_latency 30328 ns + timerlat/6-79756 [006] ....1.. 426.271265: + => timerlat_irq + => __hrtimer_run_queues + => hrtimer_interrupt + => __sysvec_apic_timer_interrupt + => sysvec_apic_timer_interrupt + => asm_sysvec_apic_timer_interrupt + => _raw_spin_unlock_irqrestore <---- spinlock that disabled interrupt. + => try_to_wake_up + => autoremove_wake_function + => __wake_up_common + => __wake_up_common_lock + => ep_poll_callback + => __wake_up_common + => __wake_up_common_lock + => fsnotify_add_event + => inotify_handle_inode_event + => fsnotify + => __fsnotify_parent + => __fput + => task_work_run + => exit_to_user_mode_prepare + => syscall_exit_to_user_mode + => do_syscall_64 + => entry_SYSCALL_64_after_hwframe + => 0x7265000001378c + => 0x10000cea7 + => 0x25a00000204a + => 0x12e302d00000000 + => 0x19b51010901b6 + => 0x283ce00726500 + => 0x61ea308872 + => 0x00000fe3 + bash-109109 [007] d..h... 426.271265: #58634 context irq timer_latency 1211 ns + timerlat/6-79756 [006] ....... 426.271267: timerlat_main: stop tracing hit on cpu 6 + +In the trace, it is possible the notice that the *IRQ* timer latency was +already high, accounting *12505 ns*. The IRQ delay was caused by the +*bash-109109* process that disabled IRQs in the wake-up path +(*_try_to_wake_up()* function). The duration of the IRQ handler that woke +up the timerlat thread, informed with the **osnoise:irq_noise** event, was +also high and added more *12553 ns* to the Thread latency. Finally, the +**osnoise:thread_noise** added by the currently running thread (including +the scheduling overhead) added more *4769 ns*. Summing up these values, +the *Thread* timer latency accounted for *30328 ns*. + +The primary reason for this high value is the wake-up path that was hit +twice during this case: when the *bash-109109* was waking up a thread +and then when the *timerlat* thread was awakened. This information can +then be used as the starting point of a more fine-grained analysis. + +Note that **rtla timerlat** was dispatched without changing *timerlat* tracer +threads' priority. That is generally not needed because these threads hava +priority *FIFO:95* by default, which is a common priority used by real-time +kernel developers to analyze scheduling delays. + +SEE ALSO +-------- +**rtla-timerlat**\(1), **rtla-timerlat-hist**\(1) + +*timerlat* tracer documentation: + +AUTHOR +------ +Written by Daniel Bristot de Oliveira + +.. include:: common_appendix.rst From 5dce5904e3b9bc11d9635dd3458c7ff32caebb59 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 10 Dec 2021 19:11:33 +0100 Subject: [PATCH 49/50] rtla: Add rtla timerlat hist documentation Man page for rtla timerlat hist mode. Link: https://lkml.kernel.org/r/4a184003fdb81e23be3fe5ec882b1c89d5a95458.1639158831.git.bristot@kernel.org Cc: Tao Zhou Cc: Ingo Molnar Cc: Tom Zanussi Cc: Masami Hiramatsu Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Cc: linux-rt-users@vger.kernel.org Cc: linux-trace-devel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- .../tools/rtla/rtla-timerlat-hist.rst | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 Documentation/tools/rtla/rtla-timerlat-hist.rst diff --git a/Documentation/tools/rtla/rtla-timerlat-hist.rst b/Documentation/tools/rtla/rtla-timerlat-hist.rst new file mode 100644 index 00000000000000..e12eae1f330196 --- /dev/null +++ b/Documentation/tools/rtla/rtla-timerlat-hist.rst @@ -0,0 +1,106 @@ +===================== +rtla-timerlat-hist +===================== +------------------------------------------------ +Histograms of the operating system timer latency +------------------------------------------------ + +:Manual section: 1 + +SYNOPSIS +======== +**rtla timerlat hist** [*OPTIONS*] ... + +DESCRIPTION +=========== + +.. include:: common_timerlat_description.rst + +The **rtla timerlat hist** displays a histogram of each tracer event +occurrence. This tool uses the periodic information, and the +**osnoise:** tracepoints are enabled when using the **-T** option. + +OPTIONS +======= + +.. include:: common_timerlat_options.rst + +.. include:: common_hist_options.rst + +.. include:: common_options.rst + +EXAMPLE +======= +In the example below, **rtla timerlat hist** is set to run for *10* minutes, +in the cpus *0-4*, *skipping zero* only lines. Moreover, **rtla timerlat +hist** will change the priority of the *timelat* threads to run under +*SCHED_DEADLINE* priority, with a *10us* runtime every *1ms* period. The +*1ms* period is also passed to the *timerlat* tracer:: + + [root@alien ~]# timerlat hist -d 10m -c 0-4 -P d:100us:1ms -p 1ms + # RTLA timerlat histogram + # Time unit is microseconds (us) + # Duration: 0 00:10:00 + Index IRQ-000 Thr-000 IRQ-001 Thr-001 IRQ-002 Thr-002 IRQ-003 Thr-003 IRQ-004 Thr-004 + 0 276489 0 206089 0 466018 0 481102 0 205546 0 + 1 318327 35487 388149 30024 94531 48382 83082 71078 388026 55730 + 2 3282 122584 4019 126527 28231 109012 23311 89309 4568 98739 + 3 940 11815 837 9863 6209 16227 6895 17196 910 9780 + 4 444 17287 424 11574 2097 38443 2169 36736 462 13476 + 5 206 43291 255 25581 1223 101908 1304 101137 236 28913 + 6 132 101501 96 64584 635 213774 757 215471 99 73453 + 7 74 169347 65 124758 350 57466 441 53639 69 148573 + 8 53 85183 31 156751 229 9052 306 9026 39 139907 + 9 22 10387 12 42762 161 2554 225 2689 19 26192 + 10 13 1898 8 5770 114 1247 128 1405 13 3772 + 11 9 560 9 924 71 686 76 765 8 713 + 12 4 256 2 360 50 411 64 474 3 278 + 13 2 167 2 172 43 256 53 350 4 180 + 14 1 88 1 116 15 198 42 223 0 115 + 15 2 63 3 94 11 139 20 150 0 58 + 16 2 37 0 56 5 78 10 102 0 39 + 17 0 18 0 28 4 57 8 80 0 15 + 18 0 8 0 17 2 50 6 56 0 12 + 19 0 9 0 5 0 19 0 48 0 18 + 20 0 4 0 8 0 11 2 27 0 4 + 21 0 2 0 3 1 9 1 18 0 6 + 22 0 1 0 3 1 7 0 3 0 5 + 23 0 2 0 4 0 2 0 7 0 2 + 24 0 2 0 2 1 3 0 3 0 5 + 25 0 0 0 1 0 1 0 1 0 3 + 26 0 1 0 0 0 2 0 2 0 0 + 27 0 0 0 3 0 1 0 0 0 1 + 28 0 0 0 3 0 0 0 1 0 0 + 29 0 0 0 2 0 2 0 1 0 3 + 30 0 1 0 0 0 0 0 0 0 0 + 31 0 1 0 0 0 0 0 2 0 2 + 32 0 0 0 1 0 2 0 0 0 0 + 33 0 0 0 2 0 0 0 0 0 1 + 34 0 0 0 0 0 0 0 0 0 2 + 35 0 1 0 1 0 0 0 0 0 1 + 36 0 1 0 0 0 1 0 1 0 0 + 37 0 0 0 1 0 0 0 0 0 0 + 40 0 0 0 0 0 1 0 1 0 0 + 41 0 0 0 0 0 0 0 0 0 1 + 42 0 0 0 0 0 0 0 0 0 1 + 44 0 0 0 0 0 1 0 0 0 0 + 46 0 0 0 0 0 0 0 1 0 0 + 47 0 0 0 0 0 0 0 0 0 1 + 50 0 0 0 0 0 0 0 0 0 1 + 54 0 0 0 1 0 0 0 0 0 0 + 58 0 0 0 1 0 0 0 0 0 0 + over: 0 0 0 0 0 0 0 0 0 0 + count: 600002 600002 600002 600002 600002 600002 600002 600002 600002 600002 + min: 0 1 0 1 0 1 0 1 0 1 + avg: 0 5 0 5 0 4 0 4 0 5 + max: 16 36 15 58 24 44 21 46 13 50 + +SEE ALSO +======== +**rtla-timerlat**\(1), **rtla-timerlat-top**\(1) + +*timerlat* tracer documentation: + +AUTHOR +====== +Written by Daniel Bristot de Oliveira From f37c3bbc635994eda203a6da4ba0f9d05165a8d6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2022 20:08:40 -0500 Subject: [PATCH 50/50] tracing: Add ustring operation to filtering string pointers Since referencing user space pointers is special, if the user wants to filter on a field that is a pointer to user space, then they need to specify it. Add a ".ustring" attribute to the field name for filters to state that the field is pointing to user space such that the kernel can take the appropriate action to read that pointer. Link: https://lore.kernel.org/all/yt9d8rvmt2jq.fsf@linux.ibm.com/ Fixes: 77360f9bbc7e ("tracing: Add test for user space strings when filtering on string pointers") Tested-by: Sven Schnelle Signed-off-by: Steven Rostedt --- Documentation/trace/events.rst | 9 ++++ kernel/trace/trace_events_filter.c | 81 +++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 24 deletions(-) diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 45e66a60a816a9..c47f381d0c0025 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -198,6 +198,15 @@ The glob (~) accepts a wild card character (\*,?) and character classes prev_comm ~ "*sh*" prev_comm ~ "ba*sh" +If the field is a pointer that points into user space (for example +"filename" from sys_enter_openat), then you have to append ".ustring" to the +field name:: + + filename.ustring ~ "password" + +As the kernel will have to know how to retrieve the memory that the pointer +is at from user space. + 5.2 Setting filters ------------------- diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2e9ef64e9ee9ff..b458a9afa2c00e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -665,6 +665,23 @@ struct ustring_buffer { static __percpu struct ustring_buffer *ustring_per_cpu; static __always_inline char *test_string(char *str) +{ + struct ustring_buffer *ubuf; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* For safety, do not trust the string pointer */ + if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + return NULL; + return kstr; +} + +static __always_inline char *test_ustring(char *str) { struct ustring_buffer *ubuf; char __user *ustr; @@ -676,23 +693,11 @@ static __always_inline char *test_string(char *str) ubuf = this_cpu_ptr(ustring_per_cpu); kstr = ubuf->buffer; - /* - * We use TASK_SIZE to denote user or kernel space, but this will - * not work for all architectures. If it picks the wrong one, it may - * just fail the filter (but will not bug). - * - * TODO: Have a way to properly denote which one this is for. - */ - if (likely((unsigned long)str >= TASK_SIZE)) { - /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) - return NULL; - } else { - /* user space address? */ - ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) - return NULL; - } + /* user space address? */ + ustr = (char __user *)str; + if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + return NULL; + return kstr; } @@ -709,24 +714,42 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } +static __always_inline int filter_pchar(struct filter_pred *pred, char *str) +{ + int cmp, match; + int len; + + len = strlen(str) + 1; /* including tailing '\0' */ + cmp = pred->regex.match(str, &pred->regex, len); + + match = cmp ^ pred->not; + + return match; +} /* Filter predicate for char * pointers */ static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); char *str; - int cmp, match; - int len; str = test_string(*addr); if (!str) return 0; - len = strlen(str) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(str, &pred->regex, len); + return filter_pchar(pred, str); +} - match = cmp ^ pred->not; +/* Filter predicate for char * pointers in user space*/ +static int filter_pred_pchar_user(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; - return match; + str = test_ustring(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); } /* @@ -1232,6 +1255,7 @@ static int parse_pred(const char *str, void *data, struct filter_pred *pred = NULL; char num_buf[24]; /* Big enough to hold an address */ char *field_name; + bool ustring = false; char q; u64 val; int len; @@ -1266,6 +1290,12 @@ static int parse_pred(const char *str, void *data, return -EINVAL; } + /* See if the field is a user space string */ + if ((len = str_has_prefix(str + i, ".ustring"))) { + ustring = true; + i += len; + } + while (isspace(str[i])) i++; @@ -1405,7 +1435,10 @@ static int parse_pred(const char *str, void *data, goto err_mem; } - pred->fn = filter_pred_pchar; + if (ustring) + pred->fn = filter_pred_pchar_user; + else + pred->fn = filter_pred_pchar; } /* go past the last quote */ i++;