Skip to content

Commit

Permalink
tracing/user_events: Use bits vs bytes for enabled status page data
Browse files Browse the repository at this point in the history
User processes may require many events and when they do the cache
performance of a byte index status check is less ideal than a bit index.
The previous event limit per-page was 4096, the new limit is 32,768.

This change adds a bitwise index to the user_reg struct. Programs check
that the bit at status_bit has a bit set within the status page(s).

Link: https://lkml.kernel.org/r/[email protected]
Link: https://lore.kernel.org/all/[email protected]/

Suggested-by: Mathieu Desnoyers <[email protected]>
Signed-off-by: Beau Belgrave <[email protected]>
Signed-off-by: Steven Rostedt (Google) <[email protected]>
  • Loading branch information
beaubelgrave authored and rostedt committed Sep 29, 2022
1 parent d401b72 commit 39d6d08
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 38 deletions.
15 changes: 3 additions & 12 deletions include/linux/user_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,6 @@
#define USER_EVENTS_SYSTEM "user_events"
#define USER_EVENTS_PREFIX "u:"

/* Bits 0-6 are for known probe types, Bit 7 is for unknown probes */
#define EVENT_BIT_FTRACE 0
#define EVENT_BIT_PERF 1
#define EVENT_BIT_OTHER 7

#define EVENT_STATUS_FTRACE (1 << EVENT_BIT_FTRACE)
#define EVENT_STATUS_PERF (1 << EVENT_BIT_PERF)
#define EVENT_STATUS_OTHER (1 << EVENT_BIT_OTHER)

/* Create dynamic location entry within a 32-bit value */
#define DYN_LOC(offset, size) ((size) << 16 | (offset))

Expand All @@ -45,12 +36,12 @@ struct user_reg {
/* Input: Pointer to string with event name, description and flags */
__u64 name_args;

/* Output: Byte index of the event within the status page */
__u32 status_index;
/* Output: Bitwise index of the event within the status page */
__u32 status_bit;

/* Output: Index of the event to use when writing data */
__u32 write_index;
};
} __attribute__((__packed__));

#define DIAG_IOC_MAGIC '*'

Expand Down
75 changes: 67 additions & 8 deletions kernel/trace/trace_events_user.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,44 @@
*/
#define MAX_PAGE_ORDER 0
#define MAX_PAGES (1 << MAX_PAGE_ORDER)
#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
#define MAX_EVENTS (MAX_BYTES * 8)

/* Limit how long of an event name plus args within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024

/*
* The MAP_STATUS_* macros are used for taking a index and determining the
* appropriate byte and the bit in the byte to set/reset for an event.
*
* The lower 3 bits of the index decide which bit to set.
* The remaining upper bits of the index decide which byte to use for the bit.
*
* This is used when an event has a probe attached/removed to reflect live
* status of the event wanting tracing or not to user-programs via shared
* memory maps.
*/
#define MAP_STATUS_BYTE(index) ((index) >> 3)
#define MAP_STATUS_MASK(index) BIT((index) & 7)

/*
* Internal bits (kernel side only) to keep track of connected probes:
* These are used when status is requested in text form about an event. These
* bits are compared against an internal byte on the event to determine which
* probes to print out to the user.
*
* These do not reflect the mapped bytes between the user and kernel space.
*/
#define EVENT_STATUS_FTRACE BIT(0)
#define EVENT_STATUS_PERF BIT(1)
#define EVENT_STATUS_OTHER BIT(7)

static char *register_page_data;

static DEFINE_MUTEX(reg_mutex);
static DEFINE_HASHTABLE(register_table, 4);
static DEFINE_HASHTABLE(register_table, 8);
static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);

/*
Expand All @@ -72,6 +99,7 @@ struct user_event {
int index;
int flags;
int min_size;
char status;
};

/*
Expand Down Expand Up @@ -106,6 +134,22 @@ static u32 user_event_key(char *name)
return jhash(name, strlen(name), 0);
}

static __always_inline
void user_event_register_set(struct user_event *user)
{
int i = user->index;

register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
}

static __always_inline
void user_event_register_clear(struct user_event *user)
{
int i = user->index;

register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
}

static __always_inline __must_check
bool user_event_last_ref(struct user_event *user)
{
Expand Down Expand Up @@ -648,7 +692,7 @@ static int destroy_user_event(struct user_event *user)

dyn_event_remove(&user->devent);

register_page_data[user->index] = 0;
user_event_register_clear(user);
clear_bit(user->index, page_bitmap);
hash_del(&user->node);

Expand Down Expand Up @@ -827,7 +871,12 @@ static void update_reg_page_for(struct user_event *user)
rcu_read_unlock_sched();
}

register_page_data[user->index] = status;
if (status)
user_event_register_set(user);
else
user_event_register_clear(user);

user->status = status;
}

/*
Expand Down Expand Up @@ -1332,7 +1381,17 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (size > PAGE_SIZE)
return -E2BIG;

return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
if (size < offsetofend(struct user_reg, write_index))
return -EINVAL;

ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);

if (ret)
return ret;

kreg->size = size;

return 0;
}

/*
Expand Down Expand Up @@ -1376,7 +1435,7 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
return ret;

put_user((u32)ret, &ureg->write_index);
put_user(user->index, &ureg->status_index);
put_user(user->index, &ureg->status_bit);

return 0;
}
Expand Down Expand Up @@ -1485,7 +1544,7 @@ static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
{
unsigned long size = vma->vm_end - vma->vm_start;

if (size != MAX_EVENTS)
if (size != MAX_BYTES)
return -EINVAL;

return remap_pfn_range(vma, vma->vm_start,
Expand Down Expand Up @@ -1520,7 +1579,7 @@ static int user_seq_show(struct seq_file *m, void *p)
mutex_lock(&reg_mutex);

hash_for_each(register_table, i, user, node) {
status = register_page_data[user->index];
status = user->status;
flags = user->flags;

seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
Expand Down
25 changes: 18 additions & 7 deletions samples/user_events/example.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,21 @@
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <asm/bitsperlong.h>
#include <endian.h>
#include <linux/user_events.h>

#if __BITS_PER_LONG == 64
#define endian_swap(x) htole64(x)
#else
#define endian_swap(x) htole32(x)
#endif

/* Assumes debugfs is mounted */
const char *data_file = "/sys/kernel/debug/tracing/user_events_data";
const char *status_file = "/sys/kernel/debug/tracing/user_events_status";

static int event_status(char **status)
static int event_status(long **status)
{
int fd = open(status_file, O_RDONLY);

Expand All @@ -33,7 +41,8 @@ static int event_status(char **status)
return 0;
}

static int event_reg(int fd, const char *command, int *status, int *write)
static int event_reg(int fd, const char *command, long *index, long *mask,
int *write)
{
struct user_reg reg = {0};

Expand All @@ -43,16 +52,18 @@ static int event_reg(int fd, const char *command, int *status, int *write)
if (ioctl(fd, DIAG_IOCSREG, &reg) == -1)
return -1;

*status = reg.status_index;
*index = reg.status_bit / __BITS_PER_LONG;
*mask = endian_swap(1L << (reg.status_bit % __BITS_PER_LONG));
*write = reg.write_index;

return 0;
}

int main(int argc, char **argv)
{
int data_fd, status, write;
char *status_page;
int data_fd, write;
long index, mask;
long *status_page;
struct iovec io[2];
__u32 count = 0;

Expand All @@ -61,7 +72,7 @@ int main(int argc, char **argv)

data_fd = open(data_file, O_RDWR);

if (event_reg(data_fd, "test u32 count", &status, &write) == -1)
if (event_reg(data_fd, "test u32 count", &index, &mask, &write) == -1)
return errno;

/* Setup iovec */
Expand All @@ -75,7 +86,7 @@ int main(int argc, char **argv)
getchar();

/* Check if anyone is listening */
if (status_page[status]) {
if (status_page[index] & mask) {
/* Yep, trace out our data */
writev(data_fd, (const struct iovec *)io, 2);

Expand Down
47 changes: 39 additions & 8 deletions tools/testing/selftests/user_events/ftrace_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ const char *enable_file = "/sys/kernel/debug/tracing/events/user_events/__test_e
const char *trace_file = "/sys/kernel/debug/tracing/trace";
const char *fmt_file = "/sys/kernel/debug/tracing/events/user_events/__test_event/format";

static inline int status_check(char *status_page, int status_bit)
{
return status_page[status_bit >> 3] & (1 << (status_bit & 7));
}

static int trace_bytes(void)
{
int fd = open(trace_file, O_RDONLY);
Expand Down Expand Up @@ -197,12 +202,12 @@ TEST_F(user, register_events) {
/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
ASSERT_NE(0, reg.status_index);
ASSERT_NE(0, reg.status_bit);

/* Multiple registers should result in same index */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
ASSERT_NE(0, reg.status_index);
ASSERT_NE(0, reg.status_bit);

/* Ensure disabled */
self->enable_fd = open(enable_file, O_RDWR);
Expand All @@ -212,15 +217,15 @@ TEST_F(user, register_events) {
/* MMAP should work and be zero'd */
ASSERT_NE(MAP_FAILED, status_page);
ASSERT_NE(NULL, status_page);
ASSERT_EQ(0, status_page[reg.status_index]);
ASSERT_EQ(0, status_check(status_page, reg.status_bit));

/* Enable event and ensure bits updated in status */
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))
ASSERT_EQ(EVENT_STATUS_FTRACE, status_page[reg.status_index]);
ASSERT_NE(0, status_check(status_page, reg.status_bit));

/* Disable event and ensure bits updated in status */
ASSERT_NE(-1, write(self->enable_fd, "0", sizeof("0")))
ASSERT_EQ(0, status_page[reg.status_index]);
ASSERT_EQ(0, status_check(status_page, reg.status_bit));

/* File still open should return -EBUSY for delete */
ASSERT_EQ(-1, ioctl(self->data_fd, DIAG_IOCSDEL, "__test_event"));
Expand All @@ -240,6 +245,8 @@ TEST_F(user, write_events) {
struct iovec io[3];
__u32 field1, field2;
int before = 0, after = 0;
int page_size = sysconf(_SC_PAGESIZE);
char *status_page;

reg.size = sizeof(reg);
reg.name_args = (__u64)"__test_event u32 field1; u32 field2";
Expand All @@ -254,10 +261,18 @@ TEST_F(user, write_events) {
io[2].iov_base = &field2;
io[2].iov_len = sizeof(field2);

status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
self->status_fd, 0);

/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
ASSERT_NE(0, reg.status_index);
ASSERT_NE(0, reg.status_bit);

/* MMAP should work and be zero'd */
ASSERT_NE(MAP_FAILED, status_page);
ASSERT_NE(NULL, status_page);
ASSERT_EQ(0, status_check(status_page, reg.status_bit));

/* Write should fail on invalid slot with ENOENT */
io[0].iov_base = &field2;
Expand All @@ -271,6 +286,9 @@ TEST_F(user, write_events) {
self->enable_fd = open(enable_file, O_RDWR);
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))

/* Event should now be enabled */
ASSERT_NE(0, status_check(status_page, reg.status_bit));

/* Write should make it out to ftrace buffers */
before = trace_bytes();
ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 3));
Expand Down Expand Up @@ -298,7 +316,7 @@ TEST_F(user, write_fault) {
/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
ASSERT_NE(0, reg.status_index);
ASSERT_NE(0, reg.status_bit);

/* Write should work normally */
ASSERT_NE(-1, writev(self->data_fd, (const struct iovec *)io, 2));
Expand All @@ -315,14 +333,24 @@ TEST_F(user, write_validator) {
int loc, bytes;
char data[8];
int before = 0, after = 0;
int page_size = sysconf(_SC_PAGESIZE);
char *status_page;

status_page = mmap(NULL, page_size, PROT_READ, MAP_SHARED,
self->status_fd, 0);

reg.size = sizeof(reg);
reg.name_args = (__u64)"__test_event __rel_loc char[] data";

/* Register should work */
ASSERT_EQ(0, ioctl(self->data_fd, DIAG_IOCSREG, &reg));
ASSERT_EQ(0, reg.write_index);
ASSERT_NE(0, reg.status_index);
ASSERT_NE(0, reg.status_bit);

/* MMAP should work and be zero'd */
ASSERT_NE(MAP_FAILED, status_page);
ASSERT_NE(NULL, status_page);
ASSERT_EQ(0, status_check(status_page, reg.status_bit));

io[0].iov_base = &reg.write_index;
io[0].iov_len = sizeof(reg.write_index);
Expand All @@ -340,6 +368,9 @@ TEST_F(user, write_validator) {
self->enable_fd = open(enable_file, O_RDWR);
ASSERT_NE(-1, write(self->enable_fd, "1", sizeof("1")))

/* Event should now be enabled */
ASSERT_NE(0, status_check(status_page, reg.status_bit));

/* Full in-bounds write should work */
before = trace_bytes();
loc = DYN_LOC(0, bytes);
Expand Down
Loading

0 comments on commit 39d6d08

Please sign in to comment.