From: Beau Belgrave <beaub@linux.microsoft.com>
To: rostedt@goodmis.org
Cc: linux-trace-devel@vger.kernel.org, beaub@linux.microsoft.com
Subject: [PATCH v2 1/3] libtracefs: Add user_events to libtracefs sources
Date: Tue, 22 Feb 2022 15:23:14 -0800 [thread overview]
Message-ID: <20220222232316.14640-2-beaub@linux.microsoft.com> (raw)
In-Reply-To: <20220222232316.14640-1-beaub@linux.microsoft.com>
User events are scheduled to be included in Linux 5.18. They register a
special mmapped page that denotes when a user event is enabled (from an
external source). This API adds a wrapper around the kernel interface
that makes it easy to register user events, to test whether they are
enabled, and to record an event when it is enabled.
Link:
https://lore.kernel.org/linux-trace-devel/20220121192833.GA3128@kbox/T/#m2bcf53c373fbeaba2c46d1a053b3174171167e4e
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
---
Makefile | 8 +
include/tracefs-local.h | 24 ++
include/tracefs.h | 67 +++++
src/Makefile | 4 +
src/tracefs-userevents.c | 516 +++++++++++++++++++++++++++++++++++++++
5 files changed, 619 insertions(+)
create mode 100644 src/tracefs-userevents.c
diff --git a/Makefile b/Makefile
index 544684c..a4598b4 100644
--- a/Makefile
+++ b/Makefile
@@ -154,6 +154,14 @@ CFLAGS ?= -g -Wall
CPPFLAGS ?=
LDFLAGS ?=
+# Probe for <linux/user_events.h> by running the preprocessor ($(CC) -E) on a
+# one-line include ($(pound) presumably expands to '#' -- confirm). The result
+# (1 or 0) is exported as USEREVENTS_INSTALLED; when it is 1 the build defines
+# USEREVENTS, otherwise a warning is printed.
+USEREVENTS_INSTALLED := $(shell if (echo "$(pound)include <linux/user_events.h>" | $(CC) -E - >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi)
+export USEREVENTS_INSTALLED
+ifeq ($(USEREVENTS_INSTALLED), 1)
+CFLAGS += -DUSEREVENTS
+else
+$(warning user_events.h not installed, skipping)
+endif
+
CUNIT_INSTALLED := $(shell if (printf "$(pound)include <CUnit/Basic.h>\n void main(){CU_initialize_registry();}" | $(CC) -x c - -lcunit -o /dev/null >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi)
export CUNIT_INSTALLED
diff --git a/include/tracefs-local.h b/include/tracefs-local.h
index bf157e1..9491545 100644
--- a/include/tracefs-local.h
+++ b/include/tracefs-local.h
@@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep,
struct tep_event *get_tep_event(struct tep_handle *tep,
const char *system, const char *name);
+/* Internal interface for ftrace user events */
+
+struct tracefs_user_event_group;
+
+/*
+ * Library-private view of a registered user event. The public
+ * struct tracefs_user_event handed back to callers is the embedded
+ * event_external member; tracefs_user_event_record() casts that pointer back
+ * to this structure, so event_external must remain the first member.
+ */
+struct tracefs_user_event_internal
+{
+ /* Public part returned by tracefs_user_event_register() */
+ struct tracefs_user_event event_external;
+ /* Write index returned by the register ioctl, sent first on every write */
+ int write_index;
+ /* Number of iovecs for the fixed part: one for write_index plus one per item */
+ int iovecs;
+ /* Number of varray/vstring items (each needs an extra data iovec) */
+ int rels;
+ /* Summed byte length of the fixed part of the payload */
+ int len;
+ /* Group the event was registered with */
+ struct tracefs_user_event_group *group;
+ /* Next event in the group's singly linked list */
+ struct tracefs_user_event_internal *next;
+};
+
+/*
+ * A group ties together the user_events data file descriptor, the mapping of
+ * the user_events status file and the events registered through them.
+ */
+struct tracefs_user_event_group
+{
+ /* Descriptor of the user_events data file (register ioctl and writes) */
+ int fd;
+ /* Length in bytes of the status mapping */
+ int mmap_len;
+ /* Read-only shared mapping of the status file */
+ char *mmap;
+ /* Held while an event is added to the events list */
+ pthread_mutex_t lock;
+ /* Head of the list of events registered with this group */
+ struct tracefs_user_event_internal *events;
+};
+
#endif /* _TRACE_FS_LOCAL_H */
diff --git a/include/tracefs.h b/include/tracefs.h
index 1848ad0..74241a9 100644
--- a/include/tracefs.h
+++ b/include/tracefs.h
@@ -571,4 +571,71 @@ struct tracefs_synth *tracefs_sql(struct tep_handle *tep, const char *name,
struct tep_event *
tracefs_synth_get_event(struct tep_handle *tep, struct tracefs_synth *synth);
+/* User events */
+/*
+ * Types of the items that describe (at register) or carry (at record) the
+ * fields of a user event. An item array is terminated by an item whose type
+ * is TRACEFS_UEVENT_END.
+ */
+enum tracefs_uevent_type {
+ TRACEFS_UEVENT_END, /* Terminates an item array */
+ TRACEFS_UEVENT_u8,
+ TRACEFS_UEVENT_s8,
+ TRACEFS_UEVENT_u16,
+ TRACEFS_UEVENT_s16,
+ TRACEFS_UEVENT_u32,
+ TRACEFS_UEVENT_s32,
+ TRACEFS_UEVENT_u64,
+ TRACEFS_UEVENT_s64,
+ TRACEFS_UEVENT_string, /* Fixed size char array, len required at register */
+ TRACEFS_UEVENT_struct, /* name holds "<type> <field>", len required at register */
+ TRACEFS_UEVENT_varray, /* Variable length u8 array (__rel_loc) */
+ TRACEFS_UEVENT_vstring, /* Variable length string (__rel_loc) */
+};
+
+/* Flags accepted by tracefs_user_event_register() */
+enum tracefs_uevent_flags {
+ /* None */
+ TRACEFS_UEVENT_FLAG_NONE = 0,
+
+ /*
+  * When BPF is attached, use iterator/no copy. Registering with this
+  * flag appends ":BPF_ITER" to the event name in the register command.
+  */
+ TRACEFS_UEVENT_FLAG_bpf_iter = 1 << 0,
+};
+
+/*
+ * One field of a user event. The same structure is used to describe the
+ * field when registering (type, name and, for string/struct, len) and to
+ * supply its value when recording (type, data and len).
+ */
+struct tracefs_uevent_item {
+ /* Type of item */
+ enum tracefs_uevent_type type;
+
+ /* Length of data, optional during register */
+ int len;
+
+ union {
+ /* Used during write */
+ const void *data;
+
+ /* Used during register */
+ const char *name;
+ };
+};
+
+/* Caller visible handle of a registered user event */
+struct tracefs_user_event {
+ /* Set to sizeof(struct tracefs_user_event) when the event is registered */
+ unsigned int size;
+ /* Points at this event's byte inside the group's status mapping */
+ char *enabled;
+};
+
+struct tracefs_user_event_group;
+
+struct tracefs_user_event_group *tracefs_user_event_group_open(void);
+
+void tracefs_user_event_group_close(struct tracefs_user_event_group *group);
+
+int tracefs_user_event_delete(const char *name);
+
+struct tracefs_user_event *
+tracefs_user_event_register(struct tracefs_user_event_group *group,
+ const char *name, enum tracefs_uevent_flags flags,
+ struct tracefs_uevent_item *items);
+
+/*
+ * tracefs_user_event_enabled - Test if a user event is currently enabled
+ *
+ * Returns false for a NULL @event, otherwise true when the event's status
+ * byte is non-zero. The byte is read through a volatile pointer so that it is
+ * loaded again on every call.
+ */
+static inline bool tracefs_user_event_enabled(struct tracefs_user_event *event)
+{
+	volatile char *status;
+
+	if (!event)
+		return false;
+
+	status = (volatile char *)event->enabled;
+
+	return *status != 0;
+}
+
+int tracefs_user_event_record(struct tracefs_user_event *event,
+ struct tracefs_uevent_item *items);
+
#endif /* _TRACE_FS_H */
diff --git a/src/Makefile b/src/Makefile
index e8afab5..984e8cf 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,6 +14,10 @@ OBJS += tracefs-filter.o
OBJS += tracefs-dynevents.o
OBJS += tracefs-eprobes.o
+# Only build the user events wrapper when USEREVENTS_INSTALLED is 1
+ifeq ($(USEREVENTS_INSTALLED), 1)
+OBJS += tracefs-userevents.o
+endif
+
# Order matters for the the three below
OBJS += sqlhist-lex.o
OBJS += sqlhist.tab.o
diff --git a/src/tracefs-userevents.c b/src/tracefs-userevents.c
new file mode 100644
index 0000000..ccd511b
--- /dev/null
+++ b/src/tracefs-userevents.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2022 Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <alloca.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <linux/user_events.h>
+
+#include "tracefs.h"
+#include "tracefs-local.h"
+
+#define STAT_FILE "user_events_status"
+#define DATA_FILE "user_events_data"
+
+/* Free every event on the singly linked list that starts at @event. */
+static void free_user_events(struct tracefs_user_event_internal *event)
+{
+	struct tracefs_user_event_internal *cur, *following;
+
+	for (cur = event; cur; cur = following) {
+		/* Fetch the link before the node goes away */
+		following = cur->next;
+		free(cur);
+	}
+}
+
+/*
+ * append_field - Append the description of one item to a register command
+ * @item: Item to describe (type, name and, for string/struct, len)
+ * @seq: Sequence buffer holding the command being built
+ * @index: Position of @item within the event, 0 for the first item
+ *
+ * Every field after the first is separated from its predecessor by ';'.
+ *
+ * Returns 0 on success or -1 with errno set:
+ *   EINVAL - the item has no name, a string/struct item has len <= 0 or a
+ *            struct name does not contain a space
+ *   ENOENT - the item type is unknown
+ */
+static int append_field(struct tracefs_uevent_item *item, struct trace_seq *seq,
+			int index)
+{
+	/* A NULL name must not reach strchr() or a "%s" conversion */
+	if (!item->name) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (index != 0)
+		trace_seq_printf(seq, ";");
+
+	switch (item->type) {
+	case TRACEFS_UEVENT_u8:
+		trace_seq_printf(seq, " u8 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_s8:
+		trace_seq_printf(seq, " s8 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_u16:
+		trace_seq_printf(seq, " u16 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_s16:
+		trace_seq_printf(seq, " s16 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_u32:
+		trace_seq_printf(seq, " u32 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_s32:
+		trace_seq_printf(seq, " s32 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_u64:
+		trace_seq_printf(seq, " u64 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_s64:
+		trace_seq_printf(seq, " s64 %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_string:
+		/* Fixed size char array, the size is part of the type */
+		if (item->len <= 0) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		trace_seq_printf(seq, " char[%d] %s", item->len, item->name);
+		break;
+
+	case TRACEFS_UEVENT_struct:
+		/*
+		 * struct must have 2 strings, do simple check
+		 * in user, kernel will fully validate
+		 */
+		if (!strchr(item->name, ' ')) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		if (item->len <= 0) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		trace_seq_printf(seq, " struct %s %d", item->name, item->len);
+		break;
+
+	case TRACEFS_UEVENT_varray:
+		/* Variable length array */
+		trace_seq_printf(seq, " __rel_loc u8[] %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_vstring:
+		/* Variable length string */
+		trace_seq_printf(seq, " __rel_loc char[] %s", item->name);
+		break;
+
+	default:
+		/* Unknown */
+		errno = ENOENT;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * create_reg_cmd - Build the register command for a user event
+ * @name: Name of the event
+ * @flags: Register flags, TRACEFS_UEVENT_FLAG_bpf_iter adds ":BPF_ITER"
+ * @item: Items of the event, terminated by TRACEFS_UEVENT_END
+ * @seq: Initialized sequence buffer that receives the command
+ *
+ * Returns 0 on success. Returns a negative value when an item could not be
+ * appended, or -1 with errno set to ENOMEM when the sequence buffer reports
+ * a non-zero state after termination.
+ */
+static int create_reg_cmd(const char *name, enum tracefs_uevent_flags flags,
+			  struct tracefs_uevent_item *item, struct trace_seq *seq)
+{
+	int position;
+
+	trace_seq_printf(seq, "%s", name);
+
+	if (flags & TRACEFS_UEVENT_FLAG_bpf_iter)
+		trace_seq_printf(seq, ":BPF_ITER");
+
+	for (position = 0; item->type != TRACEFS_UEVENT_END; position++, item++) {
+		int status = append_field(item, seq, position);
+
+		if (status < 0)
+			return status;
+	}
+
+	trace_seq_terminate(seq);
+
+	if (!seq->state)
+		return 0;
+
+	errno = ENOMEM;
+	return -1;
+}
+
+/*
+ * get_write_counts - Compute the write layout of an event from its items
+ * @event: Event whose iovecs, rels and len members are filled in
+ * @item: Items of the event, terminated by TRACEFS_UEVENT_END
+ *
+ * iovecs counts one vector for the write index plus one per item, rels counts
+ * the varray/vstring items and len is the summed size of the fixed part (a
+ * varray/vstring contributes a __u32 to it).
+ *
+ * Returns 0 on success, or -1 with errno set to ENOENT for an unknown type.
+ */
+static int get_write_counts(struct tracefs_user_event_internal *event,
+			    struct tracefs_uevent_item *item)
+{
+	/* Start at 1, need iovec for write_index */
+	event->iovecs = 1;
+	event->rels = 0;
+	event->len = 0;
+
+	for (; item->type != TRACEFS_UEVENT_END; item++) {
+		switch (item->type) {
+		case TRACEFS_UEVENT_u8:
+		case TRACEFS_UEVENT_s8:
+			event->len += sizeof(__u8);
+			break;
+
+		case TRACEFS_UEVENT_u16:
+		case TRACEFS_UEVENT_s16:
+			event->len += sizeof(__u16);
+			break;
+
+		case TRACEFS_UEVENT_u32:
+		case TRACEFS_UEVENT_s32:
+			event->len += sizeof(__u32);
+			break;
+
+		case TRACEFS_UEVENT_u64:
+		case TRACEFS_UEVENT_s64:
+			event->len += sizeof(__u64);
+			break;
+
+		case TRACEFS_UEVENT_string:
+		case TRACEFS_UEVENT_struct:
+			/* Size was supplied by the caller at register */
+			event->len += item->len;
+			break;
+
+		case TRACEFS_UEVENT_varray:
+		case TRACEFS_UEVENT_vstring:
+			/* Requires a rel loc entry */
+			event->len += sizeof(__u32);
+			event->rels++;
+			break;
+
+		default:
+			/* Unknown */
+			errno = ENOENT;
+			return -1;
+		}
+
+		event->iovecs++;
+	}
+
+	return 0;
+}
+
+/**
+ * tracefs_user_event_group_open - Opens a new group to use for user events
+ *
+ * Opens the user_events status and data files, maps the status file read-only
+ * and keeps the data file descriptor for registering and writing events.
+ *
+ * Returns a pointer to a group to use for user events. The pointer is valid
+ * until tracefs_user_event_group_close() is called. In case of an error NULL
+ * is returned and errno describes the first failure.
+ */
+struct tracefs_user_event_group *tracefs_user_event_group_open(void)
+{
+	struct tracefs_user_event_group *group = NULL;
+	int stat, write = -1, page_size, i, err;
+	int lock_ready = 0;
+
+	stat = tracefs_instance_file_open(NULL, STAT_FILE, O_RDWR);
+
+	if (stat < 0)
+		return NULL;
+
+	write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR);
+
+	if (write < 0)
+		goto fail;
+
+	group = malloc(sizeof(*group));
+
+	if (!group)
+		goto fail;
+
+	/*
+	 * pthread_mutex_init() reports failure by returning the error number,
+	 * it never returns a negative value and does not set errno.
+	 */
+	err = pthread_mutex_init(&group->lock, NULL);
+
+	if (err) {
+		errno = err;
+		goto fail;
+	}
+
+	lock_ready = 1;
+
+	/* Scale up to 16-bit max user events a page at a time */
+	page_size = sysconf(_SC_PAGESIZE);
+	group->mmap_len = page_size;
+
+	for (i = 0; i < 16; ++i) {
+		group->mmap = mmap(NULL, group->mmap_len,
+				   PROT_READ, MAP_SHARED, stat, 0);
+
+		if (group->mmap == MAP_FAILED && errno == EINVAL) {
+			/* Increase by page size and try again */
+			group->mmap_len += page_size;
+			continue;
+		}
+
+		break;
+	}
+
+	if (group->mmap == MAP_FAILED)
+		goto fail;
+
+	group->fd = write;
+	group->events = NULL;
+
+	/* Status fd no longer needed */
+	close(stat);
+
+	return group;
+
+fail:
+	/* Keep the errno of the failure, the cleanup calls may overwrite it */
+	err = errno;
+
+	if (lock_ready)
+		pthread_mutex_destroy(&group->lock);
+
+	free(group);
+
+	if (write >= 0)
+		close(write);
+
+	close(stat);
+	errno = err;
+
+	return NULL;
+}
+
+/**
+ * tracefs_user_event_delete - Deletes a user event from the system
+ * @name: Name of the event to delete
+ *
+ * Deletes the event from the system if it is not used.
+ *
+ * Returns the result of the delete ioctl, or the negative value returned
+ * when the user_events data file could not be opened.
+ */
+int tracefs_user_event_delete(const char *name)
+{
+	int fd, status;
+
+	fd = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR);
+	if (fd < 0)
+		return fd;
+
+	status = ioctl(fd, DIAG_IOCSDEL, name);
+	close(fd);
+
+	return status;
+}
+
+/**
+ * tracefs_user_event_group_close - Closes a group containing user events
+ * @group: Group to close
+ *
+ * Closes a group and all the user events within it. Any user event that has
+ * been added to the group is no longer valid and cannot be used. Passing a
+ * NULL @group is a no-op.
+ */
+void tracefs_user_event_group_close(struct tracefs_user_event_group *group)
+{
+	if (!group)
+		return;
+
+	if (group->mmap != MAP_FAILED)
+		munmap(group->mmap, group->mmap_len);
+
+	if (group->fd != -1)
+		close(group->fd);
+
+	free_user_events(group->events);
+
+	/* The lock was initialized when the group was opened, release it */
+	pthread_mutex_destroy(&group->lock);
+	free(group);
+}
+
+/**
+ * tracefs_user_event_register - Registers a user event with the system
+ * @group: Group to add the user event to
+ * @name: Name of the event to register
+ * @flags: Flags to use
+ * @items: Array of items that the event contains
+ *
+ * Allocates and registers a user event with the system. The user event will be
+ * added to the @group. The lifetime of the event is bound to the @group. When
+ * the @group is closed via tracefs_user_event_group_close() the event will no
+ * longer exist and should not be used.
+ *
+ * The @items are processed in order and the final item type must be set to
+ * TRACEFS_UEVENT_END to mark the last item. Each item must have the type
+ * and name defined. The string and struct type also require the len to be set
+ * for the item.
+ *
+ * Return a pointer to a user event on success, or NULL on error.
+ *
+ * errno will be set to EINVAL if @group, @name or @items is null, if @items
+ * contains unexpected values or if the status index returned by the kernel
+ * lies outside of the status mapping of @group.
+ */
+struct tracefs_user_event *
+tracefs_user_event_register(struct tracefs_user_event_group *group,
+			    const char *name, enum tracefs_uevent_flags flags,
+			    struct tracefs_uevent_item *items)
+{
+	struct tracefs_user_event_internal *event = NULL;
+	struct user_reg reg = {0};
+	struct trace_seq seq;
+	int saved_errno;
+
+	if (!group || !name || !items) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	trace_seq_init(&seq);
+
+	/* Populate cmd, the sequence must be destroyed on failure as well */
+	if (create_reg_cmd(name, flags, items, &seq))
+		goto put_seq;
+
+	event = malloc(sizeof(*event));
+
+	if (!event)
+		goto put_seq;
+
+	reg.size = sizeof(reg);
+	/* Go through an integer type so 32-bit pointers widen cleanly */
+	reg.name_args = (__u64)(unsigned long)seq.buffer;
+
+	/* Register event with kernel */
+	if (ioctl(group->fd, DIAG_IOCSREG, &reg) == -1)
+		goto put_event;
+
+	/* Sanity check bounds returned */
+	if (reg.status_index >= group->mmap_len) {
+		errno = EINVAL;
+		goto put_event;
+	}
+
+	if (get_write_counts(event, items))
+		goto put_event;
+
+	/* Keep track of user view at this point in time */
+	event->event_external.size = sizeof(event->event_external);
+	event->event_external.enabled = &group->mmap[reg.status_index];
+
+	event->write_index = reg.write_index;
+	event->group = group;
+
+	/*
+	 * Add event into the group under lock. The new event becomes the head
+	 * of the list so that closing the group frees it.
+	 */
+	pthread_mutex_lock(&group->lock);
+	event->next = group->events;
+	group->events = event;
+	pthread_mutex_unlock(&group->lock);
+
+	trace_seq_destroy(&seq);
+
+	return &event->event_external;
+
+put_event:
+put_seq:
+	/* Keep the errno of the failure, the cleanup calls may overwrite it */
+	saved_errno = errno;
+	free(event);
+	trace_seq_destroy(&seq);
+	errno = saved_errno;
+
+	return NULL;
+}
+
+/**
+ * tracefs_user_event_record - Records an event with data
+ * @event: User event to record data about
+ * @items: Items to write for the event
+ *
+ * Records items for the event. Callers should check if the cost of recording
+ * should be performed by calling tracefs_user_event_enabled(). Items are
+ * checked to ensure they fit within the described items during register. Each
+ * item must specify the length of the item being recorded.
+ *
+ * The write is built from two runs of vectors: the write index followed by
+ * one vector per item (a varray/vstring item contributes its __rel_loc word
+ * here), and after those the data of the varray/vstring items.
+ *
+ * Return the number of bytes recorded or -1 upon error.
+ *
+ * errno will be set to EINVAL if @event or @items is null, if @items contains
+ * an item with a length of less than or equal to 0, or if @items contains
+ * variable length items but fewer items than previously registered.
+ * errno will be set to E2BIG if @items contains more items than previously
+ * registered for the event.
+ */
+int tracefs_user_event_record(struct tracefs_user_event *event,
+			      struct tracefs_uevent_item *items)
+{
+	struct tracefs_user_event_internal *e;
+	struct iovec *head, *io, *relio, *rel_start, *io_end;
+	__u32 *rel, *rel_end;
+	int len, rel_offset, data_offset, used, index;
+
+	if (!event || !items) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* The public event is the first member of the internal event */
+	e = (struct tracefs_user_event_internal *)event;
+	head = io = alloca(sizeof(*io) * (e->iovecs + e->rels));
+	rel = alloca(sizeof(*rel) * e->rels);
+
+	io_end = head + (e->iovecs + e->rels);
+	rel_end = rel + e->rels;
+
+	/* Relative offset starts at end of static data */
+	rel_start = relio = head + e->iovecs;
+	rel_offset = e->len;
+	data_offset = 0;
+
+	/* Write index must be first */
+	io->iov_base = &e->write_index;
+	io->iov_len = sizeof(e->write_index);
+	io++;
+	used = 1;
+	index = 0;
+
+	while (items->type != TRACEFS_UEVENT_END) {
+		len = items->len;
+
+		if (len <= 0)
+			goto bad_length;
+
+		/*
+		 * The per item vectors end where the rel data vectors begin,
+		 * going further would overwrite rel data vectors.
+		 */
+		if (io >= rel_start)
+			goto bad_count;
+
+		switch (items->type) {
+		case TRACEFS_UEVENT_varray:
+		case TRACEFS_UEVENT_vstring:
+			/* Dual vectors */
+			used += 2;
+
+			if (rel >= rel_end || relio >= io_end)
+				goto bad_count;
+
+			/* __rel_loc types */
+			relio->iov_base = (void *)items->data;
+			relio->iov_len = len;
+			relio++;
+
+			io->iov_base = (void *)rel;
+			io->iov_len = sizeof(*rel);
+			io++;
+			rel_offset -= sizeof(*rel);
+
+			/* Fill in rel loc data */
+			*rel = DYN_LOC(rel_offset + data_offset, len);
+			data_offset += len;
+			rel++;
+
+			break;
+
+		default:
+			/* Single vector */
+			used++;
+
+			/* Direct types */
+			io->iov_base = (void *)items->data;
+			io->iov_len = len;
+			io++;
+			rel_offset -= len;
+
+			break;
+		}
+
+		items++;
+		index++;
+	}
+
+	/*
+	 * When rel data is present the per item vectors must run right up to
+	 * it, otherwise unset vectors would sit between the two runs and the
+	 * offsets stored in the rel loc words would not match the payload.
+	 */
+	if (relio != rel_start && io != rel_start)
+		goto short_count;
+
+	return writev(e->group->fd, head, used);
+
+bad_length:
+	fprintf(stderr, "Bad user_event item length at index %d\n",
+		index);
+	errno = EINVAL;
+	return -1;
+
+bad_count:
+	fprintf(stderr, "Too many user_event items passed\n");
+	errno = E2BIG;
+	return -1;
+
+short_count:
+	fprintf(stderr, "Too few user_event items passed\n");
+	errno = EINVAL;
+	return -1;
+}
--
2.17.1
next prev parent reply other threads:[~2022-02-22 23:23 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-02-22 23:23 [PATCH v2 0/3] libtracefs: Add APIs for user_events to libtracefs Beau Belgrave
2022-02-22 23:23 ` Beau Belgrave [this message]
2022-02-23 3:29 ` [PATCH v2 1/3] libtracefs: Add user_events to libtracefs sources Steven Rostedt
2022-02-22 23:23 ` [PATCH v2 2/3] libtracefs: Add documentation and sample code for user_events Beau Belgrave
2022-02-22 23:23 ` [PATCH v2 3/3] libtracefs: Add unit tests " Beau Belgrave
2022-02-23 15:17 ` Steven Rostedt
2022-02-23 17:25 ` Beau Belgrave
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220222232316.14640-2-beaub@linux.microsoft.com \
--to=beaub@linux.microsoft.com \
--cc=linux-trace-devel@vger.kernel.org \
--cc=rostedt@goodmis.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).