All of lore.kernel.org
 help / color / mirror / Atom feed
From: Beau Belgrave <beaub@linux.microsoft.com>
To: rostedt@goodmis.org
Cc: linux-trace-devel@vger.kernel.org, beaub@linux.microsoft.com
Subject: [PATCH v2 1/3] libtracefs: Add user_events to libtracefs sources
Date: Tue, 22 Feb 2022 15:23:14 -0800	[thread overview]
Message-ID: <20220222232316.14640-2-beaub@linux.microsoft.com> (raw)
In-Reply-To: <20220222232316.14640-1-beaub@linux.microsoft.com>

User events are scheduled to be included in Linux 5.18. They register a
special mmapped page that denotes when a user event is enabled (from an
external source). This API adds a wrapper around the kernel interface
that makes it easy to register user events, test whether they are
enabled, and record an event when it is.

Link:
https://lore.kernel.org/linux-trace-devel/20220121192833.GA3128@kbox/T/#m2bcf53c373fbeaba2c46d1a053b3174171167e4e

Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
---
 Makefile                 |   8 +
 include/tracefs-local.h  |  24 ++
 include/tracefs.h        |  67 +++++
 src/Makefile             |   4 +
 src/tracefs-userevents.c | 516 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 619 insertions(+)
 create mode 100644 src/tracefs-userevents.c

diff --git a/Makefile b/Makefile
index 544684c..a4598b4 100644
--- a/Makefile
+++ b/Makefile
@@ -154,6 +154,14 @@ CFLAGS ?= -g -Wall
 CPPFLAGS ?=
 LDFLAGS ?=
 
+USEREVENTS_INSTALLED := $(shell if (echo "$(pound)include <linux/user_events.h>" | $(CC) -E - >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi)
+export USEREVENTS_INSTALLED
+ifeq ($(USEREVENTS_INSTALLED), 1)
+CFLAGS += -DUSEREVENTS
+else
+$(warning user_events.h not installed, skipping)
+endif
+
 CUNIT_INSTALLED := $(shell if (printf "$(pound)include <CUnit/Basic.h>\n void main(){CU_initialize_registry();}" | $(CC) -x c - -lcunit -o /dev/null >/dev/null 2>&1) ; then echo 1; else echo 0 ; fi)
 export CUNIT_INSTALLED
 
diff --git a/include/tracefs-local.h b/include/tracefs-local.h
index bf157e1..9491545 100644
--- a/include/tracefs-local.h
+++ b/include/tracefs-local.h
@@ -119,4 +119,28 @@ int trace_rescan_events(struct tep_handle *tep,
 struct tep_event *get_tep_event(struct tep_handle *tep,
 				const char *system, const char *name);
 
+/* Internal interface for ftrace user events */
+
+struct tracefs_user_event_group;
+
+struct tracefs_user_event_internal
+{
+	struct tracefs_user_event		event_external;	/* public view, kept first: cast back from it */
+	int					write_index;	/* from the register ioctl, first iovec of a record */
+	int					iovecs;		/* write index + one per registered item */
+	int					rels;		/* registered varray/vstring items */
+	int					len;		/* bytes of static data, rel loc entries included */
+	struct tracefs_user_event_group		*group;		/* group the event was registered with */
+	struct tracefs_user_event_internal	*next;		/* next event of the group */
+};
+
+struct tracefs_user_event_group
+{
+	int					fd;		/* user_events_data file */
+	int					mmap_len;	/* length of the status mapping */
+	char					*mmap;		/* read-only map of user_events_status */
+	pthread_mutex_t				lock;		/* held while linking an event */
+	struct tracefs_user_event_internal	*events;	/* events registered in this group */
+};
+
 #endif /* _TRACE_FS_LOCAL_H */
diff --git a/include/tracefs.h b/include/tracefs.h
index 1848ad0..74241a9 100644
--- a/include/tracefs.h
+++ b/include/tracefs.h
@@ -571,4 +571,71 @@ struct tracefs_synth *tracefs_sql(struct tep_handle *tep, const char *name,
 struct tep_event *
 tracefs_synth_get_event(struct tep_handle *tep, struct tracefs_synth *synth);
 
+/* User events */
+enum tracefs_uevent_type {
+	TRACEFS_UEVENT_END,	/* Marks the end of an item array */
+	TRACEFS_UEVENT_u8,
+	TRACEFS_UEVENT_s8,
+	TRACEFS_UEVENT_u16,
+	TRACEFS_UEVENT_s16,
+	TRACEFS_UEVENT_u32,
+	TRACEFS_UEVENT_s32,
+	TRACEFS_UEVENT_u64,
+	TRACEFS_UEVENT_s64,
+	TRACEFS_UEVENT_string,	/* Fixed size char[len] */
+	TRACEFS_UEVENT_struct,	/* Sized struct, name must contain a space */
+	TRACEFS_UEVENT_varray,	/* Variable length u8[] (__rel_loc) */
+	TRACEFS_UEVENT_vstring,	/* Variable length char[] (__rel_loc) */
+};
+
+enum tracefs_uevent_flags {
+	/* None */
+	TRACEFS_UEVENT_FLAG_NONE = 0,
+
+	/* When BPF is attached, use iterator/no copy (":BPF_ITER" suffix) */
+	TRACEFS_UEVENT_FLAG_bpf_iter = 1 << 0,
+};
+
+struct tracefs_uevent_item {
+	/* Type of item */
+	enum tracefs_uevent_type	type;
+
+	/* Length of data (> 0 on record); register needs it for string/struct */
+	int len;
+
+	union {
+		/* Used during write */
+		const void		*data;
+
+		/* Used during register */
+		const char		*name;
+	};
+};
+
+struct tracefs_user_event {
+	unsigned int	size;		/* sizeof(struct tracefs_user_event) */
+	char		*enabled;	/* Status byte, non-zero when enabled */
+};
+
+struct tracefs_user_event_group;
+
+struct tracefs_user_event_group *tracefs_user_event_group_open(void);
+
+void tracefs_user_event_group_close(struct tracefs_user_event_group *group);
+
+int tracefs_user_event_delete(const char *name);
+
+struct tracefs_user_event *
+tracefs_user_event_register(struct tracefs_user_event_group *group,
+			    const char *name, enum tracefs_uevent_flags flags,
+			    struct tracefs_uevent_item *items);
+
+static inline bool tracefs_user_event_enabled(struct tracefs_user_event *event)
+{
+	return event ? *(volatile char *)event->enabled != 0 : false;
+}
+
+int tracefs_user_event_record(struct tracefs_user_event *event,
+			      struct tracefs_uevent_item *items);
+
 #endif /* _TRACE_FS_H */
diff --git a/src/Makefile b/src/Makefile
index e8afab5..984e8cf 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,6 +14,10 @@ OBJS += tracefs-filter.o
 OBJS += tracefs-dynevents.o
 OBJS += tracefs-eprobes.o
 
+ifeq ($(USEREVENTS_INSTALLED), 1)
+OBJS += tracefs-userevents.o
+endif
+
 # Order matters for the the three below
 OBJS += sqlhist-lex.o
 OBJS += sqlhist.tab.o
diff --git a/src/tracefs-userevents.c b/src/tracefs-userevents.c
new file mode 100644
index 0000000..ccd511b
--- /dev/null
+++ b/src/tracefs-userevents.c
@@ -0,0 +1,516 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Copyright (C) 2022 Microsoft Corporation.
+ *
+ * Authors:
+ *   Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <alloca.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <linux/user_events.h>
+
+#include "tracefs.h"
+#include "tracefs-local.h"
+
+#define STAT_FILE "user_events_status"
+#define DATA_FILE "user_events_data"
+
+static void free_user_events(struct tracefs_user_event_internal *event)
+{
+	struct tracefs_user_event_internal *cur;
+
+	/* Step to the next node before the current one is released */
+	for (cur = event; cur; cur = event) {
+		event = cur->next;
+		free(cur);
+	}
+}
+
+/*
+ * append_field - Append the description of one item to a register command
+ * @item: Item to describe, string and struct items need a positive len
+ * @seq: Sequence that holds the command being built
+ * @index: Position of @item, any item but the first gets a ';' separator
+ *
+ * Returns 0 on success, or -1 with errno set to EINVAL (bad name or len)
+ * or ENOENT (unknown type).
+ */
+static int append_field(struct tracefs_uevent_item *item, struct trace_seq *seq,
+			int index)
+{
+	static const char * const ints[] = {
+		[TRACEFS_UEVENT_u8]	= "u8",
+		[TRACEFS_UEVENT_s8]	= "s8",
+		[TRACEFS_UEVENT_u16]	= "u16",
+		[TRACEFS_UEVENT_s16]	= "s16",
+		[TRACEFS_UEVENT_u32]	= "u32",
+		[TRACEFS_UEVENT_s32]	= "s32",
+		[TRACEFS_UEVENT_u64]	= "u64",
+		[TRACEFS_UEVENT_s64]	= "s64",
+	};
+
+	/* Every field is named, a NULL name cannot be printed or scanned */
+	if (!item->name) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (index != 0)
+		trace_seq_printf(seq, ";");
+
+	switch (item->type) {
+	case TRACEFS_UEVENT_u8:
+	case TRACEFS_UEVENT_s8:
+	case TRACEFS_UEVENT_u16:
+	case TRACEFS_UEVENT_s16:
+	case TRACEFS_UEVENT_u32:
+	case TRACEFS_UEVENT_s32:
+	case TRACEFS_UEVENT_u64:
+	case TRACEFS_UEVENT_s64:
+		trace_seq_printf(seq, " %s %s", ints[item->type], item->name);
+		break;
+
+	case TRACEFS_UEVENT_string:
+		if (item->len <= 0) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		trace_seq_printf(seq, " char[%d] %s", item->len, item->name);
+		break;
+
+	case TRACEFS_UEVENT_struct:
+		/*
+		 * struct must have 2 strings, do simple check
+		 * in user, kernel will fully validate
+		 */
+		if (!strchr(item->name, ' ') || item->len <= 0) {
+			errno = EINVAL;
+			return -1;
+		}
+
+		trace_seq_printf(seq, " struct %s %d", item->name, item->len);
+		break;
+
+	case TRACEFS_UEVENT_varray:
+		/* Variable length array */
+		trace_seq_printf(seq, " __rel_loc u8[] %s", item->name);
+		break;
+
+	case TRACEFS_UEVENT_vstring:
+		/* Variable length string */
+		trace_seq_printf(seq, " __rel_loc char[] %s", item->name);
+		break;
+
+	default:
+		/* Unknown */
+		errno = ENOENT;
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Build "<name>[:BPF_ITER] <field>; <field>..." in @seq, 0 or -1 with errno */
+static int create_reg_cmd(const char *name, enum tracefs_uevent_flags flags,
+			  struct tracefs_uevent_item *item, struct trace_seq *seq)
+{
+	int pos, err;
+
+	trace_seq_printf(seq, "%s", name);
+
+	if (flags & TRACEFS_UEVENT_FLAG_bpf_iter)
+		trace_seq_printf(seq, ":BPF_ITER");
+
+	/* One field per item, stop at the END marker */
+	for (pos = 0; item[pos].type != TRACEFS_UEVENT_END; pos++) {
+		err = append_field(&item[pos], seq, pos);
+		if (err < 0)
+			return err;
+	}
+
+	trace_seq_terminate(seq);
+
+	/* Any error state recorded by the sequence is reported as ENOMEM */
+	if (seq->state != 0) {
+		errno = ENOMEM;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill @event with what a record needs: bytes of static data (len), number
+ * of rel loc items (rels) and iovecs for the static data (iovecs).
+ */
+static int get_write_counts(struct tracefs_user_event_internal *event,
+			    struct tracefs_uevent_item *item)
+{
+	int size;
+
+	event->rels = 0;
+	event->len = 0;
+
+	/* Start at 1, need iovec for write_index */
+	event->iovecs = 1;
+
+	for (; item->type != TRACEFS_UEVENT_END; item++) {
+		switch (item->type) {
+		case TRACEFS_UEVENT_u8:
+		case TRACEFS_UEVENT_s8:
+			size = sizeof(__u8);
+			break;
+		case TRACEFS_UEVENT_u16:
+		case TRACEFS_UEVENT_s16:
+			size = sizeof(__u16);
+			break;
+		case TRACEFS_UEVENT_u32:
+		case TRACEFS_UEVENT_s32:
+			size = sizeof(__u32);
+			break;
+		case TRACEFS_UEVENT_u64:
+		case TRACEFS_UEVENT_s64:
+			size = sizeof(__u64);
+			break;
+		case TRACEFS_UEVENT_string:
+		case TRACEFS_UEVENT_struct:
+			size = item->len;
+			break;
+		case TRACEFS_UEVENT_varray:
+		case TRACEFS_UEVENT_vstring:
+			/* Requires a rel loc entry */
+			size = sizeof(__u32);
+			event->rels++;
+			break;
+		default:
+			/* Unknown */
+			errno = ENOENT;
+			return -1;
+		}
+
+		event->len += size;
+		event->iovecs++;
+	}
+
+	return 0;
+}
+
+/**
+ * tracefs_user_event_group_open - Opens a new group to use for user events
+ *
+ * Returns a pointer to a group to use for user events, or NULL on error. The
+ * pointer is valid until tracefs_user_event_group_close() is called.
+ */
+struct tracefs_user_event_group *tracefs_user_event_group_open(void)
+{
+	int stat, write, page_size, i, err;
+	struct tracefs_user_event_group *group;
+
+	stat = tracefs_instance_file_open(NULL, STAT_FILE, O_RDWR);
+	if (stat < 0)
+		return NULL;
+
+	write = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR);
+	if (write < 0)
+		goto put_stat;
+
+	group = malloc(sizeof(*group));
+	if (!group)
+		goto put_write;
+
+	/* pthread calls return the error number and leave errno alone */
+	err = pthread_mutex_init(&group->lock, NULL);
+	if (err) {
+		errno = err;
+		goto put_group;
+	}
+
+	/* Scale up to 16-bit max user events a page at a time */
+	page_size = sysconf(_SC_PAGESIZE);
+	group->mmap_len = page_size;
+
+	for (i = 0; i < 16; ++i) {
+		group->mmap = mmap(NULL, group->mmap_len,
+				   PROT_READ, MAP_SHARED, stat, 0);
+		if (group->mmap == MAP_FAILED && errno == EINVAL) {
+			/* Increase by page size and try again */
+			group->mmap_len += page_size;
+			continue;
+		}
+		break;
+	}
+
+	if (group->mmap == MAP_FAILED)
+		goto put_lock;
+
+	group->fd = write;
+	group->events = NULL;
+
+	/* Status fd no longer needed */
+	close(stat);
+
+	return group;
+
+put_lock:
+	pthread_mutex_destroy(&group->lock);
+put_group:
+	free(group);
+put_write:
+	close(write);
+put_stat:
+	close(stat);
+
+	return NULL;
+}
+
+/**
+ * tracefs_user_event_delete - Deletes a user event from the system
+ * @name: Name of the event to delete
+ *
+ * Deletes the event from the system if it is not used. Returns the result
+ * of the delete ioctl(), or the negative value from opening the data file
+ * when that fails.
+ */
+int tracefs_user_event_delete(const char *name)
+{
+	int fd, rc;
+
+	fd = tracefs_instance_file_open(NULL, DATA_FILE, O_RDWR);
+	if (fd < 0)
+		return fd;
+
+	rc = ioctl(fd, DIAG_IOCSDEL, name);
+	close(fd);
+
+	return rc;
+}
+
+/**
+ * tracefs_user_event_group_close - Closes a group containing user events
+ * @group: Group to close
+ *
+ * Closes a group and all the user events within it. Any user event that has
+ * been added to the group is no longer valid and cannot be used.
+ */
+void tracefs_user_event_group_close(struct tracefs_user_event_group *group)
+{
+	if (!group)
+		return;
+
+	if (group->mmap != MAP_FAILED)
+		munmap(group->mmap, group->mmap_len);
+	if (group->fd != -1)
+		close(group->fd);
+
+	free_user_events(group->events);
+	pthread_mutex_destroy(&group->lock);	/* set up when the group was opened */
+	free(group);
+}
+
+/**
+ * tracefs_user_event_register - Registers a user event with the system
+ * @group: Group to add the user event to
+ * @name: Name of the event to register
+ * @flags: Flags to use
+ * @items: Array of items that the event contains
+ *
+ * Allocates and registers a user event with the system. The user event will be
+ * added to the @group. The lifetime of the event is bound to the @group. When
+ * the @group is closed via tracefs_user_event_group_close() the event will no
+ * longer exist and should not be used.
+ *
+ * The @items are processed in order and the final item type must be set to
+ * TRACEFS_UEVENT_END to mark the last item. Each item must have the type
+ * and name defined. The string and struct type also require the len to be set
+ * for the item.
+ *
+ * Return a pointer to a user event on success, or NULL on error.
+ *
+ * errno will be set to EINVAL for a null argument or unexpected @items.
+ */
+struct tracefs_user_event *
+tracefs_user_event_register(struct tracefs_user_event_group *group,
+			    const char *name, enum tracefs_uevent_flags flags,
+			    struct tracefs_uevent_item *items)
+{
+	struct tracefs_user_event_internal *event = NULL;
+	struct user_reg reg = {0};
+	struct trace_seq seq;
+
+	if (!group || !name || !items) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	trace_seq_init(&seq);
+
+	/* Populate cmd, from here on the sequence has to be destroyed */
+	if (create_reg_cmd(name, flags, items, &seq))
+		goto put_seq;
+
+	event = malloc(sizeof(*event));
+	if (!event)
+		goto put_seq;
+
+	reg.size = sizeof(reg);
+	/* Cast via unsigned long so a 32-bit pointer is not sign extended */
+	reg.name_args = (__u64)(unsigned long)seq.buffer;
+
+	/* Register event with kernel */
+	if (ioctl(group->fd, DIAG_IOCSREG, &reg) == -1)
+		goto put_event;
+
+	/* Sanity check bounds returned */
+	if (reg.status_index >= group->mmap_len) {
+		errno = EINVAL;
+		goto put_event;
+	}
+
+	if (get_write_counts(event, items))
+		goto put_event;
+
+	/* Keep track of user view at this point in time */
+	event->event_external.size = sizeof(event->event_external);
+	event->event_external.enabled = &group->mmap[reg.status_index];
+
+	event->write_index = reg.write_index;
+	event->group = group;
+
+	/* Add event into the group under lock */
+	pthread_mutex_lock(&group->lock);
+	event->next = group->events;
+	group->events = event;
+	pthread_mutex_unlock(&group->lock);
+
+	trace_seq_destroy(&seq);
+
+	return &event->event_external;
+put_event:
+	free(event);
+put_seq:
+	trace_seq_destroy(&seq);
+
+	return NULL;
+}
+
+/**
+ * tracefs_user_event_record - Records an event with data
+ * @event: User event to record data about
+ * @items: Items to write for the event
+ *
+ * Records items for the event. Callers should check if the cost of recording
+ * should be performed by calling tracefs_user_event_enabled(). Items are
+ * checked to ensure they fit within the described items during register. Each
+ * item must specify the length of the item being recorded.
+ *
+ * Return the number of bytes recorded or -1 upon error.
+ *
+ * errno will be set to EINVAL if @event or @items is null, @items contains an
+ * item with a length of less than or equal to 0, or @items stops short of the
+ * registered items for an event that has variable length items. errno will be
+ * set to E2BIG if @items contains more items than registered for the event.
+ */
+int tracefs_user_event_record(struct tracefs_user_event *event,
+			      struct tracefs_uevent_item *items)
+{
+	struct tracefs_user_event_internal *e;
+	struct iovec *head, *io, *relio, *static_end, *io_end;
+	__u32 *rel, *rel_end;
+	int len, rel_offset, data_offset, used;
+
+	if (!event || !items) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	e = (struct tracefs_user_event_internal *)event;
+	head = io = alloca(sizeof(*io) * (e->iovecs + e->rels));
+	rel = alloca(sizeof(*rel) * e->rels);
+
+	static_end = head + e->iovecs;
+	io_end = static_end + e->rels;
+	rel_end = rel + e->rels;
+
+	/* Relative offset starts at end of static data */
+	relio = static_end;
+	rel_offset = e->len;
+	data_offset = 0;
+
+	/* Write index must be first */
+	io->iov_base = &e->write_index;
+	io->iov_len = sizeof(e->write_index);
+	io++;
+	used = 1;
+
+	while (items->type != TRACEFS_UEVENT_END) {
+		len = items->len;
+		if (len <= 0)
+			goto bad_length;
+		/* Each item takes one static iovec, never spill into rel data */
+		if (io >= static_end)
+			goto bad_count;
+
+		switch (items->type) {
+		case TRACEFS_UEVENT_varray:
+		case TRACEFS_UEVENT_vstring:
+			/* Dual vectors */
+			used += 2;
+			if (rel >= rel_end || relio >= io_end)
+				goto bad_count;
+
+			/* __rel_loc types */
+			relio->iov_base = (void *)items->data;
+			relio->iov_len = len;
+			relio++;
+			io->iov_base = (void *)rel;
+			io->iov_len = sizeof(*rel);
+			io++;
+			rel_offset -= sizeof(*rel);
+			/* Fill in rel loc data */
+			*rel = DYN_LOC(rel_offset + data_offset, len);
+			data_offset += len;
+			rel++;
+			break;
+
+		default:
+			/* Single vector */
+			used++;
+			/* Direct types */
+			io->iov_base = (void *)items->data;
+			io->iov_len = len;
+			io++;
+			rel_offset -= len;
+			break;
+		}
+
+		items++;
+	}
+
+	/* Fewer items than registered would put unset iovecs before rel data */
+	if (relio != static_end && io != static_end) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	return writev(e->group->fd, head, used);
+
+bad_length:
+	fprintf(stderr, "Bad user_event item length at index %d\n",
+		(int)(io - head) - 1);
+	errno = EINVAL;
+	return -1;
+
+bad_count:
+	fprintf(stderr, "Too many user_event items passed\n");
+	errno = E2BIG;
+	return -1;
+}
-- 
2.17.1


  reply	other threads:[~2022-02-22 23:23 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-02-22 23:23 [PATCH v2 0/3] libtracefs: Add APIs for user_events to libtracefs Beau Belgrave
2022-02-22 23:23 ` Beau Belgrave [this message]
2022-02-23  3:29   ` [PATCH v2 1/3] libtracefs: Add user_events to libtracefs sources Steven Rostedt
2022-02-22 23:23 ` [PATCH v2 2/3] libtracefs: Add documentation and sample code for user_events Beau Belgrave
2022-02-22 23:23 ` [PATCH v2 3/3] libtracefs: Add unit tests " Beau Belgrave
2022-02-23 15:17   ` Steven Rostedt
2022-02-23 17:25     ` Beau Belgrave

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220222232316.14640-2-beaub@linux.microsoft.com \
    --to=beaub@linux.microsoft.com \
    --cc=linux-trace-devel@vger.kernel.org \
    --cc=rostedt@goodmis.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.