From: David Vernet <void@manifault.com>
To: bpf@vger.kernel.org
Cc: ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
john.fastabend@gmail.com, martin.lau@linux.dev, song@kernel.org,
yhs@fb.com, kpsingh@kernel.org, sdf@google.com,
haoluo@google.com, jolsa@kernel.org, tj@kernel.org,
joannelkoong@gmail.com, linux-kernel@vger.kernel.org,
Kernel-team@fb.com
Subject: [PATCH 4/5] bpf: Add libbpf logic for user-space ring buffer
Date: Mon, 8 Aug 2022 08:53:40 -0700 [thread overview]
Message-ID: <20220808155341.2479054-4-void@manifault.com> (raw)
In-Reply-To: <20220808155341.2479054-1-void@manifault.com>
Now that all of the logic is in place in the kernel to support user-space
produced ringbuffers, we can add the user-space logic to libbpf.
Signed-off-by: David Vernet <void@manifault.com>
---
kernel/bpf/ringbuf.c | 7 +-
tools/lib/bpf/libbpf.c | 10 +-
tools/lib/bpf/libbpf.h | 19 +++
tools/lib/bpf/libbpf.map | 6 +
tools/lib/bpf/libbpf_probes.c | 1 +
tools/lib/bpf/ringbuf.c | 216 ++++++++++++++++++++++++++++++++++
6 files changed, 256 insertions(+), 3 deletions(-)
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index fc589fc8eb7c..a10558e79ec8 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -639,7 +639,9 @@ static int __bpf_user_ringbuf_poll(struct bpf_ringbuf *rb, void **sample,
if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
return -EBUSY;
- /* Synchronizes with smp_store_release() in user-space. */
+ /* Synchronizes with smp_store_release() in __ring_buffer_user__commit()
+ * in user-space.
+ */
prod_pos = smp_load_acquire(&rb->producer_pos);
/* Synchronizes with smp_store_release() in
* __bpf_user_ringbuf_sample_release().
@@ -695,6 +697,9 @@ __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size,
/* To release the ringbuffer, just increment the producer position to
* signal that a new sample can be consumed. The busy bit is cleared by
* userspace when posting a new sample to the ringbuffer.
+ *
+ * Synchronizes with smp_load_acquire() in ring_buffer_user__reserve()
+ * in user-space.
*/
smp_store_release(&rb->consumer_pos, rb->consumer_pos + size +
BPF_RINGBUF_HDR_SZ);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 9c1f2d09f44e..f7fe09dce643 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2367,6 +2367,12 @@ static size_t adjust_ringbuf_sz(size_t sz)
return sz;
}
+static bool map_is_ringbuf(const struct bpf_map *map)
+{
+ return map->def.type == BPF_MAP_TYPE_RINGBUF ||
+ map->def.type == BPF_MAP_TYPE_USER_RINGBUF;
+}
+
static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def)
{
map->def.type = def->map_type;
@@ -2381,7 +2387,7 @@ static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def
map->btf_value_type_id = def->value_type_id;
/* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */
- if (map->def.type == BPF_MAP_TYPE_RINGBUF)
+ if (map_is_ringbuf(map))
map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries);
if (def->parts & MAP_DEF_MAP_TYPE)
@@ -4363,7 +4369,7 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries)
map->def.max_entries = max_entries;
/* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */
- if (map->def.type == BPF_MAP_TYPE_RINGBUF)
+ if (map_is_ringbuf(map))
map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries);
return 0;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 61493c4cddac..6d1d0539b08d 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1009,6 +1009,7 @@ LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook,
/* Ring buffer APIs */
struct ring_buffer;
+struct ring_buffer_user;
typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size);
@@ -1028,6 +1029,24 @@ LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms);
LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb);
LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb);
+struct ring_buffer_user_opts {
+ size_t sz; /* size of this struct, for forward/backward compatibility */
+};
+
+#define ring_buffer_user_opts__last_field sz
+
+LIBBPF_API struct ring_buffer_user *
+ring_buffer_user__new(int map_fd, const struct ring_buffer_user_opts *opts);
+LIBBPF_API void *ring_buffer_user__reserve(struct ring_buffer_user *rb,
+ uint32_t size);
+LIBBPF_API void *ring_buffer_user__poll(struct ring_buffer_user *rb,
+ uint32_t size, int timeout_ms);
+LIBBPF_API void ring_buffer_user__submit(struct ring_buffer_user *rb,
+ void *sample);
+LIBBPF_API void ring_buffer_user__discard(struct ring_buffer_user *rb,
+ void *sample);
+LIBBPF_API void ring_buffer_user__free(struct ring_buffer_user *rb);
+
/* Perf buffer APIs */
struct perf_buffer;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 119e6e1ea7f1..8db11040df1b 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -365,4 +365,10 @@ LIBBPF_1.0.0 {
libbpf_bpf_map_type_str;
libbpf_bpf_prog_type_str;
perf_buffer__buffer;
+ ring_buffer_user__discard;
+ ring_buffer_user__free;
+ ring_buffer_user__new;
+ ring_buffer_user__poll;
+ ring_buffer_user__reserve;
+ ring_buffer_user__submit;
};
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 6d495656f554..f3a8e8e74eb8 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -231,6 +231,7 @@ static int probe_map_create(enum bpf_map_type map_type)
return btf_fd;
break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_USER_RINGBUF:
key_size = 0;
value_size = 0;
max_entries = 4096;
diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
index 8bc117bcc7bc..86e3c11d8486 100644
--- a/tools/lib/bpf/ringbuf.c
+++ b/tools/lib/bpf/ringbuf.c
@@ -39,6 +39,17 @@ struct ring_buffer {
int ring_cnt;
};
+struct ring_buffer_user {
+ struct epoll_event event;
+ unsigned long *consumer_pos;
+ unsigned long *producer_pos;
+ void *data;
+ unsigned long mask;
+ size_t page_size;
+ int map_fd;
+ int epoll_fd;
+};
+
static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r)
{
if (r->consumer_pos) {
@@ -300,3 +311,208 @@ int ring_buffer__epoll_fd(const struct ring_buffer *rb)
{
return rb->epoll_fd;
}
+
+static void __user_ringbuf_unmap_ring(struct ring_buffer_user *rb)
+{
+ if (rb->consumer_pos) {
+ munmap(rb->consumer_pos, rb->page_size);
+ rb->consumer_pos = NULL;
+ }
+ if (rb->producer_pos) {
+ munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1));
+ rb->producer_pos = NULL;
+ }
+}
+
+void ring_buffer_user__free(struct ring_buffer_user *rb)
+{
+ if (!rb)
+ return;
+
+ __user_ringbuf_unmap_ring(rb);
+
+ if (rb->epoll_fd >= 0)
+ close(rb->epoll_fd);
+
+ free(rb);
+}
+
+static int __ring_buffer_user_map(struct ring_buffer_user *rb, int map_fd)
+{
+
+ struct bpf_map_info info;
+ __u32 len = sizeof(info);
+ void *tmp;
+ struct epoll_event *rb_epoll;
+ int err;
+
+ memset(&info, 0, sizeof(info));
+
+ err = bpf_obj_get_info_by_fd(map_fd, &info, &len);
+ if (err) {
+ err = -errno;
+ pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n",
+ map_fd, err);
+ return libbpf_err(err);
+ }
+
+ if (info.type != BPF_MAP_TYPE_USER_RINGBUF) {
+ pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n",
+ map_fd);
+ return libbpf_err(-EINVAL);
+ }
+
+ rb->map_fd = map_fd;
+ rb->mask = info.max_entries - 1;
+
+ /* Map read-only consumer page */
+ tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0);
+ if (tmp == MAP_FAILED) {
+ err = -errno;
+ pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
+ map_fd, err);
+ return libbpf_err(err);
+ }
+ rb->consumer_pos = tmp;
+
+ /* Map read-write the producer page and data pages. We map the data
+ * region as twice the total size of the ringbuffer to allow the simple
+ * reading and writing of samples that wrap around the end of the
+ * buffer. See the kernel implementation for details.
+ */
+ tmp = mmap(NULL, rb->page_size + 2 * info.max_entries,
+ PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, rb->page_size);
+ if (tmp == MAP_FAILED) {
+ err = -errno;
+ pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n",
+ map_fd, err);
+ return libbpf_err(err);
+ }
+
+ rb->producer_pos = tmp;
+ rb->data = tmp + rb->page_size;
+
+ rb_epoll = &rb->event;
+ rb_epoll->events = EPOLLOUT;
+ if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) {
+ err = -errno;
+ pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n",
+ map_fd, err);
+ return libbpf_err(err);
+ }
+
+ return 0;
+}
+
+struct ring_buffer_user *
+ring_buffer_user__new(int map_fd, const struct ring_buffer_user_opts *opts)
+{
+ struct ring_buffer_user *rb;
+ int err;
+
+ if (!OPTS_VALID(opts, ring_buffer_opts))
+ return errno = EINVAL, NULL;
+
+ rb = calloc(1, sizeof(*rb));
+ if (!rb)
+ return errno = ENOMEM, NULL;
+
+ rb->page_size = getpagesize();
+
+ rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+ if (rb->epoll_fd < 0) {
+ err = -errno;
+ pr_warn("user ringbuf: failed to create epoll instance: %d\n",
+ err);
+ goto err_out;
+ }
+
+ err = __ring_buffer_user_map(rb, map_fd);
+ if (err)
+ goto err_out;
+
+ return rb;
+
+err_out:
+ ring_buffer_user__free(rb);
+ return errno = -err, NULL;
+}
+
+static void __ring_buffer_user__commit(struct ring_buffer_user *rb)
+{
+ uint32_t *hdr;
+ uint32_t total_len;
+ unsigned long prod_pos;
+
+ prod_pos = *rb->producer_pos;
+ hdr = rb->data + (prod_pos & rb->mask);
+
+ total_len = *hdr + BPF_RINGBUF_HDR_SZ;
+
+ /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_poll() in
+ * the kernel.
+ */
+ smp_store_release(rb->producer_pos, prod_pos + total_len);
+}
+
+/* Discard a previously reserved sample into the ring buffer. Because the user
+ * ringbuffer is assumed to be single producer, this can simply be a no-op, and
+ * the producer pointer is left in the same place as when it was reserved.
+ */
+void ring_buffer_user__discard(struct ring_buffer_user *rb, void *sample)
+{}
+
+/* Submit a previously reserved sample into the ring buffer.
+ */
+void ring_buffer_user__submit(struct ring_buffer_user *rb, void *sample)
+{
+ __ring_buffer_user__commit(rb);
+}
+
+/* Reserve a pointer to a sample in the user ring buffer. This function is *not*
+ * thread safe, and the ring-buffer supports only a single producer.
+ */
+void *ring_buffer_user__reserve(struct ring_buffer_user *rb, uint32_t size)
+{
+ uint32_t *hdr;
+ /* 64-bit to avoid overflow in case of extreme application behavior */
+ size_t avail_size, total_size, max_size;
+ unsigned long cons_pos, prod_pos;
+
+ /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_poll() in
+ * the kernel.
+ */
+ cons_pos = smp_load_acquire(rb->consumer_pos);
+ /* Synchronizes with smp_store_release() in __ring_buffer_user__commit()
+ */
+ prod_pos = smp_load_acquire(rb->producer_pos);
+
+ max_size = rb->mask + 1;
+ avail_size = max_size - (prod_pos - cons_pos);
+ total_size = size + BPF_RINGBUF_HDR_SZ;
+
+ if (total_size > max_size || avail_size < total_size)
+ return NULL;
+
+ hdr = rb->data + (prod_pos & rb->mask);
+ *hdr = size;
+
+ /* Producer pos is updated when a sample is submitted. */
+
+ return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask);
+}
+
+/* Poll for available space in the ringbuffer, and reserve a record when it
+ * becomes available.
+ */
+void *ring_buffer_user__poll(struct ring_buffer_user *rb, uint32_t size,
+ int timeout_ms)
+{
+ int cnt;
+
+ cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, timeout_ms);
+ if (cnt < 0)
+ return NULL;
+
+ return ring_buffer_user__reserve(rb, size);
+}
--
2.30.2
next prev parent reply other threads:[~2022-08-08 15:54 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-08 15:53 [PATCH 1/5] bpf: Clear callee saved regs after updating REG0 David Vernet
2022-08-08 15:53 ` [PATCH 2/5] bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type David Vernet
2022-08-11 23:22 ` Andrii Nakryiko
2022-08-12 16:21 ` David Vernet
2022-08-11 23:29 ` Andrii Nakryiko
2022-08-12 16:23 ` David Vernet
2022-08-16 18:43 ` Andrii Nakryiko
2022-08-08 15:53 ` [PATCH 3/5] bpf: Add bpf_user_ringbuf_drain() helper David Vernet
2022-08-08 21:55 ` kernel test robot
2022-08-11 23:22 ` Andrii Nakryiko
2022-08-12 0:01 ` David Vernet
2022-08-12 0:46 ` David Vernet
2022-08-16 18:57 ` Andrii Nakryiko
2022-08-16 22:14 ` David Vernet
2022-08-16 22:59 ` Andrii Nakryiko
2022-08-17 20:24 ` David Vernet
2022-08-17 21:03 ` David Vernet
2022-08-08 15:53 ` David Vernet [this message]
2022-08-11 23:39 ` [PATCH 4/5] bpf: Add libbpf logic for user-space ring buffer Andrii Nakryiko
2022-08-12 17:28 ` David Vernet
2022-08-16 19:09 ` Andrii Nakryiko
2022-08-17 14:02 ` David Vernet
2022-08-18 3:05 ` David Vernet
2022-08-08 15:53 ` [PATCH 5/5] selftests/bpf: Add selftests validating the user ringbuf David Vernet
2022-08-08 18:14 ` [PATCH 1/5] bpf: Clear callee saved regs after updating REG0 Joanne Koong
2022-08-08 18:50 ` David Vernet
2022-08-08 23:32 ` Joanne Koong
2022-08-09 12:47 ` David Vernet
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220808155341.2479054-4-void@manifault.com \
--to=void@manifault.com \
--cc=Kernel-team@fb.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=haoluo@google.com \
--cc=joannelkoong@gmail.com \
--cc=john.fastabend@gmail.com \
--cc=jolsa@kernel.org \
--cc=kpsingh@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=martin.lau@linux.dev \
--cc=sdf@google.com \
--cc=song@kernel.org \
--cc=tj@kernel.org \
--cc=yhs@fb.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.