From: Cong Wang <xiyou.wangcong@gmail.com>
To: netdev@vger.kernel.org
Cc: bpf@vger.kernel.org, John Fastabend <john.fastabend@gmail.com>,
Jakub Sitnicki <jakub@cloudflare.com>,
Jiayuan Chen <jiayuan.chen@linux.dev>,
hemanthmalla@gmail.com, zijianzhang@bytedance.com,
Cong Wang <xiyou.wangcong@gmail.com>,
Cong Wang <cwang@multikernel.io>
Subject: [RFC PATCH bpf-next 3/5] selftests/bpf: add tcp_splice basic round-trip test
Date: Thu, 11 Jun 2026 18:14:50 -0700 [thread overview]
Message-ID: <20260612011452.134466-4-xiyou.wangcong@gmail.com> (raw)
In-Reply-To: <20260612011452.134466-1-xiyou.wangcong@gmail.com>
Loads a sock_ops BPF program that, on each ESTABLISHED callback,
inserts self into a sockhash keyed by the local 4-tuple, looks up
the peer using the swapped 4-tuple, and calls the new
bpf_sock_splice_pair kfunc on whichever peer it finds. Counters track
how many calls returned 0 (winner) vs -EEXIST (race loser) vs other
errors.
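Userspace creates a loopback TCP pair, waits for both ESTABLISHED
callbacks to land, then verifies pair_ok >= 1 and pair_other_err == 0.
A receiver thread blocks in recv() before the main thread sends; the
test asserts the bytes round-trip through the rendezvous data plane.
A second subtest, bidir_write, has both ends call send() before either
calls recv() and checks that both writes complete and that each banner
arrives intact on the other side.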
Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang@multikernel.io>
---
.../selftests/bpf/prog_tests/tcp_splice.c | 206 ++++++++++++++++++
.../selftests/bpf/progs/test_tcp_splice.c | 101 +++++++++
2 files changed, 307 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/tcp_splice.c
create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_splice.c
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_splice.c b/tools/testing/selftests/bpf/prog_tests/tcp_splice.c
new file mode 100644
index 000000000000..b80a1129c6aa
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_splice.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+#include "test_tcp_splice.skel.h"
+
+#include <pthread.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define MSG "hello rendezvous"
+#define CLIENT_BANNER "client-banner"
+#define SERVER_BANNER "server-banner"
+
+/* Input and results of one recv() call; filled in by recv_thread(). */
+struct recv_arg {
+ int fd; /* in: socket passed to recv() */
+ char buf[64]; /* out: received bytes; callers read at most sizeof(buf) - 1 */
+ int n; /* out: return value of recv() */
+ int err; /* out: errno value recorded by recv_thread() after recv() */
+};
+
+/* Thread body: perform one blocking recv() described by @p, which points
+ * to a struct recv_arg.
+ *
+ * The recv() return value is stored in ->n. ->err receives errno only
+ * when recv() failed and 0 otherwise: errno is not reset by a successful
+ * call, so sampling it unconditionally would record a stale value.
+ * At most sizeof(->buf) - 1 bytes are requested, which leaves room for
+ * the caller to NUL-terminate the buffer. Always returns NULL.
+ */
+static void *recv_thread(void *p)
+{
+	struct recv_arg *a = p;
+
+	a->n = recv(a->fd, a->buf, sizeof(a->buf) - 1, 0);
+	a->err = a->n < 0 ? errno : 0;
+	return NULL;
+}
+
+/* Input and results of one send() call; filled in by send_thread(). */
+struct send_arg {
+ int fd; /* in: socket passed to send() */
+ const char *buf; /* in: bytes to send */
+ size_t len; /* in: number of bytes of buf to send */
+ int n; /* out: return value of send() */
+ int err; /* out: errno value recorded by send_thread() after send() */
+};
+
+/* Thread body: perform one send() described by @p, which points to a
+ * struct send_arg.
+ *
+ * The send() return value is stored in ->n. ->err receives errno only
+ * when send() failed and 0 otherwise: errno is not reset by a successful
+ * call, so sampling it unconditionally would record a stale value.
+ * Always returns NULL.
+ */
+static void *send_thread(void *p)
+{
+	struct send_arg *a = p;
+
+	a->n = send(a->fd, a->buf, a->len, 0);
+	a->err = a->n < 0 ? errno : 0;
+	return NULL;
+}
+
+/* Basic round trip over one loopback TCP connection.
+ *
+ * Checks that the sock_ops program counted at least one successful
+ * pairing (pair_ok) and no unexpected errors (pair_other_err), then sends
+ * MSG from the client socket to a thread that is already calling recv()
+ * on the accepted socket, and compares what arrived.
+ *
+ * Returns 0 once the data exchange has been attempted and -1 if socket
+ * setup, the pair_ok check or thread creation failed. Length and content
+ * mismatches are reported through the ASSERT_* macros only. @cgroup_fd
+ * is not used in this function.
+ */
+static int run_basic(int cgroup_fd, struct test_tcp_splice *skel)
+{
+ pthread_t tid;
+ struct recv_arg a = {};
+ int sfd = -1, cfd = -1, lfd = -1;
+ int n, err = -1;
+
+ lfd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+ if (!ASSERT_GE(lfd, 0, "start_server"))
+ return -1;
+
+ cfd = connect_to_fd(lfd, 0);
+ if (!ASSERT_GE(cfd, 0, "connect_to_fd"))
+ goto out;
+
+ sfd = accept(lfd, NULL, NULL);
+ if (!ASSERT_GE(sfd, 0, "accept"))
+ goto out;
+
+ /* Give both ESTABLISHED sock_ops callbacks a moment to run. */
+ usleep(50 * 1000);
+
+ /* Without a successful pairing the data-path check below is moot. */
+ if (!ASSERT_GE(skel->bss->pair_ok, 1, "splice paired"))
+ goto out;
+ ASSERT_EQ(skel->bss->pair_other_err, 0, "no unexpected pair errors");
+
+ /* Drive the splice fast path: receiver enters recv() and publishes
+ * its bvec, sender then writes directly into it.
+ */
+ a.fd = sfd;
+ if (!ASSERT_OK(pthread_create(&tid, NULL, recv_thread, &a),
+ "pthread_create"))
+ goto out;
+ usleep(20 * 1000); /* let recv block */
+
+ n = send(cfd, MSG, strlen(MSG), 0);
+ ASSERT_EQ(n, (int)strlen(MSG), "send length");
+
+ /* recv_thread() writes into a, which lives on this stack frame. */
+ pthread_join(tid, NULL);
+ ASSERT_EQ(a.n, (int)strlen(MSG), "recv length");
+ a.buf[a.n > 0 ? a.n : 0] = 0; /* NUL-terminate; index 0 if recv() returned <= 0 */
+ ASSERT_STREQ(a.buf, MSG, "recv contents");
+
+ err = 0;
+out:
+ if (cfd >= 0)
+ close(cfd);
+ if (sfd >= 0)
+ close(sfd);
+ if (lfd >= 0)
+ close(lfd);
+ return err;
+}
+
+/* Bidirectional-write deadlock-avoidance test.
+ *
+ * Both sides issue send() before either calls recv(), the classic
+ * pattern that used to deadlock under synchronous rendezvous (and
+ * the actual cause of "kex_exchange_identification: write: Broken
+ * pipe" with SSH on loopback). The bounded-wait fallback in
+ * tcp_bpf_splice_sendmsg() must let both writes complete via the
+ * normal TCP path within ~1 ms, and the banners must arrive intact
+ * on the other side when recv() is called next.
+ *
+ * Returns 0 once both exchanges have been attempted and -1 if socket
+ * setup or thread creation failed; length and content mismatches are
+ * reported through the ASSERT_* macros only. @cgroup_fd and @skel are
+ * not used in this function.
+ */
+static int run_bidir_write(int cgroup_fd, struct test_tcp_splice *skel)
+{
+	pthread_t client_send_tid, server_send_tid;
+	struct send_arg cs = { .buf = CLIENT_BANNER,
+			       .len = sizeof(CLIENT_BANNER) - 1 };
+	struct send_arg ss = { .buf = SERVER_BANNER,
+			       .len = sizeof(SERVER_BANNER) - 1 };
+	struct recv_arg cr = {}, sr = {};
+	int sfd = -1, cfd = -1, lfd = -1;
+	int err = -1;
+
+	lfd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+	if (!ASSERT_GE(lfd, 0, "start_server"))
+		return -1;
+	cfd = connect_to_fd(lfd, 0);
+	if (!ASSERT_GE(cfd, 0, "connect_to_fd"))
+		goto out;
+	sfd = accept(lfd, NULL, NULL);
+	if (!ASSERT_GE(sfd, 0, "accept"))
+		goto out;
+
+	usleep(50 * 1000); /* let pair complete */
+
+	/* Both sides write first, neither reads yet. Both must return
+	 * within bounded time (no deadlock).
+	 */
+	cs.fd = cfd;
+	ss.fd = sfd;
+	if (!ASSERT_OK(pthread_create(&client_send_tid, NULL, send_thread, &cs),
+		       "client send thread"))
+		goto out;
+	if (!ASSERT_OK(pthread_create(&server_send_tid, NULL, send_thread, &ss),
+		       "server send thread")) {
+		/* The client sender is already running and writes into cs,
+		 * which lives on this stack frame, using cfd. Reap it
+		 * before the frame is torn down and the fds are closed.
+		 */
+		pthread_join(client_send_tid, NULL);
+		goto out;
+	}
+
+	pthread_join(client_send_tid, NULL);
+	pthread_join(server_send_tid, NULL);
+	ASSERT_EQ(cs.n, (int)cs.len, "client send length");
+	ASSERT_EQ(ss.n, (int)ss.len, "server send length");
+
+	/* Now read on each side - the bytes the peer wrote should have
+	 * landed via the TCP fallback path.
+	 */
+	cr.fd = cfd;
+	cr.n = recv(cr.fd, cr.buf, sizeof(cr.buf) - 1, 0);
+	ASSERT_EQ(cr.n, (int)ss.len, "client recv length");
+	cr.buf[cr.n > 0 ? cr.n : 0] = 0; /* NUL-terminate even if recv() failed */
+	ASSERT_STREQ(cr.buf, SERVER_BANNER, "client got server banner");
+
+	sr.fd = sfd;
+	sr.n = recv(sr.fd, sr.buf, sizeof(sr.buf) - 1, 0);
+	ASSERT_EQ(sr.n, (int)cs.len, "server recv length");
+	sr.buf[sr.n > 0 ? sr.n : 0] = 0; /* NUL-terminate even if recv() failed */
+	ASSERT_STREQ(sr.buf, CLIENT_BANNER, "server got client banner");
+
+	err = 0;
+out:
+	if (cfd >= 0)
+		close(cfd);
+	if (sfd >= 0)
+		close(sfd);
+	if (lfd >= 0)
+		close(lfd);
+	return err;
+}
+
+/* Test entry point: join a scratch cgroup, load the skeleton, attach the
+ * sock_ops program to that cgroup and run the "basic" and "bidir_write"
+ * subtests.
+ *
+ * The program is attached through a bpf_link stored in the skeleton, so
+ * test_tcp_splice__destroy() also detaches it. A bare bpf_prog_attach()
+ * has no matching detach here and would leave the program attached to
+ * the cgroup after the skeleton has been destroyed.
+ */
+void test_tcp_splice(void)
+{
+	struct test_tcp_splice *skel;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup("/tcp_splice");
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	skel = test_tcp_splice__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_load"))
+		goto close_cgroup;
+
+	skel->links.sockops_splice =
+		bpf_program__attach_cgroup(skel->progs.sockops_splice,
+					   cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links.sockops_splice, "attach sockops"))
+		goto destroy_skel;
+
+	/* Subtest results are reported via ASSERT_*; return codes unused. */
+	if (test__start_subtest("basic"))
+		run_basic(cgroup_fd, skel);
+	if (test__start_subtest("bidir_write"))
+		run_bidir_write(cgroup_fd, skel);
+
+destroy_skel:
+	test_tcp_splice__destroy(skel);
+close_cgroup:
+	close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_tcp_splice.c b/tools/testing/selftests/bpf/progs/test_tcp_splice.c
new file mode 100644
index 000000000000..09c7f0f9e311
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tcp_splice.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Sock_ops BPF program that pairs locally-connected TCP sockets via the
+ * bpf_sock_splice_pair kfunc. Each side of an established loopback
+ * connection inserts itself into a sockhash keyed by its 4-tuple and
+ * looks up the peer using the swapped tuple. Whichever side finds the
+ * peer attempts to splice; the race loser sees -EEXIST.
+ */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* Sockhash key: an IPv4 address/port 4-tuple as filled in by mk_key(). */
+struct flow_key {
+ __u32 saddr; /* local_ip4, or remote_ip4 when mk_key() swaps */
+ __u32 daddr; /* remote_ip4, or local_ip4 when mk_key() swaps */
+ __u16 sport; /* local port, or remote port when mk_key() swaps */
+ __u16 dport; /* remote port, or local port when mk_key() swaps */
+};
+
+/* Sockhash keyed by struct flow_key. sockops_splice() inserts each
+ * established socket under its own key and looks the peer up under the
+ * swapped key. Holds at most 16 entries.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(max_entries, 16);
+ __type(key, struct flow_key);
+ __type(value, __u64);
+} rendezvous SEC(".maps");
+
+/* Kernel functions declared as __ksym; both are called from sockops_splice(). */
+int bpf_sock_splice_pair(struct sock *peer, struct bpf_sock_ops_kern *skops) __ksym;
+void *bpf_cast_to_kern_ctx(void *obj) __ksym;
+
+/* Result counters, read by the userspace test through skel->bss. */
+__u32 pair_ok; /* bpf_sock_splice_pair() calls that returned 0 */
+__u32 pair_other_err; /* calls that returned non-zero other than -17 (-EEXIST) */
+
+/* Build a rendezvous key from the socket's IPv4 4-tuple.
+ *
+ * With @swap == 0 the key reads local -> remote; with @swap != 0 source
+ * and destination are exchanged.
+ *
+ * IPv4 only: the verifier doesn't accept memcpy from sock_ops ctx
+ * because it lowers to "ctx + reg" pointer arithmetic. IPv6 support
+ * would need explicit field-by-field reads of local_ip6[i] /
+ * remote_ip6[i] at constant indices.
+ */
+static __always_inline void mk_key(struct bpf_sock_ops *s,
+				   struct flow_key *k, int swap)
+{
+	/* skops->local_port is already in host byte order. skops->remote_port
+	 * is laid out as the network-order 16-bit port in the upper half of
+	 * a u32 (see sock_ops_convert_ctx_access); bpf_ntohl produces the
+	 * host-order port directly - no further shift.
+	 */
+	__u16 local_port = (__u16)s->local_port;
+	__u16 remote_port = bpf_ntohl(s->remote_port);
+	__u32 local_addr = s->local_ip4;
+	__u32 remote_addr = s->remote_ip4;
+
+	k->saddr = swap ? remote_addr : local_addr;
+	k->daddr = swap ? local_addr : remote_addr;
+	k->sport = swap ? remote_port : local_port;
+	k->dport = swap ? local_port : remote_port;
+}
+
+/* sock_ops program: on an ACTIVE or PASSIVE ESTABLISHED callback for an
+ * AF_INET socket, publish this socket in the rendezvous sockhash, look
+ * for the peer under the swapped key and, if it is there, try to pair
+ * the two with bpf_sock_splice_pair(). Outcomes are tallied in pair_ok
+ * and pair_other_err. Always returns 0.
+ */
+SEC("sockops")
+int sockops_splice(struct bpf_sock_ops *skops)
+{
+	enum {
+		FAMILY_INET = 2,	/* AF_INET */
+		ERR_EXIST = 17,		/* EEXIST */
+	};
+	struct flow_key mine, theirs;
+	struct bpf_sock *peer_sk;
+	int rc;
+
+	switch (skops->op) {
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+		break;
+	default:
+		return 0;
+	}
+	if (skops->family != FAMILY_INET)
+		return 0;
+
+	mk_key(skops, &mine, 0);
+	mk_key(skops, &theirs, 1);
+
+	/* BPF_ANY: a reused 4-tuple after close (e.g. fast reconnect) must
+	 * overwrite the stale entry rather than silently fail.
+	 */
+	bpf_sock_hash_update(skops, &rendezvous, &mine, BPF_ANY);
+
+	peer_sk = bpf_map_lookup_elem(&rendezvous, &theirs);
+	if (!peer_sk)
+		return 0;
+
+	/* The sockhash bpf_map_lookup_elem above is an acquire, so @peer_sk
+	 * carries a reference. A sock_ops program cannot call
+	 * bpf_sk_release, so the reference is handed to bpf_sock_splice_pair
+	 * which is KF_RELEASE and consumes it - no explicit release here,
+	 * and none is possible from this program type.
+	 */
+	rc = bpf_sock_splice_pair((struct sock *)peer_sk,
+				  bpf_cast_to_kern_ctx(skops));
+	if (!rc) {
+		__sync_fetch_and_add(&pair_ok, 1);
+		return 0;
+	}
+	/* -EEXIST means the peer won the race and already paired us. */
+	if (rc != -ERR_EXIST)
+		__sync_fetch_and_add(&pair_other_err, 1);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.43.0
next prev parent reply other threads:[~2026-06-12 1:15 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-12 1:14 [RFC PATCH bpf-next 0/5] tcp: opportunistic loopback splice for BPF-paired sockets Cong Wang
2026-06-12 1:14 ` [RFC PATCH bpf-next 1/5] tcp_bpf: add bpf_sock_splice_pair kfunc for opportunistic loopback splice Cong Wang
2026-06-12 2:10 ` bot+bpf-ci
2026-06-12 1:14 ` [RFC PATCH bpf-next 2/5] tcp_bpf: busy-poll the splice ring before parking the receiver Cong Wang
2026-06-12 1:14 ` Cong Wang [this message]
2026-06-12 1:14 ` [RFC PATCH bpf-next 4/5] bpf: allow SO_BUSY_POLL in bpf_setsockopt() Cong Wang
2026-06-12 1:14 ` [RFC PATCH bpf-next 5/5] selftests/bpf: set SO_BUSY_POLL from the tcp_splice sockops prog Cong Wang
2026-06-12 16:01 ` [RFC PATCH bpf-next 0/5] tcp: opportunistic loopback splice for BPF-paired sockets Alexei Starovoitov
2026-06-12 18:12 ` Cong Wang
2026-06-12 18:34 ` Alexei Starovoitov
2026-06-12 20:17 ` Cong Wang
2026-06-12 22:10 ` [syzbot ci] " syzbot ci
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260612011452.134466-4-xiyou.wangcong@gmail.com \
--to=xiyou.wangcong@gmail.com \
--cc=bpf@vger.kernel.org \
--cc=cwang@multikernel.io \
--cc=hemanthmalla@gmail.com \
--cc=jakub@cloudflare.com \
--cc=jiayuan.chen@linux.dev \
--cc=john.fastabend@gmail.com \
--cc=netdev@vger.kernel.org \
--cc=zijianzhang@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox