Netdev List
 help / color / mirror / Atom feed
From: Cong Wang <xiyou.wangcong@gmail.com>
To: netdev@vger.kernel.org
Cc: bpf@vger.kernel.org, John Fastabend <john.fastabend@gmail.com>,
	Jakub Sitnicki <jakub@cloudflare.com>,
	Jiayuan Chen <jiayuan.chen@linux.dev>,
	hemanthmalla@gmail.com, zijianzhang@bytedance.com,
	Cong Wang <xiyou.wangcong@gmail.com>,
	Cong Wang <cwang@multikernel.io>
Subject: [RFC PATCH bpf-next 3/5] selftests/bpf: add tcp_splice basic round-trip test
Date: Thu, 11 Jun 2026 18:14:50 -0700	[thread overview]
Message-ID: <20260612011452.134466-4-xiyou.wangcong@gmail.com> (raw)
In-Reply-To: <20260612011452.134466-1-xiyou.wangcong@gmail.com>

Loads a sock_ops BPF program that, on each ESTABLISHED callback,
inserts the socket itself into a sockhash keyed by its local 4-tuple,
looks up the peer using the swapped 4-tuple, and calls the new
bpf_sock_splice_pair kfunc on whichever peer it finds. Two counters
track how many calls returned 0 (winner) and how many failed with an
unexpected error; -EEXIST (race loser) is expected and is not counted.

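Userspace creates a loopback TCP pair, waits for both ESTABLISHED
callbacks to land, then verifies pair_ok >= 1 and pair_other_err == 0.
A receiver thread blocks in recv() before the main thread sends; the
test asserts the bytes round-trip through the rendezvous data plane.

A second subtest, bidir_write, has both ends call send() before either
calls recv() and checks that both writes complete and that each side
then receives the other's banner intact.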

Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang@multikernel.io>
---
 .../selftests/bpf/prog_tests/tcp_splice.c     | 206 ++++++++++++++++++
 .../selftests/bpf/progs/test_tcp_splice.c     | 101 +++++++++
 2 files changed, 307 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/tcp_splice.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_tcp_splice.c

diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_splice.c b/tools/testing/selftests/bpf/prog_tests/tcp_splice.c
new file mode 100644
index 000000000000..b80a1129c6aa
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_splice.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+#include "test_tcp_splice.skel.h"
+
+#include <pthread.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define MSG "hello rendezvous"
+#define CLIENT_BANNER "client-banner"
+#define SERVER_BANNER "server-banner"
+
+struct recv_arg { /* in/out state for recv_thread() */
+	int fd; /* in: socket to read from */
+	char buf[64]; /* out: received bytes; one byte is kept spare for a NUL */
+	int n; /* out: recv() return value */
+	int err; /* out: error state recorded after recv() (from errno) */
+};
+
+static void *recv_thread(void *p)
+{
+	struct recv_arg *a = p; /* pthread start routine; results go back via *a */
+
+	a->n = recv(a->fd, a->buf, sizeof(a->buf) - 1, 0); /* spare byte for a NUL */
+	a->err = a->n < 0 ? errno : 0; /* errno is stale unless recv() failed */
+	return NULL;
+}
+
+struct send_arg { /* in/out state for send_thread() */
+	int fd; /* in: socket to write to */
+	const char *buf; /* in: bytes to send */
+	size_t len; /* in: number of bytes at buf */
+	int n; /* out: send() return value */
+	int err; /* out: error state recorded after send() (from errno) */
+};
+
+static void *send_thread(void *p)
+{
+	struct send_arg *a = p; /* pthread start routine; results go back via *a */
+
+	a->n = send(a->fd, a->buf, a->len, 0); /* a single send(), no retry on short write */
+	a->err = a->n < 0 ? errno : 0; /* errno is stale unless send() failed */
+	return NULL;
+}
+
+/* Pair one loopback connection, then round-trip MSG from client to server.
+ * Returns 0 if the scenario ran to the end, -1 if setup or pairing failed.
+ */
+static int run_basic(int cgroup_fd, struct test_tcp_splice *skel)
+{
+	int sfd = -1, cfd = -1, lfd = -1;
+	struct recv_arg a = {};
+	int n, err = -1;
+	pthread_t tid;
+
+	lfd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+	if (!ASSERT_GE(lfd, 0, "start_server"))
+		return -1;
+
+	cfd = connect_to_fd(lfd, 0);
+	if (!ASSERT_GE(cfd, 0, "connect_to_fd"))
+		goto out;
+
+	sfd = accept(lfd, NULL, NULL);
+	if (!ASSERT_GE(sfd, 0, "accept"))
+		goto out;
+
+	/* Give both ESTABLISHED sock_ops callbacks a moment to run. */
+	usleep(50 * 1000);
+
+	if (!ASSERT_GE(skel->bss->pair_ok, 1, "splice paired"))
+		goto out;
+	ASSERT_EQ(skel->bss->pair_other_err, 0, "no unexpected pair errors");
+
+	/* Splice fast path: recv() publishes the receiver's bvec, send() writes into it. */
+	a.fd = sfd;
+	if (!ASSERT_OK(pthread_create(&tid, NULL, recv_thread, &a), "pthread_create"))
+		goto out;
+	usleep(20 * 1000); /* let recv block */
+
+	n = send(cfd, MSG, strlen(MSG), 0);
+	if (!ASSERT_EQ(n, (int)strlen(MSG), "send length"))
+		shutdown(sfd, SHUT_RDWR); /* no data is coming: wake recv_thread now */
+
+	pthread_join(tid, NULL);
+	ASSERT_EQ(a.n, (int)strlen(MSG), "recv length");
+	a.buf[a.n > 0 ? a.n : 0] = 0;
+	ASSERT_STREQ(a.buf, MSG, "recv contents");
+
+	err = 0;
+out:
+	if (cfd >= 0)
+		close(cfd);
+	if (sfd >= 0)
+		close(sfd);
+	close(lfd); /* always open here: a start_server() failure returns early */
+	return err;
+}
+
+/* Bidirectional-write deadlock-avoidance test.
+ *
+ * Both sides issue send() before either calls recv(), the classic
+ * pattern that used to deadlock under synchronous rendezvous (and
+ * the actual cause of "kex_exchange_identification: write: Broken
+ * pipe" with SSH on loopback). The bounded-wait fallback in
+ * tcp_bpf_splice_sendmsg() must let both writes complete via the
+ * normal TCP path within ~1 ms, and the banners must arrive intact
+ * on the other side when recv() is called next.
+ *
+ * Returns 0 if the scenario ran to the end, -1 on a setup failure.
+ */
+static int run_bidir_write(int cgroup_fd, struct test_tcp_splice *skel)
+{
+	pthread_t client_send_tid, server_send_tid;
+	struct send_arg cs = { .buf = CLIENT_BANNER,
+			       .len = sizeof(CLIENT_BANNER) - 1 };
+	struct send_arg ss = { .buf = SERVER_BANNER,
+			       .len = sizeof(SERVER_BANNER) - 1 };
+	struct recv_arg cr = {}, sr = {};
+	int sfd = -1, cfd = -1, lfd = -1;
+	int err = -1;
+
+	lfd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+	if (!ASSERT_GE(lfd, 0, "start_server"))
+		return -1;
+	cfd = connect_to_fd(lfd, 0);
+	if (!ASSERT_GE(cfd, 0, "connect_to_fd"))
+		goto out;
+	sfd = accept(lfd, NULL, NULL);
+	if (!ASSERT_GE(sfd, 0, "accept"))
+		goto out;
+
+	usleep(50 * 1000); /* let pair complete */
+
+	/* Both sides write before either reads; both sends must return (no deadlock). */
+	cs.fd = cfd;
+	ss.fd = sfd;
+	if (!ASSERT_OK(pthread_create(&client_send_tid, NULL, send_thread, &cs),
+		       "client send thread"))
+		goto out;
+	if (!ASSERT_OK(pthread_create(&server_send_tid, NULL, send_thread, &ss),
+		       "server send thread")) {
+		pthread_join(client_send_tid, NULL); /* it still uses cs and cfd */
+		goto out;
+	}
+
+	pthread_join(client_send_tid, NULL);
+	pthread_join(server_send_tid, NULL);
+	ASSERT_EQ(cs.n, (int)cs.len, "client send length");
+	ASSERT_EQ(ss.n, (int)ss.len, "server send length");
+
+	/* Now read on each side - the bytes the peer wrote should have
+	 * landed via the TCP fallback path. */
+	cr.fd = cfd;
+	cr.n = recv(cr.fd, cr.buf, sizeof(cr.buf) - 1, 0);
+	ASSERT_EQ(cr.n, (int)ss.len, "client recv length");
+	cr.buf[cr.n > 0 ? cr.n : 0] = 0;
+	ASSERT_STREQ(cr.buf, SERVER_BANNER, "client got server banner");
+
+	sr.fd = sfd;
+	sr.n = recv(sr.fd, sr.buf, sizeof(sr.buf) - 1, 0);
+	ASSERT_EQ(sr.n, (int)cs.len, "server recv length");
+	sr.buf[sr.n > 0 ? sr.n : 0] = 0;
+	ASSERT_STREQ(sr.buf, CLIENT_BANNER, "server got client banner");
+
+	err = 0;
+out:
+	if (cfd >= 0)
+		close(cfd);
+	if (sfd >= 0)
+		close(sfd);
+	close(lfd); /* always open here: a start_server() failure returns early */
+	return err;
+}
+
+void test_tcp_splice(void) /* entry point: runs the basic and bidir_write subtests */
+{
+	struct test_tcp_splice *skel;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup("/tcp_splice");
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	skel = test_tcp_splice__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_load"))
+		goto close_cgroup;
+
+	skel->links.sockops_splice = /* skeleton-owned link: __destroy() detaches it */
+		bpf_program__attach_cgroup(skel->progs.sockops_splice, cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links.sockops_splice, "attach sockops"))
+		goto destroy_skel;
+
+	if (test__start_subtest("basic"))
+		run_basic(cgroup_fd, skel);
+	if (test__start_subtest("bidir_write"))
+		run_bidir_write(cgroup_fd, skel);
+
+destroy_skel:
+	test_tcp_splice__destroy(skel);
+close_cgroup:
+	close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_tcp_splice.c b/tools/testing/selftests/bpf/progs/test_tcp_splice.c
new file mode 100644
index 000000000000..09c7f0f9e311
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tcp_splice.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Sock_ops BPF program that pairs locally-connected TCP sockets via the
+ * bpf_sock_splice_pair kfunc. Each side of an established loopback
+ * connection inserts itself into a sockhash keyed by its 4-tuple and
+ * looks up the peer using the swapped tuple. Whichever side finds the
+ * peer attempts to splice; the race loser sees -EEXIST.
+ */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+struct flow_key { /* sockhash key: IPv4 4-tuple as built by mk_key() */
+	__u32	saddr; /* copied verbatim from local_ip4 or remote_ip4 */
+	__u32	daddr;
+	__u16	sport; /* ports are stored in host byte order */
+	__u16	dport;
+};
+
+struct { /* sockhash in which each endpoint registers itself and looks up its peer */
+	__uint(type, BPF_MAP_TYPE_SOCKHASH);
+	__uint(max_entries, 16); /* at most 16 sockets registered at a time */
+	__type(key, struct flow_key);
+	__type(value, __u64);
+} rendezvous SEC(".maps");
+
+int bpf_sock_splice_pair(struct sock *peer, struct bpf_sock_ops_kern *skops) __ksym; /* kfunc */
+void *bpf_cast_to_kern_ctx(void *obj) __ksym; /* kfunc: used to turn skops into the kern ctx */
+
+__u32 pair_ok; /* bpf_sock_splice_pair() calls that returned 0 */
+__u32 pair_other_err; /* calls that failed with anything other than -EEXIST */
+
+/* IPv4 only: the verifier rejects a memcpy from the sock_ops ctx because it
+ * lowers to "ctx + reg" pointer arithmetic. IPv6 would need field-by-field
+ * reads of local_ip6[i] / remote_ip6[i] at constant indices.
+ */
+static __always_inline void mk_key(struct bpf_sock_ops *s,
+				   struct flow_key *k, int swap)
+{
+	/* Ports end up in host byte order: skops->local_port already is, and
+	 * skops->remote_port carries the network-order 16-bit port in the
+	 * upper half of a u32 (see sock_ops_convert_ctx_access), so a plain
+	 * bpf_ntohl yields the host-order port - no further shift.
+	 */
+	__u32 local_ip = s->local_ip4, remote_ip = s->remote_ip4;
+	__u16 local_port = (__u16)s->local_port;
+	__u16 remote_port = bpf_ntohl(s->remote_port);
+
+	if (swap) {
+		k->saddr = remote_ip;
+		k->daddr = local_ip;
+		k->sport = remote_port;
+		k->dport = local_port;
+		return;
+	}
+	k->saddr = local_ip;
+	k->daddr = remote_ip;
+	k->sport = local_port;
+	k->dport = remote_port;
+}
+
+SEC("sockops") /* on ESTABLISHED: register this socket, then try to splice with its peer */
+int sockops_splice(struct bpf_sock_ops *skops)
+{
+	struct flow_key self_key, peer_key;
+	struct bpf_sock *peer;
+	int ret;
+
+	if (skops->op != BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB &&
+	    skops->op != BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
+		return 0; /* only the two ESTABLISHED callbacks are handled */
+	if (skops->family != 2 /* AF_INET */)
+		return 0; /* mk_key() builds IPv4 keys only */
+
+	mk_key(skops, &self_key, 0); /* this socket's own 4-tuple */
+	mk_key(skops, &peer_key, 1); /* the same tuple with local and remote swapped */
+
+	/* BPF_ANY: a reused 4-tuple after close (e.g. fast reconnect) must overwrite
+	 * the stale entry. NOTE(review): the result is ignored - confirm intended.
+	 */
+	bpf_sock_hash_update(skops, &rendezvous, &self_key, BPF_ANY);
+
+	peer = bpf_map_lookup_elem(&rendezvous, &peer_key);
+	if (!peer)
+		return 0; /* presumably the peer's callback has not run yet */
+
+	/* The sockhash bpf_map_lookup_elem above is an acquire, so @peer
+	 * carries a reference. A sock_ops program cannot call
+	 * bpf_sk_release, so the reference is handed to bpf_sock_splice_pair
+	 * which is KF_RELEASE and consumes it - no explicit release here,
+	 * and none is possible from this program type.
+	 */
+	ret = bpf_sock_splice_pair((struct sock *)peer,
+				   bpf_cast_to_kern_ctx(skops));
+	if (ret == 0)
+		__sync_fetch_and_add(&pair_ok, 1);
+	else if (ret != -17 /* -EEXIST: race loser, expected */)
+		__sync_fetch_and_add(&pair_other_err, 1);
+	return 0; /* every path of this program returns 0 */
+}
+
+char _license[] SEC("license") = "GPL"; /* emitted into the "license" ELF section */
-- 
2.43.0


  parent reply	other threads:[~2026-06-12  1:15 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-12  1:14 [RFC PATCH bpf-next 0/5] tcp: opportunistic loopback splice for BPF-paired sockets Cong Wang
2026-06-12  1:14 ` [RFC PATCH bpf-next 1/5] tcp_bpf: add bpf_sock_splice_pair kfunc for opportunistic loopback splice Cong Wang
2026-06-12  2:10   ` bot+bpf-ci
2026-06-12  1:14 ` [RFC PATCH bpf-next 2/5] tcp_bpf: busy-poll the splice ring before parking the receiver Cong Wang
2026-06-12  1:14 ` Cong Wang [this message]
2026-06-12  1:14 ` [RFC PATCH bpf-next 4/5] bpf: allow SO_BUSY_POLL in bpf_setsockopt() Cong Wang
2026-06-12  1:14 ` [RFC PATCH bpf-next 5/5] selftests/bpf: set SO_BUSY_POLL from the tcp_splice sockops prog Cong Wang
2026-06-12 16:01 ` [RFC PATCH bpf-next 0/5] tcp: opportunistic loopback splice for BPF-paired sockets Alexei Starovoitov
2026-06-12 18:12   ` Cong Wang
2026-06-12 18:34     ` Alexei Starovoitov
2026-06-12 20:17       ` Cong Wang
2026-06-12 22:10 ` [syzbot ci] " syzbot ci

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260612011452.134466-4-xiyou.wangcong@gmail.com \
    --to=xiyou.wangcong@gmail.com \
    --cc=bpf@vger.kernel.org \
    --cc=cwang@multikernel.io \
    --cc=hemanthmalla@gmail.com \
    --cc=jakub@cloudflare.com \
    --cc=jiayuan.chen@linux.dev \
    --cc=john.fastabend@gmail.com \
    --cc=netdev@vger.kernel.org \
    --cc=zijianzhang@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox