Netdev List
 help / color / mirror / Atom feed
From: Kuniyuki Iwashima <kuniyu@google.com>
To: Alexei Starovoitov <ast@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	 Andrii Nakryiko <andrii@kernel.org>,
	Martin KaFai Lau <martin.lau@linux.dev>,
	 Eduard Zingerman <eddyz87@gmail.com>,
	Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Yonghong Song <yonghong.song@linux.dev>,
	John Fastabend <john.fastabend@gmail.com>,
	 Stanislav Fomichev <sdf@fomichev.me>,
	Eric Dumazet <edumazet@google.com>,
	 Neal Cardwell <ncardwell@google.com>,
	Willem de Bruijn <willemb@google.com>,
	 Tenzin Ukyab <ukyab@berkeley.edu>,
	Kuniyuki Iwashima <kuniyu@google.com>,
	 Kuniyuki Iwashima <kuni1840@gmail.com>,
	bpf@vger.kernel.org, netdev@vger.kernel.org
Subject: [PATCH v3 bpf-next 06/11] bpf: tcp: Make BPF_SOCK_OPS_RCVQ_CB and SOCKMAP mutually exclusive.
Date: Sat, 23 May 2026 08:29:35 +0000	[thread overview]
Message-ID: <20260523083001.2911931-7-kuniyu@google.com> (raw)
In-Reply-To: <20260523083001.2911931-1-kuniyu@google.com>

Both BPF_SOCK_OPS_RCVQ_CB and SOCKMAP can intercept and handle
socket receive queues, leading to overlapping use cases.

While BPF_SOCK_OPS_RCVQ_CB focuses on optimizing single-socket
performance by reducing EPOLLIN wakeups and fully preserves TCP
zerocopy support, SOCKMAP is designed to facilitate multi-socket
routing at the cost of higher overhead and no zerocopy support.

Enabling both features on the same socket makes no sense and
results in unexpected interference between them.

For instance, SOCKMAP calls __tcp_cleanup_rbuf(), where we will
add a BPF_SOCK_OPS_RCVQ_CB hook, and bpf_sock_ops_tcp_set_rcvlowat()
calls sk->sk_data_ready(), which would trigger SOCKMAP.

Let's make BPF_SOCK_OPS_RCVQ_CB and SOCKMAP mutually exclusive.

Note that setting BPF_SOCK_OPS_RCVQ_CB_FLAG requires holding
write_lock_bh(&sk->sk_callback_lock) to synchronise with
tcp_bpf_update_proto() while checking if sk->sk_prot is one of
tcp_bpf_prots[][], because sock_map_update_elem() only holds
bh_lock_sock() without checking sock_owned_by_user().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
v3: Check sk->sk_prot and update tp->bpf_sock_ops_cb_flags
    under sk->sk_callback_lock, and only when not flagged yet.
---
 include/net/tcp.h  |  1 +
 net/core/filter.c  | 35 +++++++++++++++++++++++++++++++----
 net/ipv4/tcp_bpf.c | 12 ++++++++++++
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c6a6853909c4..bc95d8e7b62e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2853,6 +2853,7 @@ struct sk_msg;
 struct sk_psock;
 
 #ifdef CONFIG_BPF_SYSCALL
+bool tcp_in_sockmap(const struct sock *sk);
 int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 #ifdef CONFIG_BPF_STREAM_PARSER
diff --git a/net/core/filter.c b/net/core/filter.c
index 3608036632a8..1fb63b264b18 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5382,12 +5382,34 @@ static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
 	return 0;
 }
 
+static int __bpf_sock_ops_cb_flags_set(struct sock *sk, int val)
+{
+	if (!(val & BPF_SOCK_OPS_RCVQ_CB_FLAG) ||
+	    tcp_sk(sk)->bpf_sock_ops_cb_flags & BPF_SOCK_OPS_RCVQ_CB_FLAG) {
+		tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+		return 0;
+	}
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	if (unlikely(tcp_in_sockmap(sk))) {
+		write_unlock_bh(&sk->sk_callback_lock);
+		return -EBUSY;
+	}
+
+	tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	return 0;
+}
+
 static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 				  char *optval, int optlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned long timeout;
-	int val;
+	int val, err;
 
 	if (optlen != sizeof(int))
 		return -EINVAL;
@@ -5424,7 +5446,9 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 	case TCP_BPF_SOCK_OPS_CB_FLAGS:
 		if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
 			return -EINVAL;
-		tp->bpf_sock_ops_cb_flags = val;
+		err = __bpf_sock_ops_cb_flags_set(sk, val);
+		if (err)
+			return err;
 		break;
 	default:
 		return -EINVAL;
@@ -5999,8 +6023,9 @@ static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
 BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
 	   int, argval)
 {
-	struct sock *sk = bpf_sock->sk;
 	int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+	struct sock *sk = bpf_sock->sk;
+	int err;
 
 	if (!is_locked_tcp_sock_ops(bpf_sock) &&
 	    bpf_sock->op != BPF_SOCK_OPS_RCVQ_CB)
@@ -6009,7 +6034,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
 	if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
 		return -EINVAL;
 
-	tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+	err = __bpf_sock_ops_cb_flags_set(sk, val);
+	if (err)
+		return err;
 
 	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
 }
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index cc0bd73f36b6..7e7966b095f9 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -705,6 +705,16 @@ int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
 }
 #endif /* CONFIG_BPF_STREAM_PARSER */
 
+bool tcp_in_sockmap(const struct sock *sk)
+{
+	const struct proto *prot = sk->sk_prot;
+
+	lockdep_assert_held(&sk->sk_callback_lock);
+
+	return &tcp_bpf_prots[0][0] <= prot &&
+		prot <= &tcp_bpf_prots[TCP_BPF_NUM_PROTS - 1][TCP_BPF_NUM_CFGS - 1];
+}
+
 int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 {
 	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
@@ -729,6 +739,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 			sock_replace_proto(sk, psock->sk_proto);
 		}
 		return 0;
+	} else if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RCVQ_CB_FLAG)) {
+		return -EBUSY;
 	}
 
 	if (sk->sk_family == AF_INET6) {
-- 
2.54.0.746.g67dd491aae-goog


  parent reply	other threads:[~2026-05-23  8:30 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-23  8:29 [PATCH v3 bpf-next 00/11] bpf: Add SOCK_OPS hooks for TCP AutoLOWAT Kuniyuki Iwashima
2026-05-23  8:29 ` [PATCH v3 bpf-next 01/11] selftest: bpf: Use BPF_SOCK_OPS_ALL_CB_FLAGS + 1 for bad_cb_test_rv Kuniyuki Iwashima
2026-05-23  9:06   ` bot+bpf-ci
2026-05-23  8:29 ` [PATCH v3 bpf-next 02/11] bpf: tcp: Introduce BPF_SOCK_OPS_RCVQ_CB Kuniyuki Iwashima
2026-05-23  8:29 ` [PATCH v3 bpf-next 03/11] bpf: tcp: Support bpf_skb_load_bytes() for BPF_SOCK_OPS_RCVQ_CB Kuniyuki Iwashima
2026-05-26 20:34   ` Martin KaFai Lau
2026-05-26 21:21     ` Kuniyuki Iwashima
2026-05-26 22:18       ` Martin KaFai Lau
2026-05-23  8:29 ` [PATCH v3 bpf-next 04/11] tcp: Split out __tcp_set_rcvlowat() Kuniyuki Iwashima
2026-05-23  8:29 ` [PATCH v3 bpf-next 05/11] bpf: tcp: Add kfunc to adjust sk->sk_rcvlowat Kuniyuki Iwashima
2026-05-23  9:06   ` bot+bpf-ci
2026-05-23  8:29 ` Kuniyuki Iwashima [this message]
2026-05-23  9:20   ` [PATCH v3 bpf-next 06/11] bpf: tcp: Make BPF_SOCK_OPS_RCVQ_CB and SOCKMAP mutually exclusive bot+bpf-ci
2026-05-24  3:37     ` Kuniyuki Iwashima
2026-05-23  8:29 ` [PATCH v3 bpf-next 07/11] bpf: mptcp: Don't support BPF_SOCK_OPS_RCVQ_CB Kuniyuki Iwashima
2026-05-23  8:29 ` [PATCH v3 bpf-next 08/11] bpf: tcp: Reject BPF_SOCK_OPS_RCVQ_CB if receive queue is not empty Kuniyuki Iwashima
2026-05-23  9:20   ` bot+bpf-ci
2026-05-23  8:29 ` [PATCH v3 bpf-next 09/11] bpf: tcp: Factorise bpf_skops_established() Kuniyuki Iwashima
2026-05-23  8:29 ` [PATCH v3 bpf-next 10/11] bpf: tcp: Add SOCK_OPS rcvlowat hook Kuniyuki Iwashima
2026-05-26 20:47   ` Martin KaFai Lau
2026-05-26 21:07     ` Kuniyuki Iwashima
2026-05-26 21:37       ` Amery Hung
2026-05-26 21:51         ` Kuniyuki Iwashima
2026-05-23  8:29 ` [PATCH v3 bpf-next 11/11] selftest: bpf: Add test for BPF_SOCK_OPS_RCVQ_CB Kuniyuki Iwashima
2026-05-23  9:20   ` bot+bpf-ci
2026-05-24  4:03     ` Kuniyuki Iwashima
2026-05-26 21:01   ` Martin KaFai Lau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260523083001.2911931-7-kuniyu@google.com \
    --to=kuniyu@google.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=eddyz87@gmail.com \
    --cc=edumazet@google.com \
    --cc=john.fastabend@gmail.com \
    --cc=kuni1840@gmail.com \
    --cc=martin.lau@linux.dev \
    --cc=memxor@gmail.com \
    --cc=ncardwell@google.com \
    --cc=netdev@vger.kernel.org \
    --cc=sdf@fomichev.me \
    --cc=ukyab@berkeley.edu \
    --cc=willemb@google.com \
    --cc=yonghong.song@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox