Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf 2/2] selftests/bpf: Add test for UDP sock leak on sockmap lookup-bind-release
From: Michal Luczaj @ 2026-06-23 18:03 UTC (permalink / raw)
  To: John Fastabend, Jakub Sitnicki, Jiayuan Chen, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Alexei Starovoitov, Cong Wang, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau,
	Song Liu, Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan
  Cc: netdev, bpf, linux-kernel, linux-kselftest, Michal Luczaj
In-Reply-To: <20260623-sockmap-lookup-udp-leak-v1-0-05804f9308e4@rbox.co>

Set up and join a cgroup, then attach a cgroup/connect4 program that runs

   sk = bpf_map_lookup_elem(sockmap, 0)
   bpf_bind(ctx, sa, sizeof(sa))
   bpf_sk_release(sk)

Unpatched kernel leaks the socket.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
 .../selftests/bpf/prog_tests/sockmap_basic.c       | 50 ++++++++++++++++++++++
 .../bpf/progs/test_sockmap_lookup_bind_release.c   | 37 ++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index cb3229711f93..11972ffdb16e 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -7,6 +7,7 @@
 
 #include "test_progs.h"
 #include "test_skmsg_load_helpers.skel.h"
+#include "test_sockmap_lookup_bind_release.skel.h"
 #include "test_sockmap_update.skel.h"
 #include "test_sockmap_invalid_update.skel.h"
 #include "test_sockmap_skb_verdict_attach.skel.h"
@@ -17,6 +18,7 @@
 #include "test_sockmap_msg_pop_data.skel.h"
 #include "bpf_iter_sockmap.skel.h"
 
+#include "cgroup_helpers.h"
 #include "sockmap_helpers.h"
 
 #define TCP_REPAIR		19	/* TCP sock is under repair right now */
@@ -1373,6 +1375,52 @@ static void test_sockmap_multi_channels(int sotype)
 	test_sockmap_pass_prog__destroy(skel);
 }
 
+#define LOOKUP_BIND_RELEASE_CG	"/sockmap_lookup-bind-release"
+#define LOOKUP_BIND_RELEASE_REP	64
+
+static void test_sockmap_lookup_bind_release(void)
+{
+	struct test_sockmap_lookup_bind_release *skel;
+	struct sockaddr_in sa;
+	int cg, i;
+
+	cg = cgroup_setup_and_join(LOOKUP_BIND_RELEASE_CG);
+	if (!ASSERT_OK_FD(cg, "cgroup_setup_and_join"))
+		return;
+
+	skel = test_sockmap_lookup_bind_release__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto cleanup;
+
+	skel->links.connect = bpf_program__attach_cgroup(skel->progs.connect, cg);
+	if (!ASSERT_OK_PTR(skel->links.connect, "attach_cgroup"))
+		goto destroy;
+
+	sa.sin_family = AF_INET;
+	sa.sin_port = bpf_htons(1234);
+	sa.sin_addr.s_addr = bpf_htonl(INADDR_LOOPBACK);
+
+	for (i = 0; i < LOOKUP_BIND_RELEASE_REP; ++i) {
+		__close_fd int sk;
+
+		sk = xsocket(AF_INET, SOCK_DGRAM, 0);
+		if (sk < 0)
+			break;
+
+		if (xbpf_map_update_elem(bpf_map__fd(skel->maps.sockmap), &u32(0),
+					 &sk, BPF_ANY))
+			break;
+
+		if (xconnect(sk, (struct sockaddr *)&sa, sizeof(sa)))
+			break;
+	}
+
+destroy:
+	test_sockmap_lookup_bind_release__destroy(skel);
+cleanup:
+	cleanup_cgroup_environment();
+}
+
 void test_sockmap_basic(void)
 {
 	if (test__start_subtest("sockmap create_update_free"))
@@ -1451,4 +1499,6 @@ void test_sockmap_basic(void)
 		test_sockmap_multi_channels(SOCK_STREAM);
 	if (test__start_subtest("sockmap udp multi channels"))
 		test_sockmap_multi_channels(SOCK_DGRAM);
+	if (test__start_subtest("sockmap lookup-bind-release"))
+		test_sockmap_lookup_bind_release();
 }
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_lookup_bind_release.c b/tools/testing/selftests/bpf/progs/test_sockmap_lookup_bind_release.c
new file mode 100644
index 000000000000..cc77b193893b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_lookup_bind_release.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_SOCKMAP);
+	__uint(max_entries, 1);
+	__type(key, int);
+	__type(value, int);
+} sockmap SEC(".maps");
+
+SEC("cgroup/connect4")
+int connect(struct bpf_sock_addr *ctx)
+{
+	struct bpf_sock *sk;
+	int ret = SK_DROP;
+
+	sk = bpf_map_lookup_elem(&sockmap, &(int){0});
+	if (sk) {
+		if (sk == ctx->sk) {
+			struct sockaddr_in sa = {
+				.sin_family = ctx->user_family,
+				.sin_port = ctx->user_port,
+				.sin_addr.s_addr = ctx->user_ip4
+			};
+
+			ret = !bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));
+		}
+
+		bpf_sk_release(sk);
+	}
+
+	return ret;
+}
+
+char _license[] SEC("license") = "GPL";

-- 
2.54.0


^ permalink raw reply related

* [PATCH bpf 0/2] bpf, sockmap: Fix sockmap leaking UDP socks
From: Michal Luczaj @ 2026-06-23 18:03 UTC (permalink / raw)
  To: John Fastabend, Jakub Sitnicki, Jiayuan Chen, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Alexei Starovoitov, Cong Wang, Daniel Borkmann, Andrii Nakryiko,
	Eduard Zingerman, Kumar Kartikeya Dwivedi, Martin KaFai Lau,
	Song Liu, Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan
  Cc: netdev, bpf, linux-kernel, linux-kselftest, Michal Luczaj

Fix for UDP sockets refcount asymmetry in sockmap lookup/release.
Accompanied by a selftest.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
---
Michal Luczaj (2):
      bpf, sockmap: Don't leak UDP socks on lookup-bind-release
      selftests/bpf: Add test for UDP sock leak on sockmap lookup-bind-release

 net/ipv4/udp_bpf.c                                 |  3 ++
 .../selftests/bpf/prog_tests/sockmap_basic.c       | 50 ++++++++++++++++++++++
 .../bpf/progs/test_sockmap_lookup_bind_release.c   | 37 ++++++++++++++++
 3 files changed, 90 insertions(+)
---
base-commit: 12091470c6b4c1c14b2de12dcbae2ada6cb6d20b
change-id: 20260617-sockmap-lookup-udp-leak-bc4e5c5481d7

Best regards,
--  
Michal Luczaj <mhal@rbox.co>


^ permalink raw reply

* Re: [PATCH bpf-next v4 1/3] bpf: Add BPF_FIB_LOOKUP_VLAN flag to bpf_fib_lookup() helper
From: Avinash Duduskar @ 2026-06-23 18:28 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, ast, daniel, andrii
  Cc: eddyz87, memxor, martin.lau, song, yonghong.song, jolsa, emil,
	john.fastabend, sdf, davem, edumazet, kuba, pabeni, horms, shuah,
	hawk, yatsenko, leon.hwang, kpsingh, a.s.protopopov, ameryhung,
	rongtao, eyal.birger, bpf, netdev, linux-kernel, linux-kselftest,
	dsahern
In-Reply-To: <877bnpeaeq.fsf@toke.dk>

Toke Høiland-Jørgensen <toke@redhat.com> writes:

> I think it's better to just move the assignment of params->ifindex
> entirely into bpf_fib_set_fwd_params(), instead of this restore dance.
> That way this can be simplified to:
>
> 	err = bpf_fib_set_fwd_params(dev, params, flags, mtu);
> 	if (!err && fwd_dev)
> 		*fwd_dev = dev;
> 	return err;

The caller-side restore is ungainly, agreed, but the assignment can't move
all the way into the helper. The early params->ifindex = dev->ifindex
sits above the neighbour lookup on purpose: that is d1c362e1dd68a
("bpf: Always return target ifindex in bpf_fib_lookup"), which took it
out of bpf_fib_set_fwd_params() and put it there so a program still
gets the target ifindex on the BPF_FIB_LKUP_RET_NO_NEIGH path and can
bpf_redirect_neigh() on it. bpf_fib_set_fwd_params() is called only at
the set_fwd_params label, below the NO_NEIGH return (and below the IPv6
NO_SRC_ADDR return), so an assignment living in the helper never runs
on those paths and params->ifindex falls back to the input. That would
change the reported ifindex for plain bpf_fib_lookup() callers hitting
NO_NEIGH, not only the VLAN ones.

I can still get the caller down to your form by keeping the early write
and moving just the VLAN_FAILURE rewind into the helper, with one extra
parameter, the input ifindex saved before the egress write:

	err = bpf_fib_set_fwd_params(dev, params, flags, mtu, in_ifindex);
	if (!err && fwd_dev)
		*fwd_dev = dev;
	return err;

and the helper owning the rewind in the unreducible branch:

	} else {
		params->ifindex = in_ifindex;
		return BPF_FIB_LKUP_RET_VLAN_FAILURE;
	}

So the restore leaves the caller; the early egress write stays because
NO_NEIGH and NO_SRC_ADDR depend on it.

3/3 adds a NO_NEIGH arm that pins the egress ifindex (input != egress):
with the assignment moved into the helper, that case reports the input
ifindex instead, while the return code stays NO_NEIGH, only the ifindex
flips. It passes with the early write kept.

> If you move the ifdef into the if statement, the if statement can have
> an else-branch that assigns params->ifindex, so you don't need the
> restore dance (see below).

Same constraint: an else-branch inside bpf_fib_set_fwd_params() only
runs when the helper runs, which is never on the NO_NEIGH/NO_SRC_ADDR
returns, so it cannot be the sole writer of the egress ifindex.

Does the in_ifindex version look right to you? The alternative is to
route the error returns through the label so the assignment can live
fully in the helper; threading the return codes back through it works,
but it is its own kind of dance and reads worse to me.

Thanks,
Avinash

^ permalink raw reply

* Re: [PATCH net] nfc: nci: fix out-of-bounds write in nci_target_auto_activated()
From: Simon Horman @ 2026-06-23 17:21 UTC (permalink / raw)
  To: sam
  Cc: Simon Horman, david, davem, edumazet, kuba, pabeni, oe-linux-nfc,
	netdev, linux-kernel, stable
In-Reply-To: <20260622145243.3167276-1-sam@bynar.io>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
nfc: nci: fix out-of-bounds write in nci_target_auto_activated()

This commit adds a bounds check to nci_target_auto_activated() to prevent
an out-of-bounds array write. If ndev->n_targets equals
NCI_MAX_DISCOVERED_TARGETS, it prints a debug message and returns early.

> diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
> index c96512bb86531..566ca839fa488 100644
> --- a/net/nfc/nci/ntf.c
> +++ b/net/nfc/nci/ntf.c
> @@ -603,6 +603,12 @@ static void nci_target_auto_activated(struct nci_dev *ndev,
>  	struct nfc_target *target;
>  	int rc;
>  
> +	/* This is a new target, check if we've enough room */
> +	if (ndev->n_targets == NCI_MAX_DISCOVERED_TARGETS) {
> +		pr_debug("not enough room, ignoring new target...\n");
> +		return;

[Severity: High]
Does this early return cause the state machine to stall?

Looking at nci_rf_intf_activated_ntf_packet(), the state is transitioned
to NCI_POLL_ACTIVE right before this function is called:

net/nfc/nci/ntf.c:nci_rf_intf_activated_ntf_packet() {
    ...
    atomic_set(&ndev->state, NCI_POLL_ACTIVE);
    if (err == NCI_STATUS_OK)
        nci_target_auto_activated(ndev, &ntf);
    ...
}

Because of the early return here, nfc_targets_found() is skipped. Does this
leave the state wedged in NCI_POLL_ACTIVE, permanently blocking future 
attempts to stop polling since nci_stop_poll() requires the state to be 
NCI_DISCOVERY?

> +	}
> +
>  	target = &ndev->targets[ndev->n_targets];

[ ... ]

[Severity: Critical]
This isn't a bug introduced by this patch, but can the ntf union copy
uninitialized stack memory into persistent device structures?

In net/nfc/nci/ntf.c:nci_rf_discover_ntf_packet(), the local ntf struct
is declared on the stack:

    struct nci_rf_discover_ntf ntf;

If the incoming packet has ntf.rf_tech_specific_params_len == 0, the
extraction routines are bypassed, leaving the rf_tech_specific_params union
uninitialized. The struct is then passed to nci_add_new_target():

    if (add_target == true)
        nci_add_new_target(ndev, &ntf);

This eventually calls nci_add_new_protocol(), which blindly reads from this
union:

    nfca_poll = (struct rf_tech_specific_params_nfca_poll *)params;
    target->sens_res = nfca_poll->sens_res;
    target->sel_res = nfca_poll->sel_res;
    target->nfcid1_len = nfca_poll->nfcid1_len;

Could this uninitialized stack garbage be copied into ndev->targets, and
later broadcast to userspace via Netlink in nfc_targets_found()?

[Severity: Critical]
This isn't a bug introduced by this patch, but does the general bytes length
calculation underflow?

In net/nfc/nci/ntf.c:nci_store_general_bytes_nfc_dep():

    ndev->remote_gb_len = min_t(__u8, 
        (ntf->activation_params.listen_nfc_dep.atr_req_len - 
         NFC_ATR_REQ_GT_OFFSET), NFC_ATR_REQ_GB_MAXSIZE);

    memcpy(ndev->remote_gb, 
        (ntf->activation_params.listen_nfc_dep.atr_req + 
         NFC_ATR_REQ_GT_OFFSET), ndev->remote_gb_len);

The length is computed by subtracting NFC_ATR_REQ_GT_OFFSET (14) from
atr_req_len. If atr_req_len is less than 14, this subtraction will
underflow.

Because of the min_t cast, this large underflowed value is bounded to
NFC_ATR_REQ_GB_MAXSIZE (48). The memcpy() then copies 48 bytes starting
from atr_req + 14 into ndev->remote_gb.

Since the source buffer is part of the ntf union allocated on the stack
in nci_rf_intf_activated_ntf_packet(), could this leak up to 48 bytes of
uninitialized kernel stack memory to userspace?

[Severity: High]
This isn't a bug introduced by this patch, but does the lack of skb->len
validation in NCI response handlers allow out-of-bounds reads?

In nci_core_init_rsp_packet_v1() in net/nfc/nci/rsp.c, fields are read
and arrays are copied from the payload:

    ndev->num_supported_rf_interfaces = 
        rsp_1->num_supported_rf_interfaces;

    memcpy(ndev->supported_rf_interfaces, rsp_1->supported_rf_interfaces, 
           ndev->num_supported_rf_interfaces);

Similarly in nci_core_init_rsp_packet_v2():

    while (rf_interface_idx < ndev->num_supported_rf_interfaces) {
        ndev->supported_rf_interfaces[rf_interface_idx++] = 
            *supported_rf_interface++;
    }

Since the NCI control header is stripped via skb_pull(), leaving the
payload length in skb->len, what happens if an NFCC sends a short response
packet (e.g. plen = 1)? Could this cause the handlers to read adjacent
kernel slab memory?

[Severity: High]
This isn't a bug introduced by this patch, but is there a missing boundary
validation when extracting activation parameters?

In net/nfc/nci/ntf.c:nci_extract_activation_params_iso_dep():

    nfca_poll->rats_res_len = min_t(__u8, *data++, NFC_ATS_MAXSIZE);
    if (nfca_poll->rats_res_len > 0) {
        memcpy(nfca_poll->rats_res, data, nfca_poll->rats_res_len);
    }

And in nci_extract_activation_params_nfc_dep():

    poll->atr_res_len = min_t(__u8, *data++, NFC_ATR_RES_MAXSIZE - 2);
    if (poll->atr_res_len > 0)
        memcpy(poll->atr_res, data, poll->atr_res_len);

These inner lengths are bounded by a maximum size like NFC_ATS_MAXSIZE, but
they are not validated against the remaining bytes in the SKB (skb->len).

If a malformed packet provides a large inner length but a short payload,
could this memcpy() read out of bounds into the device structure,
potentially leaking adjacent slab memory to userspace later?

[Severity: High]
This isn't a bug introduced by this patch, but is there a potential
priority inversion or timeout deadlock involving the device lock?

Functions like nfc_stop_poll() in net/nfc/core.c acquire the device lock:

    device_lock(&dev->dev);
    ...
    if (dev->ops->stop_poll)
        dev->ops->stop_poll(dev);

This eventually calls nci_request(), which synchronously waits for a
completion signaled by the nci_rx_work thread. However, if an NTF packet
is received first, nci_rx_work processes it and invokes
nfc_targets_found(), which also attempts to acquire the device lock:

    device_lock(&dev->dev);

Since the calling thread already holds the device lock, nci_rx_work blocks
indefinitely. Because the RX worker is blocked, it cannot process the
pending RSP, causing nci_request() to time out and fail. Could this
deadlock the RX thread?

^ permalink raw reply

* Re: [PATCH v12 11/12] x86/vmscape: Resolve conflict between attack-vectors and vmscape=force
From: bot+bpf-ci @ 2026-06-23 18:13 UTC (permalink / raw)
  To: pawan.kumar.gupta, x86, jon, nik.borisov, hpa, jpoimboe,
	david.kaplan, seanjc, bp, dave.hansen, peterz, ast, daniel,
	andrii, kpsingh, jolsa, davem, david.laight.linux, luto, tglx,
	mingo, dsahern, martin.lau, eddyz87, song, yonghong.song,
	john.fastabend, sdf, haoluo, pbonzini, corbet, jbaron, aliceryhl,
	rostedt, ardb, skhan
  Cc: linux-kernel, kvm, asit.k.mallick, tao1.zhang, bpf, netdev,
	linux-doc, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260622-vmscape-bhb-v12-11-76cbda0ae3e5@linux.intel.com>

[-- Attachment #1: Type: text/plain, Size: 1456 bytes --]

> x86/vmscape: Resolve conflict between attack-vectors and vmscape=force
>
> vmscape=force option currently defaults to AUTO mitigation. This lets
> attack-vector controls override the vmscape mitigation, preventing the
> user from being able to force VMSCAPE mitigation.
>
> When vmscape mitigation is forced, allow it to be deployed irrespective of
> attack vectors. Introduce VMSCAPE_MITIGATION_ON that wins over
> attack-vector controls.
>
> Tested-by: Jon Kohler <jon@nutanix.com>
> Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
> Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>

This appears to fix a behavioral bug where vmscape=force mapped to
VMSCAPE_MITIGATION_AUTO, which is gated by should_mitigate_vuln(), so
attack-vector controls could override a user-forced mitigation.

Should this carry a Fixes: tag?

The force -> VMSCAPE_MITIGATION_AUTO mapping that allowed the override
predates this series. The earlier refactor 68d94c61c09a ("x86/vmscape:
Move mitigation selection to a switch()") only moved the logic into a
switch() and did not introduce the conflict, so it does not look like the
right Fixes: target. A precise commit to point at could not be confirmed
from this tree.

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/28046221590

^ permalink raw reply

* Re: [PATCH v12 02/12] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: bot+bpf-ci @ 2026-06-23 18:13 UTC (permalink / raw)
  To: pawan.kumar.gupta, x86, jon, nik.borisov, hpa, jpoimboe,
	david.kaplan, seanjc, bp, dave.hansen, peterz, ast, daniel,
	andrii, kpsingh, jolsa, davem, david.laight.linux, luto, tglx,
	mingo, dsahern, martin.lau, eddyz87, song, yonghong.song,
	john.fastabend, sdf, haoluo, pbonzini, corbet, jbaron, aliceryhl,
	rostedt, ardb, skhan
  Cc: linux-kernel, kvm, asit.k.mallick, tao1.zhang, bpf, netdev,
	linux-doc, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260622-vmscape-bhb-v12-2-76cbda0ae3e5@linux.intel.com>

[-- Attachment #1: Type: text/plain, Size: 3513 bytes --]

> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
> index 83f51cab0b1e..2cb4a96247d8 100644
> --- a/arch/x86/kernel/cpu/bugs.c
> +++ b/arch/x86/kernel/cpu/bugs.c

[ ... ]

> @@ -3242,6 +3246,15 @@ void __init cpu_select_mitigations(void)
>  		x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
>  	}
>
> +	/*
> +	 * Switch to long BHB clear sequence on newer CPUs (with BHI_CTRL
> +	 * support), see Intel's BHI guidance.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL)) {
> +		bhb_seq_outer_loop = 12;
> +		bhb_seq_inner_loop = 7;
> +	}

Should this also consider the case of a guest running under a hypervisor
that does not expose BHI_CTRL?

When BHI_CTRL is absent the loop counts keep their short defaults (5/5)
and the guest declares itself BHI-mitigated.  In a heterogeneous
migration pool a guest may present as Ice Lake (no BHI_CTRL) while
actually running on Alder Lake or newer hardware, where the short
sequence is not sufficient.

A default-safe approach would be to also select the long sequence when
running under a hypervisor without BHI_CTRL, for example:

	if (cpu_feature_enabled(X86_FEATURE_BHI_CTRL) ||
	    cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) {
		bhb_seq_outer_loop = 12;
		bhb_seq_inner_loop = 7;
	}

There is precedent for assuming the worst under a hypervisor, such as
vulnerable_to_its(), cpu_set_bug_bits() and check_null_seg_clears_base().

This came up during review and the resolution was to let the series move
forward and revisit the hypervisor case afterwards.  Is that still the
intent, or should the hypervisor fallback land with this patch?


For the missing Fixes: tag, quoting the full commit message:

commit a2a0812335ee19ae1df81f88e736da5210748c2b
Author: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>

>     x86/bhi: Make clear_bhb_loop() effective on newer CPUs
>
>     As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
>     the Branch History Buffer (BHB). On Alder Lake and newer parts this
>     sequence is not sufficient because it doesn't clear enough entries. This
>     was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
>     in the kernel.
>     Now with VMSCAPE (BHI variant) it is also required to isolate branch
>     history between guests and userspace. Since BHI_DIS_S only protects the
>     kernel, the newer CPUs also use IBPB.
>     A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
>     But it currently does not clear enough BHB entries to be effective on newer
>     CPUs with larger BHB. At boot, dynamically set the loop count of
>     clear_bhb_loop() such that it is effective on newer CPUs too.
>     Introduce global loop counts, initializing them with appropriate value
>     based on the hardware feature X86_FEATURE_BHI_CTRL.
>     Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
>     Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
>     Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>

The short BHB clear sequence with the hardcoded loop count of 5 was
added by clear_bhb_loop() in entry_64.S, which is the code this patch
makes configurable.  Should this carry:

  Fixes: 7390db8aea0d ("x86/bhi: Add support for clearing branch history at syscall entry")


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/28046221590

^ permalink raw reply

* Re: [PATCH v12 08/12] KVM: Define EXPORT_STATIC_CALL_FOR_KVM()
From: bot+bpf-ci @ 2026-06-23 18:13 UTC (permalink / raw)
  To: pawan.kumar.gupta, x86, jon, nik.borisov, hpa, jpoimboe,
	david.kaplan, seanjc, bp, dave.hansen, peterz, ast, daniel,
	andrii, kpsingh, jolsa, davem, david.laight.linux, luto, tglx,
	mingo, dsahern, martin.lau, eddyz87, song, yonghong.song,
	john.fastabend, sdf, haoluo, pbonzini, corbet, jbaron, aliceryhl,
	rostedt, ardb, skhan
  Cc: linux-kernel, kvm, asit.k.mallick, tao1.zhang, bpf, netdev,
	linux-doc, ast, andrii, daniel, martin.lau, eddyz87,
	yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260622-vmscape-bhb-v12-8-76cbda0ae3e5@linux.intel.com>

[-- Attachment #1: Type: text/plain, Size: 2597 bytes --]

> EXPORT_SYMBOL_FOR_KVM() exists to export symbols to KVM modules. Static
> calls need the same treatment when the core kernel defines a static_call
> that KVM needs access to (e.g. from a VM-exit path).
>
> Define EXPORT_STATIC_CALL_FOR_KVM() as the static_call analogue of
> EXPORT_SYMBOL_FOR_KVM(). The same three-way logic applies:
>
>   - KVM_SUB_MODULES defined: export to "kvm," plus all sub-modules
>   - KVM=m, no sub-modules: export to "kvm" only
>   - KVM built-in: no export needed (noop)
>
>   As with EXPORT_SYMBOL_FOR_KVM(), allow architectures to override both
>   macros (e.g. to suppress the export when kvm.ko itself will not be
>   built despite CONFIG_KVM=m). Add the x86 no-op overrides in
>   arch/x86/include/asm/kvm_types.h for that case. To keep the pair in
>   sync, EXPORT_STATIC_CALL_FOR_KVM() is defined inside the
>   EXPORT_SYMBOL_FOR_KVM #ifndef block; an arch that defines
>   EXPORT_SYMBOL_FOR_KVM must also define EXPORT_STATIC_CALL_FOR_KVM or the
>   build will fail with a compile-time error.
>
> As with EXPORT_SYMBOL_FOR_KVM(), allow architectures to override
> EXPORT_STATIC_CALL_FOR_KVM definition (e.g. to suppress the export when
> kvm.ko itself will not be built despite CONFIG_KVM=m). Add the x86 no-op
> override in arch/x86/include/asm/kvm_types.h for that case.
>
> Architectures must also define EXPORT_STATIC_CALL_FOR_KVM when they define
> EXPORT_SYMBOL_FOR_KVM.

This isn't a bug, but the arch-override rationale appears twice in nearly
identical form.  The third paragraph (starting "As with
EXPORT_SYMBOL_FOR_KVM(), allow architectures to override both macros") and
the fourth paragraph (starting "As with EXPORT_SYMBOL_FOR_KVM(), allow
architectures to override EXPORT_STATIC_CALL_FOR_KVM definition") both
explain suppressing the export when kvm.ko will not be built and both
mention adding the x86 no-op override.

The requirement that an arch defining EXPORT_SYMBOL_FOR_KVM must also
define EXPORT_STATIC_CALL_FOR_KVM also shows up twice: once inside the
third paragraph ("an arch that defines EXPORT_SYMBOL_FOR_KVM must also
define EXPORT_STATIC_CALL_FOR_KVM or the build will fail") and again as the
standalone final sentence ("Architectures must also define
EXPORT_STATIC_CALL_FOR_KVM when they define EXPORT_SYMBOL_FOR_KVM").

Could the duplicate paragraphs be folded into a single statement?


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/28046221590

^ permalink raw reply

* Re: [PATCH net 1/1] net/sched: cls_api: Handle TC_ACT_CONSUMED in tcf_qevent_handle
From: Jamal Hadi Salim @ 2026-06-23 18:00 UTC (permalink / raw)
  To: netdev
  Cc: davem, edumazet, kuba, pabeni, horms, jiri, victor, security,
	Zero Day Initiative
In-Reply-To: <20260620130749.226642-1-jhs@mojatatu.com>

On Sat, Jun 20, 2026 at 9:07 AM Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>
> tcf_classify() can return TC_ACT_CONSUMED while the skb is held by the
> defragmentation engine (e.g. act_ct on out-of-order fragments). When
> that happens the skb is no longer owned by the caller and must not be
> touched again.
>
> tcf_qevent_handle() did not handle TC_ACT_CONSUMED: it fell through the
> switch and returned the skb to the caller as if classification had
> passed. The only qdisc that wires up qevents today is RED, via three call sites
> (qe_mark on RED_PROB_MARK/HARD_MARK, qe_early_drop on congestion_drop).
> red_enqueue() was continuing to operate on an skb it no longer owns in this
> case -- enqueueing it, dropping it, or updating statistics -- resulting in a UAF.
>
>   tc qdisc add dev eth0 root handle 1: red ... qevent early_drop block 10
>   tc filter add block 10 ... action ct
>
>   (with ct defrag enabled and traffic that produces out-of-order
>   fragments, e.g. a fragmented UDP stream)
>
> Handle TC_ACT_CONSUMED in tcf_qevent_handle() the same way the ingress
> and egress fast paths do: treat it as stolen and return NULL without
> touching the skb. Unlike the TC_ACT_STOLEN case, the skb must not be
> dropped/freed here, as it is no longer owned by us.
>

I just looked at sashiko claims - one of them (on ebpf) is legit but
the one on qdiscs is some BS it is making up. I will address the ebpf
one this week.

cheers,
jamal

> Fixes: 3f14b377d01d ("net/sched: act_ct: fix skb leak and crash on ooo frags")
> Reported-by: Zero Day Initiative <zdi-disclosures@trendmicro.com>
> Tested-by: Victor Nogueira <victor@mojatatu.com>
> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
> ---
>  net/sched/cls_api.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
> index 20f7f9ee0b353..3e67600a4a1a1 100644
> --- a/net/sched/cls_api.c
> +++ b/net/sched/cls_api.c
> @@ -4049,6 +4049,9 @@ struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, stru
>                 skb_do_redirect(skb);
>                 *ret = __NET_XMIT_STOLEN;
>                 return NULL;
> +       case TC_ACT_CONSUMED:
> +               *ret = __NET_XMIT_STOLEN;
> +               return NULL;
>         }
>
>         return skb;
> --
> 2.34.1
>

^ permalink raw reply

* Re: [PATCH v14 0/9] tls: Add TLS 1.3 hardware offload support
From: Nils Juenemann @ 2026-06-23 17:53 UTC (permalink / raw)
  To: rjethwani, netdev
  Cc: borisp, davem, edumazet, john.fastabend, kuba, leon, mbloch,
	saeedm, sd, tariqt

Hi Rishikesh, all,

we have been testing the v14 TLS 1.3 HW offload series on a ConnectX-6
DX and hit a sendfile() final-record loss on the device TX path. We
reduced it to a self-contained C reproducer and characterized it;
reporting it here with the analysis and a question on where a fix belongs.

Setup:

NIC: ConnectX-6 DX (crypto enabled), FW 22.47.1026, SR-IOV VF,
TX offload only

Kernel: net-next + this v14 series

TLS 1.3, AES-128-GCM, kTLS installed via setsockopt(TLS_TX) on the
sending side with fixed test crypto material and no handshake, like
tools/testing/selftests/net/tls

a server sends a file with the raw sendfile(2) syscall; a client on
another host reads the decrypted stream and counts the bytes

Trigger: sendfile(2) with a count larger than the bytes remaining in
the file (count > EOF). This is what a generic copy loop / Go's
net.TCPConn.ReadFrom passes for a file of unknown length (~2 GiB). The
kernel sends up to EOF, but the connection's final TLS record then
appears not to be put on the wire unless a subsequent write flushes it.
An abrupt close() appears to drop it, and the peer receives the whole
body except the last record's bytes.

Reproducer results (two hosts over the ConnectX - a loopback/same-host
connection stays on TLS_SW and does not show it). Same file, 226965
bytes (= 13*16384 + 13973):

TLS_HW count>EOF close() -> 212992 short
TLS_HW count>EOF close(), no zerocopy -> 212992 same
TLS_HW count==exact close() -> 226965 full
TLS_HW count>EOF close_notify, then close() -> 226965 full
TLS_SW count>EOF close(), hw-tx-offload off -> 226965 full

So it is specific to the device-offload TX path: the final record of a
count > EOF sendfile() appears not to be finalized/flushed at EOF, only
by a following write. A bounded count, a trailing write (close_notify),
or software kTLS all avoid it. TLS_TX_ZEROCOPY_RO makes no difference.
We are currently using the exact-count workaround in a preview environment.

We may be misreading the code, so this is only a pointer: with
count > EOF tls_push_data() fills the last record without reaching the
size==0 case; on the device path tls_device_record_close() for that
pending record appears to run only on the next push, and an abrupt
teardown appears to discard it. The software path seems to flush
pending TX records on close (tls_sw_release_resources_tx), which would
explain why it is unaffected.

Reproducer:
https://gist.github.com/totallyunknown/a8f0ad3c54e40befde2f5a8d360fa6be

It installs kTLS with fixed test crypto material via
setsockopt(TLS_TX/TLS_RX), sends a file using the raw sendfile(2)
syscall, and compares count > EOF against exact-count and close_notify.
The v14 selftest (patch 9/9) sends via send() only and ends cleanly, so
it misses this; a sendfile() + count > EOF case reproduces it
deterministically for us.

Question: should the device offload finalize and flush the connection's
final record at EOF / on close, the way software kTLS does, or is a
trailing write required by contract? And should a fix live in net/tls
(device record close on the final partial record / the close path) or
on the mlx5 side?

Thanks,
Nils Juenemann

^ permalink raw reply

* Re: [PATCH net] net/mlx5e: Use sender devcom for MPV master-up
From: manjunath.b.patil @ 2026-06-23 17:51 UTC (permalink / raw)
  To: Tariq Toukan, Saeed Mahameed, Mark Bloch, Leon Romanovsky, netdev
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Patrisious Haddad, linux-rdma, linux-kernel, stable
In-Reply-To: <293db0b4-f308-469e-99c1-ef1b57d41451@nvidia.com>



On 6/22/26 2:01 AM, Tariq Toukan wrote:
> 
> 
> On 10/06/2026 20:39, Manjunath Patil wrote:
>> After PCIe DPC recovery, mlx5 reloads the affected functions and
>> replays multiport affiliation events. In the reported failure, the
>> first relevant device error was:
>>
>>    pcieport 0000:10:01.1: DPC: containment event
>>    pcieport 0000:10:01.1: PCIe Bus Error: severity=Uncorrected (Fatal)
>>    pcieport 0000:10:01.1:    [ 5] SDES                   (First)
>>
>> mlx5 recovered the PCI functions and resumed 0000:11:00.1. During
>> that resume, RDMA multiport binding replayed
>> MLX5_DRIVER_EVENT_AFFILIATION_DONE and mlx5e sent
>> MPV_DEVCOM_MASTER_UP. The host then panicked with:
>>
>>    BUG: kernel NULL pointer dereference, address: 0000000000000010
>>    RIP: mlx5_devcom_comp_set_ready+0x5/0x40 [mlx5_core]
>>    RDI: 0000000000000000
>>
>> Call trace included:
>>
>>    mlx5_devcom_comp_set_ready
>>    mlx5e_devcom_event_mpv
>>    mlx5_devcom_send_event
>>    mlx5_ib_bind_slave_port
>>    mlx5r_mp_probe
>>    mlx5_pci_resume
>>
>> MPV devcom registration publishes mlx5e private data to the component
>> peer list before mlx5e_devcom_init_mpv() stores the returned component
>> device in priv->devcom. A concurrent master-up event can therefore
>> reach a peer whose private data is visible but whose priv->devcom
>> backpointer is still NULL.
>>
>> MPV_DEVCOM_MASTER_UP already carries the sender/master mlx5e private
>> data as event_data. The ready bit is stored on the shared devcom
>> component, not on an individual peer. Use the sender devcom when
>> marking the MPV component ready.
>>
>> This preserves the readiness transition while avoiding a NULL
>> dereference of the peer devcom pointer during affiliation replay after
>> PCI error recovery.
>>
>> Fixes: bf11485f8419 ("net/mlx5: Register mlx5e priv to devcom in MPV mode")
>> Assisted-by: Codex:gpt-5
>> Signed-off-by: Manjunath Patil <manjunath.b.patil@oracle.com>
>> Cc: stable@vger.kernel.org # 6.7+
>> ---
> 
> Thanks for your patch and sorry for the late response.
> 
>>   drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 7 +++++--
>>   1 file changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/ 
>> drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> index 8f2b3abe0092..f7ff20b97e8c 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
>> @@ -211,11 +211,14 @@ static void mlx5e_disable_async_events(struct 
>> mlx5e_priv *priv)
>>   static int mlx5e_devcom_event_mpv(int event, void *my_data, void 
>> *event_data)
>>   {
>> -    struct mlx5e_priv *slave_priv = my_data;
>> +    struct mlx5e_priv *master_priv = event_data;
> 
> makes sense.
> 
>>       switch (event) {
>>       case MPV_DEVCOM_MASTER_UP:
>> -        mlx5_devcom_comp_set_ready(slave_priv->devcom, true);
>> +        if (!master_priv || !master_priv->devcom)
>> +            return -EINVAL;
> 
> is this currently possible? or just being defensive?
> if this return is unreachable I'd drop it.

Yes, the check is only defensive. For MPV_DEVCOM_MASTER_UP, event_data 
is passed from mlx5e_devcom_init_mpv() after priv->devcom has been 
assigned, so it should not be reachable in the valid path.

Please feel free to drop the check while applying. If you prefer a v2, 
let me know and I will send one.

Thanks,
Manjunath

> 
>> +
>> +        mlx5_devcom_comp_set_ready(master_priv->devcom, true);
>>           break;
>>       case MPV_DEVCOM_MASTER_DOWN:
>>           /* no need for comp set ready false since we unregister after
> 


^ permalink raw reply

* [PATCH bpf-next v2 15/15] selftests/bpf: Add test for bpf_tcp_ops header option hooks
From: Amery Hung @ 2026-06-23 17:50 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

Add a test exercising the bpf_tcp_ops parse_hdr, hdr_opt_len and
write_hdr_opt members together with the header option helpers.

The struct_ops program (progs/bpf_tcp_ops_hdr.c) reserves space in
hdr_opt_len via bpf_reserve_hdr_opt(), writes an experimental option in
write_hdr_opt via bpf_store_hdr_opt(), and recovers it in parse_hdr via
bpf_load_hdr_opt() on the incoming skb. Each hook bumps a counter and the
parse hook records the option payload, so the three callbacks and all
three overloaded helpers are covered.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 .../bpf/prog_tests/bpf_tcp_ops_hdr.c          | 97 +++++++++++++++++++
 .../selftests/bpf/progs/bpf_tcp_ops_hdr.c     | 83 ++++++++++++++++
 2 files changed, 180 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops_hdr.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_tcp_ops_hdr.c

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops_hdr.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops_hdr.c
new file mode 100644
index 000000000000..73e34d2be9a4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops_hdr.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "cgroup_helpers.h"
+#include "bpf_tcp_ops_hdr.skel.h"
+
+#define CGROUP_PATH	"/bpf_tcp_ops_hdr"
+#define TEST_NETNS	"bpf_tcp_ops_hdr"
+
+#define TEST_OPT_D0	0xAB
+#define TEST_OPT_D1	0xCD
+
+static void send_recv(void)
+{
+	char buf[64] = {};
+	int server_fd, client_fd, accept_fd;
+	ssize_t n;
+
+	server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+	if (!ASSERT_GE(server_fd, 0, "start_server"))
+		return;
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (!ASSERT_OK_FD(client_fd, "connect_to_fd"))
+		goto close_server;
+
+	accept_fd = accept(server_fd, NULL, NULL);
+	if (!ASSERT_OK_FD(accept_fd, "accept"))
+		goto close_client;
+
+	/* Exchange data both directions so option-bearing data packets
+	 * are sent and parsed on each side.
+	 */
+	n = send(client_fd, buf, sizeof(buf), 0);
+	ASSERT_EQ(n, sizeof(buf), "client_send");
+	n = recv(accept_fd, buf, sizeof(buf), 0);
+	ASSERT_EQ(n, sizeof(buf), "server_recv");
+
+	n = send(accept_fd, buf, sizeof(buf), 0);
+	ASSERT_EQ(n, sizeof(buf), "server_send");
+	n = recv(client_fd, buf, sizeof(buf), 0);
+	ASSERT_EQ(n, sizeof(buf), "client_recv");
+
+	close(accept_fd);
+close_client:
+	close(client_fd);
+close_server:
+	close(server_fd);
+}
+
+static void run_hdr_opt(void)
+{
+	struct bpf_tcp_ops_hdr *skel = NULL;
+	struct bpf_link *link = NULL;
+	struct netns_obj *ns = NULL;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup(CGROUP_PATH);
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	ns = netns_new(TEST_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto done;
+
+	skel = bpf_tcp_ops_hdr__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto done;
+
+	link = bpf_map__attach_cgroup_opts(skel->maps.test_hdr_ops, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link, "attach_cgroup"))
+		goto done;
+
+	send_recv();
+
+	/* Reserve + write hooks ran while sending. */
+	ASSERT_GT(skel->bss->hdr_opt_len_cnt, 0, "hdr_opt_len_cnt");
+	ASSERT_GT(skel->bss->write_cnt, 0, "write_cnt");
+	/* Parse hook ran and recovered our option on the receive side. */
+	ASSERT_GT(skel->bss->parse_cnt, 0, "parse_cnt");
+	ASSERT_GT(skel->bss->found_cnt, 0, "found_cnt");
+	ASSERT_EQ(skel->bss->found_d0, TEST_OPT_D0, "found_d0");
+	ASSERT_EQ(skel->bss->found_d1, TEST_OPT_D1, "found_d1");
+
+done:
+	bpf_link__destroy(link);
+	bpf_tcp_ops_hdr__destroy(skel);
+	netns_free(ns);
+	close(cgroup_fd);
+}
+
+void test_bpf_tcp_ops_hdr(void)
+{
+	run_hdr_opt();
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_ops_hdr.c b/tools/testing/selftests/bpf/progs/bpf_tcp_ops_hdr.c
new file mode 100644
index 000000000000..46618a604d96
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_tcp_ops_hdr.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/* Experimental option kind and payload written/parsed by this test. */
+#define TEST_OPT_KIND	0xFD
+#define TEST_OPT_LEN	4
+#define TEST_OPT_D0	0xAB
+#define TEST_OPT_D1	0xCD
+
+int hdr_opt_len_cnt;
+int write_cnt;
+int parse_cnt;
+int found_cnt;
+__u8 found_d0;
+__u8 found_d1;
+
+SEC("struct_ops")
+void BPF_PROG(test_hdr_opt_len, struct sock *sk, struct sk_buff *skb,
+	      struct request_sock *req, struct sk_buff *syn_skb,
+	      enum tcp_synack_type synack_type, unsigned int *remaining)
+{
+	hdr_opt_len_cnt++;
+
+	/* Reserve TEST_OPT_LEN bytes; the helper decrements *remaining. Stacks
+	 * with other progs in the cgroup hierarchy.
+	 */
+	bpf_reserve_hdr_opt(ctx, TEST_OPT_LEN, 0);
+}
+
+SEC("struct_ops")
+void BPF_PROG(test_write_hdr_opt, struct sock *sk, struct sk_buff *skb,
+	      struct request_sock *req, struct sk_buff *syn_skb,
+	      enum tcp_synack_type synack_type, __u32 opt_off)
+{
+	__u8 opt[TEST_OPT_LEN] = {
+		TEST_OPT_KIND, TEST_OPT_LEN, TEST_OPT_D0, TEST_OPT_D1,
+	};
+
+	/* bpf_store_hdr_opt() takes the program ctx (the kernel reads the
+	 * outgoing skb from it); it appends after any options already written
+	 * in the reserved window, rejects duplicates, and confines the write to
+	 * the header option scratch. Stacks across progs in the cgroup hierarchy.
+	 */
+	if (bpf_store_hdr_opt(ctx, opt, sizeof(opt), 0))
+		return;
+
+	write_cnt++;
+}
+
+SEC("struct_ops")
+void BPF_PROG(test_parse_hdr, struct sock *sk, struct sk_buff *skb)
+{
+	__u8 opt[TEST_OPT_LEN] = {
+		TEST_OPT_KIND, TEST_OPT_LEN, TEST_OPT_D0, TEST_OPT_D1,
+	};
+
+	parse_cnt++;
+
+	/* Look up the experimental option written by test_write_hdr_opt() in
+	 * the incoming skb. For an experimental kind the search matches on the
+	 * 2-byte magic in opt[2..3]; on a match the found option is copied back
+	 * into opt[].
+	 */
+	if (bpf_load_hdr_opt(ctx, opt, sizeof(opt), 0) < 0)
+		return;
+
+	found_d0 = opt[2];
+	found_d1 = opt[3];
+	found_cnt++;
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops test_hdr_ops = {
+	.hdr_opt_len	= (void *)test_hdr_opt_len,
+	.write_hdr_opt	= (void *)test_write_hdr_opt,
+	.parse_hdr	= (void *)test_parse_hdr,
+};
+
+char _license[] SEC("license") = "GPL";
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 14/15] selftests/bpf: Test attaching struct_ops to a cgroup
From: Amery Hung @ 2026-06-23 17:50 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

Exercise attaching the bpf_tcp_ops struct_ops to cgroups via the generic
cgroup link infrastructure. The struct_ops instances record their
execution order and the previous return value to validate correctness.

Subtests:
- query:        effective (BPF_F_QUERY_EFFECTIVE) and attached queries both return the attached maps' IDs
- order:        BPF_F_PREORDER vs attach order within a cgroup
- before_after: BPF_F_BEFORE/BPF_F_AFTER relative positioning
- update:       bpf_link__update_map swaps a link's map, keeping its slot
- retval:       int return value chained across timeout_init progs of
                multiple bpf_tcp_ops attached to a cgroup
- hierarchy:    parent and child attachments merge in the child's
                effective array (descendant before ancestor)
- inherit:      a child created after the attach inherits the parent's
                prog

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 .../selftests/bpf/prog_tests/bpf_tcp_ops.c    | 554 ++++++++++++++++++
 .../testing/selftests/bpf/progs/bpf_tcp_ops.c | 141 +++++
 2 files changed, 695 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_tcp_ops.c

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c
new file mode 100644
index 000000000000..4d087bdc4613
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ops.c
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+#include <bpf/btf.h>
+#include "cgroup_helpers.h"
+#include "bpf_tcp_ops.skel.h"
+
+#define CGROUP_PATH	"/bpf_tcp_ops"
+#define TEST_NETNS	"bpf_tcp_ops"
+
+static __s32 get_bpf_tcp_ops_type_id(void)
+{
+	struct btf *vmlinux_btf;
+	__s32 type_id;
+
+	vmlinux_btf = btf__load_vmlinux_btf();
+	if (!ASSERT_OK_PTR(vmlinux_btf, "load_vmlinux_btf"))
+		return -1;
+
+	type_id = btf__find_by_name_kind(vmlinux_btf, "bpf_tcp_ops", BTF_KIND_STRUCT);
+	btf__free(vmlinux_btf);
+
+	ASSERT_GT(type_id, 0, "find_bpf_tcp_ops");
+	return type_id;
+}
+
+static void reset_order(struct bpf_tcp_ops *skel)
+{
+	memset(skel->bss->listen_order, 0, sizeof(skel->bss->listen_order));
+	memset(skel->bss->connect_order, 0, sizeof(skel->bss->connect_order));
+	skel->bss->listen_cnt = 0;
+	skel->bss->connect_cnt = 0;
+}
+
+static void do_listen_connect(int family)
+{
+	const char *addr = family == AF_INET ? "127.0.0.1" : "::1";
+	int server_fd, client_fd;
+
+	server_fd = start_server(family, SOCK_STREAM, addr, 0, 0);
+	if (!ASSERT_GE(server_fd, 0, "start_server"))
+		return;
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (ASSERT_OK_FD(client_fd, "connect_to_fd"))
+		close(client_fd);
+
+	close(server_fd);
+}
+
+/*
+ * Attach ops1 and ops2 normally (in that order), then ops3 with
+ * BPF_F_PREORDER. Expected execution order: [3, 1, 2] — ops3 runs
+ * first despite being attached last, ops1 before ops2 by attach order.
+ */
+static void test_order(int cgroup_fd, struct bpf_tcp_ops *skel, int family)
+{
+	LIBBPF_OPTS(bpf_cgroup_opts, preorder_opts, .flags = BPF_F_PREORDER);
+	struct bpf_link *link1 = NULL, *link2 = NULL, *link3 = NULL;
+
+	link1 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops1, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link1, "attach_ops1"))
+		goto done;
+
+	link2 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops2, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link2, "attach_ops2"))
+		goto done;
+
+	link3 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops3, cgroup_fd,
+					    &preorder_opts);
+	if (!ASSERT_OK_PTR(link3, "attach_ops3_preorder"))
+		goto done;
+
+	reset_order(skel);
+	do_listen_connect(family);
+
+	ASSERT_EQ(skel->bss->listen_cnt, 3, "listen_cnt");
+	ASSERT_EQ(skel->bss->listen_order[0], 3, "listen_order[0]");
+	ASSERT_EQ(skel->bss->listen_order[1], 1, "listen_order[1]");
+	ASSERT_EQ(skel->bss->listen_order[2], 2, "listen_order[2]");
+
+	ASSERT_EQ(skel->bss->connect_cnt, 3, "connect_cnt");
+	ASSERT_EQ(skel->bss->connect_order[0], 3, "connect_order[0]");
+	ASSERT_EQ(skel->bss->connect_order[1], 1, "connect_order[1]");
+	ASSERT_EQ(skel->bss->connect_order[2], 2, "connect_order[2]");
+
+done:
+	bpf_link__destroy(link3);
+	bpf_link__destroy(link2);
+	bpf_link__destroy(link1);
+}
+
+static void run_order_subtest(void)
+{
+	struct bpf_tcp_ops *skel = NULL;
+	struct netns_obj *ns = NULL;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup(CGROUP_PATH);
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	ns = netns_new(TEST_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto done;
+
+	skel = bpf_tcp_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto done;
+
+	test_order(cgroup_fd, skel, AF_INET);
+	test_order(cgroup_fd, skel, AF_INET6);
+
+done:
+	bpf_tcp_ops__destroy(skel);
+	netns_free(ns);
+	close(cgroup_fd);
+}
+
+/*
+ * Position a new attachment relative to an existing one. Attach ops1, then
+ * ops2 with BPF_F_BEFORE ops1, then ops3 with BPF_F_AFTER ops2. Expected
+ * execution order: [2, 3, 1]. For struct_ops, relative_fd refers to a link
+ * fd, so BPF_F_LINK must be set.
+ */
+static void test_before_after(int cgroup_fd, struct bpf_tcp_ops *skel)
+{
+	struct bpf_link *link1 = NULL, *link2 = NULL, *link3 = NULL;
+	LIBBPF_OPTS(bpf_cgroup_opts, opts);
+
+	link1 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops1, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link1, "attach_ops1"))
+		goto done;
+
+	opts.flags = BPF_F_BEFORE | BPF_F_LINK;
+	opts.relative_fd = bpf_link__fd(link1);
+	link2 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops2, cgroup_fd, &opts);
+	if (!ASSERT_OK_PTR(link2, "attach_ops2_before"))
+		goto done;
+
+	opts.flags = BPF_F_AFTER | BPF_F_LINK;
+	opts.relative_fd = bpf_link__fd(link2);
+	link3 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops3, cgroup_fd, &opts);
+	if (!ASSERT_OK_PTR(link3, "attach_ops3_after"))
+		goto done;
+
+	reset_order(skel);
+	do_listen_connect(AF_INET6);
+
+	ASSERT_EQ(skel->bss->listen_cnt, 3, "listen_cnt");
+	ASSERT_EQ(skel->bss->listen_order[0], 2, "listen_order[0]");
+	ASSERT_EQ(skel->bss->listen_order[1], 3, "listen_order[1]");
+	ASSERT_EQ(skel->bss->listen_order[2], 1, "listen_order[2]");
+
+done:
+	bpf_link__destroy(link3);
+	bpf_link__destroy(link2);
+	bpf_link__destroy(link1);
+}
+
+static void run_before_after_subtest(void)
+{
+	struct bpf_tcp_ops *skel = NULL;
+	struct netns_obj *ns = NULL;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup(CGROUP_PATH);
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	ns = netns_new(TEST_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto done;
+
+	skel = bpf_tcp_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto done;
+
+	test_before_after(cgroup_fd, skel);
+
+done:
+	bpf_tcp_ops__destroy(skel);
+	netns_free(ns);
+	close(cgroup_fd);
+}
+
+static void test_query(int cgroup_fd, struct bpf_tcp_ops *skel)
+{
+	struct bpf_map_info info = {};
+	__u32 info_len = sizeof(info);
+	LIBBPF_OPTS(bpf_prog_query_opts, query_opts);
+	struct bpf_link *link1 = NULL, *link2 = NULL;
+	__u32 map1_id, map2_id, map_ids[2] = {};
+	__s32 type_id;
+
+	type_id = get_bpf_tcp_ops_type_id();
+	if (type_id <= 0)
+		return;
+
+	bpf_map_get_info_by_fd(bpf_map__fd(skel->maps.tcp_ops1), &info, &info_len);
+	map1_id = info.id;
+
+	bpf_map_get_info_by_fd(bpf_map__fd(skel->maps.tcp_ops2), &info, &info_len);
+	map2_id = info.id;
+
+	link1 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops1, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link1, "attach_ops1"))
+		goto done;
+
+	link2 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops2, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link2, "attach_ops2"))
+		goto done;
+
+	/* query effective: expect 2 entries in attachment order */
+	query_opts.type_id = type_id;
+	query_opts.prog_ids = map_ids;
+	query_opts.count = ARRAY_SIZE(map_ids);
+	query_opts.query_flags = BPF_F_QUERY_EFFECTIVE;
+	ASSERT_OK(bpf_prog_query_opts(cgroup_fd, BPF_STRUCT_OPS, &query_opts),
+		  "query_effective");
+	ASSERT_EQ(query_opts.count, 2, "query_effective_count");
+	ASSERT_EQ(map_ids[0], map1_id, "map_ids[0]");
+	ASSERT_EQ(map_ids[1], map2_id, "map_ids[1]");
+
+	/* query attached (non-effective): expect 2 entries */
+	memset(map_ids, 0, sizeof(map_ids));
+	query_opts.query_flags = 0;
+	query_opts.count = ARRAY_SIZE(map_ids);
+	ASSERT_OK(bpf_prog_query_opts(cgroup_fd, BPF_STRUCT_OPS, &query_opts),
+		  "query_attached");
+	ASSERT_EQ(query_opts.count, 2, "query_attached_count");
+	ASSERT_EQ(map_ids[0], map1_id, "attached_map_ids[0]");
+	ASSERT_EQ(map_ids[1], map2_id, "attached_map_ids[1]");
+
+done:
+	bpf_link__destroy(link2);
+	bpf_link__destroy(link1);
+}
+
+static void run_query_subtest(void)
+{
+	struct bpf_tcp_ops *skel = NULL;
+	struct netns_obj *ns = NULL;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup(CGROUP_PATH);
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	ns = netns_new(TEST_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto done;
+
+	skel = bpf_tcp_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto done;
+
+	test_query(cgroup_fd, skel);
+
+done:
+	bpf_tcp_ops__destroy(skel);
+	netns_free(ns);
+	close(cgroup_fd);
+}
+
+/* Must match progs/bpf_tcp_ops.c */
+#define OPS_RETVAL1	11
+#define OPS_RETVAL2	22
+
+/*
+ * Attach three struct_ops implementing timeout_init to the same cgroup; they
+ * run in attach order [retval1, retval2, retval3]. timeout_init's return value
+ * is chained: the first prog reads the kernel seed via bpf_get_retval() (0,
+ * since no legacy sockops prog is attached) and returns OPS_RETVAL1; each
+ * subsequent prog must then observe the previous prog's return value. This
+ * proves the trampoline inherits the retval across an array of struct_ops.
+ */
+static void test_retval(int cgroup_fd, struct bpf_tcp_ops *skel)
+{
+	struct bpf_link *link1 = NULL, *link2 = NULL, *link3 = NULL;
+
+	skel->bss->retval_saw1 = -1;
+	skel->bss->retval_saw2 = -1;
+	skel->bss->retval_saw3 = -1;
+
+	link1 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops_retval1, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link1, "attach_retval1"))
+		goto done;
+
+	link2 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops_retval2, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link2, "attach_retval2"))
+		goto done;
+
+	link3 = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops_retval3, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link3, "attach_retval3"))
+		goto done;
+
+	do_listen_connect(AF_INET6);
+
+	/* First prog inherits the kernel seed (no legacy sockops -> 0). */
+	ASSERT_EQ(skel->bss->retval_saw1, 0, "retval_saw1");
+	/* Each subsequent prog inherits the previous prog's return value. */
+	ASSERT_EQ(skel->bss->retval_saw2, OPS_RETVAL1, "retval_saw2");
+	ASSERT_EQ(skel->bss->retval_saw3, OPS_RETVAL2, "retval_saw3");
+
+done:
+	bpf_link__destroy(link3);
+	bpf_link__destroy(link2);
+	bpf_link__destroy(link1);
+}
+
+static void run_retval_subtest(void)
+{
+	struct bpf_tcp_ops *skel = NULL;
+	struct netns_obj *ns = NULL;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup(CGROUP_PATH);
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	ns = netns_new(TEST_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto done;
+
+	skel = bpf_tcp_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto done;
+
+	test_retval(cgroup_fd, skel);
+
+done:
+	bpf_tcp_ops__destroy(skel);
+	netns_free(ns);
+	close(cgroup_fd);
+}
+
+/*
+ * bpf_link__update_map() swaps the struct_ops map backing an attached link.
+ * The link keeps its position, including BPF_F_PREORDER, across the update.
+ * Attach ops1 (normal) and ops2 (preorder): order [2, 1]. Update the normal
+ * link to ops3 -> [2, 3]; update the preorder link to ops1 -> [1, 3].
+ */
+static void test_update(int cgroup_fd, struct bpf_tcp_ops *skel)
+{
+	LIBBPF_OPTS(bpf_cgroup_opts, preorder_opts, .flags = BPF_F_PREORDER);
+	struct bpf_link *link = NULL, *link_pre = NULL;
+
+	link = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops1, cgroup_fd, NULL);
+	if (!ASSERT_OK_PTR(link, "attach_ops1"))
+		goto done;
+
+	link_pre = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops2, cgroup_fd,
+					       &preorder_opts);
+	if (!ASSERT_OK_PTR(link_pre, "attach_ops2_preorder"))
+		goto done;
+
+	reset_order(skel);
+	do_listen_connect(AF_INET6);
+	ASSERT_EQ(skel->bss->listen_cnt, 2, "cnt_initial");
+	ASSERT_EQ(skel->bss->listen_order[0], 2, "order0_initial");
+	ASSERT_EQ(skel->bss->listen_order[1], 1, "order1_initial");
+
+	/* Update the normal link's map (ops1 -> ops3); position is unchanged. */
+	if (!ASSERT_OK(bpf_link__update_map(link, skel->maps.tcp_ops3), "update_normal"))
+		goto done;
+
+	reset_order(skel);
+	do_listen_connect(AF_INET6);
+	ASSERT_EQ(skel->bss->listen_order[0], 2, "order0_after_normal");
+	ASSERT_EQ(skel->bss->listen_order[1], 3, "order1_after_normal");
+
+	/* Update the preorder link's map (ops2 -> ops1); it stays first. */
+	if (!ASSERT_OK(bpf_link__update_map(link_pre, skel->maps.tcp_ops1), "update_preorder"))
+		goto done;
+
+	reset_order(skel);
+	do_listen_connect(AF_INET6);
+	ASSERT_EQ(skel->bss->listen_order[0], 1, "order0_after_preorder");
+	ASSERT_EQ(skel->bss->listen_order[1], 3, "order1_after_preorder");
+
+done:
+	bpf_link__destroy(link_pre);
+	bpf_link__destroy(link);
+}
+
+static void run_update_subtest(void)
+{
+	struct bpf_tcp_ops *skel = NULL;
+	struct netns_obj *ns = NULL;
+	int cgroup_fd;
+
+	cgroup_fd = test__join_cgroup(CGROUP_PATH);
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
+		return;
+
+	ns = netns_new(TEST_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto done;
+
+	skel = bpf_tcp_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto done;
+
+	test_update(cgroup_fd, skel);
+
+done:
+	bpf_tcp_ops__destroy(skel);
+	netns_free(ns);
+	close(cgroup_fd);
+}
+
+/*
+ * Two-level hierarchy. Attach ops1 to the parent and ops2 to the child, then
+ * trigger from a socket in the child. Descendant progs run before ancestor
+ * progs, so the order is [2 (child), 1 (parent)].
+ */
+static void test_hierarchy(int parent_fd, int child_fd, struct bpf_tcp_ops *skel)
+{
+	struct bpf_link *plink = NULL, *clink = NULL;
+
+	plink = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops1, parent_fd, NULL);
+	if (!ASSERT_OK_PTR(plink, "attach_parent"))
+		goto done;
+
+	clink = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops2, child_fd, NULL);
+	if (!ASSERT_OK_PTR(clink, "attach_child"))
+		goto done;
+
+	reset_order(skel);
+	do_listen_connect(AF_INET6);
+
+	ASSERT_EQ(skel->bss->listen_cnt, 2, "listen_cnt");
+	ASSERT_EQ(skel->bss->listen_order[0], 2, "listen_order[0]");
+	ASSERT_EQ(skel->bss->listen_order[1], 1, "listen_order[1]");
+
+done:
+	bpf_link__destroy(clink);
+	bpf_link__destroy(plink);
+}
+
+static void run_hierarchy_subtest(void)
+{
+	struct bpf_tcp_ops *skel = NULL;
+	struct netns_obj *ns = NULL;
+	int parent_fd, child_fd = -1;
+
+	parent_fd = test__join_cgroup(CGROUP_PATH);
+	if (!ASSERT_GE(parent_fd, 0, "join_parent_cgroup"))
+		return;
+
+	child_fd = create_and_get_cgroup(CGROUP_PATH "/child");
+	if (!ASSERT_GE(child_fd, 0, "create_child_cgroup"))
+		goto done;
+
+	if (!ASSERT_OK(join_cgroup(CGROUP_PATH "/child"), "join_child_cgroup"))
+		goto done;
+
+	ns = netns_new(TEST_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto done;
+
+	skel = bpf_tcp_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto done;
+
+	test_hierarchy(parent_fd, child_fd, skel);
+
+done:
+	bpf_tcp_ops__destroy(skel);
+	netns_free(ns);
+	if (child_fd >= 0)
+		close(child_fd);
+	close(parent_fd);
+}
+
+/*
+ * Attach ops1 to the parent, then create and join the child cgroup. The child
+ * is created after the attach, so it must inherit the parent's effective progs
+ * via cgroup_bpf_inherit(). A socket in the child runs the parent's prog.
+ */
+static void test_inherit(int parent_fd, struct bpf_tcp_ops *skel)
+{
+	struct bpf_link *plink = NULL;
+	int child_fd = -1;
+
+	plink = bpf_map__attach_cgroup_opts(skel->maps.tcp_ops1, parent_fd, NULL);
+	if (!ASSERT_OK_PTR(plink, "attach_parent"))
+		goto done;
+
+	child_fd = create_and_get_cgroup(CGROUP_PATH "/child");
+	if (!ASSERT_GE(child_fd, 0, "create_child_cgroup"))
+		goto done;
+
+	if (!ASSERT_OK(join_cgroup(CGROUP_PATH "/child"), "join_child_cgroup"))
+		goto done;
+
+	reset_order(skel);
+	do_listen_connect(AF_INET6);
+
+	ASSERT_EQ(skel->bss->listen_cnt, 1, "listen_cnt");
+	ASSERT_EQ(skel->bss->listen_order[0], 1, "listen_order[0]");
+
+done:
+	if (child_fd >= 0)
+		close(child_fd);
+	bpf_link__destroy(plink);
+}
+
+static void run_inherit_subtest(void)
+{
+	struct bpf_tcp_ops *skel = NULL;
+	struct netns_obj *ns = NULL;
+	int parent_fd;
+
+	parent_fd = test__join_cgroup(CGROUP_PATH);
+	if (!ASSERT_GE(parent_fd, 0, "join_parent_cgroup"))
+		return;
+
+	ns = netns_new(TEST_NETNS, true);
+	if (!ASSERT_OK_PTR(ns, "netns_new"))
+		goto done;
+
+	skel = bpf_tcp_ops__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
+		goto done;
+
+	test_inherit(parent_fd, skel);
+
+done:
+	bpf_tcp_ops__destroy(skel);
+	netns_free(ns);
+	close(parent_fd);
+}
+
+void test_bpf_tcp_ops(void)
+{
+	if (test__start_subtest("order"))
+		run_order_subtest();
+	if (test__start_subtest("before_after"))
+		run_before_after_subtest();
+	if (test__start_subtest("query"))
+		run_query_subtest();
+	if (test__start_subtest("retval"))
+		run_retval_subtest();
+	if (test__start_subtest("update"))
+		run_update_subtest();
+	if (test__start_subtest("hierarchy"))
+		run_hierarchy_subtest();
+	if (test__start_subtest("inherit"))
+		run_inherit_subtest();
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_ops.c b/tools/testing/selftests/bpf/progs/bpf_tcp_ops.c
new file mode 100644
index 000000000000..94a7f52573d5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_tcp_ops.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define MAX_CGROUP_OPS 8
+
+/* Call order for listen and connect, indexed by call sequence */
+u32 listen_order[MAX_CGROUP_OPS];
+u32 listen_cnt;
+
+u32 connect_order[MAX_CGROUP_OPS];
+u32 connect_cnt;
+
+static void record_listen(int id)
+{
+	u32 idx = listen_cnt;
+
+	if (idx < MAX_CGROUP_OPS) {
+		listen_order[idx] = id;
+		listen_cnt = idx + 1;
+	}
+}
+
+static void record_connect(int id)
+{
+	u32 idx = connect_cnt;
+
+	if (idx < MAX_CGROUP_OPS) {
+		connect_order[idx] = id;
+		connect_cnt = idx + 1;
+	}
+}
+
+/* struct_ops instance 1 */
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops1_listen, struct sock *sk)
+{
+	record_listen(1);
+}
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops1_connect, struct sock *sk)
+{
+	record_connect(1);
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops1 = {
+	.listen  = (void *)tcp_ops1_listen,
+	.connect = (void *)tcp_ops1_connect,
+};
+
+/* struct_ops instance 2 */
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops2_listen, struct sock *sk)
+{
+	record_listen(2);
+}
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops2_connect, struct sock *sk)
+{
+	record_connect(2);
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops2 = {
+	.listen  = (void *)tcp_ops2_listen,
+	.connect = (void *)tcp_ops2_connect,
+};
+
+/* struct_ops instance 3 */
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops3_listen, struct sock *sk)
+{
+	record_listen(3);
+}
+
+SEC("struct_ops")
+void BPF_PROG(tcp_ops3_connect, struct sock *sk)
+{
+	record_connect(3);
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops3 = {
+	.listen  = (void *)tcp_ops3_listen,
+	.connect = (void *)tcp_ops3_connect,
+};
+
+#define OPS_RETVAL1	11
+#define OPS_RETVAL2	22
+#define OPS_RETVAL3	33
+
+int retval_saw1;
+int retval_saw2;
+int retval_saw3;
+
+SEC("struct_ops")
+int BPF_PROG(tcp_ops_retval1_timeout_init, struct sock *sk, struct request_sock *req)
+{
+	retval_saw1 = bpf_get_retval();
+	return OPS_RETVAL1;
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops_retval1 = {
+	.timeout_init = (void *)tcp_ops_retval1_timeout_init,
+};
+
+SEC("struct_ops")
+int BPF_PROG(tcp_ops_retval2_timeout_init, struct sock *sk, struct request_sock *req)
+{
+	retval_saw2 = bpf_get_retval();
+	return OPS_RETVAL2;
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops_retval2 = {
+	.timeout_init = (void *)tcp_ops_retval2_timeout_init,
+};
+
+SEC("struct_ops")
+int BPF_PROG(tcp_ops_retval3_timeout_init, struct sock *sk, struct request_sock *req)
+{
+	retval_saw3 = bpf_get_retval();
+	return OPS_RETVAL3;
+}
+
+SEC(".struct_ops.link")
+struct bpf_tcp_ops tcp_ops_retval3 = {
+	.timeout_init = (void *)tcp_ops_retval3_timeout_init,
+};
+
+char _license[] SEC("license") = "GPL";
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 13/15] libbpf: Support attaching struct_ops to a cgroup
From: Amery Hung @ 2026-06-23 17:50 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

Add bpf_map__attach_cgroup_opts() to attach a struct_ops map to a cgroup
through a BPF link.

Also extend struct bpf_prog_query_opts with a type_id field so a
BPF_STRUCT_OPS query on a cgroup can select the struct_ops type to
enumerate.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 tools/lib/bpf/bpf.c            |  2 ++
 tools/lib/bpf/bpf.h            |  3 +-
 tools/lib/bpf/libbpf.c         | 59 ++++++++++++++++++++++++++++++++++
 tools/lib/bpf/libbpf.h         |  3 ++
 tools/lib/bpf/libbpf.map       |  5 +++
 tools/lib/bpf/libbpf_version.h |  2 +-
 6 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 96819c082c77..a9de7f107cf7 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -934,6 +934,7 @@ int bpf_link_create(int prog_fd, int target_fd,
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
 	case BPF_LSM_CGROUP:
+	case BPF_STRUCT_OPS:
 		relative_fd = OPTS_GET(opts, cgroup.relative_fd, 0);
 		relative_id = OPTS_GET(opts, cgroup.relative_id, 0);
 		if (relative_fd && relative_id)
@@ -1056,6 +1057,7 @@ int bpf_prog_query_opts(int target, enum bpf_attach_type type,
 	attr.query.attach_type		= type;
 	attr.query.query_flags		= OPTS_GET(opts, query_flags, 0);
 	attr.query.count		= OPTS_GET(opts, count, 0);
+	attr.query.type_id		= OPTS_GET(opts, type_id, 0);
 	attr.query.prog_ids		= ptr_to_u64(OPTS_GET(opts, prog_ids, NULL));
 	attr.query.link_ids		= ptr_to_u64(OPTS_GET(opts, link_ids, NULL));
 	attr.query.prog_attach_flags	= ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL));
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 7534a593edae..490e8cb4ba53 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -637,9 +637,10 @@ struct bpf_prog_query_opts {
 	__u32 *link_ids;
 	__u32 *link_attach_flags;
 	__u64 revision;
+	__u32 type_id;
 	size_t :0;
 };
-#define bpf_prog_query_opts__last_field revision
+#define bpf_prog_query_opts__last_field type_id
 
 /**
  * @brief **bpf_prog_query_opts()** queries the BPF programs and BPF links
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1368752aa13c..17f8466e33fa 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -14104,6 +14104,65 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
 	return &link->link;
 }
 
+struct bpf_link *bpf_map__attach_cgroup_opts(const struct bpf_map *map, int cgroup_fd,
+					     const struct bpf_cgroup_opts *opts)
+{
+	LIBBPF_OPTS(bpf_link_create_opts, link_create_opts);
+	struct bpf_link_struct_ops *link;
+	__u32 relative_id, zero = 0;
+	int err, fd, relative_fd;
+
+	if (!OPTS_VALID(opts, bpf_cgroup_opts))
+		return libbpf_err_ptr(-EINVAL);
+
+	if (!bpf_map__is_struct_ops(map)) {
+		pr_warn("map '%s': can't attach non-struct_ops map\n", map->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
+	if (map->fd < 0) {
+		pr_warn("map '%s': can't attach BPF map without FD (was it created?)\n", map->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
+	relative_id = OPTS_GET(opts, relative_id, 0);
+	relative_fd = OPTS_GET(opts, relative_fd, 0);
+
+	if (relative_fd && relative_id) {
+		pr_warn("map '%s': relative_fd and relative_id cannot be set at the same time\n",
+			map->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
+	link_create_opts.cgroup.expected_revision = OPTS_GET(opts, expected_revision, 0);
+	link_create_opts.cgroup.relative_fd = relative_fd;
+	link_create_opts.cgroup.relative_id = relative_id;
+	link_create_opts.flags = OPTS_GET(opts, flags, 0);
+
+	link = calloc(1, sizeof(*link));
+	if (!link)
+		return libbpf_err_ptr(-ENOMEM);
+
+	err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0);
+	if (err && err != -EBUSY) {
+		free(link);
+		return libbpf_err_ptr(err);
+	}
+
+	link->link.detach = bpf_link__detach_struct_ops;
+
+	fd = bpf_link_create(map->fd, cgroup_fd, BPF_STRUCT_OPS, &link_create_opts);
+	if (fd < 0) {
+		free(link);
+		return libbpf_err_ptr(fd);
+	}
+
+	link->link.fd = fd;
+	link->map_fd = map->fd;
+
+	return &link->link;
+}
+
 /*
  * Swap the back struct_ops of a link with a new struct_ops map.
  */
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index b965ad571540..0e5f4e9bba41 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -960,6 +960,9 @@ bpf_program__attach_cgroup_opts(const struct bpf_program *prog, int cgroup_fd,
 struct bpf_map;
 
 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
+LIBBPF_API struct bpf_link *bpf_map__attach_cgroup_opts(const struct bpf_map *map,
+							int cgroup_fd,
+							const struct bpf_cgroup_opts *opts);
 LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
 
 struct bpf_iter_attach_opts {
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b731df19ae69..1b01d49e58eb 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -462,3 +462,8 @@ LIBBPF_1.8.0 {
 		bpf_program__clone;
 		btf__new_empty_opts;
 } LIBBPF_1.7.0;
+
+LIBBPF_1.9.0 {
+	global:
+		bpf_map__attach_cgroup_opts;
+} LIBBPF_1.8.0;
diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h
index c446c0cd8cf9..57b74ef3618c 100644
--- a/tools/lib/bpf/libbpf_version.h
+++ b/tools/lib/bpf/libbpf_version.h
@@ -4,6 +4,6 @@
 #define __LIBBPF_VERSION_H
 
 #define LIBBPF_MAJOR_VERSION 1
-#define LIBBPF_MINOR_VERSION 8
+#define LIBBPF_MINOR_VERSION 9
 
 #endif /* __LIBBPF_VERSION_H */
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 12/15] bpf: tcp: Support parse/len/write header option hooks in bpf_tcp_ops
From: Amery Hung @ 2026-06-23 17:50 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

Add the TCP header option callbacks to the bpf_tcp_ops struct_ops type:

  parse_hdr     - parse the options of an incoming skb on an established
                  connection
  hdr_opt_len   - reserve space in the TCP header for bpf options
  write_hdr_opt - write the reserved bpf options

These mirror the BPF_SOCK_OPS_PARSE_HDR_OPT_CB, _HDR_OPT_LEN_CB and
_WRITE_HDR_OPT_CB legacy sockops callbacks, but are exposed as struct_ops
members so a program can implement them with normal function signatures
and per-member helper sets.

The reserved header window is shared between the legacy sockops and
bpf_tcp_ops paths. tcp_{syn,synack,established}_options() first run the
legacy BPF_SOCK_OPS_HDR_OPT_LEN_CB and then call hdr_opt_len, so both
sources accumulate into opts->bpf_opt_len; at write time the legacy
options are emitted first and bpf_tcp_ops writes after them.

API design

bpf_tcp_ops overloads the sock_ops header-option helpers rather than
introducing a new API: bpf_reserve_hdr_opt(), bpf_store_hdr_opt() and
bpf_load_hdr_opt() are exposed per-member (reserve for hdr_opt_len,
store/load for write_hdr_opt, load for parse_hdr) and share the existing
kernel option-walking core via __bpf_sock_ops_{store,load}_hdr_opt(), with
the bpf_tcp_ops wrappers synthesizing a temporary bpf_sock_ops_kern from
the program ctx. This keeps a port from the legacy
BPF_SOCK_OPS_*_HDR_OPT_CB callbacks mechanical (same helper calls) and
adds no new UAPI helper/kfunc surface.

An alternative considered was to drop the option helpers entirely: have
hdr_opt_len reserve space purely through its return value, and introduce
a dedicated TCP-header-option dynptr used for both reading and writing.
That is a cleaner, more self-contained interface, but it is a larger
change and does not reuse the legacy helpers, making a port from sockops
less mechanical. It can be pursued as a follow-up; the helper-based
interface here keeps this series focused on moving the hooks to
struct_ops.

The hdr_opt_len fast path in tcp_established_options() is gated by
cgroup_bpf_enabled(CGROUP_TCP_SOCK_OPS). Note this is a global,
per-attach-type static branch: it is enabled whenever any bpf_tcp_ops is
attached, even one that does not implement hdr_opt_len or that is attached
to a different cgroup. In those cases the block still runs but
bpf_tcp_ops_hdr_opt_len() no-ops via the per-member check in the dispatch
macro. A per-member/per-cgroup gate could be added later if the extra
fast-path work proves measurable.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 include/linux/filter.h         |   5 ++
 include/net/tcp.h              |  40 ++++++++++
 include/uapi/linux/bpf.h       |  35 ++++++---
 net/core/filter.c              |  32 +++++---
 net/ipv4/bpf_tcp_ops.c         | 139 ++++++++++++++++++++++++++++++++-
 net/ipv4/tcp_input.c           |  13 +++
 net/ipv4/tcp_output.c          |  46 +++++++++++
 tools/include/uapi/linux/bpf.h |  35 ++++++---
 8 files changed, 306 insertions(+), 39 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 67d337ede91b..fe28db65fb6a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1843,6 +1843,11 @@ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 inde
 	return XDP_REDIRECT;
 }
 
+int __bpf_sock_ops_load_hdr_opt(struct bpf_sock_ops_kern *bpf_sock,
+				void *search_res, u32 len, u64 flags);
+int __bpf_sock_ops_store_hdr_opt(struct bpf_sock_ops_kern *bpf_sock,
+				 const void *from, u32 len, u64 flags);
+
 #ifdef CONFIG_NET
 int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
 int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2102f9f2afd6..7bf702117602 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -3005,6 +3005,45 @@ struct bpf_tcp_ops {
 
 	/* Called on listen(2), right after the socket enters TCP_LISTEN. */
 	void (*listen)(struct sock *sk);
+
+	/* Parse the TCP header options of an incoming skb received on an
+	 * established connection. Use bpf_dynptr_from_skb()/bpf_skb_load_bytes()
+	 * to access the options.
+	 */
+	void (*parse_hdr)(struct sock *sk, struct sk_buff *skb);
+
+	/* Reserve space in the outgoing TCP header for options to be written
+	 * later by write_hdr_opt(). Call bpf_reserve_hdr_opt() to reserve bytes.
+	 *
+	 * @skb: outgoing packet. NULL when called from tcp_current_mss()
+	 *       (MSS sizing).
+	 * @req: request_sock on the synack path; NULL otherwise.
+	 * @syn_skb: incoming SYN on the synack path; NULL otherwise.
+	 * @synack_type: TCP_SYNACK_COOKIE indicates a stateless syncookie.
+	 * @remaining: pointer to the size of space still available; cast it
+	 *             using bpf_rdonly_cast() before dereferencing.
+	 */
+	void (*hdr_opt_len)(struct sock *sk, struct sk_buff *skb,
+			    struct request_sock *req, struct sk_buff *syn_skb,
+			    enum tcp_synack_type synack_type,
+			    unsigned int *remaining);
+
+	/* Write header options into the space reserved earlier by hdr_opt_len().
+	 * Use bpf_store_hdr_opt() to write; it appends within the reserved window
+	 * shared with legacy SOCKOPS.
+	 *
+	 * @skb: outgoing packet.
+	 * @req: request_sock on the synack path; NULL otherwise.
+	 * @syn_skb: incoming SYN on the synack path; NULL otherwise.
+	 * @synack_type: TCP_SYNACK_COOKIE indicates a stateless syncookie.
+	 * @opt_off: offset in the outgoing @skb's TCP header where the
+	 *	     bpf_tcp_ops portion of the reserved window begins, i.e. after
+	 *	     the kernel and legacy options.
+	 */
+	void (*write_hdr_opt)(struct sock *sk, struct sk_buff *skb,
+			      struct request_sock *req, struct sk_buff *syn_skb,
+			      enum tcp_synack_type synack_type,
+			      u32 opt_off);
 };
 
 #define bpf_tcp_ops_call(op, sk, ...)					\
@@ -3056,6 +3095,7 @@ do {									\
 	}								\
 	__retval;							\
 })
+
 #else
 #define bpf_tcp_ops_call(op, sk, ...)		do { } while (0)
 #define bpf_tcp_ops_call_int(op, init_retval, sk, ...)	(init_retval)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2b84c69eb814..45b9ee29e461 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4799,15 +4799,18 @@ union bpf_attr {
  * 		The non-negative copied *buf* length equal to or less than
  * 		*size* on success, or a negative error in case of failure.
  *
- * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
+ * long bpf_load_hdr_opt(void *ctx, void *searchby_res, u32 len, u64 flags)
  *	Description
  *		Load header option.  Support reading a particular TCP header
- *		option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**).
+ *		option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**).  For the
+ *		**bpf_tcp_ops** struct_ops, this helper can be called from the
+ *		**parse_hdr**\ () and **write_hdr_opt**\ () operators.
  *
- *		If *flags* is 0, it will search the option from the
- *		*skops*\ **->skb_data**.  The comment in **struct bpf_sock_ops**
- *		has details on what skb_data contains under different
- *		*skops*\ **->op**.
+ *		If *flags* is 0, it will search the option from the packet
+ *		associated with the current operation.  For
+ *		**BPF_PROG_TYPE_SOCK_OPS**, the comment in
+ *		**struct bpf_sock_ops** has details on what skb_data
+ *		contains under different *op*.
  *
  *		The first byte of the *searchby_res* specifies the
  *		kind that it wants to search.
@@ -4840,6 +4843,8 @@ union bpf_attr {
  *
  *		* **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the
  *		  saved_syn packet or the just-received syn packet.
+ *		  Not supported by the **bpf_tcp_ops** struct_ops, which
+ *		  rejects all flags.
  *
  *	Return
  *		> 0 when found, the header option is copied to *searchby_res*.
@@ -4860,9 +4865,9 @@ union bpf_attr {
  *		packet.
  *
  *		**-EPERM** if the helper cannot be used under the current
- *		*skops*\ **->op**.
+ *		operation.
  *
- * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags)
+ * long bpf_store_hdr_opt(void *ctx, const void *from, u32 len, u64 flags)
  *	Description
  *		Store header option.  The data will be copied
  *		from buffer *from* with length *len* to the TCP header.
@@ -4878,7 +4883,9 @@ union bpf_attr {
  *		by searching the same option in the outgoing skb.
  *
  *		This helper can only be called during
- *		**BPF_SOCK_OPS_WRITE_HDR_OPT_CB**.
+ *		**BPF_SOCK_OPS_WRITE_HDR_OPT_CB**, or from the
+ *		**write_hdr_opt**\ () operator of the **bpf_tcp_ops**
+ *		struct_ops.
  *
  *	Return
  *		0 on success, or negative error in case of failure:
@@ -4893,9 +4900,9 @@ union bpf_attr {
  *		**-EFAULT** on failure to parse the existing header options.
  *
  *		**-EPERM** if the helper cannot be used under the current
- *		*skops*\ **->op**.
+ *		operation.
  *
- * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags)
+ * long bpf_reserve_hdr_opt(void *ctx, u32 len, u64 flags)
  *	Description
  *		Reserve *len* bytes for the bpf header option.  The
  *		space will be used by **bpf_store_hdr_opt**\ () later in
@@ -4905,7 +4912,9 @@ union bpf_attr {
  *		the total number of bytes will be reserved.
  *
  *		This helper can only be called during
- *		**BPF_SOCK_OPS_HDR_OPT_LEN_CB**.
+ *		**BPF_SOCK_OPS_HDR_OPT_LEN_CB**, or from the
+ *		**hdr_opt_len**\ () operator of the **bpf_tcp_ops**
+ *		struct_ops.
  *
  *	Return
  *		0 on success, or negative error in case of failure:
@@ -4915,7 +4924,7 @@ union bpf_attr {
  *		**-ENOSPC** if there is not enough space in the header.
  *
  *		**-EPERM** if the helper cannot be used under the current
- *		*skops*\ **->op**.
+ *		operation.
  *
  * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags)
  *	Description
diff --git a/net/core/filter.c b/net/core/filter.c
index f85578772930..dc44ffb7a380 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7885,17 +7885,14 @@ static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
 	return ERR_PTR(-ENOMSG);
 }
 
-BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
-	   void *, search_res, u32, len, u64, flags)
+int __bpf_sock_ops_load_hdr_opt(struct bpf_sock_ops_kern *bpf_sock,
+				void *search_res, u32 len, u64 flags)
 {
 	bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
 	const u8 *op, *opend, *magic, *search = search_res;
 	u8 search_kind, search_len, copy_len, magic_len;
 	int ret;
 
-	if (!is_locked_tcp_sock_ops(bpf_sock))
-		return -EOPNOTSUPP;
-
 	/* 2 byte is the minimal option len except TCPOPT_NOP and
 	 * TCPOPT_EOL which are useless for the bpf prog to learn
 	 * and this helper disallow loading them also.
@@ -7956,6 +7953,15 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
 	return ret;
 }
 
+BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+	   void *, search_res, u32, len, u64, flags)
+{
+	if (!is_locked_tcp_sock_ops(bpf_sock))
+		return -EOPNOTSUPP;
+
+	return __bpf_sock_ops_load_hdr_opt(bpf_sock, search_res, len, flags);
+}
+
 static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
 	.func		= bpf_sock_ops_load_hdr_opt,
 	.gpl_only	= false,
@@ -7966,17 +7972,14 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
-	   const void *, from, u32, len, u64, flags)
+int __bpf_sock_ops_store_hdr_opt(struct bpf_sock_ops_kern *bpf_sock,
+				 const void *from, u32 len, u64 flags)
 {
 	u8 new_kind, new_kind_len, magic_len = 0, *opend;
 	const u8 *op, *new_op, *magic = NULL;
 	struct sk_buff *skb;
 	bool eol;
 
-	if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
-		return -EPERM;
-
 	if (len < 2 || flags)
 		return -EINVAL;
 
@@ -8034,6 +8037,15 @@ BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
 	return 0;
 }
 
+BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+	   const void *, from, u32, len, u64, flags)
+{
+	if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
+		return -EPERM;
+
+	return __bpf_sock_ops_store_hdr_opt(bpf_sock, from, len, flags);
+}
+
 static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
 	.func		= bpf_sock_ops_store_hdr_opt,
 	.gpl_only	= false,
diff --git a/net/ipv4/bpf_tcp_ops.c b/net/ipv4/bpf_tcp_ops.c
index cf53c95a0dbc..0c7352517ac3 100644
--- a/net/ipv4/bpf_tcp_ops.c
+++ b/net/ipv4/bpf_tcp_ops.c
@@ -4,6 +4,7 @@
 #include <linux/bpf.h>
 #include <linux/btf_ids.h>
 #include <linux/bpf_verifier.h>
+#include <linux/filter.h>
 #include <net/bpf_sk_storage.h>
 #include <net/tcp.h>
 
@@ -55,6 +56,26 @@ static void listen_stub(struct sock *sk)
 {
 }
 
+static void parse_hdr_stub(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void hdr_opt_len_stub(struct sock *sk, struct sk_buff *skb__nullable,
+			     struct request_sock *req__nullable,
+			     struct sk_buff *syn_skb__nullable,
+			     enum tcp_synack_type synack_type,
+			     unsigned int *remaining)
+{
+}
+
+static void write_hdr_opt_stub(struct sock *sk, struct sk_buff *skb,
+			       struct request_sock *req__nullable,
+			       struct sk_buff *syn_skb__nullable,
+			       enum tcp_synack_type synack_type,
+			       u32 opt_off)
+{
+}
+
 static struct bpf_tcp_ops __bpf_tcp_ops = {
 	.timeout_init = timeout_init_stub,
 	.rwnd_init = rwnd_init_stub,
@@ -66,6 +87,99 @@ static struct bpf_tcp_ops __bpf_tcp_ops = {
 	.retrans = retrans_stub,
 	.connect = connect_stub,
 	.listen = listen_stub,
+	.parse_hdr = parse_hdr_stub,
+	.hdr_opt_len = hdr_opt_len_stub,
+	.write_hdr_opt = write_hdr_opt_stub,
+};
+
+BPF_CALL_4(bpf_tcp_ops_store_hdr_opt, void *, ctx, const void *, from,
+	   u32, len, u64, flags)
+{
+	struct sk_buff *skb = ((struct sk_buff **)ctx)[1];
+	struct bpf_sock_ops_kern sock_ops = {};
+	u32 opt_off = ((u64 *)ctx)[5];
+	u8 *op, *opend;
+
+	/* bpf_tcp_ops does not keep track of the end of the written TCP header
+	 * options, so search for it every time the helper is called. The free
+	 * space is NOP-filled, so a TCPOPT_NOP ends the search rather than being
+	 * skipped as in a normal option walk in sockops.
+	 */
+	op = skb->data + opt_off;
+	opend = skb->data + tcp_hdrlen(skb);
+	while (op < opend && *op != TCPOPT_NOP) {
+		if (*op == TCPOPT_EOL || op + 1 >= opend || op[1] < 2)
+			break;
+		op += op[1];
+	}
+
+	sock_ops.skb = skb;
+	sock_ops.skb_data_end = op;
+	sock_ops.remaining_opt_len = opend - op;
+
+	return __bpf_sock_ops_store_hdr_opt(&sock_ops, from, len, flags);
+}
+
+static const struct bpf_func_proto bpf_tcp_ops_store_hdr_opt_proto = {
+	.func		= bpf_tcp_ops_store_hdr_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_tcp_ops_load_hdr_opt, void *, ctx, void *, search_res,
+	   u32, len, u64, flags)
+{
+	struct sk_buff *skb = ((struct sk_buff **)ctx)[1];
+	struct bpf_sock_ops_kern sock_ops = {};
+
+	/* No flags supported. In particular BPF_LOAD_HDR_OPT_TCP_SYN, which
+	 * loads from the saved SYN, is not available because bpf_tcp_ops has no
+	 * carrier to track the SYN source across the hooks.
+	 */
+	if (flags)
+		return -EINVAL;
+
+	sock_ops.skb = skb;
+	sock_ops.skb_data_end = skb->data + tcp_hdrlen(skb);
+
+	return __bpf_sock_ops_load_hdr_opt(&sock_ops, search_res, len, flags);
+}
+
+static const struct bpf_func_proto bpf_tcp_ops_load_hdr_opt_proto = {
+	.func		= bpf_tcp_ops_load_hdr_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM | MEM_WRITE,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_tcp_ops_reserve_hdr_opt, void *, ctx, u32, len, u64, flags)
+{
+	unsigned int *remaining = ((unsigned int **)ctx)[5];
+
+	if (flags || len < 2)
+		return -EINVAL;
+
+	if (len > *remaining)
+		return -ENOSPC;
+
+	*remaining -= len;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_tcp_ops_reserve_hdr_opt_proto = {
+	.func		= bpf_tcp_ops_reserve_hdr_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
 };
 
 BPF_CALL_0(bpf_tcp_ops_get_retval)
@@ -102,14 +216,20 @@ get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_sk_storage_delete:
 		return &bpf_sk_storage_delete_proto;
 	case BPF_FUNC_setsockopt:
-		/* The listener is not locked. */
+		/* The sk may be an unlocked listener (synack path) or NULL
+		 * fullsock; disable for members that can run unlocked.
+		 */
 		if (moff == offsetof(struct bpf_tcp_ops, rwnd_init) ||
-		    moff == offsetof(struct bpf_tcp_ops, timeout_init))
+		    moff == offsetof(struct bpf_tcp_ops, timeout_init) ||
+		    moff == offsetof(struct bpf_tcp_ops, hdr_opt_len) ||
+		    moff == offsetof(struct bpf_tcp_ops, write_hdr_opt))
 			return NULL;
 		return &bpf_sk_setsockopt_proto;
 	case BPF_FUNC_getsockopt:
 		if (moff == offsetof(struct bpf_tcp_ops, rwnd_init) ||
-		    moff == offsetof(struct bpf_tcp_ops, timeout_init))
+		    moff == offsetof(struct bpf_tcp_ops, timeout_init) ||
+		    moff == offsetof(struct bpf_tcp_ops, hdr_opt_len) ||
+		    moff == offsetof(struct bpf_tcp_ops, write_hdr_opt))
 			return NULL;
 		return &bpf_sk_getsockopt_proto;
 	case BPF_FUNC_get_retval:
@@ -117,6 +237,19 @@ get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		    moff == offsetof(struct bpf_tcp_ops, rwnd_init))
 			return &bpf_tcp_ops_get_retval_proto;
 		return NULL;
+	case BPF_FUNC_reserve_hdr_opt:
+		if (moff == offsetof(struct bpf_tcp_ops, hdr_opt_len))
+			return &bpf_tcp_ops_reserve_hdr_opt_proto;
+		return NULL;
+	case BPF_FUNC_load_hdr_opt:
+		if (moff == offsetof(struct bpf_tcp_ops, parse_hdr) ||
+		    moff == offsetof(struct bpf_tcp_ops, write_hdr_opt))
+			return &bpf_tcp_ops_load_hdr_opt_proto;
+		return NULL;
+	case BPF_FUNC_store_hdr_opt:
+		if (moff == offsetof(struct bpf_tcp_ops, write_hdr_opt))
+			return &bpf_tcp_ops_store_hdr_opt_proto;
+		return NULL;
 	default:
 		return bpf_base_func_proto(func_id, prog);
 	}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 12fb690d21c4..a36146789138 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -208,6 +208,18 @@ static void bpf_skops_established(struct sock *sk, int bpf_op,
 }
 #endif
 
+static void bpf_tcp_ops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+	switch (sk->sk_state) {
+	case TCP_SYN_RECV:
+	case TCP_SYN_SENT:
+	case TCP_LISTEN:
+		return;
+	}
+
+	bpf_tcp_ops_call(parse_hdr, sk, skb);
+}
+
 static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb,
 				    unsigned int len)
 {
@@ -6431,6 +6443,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 
 pass:
 	bpf_skops_parse_hdr(sk, skb);
+	bpf_tcp_ops_parse_hdr(sk, skb);
 
 	return true;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 93f4a95399ea..580652d0a135 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -573,6 +573,13 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
 	if (nr_written < max_opt_len)
 		memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
 		       max_opt_len - nr_written);
+
+	/* bpf_tcp_ops portion is NOP-filled (everything past the sockops
+	 * writer's bytes). The writer finds the append point by scanning from
+	 * first_opt_off + nr_written to the first NOP.
+	 */
+	bpf_tcp_ops_call(write_hdr_opt, sk, skb, req, syn_skb, synack_type,
+			 first_opt_off + nr_written);
 }
 #else
 static u32 bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
@@ -594,6 +601,32 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
 }
 #endif
 
+static u32 bpf_tcp_ops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+				   struct request_sock *req,
+				   struct sk_buff *syn_skb,
+				   enum tcp_synack_type synack_type,
+				   struct tcp_out_options *opts,
+				   u32 remaining)
+{
+	unsigned int remaining_out = remaining, reserved;
+
+	if (!remaining)
+		return 0;
+
+	/* bpf_tcp_ops_reserve_hdr_opt() reserves space via remaining_out */
+	bpf_tcp_ops_call(hdr_opt_len, sk, skb, req, syn_skb, synack_type, &remaining_out);
+
+	reserved = remaining - remaining_out;
+	if (!reserved)
+		return remaining;
+
+	/* round up to 4 bytes */
+	reserved = (reserved + 3) & ~3;
+
+	opts->bpf_opt_len += reserved;
+	return remaining - reserved;
+}
+
 static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
 				      const struct tcp_request_sock *tcprsk,
 				      struct tcp_out_options *opts,
@@ -1053,6 +1086,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 
 	remaining = bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts,
 					  remaining);
+	remaining = bpf_tcp_ops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts,
+					    remaining);
 
 	return MAX_TCP_OPTION_SPACE - remaining;
 }
@@ -1141,6 +1176,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 
 	remaining = bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
 					  synack_type, opts, remaining);
+	remaining = bpf_tcp_ops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+					    synack_type, opts, remaining);
 
 	return MAX_TCP_OPTION_SPACE - remaining;
 }
@@ -1244,6 +1281,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 		size = MAX_TCP_OPTION_SPACE - remaining;
 	}
 
+	if (cgroup_bpf_enabled(CGROUP_TCP_SOCK_OPS)) {
+		unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+		remaining = bpf_tcp_ops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts,
+						    remaining);
+
+		size = MAX_TCP_OPTION_SPACE - remaining;
+	}
+
 	return size;
 }
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2b84c69eb814..45b9ee29e461 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4799,15 +4799,18 @@ union bpf_attr {
  * 		The non-negative copied *buf* length equal to or less than
  * 		*size* on success, or a negative error in case of failure.
  *
- * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
+ * long bpf_load_hdr_opt(void *ctx, void *searchby_res, u32 len, u64 flags)
  *	Description
  *		Load header option.  Support reading a particular TCP header
- *		option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**).
+ *		option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**).  For the
+ *		**bpf_tcp_ops** struct_ops, this helper can be called from the
+ *		**parse_hdr**\ () and **write_hdr_opt**\ () operators.
  *
- *		If *flags* is 0, it will search the option from the
- *		*skops*\ **->skb_data**.  The comment in **struct bpf_sock_ops**
- *		has details on what skb_data contains under different
- *		*skops*\ **->op**.
+ *		If *flags* is 0, it will search the option from the packet
+ *		associated with the current operation.  For
+ *		**BPF_PROG_TYPE_SOCK_OPS**, the comment in
+ *		**struct bpf_sock_ops** has details on what skb_data
+ *		contains under different *op*.
  *
  *		The first byte of the *searchby_res* specifies the
  *		kind that it wants to search.
@@ -4840,6 +4843,8 @@ union bpf_attr {
  *
  *		* **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the
  *		  saved_syn packet or the just-received syn packet.
+ *		  Not supported by the **bpf_tcp_ops** struct_ops, which
+ *		  rejects all flags.
  *
  *	Return
  *		> 0 when found, the header option is copied to *searchby_res*.
@@ -4860,9 +4865,9 @@ union bpf_attr {
  *		packet.
  *
  *		**-EPERM** if the helper cannot be used under the current
- *		*skops*\ **->op**.
+ *		operation.
  *
- * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags)
+ * long bpf_store_hdr_opt(void *ctx, const void *from, u32 len, u64 flags)
  *	Description
  *		Store header option.  The data will be copied
  *		from buffer *from* with length *len* to the TCP header.
@@ -4878,7 +4883,9 @@ union bpf_attr {
  *		by searching the same option in the outgoing skb.
  *
  *		This helper can only be called during
- *		**BPF_SOCK_OPS_WRITE_HDR_OPT_CB**.
+ *		**BPF_SOCK_OPS_WRITE_HDR_OPT_CB**, or from the
+ *		**write_hdr_opt**\ () operator of the **bpf_tcp_ops**
+ *		struct_ops.
  *
  *	Return
  *		0 on success, or negative error in case of failure:
@@ -4893,9 +4900,9 @@ union bpf_attr {
  *		**-EFAULT** on failure to parse the existing header options.
  *
  *		**-EPERM** if the helper cannot be used under the current
- *		*skops*\ **->op**.
+ *		operation.
  *
- * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags)
+ * long bpf_reserve_hdr_opt(void *ctx, u32 len, u64 flags)
  *	Description
  *		Reserve *len* bytes for the bpf header option.  The
  *		space will be used by **bpf_store_hdr_opt**\ () later in
@@ -4905,7 +4912,9 @@ union bpf_attr {
  *		the total number of bytes will be reserved.
  *
  *		This helper can only be called during
- *		**BPF_SOCK_OPS_HDR_OPT_LEN_CB**.
+ *		**BPF_SOCK_OPS_HDR_OPT_LEN_CB**, or from the
+ *		**hdr_opt_len**\ () operator of the **bpf_tcp_ops**
+ *		struct_ops.
  *
  *	Return
  *		0 on success, or negative error in case of failure:
@@ -4915,7 +4924,7 @@ union bpf_attr {
  *		**-ENOSPC** if there is not enough space in the header.
  *
  *		**-EPERM** if the helper cannot be used under the current
- *		*skops*\ **->op**.
+ *		operation.
  *
  * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags)
  *	Description
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 11/15] bpf: tcp: Support selected sock_ops callbacks as struct_ops
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

At LSFMMBPF 2025, I talked about moving BPF_PROG_TYPE_SOCK_OPS
to a struct_ops interface [1].

The BPF_SOCK_OPS_*_CB enum interface has grown over time as new TCP
callback points were added. A BPF_PROG_TYPE_SOCK_OPS program now
commonly needs a large switch on sock_ops->op, and the shared
bpf_sock_ops_kern context has become harder to extend because different
callbacks have different locking, argument, skb, and helper
requirements. The existing 'union { u32 args[4]; u32 replylong[4]; }' is
also not reliable in passing args to bpf prog when there are multiple
progs attached to a cgroup.

The above has already been solved in struct_ops. Add a TCP-specific
struct_ops type, bpf_tcp_ops, and support attaching it to cgroups.
This allows each callback to have its own func signature and allows
the verifier to select kfuncs/helpers based on the specific
struct_ops member being implemented.

This patch wires up the following existing sock_ops callbacks:
- BPF_SOCK_OPS_TIMEOUT_INIT
- BPF_SOCK_OPS_RWND_INIT
- BPF_SOCK_OPS_RTT_CB
- BPF_SOCK_OPS_STATE_CB
- BPF_SOCK_OPS_RETRANS_CB
- BPF_SOCK_OPS_TCP_CONNECT_CB
- BPF_SOCK_OPS_TCP_LISTEN_CB
- BPF_SOCK_OPS_RTO_CB
- BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB
- BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB

BASE_RTT is ignored as it is not particularly useful. NEEDS_ECN should
be done in bpf-tcp-cc instead. The tstamp ones should be a separate
struct_ops (e.g. "bpf_sock_ops") that can work in both TCP and UDP.

timeout_init and rwnd_init could have a request_sock pointer. This patch
tries a different API and directly passes the request_sock pointer as
an arg.

Two other approaches were considered before settling on having
bpf_get_retval() read the dispatcher's run_ctx via saved_run_ctx. The
first was to inherit the retval in the trampoline itself: add a helper
in the four __bpf_prog_enter*() paths that, for struct_ops programs,
copies the chained value from the caller's run_ctx (now saved_run_ctx)
into the program's own run_ctx. It works but puts a per-enter
program-type check on the generic trampoline fast path, taxing all
fentry/fexit/lsm callers for a cgroup-struct_ops-only feature. The
second was to do that same inherit only for the int-returning members
via a gen_prologue that emits a hidden kfunc at the start of
timeout_init/rwnd_init; this keeps the cost off the generic path and
scoped to bpf_tcp_ops, but needs a kfunc + BTF_ID + prologue-emission
machinery. The chosen approach avoids both: it touches neither the
trampoline nor the program, since saved_run_ctx already points at the
dispatcher's run_ctx that carries the value.

[1], page 13: https://drive.google.com/file/d/1wjKZth6T0llLJ_ONPAL_6Q_jbxbAjByp/view?usp=sharing

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 include/linux/bpf.h    |   1 +
 include/net/tcp.h      | 113 +++++++++++++++++++++++++-
 net/ipv4/Makefile      |   1 +
 net/ipv4/af_inet.c     |   1 +
 net/ipv4/bpf_tcp_ops.c | 177 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp.c         |   1 +
 net/ipv4/tcp_input.c   |   4 +
 net/ipv4/tcp_output.c  |   2 +
 net/ipv4/tcp_timer.c   |   1 +
 9 files changed, 299 insertions(+), 2 deletions(-)
 create mode 100644 net/ipv4/bpf_tcp_ops.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index df95ae690da5..91024d2da4ea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2597,6 +2597,7 @@ struct bpf_trace_run_ctx {
 struct bpf_tramp_run_ctx {
 	struct bpf_run_ctx run_ctx;
 	u64 bpf_cookie;
+	int retval;
 	struct bpf_run_ctx *saved_run_ctx;
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6d376ea4d1c0..2102f9f2afd6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2953,12 +2953,120 @@ static inline void tcp_clear_sock_ops_cb_flags(struct sock *sk)
 
 #endif
 
+#if defined(CONFIG_BPF_JIT) && defined(CONFIG_CGROUP_BPF)
+
+struct bpf_tcp_ops {
+	/* Should return the initial SYN (active open) or SYN-ACK (passive open)
+	 * retransmission timeout. Return the timeout in jiffies, or <= 0 for
+	 * the kernel default.
+	 *
+	 * @req: request_sock on the passive (synack) path; NULL otherwise.
+	 */
+	int (*timeout_init)(struct sock *sk, struct request_sock *req);
+
+	/* Should return the initial advertised receive window, in packets,
+	 * or < 0 for the kernel default. @req as in timeout_init().
+	 */
+	int (*rwnd_init)(struct sock *sk, struct request_sock *req);
+
+	/* Called when an active connection becomes established.
+	 * @skb is the SYNACK that completed the 3WHS, or NULL for a
+	 * TCP_REPAIR socket (tcp_finish_connect() with no skb).
+	 */
+	void (*active_established)(struct sock *sk, struct sk_buff *skb__nullable);
+
+	/* Called when a passive connection becomes established.
+	 * @skb is the ACK that completed the 3WHS.
+	 */
+	void (*passive_established)(struct sock *sk, struct sk_buff *skb);
+
+	/* Called when the retransmission timer fires. */
+	void (*rto)(struct sock *sk);
+
+	/* Called on every RTT sample.
+	 * @mrtt: the measured RTT, in microseconds.
+	 * @srtt: the updated smoothed RTT.
+	 */
+	void (*rtt)(struct sock *sk, long mrtt, u32 srtt);
+
+	/* Called when the connection changes TCP state.
+	 * @state: the new state (one of the TCP_* states).
+	 */
+	void (*set_state)(struct sock *sk, int state);
+
+	/* Called when an skb is retransmitted.
+	 * @skb: the retransmitted skb.
+	 * @err: tcp_transmit_skb() return value (0 on success).
+	 */
+	void (*retrans)(struct sock *sk, struct sk_buff *skb, int err);
+
+	/* Called right before an active connection is initialized. */
+	void (*connect)(struct sock *sk);
+
+	/* Called on listen(2), right after the socket enters TCP_LISTEN. */
+	void (*listen)(struct sock *sk);
+};
+
+#define bpf_tcp_ops_call(op, sk, ...)					\
+do {									\
+	if (cgroup_bpf_enabled(CGROUP_TCP_SOCK_OPS)) {			\
+		const struct bpf_prog_array_item *item;			\
+		const struct bpf_tcp_ops *tcp_ops;			\
+		struct cgroup *cgrp;					\
+									\
+		cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);		\
+		rcu_read_lock_dont_migrate();				\
+		bpf_cgroup_struct_ops_foreach(tcp_ops, item, cgrp,	\
+					      CGROUP_TCP_SOCK_OPS) {	\
+			if (tcp_ops->op)				\
+				tcp_ops->op(sk, ##__VA_ARGS__);		\
+		}							\
+		rcu_read_unlock_migrate();				\
+	}								\
+} while (0)
+
+#define bpf_tcp_ops_call_int(op, init_retval, sk, ...)			\
+({									\
+	int __retval = (init_retval);					\
+	if (cgroup_bpf_enabled(CGROUP_TCP_SOCK_OPS)) {			\
+		const struct bpf_prog_array_item *item;			\
+		const struct bpf_tcp_ops *tcp_ops;			\
+		struct bpf_tramp_run_ctx run_ctx;			\
+		struct bpf_run_ctx *old_run_ctx;			\
+		struct sock *__sk = sk_to_full_sk(sk);                  \
+		struct request_sock *req = NULL;			\
+		struct cgroup *cgrp;					\
+									\
+		if (__sk) {						\
+			run_ctx.retval = (init_retval);			\
+			cgrp = sock_cgroup_ptr(&__sk->sk_cgrp_data);	\
+			if (!sk_fullsock(sk))				\
+				req = (struct request_sock *)sk;	\
+			rcu_read_lock_dont_migrate();			\
+			old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);\
+			bpf_cgroup_struct_ops_foreach(tcp_ops, item, cgrp, \
+						      CGROUP_TCP_SOCK_OPS) { \
+				if (tcp_ops->op)			\
+					run_ctx.retval = tcp_ops->op(__sk, req, ##__VA_ARGS__); \
+			}						\
+			bpf_reset_run_ctx(old_run_ctx);			\
+			rcu_read_unlock_migrate();			\
+			__retval = run_ctx.retval;			\
+		}							\
+	}								\
+	__retval;							\
+})
+#else
+#define bpf_tcp_ops_call(op, sk, ...)		do { } while (0)
+#define bpf_tcp_ops_call_int(op, init_retval, sk, ...)	(init_retval)
+#endif
+
 static inline u32 tcp_timeout_init(struct sock *sk)
 {
 	int timeout;
 
 	timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL);
-
+	timeout = bpf_tcp_ops_call_int(timeout_init, timeout, sk);
 	if (timeout <= 0)
 		timeout = TCP_TIMEOUT_INIT;
 	return min_t(int, timeout, TCP_RTO_MAX);
@@ -2969,7 +3077,7 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 	int rwnd;
 
 	rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL);
-
+	rwnd = bpf_tcp_ops_call_int(rwnd_init, rwnd, sk);
 	if (rwnd < 0)
 		rwnd = 0;
 	return rwnd;
@@ -2984,6 +3092,7 @@ static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt)
 {
 	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
 		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt);
+	bpf_tcp_ops_call(rtt, sk, mrtt, srtt);
 }
 
 #if IS_ENABLED(CONFIG_SMC)
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 06e21c26b76f..afbac63d1cb4 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_TCP_AO) += tcp_ao.o
 
 ifeq ($(CONFIG_BPF_JIT),y)
 obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
+obj-$(CONFIG_CGROUP_BPF) += bpf_tcp_ops.o
 endif
 
 ifdef CONFIG_GCOV_PROFILE_NETFILTER
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 32d006c1a8ee..ac8431da67f4 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,6 +227,7 @@ int __inet_listen_sk(struct sock *sk, int backlog)
 			return err;
 
 		tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
+		bpf_tcp_ops_call(listen, sk);
 	}
 	return 0;
 }
diff --git a/net/ipv4/bpf_tcp_ops.c b/net/ipv4/bpf_tcp_ops.c
new file mode 100644
index 000000000000..cf53c95a0dbc
--- /dev/null
+++ b/net/ipv4/bpf_tcp_ops.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_verifier.h>
+#include <net/bpf_sk_storage.h>
+#include <net/tcp.h>
+
+static int timeout_init_stub(struct sock *sk, struct request_sock *req__nullable)
+{
+	struct bpf_tramp_run_ctx *ctx =
+		container_of(current->bpf_ctx, struct bpf_tramp_run_ctx, run_ctx);
+
+	return ctx->retval;
+}
+
+static int rwnd_init_stub(struct sock *sk, struct request_sock *req__nullable)
+{
+	struct bpf_tramp_run_ctx *ctx =
+		container_of(current->bpf_ctx, struct bpf_tramp_run_ctx, run_ctx);
+
+	return ctx->retval;
+}
+
+static void active_established_stub(struct sock *sk, struct sk_buff *skb__nullable)
+{
+}
+
+static void passive_established_stub(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void rto_stub(struct sock *sk)
+{
+}
+
+static void rtt_stub(struct sock *sk, long mrtt, u32 srtt)
+{
+}
+
+static void set_state_stub(struct sock *sk, int state)
+{
+}
+
+static void retrans_stub(struct sock *sk, struct sk_buff *skb, int err)
+{
+}
+
+static void connect_stub(struct sock *sk)
+{
+}
+
+static void listen_stub(struct sock *sk)
+{
+}
+
+static struct bpf_tcp_ops __bpf_tcp_ops = {
+	.timeout_init = timeout_init_stub,
+	.rwnd_init = rwnd_init_stub,
+	.active_established = active_established_stub,
+	.passive_established = passive_established_stub,
+	.rto = rto_stub,
+	.rtt = rtt_stub,
+	.set_state = set_state_stub,
+	.retrans = retrans_stub,
+	.connect = connect_stub,
+	.listen = listen_stub,
+};
+
+BPF_CALL_0(bpf_tcp_ops_get_retval)
+{
+	struct bpf_tramp_run_ctx *ctx =
+		container_of(current->bpf_ctx, struct bpf_tramp_run_ctx, run_ctx);
+
+	/* bpf_get_retval() is only exposed to timeout_init/rwnd_init, which
+	 * always run via bpf_tcp_ops_call_int(). Its run_ctx carries the int
+	 * return value chained across the bpf_tcp_ops attached to the cgroup
+	 * and is this program's saved_run_ctx.
+	 */
+	if (WARN_ON_ONCE(!ctx->saved_run_ctx))
+		return 0;
+
+	return container_of(ctx->saved_run_ctx, struct bpf_tramp_run_ctx,
+			    run_ctx)->retval;
+}
+
+const struct bpf_func_proto bpf_tcp_ops_get_retval_proto = {
+	.func		= bpf_tcp_ops_get_retval,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+};
+
+static const struct bpf_func_proto *
+get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	u32 moff = prog->aux->attach_st_ops_member_off;
+
+	switch (func_id) {
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+	case BPF_FUNC_setsockopt:
+		/* The listener is not locked. */
+		if (moff == offsetof(struct bpf_tcp_ops, rwnd_init) ||
+		    moff == offsetof(struct bpf_tcp_ops, timeout_init))
+			return NULL;
+		return &bpf_sk_setsockopt_proto;
+	case BPF_FUNC_getsockopt:
+		if (moff == offsetof(struct bpf_tcp_ops, rwnd_init) ||
+		    moff == offsetof(struct bpf_tcp_ops, timeout_init))
+			return NULL;
+		return &bpf_sk_getsockopt_proto;
+	case BPF_FUNC_get_retval:
+		if (moff == offsetof(struct bpf_tcp_ops, timeout_init) ||
+		    moff == offsetof(struct bpf_tcp_ops, rwnd_init))
+			return &bpf_tcp_ops_get_retval_proto;
+		return NULL;
+	default:
+		return bpf_base_func_proto(func_id, prog);
+	}
+}
+
+static bool is_valid_access(int off, int size, enum bpf_access_type type,
+			    const struct bpf_prog *prog, struct bpf_insn_access_aux *info)
+{
+	if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info))
+		return false;
+
+	if (base_type(info->reg_type) == PTR_TO_BTF_ID &&
+	    !bpf_type_has_unsafe_modifiers(info->reg_type) &&
+	    info->btf_id == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+		/* promote it to tcp_sock */
+		info->btf_id = btf_sock_ids[BTF_SOCK_TYPE_TCP];
+
+	return true;
+}
+
+static int bpf_tcp_ops_init_member(const struct btf_type *t,
+				   const struct btf_member *member,
+				   void *kdata, const void *udata)
+{
+	return 0;
+}
+
+static int bpf_tcp_ops_init(struct btf *btf)
+{
+	return 0;
+}
+
+static int bpf_tcp_ops_validate(void *kdata)
+{
+	return 0;
+}
+
+static const struct bpf_verifier_ops bpf_tcp_ops_verifier = {
+	.get_func_proto		= get_func_proto,
+	.is_valid_access	= is_valid_access,
+};
+
+static struct bpf_struct_ops bpf_tcp_ops = {
+	.verifier_ops = &bpf_tcp_ops_verifier,
+	.init_member = bpf_tcp_ops_init_member,
+	.init = bpf_tcp_ops_init,
+	.validate = bpf_tcp_ops_validate,
+	.name = "bpf_tcp_ops",
+	.cgroup_atype = CGROUP_TCP_SOCK_OPS,
+	.cfi_stubs = &__bpf_tcp_ops,
+	.owner = THIS_MODULE,
+};
+
+static int __init __bpf_tcp_ops_init(void)
+{
+	return register_bpf_struct_ops(&bpf_tcp_ops, bpf_tcp_ops);
+}
+late_initcall(__bpf_tcp_ops_init);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 455441f1b694..94ed1ac2abc1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2996,6 +2996,7 @@ void tcp_set_state(struct sock *sk, int state)
 
 	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
 		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
+	bpf_tcp_ops_call(set_state, sk, state);
 
 	switch (state) {
 	case TCP_ESTABLISHED:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 61045a8886e4..12fb690d21c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6694,6 +6694,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
 	bpf_skops_established(sk, bpf_op, skb);
+	if (bpf_op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
+		bpf_tcp_ops_call(active_established, sk, skb);
+	else
+		bpf_tcp_ops_call(passive_established, sk, skb);
 	/* Initialize congestion control unless BPF initialized it already: */
 	if (!icsk->icsk_ca_initialized)
 		tcp_init_congestion_control(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 26dd751ec72a..93f4a95399ea 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3678,6 +3678,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
 		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
 				  TCP_SKB_CB(skb)->seq, segs, err);
+	bpf_tcp_ops_call(retrans, sk, skb, err);
 
 	if (unlikely(err) && err != -EBUSY)
 		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
@@ -4298,6 +4299,7 @@ int tcp_connect(struct sock *sk)
 	int err;
 
 	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
+	bpf_tcp_ops_call(connect, sk);
 
 #if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
 	/* Has to be checked late, after setting daddr/saddr/ops.
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index bf171b5e1eb3..4337627ee0ea 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -290,6 +290,7 @@ static int tcp_write_timeout(struct sock *sk)
 		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
 				  icsk->icsk_retransmits,
 				  icsk->icsk_rto, (int)expired);
+	bpf_tcp_ops_call(rto, sk);
 
 	if (expired) {
 		/* Has it gone just too far? */
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 10/15] bpf: Allow all struct_ops to use bpf_dynptr_from_skb()
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

bpf_dynptr_from_skb() was only made available to bpf_qdisc, so far the
only struct_ops type that needs to read an skb. The upcoming bpf_tcp_ops
header-option hooks (parse_hdr/write_hdr_opt) also want to access the TCP
options of an skb through a dynptr.

All struct_ops programs share BPF_PROG_TYPE_STRUCT_OPS, so register
bpf_kfunc_set_skb (which holds bpf_dynptr_from_skb) for that program type
once, instead of per struct_ops. This makes bpf_dynptr_from_skb()
available to bpf_tcp_ops and any future struct_ops.

With the kfunc now provided to all of struct_ops, the bpf_qdisc-specific
registration becomes redundant and is dropped: bpf_qdisc_kfunc_filter()
only constrains kfuncs listed in qdisc_kfunc_ids, so removing
bpf_dynptr_from_skb from that set (and from qdisc_common_kfunc_set) lets
it fall through the filter unchanged, and bpf_qdisc keeps access via the
generic struct_ops registration.

Widening the registration is safe: a struct_ops that does not receive an
skb in its context has nothing to pass to the helper.

Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 net/core/filter.c     | 1 +
 net/sched/bpf_qdisc.c | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 1dd5e37ae130..f85578772930 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -12633,6 +12633,7 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
index 098ca02aed89..5691c13781a8 100644
--- a/net/sched/bpf_qdisc.c
+++ b/net/sched/bpf_qdisc.c
@@ -280,7 +280,6 @@ BTF_KFUNCS_START(qdisc_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_skb_get_hash)
 BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
 BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule)
 BTF_ID_FLAGS(func, bpf_qdisc_init_prologue)
 BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue)
@@ -290,7 +289,6 @@ BTF_KFUNCS_END(qdisc_kfunc_ids)
 BTF_SET_START(qdisc_common_kfunc_set)
 BTF_ID(func, bpf_skb_get_hash)
 BTF_ID(func, bpf_kfree_skb)
-BTF_ID(func, bpf_dynptr_from_skb)
 BTF_SET_END(qdisc_common_kfunc_set)

 BTF_SET_START(qdisc_enqueue_kfunc_set)
-- 
2.53.0-Meta

^ permalink raw reply related

* [PATCH bpf-next v2 09/15] bpf: Add infrastructure to support attaching struct_ops to cgroups
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

This patch adds necessary infrastructure to attach a struct_ops
map to a cgroup. The initial need was to support migrating
the legacy BPF_PROG_TYPE_SOCK_OPS to a struct_ops.
Recently, there are other struct_ops use cases that
need to attach struct_ops to a cgroup. For example,
the recent BPF OOM and memcg discussion in LSFMMBPF 2026.

The motivation is to create a consistent expectation
for attaching struct_ops to cgroup instead of each subsystem
creating its own infrastructure. This logic includes
hierarchy expectation, ordering expectation,
attachment API, and rcu gp.

There is already an existing implementation for attaching
multiple bpf progs to a cgroup. There are also tools
built around it for querying. Attaching a struct_ops map
(which is a group of bpf programs) could also adhere to
a similar API and potentially reuse most of the existing
implementation.

A couple of ideas have been tried. One of them
is to use mprog.c. In terms of the amount of changes,
I eventually came to the same conclusion as in
commit 120933984460 ("bpf: Implement mprog API on top of existing cgroup progs").
I then shifted the focus to reusing the current
{update,compute,activate,purge}_effective_progs() which has
the main logic that implements the mprog API.

Since then, I tried to add a 'struct cgroup *cgroup' member
to the existing 'struct bpf_struct_ops_link' and link_create
will create a 'struct bpf_struct_ops_link' object to be stored
in the pl->link. This turns out to have more changes on
both cgroup.c and bpf_struct_ops.c than I like.

This patch directly reuses the 'struct bpf_cgroup_link' which
cgroup.c already understands. Add 'struct bpf_map *map'
to 'struct bpf_cgroup_link'. In the future, as more subsystems
are extended by struct_ops, we may consider making
'struct bpf_map *map' a first-class citizen of a link
like 'struct bpf_prog *prog' and directly add
'struct bpf_map *map' to the generic 'struct bpf_link'.

The pl->link could be the traditional 'prog' link or the
new 'map' link. The places that need to handle them differently
have already been refactored into the new prog_list_*() added in
the earlier patch. In those new prog_list_*(), this patch will
check "pl->link && pl->link->map", learn that it is a 'map' link
and handle it correctly.

The bpf_prog_array also needs to handle that its item can store
the traditional 'prog' or it can store a struct_ops map.
The places that need to handle them differently have also
been refactored into the new bpf_cgroup_array_*() added
in the earlier patch. The two differences are:
  - different sentinel (dummy_bpf_prog in prog vs cfi_stub in struct_ops)
  - the array for struct_ops may need to go through different
    rcu gp.
The bpf_cgroup_array_*() functions use the cgroup_bpf_attach_type (ie atype)
to distinguish the array is storing prog or storing struct_ops map.

This patch also implements a separate struct bpf_link_ops
"cgroup_struct_ops_link_ops" to have a separate link_ops implementation
that only handles the cgroup's struct_ops link.

Questions:
- Although this patch did not change it, it is not obvious to me how
  the replace_effective_progs() and purge_effective_progs() handle
  cases when there are existing BPF_F_PREORDER progs attached
  in the hlist.

Misc notes:
- CGROUP_TCP_SOCK_OPS is added to the 'enum cgroup_bpf_attach_type'.
  The actual implementation of the bpf_tcp_ops (a struct_ops)
  will be added in the next patch.

- free_after_mult_rcu_gp is added to 'struct bpf_struct_ops' such that
  the bpf_prog_array can have a mix of sleepable and
  non-sleepable progs in a struct_ops. This flag tells
  how the bpf_prog_array should be freed.

- For a struct_ops that supports cgroup attachment, it does not need to
  implement its own reg/unreg function. reg/unreg to a cgroup is
  done by the common infrastructure added in this patch.

- The cgroup's struct_ops link only supports BPF_F_ALLOW_MULTI.
  This is enforced internally in cgroup_bpf_struct_ops_attach.
  This should be consistent with the current prog's link
  behavior in cgroup_bpf_link_attach.

  In the future, we may allow each subsystem to choose differently.

- A cgroup_atype member is added to 'struct bpf_struct_ops'.
  When a subsystem struct_ops needs to support cgroup attachment,
  it needs to add a value to 'enum cgroup_bpf_attach_type'
  and then assign it to the newly added cgroup_atype member
  in the bpf_struct_ops.

- During LINK_CREATE in syscall, the patch uses the same
  BPF_STRUCT_OPS (in attr->link_create.attach_type).
  The bpf_struct_ops_link_create learns the map and
  from the map it learns the st_ops. If the st_ops->cgroup_atype
  is not 0, it will create a cgroup's link.

- When a subsystem registers a struct_ops that supports cgroup
  attachment, the struct_ops infrastructure will also ask the
  cgroup infrastructure to remember a few things. This is done
  by calling cgroup_bpf_struct_ops_register().

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 include/linux/bpf-cgroup-defs.h |   1 +
 include/linux/bpf-cgroup.h      |  28 +++
 include/linux/bpf.h             |  19 +-
 include/uapi/linux/bpf.h        |   4 +-
 kernel/bpf/bpf_struct_ops.c     |  29 +++
 kernel/bpf/btf.c                |  23 +-
 kernel/bpf/cgroup.c             | 375 ++++++++++++++++++++++++++++++--
 kernel/bpf/syscall.c            |   1 +
 tools/include/uapi/linux/bpf.h  |   4 +-
 9 files changed, 463 insertions(+), 21 deletions(-)

diff --git a/include/linux/bpf-cgroup-defs.h b/include/linux/bpf-cgroup-defs.h
index c9e6b26abab6..0147b8bec973 100644
--- a/include/linux/bpf-cgroup-defs.h
+++ b/include/linux/bpf-cgroup-defs.h
@@ -47,6 +47,7 @@ enum cgroup_bpf_attach_type {
 	CGROUP_INET6_GETSOCKNAME,
 	CGROUP_UNIX_GETSOCKNAME,
 	CGROUP_INET_SOCK_RELEASE,
+	CGROUP_TCP_SOCK_OPS,
 	CGROUP_LSM_START,
 	CGROUP_LSM_END = CGROUP_LSM_START + CGROUP_LSM_NUM - 1,
 	MAX_CGROUP_BPF_ATTACH_TYPE
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 4d0cc65976a1..8a75a6cd7309 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -100,6 +100,8 @@ struct bpf_cgroup_storage {
 struct bpf_cgroup_link {
 	struct bpf_link link;
 	struct cgroup *cgroup;
+	struct bpf_map *map;
+	wait_queue_head_t wait_hup;
 };
 
 struct bpf_prog_list {
@@ -110,6 +112,18 @@ struct bpf_prog_list {
 	u32 flags;
 };
 
+#define bpf_cgroup_struct_ops_foreach(var, item, cgrp, atype)		\
+	for (item = rcu_dereference((cgrp)->bpf.effective[atype])->items;\
+	     ((var) = READ_ONCE(item->kdata));				\
+	     item++)
+
+static inline bool cgroup_bpf_is_struct_ops_atype(enum cgroup_bpf_attach_type atype)
+{
+	return atype == CGROUP_TCP_SOCK_OPS;
+}
+void cgroup_bpf_struct_ops_register(int atype, u32 type_id, void *cfi_stubs, bool mult_trace);
+int cgroup_bpf_struct_ops_attach(struct bpf_map *map, const union bpf_attr *attr);
+
 void __init cgroup_bpf_lifetime_notifier_init(void);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
@@ -479,6 +493,20 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 	return 0;
 }
 
+static inline bool cgroup_bpf_is_struct_ops_atype(int atype)
+{
+	return false;
+}
+static inline void cgroup_bpf_struct_ops_register(int atype, u32 type_id, void *cfi_stubs,
+						  bool mult_trace)
+{
+}
+static inline int cgroup_bpf_struct_ops_attach(struct bpf_map *map,
+					       const union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
 #define cgroup_bpf_enabled(atype) (0)
 #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e371a4733135..df95ae690da5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2100,11 +2100,18 @@ struct btf_member;
  *	   unloaded while in use.
  * @name: The name of the struct bpf_struct_ops object.
  * @func_models: Func models
+ * @cgroup_atype: A value in enum cgroup_bpf_attach_type for cgroup attachment.
+ *		  0 means the struct_ops type does not support cgroup attachment.
+ *		  If cgroup_atype is non-zero, the @reg and @unreg must be NULL
+ *		  because the attachment/detachment will be handled by the bpf core.
  * @free_after_tasks_rcu_gp: Set to true if it needs the bpf core to wait for
  *                           a tasks_rcu gp before freeing the struct_ops map
  *                           and its progs. It is unnecessary if the @unreg
  *                           has waited for the correct rcu gp or the @unreg
  *                           has ensured all struct_ops prog has finished running.
+ * @free_after_mult_rcu_gp: Same as @free_after_tasks_rcu_gp but waiting for
+ *                          both tasks_trace_rcu and regular rcu grace period.
+ *                          It is usually needed if the struct_ops has sleepable prog.
  */
 struct bpf_struct_ops {
 	const struct bpf_verifier_ops *verifier_ops;
@@ -2123,7 +2130,9 @@ struct bpf_struct_ops {
 	struct module *owner;
 	const char *name;
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+	int cgroup_atype;
 	bool free_after_tasks_rcu_gp;
+	bool free_after_mult_rcu_gp;
 };
 
 /* Every member of a struct_ops type has an instance even a member is not
@@ -2258,6 +2267,7 @@ void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map);
 bool bpf_struct_ops_valid_to_reg(struct bpf_map *map);
 int bpf_struct_ops_link_update_check(struct bpf_map *new_map, struct bpf_map *old_map,
 				     struct bpf_map *expected_old_map);
+int bpf_struct_ops_map_cgroup_atype(struct bpf_map *map);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
@@ -2330,6 +2340,10 @@ static inline u32 bpf_struct_ops_kdata_map_id(void *kdata)
 {
 	return 0;
 }
+static inline int bpf_struct_ops_map_cgroup_atype(struct bpf_map *map)
+{
+	return 0;
+}
 static inline void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map)
 {
 	return NULL;
@@ -2519,7 +2533,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
  * since other cpus are walking the array of pointers in parallel.
  */
 struct bpf_prog_array_item {
-	struct bpf_prog *prog;
+	union {
+		struct bpf_prog *prog;
+		void *kdata;
+	};
 	union {
 		struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
 		u64 bpf_cookie;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 89b36de5fdbb..2b84c69eb814 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1756,7 +1756,7 @@ union bpf_attr {
 			__u32	prog_cnt;
 			__u32	count;
 		};
-		__u32		:32;
+		__u32		type_id;
 		/* output: per-program attach_flags.
 		 * not allowed to be set during effective query.
 		 */
@@ -6815,6 +6815,8 @@ struct bpf_link_info {
 		} xdp;
 		struct {
 			__u32 map_id;
+			__u32 :32;
+			__u64 cgroup_id;
 		} struct_ops;
 		struct {
 			__u32 pf;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 1ca44584ed17..85cfc7e88d3f 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -13,6 +13,7 @@
 #include <linux/btf_ids.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/poll.h>
+#include <linux/bpf-cgroup.h>
 
 struct bpf_struct_ops_value {
 	struct bpf_struct_ops_common_value common;
@@ -1076,6 +1077,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 		goto errout;
 	}
 
+	if (st_ops_desc->st_ops->cgroup_atype && !(attr->map_flags & BPF_F_LINK)) {
+		ret = -EOPNOTSUPP;
+		goto errout;
+	}
+
 	vt = st_ops_desc->value_type;
 	if (attr->value_size != vt->size) {
 		ret = -EINVAL;
@@ -1116,6 +1122,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 
 	mutex_init(&st_map->lock);
 	bpf_map_init_from_attr(map, attr);
+	map->free_after_mult_rcu_gp = st_ops_desc->st_ops->free_after_mult_rcu_gp;
 	map->free_after_rcu_gp = true;
 
 	return map;
@@ -1254,6 +1261,14 @@ u32 bpf_struct_ops_kdata_map_id(void *kdata)
 	return st_map->map.id;
 }
 
+int bpf_struct_ops_map_cgroup_atype(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map;
+
+	st_map = container_of(map, struct bpf_struct_ops_map, map);
+	return st_map->st_ops_desc->st_ops->cgroup_atype;
+}
+
 void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map;
@@ -1429,6 +1444,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 	struct bpf_link_primer link_primer;
 	struct bpf_struct_ops_map *st_map;
 	struct bpf_map *map;
+	int cgroup_atype;
 	int err;
 
 	map = bpf_map_get(attr->link_create.map_fd);
@@ -1442,6 +1458,19 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 		goto err_out;
 	}
 
+	cgroup_atype = st_map->st_ops_desc->st_ops->cgroup_atype;
+	if (cgroup_atype) {
+		err = cgroup_bpf_struct_ops_attach(map, attr);
+		bpf_map_put(map);
+		return err;
+	}
+
+	if (memchr_inv(&attr->link_create.cgroup, 0, sizeof(attr->link_create.cgroup)) ||
+	    attr->link_create.target_fd) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
 	link = kzalloc_obj(*link, GFP_USER);
 	if (!link) {
 		err = -ENOMEM;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 64572f85edc8..d591f306ace5 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -20,6 +20,7 @@
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
 #include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
 #include <linux/bpf_lsm.h>
 #include <linux/skmsg.h>
 #include <linux/perf_event.h>
@@ -9836,6 +9837,7 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
 		   struct bpf_verifier_log *log)
 {
 	struct btf_struct_ops_tab *tab, *new_tab;
+	int cgroup_atype;
 	int i, err;
 
 	tab = btf->struct_ops_tab;
@@ -9847,8 +9849,10 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
 		btf->struct_ops_tab = tab;
 	}
 
+	cgroup_atype = st_ops->cgroup_atype;
 	for (i = 0; i < tab->cnt; i++)
-		if (tab->ops[i].st_ops == st_ops)
+		if (tab->ops[i].st_ops == st_ops ||
+		    (cgroup_atype && cgroup_atype == tab->ops[i].st_ops->cgroup_atype))
 			return -EEXIST;
 
 	if (tab->cnt == tab->capacity) {
@@ -9868,6 +9872,23 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
 	if (err)
 		return err;
 
+	if (cgroup_atype) {
+		if (!cgroup_bpf_is_struct_ops_atype(cgroup_atype) ||
+		    st_ops->reg || st_ops->unreg || st_ops->free_after_tasks_rcu_gp) {
+			bpf_struct_ops_desc_release(&tab->ops[btf->struct_ops_tab->cnt]);
+			return -EINVAL;
+		}
+
+		/* There is no need to unregister from cgroup when the
+		 * btf_free(). No struct_ops map and its cgroup link
+		 * can be created once its btf is gone.
+		 */
+		cgroup_bpf_struct_ops_register(cgroup_atype,
+					       tab->ops[btf->struct_ops_tab->cnt].type_id,
+					       st_ops->cfi_stubs,
+					       st_ops->free_after_mult_rcu_gp);
+	}
+
 	btf->struct_ops_tab->cnt++;
 
 	return 0;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 081d81de1816..a808d2a31f11 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -24,6 +24,29 @@
 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
+static u32 struct_ops_type_id[MAX_CGROUP_BPF_ATTACH_TYPE];
+static void *struct_ops_cfi_stubs[MAX_CGROUP_BPF_ATTACH_TYPE];
+static bool struct_ops_mult_rcu[MAX_CGROUP_BPF_ATTACH_TYPE];
+
+void cgroup_bpf_struct_ops_register(int atype, u32 type_id, void *cfi_stubs, bool mult_rcu)
+{
+	struct_ops_type_id[atype] = type_id;
+	struct_ops_cfi_stubs[atype] = cfi_stubs;
+	struct_ops_mult_rcu[atype] = mult_rcu;
+}
+
+static enum cgroup_bpf_attach_type find_atype_by_struct_ops_id(u32 type_id)
+{
+	enum cgroup_bpf_attach_type atype;
+
+	for (atype = 0; atype < MAX_CGROUP_BPF_ATTACH_TYPE; atype++) {
+		if (cgroup_bpf_is_struct_ops_atype(atype) &&
+		    struct_ops_type_id[atype] == type_id)
+			return atype;
+	}
+	return CGROUP_BPF_ATTACH_TYPE_INVALID;
+}
+
 /*
  * cgroup bpf destruction makes heavy use of work items and there can be a lot
  * of concurrent destructions.  Use a separate workqueue so that cgroup bpf
@@ -306,6 +329,19 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
 		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
 }
 
+static void cgroup_struct_ops_link_detach_wake(struct bpf_cgroup_link *link, bool wake_poll)
+{
+	cgroup_put(link->cgroup);
+	link->cgroup = NULL;
+
+	bpf_map_put(link->map);
+	/* READ_ONCE in cgroup_struct_ops_link_poll */
+	WRITE_ONCE(link->map, NULL);
+
+	if (wake_poll)
+		wake_up_interruptible_poll(&link->wait_hup, EPOLLHUP);
+}
+
 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
  * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
  * doesn't free link memory, which will eventually be done by bpf_link's
@@ -313,21 +349,37 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
  */
 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
 {
-	if (link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
-		bpf_trampoline_unlink_cgroup_shim(link->link.prog);
-	cgroup_put(link->cgroup);
-	link->cgroup = NULL;
+	if (link->map) {
+		cgroup_struct_ops_link_detach_wake(link, true);
+	} else {
+		if (link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+			bpf_trampoline_unlink_cgroup_shim(link->link.prog);
+		cgroup_put(link->cgroup);
+		link->cgroup = NULL;
+	}
+}
+
+static void bpf_cgroup_array_free_rcu(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct bpf_prog_array, rcu));
 }
 
-static void bpf_cgroup_array_free(struct bpf_prog_array *array)
+static void bpf_cgroup_array_free(struct bpf_prog_array *array,
+				  enum cgroup_bpf_attach_type atype)
 {
 	if (!array || array == &bpf_empty_prog_array)
 		return;
-	kfree_rcu(array, rcu);
+	if (struct_ops_mult_rcu[atype])
+		/* RCU tasks trace grace period implies RCU grace period. */
+		call_rcu_tasks_trace(&array->rcu, bpf_cgroup_array_free_rcu);
+	else
+		kfree_rcu(array, rcu);
 }
 
 static void *bpf_cgroup_array_dummy(enum cgroup_bpf_attach_type atype)
 {
+	if (cgroup_bpf_is_struct_ops_atype(atype))
+		return struct_ops_cfi_stubs[atype];
 	return bpf_prog_dummy();
 }
 
@@ -355,7 +407,12 @@ static int bpf_cgroup_array_copy_to_user(struct bpf_prog_array *array,
 	for (item = array->items; item->prog && i < cnt; item++) {
 		if (item->prog == bpf_cgroup_array_dummy(atype))
 			continue;
-		id = item->prog->aux->id;
+
+		if (cgroup_bpf_is_struct_ops_atype(atype))
+			id = bpf_struct_ops_kdata_map_id(item->kdata);
+		else
+			id = item->prog->aux->id;
+
 		if (copy_to_user(prog_ids + i, &id, sizeof(id)))
 			return -EFAULT;
 		i++;
@@ -417,7 +474,7 @@ static void cgroup_bpf_release(struct work_struct *work)
 		old_array = rcu_dereference_protected(
 				cgrp->bpf.effective[atype],
 				lockdep_is_held(&cgroup_mutex));
-		bpf_cgroup_array_free(old_array);
+		bpf_cgroup_array_free(old_array, atype);
 	}
 
 	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
@@ -461,17 +518,26 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
 
 static void prog_list_init_item(struct bpf_prog_list *pl, struct bpf_prog_array_item *item)
 {
-	item->prog = prog_list_prog(pl);
-	bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage);
+	if (pl->link && pl->link->map) {
+		item->kdata = bpf_struct_ops_map_kdata(pl->link->map);
+	} else {
+		item->prog = prog_list_prog(pl);
+		bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage);
+	}
 }
 
 static void prog_list_replace_item(struct bpf_prog_list *pl, struct bpf_prog_array_item *item)
 {
-	WRITE_ONCE(item->prog, pl->link->link.prog);
+	if (pl->link && pl->link->map)
+		WRITE_ONCE(item->kdata, bpf_struct_ops_map_kdata(pl->link->map));
+	else
+		WRITE_ONCE(item->prog, pl->link->link.prog);
 }
 
 static u32 prog_list_id(struct bpf_prog_list *pl)
 {
+	if (pl->link && pl->link->map)
+		return pl->link->map->id;
 	return prog_list_prog(pl)->aux->id;
 }
 
@@ -591,7 +657,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
 	/* free prog array after grace period, since __cgroup_bpf_run_*()
 	 * might be still walking the array
 	 */
-	bpf_cgroup_array_free(old_array);
+	bpf_cgroup_array_free(old_array, atype);
 }
 
 /**
@@ -631,7 +697,7 @@ static int cgroup_bpf_inherit(struct cgroup *cgrp)
 	return 0;
 cleanup:
 	for (i = 0; i < NR; i++)
-		bpf_cgroup_array_free(arrays[i]);
+		bpf_cgroup_array_free(arrays[i], i);
 
 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
 		cgroup_bpf_put(p);
@@ -686,7 +752,7 @@ static int update_effective_progs(struct cgroup *cgrp,
 
 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
 			if (unlikely(desc->bpf.inactive)) {
-				bpf_cgroup_array_free(desc->bpf.inactive);
+				bpf_cgroup_array_free(desc->bpf.inactive, atype);
 				desc->bpf.inactive = NULL;
 			}
 			continue;
@@ -705,7 +771,7 @@ static int update_effective_progs(struct cgroup *cgrp,
 	css_for_each_descendant_pre(css, &cgrp->self) {
 		struct cgroup *desc = container_of(css, struct cgroup, self);
 
-		bpf_cgroup_array_free(desc->bpf.inactive);
+		bpf_cgroup_array_free(desc->bpf.inactive, atype);
 		desc->bpf.inactive = NULL;
 	}
 
@@ -940,7 +1006,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 	if (pl) {
 		old_prog = pl->prog;
 	} else {
-		pl = kmalloc_obj(*pl);
+		pl = kzalloc_obj(*pl);
 		if (!pl) {
 			bpf_cgroup_storages_free(new_storage);
 			return -ENOMEM;
@@ -1337,7 +1403,17 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	if (effective_query && prog_attach_flags)
 		return -EINVAL;
 
-	if (type == BPF_LSM_CGROUP) {
+	if (type == BPF_STRUCT_OPS) {
+		u32 type_id = attr->query.type_id;
+
+		atype = find_atype_by_struct_ops_id(type_id);
+		if (atype == CGROUP_BPF_ATTACH_TYPE_INVALID)
+			return -ENOENT;
+		from_atype = to_atype = atype;
+		flags = 0;
+		if (!cgroup_bpf_enabled(atype))
+			goto skip_count;
+	} else if (type == BPF_LSM_CGROUP) {
 		if (!effective_query && attr->query.prog_cnt &&
 		    prog_ids && !prog_attach_flags)
 			return -EINVAL;
@@ -1363,6 +1439,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		}
 	}
 
+skip_count:
 	/* always output uattr->query.attach_flags as 0 during effective query */
 	flags = effective_query ? 0 : flags;
 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
@@ -2820,6 +2897,270 @@ const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
 const struct bpf_prog_ops cg_sockopt_prog_ops = {
 };
 
+static int __cgroup_struct_ops_link_detach(struct bpf_link *link, bool wake_poll)
+{
+	struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link);
+	enum cgroup_bpf_attach_type atype;
+	struct bpf_prog_list *pl;
+	struct bpf_map *map;
+	struct cgroup *cgrp;
+
+	cgroup_lock();
+
+	cgrp = cg_link->cgroup;
+	if (!cgrp) {
+		cgroup_unlock();
+		return 0;
+	}
+
+	map = cg_link->map;
+	atype = bpf_struct_ops_map_cgroup_atype(map);
+
+	hlist_for_each_entry(pl, &cgrp->bpf.progs[atype], node) {
+		if (pl->link == cg_link)
+			break;
+	}
+
+	/* mark deleted so compute_effective_progs() skips it */
+	pl->link = NULL;
+	if (update_effective_progs(cgrp, atype)) {
+		pl->link = cg_link;
+		purge_effective_progs(cgrp, pl, atype);
+	}
+
+	hlist_del(&pl->node);
+	cgroup_struct_ops_link_detach_wake(cg_link, wake_poll);
+	cgrp->bpf.revisions[atype]++;
+
+	kfree(pl);
+	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
+
+	cgroup_unlock();
+
+	return 0;
+}
+
+static int cgroup_struct_ops_link_detach(struct bpf_link *link)
+{
+	return __cgroup_struct_ops_link_detach(link, true);
+}
+
+static void cgroup_struct_ops_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+	__cgroup_struct_ops_link_detach(link, false);
+	kfree(cg_link);
+}
+
+static void cgroup_struct_ops_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq)
+{
+	struct bpf_cgroup_link *cg_link =
+		container_of(link, struct bpf_cgroup_link, link);
+
+	cgroup_lock();
+	if (!cg_link->cgroup) {
+		cgroup_unlock();
+		return;
+	}
+
+	seq_printf(seq, "map_id:\t%u\n", cg_link->map->id);
+	seq_printf(seq, "cgroup_id:\t%llu\n", cgroup_id(cg_link->cgroup));
+	cgroup_unlock();
+}
+
+static int cgroup_struct_ops_link_fill_link_info(const struct bpf_link *link,
+						 struct bpf_link_info *info)
+{
+	struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+	cgroup_lock();
+	if (!cg_link->cgroup) {
+		cgroup_unlock();
+		return 0;
+	}
+
+	info->struct_ops.map_id = cg_link->map->id;
+	info->struct_ops.cgroup_id = cgroup_id(cg_link->cgroup);
+	cgroup_unlock();
+	return 0;
+}
+
+static int cgroup_struct_ops_link_update(struct bpf_link *link, struct bpf_map *new_map,
+					 struct bpf_map *expected_old_map)
+{
+	struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link);
+	enum cgroup_bpf_attach_type atype;
+	struct bpf_prog_list *pl;
+	struct bpf_map *old_map;
+	struct cgroup *cgrp;
+	bool found = false;
+	int err;
+
+	if (!bpf_struct_ops_valid_to_reg(new_map))
+		return -EINVAL;
+
+	cgroup_lock();
+
+	cgrp = cg_link->cgroup;
+	if (!cgrp) {
+		err = -ENOLINK;
+		goto out;
+	}
+
+	old_map = cg_link->map;
+	err = bpf_struct_ops_link_update_check(new_map, old_map, expected_old_map);
+	if (err)
+		goto out;
+
+	atype = bpf_struct_ops_map_cgroup_atype(new_map);
+
+	hlist_for_each_entry(pl, &cgrp->bpf.progs[atype], node) {
+		if (pl->link == cg_link) {
+			found = true;
+			break;
+		}
+	}
+	if (!found) {
+		err = -ENOENT;
+		goto out;
+	}
+
+	bpf_map_inc(new_map);
+	WRITE_ONCE(cg_link->map, new_map);
+	replace_effective_prog(cgrp, atype, pl);
+	bpf_map_put(old_map);
+	cgrp->bpf.revisions[atype]++;
+
+out:
+	cgroup_unlock();
+	return err;
+}
+
+static __poll_t cgroup_struct_ops_link_poll(struct file *file, struct poll_table_struct *pts)
+{
+	struct bpf_cgroup_link *link = file->private_data;
+
+	poll_wait(file, &link->wait_hup, pts);
+
+	return READ_ONCE(link->map) ? 0 : EPOLLHUP;
+}
+
+static const struct bpf_link_ops cgroup_struct_ops_link_ops = {
+	.dealloc = cgroup_struct_ops_link_dealloc,
+	.detach = cgroup_struct_ops_link_detach,
+	.show_fdinfo = cgroup_struct_ops_link_show_fdinfo,
+	.fill_link_info = cgroup_struct_ops_link_fill_link_info,
+	.update_map = cgroup_struct_ops_link_update,
+	.poll = cgroup_struct_ops_link_poll,
+};
+
+int cgroup_bpf_struct_ops_attach(struct bpf_map *map, const union bpf_attr *attr)
+{
+	u32 flags = attr->link_create.flags;
+	u32 pl_flags = (flags & BPF_F_PREORDER) | BPF_F_ALLOW_MULTI;
+	enum cgroup_bpf_attach_type atype;
+	struct bpf_link_primer link_primer;
+	struct bpf_cgroup_link *link;
+	struct bpf_prog_list *pl = NULL;
+	struct hlist_head *progs;
+	struct cgroup *cgrp;
+	int err;
+
+	if (flags & ~BPF_F_LINK_ATTACH_MASK)
+		return -EINVAL;
+
+	/*
+	 * A struct_ops is attached to a cgroup through a link only, so any
+	 * relative position must refer to a link id or fd.
+	 */
+	if (attr->link_create.cgroup.relative_fd && !(flags & BPF_F_LINK))
+		return -EINVAL;
+
+	link = kzalloc_obj(*link, GFP_USER);
+	if (!link)
+		return -ENOMEM;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
+		      &cgroup_struct_ops_link_ops, NULL,
+		      attr->link_create.attach_type);
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err) {
+		kfree(link);
+		return err;
+	}
+
+	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
+	if (IS_ERR(cgrp)) {
+		err = PTR_ERR(cgrp);
+		goto cleanup;
+	}
+
+	bpf_map_inc(map);
+	link->map = map;
+	link->cgroup = cgrp;
+	init_waitqueue_head(&link->wait_hup);
+
+	atype = bpf_struct_ops_map_cgroup_atype(map);
+	progs = &cgrp->bpf.progs[atype];
+
+	cgroup_lock();
+
+	if (attr->link_create.cgroup.expected_revision &&
+	    attr->link_create.cgroup.expected_revision != cgrp->bpf.revisions[atype]) {
+		err = -ESTALE;
+		goto unlock;
+	}
+
+	if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) {
+		err = -E2BIG;
+		goto unlock;
+	}
+
+	pl = kzalloc_obj(*pl);
+	if (!pl) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	pl->link = link;
+	pl->flags = pl_flags;
+	cgrp->bpf.flags[atype] = BPF_F_ALLOW_MULTI;
+
+	err = insert_pl_to_hlist(pl, progs, NULL, link,
+				 flags | BPF_F_ALLOW_MULTI, attr->link_create.cgroup.relative_fd);
+	if (err)
+		goto unlock;
+
+	err = update_effective_progs(cgrp, atype);
+	if (err) {
+		hlist_del(&pl->node);
+		goto unlock;
+	}
+
+	cgrp->bpf.revisions[atype]++;
+	static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+
+	cgroup_unlock();
+
+	return bpf_link_settle(&link_primer);
+
+unlock:
+	cgroup_unlock();
+
+cleanup:
+	kfree(pl);
+	if (link->cgroup) {
+		cgroup_put(link->cgroup);
+		link->cgroup = NULL;
+		bpf_map_put(link->map);
+		link->map = NULL;
+	}
+	bpf_link_cleanup(&link_primer);
+	return err;
+}
+
 /* Common helpers for cgroup hooks. */
 const struct bpf_func_proto *
 cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b07acf37ad1d..b439c0b0eadd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4813,6 +4813,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
 	case BPF_LSM_CGROUP:
+	case BPF_STRUCT_OPS:
 		return cgroup_bpf_prog_query(attr, uattr, uattr_size);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 89b36de5fdbb..2b84c69eb814 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1756,7 +1756,7 @@ union bpf_attr {
 			__u32	prog_cnt;
 			__u32	count;
 		};
-		__u32		:32;
+		__u32		type_id;
 		/* output: per-program attach_flags.
 		 * not allowed to be set during effective query.
 		 */
@@ -6815,6 +6815,8 @@ struct bpf_link_info {
 		} xdp;
 		struct {
 			__u32 map_id;
+			__u32 :32;
+			__u64 cgroup_id;
 		} struct_ops;
 		struct {
 			__u32 pf;
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 08/15] bpf: Add a few bpf_cgroup_array_* helper functions
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

In the upcoming patch, the array can store a struct_ops map.
The array could have a cfi_stubs acting as a dummy instead of
the dummy_bpf_prog. The array logic will need to skip the cfi_stubs
also in order to support storing struct_ops map in the array.

bpf_cgroup_array_length(), bpf_cgroup_array_copy_to_user(), and
bpf_cgroup_array_delete_safe_at() are added as preparatory work
to allow skipping the cfi_stubs in the upcoming patch. This patch
only skips the dummy_bpf_prog, which matches the existing behavior.
The current bpf_prog_array_*() callers in cgroup.c are changed to call
the new bpf_cgroup_array_*() helpers. This is a no-op change.

Unlike bpf_prog_array_copy_to_user(), bpf_cgroup_array_copy_to_user()
does not need a temporary buffer. The cgroup caller already holds
cgroup_mutex and dereferences the effective array with
rcu_dereference_protected(), so it does not copy to userspace
from an RCU read-side critical section. Details in commit 0911287ce32b.

Another addition is bpf_cgroup_array_free(). It prepares the array
to be freed after a different RCU grace period for the struct_ops
use case; for example, a struct_ops could have a mix of sleepable
and non-sleepable ops. In this patch, bpf_cgroup_array_free() only
goes through the regular RCU grace period, so this is also a no-op
change.

bpf_prog_dummy() is also added to return the global dummy_bpf_prog.

bpf_cgroup_array_dummy() is added to decide the sentinel based on atype.
For now, it always returns bpf_prog_dummy(). In the upcoming patch,
it can return a cfi_stubs if the atype belongs to a struct_ops.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 include/linux/bpf.h |  1 +
 kernel/bpf/cgroup.c | 79 +++++++++++++++++++++++++++++++++++++++------
 kernel/bpf/core.c   |  5 +++
 3 files changed, 76 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 047ffc029666..e371a4733135 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2561,6 +2561,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
 			struct bpf_prog *include_prog,
 			u64 bpf_cookie,
 			struct bpf_prog_array **new_array);
+struct bpf_prog *bpf_prog_dummy(void);
 
 struct bpf_run_ctx {};
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 7abbe12e108f..081d81de1816 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -319,6 +319,67 @@ static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
 	link->cgroup = NULL;
 }
 
+static void bpf_cgroup_array_free(struct bpf_prog_array *array)
+{
+	if (!array || array == &bpf_empty_prog_array)
+		return;
+	kfree_rcu(array, rcu);
+}
+
+static void *bpf_cgroup_array_dummy(enum cgroup_bpf_attach_type atype)
+{
+	return bpf_prog_dummy();
+}
+
+static int bpf_cgroup_array_length(struct bpf_prog_array *array,
+				   enum cgroup_bpf_attach_type atype)
+{
+	struct bpf_prog_array_item *item;
+	int cnt = 0;
+
+	for (item = array->items; item->prog; item++)
+		if (item->prog != bpf_cgroup_array_dummy(atype))
+			cnt++;
+
+	return cnt;
+}
+
+static int bpf_cgroup_array_copy_to_user(struct bpf_prog_array *array,
+					 __u32 __user *prog_ids, int cnt,
+					 enum cgroup_bpf_attach_type atype)
+{
+	struct bpf_prog_array_item *item;
+	int i = 0;
+	u32 id;
+
+	for (item = array->items; item->prog && i < cnt; item++) {
+		if (item->prog == bpf_cgroup_array_dummy(atype))
+			continue;
+		id = item->prog->aux->id;
+		if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+			return -EFAULT;
+		i++;
+	}
+	return item->prog ? -ENOSPC : 0;
+}
+
+static int bpf_cgroup_array_delete_safe_at(struct bpf_prog_array *array,
+					   int index, enum cgroup_bpf_attach_type atype)
+{
+	struct bpf_prog_array_item *item;
+
+	for (item = array->items; item->prog; item++) {
+		if (item->prog == bpf_cgroup_array_dummy(atype))
+			continue;
+		if (!index) {
+			WRITE_ONCE(item->prog, bpf_cgroup_array_dummy(atype));
+			return 0;
+		}
+		index--;
+	}
+	return -ENOENT;
+}
+
 /**
  * cgroup_bpf_release() - put references of all bpf programs and
  *                        release all cgroup bpf data
@@ -356,7 +417,7 @@ static void cgroup_bpf_release(struct work_struct *work)
 		old_array = rcu_dereference_protected(
 				cgrp->bpf.effective[atype],
 				lockdep_is_held(&cgroup_mutex));
-		bpf_prog_array_free(old_array);
+		bpf_cgroup_array_free(old_array);
 	}
 
 	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
@@ -530,7 +591,7 @@ static void activate_effective_progs(struct cgroup *cgrp,
 	/* free prog array after grace period, since __cgroup_bpf_run_*()
 	 * might be still walking the array
 	 */
-	bpf_prog_array_free(old_array);
+	bpf_cgroup_array_free(old_array);
 }
 
 /**
@@ -570,7 +631,7 @@ static int cgroup_bpf_inherit(struct cgroup *cgrp)
 	return 0;
 cleanup:
 	for (i = 0; i < NR; i++)
-		bpf_prog_array_free(arrays[i]);
+		bpf_cgroup_array_free(arrays[i]);
 
 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
 		cgroup_bpf_put(p);
@@ -625,7 +686,7 @@ static int update_effective_progs(struct cgroup *cgrp,
 
 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
 			if (unlikely(desc->bpf.inactive)) {
-				bpf_prog_array_free(desc->bpf.inactive);
+				bpf_cgroup_array_free(desc->bpf.inactive);
 				desc->bpf.inactive = NULL;
 			}
 			continue;
@@ -644,7 +705,7 @@ static int update_effective_progs(struct cgroup *cgrp,
 	css_for_each_descendant_pre(css, &cgrp->self) {
 		struct cgroup *desc = container_of(css, struct cgroup, self);
 
-		bpf_prog_array_free(desc->bpf.inactive);
+		bpf_cgroup_array_free(desc->bpf.inactive);
 		desc->bpf.inactive = NULL;
 	}
 
@@ -1166,7 +1227,7 @@ static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog_list *pl,
 				lockdep_is_held(&cgroup_mutex));
 
 		/* Remove the program from the array */
-		WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+		WARN_ONCE(bpf_cgroup_array_delete_safe_at(progs, pos, atype),
 			  "Failed to purge a prog from array at index %d", pos);
 	}
 }
@@ -1296,7 +1357,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		if (effective_query) {
 			effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
 							      lockdep_is_held(&cgroup_mutex));
-			total_cnt += bpf_prog_array_length(effective);
+			total_cnt += bpf_cgroup_array_length(effective, atype);
 		} else {
 			total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
 		}
@@ -1326,8 +1387,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		if (effective_query) {
 			effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
 							      lockdep_is_held(&cgroup_mutex));
-			cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
-			ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+			cnt = min_t(int, bpf_cgroup_array_length(effective, atype), total_cnt);
+			ret = bpf_cgroup_array_copy_to_user(effective, prog_ids, cnt, atype);
 		} else {
 			struct hlist_head *progs;
 			struct bpf_prog_list *pl;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 649cce41e13f..1837bb7bb4e9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2740,6 +2740,11 @@ void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
 	call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
 }
 
+struct bpf_prog *bpf_prog_dummy(void)
+{
+	return &dummy_bpf_prog.prog;
+}
+
 int bpf_prog_array_length(struct bpf_prog_array *array)
 {
 	struct bpf_prog_array_item *item;
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 07/15] bpf: Move LSM trampoline unlink into bpf_cgroup_link_auto_detach()
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

Move the LSM trampoline unlink into bpf_cgroup_link_auto_detach().
The purpose is to consolidate the auto_detach cleanup logic.

It prepares for the upcoming struct_ops cgroup attachment patch where
bpf_cgroup_link_auto_detach() will need to handle the struct_ops case
(link->map != NULL).

This is a no-op change.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 kernel/bpf/cgroup.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b43f0bff184c..7abbe12e108f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -313,6 +313,8 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
  */
 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
 {
+	if (link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+		bpf_trampoline_unlink_cgroup_shim(link->link.prog);
 	cgroup_put(link->cgroup);
 	link->cgroup = NULL;
 }
@@ -346,11 +348,8 @@ static void cgroup_bpf_release(struct work_struct *work)
 					bpf_trampoline_unlink_cgroup_shim(pl->prog);
 				bpf_prog_put(pl->prog);
 			}
-			if (pl->link) {
-				if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
-					bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
+			if (pl->link)
 				bpf_cgroup_link_auto_detach(pl->link);
-			}
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key[atype]);
 		}
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 06/15] bpf: Add prog_list_init_item(), prog_list_replace_item(), and prog_list_id()
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

Add three helpers to abstract operations on a bpf_prog_list entry.

Right now, bpf_prog_array_item is initialized from prog_list_prog(pl),
which returns either pl->prog or pl->link->link.prog. This will not work
when struct_ops is attached to a cgroup because the attachment is backed
by a struct_ops map instead of a BPF prog.

The same applies to __cgroup_bpf_query(). Instead of always copying a
prog id to userspace, struct_ops cgroup attachment will need to copy the
struct_ops map id.

Refactor bpf_prog_array_item initialization into prog_list_init_item()
and prog_list_replace_item(), and refactor id lookup into prog_list_id().
These helpers will be extended to support pl->link->map in a later patch.

This is a no-op change.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 kernel/bpf/cgroup.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b100c04cb9c8..b43f0bff184c 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -399,6 +399,22 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
 	return NULL;
 }
 
+static void prog_list_init_item(struct bpf_prog_list *pl, struct bpf_prog_array_item *item)
+{
+	item->prog = prog_list_prog(pl);
+	bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage);
+}
+
+static void prog_list_replace_item(struct bpf_prog_list *pl, struct bpf_prog_array_item *item)
+{
+	WRITE_ONCE(item->prog, pl->link->link.prog);
+}
+
+static u32 prog_list_id(struct bpf_prog_list *pl)
+{
+	return prog_list_prog(pl)->aux->id;
+}
+
 /* count number of elements in the list.
  * it's slow but the list cannot be long
  */
@@ -492,9 +508,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
 				item = &progs->items[fstart];
 				fstart++;
 			}
-			item->prog = prog_list_prog(pl);
-			bpf_cgroup_storages_assign(item->cgroup_storage,
-						   pl->storage);
+			prog_list_init_item(pl, item);
 			cnt++;
 		}
 
@@ -1015,7 +1029,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
 				desc->bpf.effective[atype],
 				lockdep_is_held(&cgroup_mutex));
 		item = &progs->items[pos];
-		WRITE_ONCE(item->prog, pl->link->link.prog);
+		prog_list_replace_item(pl, item);
 	}
 }
 
@@ -1318,15 +1332,13 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		} else {
 			struct hlist_head *progs;
 			struct bpf_prog_list *pl;
-			struct bpf_prog *prog;
 			u32 id;
 
 			progs = &cgrp->bpf.progs[atype];
 			cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
 			i = 0;
 			hlist_for_each_entry(pl, progs, node) {
-				prog = prog_list_prog(pl);
-				id = prog->aux->id;
+				id = prog_list_id(pl);
 				if (copy_to_user(prog_ids + i, &id, sizeof(id)))
 					return -EFAULT;
 				if (++i == cnt)
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 05/15] bpf: Replace prog_list_prog() check with direct pl->prog and pl->link check
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

prog_list_length() and compute_effective_progs() use !prog_list_prog(pl)
to skip a 'detaching' pl.

When pl->link is not NULL, prog_list_prog(pl) returns
the pl->link->link.prog. This does not work for the upcoming struct_ops
patch where pl->link is not NULL but pl->link->link.prog is NULL,
because a struct_ops map is attached to the cgroup instead of a BPF prog.

To prepare for the upcoming struct_ops patch, this patch
replaces the "!prog_list_prog(pl)" test with
"!pl->prog && !pl->link". In __cgroup_bpf_detach(),
both pl->prog and pl->link are set to NULL, so testing
"!pl->prog && !pl->link" is an equivalent way to tell
whether a pl is being detached. This change should be a no-op.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 kernel/bpf/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index b64f6757096c..b100c04cb9c8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -408,7 +408,7 @@ static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
 	u32 cnt = 0;

 	hlist_for_each_entry(pl, head, node) {
-		if (!prog_list_prog(pl))
+		if (!pl->prog && !pl->link)
 			continue;
 		if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
 			(*preorder_cnt)++;
@@ -482,7 +482,7 @@ static int compute_effective_progs(struct cgroup *cgrp,

 		init_bstart = bstart;
 		hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
-			if (!prog_list_prog(pl))
+			if (!pl->prog && !pl->link)
 				continue;

 			if (pl->flags & BPF_F_PREORDER) {
-- 
2.53.0-Meta

^ permalink raw reply related

* [PATCH bpf-next v2 04/15] bpf: Remove unnecessary prog_list_prog() check
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

effective_prog_pos(), called from replace_effective_prog() and
purge_effective_progs(), tests "!prog_list_prog(pl)" to skip a
'detaching' pl.

When detaching a pl, pl->prog and pl->link are set to NULL in case
the update_effective_progs() failed.

However, replace_effective_prog() is not detaching a pl,
so the case "!prog_list_prog()" will not happen.

On the purge_effective_progs() path, pl->prog and pl->link are restored
by the caller before purge_effective_progs() is called, so the case
"!prog_list_prog()" will not happen either.

This patch removes the checks as prep work for the upcoming work
on attaching struct_ops to cgroup. When a struct_ops is attached
to a cgroup, the pl has a link->map instead of a prog, and
prog_list_prog() does not consider link->map. Without this change,
replace_effective_prog() and purge_effective_progs() would
incorrectly skip a pl with a struct_ops map attached to it.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 kernel/bpf/cgroup.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4355ccb78a9c..b64f6757096c 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -965,9 +965,10 @@ static int effective_prog_pos(struct cgroup *cgrp,

 		init_bstart = bstart;
 		hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
-			if (!prog_list_prog(pl))
-				continue;
-
+			/*
+			 * No detaching pl (NULL prog and link) is visible to the callers,
+			 * so skip the check compute_effective_progs() needs.
+			 */
 			if (pl->flags & BPF_F_PREORDER) {
 				if (pl == target_pl)
 					pos = bstart;
-- 
2.53.0-Meta

^ permalink raw reply related

* [PATCH bpf-next v2 03/15] bpf: Add bpf_struct_ops accessor helpers
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

Add the helper functions bpf_struct_ops_map_kdata(),
bpf_struct_ops_kdata_map_id(), and bpf_struct_ops_map_cfi_stubs()
in bpf_struct_ops.c. They will be called from cgroup.c in the upcoming
patch to create a struct_ops to cgroup attachment link.

bpf_struct_ops_valid_to_reg() is also exposed for the upcoming caller
in cgroup.c.

The link update validation is also refactored into a new function
bpf_struct_ops_link_update_check() such that it can be reused by the caller
in cgroup.c in the upcoming patch.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 include/linux/bpf.h         | 28 +++++++++++++++++
 kernel/bpf/bpf_struct_ops.c | 63 ++++++++++++++++++++++++++++---------
 2 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7ac8873839f4..047ffc029666 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2252,6 +2252,12 @@ u32 bpf_struct_ops_id(const void *kdata);
 int bpf_struct_ops_for_each_prog(const void *kdata,
 				 int (*cb)(struct bpf_prog *prog, void *data),
 				 void *data);
+void *bpf_struct_ops_map_kdata(struct bpf_map *map);
+u32 bpf_struct_ops_kdata_map_id(void *kdata);
+void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map);
+bool bpf_struct_ops_valid_to_reg(struct bpf_map *map);
+int bpf_struct_ops_link_update_check(struct bpf_map *new_map, struct bpf_map *old_map,
+				     struct bpf_map *expected_old_map);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
@@ -2316,6 +2322,28 @@ static inline void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struc
 static inline void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
 {
 }
+static inline void *bpf_struct_ops_map_kdata(struct bpf_map *map)
+{
+	return NULL;
+}
+static inline u32 bpf_struct_ops_kdata_map_id(void *kdata)
+{
+	return 0;
+}
+static inline void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map)
+{
+	return NULL;
+}
+static inline bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+{
+	return false;
+}
+static inline int bpf_struct_ops_link_update_check(struct bpf_map *new_map,
+						   struct bpf_map *old_map,
+						   struct bpf_map *expected_old_map)
+{
+	return -EOPNOTSUPP;
+}
 
 #endif
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index c422ce41873e..1ca44584ed17 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1236,7 +1236,33 @@ int bpf_struct_ops_for_each_prog(const void *kdata,
 }
 EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog);
 
-static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
+void *bpf_struct_ops_map_kdata(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map;
+
+	st_map = container_of(map, struct bpf_struct_ops_map, map);
+	return st_map->kvalue.data;
+}
+
+u32 bpf_struct_ops_kdata_map_id(void *kdata)
+{
+	struct bpf_struct_ops_value *kvalue =
+		container_of(kdata, struct bpf_struct_ops_value, data);
+	struct bpf_struct_ops_map *st_map =
+		container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+	return st_map->map.id;
+}
+
+void *bpf_struct_ops_map_cfi_stubs(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map;
+
+	st_map = container_of(map, struct bpf_struct_ops_map, map);
+	return st_map->st_ops_desc->st_ops->cfi_stubs;
+}
+
+bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
 
@@ -1289,6 +1315,26 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
 	return 0;
 }
 
+int bpf_struct_ops_link_update_check(struct bpf_map *new_map,
+				     struct bpf_map *old_map,
+				     struct bpf_map *expected_old_map)
+{
+	struct bpf_struct_ops_map *st_map, *old_st_map;
+
+	if (!old_map)
+		return -ENOLINK;
+	if (expected_old_map && old_map != expected_old_map)
+		return -EPERM;
+
+	st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+	/* The new and old struct_ops must be the same type. */
+	if (st_map->st_ops_desc != old_st_map->st_ops_desc)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
 					  struct bpf_map *expected_old_map)
 {
@@ -1307,23 +1353,12 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
 		return -EOPNOTSUPP;
 
 	mutex_lock(&update_mutex);
-
 	old_map = st_link->map;
-	if (!old_map) {
-		err = -ENOLINK;
-		goto err_out;
-	}
-	if (expected_old_map && old_map != expected_old_map) {
-		err = -EPERM;
+	err = bpf_struct_ops_link_update_check(new_map, old_map, expected_old_map);
+	if (err)
 		goto err_out;
-	}
 
 	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
-	/* The new and old struct_ops must be the same type. */
-	if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
-		err = -EINVAL;
-		goto err_out;
-	}
 
 	err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
 	if (err)
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 02/15] bpf: Make struct_ops tasks_rcu grace period optional
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

bpf_struct_ops_map_free() currently waits for both a regular RCU grace
period and a tasks RCU grace period for every struct_ops map through
synchronize_rcu_mult(call_rcu, call_rcu_tasks).

A regular RCU grace period is still required for all struct_ops maps
because the struct_ops trampoline ksyms require an RCU grace period
(see the list_del_rcu() in __bpf_ksym_del()).
Add a map_free_pre_rcu() callback so the struct_ops map can remove its
ksyms before bpf_map_put() waits for the regular RCU grace period.

The tasks RCU grace period is only needed by tcp_congestion_ops.
Add free_after_tasks_rcu_gp only to struct bpf_struct_ops instead
of the bpf_map.

When CONFIG_TASKS_RCU=n, synchronize_rcu_tasks() is the same as
synchronize_rcu(). Since all struct_ops maps now complete a regular RCU
grace period before bpf_struct_ops_map_free() runs, skip the extra
synchronize_rcu_tasks() call in this case.

This cleanup prepares for a later patch that needs to support
free_after_mult_rcu_gp.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 include/linux/bpf.h         |  7 +++++++
 kernel/bpf/bpf_struct_ops.c | 31 +++++++++++++------------------
 kernel/bpf/syscall.c        |  3 +++
 net/ipv4/bpf_tcp_ca.c       | 16 ++++++++++++++++
 4 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7719f6528445..7ac8873839f4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -90,6 +90,7 @@ struct bpf_map_ops {
 	struct bpf_map *(*map_alloc)(union bpf_attr *attr);
 	void (*map_release)(struct bpf_map *map, struct file *map_file);
 	void (*map_free)(struct bpf_map *map);
+	void (*map_free_pre_rcu)(struct bpf_map *map);
 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
 	void (*map_release_uref)(struct bpf_map *map);
 	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
@@ -2099,6 +2100,11 @@ struct btf_member;
  *	   unloaded while in use.
  * @name: The name of the struct bpf_struct_ops object.
  * @func_models: Func models
+ * @free_after_tasks_rcu_gp: Set to true if it needs the bpf core to wait for
+ *                           a tasks_rcu gp before freeing the struct_ops map
+ *                           and its progs. It is unnecessary if the @unreg
+ *                           has waited for the correct rcu gp or the @unreg
+ *                           has ensured all struct_ops progs have finished running.
  */
 struct bpf_struct_ops {
 	const struct bpf_verifier_ops *verifier_ops;
@@ -2117,6 +2123,7 @@ struct bpf_struct_ops {
 	struct module *owner;
 	const char *name;
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+	bool free_after_tasks_rcu_gp;
 };
 
 /* Every member of a struct_ops type has an instance even a member is not
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d06b3d9bcc13..c422ce41873e 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -984,9 +984,18 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
 	bpf_map_area_free(st_map);
 }
 
+static void bpf_struct_ops_map_free_pre_rcu(struct bpf_map *map)
+{
+	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+
+	bpf_struct_ops_map_del_ksyms(st_map);
+}
+
 static void bpf_struct_ops_map_free(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+	struct bpf_struct_ops *st_ops = st_map->st_ops_desc->st_ops;
+	bool tasks_rcu = st_ops->free_after_tasks_rcu_gp;
 
 	/* st_ops->owner was acquired during map_alloc to implicitly holds
 	 * the btf's refcnt. The acquire was only done when btf_is_module()
@@ -997,24 +1006,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
 
 	bpf_struct_ops_map_dissoc_progs(st_map);
 
-	bpf_struct_ops_map_del_ksyms(st_map);
-
-	/* The struct_ops's function may switch to another struct_ops.
-	 *
-	 * For example, bpf_tcp_cc_x->init() may switch to
-	 * another tcp_cc_y by calling
-	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
-	 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
-	 * and its refcount may reach 0 which then free its
-	 * trampoline image while tcp_cc_x is still running.
-	 *
-	 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
-	 * to finish. bpf-tcp-cc prog is non sleepable.
-	 * A rcu_tasks gp is to wait for the last few insn
-	 * in the tramopline image to finish before releasing
-	 * the trampoline image.
-	 */
-	synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+	if (tasks_rcu && IS_ENABLED(CONFIG_TASKS_RCU))
+		synchronize_rcu_tasks();
 
 	__bpf_struct_ops_map_free(map);
 }
@@ -1123,6 +1116,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 
 	mutex_init(&st_map->lock);
 	bpf_map_init_from_attr(map, attr);
+	map->free_after_rcu_gp = true;
 
 	return map;
 
@@ -1155,6 +1149,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
 	.map_alloc_check = bpf_struct_ops_map_alloc_check,
 	.map_alloc = bpf_struct_ops_map_alloc,
 	.map_free = bpf_struct_ops_map_free,
+	.map_free_pre_rcu = bpf_struct_ops_map_free_pre_rcu,
 	.map_get_next_key = bpf_struct_ops_map_get_next_key,
 	.map_lookup_elem = bpf_struct_ops_map_lookup_elem,
 	.map_delete_elem = bpf_struct_ops_map_delete_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 6db306d23b47..b07acf37ad1d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -956,6 +956,9 @@ void bpf_map_put(struct bpf_map *map)
 		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map);
 
+		if (map->ops->map_free_pre_rcu)
+			map->ops->map_free_pre_rcu(map);
+
 		WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
 		/* RCU tasks trace grace period implies RCU grace period. */
 		if (READ_ONCE(map->free_after_mult_rcu_gp))
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 791e15063237..e224ecafbd69 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -339,6 +339,22 @@ static struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.validate = bpf_tcp_ca_validate,
 	.name = "tcp_congestion_ops",
 	.cfi_stubs = &__bpf_ops_tcp_congestion_ops,
+	/* The struct_ops's function may switch to another struct_ops.
+	 *
+	 * For example, bpf_tcp_cc_x->init() may switch to
+	 * another tcp_cc_y by calling
+	 * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+	 * During the switch,  bpf_struct_ops_put(tcp_cc_x) is called
+	 * and its refcount may reach 0 which then free its
+	 * trampoline image while tcp_cc_x is still running.
+	 *
+	 * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+	 * to finish. bpf-tcp-cc prog is non sleepable.
+	 * A rcu_tasks gp is to wait for the last few insn
+	 * in the trampoline image to finish before releasing
+	 * the trampoline image.
+	 */
+	.free_after_tasks_rcu_gp = true,
 	.owner = THIS_MODULE,
 };
 
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH bpf-next v2 01/15] bpf: Remove __rcu tagging in st_link->map
From: Amery Hung @ 2026-06-23 17:49 UTC (permalink / raw)
  To: bpf
  Cc: netdev, alexei.starovoitov, andrii, daniel, eddyz87, memxor,
	martin.lau, shakeel.butt, roman.gushchin, kuniyu, kerneljasonxing,
	ameryhung, kernel-team
In-Reply-To: <20260623175006.3136053-1-ameryhung@gmail.com>

From: Martin KaFai Lau <martin.lau@kernel.org>

st_link->map is always written under update_mutex. The paths that read
st_link->map with rcu_read_lock() are not in the fast path, so they can
simply take update_mutex instead. Remove the __rcu annotation and replace
all RCU accessors with direct pointer reads under update_mutex. Use
READ_ONCE() in bpf_struct_ops_map_link_poll() which reads the pointer
without holding update_mutex.

It is a simplification change.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
 kernel/bpf/bpf_struct_ops.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 51b16e5f5534..d06b3d9bcc13 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -57,7 +57,7 @@ struct bpf_struct_ops_map {
 
 struct bpf_struct_ops_link {
 	struct bpf_link link;
-	struct bpf_map __rcu *map;
+	struct bpf_map *map;
 	wait_queue_head_t wait_hup;
 };
 
@@ -1257,8 +1257,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
 	struct bpf_struct_ops_map *st_map;
 
 	st_link = container_of(link, struct bpf_struct_ops_link, link);
-	st_map = (struct bpf_struct_ops_map *)
-		rcu_dereference_protected(st_link->map, true);
+	st_map = (struct bpf_struct_ops_map *)st_link->map;
 	if (st_map) {
 		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
 		bpf_map_put(&st_map->map);
@@ -1273,11 +1272,11 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
 	struct bpf_map *map;
 
 	st_link = container_of(link, struct bpf_struct_ops_link, link);
-	rcu_read_lock();
-	map = rcu_dereference(st_link->map);
+	mutex_lock(&update_mutex);
+	map = st_link->map;
 	if (map)
 		seq_printf(seq, "map_id:\t%d\n", map->id);
-	rcu_read_unlock();
+	mutex_unlock(&update_mutex);
 }
 
 static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
@@ -1287,11 +1286,11 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
 	struct bpf_map *map;
 
 	st_link = container_of(link, struct bpf_struct_ops_link, link);
-	rcu_read_lock();
-	map = rcu_dereference(st_link->map);
+	mutex_lock(&update_mutex);
+	map = st_link->map;
 	if (map)
 		info->struct_ops.map_id = map->id;
-	rcu_read_unlock();
+	mutex_unlock(&update_mutex);
 	return 0;
 }
 
@@ -1314,7 +1313,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
 
 	mutex_lock(&update_mutex);
 
-	old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+	old_map = st_link->map;
 	if (!old_map) {
 		err = -ENOLINK;
 		goto err_out;
@@ -1336,7 +1335,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
 		goto err_out;
 
 	bpf_map_inc(new_map);
-	rcu_assign_pointer(st_link->map, new_map);
+	WRITE_ONCE(st_link->map, new_map);
 	bpf_map_put(old_map);
 
 err_out:
@@ -1353,7 +1352,7 @@ static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
 
 	mutex_lock(&update_mutex);
 
-	map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+	map = st_link->map;
 	if (!map) {
 		mutex_unlock(&update_mutex);
 		return 0;
@@ -1362,7 +1361,7 @@ static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
 
 	st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
 
-	RCU_INIT_POINTER(st_link->map, NULL);
+	WRITE_ONCE(st_link->map, NULL);
 	/* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
 	 * bpf_map_inc() in bpf_struct_ops_map_link_update().
 	 */
@@ -1382,7 +1381,7 @@ static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
 
 	poll_wait(file, &st_link->wait_hup, pts);
 
-	return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
+	return READ_ONCE(st_link->map) ? 0 : EPOLLHUP;
 }
 
 static const struct bpf_link_ops bpf_struct_ops_map_lops = {
@@ -1438,7 +1437,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 		link = NULL;
 		goto err_out;
 	}
-	RCU_INIT_POINTER(link->map, map);
+	link->map = map;
 	mutex_unlock(&update_mutex);
 
 	return bpf_link_settle(&link_primer);
-- 
2.53.0-Meta


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox