* Re: [PATCH] bpf, sockmap: keep sk_msg copy state in sync
2026-05-16 16:43 [PATCH] bpf, sockmap: keep sk_msg copy state in sync Zhang Cen
@ 2026-05-18 19:30 ` John Fastabend
0 siblings, 0 replies; 2+ messages in thread
From: John Fastabend @ 2026-05-18 19:30 UTC (permalink / raw)
To: Zhang Cen
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Stanislav Fomichev, Jakub Sitnicki,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, bpf, netdev, linux-kernel, zerocling0077,
2045gemini, stable
On Sun, May 17, 2026 at 12:43:19AM +0800, Zhang Cen wrote:
>SK_MSG helpers use msg->sg.copy as provenance for scatterlist entries that
>still refer to external or shared pages and must not be exposed through
>data/data_end.
>
>bpf_msg_pull_data(), bpf_msg_push_data() and bpf_msg_pop_data() rewrite the
>scatterlist ring by compacting, splitting and shifting entries. Those
>updates move msg->sg.data[] slots around, but leave the parallel copy
>bitmap behind.
>A later helper sequence can then move an external entry back to
>msg->sg.start with its copy bit cleared and make
>sk_msg_compute_data_pointers() treat it as directly writable packet data.
>
>Keep msg->sg.copy synchronized with every scatterlist move, preserve
>the bit for split tail entries, and clear it whenever a helper replaces
>an entry with a freshly allocated private page.
>
>Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
>Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data")
>Fixes: 7246d8ed4dcc ("bpf: helper to pop data from messages")
>Cc: stable@vger.kernel.org
>Co-developed-by: Han Guidong <2045gemini@gmail.com>
>Signed-off-by: Han Guidong <2045gemini@gmail.com>
>Signed-off-by: Zhang Cen <rollkingzzc@gmail.com>
>---
>While researching recent page cache bugs, we discovered this bug. We confirmed it allows overwriting the page cache of read-only files via splice(). We haven't attempted to write an exploit, but the corruption primitive is verified. PoC available upon request. Recommend fixing ASAP.
>---
Important note here is it requires a BPF skmsg program that is using
the push/pull/pop API calls correct? Agree its not great we should
fix, but the scope is limited to just this set of users.
> net/core/filter.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 49 insertions(+), 1 deletion(-)
fwiw I also found this looking at things last Friday. Seems tools
are good at finding this.
>
>diff --git a/net/core/filter.c b/net/core/filter.c
>index 9590877b0714f..352233da29429 100644
>--- a/net/core/filter.c
>+++ b/net/core/filter.c
>@@ -2654,6 +2654,19 @@ static void sk_msg_reset_curr(struct sk_msg *msg)
> }
> }
>
>+static bool sk_msg_elem_is_copy(const struct sk_msg *msg, u32 i)
>+{
>+ return test_bit(i, msg->sg.copy);
>+}
>+
>+static void sk_msg_set_elem_copy(struct sk_msg *msg, u32 i, bool copy)
>+{
>+ if (copy)
>+ __set_bit(i, msg->sg.copy);
>+ else
>+ __clear_bit(i, msg->sg.copy);
>+}
>+
> static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
> .func = bpf_msg_cork_bytes,
> .gpl_only = false,
>@@ -2738,6 +2751,8 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
> } while (i != last_sge);
>
> sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
>+ sk_msg_set_elem_copy(msg, first_sge, false);
>+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
The sg.end is cleared at the end of the msg_pull_data() already correct?
>
> /* To repair sg ring we need to shift entries. If we only
> * had a single entry though we can just replace it and
>@@ -2754,6 +2769,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
> sk_msg_iter_var_next(i);
> do {
> u32 move_from;
>+ bool move_copy;
>
> if (i + shift >= NR_MSG_FRAG_IDS)
> move_from = i + shift - NR_MSG_FRAG_IDS;
>@@ -2762,10 +2778,13 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
> if (move_from == msg->sg.end)
> break;
>
>+ move_copy = sk_msg_elem_is_copy(msg, move_from);
> msg->sg.data[i] = msg->sg.data[move_from];
>+ sk_msg_set_elem_copy(msg, i, move_copy);
This block is open coded sk_msg_sg_move()?
> msg->sg.data[move_from].length = 0;
> msg->sg.data[move_from].page_link = 0;
> msg->sg.data[move_from].offset = 0;
>+ sk_msg_set_elem_copy(msg, move_from, false);
+1.
> sk_msg_iter_var_next(i);
> } while (1);
>
>@@ -2794,6 +2813,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
> {
> struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
> u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
>+ bool sge_copy = false, nsge_copy = false, nnsge_copy = false;
>+ bool rsge_copy = false;
> u8 *raw, *to, *from;
> struct page *page;
>
>@@ -2866,6 +2887,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
> sk_msg_iter_var_prev(i);
> psge = sk_msg_elem(msg, i);
> rsge = sk_msg_elem_cpy(msg, i);
>+ rsge_copy = sk_msg_elem_is_copy(msg, i);
>
> psge->length = start - offset;
> rsge.length -= psge->length;
>@@ -2891,23 +2913,31 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
> /* Shift one or two slots as needed */
> sge = sk_msg_elem_cpy(msg, new);
> sg_unmark_end(&sge);
>+ sge_copy = sk_msg_elem_is_copy(msg, new);
>
There is another bug here, maybe arguably not the same issue as the
copy bit handling.
@@ -2857,10 +2870,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, st
art,
sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
rsge = sk_msg_elem_cpy(msg, i);
+ rsge_copy = test_bit(i, msg->sg.copy);
psge->length = start - offset;
rsge.length -= psge->length;
- rsge.offset += start;
+ rsge.offset += start - offset;
> nsge = sk_msg_elem_cpy(msg, i);
>+ nsge_copy = sk_msg_elem_is_copy(msg, i);
> if (rsge.length) {
> sk_msg_iter_var_next(i);
> nnsge = sk_msg_elem_cpy(msg, i);
>+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
> sk_msg_iter_next(msg, end);
> }
>
> while (i != msg->sg.end) {
> msg->sg.data[i] = sge;
>+ sk_msg_set_elem_copy(msg, i, sge_copy);
> sge = nsge;
>+ sge_copy = nsge_copy;
> sk_msg_iter_var_next(i);
> if (rsge.length) {
> nsge = nnsge;
>+ nsge_copy = nnsge_copy;
> nnsge = sk_msg_elem_cpy(msg, i);
>+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
> } else {
> nsge = sk_msg_elem_cpy(msg, i);
>+ nsge_copy = sk_msg_elem_is_copy(msg, i);
> }
> }
>
>@@ -2915,13 +2945,15 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
> /* Place newly allocated data buffer */
> sk_mem_charge(msg->sk, len);
> msg->sg.size += len;
>- __clear_bit(new, msg->sg.copy);
>+ sk_msg_set_elem_copy(msg, new, false);
> sg_set_page(&msg->sg.data[new], page, len + copy, 0);
> if (rsge.length) {
> get_page(sg_page(&rsge));
> sk_msg_iter_var_next(new);
> msg->sg.data[new] = rsge;
>+ sk_msg_set_elem_copy(msg, new, rsge_copy);
> }
>+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
Is this sk_msg_sel_elem_copy() needed? I'll need to study a bit
more to be sure, but I don't have it on my similar patch so at
least Friday I didn't think it was necessary.
>
> sk_msg_reset_curr(msg);
> sk_msg_compute_data_pointers(msg);
>@@ -2945,29 +2977,41 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i)
>
> put_page(sg_page(sge));
> do {
>+ bool copy;
>+
> prev = i;
> sk_msg_iter_var_next(i);
>+ copy = sk_msg_elem_is_copy(msg, i);
> msg->sg.data[prev] = msg->sg.data[i];
>+ sk_msg_set_elem_copy(msg, prev, copy);
sk_msg_sg_move()
> } while (i != msg->sg.end);
>
> sk_msg_iter_prev(msg, end);
>+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
> }
>
> static void sk_msg_shift_right(struct sk_msg *msg, int i)
> {
> struct scatterlist tmp, sge;
>+ bool tmp_copy, sge_copy;
>
> sk_msg_iter_next(msg, end);
> sge = sk_msg_elem_cpy(msg, i);
>+ sge_copy = sk_msg_elem_is_copy(msg, i);
> sk_msg_iter_var_next(i);
> tmp = sk_msg_elem_cpy(msg, i);
>+ tmp_copy = sk_msg_elem_is_copy(msg, i);
>
> while (i != msg->sg.end) {
> msg->sg.data[i] = sge;
>+ sk_msg_set_elem_copy(msg, i, sge_copy);
> sk_msg_iter_var_next(i);
> sge = tmp;
>+ sge_copy = tmp_copy;
> tmp = sk_msg_elem_cpy(msg, i);
>+ tmp_copy = sk_msg_elem_is_copy(msg, i);
> }
>+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
style nit. There is a lot of
sk_msg_set-elem_copy(..., false);
can we just have a helper
sk_msg_clear_elem_copy(msg, msg->sg.end)
it will be slightly nicer to read.
> }
>
> BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
>@@ -3024,8 +3068,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
> */
> if (start != offset) {
> struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
>+ u32 sge_idx = i;
> int a = start - offset;
> int b = sge->length - pop - a;
>+ bool sge_copy = sk_msg_elem_is_copy(msg, sge_idx);
>
> sk_msg_iter_var_next(i);
>
>@@ -3038,6 +3084,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
> sg_set_page(nsge,
> sg_page(sge),
> b, sge->offset + pop + a);
>+ sk_msg_set_elem_copy(msg, i, sge_copy);
> } else {
> struct page *page, *orig;
> u8 *to, *from;
>@@ -3054,6 +3101,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
> memcpy(to, from, a);
> memcpy(to + a, from + a + pop, b);
> sg_set_page(sge, page, a + b, 0);
>+ sk_msg_set_elem_copy(msg, sge_idx, false);
> put_page(orig);
> }
> pop = 0;
Otherwise, my initial read is this is good. Can you make the small
updates and I'll think a bit more about it in the meantime. The sashiko
commentary is good will investigate but I don't think it blocks this
patch.
^ permalink raw reply [flat|nested] 2+ messages in thread