* [PATCH v2] bpf, sockmap: keep sk_msg copy state in sync
@ 2026-05-17 12:16 Zhang Cen
2026-05-18 21:32 ` John Fastabend
0 siblings, 1 reply; 2+ messages in thread
From: Zhang Cen @ 2026-05-17 12:16 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, John Fastabend, Stanislav Fomichev,
Jakub Sitnicki
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, bpf, netdev, linux-kernel, zerocling0077,
2045gemini, Zhang Cen, stable
SK_MSG uses msg->sg.copy as per-scatterlist-entry provenance. Entries
with this bit set are copied before data/data_end are exposed to SK_MSG
BPF programs for direct packet access.
bpf_msg_pull_data(), bpf_msg_push_data() and bpf_msg_pop_data() rewrite
the sk_msg scatterlist ring by collapsing, splitting and shifting
entries. These operations move msg->sg.data[] entries, but the parallel
copy bitmap can be left behind or stale in slots that no longer contain
the original entry. A copied entry can therefore later occupy a slot whose
copy bit is clear and be exposed as directly writable packet data.
Keep msg->sg.copy synchronized with scatterlist entry moves, preserve the
copy bit when an entry is split, clear it when a helper replaces an entry
with a private page, and clear every slot vacated by pull-data
compaction.
Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data")
Fixes: 7246d8ed4dcc ("bpf: helper to pop data from messages")
Cc: stable@vger.kernel.org
Co-developed-by: Han Guidong <2045gemini@gmail.com>
Signed-off-by: Han Guidong <2045gemini@gmail.com>
Signed-off-by: Zhang Cen <rollkingzzc@gmail.com>
---
v2:
Sashiko-bot pointed out that bpf_msg_pull_data() could leave stale copy
bits on collapsed tail entries.
Clear msg->sg.copy for every entry consumed by bpf_msg_pull_data()
before compacting the scatterlist ring.
While researching recent page cache bugs, we discovered this bug.
We confirmed it allows overwriting the page cache of read-only files
via splice(). We haven't attempted to write an exploit, but the
corruption primitive is verified. PoC available upon request.
Recommend fixing ASAP.
---
net/core/filter.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 64 insertions(+), 2 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 9590877b0714f..018c30a0d71fb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2654,6 +2654,27 @@ static void sk_msg_reset_curr(struct sk_msg *msg)
}
}
+static bool sk_msg_elem_is_copy(const struct sk_msg *msg, u32 i)
+{
+ return test_bit(i, msg->sg.copy);
+}
+
+static void sk_msg_set_elem_copy(struct sk_msg *msg, u32 i, bool copy)
+{
+ if (copy)
+ __set_bit(i, msg->sg.copy);
+ else
+ __clear_bit(i, msg->sg.copy);
+}
+
+static void sk_msg_clear_copy_range(struct sk_msg *msg, u32 start, u32 end)
+{
+ while (start != end) {
+ __clear_bit(start, msg->sg.copy);
+ sk_msg_iter_var_next(start);
+ }
+}
+
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.func = bpf_msg_cork_bytes,
.gpl_only = false,
@@ -2738,6 +2759,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
} while (i != last_sge);
sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
+ sk_msg_set_elem_copy(msg, first_sge, false);
/* To repair sg ring we need to shift entries. If we only
* had a single entry though we can just replace it and
@@ -2747,13 +2769,20 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
shift = last_sge > first_sge ?
last_sge - first_sge - 1 :
NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
- if (!shift)
+ if (!shift) {
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
goto out;
+ }
+
+ i = first_sge;
+ sk_msg_iter_var_next(i);
+ sk_msg_clear_copy_range(msg, i, last_sge);
i = first_sge;
sk_msg_iter_var_next(i);
do {
u32 move_from;
+ bool move_copy;
if (i + shift >= NR_MSG_FRAG_IDS)
move_from = i + shift - NR_MSG_FRAG_IDS;
@@ -2762,16 +2791,20 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
if (move_from == msg->sg.end)
break;
+ move_copy = sk_msg_elem_is_copy(msg, move_from);
msg->sg.data[i] = msg->sg.data[move_from];
+ sk_msg_set_elem_copy(msg, i, move_copy);
msg->sg.data[move_from].length = 0;
msg->sg.data[move_from].page_link = 0;
msg->sg.data[move_from].offset = 0;
+ sk_msg_set_elem_copy(msg, move_from, false);
sk_msg_iter_var_next(i);
} while (1);
msg->sg.end = msg->sg.end - shift > msg->sg.end ?
msg->sg.end - shift + NR_MSG_FRAG_IDS :
msg->sg.end - shift;
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
out:
sk_msg_reset_curr(msg);
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
@@ -2794,6 +2827,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
{
struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
+ bool sge_copy = false, nsge_copy = false, nnsge_copy = false;
+ bool rsge_copy = false;
u8 *raw, *to, *from;
struct page *page;
@@ -2866,6 +2901,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
rsge = sk_msg_elem_cpy(msg, i);
+ rsge_copy = sk_msg_elem_is_copy(msg, i);
psge->length = start - offset;
rsge.length -= psge->length;
@@ -2891,23 +2927,31 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Shift one or two slots as needed */
sge = sk_msg_elem_cpy(msg, new);
sg_unmark_end(&sge);
+ sge_copy = sk_msg_elem_is_copy(msg, new);
nsge = sk_msg_elem_cpy(msg, i);
+ nsge_copy = sk_msg_elem_is_copy(msg, i);
if (rsge.length) {
sk_msg_iter_var_next(i);
nnsge = sk_msg_elem_cpy(msg, i);
+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
sk_msg_iter_next(msg, end);
}
while (i != msg->sg.end) {
msg->sg.data[i] = sge;
+ sk_msg_set_elem_copy(msg, i, sge_copy);
sge = nsge;
+ sge_copy = nsge_copy;
sk_msg_iter_var_next(i);
if (rsge.length) {
nsge = nnsge;
+ nsge_copy = nnsge_copy;
nnsge = sk_msg_elem_cpy(msg, i);
+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
} else {
nsge = sk_msg_elem_cpy(msg, i);
+ nsge_copy = sk_msg_elem_is_copy(msg, i);
}
}
@@ -2915,13 +2959,15 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- __clear_bit(new, msg->sg.copy);
+ sk_msg_set_elem_copy(msg, new, false);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
sk_msg_iter_var_next(new);
msg->sg.data[new] = rsge;
+ sk_msg_set_elem_copy(msg, new, rsge_copy);
}
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
sk_msg_reset_curr(msg);
sk_msg_compute_data_pointers(msg);
@@ -2945,29 +2991,41 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i)
put_page(sg_page(sge));
do {
+ bool copy;
+
prev = i;
sk_msg_iter_var_next(i);
+ copy = sk_msg_elem_is_copy(msg, i);
msg->sg.data[prev] = msg->sg.data[i];
+ sk_msg_set_elem_copy(msg, prev, copy);
} while (i != msg->sg.end);
sk_msg_iter_prev(msg, end);
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
}
static void sk_msg_shift_right(struct sk_msg *msg, int i)
{
struct scatterlist tmp, sge;
+ bool tmp_copy, sge_copy;
sk_msg_iter_next(msg, end);
sge = sk_msg_elem_cpy(msg, i);
+ sge_copy = sk_msg_elem_is_copy(msg, i);
sk_msg_iter_var_next(i);
tmp = sk_msg_elem_cpy(msg, i);
+ tmp_copy = sk_msg_elem_is_copy(msg, i);
while (i != msg->sg.end) {
msg->sg.data[i] = sge;
+ sk_msg_set_elem_copy(msg, i, sge_copy);
sk_msg_iter_var_next(i);
sge = tmp;
+ sge_copy = tmp_copy;
tmp = sk_msg_elem_cpy(msg, i);
+ tmp_copy = sk_msg_elem_is_copy(msg, i);
}
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
}
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
@@ -3024,8 +3082,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
*/
if (start != offset) {
struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+ u32 sge_idx = i;
int a = start - offset;
int b = sge->length - pop - a;
+ bool sge_copy = sk_msg_elem_is_copy(msg, sge_idx);
sk_msg_iter_var_next(i);
@@ -3038,6 +3098,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
sg_set_page(nsge,
sg_page(sge),
b, sge->offset + pop + a);
+ sk_msg_set_elem_copy(msg, i, sge_copy);
} else {
struct page *page, *orig;
u8 *to, *from;
@@ -3054,6 +3115,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
memcpy(to, from, a);
memcpy(to + a, from + a + pop, b);
sg_set_page(sge, page, a + b, 0);
+ sk_msg_set_elem_copy(msg, sge_idx, false);
put_page(orig);
}
pop = 0;
--
2.43.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH v2] bpf, sockmap: keep sk_msg copy state in sync
2026-05-17 12:16 [PATCH v2] bpf, sockmap: keep sk_msg copy state in sync Zhang Cen
@ 2026-05-18 21:32 ` John Fastabend
0 siblings, 0 replies; 2+ messages in thread
From: John Fastabend @ 2026-05-18 21:32 UTC (permalink / raw)
To: Zhang Cen
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Stanislav Fomichev, Jakub Sitnicki,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, bpf, netdev, linux-kernel, zerocling0077,
2045gemini, stable
On Sun, May 17, 2026 at 08:16:26PM +0800, Zhang Cen wrote:
>SK_MSG uses msg->sg.copy as per-scatterlist-entry provenance. Entries
>with this bit set are copied before data/data_end are exposed to SK_MSG
>BPF programs for direct packet access.
>
>bpf_msg_pull_data(), bpf_msg_push_data() and bpf_msg_pop_data() rewrite
>the sk_msg scatterlist ring by collapsing, splitting and shifting
>entries. These operations move msg->sg.data[] entries, but the parallel
>copy bitmap can be left behind or stale in slots that no longer contain
>the original entry. A copied entry can therefore later occupy a slot whose
>copy bit is clear and be exposed as directly writable packet data.
>
>Keep msg->sg.copy synchronized with scatterlist entry moves, preserve the
>copy bit when an entry is split, clear it when a helper replaces an entry
>with a private page, and clear every slot vacated by pull-data
>compaction.
>
>Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
>Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data")
>Fixes: 7246d8ed4dcc ("bpf: helper to pop data from messages")
>Cc: stable@vger.kernel.org
>Co-developed-by: Han Guidong <2045gemini@gmail.com>
>Signed-off-by: Han Guidong <2045gemini@gmail.com>
>Signed-off-by: Zhang Cen <rollkingzzc@gmail.com>
>---
>v2:
>Sashiko-bot pointed out that bpf_msg_pull_data() could leave stale copy
>bits on collapsed tail entries.
>
>Clear msg->sg.copy for every entry consumed by bpf_msg_pull_data()
>before compacting the scatterlist ring.
>
>While researching recent page cache bugs, we discovered this bug.
>We confirmed it allows overwriting the page cache of read-only files
>via splice(). We haven't attempted to write an exploit, but the
>corruption primitive is verified. PoC available upon request.
>Recommend fixing ASAP.
Sorry I missed your v2 so reviewing again.
Important note here on where this actually happens. It will only
effect users of BPF programs that are making the push/pop/..
calls. So most/all users should not be impacted. Agree though lets
fix this.
>---
> net/core/filter.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 64 insertions(+), 2 deletions(-)
>
>diff --git a/net/core/filter.c b/net/core/filter.c
>index 9590877b0714f..018c30a0d71fb 100644
>--- a/net/core/filter.c
>+++ b/net/core/filter.c
>@@ -2654,6 +2654,27 @@ static void sk_msg_reset_curr(struct sk_msg *msg)
> }
> }
>
>+static bool sk_msg_elem_is_copy(const struct sk_msg *msg, u32 i)
>+{
>+ return test_bit(i, msg->sg.copy);
>+}
>+
>+static void sk_msg_set_elem_copy(struct sk_msg *msg, u32 i, bool copy)
>+{
>+ if (copy)
>+ __set_bit(i, msg->sg.copy);
>+ else
>+ __clear_bit(i, msg->sg.copy);
>+}
To make this easier to read I think having a,
static void sk_msg_clear_elem_copy(struct sk_msg *msg, u32 i, bool copy)
{
__clear_bit(i, msg->sg.copy);
}
is nice to have. Otherwise we get lots of
sk_msg_clear_elem_copy(..., false)
Or just direclty call __clear_bit() is also cleaner.
>+
>+static void sk_msg_clear_copy_range(struct sk_msg *msg, u32 start, u32 end)
>+{
>+ while (start != end) {
>+ __clear_bit(start, msg->sg.copy);
>+ sk_msg_iter_var_next(start);
>+ }
>+}
>+
> static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
> .func = bpf_msg_cork_bytes,
> .gpl_only = false,
>@@ -2738,6 +2759,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
> } while (i != last_sge);
>
> sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
>+ sk_msg_set_elem_copy(msg, first_sge, false);
>
> /* To repair sg ring we need to shift entries. If we only
> * had a single entry though we can just replace it and
>@@ -2747,13 +2769,20 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
> shift = last_sge > first_sge ?
> last_sge - first_sge - 1 :
> NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
>- if (!shift)
>+ if (!shift) {
>+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
> goto out;
>+ }
>+
>+ i = first_sge;
>+ sk_msg_iter_var_next(i);
>+ sk_msg_clear_copy_range(msg, i, last_sge);
>
> i = first_sge;
> sk_msg_iter_var_next(i);
> do {
> u32 move_from;
>+ bool move_copy;
>
> if (i + shift >= NR_MSG_FRAG_IDS)
> move_from = i + shift - NR_MSG_FRAG_IDS;
>@@ -2762,16 +2791,20 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
> if (move_from == msg->sg.end)
> break;
>
>+ move_copy = sk_msg_elem_is_copy(msg, move_from);
> msg->sg.data[i] = msg->sg.data[move_from];
>+ sk_msg_set_elem_copy(msg, i, move_copy);
This is sk_msg_sg_move()?
> msg->sg.data[move_from].length = 0;
> msg->sg.data[move_from].page_link = 0;
> msg->sg.data[move_from].offset = 0;
>+ sk_msg_set_elem_copy(msg, move_from, false);
> sk_msg_iter_var_next(i);
> } while (1);
>
> msg->sg.end = msg->sg.end - shift > msg->sg.end ?
> msg->sg.end - shift + NR_MSG_FRAG_IDS :
> msg->sg.end - shift;
>+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
> out:
> sk_msg_reset_curr(msg);
> msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
>@@ -2794,6 +2827,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
> {
> struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
> u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
>+ bool sge_copy = false, nsge_copy = false, nnsge_copy = false;
>+ bool rsge_copy = false;
> u8 *raw, *to, *from;
> struct page *page;
>
>@@ -2866,6 +2901,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
> sk_msg_iter_var_prev(i);
> psge = sk_msg_elem(msg, i);
> rsge = sk_msg_elem_cpy(msg, i);
>+ rsge_copy = sk_msg_elem_is_copy(msg, i);
>
> psge->length = start - offset;
> rsge.length -= psge->length;
I think we need another fix here,
rsge.offset += start - offset;
Probably carry in another patch. I can do it if you want?
>@@ -2891,23 +2927,31 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
> /* Shift one or two slots as needed */
> sge = sk_msg_elem_cpy(msg, new);
> sg_unmark_end(&sge);
>+ sge_copy = sk_msg_elem_is_copy(msg, new);
>
> nsge = sk_msg_elem_cpy(msg, i);
>+ nsge_copy = sk_msg_elem_is_copy(msg, i);
> if (rsge.length) {
> sk_msg_iter_var_next(i);
> nnsge = sk_msg_elem_cpy(msg, i);
>+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
> sk_msg_iter_next(msg, end);
> }
>
> while (i != msg->sg.end) {
> msg->sg.data[i] = sge;
>+ sk_msg_set_elem_copy(msg, i, sge_copy);
> sge = nsge;
>+ sge_copy = nsge_copy;
> sk_msg_iter_var_next(i);
> if (rsge.length) {
> nsge = nnsge;
>+ nsge_copy = nnsge_copy;
> nnsge = sk_msg_elem_cpy(msg, i);
>+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
> } else {
> nsge = sk_msg_elem_cpy(msg, i);
>+ nsge_copy = sk_msg_elem_is_copy(msg, i);
> }
> }
>
>@@ -2915,13 +2959,15 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
> /* Place newly allocated data buffer */
> sk_mem_charge(msg->sk, len);
> msg->sg.size += len;
>- __clear_bit(new, msg->sg.copy);
>+ sk_msg_set_elem_copy(msg, new, false);
> sg_set_page(&msg->sg.data[new], page, len + copy, 0);
> if (rsge.length) {
> get_page(sg_page(&rsge));
> sk_msg_iter_var_next(new);
> msg->sg.data[new] = rsge;
>+ sk_msg_set_elem_copy(msg, new, rsge_copy);
> }
>+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
>
> sk_msg_reset_curr(msg);
> sk_msg_compute_data_pointers(msg);
>@@ -2945,29 +2991,41 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i)
>
> put_page(sg_page(sge));
> do {
>+ bool copy;
>+
> prev = i;
> sk_msg_iter_var_next(i);
>+ copy = sk_msg_elem_is_copy(msg, i);
> msg->sg.data[prev] = msg->sg.data[i];
>+ sk_msg_set_elem_copy(msg, prev, copy);
sk_msg_sg_move()?
> } while (i != msg->sg.end);
>
> sk_msg_iter_prev(msg, end);
>+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
> }
I think this is good with small cleanup. The bot report (need to check
again), but I think it was calling out another issue with a different
fix/patch needed.
Do you want to follow up with the other couple addons or should I?
Also please add a test for this so we capture it in selftests.
Thanks!
John
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2026-05-18 21:32 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-17 12:16 [PATCH v2] bpf, sockmap: keep sk_msg copy state in sync Zhang Cen
2026-05-18 21:32 ` John Fastabend
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox