From: Zhang Cen <rollkingzzc@gmail.com>
To: Alexei Starovoitov <ast@kernel.org>,
Daniel Borkmann <daniel@iogearbox.net>,
Andrii Nakryiko <andrii@kernel.org>,
Martin KaFai Lau <martin.lau@linux.dev>,
John Fastabend <john.fastabend@gmail.com>,
Stanislav Fomichev <sdf@fomichev.me>,
Jakub Sitnicki <jakub@cloudflare.com>
Cc: "David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Simon Horman <horms@kernel.org>,
bpf@vger.kernel.org, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org, zerocling0077@gmail.com,
2045gemini@gmail.com, Zhang Cen <rollkingzzc@gmail.com>,
stable@vger.kernel.org
Subject: [PATCH v2] bpf, sockmap: keep sk_msg copy state in sync
Date: Sun, 17 May 2026 20:16:26 +0800 [thread overview]
Message-ID: <20260517121626.406516-1-rollkingzzc@gmail.com> (raw)
SK_MSG uses msg->sg.copy as per-scatterlist-entry provenance. Entries
with this bit set are copied before data/data_end are exposed to SK_MSG
BPF programs for direct packet access.
bpf_msg_pull_data(), bpf_msg_push_data() and bpf_msg_pop_data() rewrite
the sk_msg scatterlist ring by collapsing, splitting and shifting
entries. These operations move msg->sg.data[] entries, but the parallel
copy bitmap can be left behind or stale in slots that no longer contain
the original entry. A copied entry can therefore later occupy a slot whose
copy bit is clear and be exposed as directly writable packet data.
Keep msg->sg.copy synchronized with scatterlist entry moves, preserve the
copy bit when an entry is split, clear it when a helper replaces an entry
with a private page, and clear every slot vacated by pull-data
compaction.
Fixes: 015632bb30da ("bpf: sk_msg program helper bpf_sk_msg_pull_data")
Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data")
Fixes: 7246d8ed4dcc ("bpf: helper to pop data from messages")
Cc: stable@vger.kernel.org
Co-developed-by: Han Guidong <2045gemini@gmail.com>
Signed-off-by: Han Guidong <2045gemini@gmail.com>
Signed-off-by: Zhang Cen <rollkingzzc@gmail.com>
---
v2:
Sashiko-bot pointed out that bpf_msg_pull_data() could leave stale copy
bits on collapsed tail entries.
Clear msg->sg.copy for every entry consumed by bpf_msg_pull_data()
before compacting the scatterlist ring.
While researching recent page cache bugs, we discovered this bug.
We confirmed it allows overwriting the page cache of read-only files
via splice(). We haven't attempted to write an exploit, but the
corruption primitive is verified. PoC available upon request.
Recommend fixing ASAP.
---
net/core/filter.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 64 insertions(+), 2 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 9590877b0714f..018c30a0d71fb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2654,6 +2654,27 @@ static void sk_msg_reset_curr(struct sk_msg *msg)
}
}
+static bool sk_msg_elem_is_copy(const struct sk_msg *msg, u32 i)
+{
+ return test_bit(i, msg->sg.copy);
+}
+
+static void sk_msg_set_elem_copy(struct sk_msg *msg, u32 i, bool copy)
+{
+ if (copy)
+ __set_bit(i, msg->sg.copy);
+ else
+ __clear_bit(i, msg->sg.copy);
+}
+
+static void sk_msg_clear_copy_range(struct sk_msg *msg, u32 start, u32 end)
+{
+ while (start != end) {
+ __clear_bit(start, msg->sg.copy);
+ sk_msg_iter_var_next(start);
+ }
+}
+
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.func = bpf_msg_cork_bytes,
.gpl_only = false,
@@ -2738,6 +2759,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
} while (i != last_sge);
sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
+ sk_msg_set_elem_copy(msg, first_sge, false);
/* To repair sg ring we need to shift entries. If we only
* had a single entry though we can just replace it and
@@ -2747,13 +2769,20 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
shift = last_sge > first_sge ?
last_sge - first_sge - 1 :
NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
- if (!shift)
+ if (!shift) {
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
goto out;
+ }
+
+ i = first_sge;
+ sk_msg_iter_var_next(i);
+ sk_msg_clear_copy_range(msg, i, last_sge);
i = first_sge;
sk_msg_iter_var_next(i);
do {
u32 move_from;
+ bool move_copy;
if (i + shift >= NR_MSG_FRAG_IDS)
move_from = i + shift - NR_MSG_FRAG_IDS;
@@ -2762,16 +2791,20 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
if (move_from == msg->sg.end)
break;
+ move_copy = sk_msg_elem_is_copy(msg, move_from);
msg->sg.data[i] = msg->sg.data[move_from];
+ sk_msg_set_elem_copy(msg, i, move_copy);
msg->sg.data[move_from].length = 0;
msg->sg.data[move_from].page_link = 0;
msg->sg.data[move_from].offset = 0;
+ sk_msg_set_elem_copy(msg, move_from, false);
sk_msg_iter_var_next(i);
} while (1);
msg->sg.end = msg->sg.end - shift > msg->sg.end ?
msg->sg.end - shift + NR_MSG_FRAG_IDS :
msg->sg.end - shift;
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
out:
sk_msg_reset_curr(msg);
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
@@ -2794,6 +2827,8 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
{
struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
+ bool sge_copy = false, nsge_copy = false, nnsge_copy = false;
+ bool rsge_copy = false;
u8 *raw, *to, *from;
struct page *page;
@@ -2866,6 +2901,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
rsge = sk_msg_elem_cpy(msg, i);
+ rsge_copy = sk_msg_elem_is_copy(msg, i);
psge->length = start - offset;
rsge.length -= psge->length;
@@ -2891,23 +2927,31 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Shift one or two slots as needed */
sge = sk_msg_elem_cpy(msg, new);
sg_unmark_end(&sge);
+ sge_copy = sk_msg_elem_is_copy(msg, new);
nsge = sk_msg_elem_cpy(msg, i);
+ nsge_copy = sk_msg_elem_is_copy(msg, i);
if (rsge.length) {
sk_msg_iter_var_next(i);
nnsge = sk_msg_elem_cpy(msg, i);
+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
sk_msg_iter_next(msg, end);
}
while (i != msg->sg.end) {
msg->sg.data[i] = sge;
+ sk_msg_set_elem_copy(msg, i, sge_copy);
sge = nsge;
+ sge_copy = nsge_copy;
sk_msg_iter_var_next(i);
if (rsge.length) {
nsge = nnsge;
+ nsge_copy = nnsge_copy;
nnsge = sk_msg_elem_cpy(msg, i);
+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
} else {
nsge = sk_msg_elem_cpy(msg, i);
+ nsge_copy = sk_msg_elem_is_copy(msg, i);
}
}
@@ -2915,13 +2959,15 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- __clear_bit(new, msg->sg.copy);
+ sk_msg_set_elem_copy(msg, new, false);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
sk_msg_iter_var_next(new);
msg->sg.data[new] = rsge;
+ sk_msg_set_elem_copy(msg, new, rsge_copy);
}
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
sk_msg_reset_curr(msg);
sk_msg_compute_data_pointers(msg);
@@ -2945,29 +2991,41 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i)
put_page(sg_page(sge));
do {
+ bool copy;
+
prev = i;
sk_msg_iter_var_next(i);
+ copy = sk_msg_elem_is_copy(msg, i);
msg->sg.data[prev] = msg->sg.data[i];
+ sk_msg_set_elem_copy(msg, prev, copy);
} while (i != msg->sg.end);
sk_msg_iter_prev(msg, end);
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
}
static void sk_msg_shift_right(struct sk_msg *msg, int i)
{
struct scatterlist tmp, sge;
+ bool tmp_copy, sge_copy;
sk_msg_iter_next(msg, end);
sge = sk_msg_elem_cpy(msg, i);
+ sge_copy = sk_msg_elem_is_copy(msg, i);
sk_msg_iter_var_next(i);
tmp = sk_msg_elem_cpy(msg, i);
+ tmp_copy = sk_msg_elem_is_copy(msg, i);
while (i != msg->sg.end) {
msg->sg.data[i] = sge;
+ sk_msg_set_elem_copy(msg, i, sge_copy);
sk_msg_iter_var_next(i);
sge = tmp;
+ sge_copy = tmp_copy;
tmp = sk_msg_elem_cpy(msg, i);
+ tmp_copy = sk_msg_elem_is_copy(msg, i);
}
+ sk_msg_set_elem_copy(msg, msg->sg.end, false);
}
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
@@ -3024,8 +3082,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
*/
if (start != offset) {
struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+ u32 sge_idx = i;
int a = start - offset;
int b = sge->length - pop - a;
+ bool sge_copy = sk_msg_elem_is_copy(msg, sge_idx);
sk_msg_iter_var_next(i);
@@ -3038,6 +3098,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
sg_set_page(nsge,
sg_page(sge),
b, sge->offset + pop + a);
+ sk_msg_set_elem_copy(msg, i, sge_copy);
} else {
struct page *page, *orig;
u8 *to, *from;
@@ -3054,6 +3115,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
memcpy(to, from, a);
memcpy(to + a, from + a + pop, b);
sg_set_page(sge, page, a + b, 0);
+ sk_msg_set_elem_copy(msg, sge_idx, false);
put_page(orig);
}
pop = 0;
--
2.43.0
next reply other threads:[~2026-05-17 12:16 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-17 12:16 Zhang Cen [this message]
2026-05-17 12:50 ` [PATCH v2] bpf, sockmap: keep sk_msg copy state in sync sashiko-bot
2026-05-18 21:32 ` John Fastabend
2026-05-19 12:49 ` Cen Zhang
2026-05-19 19:29 ` John Fastabend
2026-05-20 4:28 ` Cen Zhang
2026-05-20 3:13 ` Jiayuan Chen
2026-05-20 3:29 ` Han Guidong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260517121626.406516-1-rollkingzzc@gmail.com \
--to=rollkingzzc@gmail.com \
--cc=2045gemini@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=horms@kernel.org \
--cc=jakub@cloudflare.com \
--cc=john.fastabend@gmail.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=martin.lau@linux.dev \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=sdf@fomichev.me \
--cc=stable@vger.kernel.org \
--cc=zerocling0077@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.