From: Dragos Tatulea <dtatulea@nvidia.com>
To: <almasrymina@google.com>,
Steffen Klassert <steffen.klassert@secunet.com>,
Herbert Xu <herbert@gondor.apana.org.au>,
"David S. Miller" <davem@davemloft.net>,
David Ahern <dsahern@kernel.org>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>
Cc: <leonro@nvidia.com>, <gal@nvidia.com>,
Dragos Tatulea <dtatulea@nvidia.com>,
"Anatoli N . Chechelnickiy"
<Anatoli.Chechelnickiy@m.interpipe.biz>,
Ian Kumlien <ian.kumlien@gmail.com>,
<linux-kernel@vger.kernel.org>, <netdev@vger.kernel.org>
Subject: [RFC PATCH v2] net: esp: fix bad handling of pages from page_pool
Date: Wed, 6 Mar 2024 21:08:15 +0200 [thread overview]
Message-ID: <20240306190822.390086-1-dtatulea@nvidia.com> (raw)
When the skb is reorganized during esp_output (!esp->inline), the pages
coming from the original skb fragments are supposed to be released back
to the system through put_page. But if the skb fragment pages are
originating from a page_pool, calling put_page on them will trigger a
page_pool leak which will eventually result in a crash.
This leak can be easily observed when using CONFIG_DEBUG_VM and doing
ipsec + gre (non offloaded) forwarding:
BUG: Bad page state in process ksoftirqd/16 pfn:1451b6
page:00000000de2b8d32 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1451b6000 pfn:0x1451b6
flags: 0x200000000000000(node=0|zone=2)
page_type: 0xffffffff()
raw: 0200000000000000 dead000000000040 ffff88810d23c000 0000000000000000
raw: 00000001451b6000 0000000000000001 00000000ffffffff 0000000000000000
page dumped because: page_pool leak
Modules linked in: ip_gre gre mlx5_ib mlx5_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core overlay zram zsmalloc fuse [last unloaded: mlx5_core]
CPU: 16 PID: 96 Comm: ksoftirqd/16 Not tainted 6.8.0-rc4+ #22
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0x36/0x50
bad_page+0x70/0xf0
free_unref_page_prepare+0x27a/0x460
free_unref_page+0x38/0x120
esp_ssg_unref.isra.0+0x15f/0x200
esp_output_tail+0x66d/0x780
esp_xmit+0x2c5/0x360
validate_xmit_xfrm+0x313/0x370
? validate_xmit_skb+0x1d/0x330
validate_xmit_skb_list+0x4c/0x70
sch_direct_xmit+0x23e/0x350
__dev_queue_xmit+0x337/0xba0
? nf_hook_slow+0x3f/0xd0
ip_finish_output2+0x25e/0x580
iptunnel_xmit+0x19b/0x240
ip_tunnel_xmit+0x5fb/0xb60
ipgre_xmit+0x14d/0x280 [ip_gre]
dev_hard_start_xmit+0xc3/0x1c0
__dev_queue_xmit+0x208/0xba0
? nf_hook_slow+0x3f/0xd0
ip_finish_output2+0x1ca/0x580
ip_sublist_rcv_finish+0x32/0x40
ip_sublist_rcv+0x1b2/0x1f0
? ip_rcv_finish_core.constprop.0+0x460/0x460
ip_list_rcv+0x103/0x130
__netif_receive_skb_list_core+0x181/0x1e0
netif_receive_skb_list_internal+0x1b3/0x2c0
napi_gro_receive+0xc8/0x200
gro_cell_poll+0x52/0x90
__napi_poll+0x25/0x1a0
net_rx_action+0x28e/0x300
__do_softirq+0xc3/0x276
? sort_range+0x20/0x20
run_ksoftirqd+0x1e/0x30
smpboot_thread_fn+0xa6/0x130
kthread+0xcd/0x100
? kthread_complete_and_exit+0x20/0x20
ret_from_fork+0x31/0x50
? kthread_complete_and_exit+0x20/0x20
ret_from_fork_asm+0x11/0x20
</TASK>
The suggested fix is to use a new wrapper that covers page refcounting
for page_pool pages as well.
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reported-by: Anatoli N.Chechelnickiy <Anatoli.Chechelnickiy@m.interpipe.biz>
Reported-by: Ian Kumlien <ian.kumlien@gmail.com>
---
Changes in v2:
- Added napi_page_unref api based on discussion in v1 [0].
[0]https://lore.kernel.org/netdev/CAHS8izOoO-EovwMwAm9tLYetwikNPxC0FKyVGu1TPJWSz4bGoA@mail.gmail.com/T/#t
---
include/linux/skbuff.h | 10 +++++++---
net/ipv4/esp4.c | 16 ++++++++++------
net/ipv6/esp6.c | 16 ++++++++++------
3 files changed, 27 insertions(+), 15 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 696e7680656f..009603db2a43 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3453,10 +3453,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
bool napi_pp_put_page(struct page *page, bool napi_safe);
static inline void
-napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+napi_page_unref(struct page *page, bool recycle, bool napi_safe)
{
- struct page *page = skb_frag_page(frag);
-
#ifdef CONFIG_PAGE_POOL
if (recycle && napi_pp_put_page(page, napi_safe))
return;
@@ -3464,6 +3462,12 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
put_page(page);
}
+static inline void
+napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+{
+ napi_page_unref(skb_frag_page(frag), recycle, napi_safe);
+}
+
/**
* __skb_frag_unref - release a reference on a paged fragment.
* @frag: the paged fragment
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 4dd9e5040672..3126aeb0588d 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -95,7 +95,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
-static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
{
struct crypto_aead *aead = x->data;
int extralen = 0;
@@ -112,9 +112,13 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
/* Unref skb_frag_pages in the src scatterlist if necessary.
* Skip the first sg which comes from skb->data.
*/
- if (req->src != req->dst)
- for (sg = sg_next(req->src); sg; sg = sg_next(sg))
- put_page(sg_page(sg));
+ if (req->src != req->dst) {
+ for (sg = sg_next(req->src); sg; sg = sg_next(sg)) {
+ struct page *page = sg_page(sg);
+
+ napi_page_unref(page, skb->pp_recycle, false);
+ }
+ }
}
#ifdef CONFIG_INET_ESPINTCP
@@ -260,7 +264,7 @@ static void esp_output_done(void *data, int err)
}
tmp = ESP_SKB_CB(skb)->tmp;
- esp_ssg_unref(x, tmp);
+ esp_ssg_unref(x, tmp, skb);
kfree(tmp);
if (xo && (xo->flags & XFRM_DEV_RESUME)) {
@@ -639,7 +643,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
}
if (sg != dsg)
- esp_ssg_unref(x, tmp);
+ esp_ssg_unref(x, tmp, skb);
if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
err = esp_output_tail_tcp(x, skb);
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 6e6efe026cdc..7bbe19e74cf0 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -112,7 +112,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
-static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
{
struct crypto_aead *aead = x->data;
int extralen = 0;
@@ -129,9 +129,13 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
/* Unref skb_frag_pages in the src scatterlist if necessary.
* Skip the first sg which comes from skb->data.
*/
- if (req->src != req->dst)
- for (sg = sg_next(req->src); sg; sg = sg_next(sg))
- put_page(sg_page(sg));
+ if (req->src != req->dst) {
+ for (sg = sg_next(req->src); sg; sg = sg_next(sg)) {
+ struct page *page = sg_page(sg);
+
+ napi_page_unref(page, skb->pp_recycle, false);
+ }
+ }
}
#ifdef CONFIG_INET6_ESPINTCP
@@ -294,7 +298,7 @@ static void esp_output_done(void *data, int err)
}
tmp = ESP_SKB_CB(skb)->tmp;
- esp_ssg_unref(x, tmp);
+ esp_ssg_unref(x, tmp, skb);
kfree(tmp);
esp_output_encap_csum(skb);
@@ -677,7 +681,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
}
if (sg != dsg)
- esp_ssg_unref(x, tmp);
+ esp_ssg_unref(x, tmp, skb);
if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
err = esp_output_tail_tcp(x, skb);
--
2.42.0
next reply other threads:[~2024-03-06 19:08 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-06 19:08 Dragos Tatulea [this message]
2024-03-06 19:09 ` [RFC PATCH v2] net: esp: fix bad handling of pages from page_pool Dragos Tatulea
2024-03-07 3:53 ` Jakub Kicinski
2024-03-06 21:15 ` Mina Almasry
2024-03-07 9:35 ` Dragos Tatulea
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240306190822.390086-1-dtatulea@nvidia.com \
--to=dtatulea@nvidia.com \
--cc=Anatoli.Chechelnickiy@m.interpipe.biz \
--cc=almasrymina@google.com \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=edumazet@google.com \
--cc=gal@nvidia.com \
--cc=herbert@gondor.apana.org.au \
--cc=ian.kumlien@gmail.com \
--cc=kuba@kernel.org \
--cc=leonro@nvidia.com \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=steffen.klassert@secunet.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.