* [PATCH v2 bpf 1/3] xsk: recycle buffer in case Rx queue was full
2023-12-12 12:57 [PATCH v2 bpf 0/3] net: bpf_xdp_adjust_tail() fixes Maciej Fijalkowski
@ 2023-12-12 12:57 ` Maciej Fijalkowski
2023-12-12 13:14 ` Magnus Karlsson
2023-12-12 12:57 ` [PATCH v2 bpf 2/3] xsk: fix usage of multi-buffer BPF helpers for ZC XDP Maciej Fijalkowski
2023-12-12 12:57 ` [PATCH v2 bpf 3/3] ice: work on pre-XDP prog frag count Maciej Fijalkowski
2 siblings, 1 reply; 7+ messages in thread
From: Maciej Fijalkowski @ 2023-12-12 12:57 UTC (permalink / raw)
To: bpf, ast, daniel, andrii
Cc: netdev, magnus.karlsson, bjorn, maciej.fijalkowski, echaudro,
lorenzo
Add missing xsk_buff_free() call when __xsk_rcv_zc() failed to produce
descriptor to XSK Rx queue.
Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
net/xdp/xsk.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 281d49b4fca4..bd4abb200fa9 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -167,8 +167,10 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
contd = XDP_PKT_CONTD;
err = __xsk_rcv_zc(xs, xskb, len, contd);
- if (err || likely(!frags))
- goto out;
+ if (err)
+ goto err;
+ if (likely(!frags))
+ return 0;
xskb_list = &xskb->pool->xskb_list;
list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
@@ -177,11 +179,13 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
len = pos->xdp.data_end - pos->xdp.data;
err = __xsk_rcv_zc(xs, pos, len, contd);
if (err)
- return err;
+ goto err;
list_del(&pos->xskb_list_node);
}
-out:
+ return 0;
+err:
+ xsk_buff_free(xdp);
return err;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH v2 bpf 1/3] xsk: recycle buffer in case Rx queue was full
2023-12-12 12:57 ` [PATCH v2 bpf 1/3] xsk: recycle buffer in case Rx queue was full Maciej Fijalkowski
@ 2023-12-12 13:14 ` Magnus Karlsson
0 siblings, 0 replies; 7+ messages in thread
From: Magnus Karlsson @ 2023-12-12 13:14 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson, bjorn,
echaudro, lorenzo
On Tue, 12 Dec 2023 at 13:58, Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> Add missing xsk_buff_free() call when __xsk_rcv_zc() failed to produce
> descriptor to XSK Rx queue.
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
> net/xdp/xsk.c | 12 ++++++++----
> 1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 281d49b4fca4..bd4abb200fa9 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -167,8 +167,10 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
> contd = XDP_PKT_CONTD;
>
> err = __xsk_rcv_zc(xs, xskb, len, contd);
> - if (err || likely(!frags))
> - goto out;
> + if (err)
> + goto err;
> + if (likely(!frags))
> + return 0;
>
> xskb_list = &xskb->pool->xskb_list;
> list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
> @@ -177,11 +179,13 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
> len = pos->xdp.data_end - pos->xdp.data;
> err = __xsk_rcv_zc(xs, pos, len, contd);
> if (err)
> - return err;
> + goto err;
> list_del(&pos->xskb_list_node);
> }
>
> -out:
> + return 0;
> +err:
> + xsk_buff_free(xdp);
> return err;
> }
>
> --
> 2.34.1
>
>
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v2 bpf 2/3] xsk: fix usage of multi-buffer BPF helpers for ZC XDP
2023-12-12 12:57 [PATCH v2 bpf 0/3] net: bpf_xdp_adjust_tail() fixes Maciej Fijalkowski
2023-12-12 12:57 ` [PATCH v2 bpf 1/3] xsk: recycle buffer in case Rx queue was full Maciej Fijalkowski
@ 2023-12-12 12:57 ` Maciej Fijalkowski
2023-12-12 13:30 ` Magnus Karlsson
2023-12-12 12:57 ` [PATCH v2 bpf 3/3] ice: work on pre-XDP prog frag count Maciej Fijalkowski
2 siblings, 1 reply; 7+ messages in thread
From: Maciej Fijalkowski @ 2023-12-12 12:57 UTC (permalink / raw)
To: bpf, ast, daniel, andrii
Cc: netdev, magnus.karlsson, bjorn, maciej.fijalkowski, echaudro,
lorenzo
Currently when packet is shrunk via bpf_xdp_adjust_tail(), null ptr
dereference happens:
[1136314.192256] BUG: kernel NULL pointer dereference, address:
0000000000000034
[1136314.203943] #PF: supervisor read access in kernel mode
[1136314.213768] #PF: error_code(0x0000) - not-present page
[1136314.223550] PGD 0 P4D 0
[1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
[1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
[1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT,
BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
[1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
[1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
[1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
[1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX:
0000000000000000
[1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
ffffc9003168c000
[1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09:
0000000000010000
[1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12:
0000000000000001
[1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15:
0000000000000001
[1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000)
knlGS:0000000000000000
[1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4:
00000000007706f0
[1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[1136314.431890] PKRU: 55555554
[1136314.439143] Call Trace:
[1136314.446058] <IRQ>
[1136314.452465] ? __die+0x20/0x70
[1136314.459881] ? page_fault_oops+0x15b/0x440
[1136314.468305] ? exc_page_fault+0x6a/0x150
[1136314.476491] ? asm_exc_page_fault+0x22/0x30
[1136314.484927] ? __xdp_return+0x6c/0x210
[1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0
[1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
[1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice]
[1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice]
[1136314.528506] ice_napi_poll+0x467/0x670 [ice]
[1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0
[1136314.546010] __napi_poll+0x29/0x1b0
[1136314.553462] net_rx_action+0x133/0x270
[1136314.561619] __do_softirq+0xbe/0x28e
[1136314.569303] do_softirq+0x3f/0x60
This comes from __xdp_return() call with xdp_buff argument passed as
NULL which is supposed to be consumed by xsk_buff_free() call.
To address this properly, in ZC case, a node that represents the frag
being removed has to be pulled out of xskb_list. Introduce
appriopriate xsk helpers to do such node operation and use them
accordingly within bpf_xdp_adjust_tail().
Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++
net/core/filter.c | 48 +++++++++++++++++++++++++++++++-------
2 files changed, 65 insertions(+), 9 deletions(-)
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 81e02de3f453..123adc6d68c1 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -147,6 +147,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
return ret;
}
+static inline void xsk_buff_tail_del(struct xdp_buff *tail)
+{
+ struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
+
+ list_del(&xskb->xskb_list_node);
+}
+
+static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
+{
+ struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
+ struct xdp_buff_xsk *frag;
+
+ frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
+ xskb_list_node);
+ return &frag->xdp;
+}
+
static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
{
xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
@@ -333,6 +350,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
return NULL;
}
+static inline void xsk_buff_tail_del(struct xdp_buff *tail)
+{
+}
+
+static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
+{
+ return NULL;
+}
+
static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
{
}
diff --git a/net/core/filter.c b/net/core/filter.c
index adcfc2c25754..8ce13d73a660 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,7 @@
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h>
+#include <net/xdp_sock_drv.h>
#include "dev.h"
@@ -4075,6 +4076,42 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
return 0;
}
+static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
+ skb_frag_t *frag, int shrink)
+{
+ if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
+ struct xdp_buff *tail = xsk_buff_get_tail(xdp);
+
+ if (tail)
+ tail->data_end -= shrink;
+ }
+ skb_frag_size_sub(frag, shrink);
+}
+
+static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
+{
+ struct xdp_mem_info *mem_info = &xdp->rxq->mem;
+
+ if (skb_frag_size(frag) == shrink) {
+ struct page *page = skb_frag_page(frag);
+ struct xdp_buff *zc_frag = NULL;
+
+ if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
+ zc_frag = xsk_buff_get_tail(xdp);
+
+ if (zc_frag) {
+ xdp_buff_clear_frags_flag(zc_frag);
+ xsk_buff_tail_del(zc_frag);
+ }
+ }
+
+ __xdp_return(page_address(page), mem_info, false, zc_frag);
+ return true;
+ }
+ __shrink_data(xdp, mem_info, frag, shrink);
+ return false;
+}
+
static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
{
struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
@@ -4089,17 +4126,10 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
len_free += shrink;
offset -= shrink;
-
- if (skb_frag_size(frag) == shrink) {
- struct page *page = skb_frag_page(frag);
-
- __xdp_return(page_address(page), &xdp->rxq->mem,
- false, NULL);
+ if (shrink_data(xdp, frag, shrink))
n_frags_free++;
- } else {
- skb_frag_size_sub(frag, shrink);
+ else
break;
- }
}
sinfo->nr_frags -= n_frags_free;
sinfo->xdp_frags_size -= len_free;
--
2.34.1
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH v2 bpf 2/3] xsk: fix usage of multi-buffer BPF helpers for ZC XDP
2023-12-12 12:57 ` [PATCH v2 bpf 2/3] xsk: fix usage of multi-buffer BPF helpers for ZC XDP Maciej Fijalkowski
@ 2023-12-12 13:30 ` Magnus Karlsson
2023-12-12 13:36 ` Maciej Fijalkowski
0 siblings, 1 reply; 7+ messages in thread
From: Magnus Karlsson @ 2023-12-12 13:30 UTC (permalink / raw)
To: Maciej Fijalkowski
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson, bjorn,
echaudro, lorenzo
On Tue, 12 Dec 2023 at 13:58, Maciej Fijalkowski
<maciej.fijalkowski@intel.com> wrote:
>
> Currently when packet is shrunk via bpf_xdp_adjust_tail(), null ptr
> dereference happens:
>
> [1136314.192256] BUG: kernel NULL pointer dereference, address:
> 0000000000000034
> [1136314.203943] #PF: supervisor read access in kernel mode
> [1136314.213768] #PF: error_code(0x0000) - not-present page
> [1136314.223550] PGD 0 P4D 0
> [1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
> [1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT,
> BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
> [1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
> [1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
> [1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
> [1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX:
> 0000000000000000
> [1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> ffffc9003168c000
> [1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09:
> 0000000000010000
> [1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12:
> 0000000000000001
> [1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15:
> 0000000000000001
> [1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000)
> knlGS:0000000000000000
> [1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4:
> 00000000007706f0
> [1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> 0000000000000400
> [1136314.431890] PKRU: 55555554
> [1136314.439143] Call Trace:
> [1136314.446058] <IRQ>
> [1136314.452465] ? __die+0x20/0x70
> [1136314.459881] ? page_fault_oops+0x15b/0x440
> [1136314.468305] ? exc_page_fault+0x6a/0x150
> [1136314.476491] ? asm_exc_page_fault+0x22/0x30
> [1136314.484927] ? __xdp_return+0x6c/0x210
> [1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0
> [1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
> [1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice]
> [1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice]
> [1136314.528506] ice_napi_poll+0x467/0x670 [ice]
> [1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0
> [1136314.546010] __napi_poll+0x29/0x1b0
> [1136314.553462] net_rx_action+0x133/0x270
> [1136314.561619] __do_softirq+0xbe/0x28e
> [1136314.569303] do_softirq+0x3f/0x60
>
> This comes from __xdp_return() call with xdp_buff argument passed as
> NULL which is supposed to be consumed by xsk_buff_free() call.
>
> To address this properly, in ZC case, a node that represents the frag
> being removed has to be pulled out of xskb_list. Introduce
> appriopriate xsk helpers to do such node operation and use them
> accordingly within bpf_xdp_adjust_tail().
>
> Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
> Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> ---
> include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++
> net/core/filter.c | 48 +++++++++++++++++++++++++++++++-------
> 2 files changed, 65 insertions(+), 9 deletions(-)
>
> diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> index 81e02de3f453..123adc6d68c1 100644
> --- a/include/net/xdp_sock_drv.h
> +++ b/include/net/xdp_sock_drv.h
> @@ -147,6 +147,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
> return ret;
> }
>
> +static inline void xsk_buff_tail_del(struct xdp_buff *tail)
> +{
I think it would be easier to remember function calls if we are
consistent in the naming. Most of them are _verb_noun(), so I would
call it xsk_buff_del_tail().
Apart from that:
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part.
> + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
> +
> + list_del(&xskb->xskb_list_node);
> +}
> +
> +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> +{
> + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
> + struct xdp_buff_xsk *frag;
> +
> + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
> + xskb_list_node);
> + return &frag->xdp;
> +}
> +
> static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
> {
> xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
> @@ -333,6 +350,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
> return NULL;
> }
>
> +static inline void xsk_buff_tail_del(struct xdp_buff *tail)
> +{
> +}
> +
> +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> +{
> + return NULL;
> +}
> +
> static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
> {
> }
> diff --git a/net/core/filter.c b/net/core/filter.c
> index adcfc2c25754..8ce13d73a660 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -83,6 +83,7 @@
> #include <net/netfilter/nf_conntrack_bpf.h>
> #include <net/netkit.h>
> #include <linux/un.h>
> +#include <net/xdp_sock_drv.h>
>
> #include "dev.h"
>
> @@ -4075,6 +4076,42 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
> return 0;
> }
>
> +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
> + skb_frag_t *frag, int shrink)
> +{
> + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> + struct xdp_buff *tail = xsk_buff_get_tail(xdp);
> +
> + if (tail)
> + tail->data_end -= shrink;
> + }
> + skb_frag_size_sub(frag, shrink);
> +}
> +
> +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
> +{
> + struct xdp_mem_info *mem_info = &xdp->rxq->mem;
> +
> + if (skb_frag_size(frag) == shrink) {
> + struct page *page = skb_frag_page(frag);
> + struct xdp_buff *zc_frag = NULL;
> +
> + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> + zc_frag = xsk_buff_get_tail(xdp);
> +
> + if (zc_frag) {
> + xdp_buff_clear_frags_flag(zc_frag);
> + xsk_buff_tail_del(zc_frag);
> + }
> + }
> +
> + __xdp_return(page_address(page), mem_info, false, zc_frag);
> + return true;
> + }
> + __shrink_data(xdp, mem_info, frag, shrink);
> + return false;
> +}
> +
> static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
> {
> struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> @@ -4089,17 +4126,10 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
>
> len_free += shrink;
> offset -= shrink;
> -
> - if (skb_frag_size(frag) == shrink) {
> - struct page *page = skb_frag_page(frag);
> -
> - __xdp_return(page_address(page), &xdp->rxq->mem,
> - false, NULL);
> + if (shrink_data(xdp, frag, shrink))
> n_frags_free++;
> - } else {
> - skb_frag_size_sub(frag, shrink);
> + else
> break;
> - }
> }
> sinfo->nr_frags -= n_frags_free;
> sinfo->xdp_frags_size -= len_free;
> --
> 2.34.1
>
>
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH v2 bpf 2/3] xsk: fix usage of multi-buffer BPF helpers for ZC XDP
2023-12-12 13:30 ` Magnus Karlsson
@ 2023-12-12 13:36 ` Maciej Fijalkowski
0 siblings, 0 replies; 7+ messages in thread
From: Maciej Fijalkowski @ 2023-12-12 13:36 UTC (permalink / raw)
To: Magnus Karlsson
Cc: bpf, ast, daniel, andrii, netdev, magnus.karlsson, bjorn,
echaudro, lorenzo
On Tue, Dec 12, 2023 at 02:30:50PM +0100, Magnus Karlsson wrote:
> On Tue, 12 Dec 2023 at 13:58, Maciej Fijalkowski
> <maciej.fijalkowski@intel.com> wrote:
> >
> > Currently when packet is shrunk via bpf_xdp_adjust_tail(), null ptr
> > dereference happens:
> >
> > [1136314.192256] BUG: kernel NULL pointer dereference, address:
> > 0000000000000034
> > [1136314.203943] #PF: supervisor read access in kernel mode
> > [1136314.213768] #PF: error_code(0x0000) - not-present page
> > [1136314.223550] PGD 0 P4D 0
> > [1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
> > [1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
> > [1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT,
> > BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
> > [1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
> > [1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
> > [1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
> > [1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX:
> > 0000000000000000
> > [1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
> > ffffc9003168c000
> > [1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09:
> > 0000000000010000
> > [1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12:
> > 0000000000000001
> > [1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15:
> > 0000000000000001
> > [1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000)
> > knlGS:0000000000000000
> > [1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4:
> > 00000000007706f0
> > [1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> > 0000000000000000
> > [1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
> > 0000000000000400
> > [1136314.431890] PKRU: 55555554
> > [1136314.439143] Call Trace:
> > [1136314.446058] <IRQ>
> > [1136314.452465] ? __die+0x20/0x70
> > [1136314.459881] ? page_fault_oops+0x15b/0x440
> > [1136314.468305] ? exc_page_fault+0x6a/0x150
> > [1136314.476491] ? asm_exc_page_fault+0x22/0x30
> > [1136314.484927] ? __xdp_return+0x6c/0x210
> > [1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0
> > [1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
> > [1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice]
> > [1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice]
> > [1136314.528506] ice_napi_poll+0x467/0x670 [ice]
> > [1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0
> > [1136314.546010] __napi_poll+0x29/0x1b0
> > [1136314.553462] net_rx_action+0x133/0x270
> > [1136314.561619] __do_softirq+0xbe/0x28e
> > [1136314.569303] do_softirq+0x3f/0x60
> >
> > This comes from __xdp_return() call with xdp_buff argument passed as
> > NULL which is supposed to be consumed by xsk_buff_free() call.
> >
> > To address this properly, in ZC case, a node that represents the frag
> > being removed has to be pulled out of xskb_list. Introduce
> > appriopriate xsk helpers to do such node operation and use them
> > accordingly within bpf_xdp_adjust_tail().
> >
> > Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
> > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
> > ---
> > include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++
> > net/core/filter.c | 48 +++++++++++++++++++++++++++++++-------
> > 2 files changed, 65 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
> > index 81e02de3f453..123adc6d68c1 100644
> > --- a/include/net/xdp_sock_drv.h
> > +++ b/include/net/xdp_sock_drv.h
> > @@ -147,6 +147,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
> > return ret;
> > }
> >
> > +static inline void xsk_buff_tail_del(struct xdp_buff *tail)
> > +{
>
> I think it would be easier to remember function calls if we are
> consistent in the naming. Most of them are _verb_noun(), so I would
> call it xsk_buff_del_tail().
Sounds good. I can address that once I'll be sure that bots are happy.
>
> Apart from that:
>
> Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part.
>
> > + struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
> > +
> > + list_del(&xskb->xskb_list_node);
> > +}
> > +
> > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> > +{
> > + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
> > + struct xdp_buff_xsk *frag;
> > +
> > + frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
> > + xskb_list_node);
> > + return &frag->xdp;
> > +}
> > +
> > static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
> > {
> > xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
> > @@ -333,6 +350,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
> > return NULL;
> > }
> >
> > +static inline void xsk_buff_tail_del(struct xdp_buff *tail)
> > +{
> > +}
> > +
> > +static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
> > +{
> > + return NULL;
> > +}
> > +
> > static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
> > {
> > }
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index adcfc2c25754..8ce13d73a660 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -83,6 +83,7 @@
> > #include <net/netfilter/nf_conntrack_bpf.h>
> > #include <net/netkit.h>
> > #include <linux/un.h>
> > +#include <net/xdp_sock_drv.h>
> >
> > #include "dev.h"
> >
> > @@ -4075,6 +4076,42 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
> > return 0;
> > }
> >
> > +static void __shrink_data(struct xdp_buff *xdp, struct xdp_mem_info *mem_info,
> > + skb_frag_t *frag, int shrink)
> > +{
> > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> > + struct xdp_buff *tail = xsk_buff_get_tail(xdp);
> > +
> > + if (tail)
> > + tail->data_end -= shrink;
> > + }
> > + skb_frag_size_sub(frag, shrink);
> > +}
> > +
> > +static bool shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink)
> > +{
> > + struct xdp_mem_info *mem_info = &xdp->rxq->mem;
> > +
> > + if (skb_frag_size(frag) == shrink) {
> > + struct page *page = skb_frag_page(frag);
> > + struct xdp_buff *zc_frag = NULL;
> > +
> > + if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
> > + zc_frag = xsk_buff_get_tail(xdp);
> > +
> > + if (zc_frag) {
> > + xdp_buff_clear_frags_flag(zc_frag);
> > + xsk_buff_tail_del(zc_frag);
> > + }
> > + }
> > +
> > + __xdp_return(page_address(page), mem_info, false, zc_frag);
> > + return true;
> > + }
> > + __shrink_data(xdp, mem_info, frag, shrink);
> > + return false;
> > +}
> > +
> > static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
> > {
> > struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
> > @@ -4089,17 +4126,10 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
> >
> > len_free += shrink;
> > offset -= shrink;
> > -
> > - if (skb_frag_size(frag) == shrink) {
> > - struct page *page = skb_frag_page(frag);
> > -
> > - __xdp_return(page_address(page), &xdp->rxq->mem,
> > - false, NULL);
> > + if (shrink_data(xdp, frag, shrink))
> > n_frags_free++;
> > - } else {
> > - skb_frag_size_sub(frag, shrink);
> > + else
> > break;
> > - }
> > }
> > sinfo->nr_frags -= n_frags_free;
> > sinfo->xdp_frags_size -= len_free;
> > --
> > 2.34.1
> >
> >
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v2 bpf 3/3] ice: work on pre-XDP prog frag count
2023-12-12 12:57 [PATCH v2 bpf 0/3] net: bpf_xdp_adjust_tail() fixes Maciej Fijalkowski
2023-12-12 12:57 ` [PATCH v2 bpf 1/3] xsk: recycle buffer in case Rx queue was full Maciej Fijalkowski
2023-12-12 12:57 ` [PATCH v2 bpf 2/3] xsk: fix usage of multi-buffer BPF helpers for ZC XDP Maciej Fijalkowski
@ 2023-12-12 12:57 ` Maciej Fijalkowski
2 siblings, 0 replies; 7+ messages in thread
From: Maciej Fijalkowski @ 2023-12-12 12:57 UTC (permalink / raw)
To: bpf, ast, daniel, andrii
Cc: netdev, magnus.karlsson, bjorn, maciej.fijalkowski, echaudro,
lorenzo
Fix an OOM panic in XDP_DRV mode when a XDP program shrinks a
multi-buffer packet by 4k bytes and then redirects it to an AF_XDP
socket.
Since support for handling multi-buffer frames was added to XDP, usage
of bpf_xdp_adjust_tail() helper within XDP program can free the page
that given fragment occupies and in turn decrease the fragment count
within skb_shared_info that is embedded in xdp_buff struct. In current
ice driver codebase, it can become problematic when page recycling logic
decides not to reuse the page. In such case, __page_frag_cache_drain()
is used with ice_rx_buf::pagecnt_bias that was not adjusted after
refcount of page was changed by XDP prog which in turn does not drain
the refcount to 0 and page is never freed.
To address this, let us store the count of frags before the XDP program
was executed on Rx ring struct. This will be used to compare with
current frag count from skb_shared_info embedded in xdp_buff. A smaller
value in the latter indicates that XDP prog freed frag(s). Then, for
given delta decrement pagecnt_bias for XDP_DROP verdict.
While at it, let us also handle the EOP frag within
ice_set_rx_bufs_act() to make our life easier, so all of the adjustments
needed to be applied against freed frags are performed in the single
place.
Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
drivers/net/ethernet/intel/ice/ice_txrx.c | 14 ++++++---
drivers/net/ethernet/intel/ice/ice_txrx.h | 1 +
drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 31 +++++++++++++------
3 files changed, 32 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 9e97ea863068..6878448ba112 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -600,9 +600,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
ret = ICE_XDP_CONSUMED;
}
exit:
- rx_buf->act = ret;
- if (unlikely(xdp_buff_has_frags(xdp)))
- ice_set_rx_bufs_act(xdp, rx_ring, ret);
+ ice_set_rx_bufs_act(xdp, rx_ring, ret);
}
/**
@@ -890,14 +888,17 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
}
if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
- if (unlikely(xdp_buff_has_frags(xdp)))
- ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
+ ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
return -ENOMEM;
}
__skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
rx_buf->page_offset, size);
sinfo->xdp_frags_size += size;
+ /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail()
+ * can pop off frags but driver has to handle it on its own
+ */
+ rx_ring->nr_frags = sinfo->nr_frags;
if (page_is_pfmemalloc(rx_buf->page))
xdp_buff_set_frag_pfmemalloc(xdp);
@@ -1249,6 +1250,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
xdp->data = NULL;
rx_ring->first_desc = ntc;
+ rx_ring->nr_frags = 0;
continue;
construct_skb:
if (likely(ice_ring_uses_build_skb(rx_ring)))
@@ -1264,10 +1266,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
ICE_XDP_CONSUMED);
xdp->data = NULL;
rx_ring->first_desc = ntc;
+ rx_ring->nr_frags = 0;
break;
}
xdp->data = NULL;
rx_ring->first_desc = ntc;
+ rx_ring->nr_frags = 0;
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index daf7b9dbb143..b28b9826bbcd 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -333,6 +333,7 @@ struct ice_rx_ring {
struct ice_channel *ch;
struct ice_tx_ring *xdp_ring;
struct xsk_buff_pool *xsk_pool;
+ u32 nr_frags;
dma_addr_t dma; /* physical address of ring */
u64 cached_phctime;
u16 rx_buf_len;
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
index 115969ecdf7b..b0e56675f98b 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
@@ -12,26 +12,39 @@
* act: action to store onto Rx buffers related to XDP buffer parts
*
* Set action that should be taken before putting Rx buffer from first frag
- * to one before last. Last one is handled by caller of this function as it
- * is the EOP frag that is currently being processed. This function is
- * supposed to be called only when XDP buffer contains frags.
+ * to the last.
*/
static inline void
ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring,
const unsigned int act)
{
- const struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
- u32 first = rx_ring->first_desc;
- u32 nr_frags = sinfo->nr_frags;
+ u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
+ u32 nr_frags = rx_ring->nr_frags + 1;
+ u32 idx = rx_ring->first_desc;
u32 cnt = rx_ring->count;
struct ice_rx_buf *buf;
for (int i = 0; i < nr_frags; i++) {
- buf = &rx_ring->rx_buf[first];
+ buf = &rx_ring->rx_buf[idx];
buf->act = act;
- if (++first == cnt)
- first = 0;
+ if (++idx == cnt)
+ idx = 0;
+ }
+
+ /* adjust pagecnt_bias on frags freed by XDP prog */
+ if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) {
+ u32 delta = rx_ring->nr_frags - sinfo_frags;
+
+ while (delta) {
+ if (idx == 0)
+ idx = cnt - 1;
+ else
+ idx--;
+ buf = &rx_ring->rx_buf[idx];
+ buf->pagecnt_bias--;
+ delta--;
+ }
}
}
--
2.34.1
^ permalink raw reply related [flat|nested] 7+ messages in thread