From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
To: bpf@vger.kernel.org, ast@kernel.org, daniel@iogearbox.net,
andrii@kernel.org
Cc: netdev@vger.kernel.org, magnus.karlsson@intel.com,
bjorn@kernel.org, maciej.fijalkowski@intel.com,
echaudro@redhat.com, lorenzo@kernel.org, martin.lau@linux.dev,
tirthendu.sarkar@intel.com, john.fastabend@gmail.com,
horms@kernel.org, kuba@kernel.org
Subject: [PATCH v6 bpf 04/11] ice: work on pre-XDP prog frag count
Date: Wed, 24 Jan 2024 20:15:55 +0100 [thread overview]
Message-ID: <20240124191602.566724-5-maciej.fijalkowski@intel.com> (raw)
In-Reply-To: <20240124191602.566724-1-maciej.fijalkowski@intel.com>
Fix an OOM panic in XDP_DRV mode when a XDP program shrinks a
multi-buffer packet by 4k bytes and then redirects it to an AF_XDP
socket.
Since support for handling multi-buffer frames was added to XDP, usage
of bpf_xdp_adjust_tail() helper within XDP program can free the page
that given fragment occupies and in turn decrease the fragment count
within skb_shared_info that is embedded in xdp_buff struct. In current
ice driver codebase, it can become problematic when page recycling logic
decides not to reuse the page. In such case, __page_frag_cache_drain()
is used with ice_rx_buf::pagecnt_bias that was not adjusted after
refcount of page was changed by XDP prog which in turn does not drain
the refcount to 0 and page is never freed.
To address this, let us store the count of frags before the XDP program
was executed on Rx ring struct. This will be used to compare with
current frag count from skb_shared_info embedded in xdp_buff. A smaller
value in the latter indicates that XDP prog freed frag(s). Then, for
given delta decrement pagecnt_bias for XDP_DROP verdict.
While at it, let us also handle the EOP frag within
ice_set_rx_bufs_act() to make our life easier, so all of the adjustments
needed to be applied against freed frags are performed in the single
place.
Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side")
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
---
drivers/net/ethernet/intel/ice/ice_txrx.c | 14 ++++++---
drivers/net/ethernet/intel/ice/ice_txrx.h | 1 +
drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 31 +++++++++++++------
3 files changed, 32 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 74d13cc5a3a7..0c9b4aa8a049 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -603,9 +603,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
ret = ICE_XDP_CONSUMED;
}
exit:
- rx_buf->act = ret;
- if (unlikely(xdp_buff_has_frags(xdp)))
- ice_set_rx_bufs_act(xdp, rx_ring, ret);
+ ice_set_rx_bufs_act(xdp, rx_ring, ret);
}
/**
@@ -893,14 +891,17 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
}
if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
- if (unlikely(xdp_buff_has_frags(xdp)))
- ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
+ ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
return -ENOMEM;
}
__skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
rx_buf->page_offset, size);
sinfo->xdp_frags_size += size;
+ /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail()
+ * can pop off frags but driver has to handle it on its own
+ */
+ rx_ring->nr_frags = sinfo->nr_frags;
if (page_is_pfmemalloc(rx_buf->page))
xdp_buff_set_frag_pfmemalloc(xdp);
@@ -1251,6 +1252,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
xdp->data = NULL;
rx_ring->first_desc = ntc;
+ rx_ring->nr_frags = 0;
continue;
construct_skb:
if (likely(ice_ring_uses_build_skb(rx_ring)))
@@ -1266,10 +1268,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
ICE_XDP_CONSUMED);
xdp->data = NULL;
rx_ring->first_desc = ntc;
+ rx_ring->nr_frags = 0;
break;
}
xdp->data = NULL;
rx_ring->first_desc = ntc;
+ rx_ring->nr_frags = 0;
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
index b3379ff73674..af955b0e5dc5 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -358,6 +358,7 @@ struct ice_rx_ring {
struct ice_tx_ring *xdp_ring;
struct ice_rx_ring *next; /* pointer to next ring in q_vector */
struct xsk_buff_pool *xsk_pool;
+ u32 nr_frags;
dma_addr_t dma; /* physical address of ring */
u16 rx_buf_len;
u8 dcb_tc; /* Traffic class of ring */
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
index 762047508619..afcead4baef4 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
@@ -12,26 +12,39 @@
* act: action to store onto Rx buffers related to XDP buffer parts
*
* Set action that should be taken before putting Rx buffer from first frag
- * to one before last. Last one is handled by caller of this function as it
- * is the EOP frag that is currently being processed. This function is
- * supposed to be called only when XDP buffer contains frags.
+ * to the last.
*/
static inline void
ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring,
const unsigned int act)
{
- const struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
- u32 first = rx_ring->first_desc;
- u32 nr_frags = sinfo->nr_frags;
+ u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
+ u32 nr_frags = rx_ring->nr_frags + 1;
+ u32 idx = rx_ring->first_desc;
u32 cnt = rx_ring->count;
struct ice_rx_buf *buf;
for (int i = 0; i < nr_frags; i++) {
- buf = &rx_ring->rx_buf[first];
+ buf = &rx_ring->rx_buf[idx];
buf->act = act;
- if (++first == cnt)
- first = 0;
+ if (++idx == cnt)
+ idx = 0;
+ }
+
+ /* adjust pagecnt_bias on frags freed by XDP prog */
+ if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) {
+ u32 delta = rx_ring->nr_frags - sinfo_frags;
+
+ while (delta) {
+ if (idx == 0)
+ idx = cnt - 1;
+ else
+ idx--;
+ buf = &rx_ring->rx_buf[idx];
+ buf->pagecnt_bias--;
+ delta--;
+ }
}
}
--
2.34.1
next prev parent reply other threads:[~2024-01-24 19:16 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-01-24 19:15 [PATCH v6 bpf 00/11] net: bpf_xdp_adjust_tail() and Intel mbuf fixes Maciej Fijalkowski
2024-01-24 19:15 ` [PATCH v6 bpf 01/11] xsk: recycle buffer in case Rx queue was full Maciej Fijalkowski
2024-01-24 19:15 ` [PATCH v6 bpf 02/11] xsk: make xsk_buff_pool responsible for clearing xdp_buff::flags Maciej Fijalkowski
2024-01-25 8:00 ` Magnus Karlsson
2024-01-24 19:15 ` [PATCH v6 bpf 03/11] xsk: fix usage of multi-buffer BPF helpers for ZC XDP Maciej Fijalkowski
2024-01-24 19:15 ` Maciej Fijalkowski [this message]
2024-01-25 2:25 ` [PATCH v6 bpf 04/11] ice: work on pre-XDP prog frag count Brett Creeley
2024-01-25 2:34 ` Alexei Starovoitov
2024-01-25 15:53 ` Brett Creeley
2024-01-24 19:15 ` [PATCH v6 bpf 05/11] i40e: handle multi-buffer packets that are shrunk by xdp prog Maciej Fijalkowski
2024-01-24 19:15 ` [PATCH v6 bpf 06/11] ice: remove redundant xdp_rxq_info registration Maciej Fijalkowski
2024-01-24 19:15 ` [PATCH v6 bpf 07/11] intel: xsk: initialize skb_frag_t::bv_offset in ZC drivers Maciej Fijalkowski
2024-01-24 19:15 ` [PATCH v6 bpf 08/11] ice: update xdp_rxq_info::frag_size for ZC enabled Rx queue Maciej Fijalkowski
2024-01-25 8:02 ` Magnus Karlsson
2024-01-24 19:16 ` [PATCH v6 bpf 09/11] xdp: reflect tail increase for MEM_TYPE_XSK_BUFF_POOL Maciej Fijalkowski
2024-01-24 19:16 ` [PATCH v6 bpf 10/11] i40e: set xdp_rxq_info::frag_size Maciej Fijalkowski
2024-01-25 8:03 ` Magnus Karlsson
2024-01-24 19:16 ` [PATCH v6 bpf 11/11] i40e: update xdp_rxq_info::frag_size for ZC enabled Rx queue Maciej Fijalkowski
2024-01-25 0:40 ` [PATCH v6 bpf 00/11] net: bpf_xdp_adjust_tail() and Intel mbuf fixes patchwork-bot+netdevbpf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240124191602.566724-5-maciej.fijalkowski@intel.com \
--to=maciej.fijalkowski@intel.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bjorn@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=echaudro@redhat.com \
--cc=horms@kernel.org \
--cc=john.fastabend@gmail.com \
--cc=kuba@kernel.org \
--cc=lorenzo@kernel.org \
--cc=magnus.karlsson@intel.com \
--cc=martin.lau@linux.dev \
--cc=netdev@vger.kernel.org \
--cc=tirthendu.sarkar@intel.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox