All of lore.kernel.org
 help / color / mirror / Atom feed
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
To: Eryk Kubanski <e.kubanski@partner.samsung.com>
Cc: Stanislav Fomichev <stfomichev@gmail.com>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"bjorn@kernel.org" <bjorn@kernel.org>,
	"magnus.karlsson@intel.com" <magnus.karlsson@intel.com>,
	"jonathan.lemon@gmail.com" <jonathan.lemon@gmail.com>
Subject: Re: Re: Re: Re: [PATCH bpf v2] xsk: Fix out of order segment free in __xsk_generic_xmit()
Date: Mon, 9 Jun 2025 21:41:57 +0200	[thread overview]
Message-ID: <aEc5BVcUJyb+qlg7@boxer> (raw)
In-Reply-To: <20250604141521eucms1p26b794744fb73f84f223927c36ade7239@eucms1p2>

On Wed, Jun 04, 2025 at 04:15:21PM +0200, Eryk Kubanski wrote:
> > Thanks for shedding a bit more light on it. In the future it would be nice
> > if you would be able to come up with a reproducer of a bug that others
> > could use on their side. Plus the overview of your deployment from the
> > beginning would also help with people understanding the issue :)
> 
> Sure, sorry for not giving that in advance, I found this issue
> during code analysis, not during deployment.
> It's not that simple to catch.
> I thought that in finite time we will agree :D.
> Next patchsets from me will have more information up-front.
> 
> > I'm looking into it, bottom line is that we discussed it with Magnus and
> > agree that issue you're reporting needs to be addressed.
> > I'll get back to you to discuss potential way of attacking it.
> > Thanks!
> 
> Thank you.
> Will this be discussed in the same mailing chain?

I've come with something as below. Idea is to embed addr at the end of
linear part of skb/at the end of page frag. For first case we account 8
more bytes when calling sock_alloc_send_skb(), for the latter we alloc
whole page anyways so we can just use the last 8 bytes. then in destructor
we have access to addrs used during xmit descriptor production. This
solution is free of additional struct members so performance-wise it
should not be as impactful as previous approach.

---
 net/xdp/xsk.c       | 37 ++++++++++++++++++++++++++++++-------
 net/xdp/xsk_queue.h |  8 ++++++++
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 72c000c0ae5f..22f314ea9dc2 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -528,24 +528,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
 	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
 }
 
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
 {
 	unsigned long flags;
 	int ret;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	ret = xskq_prod_reserve_addr(pool->cq, addr);
+	ret = xskq_prod_reserve(pool->cq);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 
 	return ret;
 }
 
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, struct sk_buff *skb)
 {
+	size_t addr_sz = sizeof(((struct xdp_desc *)0)->addr);
 	unsigned long flags;
+	int nr_frags, i;
+	u64 addr;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
-	xskq_prod_submit_n(pool->cq, n);
+
+	addr = *(u64 *)(skb->head + skb->end - addr_sz);
+	xskq_prod_write_addr(pool->cq, addr);
+
+	nr_frags = skb_shinfo(skb)->nr_frags;
+
+	for (i = 0; i < nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		addr = *(u64 *)(skb_frag_address(frag) + PAGE_SIZE - addr_sz);
+		xskq_prod_write_addr(pool->cq, addr);
+	}
+
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
@@ -572,7 +587,7 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 		*compl->tx_timestamp = ktime_get_tai_fast_ns();
 	}
 
-	xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+	xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, skb);
 	sock_wfree(skb);
 }
 
@@ -656,6 +671,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 				     struct xdp_desc *desc)
 {
+	size_t addr_sz = sizeof(desc->addr);
 	struct xsk_tx_metadata *meta = NULL;
 	struct net_device *dev = xs->dev;
 	struct sk_buff *skb = xs->skb;
@@ -671,6 +687,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 	} else {
 		u32 hr, tr, len;
 		void *buffer;
+		u8 *trailer;
 
 		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
 		len = desc->len;
@@ -680,7 +697,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 			hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
 			tr = dev->needed_tailroom;
-			skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+			skb = sock_alloc_send_skb(&xs->sk,
+						  hr + len + tr + addr_sz,
+						  1, &err);
 			if (unlikely(!skb))
 				goto free_err;
 
@@ -690,6 +709,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			err = skb_store_bits(skb, 0, buffer, len);
 			if (unlikely(err))
 				goto free_err;
+			trailer = skb->head + skb->end - addr_sz;
+			memcpy(trailer, &desc->addr, addr_sz);
+
 		} else {
 			int nr_frags = skb_shinfo(skb)->nr_frags;
 			struct page *page;
@@ -708,6 +730,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 
 			vaddr = kmap_local_page(page);
 			memcpy(vaddr, buffer, len);
+			memcpy(vaddr + PAGE_SIZE - addr_sz, &desc->addr, addr_sz);
 			kunmap_local(vaddr);
 
 			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
@@ -807,7 +830,7 @@ static int __xsk_generic_xmit(struct sock *sk)
 		 * if there is space in it. This avoids having to implement
 		 * any buffering in the Tx path.
 		 */
-		err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+		err = xsk_cq_reserve_locked(xs->pool);
 		if (err) {
 			err = -EAGAIN;
 			goto out;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..9cd65d1bc81b 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -390,6 +390,14 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
 	return 0;
 }
 
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u64 addr)
+{
+	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+	/* A, matches D */
+	ring->desc[q->ring->producer++ & q->ring_mask] = addr;
+}
+
 static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
 					      u32 nb_entries)
 {


> 
> Technically we need to tie descriptor write-back
> with skb lifetime.
> xsk_build_skb() function builds skb for TX,
> if i understand correctly this can work both ways
> either we perform zero-copy, so specific buffer
> page is attached to skb with given offset and size.
> OR perform the copy.
> 
> If there was no zerocopy case, we could store it
> on stack array and simply recycle descriptor back
> right away without waiting for SKB completion.
> 
> This zero-copy case makes it impossible right?
> We need to store these descriptors somewhere else
> and tie it to SKB destruction :(.

  reply	other threads:[~2025-06-09 19:42 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20250530103506eucas1p1e4091678f4157b928ddfa6f6534a0009@eucas1p1.samsung.com>
2025-05-30 10:34 ` [PATCH bpf v2] xsk: Fix out of order segment free in __xsk_generic_xmit() e.kubanski
2025-05-30 11:56   ` Eryk Kubanski
2025-05-30 16:07   ` Stanislav Fomichev
2025-06-02  9:27     ` Eryk Kubanski
2025-06-02 15:28       ` Stanislav Fomichev
2025-06-02 15:58         ` Eryk Kubanski
2025-06-02 16:03         ` Maciej Fijalkowski
2025-06-02 16:18           ` Eryk Kubanski
2025-06-04 13:50             ` Maciej Fijalkowski
2025-06-04 14:15               ` Eryk Kubanski
2025-06-09 19:41                 ` Maciej Fijalkowski [this message]
2025-06-10  9:35                   ` Eryk Kubanski
     [not found]       ` <CGME20250530103506eucas1p1e4091678f4157b928ddfa6f6534a0009@eucms1p3>
2025-06-10  9:11         ` Eryk Kubanski
2025-06-11 13:10           ` Maciej Fijalkowski
2025-07-03 23:37       ` Jason Xing
2025-07-04 12:34         ` Maciej Fijalkowski
2025-07-04 15:29           ` Jason Xing
2025-06-04 14:41   ` kernel test robot

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aEc5BVcUJyb+qlg7@boxer \
    --to=maciej.fijalkowski@intel.com \
    --cc=bjorn@kernel.org \
    --cc=e.kubanski@partner.samsung.com \
    --cc=jonathan.lemon@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=magnus.karlsson@intel.com \
    --cc=netdev@vger.kernel.org \
    --cc=stfomichev@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.