From: Jason Xing <kerneljasonxing@gmail.com>
To: davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
pabeni@redhat.com, bjorn@kernel.org, magnus.karlsson@intel.com,
maciej.fijalkowski@intel.com, jonathan.lemon@gmail.com,
sdf@fomichev.me, ast@kernel.org, daniel@iogearbox.net,
hawk@kernel.org, john.fastabend@gmail.com
Cc: bpf@vger.kernel.org, netdev@vger.kernel.org,
Jason Xing <kernelxing@tencent.com>
Subject: [PATCH RFC net-next v4 03/14] xsk: add xsk_alloc_batch_skb() to build skbs in batch
Date: Wed, 15 Apr 2026 16:26:43 +0800 [thread overview]
Message-ID: <20260415082654.21026-4-kerneljasonxing@gmail.com> (raw)
In-Reply-To: <20260415082654.21026-1-kerneljasonxing@gmail.com>
From: Jason Xing <kernelxing@tencent.com>
Support allocating and building skbs in batch.
There are three steps for one batched allocation:
1. Reserve the skb and count the skb->truesize. This paves the way
for a later patch to speed up small data transmission by
diminishing the impact of kmalloc_reserve().
2. Add the total truesize to sk_wmem_alloc in one go. Loads and
stores of sk_wmem_alloc are time-consuming, so batching the update
improves performance.
3. Copy data and then finish initialization of each skb.
This patch uses kmem_cache_alloc_bulk() to complete the batch allocation,
which relies on the global common cache 'net_hotdata.skbuff_cache'. Use
an xsk-specific standalone skb cache (namely, xs->batch.skb_cache) to
store allocated skbs instead of resorting to napi_alloc_cache, which was
designed for softirq context.
After allocating memory for each of the skbs, in a 'for' loop, the patch
borrows part of __alloc_skb() to initialize the skb and then calls
xsk_build_skb() to complete the rest of the initialization process, such
as copying data. To keep the allocation function lean, it only uses the
pieces that are actually needed; for example, skb_set_owner_w() is
reduced to two lines of code.
Add batch.send_queue and use skb->list to link the skbs into one chain
so that they can be sent easily, as shown in the subsequent patches.
As for freeing skbs, napi_consume_skb() in the tx completion path
puts the skb back into the global cache 'net_hotdata.skbuff_cache', which
implements deferred skb freeing; this avoids freeing skbs one by one
and improves performance.
Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
include/net/xdp_sock.h | 3 +
net/core/skbuff.c | 121 +++++++++++++++++++++++++++++++++++++++++
net/xdp/xsk.c | 7 +++
3 files changed, 131 insertions(+)
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 90c709fd1239..84f0aee3fb10 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -47,8 +47,10 @@ struct xsk_map {
struct xsk_batch {
u32 generic_xmit_batch;
+ unsigned int skb_count; /* number of skbs currently held in skb_cache */
struct sk_buff **skb_cache;
struct xdp_desc *desc_cache;
+ struct sk_buff_head send_queue; /* built skbs are appended here with __skb_queue_tail() */
};
struct xdp_sock {
@@ -136,6 +138,7 @@ INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *));
struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct sk_buff *allocated_skb,
struct xdp_desc *desc);
+int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err);
/**
* xsk_tx_metadata_to_compl - Save enough relevant metadata information
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4045d7c484a1..f29cecacd8bb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -83,6 +83,7 @@
#include <net/psp/types.h>
#include <net/dropreason.h>
#include <net/xdp_sock.h>
+#include <net/xsk_buff_pool.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -647,6 +648,126 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
return obj;
}
+#ifdef CONFIG_XDP_SOCKETS
+int xsk_alloc_batch_skb(struct xdp_sock *xs, u32 nb_pkts, u32 nb_descs, int *err)
+{ /* Returns the number of descriptors Phase 3 got through; *err is written only on the early-exit paths. */
+ struct xsk_batch *batch = &xs->batch;
+ struct xdp_desc *descs = batch->desc_cache; /* descriptors to turn into skbs */
+ struct sk_buff **skbs = batch->skb_cache; /* skb heads obtained from net_hotdata.skbuff_cache */
+ u32 alloc_descs, base_len, wmem, sndbuf;
+ gfp_t gfp_mask = xs->sk.sk_allocation;
+ u32 skb_count = batch->skb_count; /* heads currently held in skb_cache */
+ struct net_device *dev = xs->dev;
+ unsigned int total_truesize = 0;
+ struct sk_buff *skb = NULL;
+ int node = NUMA_NO_NODE;
+ u32 i = 0, j, k = 0; /* i: skbs prepared (Phase 1), j: descriptor index, k: skbs taken (Phase 3) */
+ bool need_alloc;
+ u8 *data;
+
+ base_len = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+ if (!(dev->priv_flags & IFF_TX_SKB_NO_LINEAR)) /* tailroom is only reserved when this flag is clear */
+ base_len += dev->needed_tailroom;
+
+ if (xs->skb) /* an skb is still pending from an earlier multi-descriptor packet: one fewer head is needed */
+ nb_pkts--; /* NOTE(review): u32 wraps if nb_pkts == 0 here - confirm callers never pass that */
+
+ if (skb_count >= nb_pkts) /* the cache already holds enough heads */
+ goto alloc_data;
+
+ skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, /* top up the cache with the missing heads */
+ gfp_mask,
+ nb_pkts - skb_count,
+ (void **)&skbs[skb_count]);
+ if (skb_count < nb_pkts) /* bulk allocation came up short: cap the packets to the heads on hand */
+ nb_pkts = skb_count;
+
+alloc_data:
+ /*
+ * Phase 1: Pre-scan descriptors; at every packet start take an skb head
+ * from the top of skb_cache, attach a data buffer and sum its truesize
+ * so the sk_wmem_alloc charge can be batched in Phase 2.
+ */
+ need_alloc = !xs->skb; /* with an skb pending, the first descriptor does not start a new packet */
+ wmem = sk_wmem_alloc_get(&xs->sk); /* sampled once, before the loop */
+ sndbuf = READ_ONCE(xs->sk.sk_sndbuf);
+ for (j = 0; j < nb_descs; j++) {
+ if (need_alloc) {
+ u32 size = base_len;
+
+ if (!(dev->priv_flags & IFF_TX_SKB_NO_LINEAR)) /* descriptor length is added to the buffer size when the flag is clear */
+ size += descs[j].len;
+
+ if (i >= nb_pkts) { /* no skb head left for this packet */
+ *err = -EAGAIN;
+ break;
+ }
+
+ if (wmem + size + total_truesize > sndbuf) { /* this packet would push past sk_sndbuf */
+ *err = -EAGAIN;
+ break;
+ }
+
+ skb = skbs[skb_count - 1 - i]; /* heads are consumed from the end of the array */
+ skbuff_clear(skb);
+ data = kmalloc_reserve(&size, gfp_mask, node, skb);
+ if (unlikely(!data)) { /* i is not advanced, so this head stays counted in skb_cache */
+ *err = -ENOBUFS;
+ break;
+ }
+ __finalize_skb_around(skb, data, size);
+ /* Open-coded skb_set_owner_w(); the wmem charge itself is deferred to Phase 2 */
+ skb->sk = &xs->sk;
+ skb->destructor = sock_wfree;
+ total_truesize += skb->truesize;
+ i++;
+ need_alloc = false;
+ }
+ if (!xp_mb_desc(&descs[j])) /* packet ends here: the next descriptor starts a new skb */
+ need_alloc = true;
+ }
+ alloc_descs = j; /* number of descriptors Phase 1 covered */
+
+ /*
+ * Phase 2: Batch charge sk_wmem_alloc.
+ * One refcount_add() of the summed truesize replaces N per-skb
+ * skb_set_owner_w() charges, which is a notable performance gain.
+ */
+ if (total_truesize)
+ refcount_add(total_truesize, &xs->sk.sk_wmem_alloc);
+
+ /* Phase 3: Build skbs from the descriptors, taking heads in the same order as Phase 1 */
+ for (j = 0; j < alloc_descs; j++) {
+ if (!xs->skb) { /* packet start: take the next prepared skb */
+ skb = skbs[skb_count - 1 - k];
+ k++;
+ }
+
+ skb = xsk_build_skb(xs, skb, &descs[j]); /* NOTE(review): with xs->skb pending, skb is a leftover value here - presumably ignored by xsk_build_skb(); confirm */
+ if (IS_ERR(skb)) {
+ *err = PTR_ERR(skb);
+ break;
+ }
+
+ if (xp_mb_desc(&descs[j])) { /* more descriptors follow: keep the skb pending in xs->skb */
+ xs->skb = skb;
+ continue;
+ }
+
+ xs->skb = NULL; /* packet complete: hand it over to the send queue */
+ __skb_queue_tail(&batch->send_queue, skb);
+ }
+
+ /* Phase 4: Free skbs prepared in Phase 1 that Phase 3 never took (k < i) */
+ while (k < i) /* NOTE(review): these were charged in Phase 2; presumably the sock_wfree destructor uncharges them - confirm */
+ kfree_skb(skbs[skb_count - 1 - k++]);
+
+ batch->skb_count = skb_count - i; /* heads left in the cache for the next call */
+
+ return j;
+}
+#endif
+
/* Allocate a new skbuff. We do this ourselves so we can fill in a few
* 'private' fields and also do memory statistics to find all the
* [BEEP] leaks.
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ecd5b9c424b8..f97bc9cf9b9a 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -25,6 +25,7 @@
#include <linux/vmalloc.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
+#include <net/hotdata.h>
#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/xdp.h>
@@ -1230,10 +1231,15 @@ static void xsk_delete_from_maps(struct xdp_sock *xs)
static void xsk_batch_reset(struct xsk_batch *batch, struct sk_buff **skbs,
struct xdp_desc *descs, unsigned int size)
{
+ if (batch->skb_count) /* hand skbs still held in the cache back to the slab before the array itself is freed */
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache,
+ batch->skb_count,
+ (void **)batch->skb_cache);
kfree(batch->skb_cache);
kvfree(batch->desc_cache);
batch->skb_cache = skbs;
batch->desc_cache = descs;
+ batch->skb_count = 0; /* nothing is counted as cached in the replacement array */
batch->generic_xmit_batch = size;
}
@@ -1946,6 +1952,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
INIT_LIST_HEAD(&xs->map_list);
spin_lock_init(&xs->map_list_lock);
+ __skb_queue_head_init(&xs->batch.send_queue);
mutex_lock(&net->xdp.lock);
sk_add_node_rcu(sk, &net->xdp.list);
--
2.41.3
next prev parent reply other threads:[~2026-04-15 8:27 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-15 8:26 [PATCH RFC net-next v4 00/14] xsk: batch xmit in copy mode Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 01/14] xsk: introduce XDP_GENERIC_XMIT_BATCH setsockopt Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 02/14] xsk: extend xsk_build_skb() to support passing an already allocated skb Jason Xing
2026-04-15 8:26 ` Jason Xing [this message]
2026-04-15 8:26 ` [PATCH RFC net-next v4 04/14] xsk: cache data buffers to avoid frequently calling kmalloc_reserve Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 05/14] xsk: add direct xmit in batch function Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 06/14] xsk: support dynamic xmit.more control for batch xmit Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 07/14] xsk: try to skip validating skb list in xmit path Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 08/14] xsk: rename nb_pkts to nb_descs in xsk_tx_peek_release_desc_batch Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 09/14] xsk: extend xskq_cons_read_desc_batch to count nb_pkts Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 10/14] xsk: extend xsk_cq_reserve_locked() to reserve n slots Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 11/14] xsk: support batch xmit main logic Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 12/14] xsk: separate read-mostly and write-heavy fields in xsk_buff_pool Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 13/14] xsk: retire old xmit path in copy mode Jason Xing
2026-04-15 8:26 ` [PATCH RFC net-next v4 14/14] xsk: optimize xsk_build_skb for batch copy-mode fast path Jason Xing
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260415082654.21026-4-kerneljasonxing@gmail.com \
--to=kerneljasonxing@gmail.com \
--cc=ast@kernel.org \
--cc=bjorn@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=hawk@kernel.org \
--cc=john.fastabend@gmail.com \
--cc=jonathan.lemon@gmail.com \
--cc=kernelxing@tencent.com \
--cc=kuba@kernel.org \
--cc=maciej.fijalkowski@intel.com \
--cc=magnus.karlsson@intel.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=sdf@fomichev.me \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox