[PATCH] swiotlb: avoid double copy with swiotlb on tx socket

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
@ 2026-06-15 23:42 Luigi Rizzo
  2026-06-16  0:25 ` Jakub Kicinski
                   ` (6 more replies)
  0 siblings, 7 replies; 12+ messages in thread
From: Luigi Rizzo @ 2026-06-15 23:42 UTC (permalink / raw)
  To: rizzo.unipi, lrizzo, m.szyprowski, robin.murphy, willemb, kuniyu,
	davem, edumazet, kuba, pabeni
  Cc: gregkh, rafael, akpm, david, netdev, linux-mm, iommu, driver-core,
	linux-kernel

The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
especially with greedy senders, this has a high chance of happening in
the softirq handler for tx network interrupts, creating a significant
performance bottleneck.

Allow tx sockets to allocate socket buffers directly from the bounce
buffers. This avoids the second copy and removes the above bottleneck.
The fraction of swiotlb buffers allowed for this feature is set with
   /sys/module/swiotlb/parameters/zerocopy_tx_percent
(0 means disabled, 90 is the maximum, to avoid persistent I/O failures).

Implementation:
- define a new page type to unambiguously identify bounce buffers used
  as backing storage for socket buffers
- modify skb_page_frag_refill to perform the modified allocation
- modify the destructors __free_frozen_pages(), free_unref_folios() to
  handle those pages and return them to the pool.

The savings are especially visible with fewer queues. In synthetic
benchmarks, senders with 1-2 queues would cap around 50Gbps with
conventional swiotlb, and reach over 170Gbps with the feature enabled.

Signed-off-by: Luigi Rizzo <lrizzo@google.com>
---
 drivers/base/core.c        |   1 +
 include/linux/netdevice.h  |  22 ++++
 include/linux/page-flags.h |   4 +
 include/linux/skbuff.h     |   7 +-
 include/linux/swiotlb.h    |  74 ++++++++++++
 include/net/sock.h         |  29 +++++
 kernel/dma/swiotlb.c       | 227 +++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c            |  32 ++++++
 net/core/sock.c            |  98 ++++++++++++++--
 9 files changed, 485 insertions(+), 9 deletions(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index bd2ddf2aab505..e1257dea37ba0 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3855,6 +3855,7 @@ void device_del(struct device *dev)
 	unsigned int noio_flag;
 
 	device_lock(dev);
+	swiotlb_device_deleted();
 	kill_device(dev);
 	device_unlock(dev);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0e1e581efc5ac..d7e5929e73c92 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -5368,13 +5368,35 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
 	return ops->ndo_start_xmit(skb, dev);
 }
 
+struct sock;
+
+#ifdef CONFIG_SWIOTLB
+/* Per-CPU pointer to the socket currently performing transmission.
+ * Used to bridge the networking and DMA layers, allowing the dma_map_page()
+ * path to identify the socket originating the packet and apply SWIOTLB optimizations.
+ */
+DECLARE_PER_CPU(struct sock *, current_tx_socket);
+static inline struct sock *__set_current_tx_socket(struct sock *sk)
+{
+	struct sock *old_sk = this_cpu_read(current_tx_socket);
+
+	this_cpu_write(current_tx_socket, sk);
+	return old_sk;
+}
+#else
+static inline struct sock *__set_current_tx_socket(struct sock *sk) { return NULL; }
+#endif
+
 static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
 					    struct netdev_queue *txq, bool more)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
+	struct sock *old_sk;
 	netdev_tx_t rc;
 
+	old_sk = __set_current_tx_socket(skb->sk);
 	rc = __netdev_start_xmit(ops, skb, dev, more);
+	__set_current_tx_socket(old_sk);
 	if (rc == NETDEV_TX_OK)
 		txq_trans_update(dev, txq);
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 7223f6f4e2b40..0ecbb404038a0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -923,6 +923,7 @@ enum pagetype {
 	PGTY_zsmalloc		= 0xf6,
 	PGTY_unaccepted		= 0xf7,
 	PGTY_large_kmalloc	= 0xf8,
+	PGTY_zcswiotlb		= 0xf9,
 
 	PGTY_mapcount_underflow = 0xff
 };
@@ -1055,6 +1056,9 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)
 PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
 PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc)
 
+/* Pages in socket buffers from the swiotlb pool. */
+PAGE_TYPE_OPS(ZCSwiotlb, zcswiotlb, zcswiotlb)
+
 /**
  * PageHuge - Determine if the page belongs to hugetlbfs
  * @page: The page to test.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3f06254ab1b72..62340909409e5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3787,7 +3787,12 @@ static inline void skb_frag_page_copy(skb_frag_t *fragto,
 	fragto->netmem = fragfrom->netmem;
 }
 
-bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);
+/* zerocopy swiotlb uses an additional non-null struct sock pointer. */
+bool __skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio, struct sock *sk);
+static inline bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
+{
+	return __skb_page_frag_refill(sz, pfrag, prio, NULL);
+}
 
 /**
  * __skb_frag_dma_map - maps a paged fragment via the DMA API
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 3dae0f592063e..bd2d0e160a9d8 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -7,8 +7,10 @@
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/limits.h>
+#include <linux/percpu.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
+#include <linux/atomic.h>
 
 struct device;
 struct page;
@@ -122,6 +124,9 @@ struct io_tlb_mem {
 	atomic_long_t total_used;
 	atomic_long_t used_hiwater;
 	atomic_long_t transient_nslabs;
+#else
+	unsigned long last_used_slots;
+	unsigned long last_used_jiffies;
 #endif
 };
 
@@ -185,6 +190,69 @@ bool is_swiotlb_active(struct device *dev);
 void __init swiotlb_adjust_size(unsigned long size);
 phys_addr_t default_swiotlb_base(void);
 phys_addr_t default_swiotlb_limit(void);
+
+/* Helpers for zerocopy swiotlb. */
+/* Control allocation fraction. */
+extern unsigned int swiotlb_zc_tx_percent;
+
+/* Track freshness of the leaf device info. */
+extern atomic_t global_device_serial;
+
+static inline u32 swiotlb_get_device_serial(void)
+{
+	return atomic_read(&global_device_serial);
+}
+
+static inline void swiotlb_device_deleted(void)
+{
+	atomic_inc(&global_device_serial);
+}
+
+struct page *swiotlb_alloc_pages(struct device *dev, unsigned int order);
+bool swiotlb_free_pages(struct page *page, bool where_debug_only);
+void swiotlb_safe_put_device(struct device *dev);
+
+static inline void swiotlb_set_page_dev(struct page *page, struct device *dev)
+{
+	page->private = (unsigned long)dev;
+}
+
+static inline struct device *swiotlb_page_to_dev(struct page *page)
+{
+	return (struct device *)compound_head(page)->private;
+}
+
+static inline bool is_zerocopy_swiotlb_folio(struct page *page)
+{
+	struct folio *folio = page_folio(page);
+
+	return folio_test_zcswiotlb(folio) && folio->private != 0;
+}
+
+/* These two are in mm/page_alloc.c */
+void swiotlb_prep_compound_page(struct page *page, unsigned int order);
+void swiotlb_destroy_compound_page(struct page *page, unsigned int order);
+
+#if defined(CONFIG_NET)
+/*
+ * Track the socket for the currently transmitted packet, so the dma mapping
+ * function can record there the leaf device if it needs bounce buffers.
+ */
+struct sock;
+DECLARE_PER_CPU(struct sock *, current_tx_socket);
+void sk_set_bounce_device(struct sock *sk, struct device *dev);
+static inline void dma_learn_bounce_device(struct device *dev)
+{
+	struct sock *sk = this_cpu_read(current_tx_socket);
+
+	if (sk)
+		sk_set_bounce_device(sk, dev);
+}
+#else
+static inline void dma_learn_bounce_device(struct device *dev) {}
+#endif
+/* End helpers for zerocopy swiotlb. */
+
 #else
 static inline void swiotlb_init(bool addressing_limited, unsigned int flags)
 {
@@ -234,6 +302,12 @@ static inline phys_addr_t default_swiotlb_limit(void)
 {
 	return 0;
 }
+
+/* zerocopy swiotlb stubs */
+static inline bool swiotlb_free_pages(struct page *page, int reason) { return false; }
+static inline u32 swiotlb_get_device_serial(void) { return 0; }
+static inline void swiotlb_device_deleted(void) {}
+
 #endif /* CONFIG_SWIOTLB */
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
diff --git a/include/net/sock.h b/include/net/sock.h
index dccd3738c3687..1e6caf4bd1366 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -47,6 +47,7 @@
 #include <linux/skbuff.h>	/* struct sk_buff */
 #include <linux/mm.h>
 #include <linux/security.h>
+#include <linux/swiotlb.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/page_counter.h>
@@ -70,6 +71,14 @@
 #include <net/l3mdev.h>
 #include <uapi/linux/socket.h>
 
+#ifdef CONFIG_SWIOTLB
+struct sk_swiotlb_info {
+	struct device		*dev;
+	u32			serial;
+	unsigned long		jiffies;
+};
+#endif
+
 /*
  * This structure really needs to be cleaned up.
  * Most of it is for TCP, and not used by any of
@@ -602,8 +611,28 @@ struct sock {
 #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
 	struct module		*sk_owner;
 #endif
+#ifdef CONFIG_SWIOTLB
+	struct sk_swiotlb_info	sk_swiotlb;
+#endif
 };
 
+#ifdef CONFIG_SWIOTLB
+static inline void sk_init_bounce_device(struct sock *sk)
+{
+	sk->sk_swiotlb.dev = NULL;
+}
+static inline void sk_cleanup_bounce_device(struct sock *sk)
+{
+	if (sk->sk_swiotlb.dev) {
+		swiotlb_safe_put_device(sk->sk_swiotlb.dev);
+		sk->sk_swiotlb.dev = NULL;
+	}
+}
+#else
+static inline void sk_init_bounce_device(struct sock *sk) {}
+static inline void sk_cleanup_bounce_device(struct sock *sk) {}
+#endif
+
 struct sock_bh_locked {
 	struct sock *sock;
 	local_lock_t bh_lock;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1abd3e6146f45..e27f23d03c482 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -37,12 +37,16 @@
 #include <linux/mm.h>
 #include <linux/pfn.h>
 #include <linux/rculist.h>
+#include <linux/refcount.h>
 #include <linux/scatterlist.h>
 #include <linux/set_memory.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/swiotlb.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
 #include <linux/types.h>
+#include <linux/atomic.h>
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 #include <linux/of.h>
 #include <linux/of_fdt.h>
@@ -81,6 +85,17 @@ struct io_tlb_slot {
 static bool swiotlb_force_bounce;
 static bool swiotlb_force_disable;
 
+/**
+ * global_device_serial - Global sequence number for device deletions
+ *
+ * Incremented every time a device is unregistered (in device_del()).
+ * Used by subsystems (like SWIOTLB zero-copy sockets) as a fast, lockless
+ * O(1) cache invalidation serial to detect when a cached device pointer
+ * might have been deleted and needs to be expired to prevent Use-After-Free.
+ */
+atomic_t global_device_serial = ATOMIC_INIT(0);
+EXPORT_SYMBOL(global_device_serial);
+
 #ifdef CONFIG_SWIOTLB_DYNAMIC
 
 static void swiotlb_dyn_alloc(struct work_struct *work);
@@ -1442,6 +1457,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	offset &= (IO_TLB_SIZE - 1);
 	index += pad_slots;
 	pool->slots[index].pad_slots = pad_slots;
+	/* Fix an upstream bug with alloc_align_mask = 0xffff */
+	pool->slots[index].alloc_size = mapping_size;
 	for (i = 0; i < (nr_slots(size) - pad_slots); i++)
 		pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
 	tlb_addr = slot_addr(pool->start, index) + offset;
@@ -1554,6 +1571,13 @@ void __swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
 		size_t mapping_size, enum dma_data_direction dir,
 		unsigned long attrs, struct io_tlb_pool *pool)
 {
+	/*
+	 * Recognize and avoid unmapping pages allocated for Zero-Copy SWIOTLB Page Bypass.
+	 * They will be eventually released when the page reference count drops to 0.
+	 */
+	if (is_zerocopy_swiotlb_folio(pfn_to_page(PHYS_PFN(tlb_addr))))
+		return;
+
 	/*
 	 * First, sync the memory before unmapping the entry
 	 */
@@ -1597,6 +1621,21 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 	phys_addr_t swiotlb_addr;
 	dma_addr_t dma_addr;
 
+	dma_learn_bounce_device(dev);
+
+	/*
+	 * If the page was allocated via Zero-Copy SWIOTLB Page Bypass, it is likely
+	 * already good for DMA so we can return its dma address.
+	 */
+	if (is_zerocopy_swiotlb_folio(pfn_to_page(PHYS_PFN(paddr)))) {
+		dma_addr = phys_to_dma_unencrypted(dev, paddr);
+		if (likely(dma_capable(dev, dma_addr, size, true))) {
+			if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+				arch_sync_dma_for_device(paddr, size, dir);
+			return dma_addr;
+		}
+	}
+
 	trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
 
 	swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs);
@@ -1899,3 +1938,191 @@ static const struct reserved_mem_ops rmem_swiotlb_ops = {
 
 RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", &rmem_swiotlb_ops);
 #endif /* CONFIG_DMA_RESTRICTED_POOL */
+
+/*
+ * Asynchronous/Deferred Device Release.
+ * put_device() can trigger the final release path of a device which may sleep.
+ * Since SWIOTLB pages can be freed in atomic or interrupt context (e.g. TX completion),
+ * we must defer the put_device() call to task context using a workqueue.
+ */
+struct swiotlb_deferred_put {
+	struct work_struct work;
+	struct device *dev;
+};
+
+static void swiotlb_deferred_put_work(struct work_struct *work)
+{
+	struct swiotlb_deferred_put *dp = container_of(work, struct swiotlb_deferred_put, work);
+
+	put_device(dp->dev);
+	kfree(dp);
+}
+
+/**
+ * swiotlb_safe_put_device() - Safely release device reference from atomic/interrupt context
+ * @dev: The device structure to release.
+ *
+ * Enqueues a deferred put_device() call on a workqueue using GFP_ATOMIC.
+ * If memory allocation fails, the reference is leaked to avoid an immediate crash.
+ */
+void swiotlb_safe_put_device(struct device *dev)
+{
+	struct swiotlb_deferred_put *dp;
+
+	if (!dev)
+		return;
+
+	/*
+	 * FAST PATH (O(1) lockless): If this is not the last reference,
+	 * we can decrement it atomically and safely in any context
+	 * without allocating memory or scheduling work!
+	 */
+	if (refcount_dec_not_one(&dev->kobj.kref.refcount))
+		return;
+
+	/*
+	 * SLOW PATH: It is the last reference (refcount == 1). We must
+	 * defer the final put_device() to task context because it will
+	 * trigger device_release() which can sleep.
+	 */
+	dp = kmalloc_obj(*dp, GFP_ATOMIC);
+	if (dp) {
+		INIT_WORK(&dp->work, swiotlb_deferred_put_work);
+		dp->dev = dev;
+		schedule_work(&dp->work);
+	} else {
+		pr_warn_ratelimited("swiotlb: failed to allocate deferred put, leaking device ref\n");
+	}
+}
+EXPORT_SYMBOL_GPL(swiotlb_safe_put_device);
+
+unsigned int swiotlb_zc_tx_percent;
+module_param_named(zerocopy_tx_percent, swiotlb_zc_tx_percent, uint, 0644);
+
+static unsigned long fast_mem_used(struct io_tlb_mem *mem)
+{
+#ifdef CONFIG_DEBUG_FS
+	return mem_used(mem);
+#else
+	unsigned long last_j = READ_ONCE(mem->last_used_jiffies);
+	unsigned long now = jiffies;
+
+	if (time_after(now, last_j + HZ / 100) &&
+	    try_cmpxchg(&mem->last_used_jiffies, &last_j, now)) {
+		WRITE_ONCE(mem->last_used_slots, mem_used(mem));
+	}
+	return READ_ONCE(mem->last_used_slots);
+#endif
+}
+
+/**
+ * swiotlb_alloc_pages() - Allocate long-lived contiguous pages from SWIOTLB pool
+ * @dev: Device which requires the SWIOTLB bounce buffers.
+ * @order: Allocation order (log2 of number of pages).
+ */
+struct page *swiotlb_alloc_pages(struct device *dev, unsigned int order)
+{
+	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+	struct io_tlb_pool *pool;
+	int npages = 1 << order;
+	unsigned int max_pct;
+	phys_addr_t tlb_addr;
+	struct page *page;
+	int index;
+
+	if (!mem || !mem->nslabs)
+		return NULL;
+
+	max_pct = clamp(READ_ONCE(swiotlb_zc_tx_percent), 0u, 90u);
+	if (max_pct == 0 || max_pct * mem->nslabs <= fast_mem_used(mem) * 100)
+		return NULL;
+
+	/*
+	 * Enforce natural alignment for compound pages. The mask-based
+	 * compound_head() optimization (used when HVO is enabled and struct page
+	 * size is a power of 2) assumes that compound pages are naturally aligned
+	 * to their size. Without this, compound_head() on tail pages can return
+	 * a wrong head page pointer, leading to refcount corruption.
+	 */
+	index = swiotlb_find_slots(dev, 0, PAGE_SIZE * npages, ~(PAGE_MASK << order), &pool);
+	if (index == -1)
+		return NULL;
+
+	tlb_addr = slot_addr(pool->start, index);
+
+	pool->slots[index].pad_slots = 0;
+	pool->slots[index].alloc_size = PAGE_SIZE * npages;
+
+	page = pfn_to_page(PHYS_PFN(tlb_addr));
+
+	set_page_count(page, 1);
+
+	/* Strictly tag page[0] to prevent clobbering folio tail overlays */
+	__SetPageZCSwiotlb(page);
+
+	swiotlb_set_page_dev(page, dev);
+	get_device(dev);
+	swiotlb_prep_compound_page(page, order);
+	return page;
+}
+EXPORT_SYMBOL_GPL(swiotlb_alloc_pages);
+
+/*
+ * Debugging to track how swiotlb_free_pages() was called.
+ * b2: 0 from __free_frozen_pages(), 1 from free_unref_folios()
+ * b1: pool found, b0: dev present.
+ */
+static unsigned long zc_debug[8];
+static int ctrs_num = 8;
+module_param_array(zc_debug, ulong, &ctrs_num, 0644);
+static void __zc_debug_stats(bool where, bool has_dev, bool has_pool)
+{
+	zc_debug[has_dev + has_pool * 2 + where * 4]++;
+}
+
+/**
+ * swiotlb_free_pages() - Free pages allocated via swiotlb_alloc_pages()
+ * @page: The starting struct page to release.
+ */
+bool swiotlb_free_pages(struct page *page, bool where_debug_only)
+{
+	struct page *head = compound_head(page);
+	struct device *dev = swiotlb_page_to_dev(head);
+	phys_addr_t head_tlb_addr = page_to_phys(head);
+	struct io_tlb_pool *pool;
+	int index, npages, i;
+
+	if (!folio_test_zcswiotlb(page_folio(head)))
+		return false;
+
+	pool = dev ? swiotlb_find_pool(dev, head_tlb_addr) : NULL;
+	__zc_debug_stats(where_debug_only, !!dev, !!pool);
+
+	/* Check for any false positives. */
+	if (!pool)
+		return false;
+
+	/* Read alloc_size first, it is reset by swiotlb_release_slots(). */
+	index = (head_tlb_addr - pool->start) >> IO_TLB_SHIFT;
+	npages = pool->slots[index].alloc_size >> PAGE_SHIFT;
+
+	WARN_ON_ONCE(!is_power_of_2(npages));
+
+	/* Step 1: Sever compound links (clobbers compound_info / lru.next) */
+	swiotlb_destroy_compound_page(head, ilog2(npages));
+
+	/* Step 2: Re-init LRU, drop refcounts, and strip flag across all constituent pages */
+	for (i = 0; i < npages; i++) {
+		INIT_LIST_HEAD(&head[i].lru);
+		set_page_count(&head[i], 0);
+		head[i].private = 0;
+		__ClearPageZCSwiotlb(&head[i]);
+	}
+
+	/* Step 3: Safely release slots back to the pool */
+	swiotlb_release_slots(dev, head_tlb_addr, pool);
+	swiotlb_del_transient(dev, head_tlb_addr, pool);
+	swiotlb_safe_put_device(dev);
+	return true;
+}
+EXPORT_SYMBOL_GPL(swiotlb_free_pages);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d49c254174da7..eaba683b5b2a8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
 
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/swiotlb.h>
 #include <linux/highmem.h>
 #include <linux/interrupt.h>
 #include <linux/jiffies.h>
@@ -705,6 +706,31 @@ void prep_compound_page(struct page *page, unsigned int order)
 	prep_compound_head(page, order);
 }
 
+#ifdef CONFIG_SWIOTLB
+void swiotlb_prep_compound_page(struct page *page, unsigned int order)
+{
+	if (order > 0)
+		prep_compound_page(page, order);
+}
+
+void swiotlb_destroy_compound_page(struct page *page, unsigned int order)
+{
+	if (order > 0) {
+		struct folio *folio = (struct folio *)page;
+
+		__ClearPageHead(page);
+		page[1].flags.f &= ~PAGE_FLAGS_SECOND;
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+		folio->_nr_pages = 0;
+#endif
+		for (int i = 1; i < (1 << order); i++) {
+			page[i].mapping = NULL;
+			clear_compound_head(&page[i]);
+		}
+	}
+}
+#endif /* CONFIG_SWIOTLB */
+
 static inline void set_buddy_order(struct page *page, unsigned int order)
 {
 	set_page_private(page, order);
@@ -2930,6 +2956,9 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
+	if (unlikely(swiotlb_free_pages(page, false)))
+		return;
+
 	if (!pcp_allowed_order(order)) {
 		__free_pages_ok(page, order, fpi_flags);
 		return;
@@ -2996,6 +3025,9 @@ void free_unref_folios(struct folio_batch *folios)
 		unsigned long pfn = folio_pfn(folio);
 		unsigned int order = folio_order(folio);
 
+		if (unlikely(swiotlb_free_pages(&folio->page, true)))
+			continue;
+
 		if (!__free_pages_prepare(&folio->page, order, FPI_NONE))
 			continue;
 		/*
diff --git a/net/core/sock.c b/net/core/sock.c
index d097025c116a8..c6fbb469f9ce5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -103,6 +103,9 @@
 #include <linux/sockios.h>
 #include <linux/net.h>
 #include <linux/mm.h>
+#include <linux/swiotlb.h>
+#include <linux/device.h>
+#include <linux/moduleparam.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/poll.h>
@@ -152,6 +155,83 @@
 
 #include "dev.h"
 
+#ifdef CONFIG_SWIOTLB
+
+DEFINE_PER_CPU(struct sock *, current_tx_socket);
+EXPORT_PER_CPU_SYMBOL(current_tx_socket);
+
+void sk_set_bounce_device(struct sock *sk, struct device *dev)
+{
+	struct device *old_dev;
+
+	if (in_hardirq() || !sk_fullsock(sk) || sock_flag(sk, SOCK_ZEROCOPY))
+		return;
+
+	old_dev = READ_ONCE(sk->sk_swiotlb.dev);
+
+	if (dev != old_dev) {
+		/* Rate-limit updates to once per second to prevent bonding thrashing */
+		if (old_dev && time_before(jiffies, sk->sk_swiotlb.jiffies + HZ))
+			return;
+
+		get_device(dev);
+
+		/* Atomically swap in the new device and get the actual old one */
+		old_dev = xchg(&sk->sk_swiotlb.dev, dev);
+
+		WRITE_ONCE(sk->sk_swiotlb.serial, swiotlb_get_device_serial());
+		sk->sk_swiotlb.jiffies = jiffies;
+
+		/* Only drop the reference to the device we actually replaced */
+		if (old_dev)
+			swiotlb_safe_put_device(old_dev);
+	}
+}
+EXPORT_SYMBOL(sk_set_bounce_device);
+
+/*
+ * Wrap alloc_pages in __skb_page_frag_refill(). If the socket's dma_device requires
+ * SWIOTLB bounce buffering, divert allocation to the SWIOTLB slot allocator.
+ * This ensures the packet payload is written directly to a bounce buffer from the start,
+ * enabling zero-copy during driver DMA mapping.
+ */
+static inline struct page *alloc_any_pg(gfp_t gfp, unsigned int order, struct sock *sk)
+{
+	if (sk && READ_ONCE(swiotlb_zc_tx_percent) && !sock_flag(sk, SOCK_ZEROCOPY)) {
+		u32 serial = READ_ONCE(sk->sk_swiotlb.serial);
+		struct device *dev;
+
+		/* Force serial read BEFORE device pointer read. */
+		smp_rmb();
+
+		dev = READ_ONCE(sk->sk_swiotlb.dev);
+
+		if (dev) {
+			/*
+			 * The serial check is just for cache invalidation, UAF is
+			 * protected by the reference held in the sk.
+			 */
+			if (swiotlb_get_device_serial() != serial) {
+				if (cmpxchg(&sk->sk_swiotlb.dev, dev, NULL) == dev)
+					swiotlb_safe_put_device(dev);
+			} else {
+				struct page *page = swiotlb_alloc_pages(dev, order);
+
+				if (page)
+					return page;
+				/* On failure, fallback to alloc_pages(). */
+			}
+		}
+	}
+	return alloc_pages(gfp, order);
+}
+#else
+static inline struct page *alloc_any_pg(gfp_t gfp, unsigned int order, struct sock *sk)
+{
+	return alloc_pages(gfp, order);
+}
+#endif
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
@@ -2383,6 +2463,7 @@ static void __sk_destruct(struct rcu_head *head)
 		__netns_tracker_free(net, &sk->ns_tracker, false);
 		net_passive_dec(net);
 	}
+	sk_cleanup_bounce_device(sk);
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
@@ -2485,6 +2566,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
 		goto out;
 
 	sock_copy(newsk, sk);
+	sk_init_bounce_device(newsk);
 
 	newsk->sk_prot_creator = prot;
 
@@ -3134,7 +3216,7 @@ DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
  * no guarantee that allocations succeed. Therefore, @sz MUST be
  * less or equal than PAGE_SIZE.
  */
-bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
+bool __skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp, struct sock *sk)
 {
 	if (pfrag->page) {
 		if (page_ref_count(pfrag->page) == 1) {
@@ -3150,27 +3232,27 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 	if (SKB_FRAG_PAGE_ORDER &&
 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
 		/* Avoid direct reclaim but allow kswapd to wake */
-		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
-					  __GFP_COMP | __GFP_NOWARN |
-					  __GFP_NORETRY,
-					  SKB_FRAG_PAGE_ORDER);
+		pfrag->page = alloc_any_pg((gfp & ~__GFP_DIRECT_RECLAIM) |
+					   __GFP_COMP | __GFP_NOWARN |
+					   __GFP_NORETRY,
+					   SKB_FRAG_PAGE_ORDER, sk);
 		if (likely(pfrag->page)) {
 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
 			return true;
 		}
 	}
-	pfrag->page = alloc_page(gfp);
+	pfrag->page = alloc_any_pg(gfp, 0, sk);
 	if (likely(pfrag->page)) {
 		pfrag->size = PAGE_SIZE;
 		return true;
 	}
 	return false;
 }
-EXPORT_SYMBOL(skb_page_frag_refill);
+EXPORT_SYMBOL(__skb_page_frag_refill);
 
 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 {
-	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
+	if (likely(__skb_page_frag_refill(32U, pfrag, sk->sk_allocation, sk)))
 		return true;
 
 	if (!sk->sk_bypass_prot_mem)
-- 
2.54.0.1136.gdb2ca164c4-goog


^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-15 23:42 [PATCH] swiotlb: avoid double copy with swiotlb on tx socket Luigi Rizzo
@ 2026-06-16  0:25 ` Jakub Kicinski
  2026-06-16  0:33   ` Luigi Rizzo
  2026-06-16  4:17 ` Eric Dumazet
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 12+ messages in thread
From: Jakub Kicinski @ 2026-06-16  0:25 UTC (permalink / raw)
  To: Luigi Rizzo
  Cc: rizzo.unipi, m.szyprowski, robin.murphy, willemb, kuniyu, davem,
	edumazet, pabeni, gregkh, rafael, akpm, david, netdev, linux-mm,
	iommu, driver-core, linux-kernel

On Mon, 15 Jun 2026 23:42:20 +0000 Luigi Rizzo wrote:
> The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
> especially with greedy senders, this has a high chance of happening in
> the softirq handler for tx network interrupts, creating a significant
> performance bottleneck.

What's the use case? I associate swiotlb with debug / testing mostly,
so it'd be useful for people like me to explain why you care.

BTW net-next is closed: https://netdev.bots.linux.dev/net-next.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-16  0:25 ` Jakub Kicinski
@ 2026-06-16  0:33   ` Luigi Rizzo
  2026-06-16 11:06     ` Mostafa Saleh
  0 siblings, 1 reply; 12+ messages in thread
From: Luigi Rizzo @ 2026-06-16  0:33 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: rizzo.unipi, m.szyprowski, robin.murphy, willemb, kuniyu, davem,
	edumazet, pabeni, gregkh, rafael, akpm, david, netdev, linux-mm,
	iommu, driver-core, linux-kernel

On Tue, Jun 16, 2026 at 2:25 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Mon, 15 Jun 2026 23:42:20 +0000 Luigi Rizzo wrote:
> > The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
> > especially with greedy senders, this has a high chance of happening in
> > the softirq handler for tx network interrupts, creating a significant
> > performance bottleneck.
>
> What's the use case? I associate swiotlb with debug / testing mostly,
> so it'd be useful for people like me to explain why you care.

Ah sorry, I forgot to mention.
swiotlb is used in guest kernels for confidential computing VMs.
Ordinary memory pages are encrypted and the host or devices
have no way to decrypt them, so the kernel must use
unencrypted bounce buffers to exchange data with I/O devices.

cheers
luigi

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-16  0:33   ` Luigi Rizzo
@ 2026-06-16 11:06     ` Mostafa Saleh
  0 siblings, 0 replies; 12+ messages in thread
From: Mostafa Saleh @ 2026-06-16 11:06 UTC (permalink / raw)
  To: Luigi Rizzo
  Cc: Jakub Kicinski, rizzo.unipi, m.szyprowski, robin.murphy, willemb,
	kuniyu, davem, edumazet, pabeni, gregkh, rafael, akpm, david,
	netdev, linux-mm, iommu, driver-core, linux-kernel

On Tue, Jun 16, 2026 at 02:33:52AM +0200, Luigi Rizzo wrote:
> On Tue, Jun 16, 2026 at 2:25 AM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Mon, 15 Jun 2026 23:42:20 +0000 Luigi Rizzo wrote:
> > > The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
> > > especially with greedy senders, this has a high chance of happening in
> > > the softirq handler for tx network interrupts, creating a significant
> > > performance bottleneck.
> >
> > What's the use case? I associate swiotlb with debug / testing mostly,
> > so it'd be useful for people like me to explain why you care.
> 
> Ah sorry, I forgot to mention.
> swiotlb is used in guest kernels for confidential computing VMs.
> Ordinary memory pages are encrypted and the host or devices
> have no way to decrypt them, so the kernel must use
> unencrypted bounce buffers to exchange data with I/O devices.

I started looking into the same problem recently, to reduce the
bouncing in protected KVM (pKVM) confidential guests.
My first attempt was to update dma_direct_map_phys() to skip
bouncing and do inline memory decryption (for pKVM that is a hypercall
which updates the stage-2 page tables), however, that was really slow
compared to the memcpy in bouncing even for massive pages.
My conclusion was similar: we need to solve this at construction
by making this memory allocated from a pre-decrypted pool (which
does not have to be part of the SWIOTLB).
My initial idea was to teach some of the kernel subsystems (SKB,
BLK, SLAB) about "CoCo allocators" that allocate decrypted memory,
as this is not a net specific problem.

I am still looking into this; I was planning to bring this up at the
upcoming LPC.
I will give this patch a try. However, I believe that we need a more
generalised concept for CoCo pre-decrypted allocators in the kernel.

Thanks,
Mostafa

> 
> cheers
> luigi
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-15 23:42 [PATCH] swiotlb: avoid double copy with swiotlb on tx socket Luigi Rizzo
  2026-06-16  0:25 ` Jakub Kicinski
@ 2026-06-16  4:17 ` Eric Dumazet
  2026-06-16  5:31 ` kernel test robot
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2026-06-16  4:17 UTC (permalink / raw)
  To: Luigi Rizzo
  Cc: rizzo.unipi, m.szyprowski, robin.murphy, willemb, kuniyu, davem,
	kuba, pabeni, gregkh, rafael, akpm, david, netdev, linux-mm,
	iommu, driver-core, linux-kernel

On Mon, Jun 15, 2026 at 4:42 PM Luigi Rizzo <lrizzo@google.com> wrote:
>
> The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
> especially with greedy senders, this has a high chance of happening in
> the softirq handler for tx network interrupts, creating a significant
> performance bottleneck.
>
> Allow tx sockets to allocate socket buffers directly from the bounce
> buffers. This avoids the second copy and removes the above bottleneck.
> The fraction of swiotlb buffers allowed for this feature is set with
>    /sys/module/swiotlb/parameters/zerocopy_tx_percent

Strange name, because your patch targets the regular tcp sendmsg()
path (with a user -> kernel copy).

Typical high performance RPC libraries use TCP TX zerocopy these days.
They won't benefit from this idea.
Perhaps you should state this in your changelog or documentation.

Also, what is the typical size of the bounce buffers in your guests?

With standard tcp_wmem settings, each TCP flow can consume 4 MB.


> (0 means disabled, 90 is the maximum, to avoid persistent I/O failures).
>
> Implementation:
> - define a new page type to unambiguously identify bounce buffers used
>   as backing storage for socket buffers
> - modify skb_page_frag_refill to perform the modified allocation
> - modify the destructors __free_frozen_pages(), free_unref_folio() to
>   handle those pages and return them to the pool.
>
> The savings are especially visible with fewer queues. In synthetic
> benchmarks, senders with 1-2 queues would cap around 50Gbps with
> conventional swiotlb, and reach over 170Gbps with the feature enabled.

This patch is too large; please split it into smaller functional
units, so that each domain expert
can focus on their part.

I see you test SOCK_ZEROCOPY, but some applications setting this flag
can mix tcp sendmsg() with or without zero-copy.

I also see your patch missed the CONFIG_PREEMPT_RT case.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-15 23:42 [PATCH] swiotlb: avoid double copy with swiotlb on tx socket Luigi Rizzo
  2026-06-16  0:25 ` Jakub Kicinski
  2026-06-16  4:17 ` Eric Dumazet
@ 2026-06-16  5:31 ` kernel test robot
  2026-06-16  8:01 ` kernel test robot
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2026-06-16  5:31 UTC (permalink / raw)
  To: Luigi Rizzo, rizzo.unipi, m.szyprowski, robin.murphy, willemb,
	kuniyu, davem, edumazet, kuba, pabeni
  Cc: llvm, oe-kbuild-all, gregkh, rafael, akpm, david, netdev,
	linux-mm, iommu, driver-core, linux-kernel

Hi Luigi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on linus/master v7.1 next-20260615]
[cannot apply to driver-core/driver-core-testing driver-core/driver-core-next driver-core/driver-core-linus]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Luigi-Rizzo/swiotlb-avoid-double-copy-with-swiotlb-on-tx-socket/20260616-074655
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260615234220.3946885-1-lrizzo%40google.com
patch subject: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
config: powerpc-pmac32_defconfig (https://download.01.org/0day-ci/archive/20260616/202606161322.zGyw68Qa-lkp@intel.com/config)
compiler: clang version 23.0.0git (https://github.com/llvm/llvm-project e19d1f51a2c80b63cd8ca95bcc757b7077112808)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260616/202606161322.zGyw68Qa-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202606161322.zGyw68Qa-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> Warning: net/core/sock.c:3215 function parameter 'sk' not described in '__skb_page_frag_refill'
>> Warning: net/core/sock.c:3215 expecting prototype for skb_page_frag_refill(). Prototype was for __skb_page_frag_refill() instead

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-15 23:42 [PATCH] swiotlb: avoid double copy with swiotlb on tx socket Luigi Rizzo
                   ` (2 preceding siblings ...)
  2026-06-16  5:31 ` kernel test robot
@ 2026-06-16  8:01 ` kernel test robot
  2026-06-16  8:36 ` David Hildenbrand (Arm)
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2026-06-16  8:01 UTC (permalink / raw)
  To: Luigi Rizzo, rizzo.unipi, m.szyprowski, robin.murphy, willemb,
	kuniyu, davem, edumazet, kuba, pabeni
  Cc: llvm, oe-kbuild-all, gregkh, rafael, akpm, david, netdev,
	linux-mm, iommu, driver-core, linux-kernel

Hi Luigi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on linus/master v7.1 next-20260615]
[cannot apply to driver-core/driver-core-testing driver-core/driver-core-next driver-core/driver-core-linus]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Luigi-Rizzo/swiotlb-avoid-double-copy-with-swiotlb-on-tx-socket/20260616-074655
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260615234220.3946885-1-lrizzo%40google.com
patch subject: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
config: loongarch-allnoconfig (https://download.01.org/0day-ci/archive/20260616/202606161519.z7SY98jp-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260616/202606161519.z7SY98jp-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202606161519.z7SY98jp-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/page_alloc.c:721:17: warning: unused variable 'folio' [-Wunused-variable]
     721 |                 struct folio *folio = (struct folio *)page;
         |                               ^~~~~
   1 warning generated.
--
>> Warning: kernel/dma/swiotlb.c:95 cannot understand function prototype: 'atomic_t global_device_serial = ATOMIC_INIT(0);'
>> Warning: kernel/dma/swiotlb.c:2087 function parameter 'where_debug_only' not described in 'swiotlb_free_pages'
>> Warning: kernel/dma/swiotlb.c:2087 function parameter 'where_debug_only' not described in 'swiotlb_free_pages'


vim +/folio +721 mm/page_alloc.c

   717	
   718	void swiotlb_destroy_compound_page(struct page *page, unsigned int order)
   719	{
   720		if (order > 0) {
 > 721			struct folio *folio = (struct folio *)page;
   722	
   723			__ClearPageHead(page);
   724			page[1].flags.f &= ~PAGE_FLAGS_SECOND;
   725	#ifdef NR_PAGES_IN_LARGE_FOLIO
   726			folio->_nr_pages = 0;
   727	#endif
   728			for (int i = 1; i < (1 << order); i++) {
   729				page[i].mapping = NULL;
   730				clear_compound_head(&page[i]);
   731			}
   732		}
   733	}
   734	#endif /* CONFIG_SWIOTLB */
   735	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-15 23:42 [PATCH] swiotlb: avoid double copy with swiotlb on tx socket Luigi Rizzo
                   ` (3 preceding siblings ...)
  2026-06-16  8:01 ` kernel test robot
@ 2026-06-16  8:36 ` David Hildenbrand (Arm)
  2026-06-16  9:20 ` Pedro Falcato
  2026-06-16 11:21 ` kernel test robot
  6 siblings, 0 replies; 12+ messages in thread
From: David Hildenbrand (Arm) @ 2026-06-16  8:36 UTC (permalink / raw)
  To: Luigi Rizzo, rizzo.unipi, m.szyprowski, robin.murphy, willemb,
	kuniyu, davem, edumazet, kuba, pabeni
  Cc: gregkh, rafael, akpm, netdev, linux-mm, iommu, driver-core,
	linux-kernel

On 6/16/26 01:42, Luigi Rizzo wrote:
> The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
> especially with greedy senders, this has a high chance of happening in
> the softirq handler for tx network interrupts, creating a significant
> performance bottleneck.
> 
> Allow tx sockets to allocate socket buffers directly from the bounce
> buffers. This avoids the second copy and removes the above bottleneck.
> The fraction of swiotlb buffers allowed for this feature is set with
>    /sys/module/swiotlb/parameters/zerocopy_tx_percent
> (0 means disabled, 90 is the maximum, to avoid persistent I/O failures).
> 
> Implementation:
> - define a new page type to unambiguously identify bounce buffers used
>   as backing storage for socket buffers
> - modify skb_page_frag_refill to perform the modified allocation
> - modify the destructors __free_frozen_pages(), free_unref_folio() to
>   handle those pages and return them to the pool.
> 
> The savings are especially visible with fewer queues. In synthetic
> benchmarks, senders with 1-2 queues would cap around 50Gbps with
> conventional swiotlb, and reach over 170Gbps with the feature enabled.
> 
> Signed-off-by: Luigi Rizzo <lrizzo@google.com>
> ---
>  drivers/base/core.c        |   1 +
>  include/linux/netdevice.h  |  22 ++++
>  include/linux/page-flags.h |   4 +
>  include/linux/skbuff.h     |   7 +-
>  include/linux/swiotlb.h    |  74 ++++++++++++
>  include/net/sock.h         |  29 +++++
>  kernel/dma/swiotlb.c       | 227 +++++++++++++++++++++++++++++++++++++
>  mm/page_alloc.c            |  32 ++++++
>  net/core/sock.c            |  98 ++++++++++++++--
>  9 files changed, 485 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index bd2ddf2aab505..e1257dea37ba0 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -3855,6 +3855,7 @@ void device_del(struct device *dev)
>  	unsigned int noio_flag;
>  
>  	device_lock(dev);
> +	swiotlb_device_deleted();
>  	kill_device(dev);
>  	device_unlock(dev);
>  
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 0e1e581efc5ac..d7e5929e73c92 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -5368,13 +5368,35 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
>  	return ops->ndo_start_xmit(skb, dev);
>  }
>  
> +struct sock;
> +
> +#ifdef CONFIG_SWIOTLB
> +/* Per-CPU pointer to the socket currently performing transmission.
> + * Used to bridge the networking and DMA layers, allowing the dma_map_page()
> + * path to identify the socket originating the packet and apply SWIOTLB optimizations.
> + */
> +DECLARE_PER_CPU(struct sock *, current_tx_socket);
> +static inline struct sock *__set_current_tx_socket(struct sock *sk)
> +{
> +	struct sock *old_sk = this_cpu_read(current_tx_socket);
> +
> +	this_cpu_write(current_tx_socket, sk);
> +	return old_sk;
> +}
> +#else
> +static inline struct sock *__set_current_tx_socket(struct sock *sk) { return NULL; }
> +#endif
> +
>  static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
>  					    struct netdev_queue *txq, bool more)
>  {
>  	const struct net_device_ops *ops = dev->netdev_ops;
> +	struct sock *old_sk;
>  	netdev_tx_t rc;
>  
> +	old_sk = __set_current_tx_socket(skb->sk);
>  	rc = __netdev_start_xmit(ops, skb, dev, more);
> +	__set_current_tx_socket(old_sk);
>  	if (rc == NETDEV_TX_OK)
>  		txq_trans_update(dev, txq);
>  
> diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
> index 7223f6f4e2b40..0ecbb404038a0 100644
> --- a/include/linux/page-flags.h
> +++ b/include/linux/page-flags.h
> @@ -923,6 +923,7 @@ enum pagetype {
>  	PGTY_zsmalloc		= 0xf6,
>  	PGTY_unaccepted		= 0xf7,
>  	PGTY_large_kmalloc	= 0xf8,
> +	PGTY_zcswiotlb		= 0xf9,
>  
>  	PGTY_mapcount_underflow = 0xff
>  };
> @@ -1055,6 +1056,9 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)
>  PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
>  PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc)
>  
> +/* Pages in socket buffers from the swiotlb pool. */
> +PAGE_TYPE_OPS(ZCSwiotlb, zcswiotlb, zcswiotlb)
> +
>  /**
>   * PageHuge - Determine if the page belongs to hugetlbfs
>   * @page: The page to test.
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 3f06254ab1b72..62340909409e5 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3787,7 +3787,12 @@ static inline void skb_frag_page_copy(skb_frag_t *fragto,
>  	fragto->netmem = fragfrom->netmem;
>  }
>  
> -bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);
> +/* zerocopy swiotlb uses an additional non-null struct sock pointer. */
> +bool __skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio, struct sock *sk);
> +static inline bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
> +{
> +	return __skb_page_frag_refill(sz, pfrag, prio, NULL);
> +}
>  
>  /**
>   * __skb_frag_dma_map - maps a paged fragment via the DMA API
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 3dae0f592063e..bd2d0e160a9d8 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -7,8 +7,10 @@
>  #include <linux/init.h>
>  #include <linux/types.h>
>  #include <linux/limits.h>
> +#include <linux/percpu.h>
>  #include <linux/spinlock.h>
>  #include <linux/workqueue.h>
> +#include <linux/atomic.h>
>  
>  struct device;
>  struct page;
> @@ -122,6 +124,9 @@ struct io_tlb_mem {
>  	atomic_long_t total_used;
>  	atomic_long_t used_hiwater;
>  	atomic_long_t transient_nslabs;
> +#else
> +	unsigned long last_used_slots;
> +	unsigned long last_used_jiffies;
>  #endif
>  };
>  
> @@ -185,6 +190,69 @@ bool is_swiotlb_active(struct device *dev);
>  void __init swiotlb_adjust_size(unsigned long size);
>  phys_addr_t default_swiotlb_base(void);
>  phys_addr_t default_swiotlb_limit(void);
> +
> +/* Helpers for zerocopy swiotlb. */
> +/* Control allocation fraction. */
> +extern unsigned int swiotlb_zc_tx_percent;
> +
> +/* Track freshness of the leaf device info. */
> +extern atomic_t global_device_serial;
> +
> +static inline u32 swiotlb_get_device_serial(void)
> +{
> +	return atomic_read(&global_device_serial);
> +}
> +
> +static inline void swiotlb_device_deleted(void)
> +{
> +	atomic_inc(&global_device_serial);
> +}
> +
> +struct page *swiotlb_alloc_pages(struct device *dev, unsigned int order);
> +bool swiotlb_free_pages(struct page *page, bool where_debug_only);
> +void swiotlb_safe_put_device(struct device *dev);
> +
> +static inline void swiotlb_set_page_dev(struct page *page, struct device *dev)
> +{
> +	page->private = (unsigned long)dev;
> +}
> +
> +static inline struct device *swiotlb_page_to_dev(struct page *page)
> +{
> +	return (struct device *)compound_head(page)->private;
> +}
> +
> +static inline bool is_zerocopy_swiotlb_folio(struct page *page)
> +{
> +	struct folio *folio = page_folio(page);
> +
> +	return folio_test_zcswiotlb(folio) && folio->private != 0;
> +}
> +
> +/* These two are in mm/page_alloc.c */
> +void swiotlb_prep_compound_page(struct page *page, unsigned int order);
> +void swiotlb_destroy_compound_page(struct page *page, unsigned int order);
> +
> +#if defined(CONFIG_NET)
> +/*
> + * Track the socket for the currently transmitted packet, so the dma mapping
> + * function can record there the leaf device if it needs bounce buffers.
> + */
> +struct sock;
> +DECLARE_PER_CPU(struct sock *, current_tx_socket);
> +void sk_set_bounce_device(struct sock *sk, struct device *dev);
> +static inline void dma_learn_bounce_device(struct device *dev)
> +{
> +	struct sock *sk = this_cpu_read(current_tx_socket);
> +
> +	if (sk)
> +		sk_set_bounce_device(sk, dev);
> +}
> +#else
> +static inline void dma_learn_bounce_device(struct device *dev) {}
> +#endif
> +/* End helpers for zerocopy swiotlb. */
> +
>  #else
>  static inline void swiotlb_init(bool addressing_limited, unsigned int flags)
>  {
> @@ -234,6 +302,12 @@ static inline phys_addr_t default_swiotlb_limit(void)
>  {
>  	return 0;
>  }
> +
> +/* zerocopy swiotlb stubs */
> +static inline bool swiotlb_free_pages(struct page *page, int reason) { return false; }
> +static inline u32 swiotlb_get_device_serial(void) { return 0; }
> +static inline void swiotlb_device_deleted(void) {}
> +
>  #endif /* CONFIG_SWIOTLB */
>  
>  phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
> diff --git a/include/net/sock.h b/include/net/sock.h
> index dccd3738c3687..1e6caf4bd1366 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -47,6 +47,7 @@
>  #include <linux/skbuff.h>	/* struct sk_buff */
>  #include <linux/mm.h>
>  #include <linux/security.h>
> +#include <linux/swiotlb.h>
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  #include <linux/page_counter.h>
> @@ -70,6 +71,14 @@
>  #include <net/l3mdev.h>
>  #include <uapi/linux/socket.h>
>  
> +#ifdef CONFIG_SWIOTLB
> +struct sk_swiotlb_info {
> +	struct device		*dev;
> +	u32			serial;
> +	unsigned long		jiffies;
> +};
> +#endif
> +
>  /*
>   * This structure really needs to be cleaned up.
>   * Most of it is for TCP, and not used by any of
> @@ -602,8 +611,28 @@ struct sock {
>  #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
>  	struct module		*sk_owner;
>  #endif
> +#ifdef CONFIG_SWIOTLB
> +	struct sk_swiotlb_info	sk_swiotlb;
> +#endif
>  };
>  
> +#ifdef CONFIG_SWIOTLB
> +static inline void sk_init_bounce_device(struct sock *sk)
> +{
> +	sk->sk_swiotlb.dev = NULL;
> +}
> +static inline void sk_cleanup_bounce_device(struct sock *sk)
> +{
> +	if (sk->sk_swiotlb.dev) {
> +		swiotlb_safe_put_device(sk->sk_swiotlb.dev);
> +		sk->sk_swiotlb.dev = NULL;
> +	}
> +}
> +#else
> +static inline void sk_init_bounce_device(struct sock *sk) {}
> +static inline void sk_cleanup_bounce_device(struct sock *sk) {}
> +#endif
> +
>  struct sock_bh_locked {
>  	struct sock *sock;
>  	local_lock_t bh_lock;
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 1abd3e6146f45..e27f23d03c482 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -37,12 +37,16 @@
>  #include <linux/mm.h>
>  #include <linux/pfn.h>
>  #include <linux/rculist.h>
> +#include <linux/refcount.h>
>  #include <linux/scatterlist.h>
>  #include <linux/set_memory.h>
>  #include <linux/spinlock.h>
>  #include <linux/string.h>
>  #include <linux/swiotlb.h>
> +#include <linux/moduleparam.h>
> +#include <linux/percpu.h>
>  #include <linux/types.h>
> +#include <linux/atomic.h>
>  #ifdef CONFIG_DMA_RESTRICTED_POOL
>  #include <linux/of.h>
>  #include <linux/of_fdt.h>
> @@ -81,6 +85,17 @@ struct io_tlb_slot {
>  static bool swiotlb_force_bounce;
>  static bool swiotlb_force_disable;
>  
> +/**
> + * global_device_serial - Global sequence number for device deletions
> + *
> + * Incremented every time a device is unregistered (in device_del()).
> + * Used by subsystems (like SWIOTLB zero-copy sockets) as a fast, lockless
> + * O(1) cache invalidation serial to detect when a cached device pointer
> + * might have been deleted and needs to be expired to prevent Use-After-Free.
> + */
> +atomic_t global_device_serial = ATOMIC_INIT(0);
> +EXPORT_SYMBOL(global_device_serial);
> +
>  #ifdef CONFIG_SWIOTLB_DYNAMIC
>  
>  static void swiotlb_dyn_alloc(struct work_struct *work);
> @@ -1442,6 +1457,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
>  	offset &= (IO_TLB_SIZE - 1);
>  	index += pad_slots;
>  	pool->slots[index].pad_slots = pad_slots;
> +	/* Fix an upstream bug with alloc_align_mask = 0xffff */
> +	pool->slots[index].alloc_size = mapping_size;
>  	for (i = 0; i < (nr_slots(size) - pad_slots); i++)
>  		pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
>  	tlb_addr = slot_addr(pool->start, index) + offset;
> @@ -1554,6 +1571,13 @@ void __swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
>  		size_t mapping_size, enum dma_data_direction dir,
>  		unsigned long attrs, struct io_tlb_pool *pool)
>  {
> +	/*
> +	 * Recognize and avoid unmapping pages allocated for Zero-Copy SWIOTLB Page Bypass.
> +	 * They will be eventually released when the page reference count drops to 0.
> +	 */
> +	if (is_zerocopy_swiotlb_folio(pfn_to_page(PHYS_PFN(tlb_addr))))
> +		return;
> +
>  	/*
>  	 * First, sync the memory before unmapping the entry
>  	 */
> @@ -1597,6 +1621,21 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
>  	phys_addr_t swiotlb_addr;
>  	dma_addr_t dma_addr;
>  
> +	dma_learn_bounce_device(dev);
> +
> +	/*
> +	 * If the page was allocated via Zero-Copy SWIOTLB Page Bypass, it is likely
> +	 * already good for DMA so we can return its dma address.
> +	 */
> +	if (is_zerocopy_swiotlb_folio(pfn_to_page(PHYS_PFN(paddr)))) {
> +		dma_addr = phys_to_dma_unencrypted(dev, paddr);
> +		if (likely(dma_capable(dev, dma_addr, size, true))) {
> +			if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
> +				arch_sync_dma_for_device(paddr, size, dir);
> +			return dma_addr;
> +		}
> +	}
> +
>  	trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
>  
>  	swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs);
> @@ -1899,3 +1938,191 @@ static const struct reserved_mem_ops rmem_swiotlb_ops = {
>  
>  RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", &rmem_swiotlb_ops);
>  #endif /* CONFIG_DMA_RESTRICTED_POOL */
> +
> +/*
> + * Asynchronous/Deferred Device Release.
> + * put_device() can trigger the final release path of a device which may sleep.
> + * Since SWIOTLB pages can be freed in atomic or interrupt context (e.g. TX completion),
> + * we must defer the put_device() call to task context using a workqueue.
> + */
> +struct swiotlb_deferred_put {
> +	struct work_struct work;
> +	struct device *dev;
> +};
> +
> +static void swiotlb_deferred_put_work(struct work_struct *work)
> +{
> +	struct swiotlb_deferred_put *dp = container_of(work, struct swiotlb_deferred_put, work);
> +
> +	put_device(dp->dev);
> +	kfree(dp);
> +}
> +
> +/**
> + * swiotlb_safe_put_device() - Safely release device reference from atomic/interrupt context
> + * @dev: The device structure to release.
> + *
> + * Enqueues a deferred put_device() call on a workqueue using GFP_ATOMIC.
> + * If memory allocation fails, the reference is leaked to avoid an immediate crash.
> + */
> +void swiotlb_safe_put_device(struct device *dev)
> +{
> +	struct swiotlb_deferred_put *dp;
> +
> +	if (!dev)
> +		return;
> +
> +	/*
> +	 * FAST PATH (O(1) lockless): If this is not the last reference,
> +	 * we can decrement it atomically and safely in any context
> +	 * without allocating memory or scheduling work!
> +	 */
> +	if (refcount_dec_not_one(&dev->kobj.kref.refcount))
> +		return;
> +
> +	/*
> +	 * SLOW PATH: It is the last reference (refcount == 1). We must
> +	 * defer the final put_device() to task context because it will
> +	 * trigger device_release() which can sleep.
> +	 */
> +	dp = kmalloc_obj(*dp, GFP_ATOMIC);
> +	if (dp) {
> +		INIT_WORK(&dp->work, swiotlb_deferred_put_work);
> +		dp->dev = dev;
> +		schedule_work(&dp->work);
> +	} else {
> +		pr_warn_ratelimited("swiotlb: failed to allocate deferred put, leaking device ref\n");
> +	}
> +}
> +EXPORT_SYMBOL_GPL(swiotlb_safe_put_device);
> +
> +unsigned int swiotlb_zc_tx_percent;
> +module_param_named(zerocopy_tx_percent, swiotlb_zc_tx_percent, uint, 0644);
> +
> +static unsigned long fast_mem_used(struct io_tlb_mem *mem)
> +{
> +#ifdef CONFIG_DEBUG_FS
> +	return mem_used(mem);
> +#else
> +	unsigned long last_j = READ_ONCE(mem->last_used_jiffies);
> +	unsigned long now = jiffies;
> +
> +	if (time_after(now, last_j + HZ / 100) &&
> +	    try_cmpxchg(&mem->last_used_jiffies, &last_j, now)) {
> +		WRITE_ONCE(mem->last_used_slots, mem_used(mem));
> +	}
> +	return READ_ONCE(mem->last_used_slots);
> +#endif
> +}
> +
> +/**
> + * swiotlb_alloc_pages() - Allocate long-lived contiguous pages from SWIOTLB pool
> + * @dev: Device which requires the SWIOTLB bounce buffers.
> + * @order: Allocation order (log2 of number of pages).
> + */
> +struct page *swiotlb_alloc_pages(struct device *dev, unsigned int order)
> +{
> +	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> +	struct io_tlb_pool *pool;
> +	int npages = 1 << order;
> +	unsigned int max_pct;
> +	phys_addr_t tlb_addr;
> +	struct page *page;
> +	int index;
> +
> +	if (!mem || !mem->nslabs)
> +		return NULL;
> +
> +	max_pct = clamp(READ_ONCE(swiotlb_zc_tx_percent), 0u, 90u);
> +	if (max_pct == 0 || max_pct * mem->nslabs <= fast_mem_used(mem) * 100)
> +		return NULL;
> +
> +	/*
> +	 * Enforce natural alignment for compound pages. The mask-based
> +	 * compound_head() optimization (used when HVO is enabled and struct page
> +	 * size is a power of 2) assumes that compound pages are naturally aligned
> +	 * to their size. Without this, compound_head() on tail pages can return
> +	 * a wrong head page pointer, leading to refcount corruption.
> +	 */
> +	index = swiotlb_find_slots(dev, 0, PAGE_SIZE * npages, ~(PAGE_MASK << order), &pool);
> +	if (index == -1)
> +		return NULL;
> +
> +	tlb_addr = slot_addr(pool->start, index);
> +
> +	pool->slots[index].pad_slots = 0;
> +	pool->slots[index].alloc_size = PAGE_SIZE * npages;
> +
> +	page = pfn_to_page(PHYS_PFN(tlb_addr));
> +
> +	set_page_count(page, 1);
> +
> +	/* Strictly tag page[0] to prevent clobbering folio tail overlays */
> +	__SetPageZCSwiotlb(page);
> +
> +	swiotlb_set_page_dev(page, dev);
> +	get_device(dev);
> +	swiotlb_prep_compound_page(page, order);
> +	return page;
> +}
> +EXPORT_SYMBOL_GPL(swiotlb_alloc_pages);
> +
> +/*
> + * Debugging to track how swiotlb_free_pages() was called.
> + * b2: 0 from __free_frozen_pages(), 1 from free_unref_folios()
> + * b1: pool found b0: dev present,
> + */
> +static unsigned long zc_debug[8];
> +static int ctrs_num = 8;
> +module_param_array(zc_debug, ulong, &ctrs_num, 0644);
> +static void __zc_debug_stats(bool where, bool has_dev, bool has_pool)
> +{
> +	zc_debug[has_dev + has_pool * 2 + where * 4]++;
> +}
> +
> +/**
> + * swiotlb_free_pages() - Free pages allocated via swiotlb_alloc_pages()
> + * @page: The starting struct page to release.
> + */
> +bool swiotlb_free_pages(struct page *page, bool where_debug_only)
> +{
> +	struct page *head = compound_head(page);
> +	struct device *dev = swiotlb_page_to_dev(head);
> +	phys_addr_t head_tlb_addr = page_to_phys(head);
> +	struct io_tlb_pool *pool;
> +	int index, npages, i;
> +
> +	if (!folio_test_zcswiotlb(page_folio(head)))
> +		return false;
> +
> +	pool = dev ? swiotlb_find_pool(dev, head_tlb_addr) : NULL;
> +	__zc_debug_stats(where_debug_only, !!dev, !!pool);
> +
> +	/* Check for any false positives. */
> +	if (!pool)
> +		return false;
> +
> +	/* Read alloc_size first, it is reset by swiotlb_release_slots(). */
> +	index = (head_tlb_addr - pool->start) >> IO_TLB_SHIFT;
> +	npages = pool->slots[index].alloc_size >> PAGE_SHIFT;
> +
> +	WARN_ON_ONCE(!is_power_of_2(npages));
> +
> +	/* Step 1: Sever compound links (clobbers compound_info / lru.next) */
> +	swiotlb_destroy_compound_page(head, ilog2(npages));
> +
> +	/* Step 2: Re-init LRU, drop refcounts, and strip flag across all constituent pages */
> +	for (i = 0; i < npages; i++) {
> +		INIT_LIST_HEAD(&head[i].lru);
> +		set_page_count(&head[i], 0);
> +		head[i].private = 0;
> +		__ClearPageZCSwiotlb(&head[i]);
> +	}
> +
> +	/* Step 3: Safely release slots back to the pool */
> +	swiotlb_release_slots(dev, head_tlb_addr, pool);
> +	swiotlb_del_transient(dev, head_tlb_addr, pool);
> +	swiotlb_safe_put_device(dev);
> +	return true;
> +}
> +EXPORT_SYMBOL_GPL(swiotlb_free_pages);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d49c254174da7..eaba683b5b2a8 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -16,6 +16,7 @@
>  
>  #include <linux/stddef.h>
>  #include <linux/mm.h>
> +#include <linux/swiotlb.h>
>  #include <linux/highmem.h>
>  #include <linux/interrupt.h>
>  #include <linux/jiffies.h>
> @@ -705,6 +706,31 @@ void prep_compound_page(struct page *page, unsigned int order)
>  	prep_compound_head(page, order);
>  }
>  
> +#ifdef CONFIG_SWIOTLB
> +void swiotlb_prep_compound_page(struct page *page, unsigned int order)
> +{
> +	if (order > 0)
> +		prep_compound_page(page, order);
> +}

Gah.

> +
> +void swiotlb_destroy_compound_page(struct page *page, unsigned int order)
> +{
> +	if (order > 0) {
> +		struct folio *folio = (struct folio *)page;
> +
> +		__ClearPageHead(page);
> +		page[1].flags.f &= ~PAGE_FLAGS_SECOND;
> +#ifdef NR_PAGES_IN_LARGE_FOLIO
> +		folio->_nr_pages = 0;
> +#endif
> +		for (int i = 1; i < (1 << order); i++) {
> +			page[i].mapping = NULL;
> +			clear_compound_head(&page[i]);
> +		}
> +	}
> +}

Gah.

> +#endif /* CONFIG_SWIOTLB */
> +
>  static inline void set_buddy_order(struct page *page, unsigned int order)
>  {
>  	set_page_private(page, order);
> @@ -2930,6 +2956,9 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
>  	unsigned long pfn = page_to_pfn(page);
>  	int migratetype;
>  
> +	if (unlikely(swiotlb_free_pages(page, false)))
> +		return;
> +

Oh my.

We shouldn't be handling random swiotlb stuff in the page allocator like that.

IIUC, you are writing your own pool+allocator and roughly mimicking what hugetlb +
ZONE_DEVICE do.

The creation+destruction of compound pages should very likely be factored out
from other code in a type-unspecific fashion, if really required.

You should probably look into

https://lore.kernel.org/all/20250318161823.4005529-2-tabba@google.com/

to see how to possibly hook into the page freeing path in a cleaner way.

-- 
Cheers,

David

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-15 23:42 [PATCH] swiotlb: avoid double copy with swiotlb on tx socket Luigi Rizzo
                   ` (4 preceding siblings ...)
  2026-06-16  8:36 ` David Hildenbrand (Arm)
@ 2026-06-16  9:20 ` Pedro Falcato
  2026-06-16  9:48   ` Luigi Rizzo
  2026-06-16 11:21 ` kernel test robot
  6 siblings, 1 reply; 12+ messages in thread
From: Pedro Falcato @ 2026-06-16  9:20 UTC (permalink / raw)
  To: Luigi Rizzo
  Cc: rizzo.unipi, m.szyprowski, robin.murphy, willemb, kuniyu, davem,
	edumazet, kuba, pabeni, gregkh, rafael, akpm, david, netdev,
	linux-mm, iommu, driver-core, linux-kernel,
	Jesper Dangaard Brouer, Ilias Apalodimas

(+cc page pool maintainers)
On Mon, Jun 15, 2026 at 11:42:20PM +0000, Luigi Rizzo wrote:
> The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
> especially with greedy senders, this has a high chance of happening in
> the softirq handler for tx network interrupts, creating a significant
> performance bottleneck.
> 
> Allow tx sockets to allocate socket buffers directly from the bounce
> buffers. This avoids the second copy and removes the above bottleneck.
> The fraction of swiotlb buffers allowed for this feature is set with
>    /sys/module/swiotlb/parameters/zerocopy_tx_percent
> (0 means disabled, 90 is the maximum, to avoid persistent I/O failures).
> 
> Implementation:
> - define a new page type to unambiguously identify bounce buffers used
>   as backing storage for socket buffers
> - modify skb_page_frag_refill to perform the modified allocation
> - modify the destructors __free_frozen_pages(), free_unref_folio() to
>   handle those pages and return them to the pool.
> 
> The savings are especially visible with fewer queues. In synthetic
> benchmarks, senders with 1-2 queues would cap around 50Gbps with
> conventional swiotlb, and reach over 170Gbps with the feature enabled.

I could be wrong, but I genuinely think that the way to go about this is
using page_pool for regular TX as well. page_pool pages are all dma-mapped
(so whatever swiotlb optimization you want can be done there), and the net
stack already has awareness of these special pages and special skbs, so it
won't Just Return Them back to the page allocator.

Otherwise you can easily go all over the place, and that's just not great.
Also this could possibly benefit setups that use IOMMU as well.

-- 
Pedro

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-16  9:20 ` Pedro Falcato
@ 2026-06-16  9:48   ` Luigi Rizzo
  2026-06-16 10:28     ` Pedro Falcato
  0 siblings, 1 reply; 12+ messages in thread
From: Luigi Rizzo @ 2026-06-16  9:48 UTC (permalink / raw)
  To: Pedro Falcato
  Cc: rizzo.unipi, m.szyprowski, robin.murphy, willemb, kuniyu, davem,
	edumazet, kuba, pabeni, gregkh, rafael, akpm, david, netdev,
	linux-mm, iommu, driver-core, linux-kernel,
	Jesper Dangaard Brouer, Ilias Apalodimas

On Tue, Jun 16, 2026 at 11:20 AM Pedro Falcato <pfalcato@suse.de> wrote:
>
> (+cc page pool maintainers)
> On Mon, Jun 15, 2026 at 11:42:20PM +0000, Luigi Rizzo wrote:
> > The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
> > especially with greedy senders, this has a high chance of happening in
> > the softirq handler for tx network interrupts, creating a significant
> > performance bottleneck.
> >
> > Allow tx sockets to allocate socket buffers directly from the bounce
> > buffers. This avoids the second copy and removes the above bottleneck.
> > The fraction of swiotlb buffers allowed for this feature is set with
> >    /sys/module/swiotlb/parameters/zerocopy_tx_percent
> > (0 means disabled, 90 is the maximum, to avoid persistent I/O failures).
> >
> > Implementation:
> > - define a new page type to unambiguously identify bounce buffers used
> >   as backing storage for socket buffers
> > - modify skb_page_frag_refill to perform the modified allocation
> > - modify the destructors __free_frozen_pages(), free_unref_folio() to
> >   handle those pages and return them to the pool.
> >
> > The savings are especially visible with fewer queues. In synthetic
> > benchmarks, senders with 1-2 queues would cap around 50Gbps with
> > conventional swiotlb, and reach over 170Gbps with the feature enabled.
>
> I could be wrong, but I genuinely think that the way to go about this is
> using page_pool for regular TX as well. page_pool pages are all dma-mapped
> (so whatever swiotlb optimization you want can be done there), and the net
> stack already has awareness of these special pages and special skbs, so it
> won't Just Return Them back to the page allocator.

I am not sure I follow your comment above, can you expand/clarify?

The problem I am dealing with is that the copy from the socket buffer
to the bounce buffer is done in the device xmit function. Under high
load it is almost always done by the tx softirq.
This means that even if we move the copy outside the HARD_TX_LOCK(),
it would still be almost completely serialized.
Hence the proposed method to make skb_page_frag_refill() directly
allocate a bounce buffer (under specific conditions) so there is a single copy
done directly to the dma-able buffer, and it is done in the user threads/CPUs
and is not serialized in the softirq thread.

I am not sure how page_pool on tx could help here.

cheers
luigi

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-16  9:48   ` Luigi Rizzo
@ 2026-06-16 10:28     ` Pedro Falcato
  0 siblings, 0 replies; 12+ messages in thread
From: Pedro Falcato @ 2026-06-16 10:28 UTC (permalink / raw)
  To: Luigi Rizzo
  Cc: rizzo.unipi, m.szyprowski, robin.murphy, willemb, kuniyu, davem,
	edumazet, kuba, pabeni, gregkh, rafael, akpm, david, netdev,
	linux-mm, iommu, driver-core, linux-kernel,
	Jesper Dangaard Brouer, Ilias Apalodimas

On Tue, Jun 16, 2026 at 11:48:36AM +0200, Luigi Rizzo wrote:
> On Tue, Jun 16, 2026 at 11:20 AM Pedro Falcato <pfalcato@suse.de> wrote:
> >
> > (+cc page pool maintainers)
> > On Mon, Jun 15, 2026 at 11:42:20PM +0000, Luigi Rizzo wrote:
> > > The use of swiotlb causes an extra data copy on I/O.  For tx sockets,
> > > especially with greedy senders, this has a high chance of happening in
> > > the softirq handler for tx network interrupts, creating a significant
> > > performance bottleneck.
> > >
> > > Allow tx sockets to allocate socket buffers directly from the bounce
> > > buffers. This avoids the second copy and removes the above bottleneck.
> > > The fraction of swiotlb buffers allowed for this feature is set with
> > >    /sys/module/swiotlb/parameters/zerocopy_tx_percent
> > > (0 means disabled, 90 is the maximum, to avoid persistent I/O failures).
> > >
> > > Implementation:
> > > - define a new page type to unambiguously identify bounce buffers used
> > >   as backing storage for socket buffers
> > > - modify skb_page_frag_refill to perform the modified allocation
> > > - modify the destructors __free_frozen_pages(), free_unref_folio() to
> > >   handle those pages and return them to the pool.
> > >
> > > The savings are especially visible with fewer queues. In synthetic
> > > benchmarks, senders with 1-2 queues would cap around 50Gbps with
> > > conventional swiotlb, and reach over 170Gbps with the feature enabled.
> >
> > I could be wrong, but I genuinely think that the way to go about this is
> > using page_pool for regular TX as well. page_pool pages are all dma-mapped
> > (so whatever swiotlb optimization you want can be done there), and the net
> > stack already has awareness of these special pages and special skbs, so it
> > won't Just Return Them back to the page allocator.
> 
> I am not sure I follow your comment above, can you expand/clarify?
> 
> The problem I am dealing with is that the copy from the socket buffer
> to the bounce buffer is done in the device xmit function. Under high
> load it is almost always done by the tx softirq.
> This means that even if we move the copy outside the HARD_TX_LOCK(),
> it would still be almost completely serialized.
> Hence the proposed method to make skb_page_frag_refill() directly
> allocate a bounce buffer (under specific conditions) so there is a single copy
> done directly to the dma-able buffer, and it is done in the user threads/CPUs
> and is not serialized in the softirq thread.
> 
> I am not sure how page_pool on tx could help here.

Page pool would provide both the means of passing around an iommu-mapped page,
and a concrete "this is where we allocate these pages" spot. Then introducing
a "zero-copy" swiotlb allocation would be a simple matter of introducing this
on page pool's side. In pseudo-code, something like:

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	
	if (pool->dma_map && /* is_swiotlb */) {
		page = swiotlb_alloc_pages(pool->p.nid, gfp, pool->p.order, ...);
		if (!page)
			return NULL;
		/* page is implicitly swiotlb mapped (well, _actually_ it's
		 * not that simple, because of the dma_mapped tracking that
		 * was introduced, but PoC anyway..). */
	} else {
		page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
		if (unlikely(!page))
			return NULL;

		if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) {
			put_page(page);
			return NULL;
		}
	}
}

(plus other spots, obviously). No copying should be required, and the
netmem desc will keep the dma_addr around. The network stack will notice
pp_recycle on all of these skbs and simply refuse to throw the pages away to
the page allocator.

In any case, it might be that this is not feasible for XYZ reasons, but I've
thought about this (making net use and reuse page pool pre-iommu-mapped pages
exclusively) for a while and I definitely see a lot of similarities with your
problem (that more or less reduces down to "I want to get an iommu-mapped page
from the get-go").

-- 
Pedro

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
  2026-06-15 23:42 [PATCH] swiotlb: avoid double copy with swiotlb on tx socket Luigi Rizzo
                   ` (5 preceding siblings ...)
  2026-06-16  9:20 ` Pedro Falcato
@ 2026-06-16 11:21 ` kernel test robot
  6 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2026-06-16 11:21 UTC (permalink / raw)
  To: Luigi Rizzo, rizzo.unipi, m.szyprowski, robin.murphy, willemb,
	kuniyu, davem, edumazet, kuba, pabeni
  Cc: oe-kbuild-all, gregkh, rafael, akpm, david, netdev, linux-mm,
	iommu, driver-core, linux-kernel

Hi Luigi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on linus/master v7.1 next-20260615]
[cannot apply to driver-core/driver-core-testing driver-core/driver-core-next driver-core/driver-core-linus]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Luigi-Rizzo/swiotlb-avoid-double-copy-with-swiotlb-on-tx-socket/20260616-074655
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20260615234220.3946885-1-lrizzo%40google.com
patch subject: [PATCH] swiotlb: avoid double copy with swiotlb on tx socket
config: arm-randconfig-r122-20260616 (https://download.01.org/0day-ci/archive/20260616/202606161921.OPkgBApm-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 16.1.0
sparse: v0.6.5-rc1
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260616/202606161921.OPkgBApm-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202606161921.OPkgBApm-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
   kernel/dma/swiotlb.c: note: in included file (through include/linux/dma-direct.h):
>> include/linux/swiotlb.h:229:65: sparse: sparse: Using plain integer as NULL pointer
>> include/linux/swiotlb.h:229:65: sparse: sparse: Using plain integer as NULL pointer

vim +229 include/linux/swiotlb.h

   224	
   225	static inline bool is_zerocopy_swiotlb_folio(struct page *page)
   226	{
   227		struct folio *folio = page_folio(page);
   228	
 > 229		return folio_test_zcswiotlb(folio) && folio->private != 0;
   230	}
   231	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2026-06-16 11:22 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-06-15 23:42 [PATCH] swiotlb: avoid double copy with swiotlb on tx socket Luigi Rizzo
2026-06-16  0:25 ` Jakub Kicinski
2026-06-16  0:33   ` Luigi Rizzo
2026-06-16 11:06     ` Mostafa Saleh
2026-06-16  4:17 ` Eric Dumazet
2026-06-16  5:31 ` kernel test robot
2026-06-16  8:01 ` kernel test robot
2026-06-16  8:36 ` David Hildenbrand (Arm)
2026-06-16  9:20 ` Pedro Falcato
2026-06-16  9:48   ` Luigi Rizzo
2026-06-16 10:28     ` Pedro Falcato
2026-06-16 11:21 ` kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.