* [PATCH net-next 0/2] net: devmem: improve cpu cost of RX token management
@ 2025-09-02 21:36 Bobby Eshleman
  2025-09-02 21:36 ` [PATCH net-next 1/2] net: devmem: rename tx_vec to vec in dmabuf binding Bobby Eshleman
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Bobby Eshleman @ 2025-09-02 21:36 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Kuniyuki Iwashima, Willem de Bruijn, Neal Cardwell,
	David Ahern
  Cc: netdev, linux-kernel, Stanislav Fomichev, Mina Almasry,
	Bobby Eshleman

This series reduces the CPU cost of RX token management by replacing
the xarray allocator with a plain array of atomics. Similar to devmem
TX's page-index lookup scheme for niovs, RX now uses page indices to
look up the corresponding atomic in the array.

The improvement is ~5% CPU per RX user thread.

Two other approaches were tested but showed no improvement: 1) using a
hashmap for tokens, and 2) keeping an xarray of atomic counters but
using RCU so that the hot path could be mostly lockless. Neither proved
better than the simple array in terms of CPU.
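
For reference, a minimal sketch of the lookup scheme (illustrative
only; the names follow patch 2, and binding setup and error handling
are omitted):

	/* token == page index of the niov within the dmabuf */
	u32 token = net_iov_virtual_addr(niov) >> PAGE_SHIFT;

	/* recvmsg(): hand the token to userspace, count the reference */
	atomic_inc(&sk->sk_user_frags.urefs[token]);

	/* SO_DEVMEM_DONTNEED: direct array access instead of xa_erase() */
	niov = sk->sk_user_frags.binding->vec[token];
	if (atomic_dec_if_positive(&sk->sk_user_frags.urefs[token]) >= 0)
		napi_pp_put_page(net_iov_to_netmem(niov));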

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
---
Bobby Eshleman (2):
      net: devmem: rename tx_vec to vec in dmabuf binding
      net: devmem: use niov array for token management

 include/net/sock.h       |   5 ++-
 net/core/devmem.c        |  31 +++++++-------
 net/core/devmem.h        |   4 +-
 net/core/sock.c          |  24 +++++++----
 net/ipv4/tcp.c           | 107 +++++++++++++++--------------------------------
 net/ipv4/tcp_ipv4.c      |  40 +++++++++++++++---
 net/ipv4/tcp_minisocks.c |   2 -
 7 files changed, 107 insertions(+), 106 deletions(-)
---
base-commit: cd8a4cfa6bb43a441901e82f5c222dddc75a18a3
change-id: 20250829-scratch-bobbyeshleman-devmem-tcp-token-upstream-292be174d503

Best regards,
-- 
Bobby Eshleman <bobbyeshleman@meta.com>



* [PATCH net-next 1/2] net: devmem: rename tx_vec to vec in dmabuf binding
  2025-09-02 21:36 [PATCH net-next 0/2] net: devmem: improve cpu cost of RX token management Bobby Eshleman
@ 2025-09-02 21:36 ` Bobby Eshleman
  2025-09-02 21:36 ` [PATCH net-next 2/2] net: devmem: use niov array for token management Bobby Eshleman
  2025-09-03 17:46 ` [syzbot ci] Re: net: devmem: improve cpu cost of RX " syzbot ci
  2 siblings, 0 replies; 7+ messages in thread
From: Bobby Eshleman @ 2025-09-02 21:36 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Kuniyuki Iwashima, Willem de Bruijn, Neal Cardwell,
	David Ahern
  Cc: netdev, linux-kernel, Stanislav Fomichev, Mina Almasry,
	Bobby Eshleman

From: Bobby Eshleman <bobbyeshleman@meta.com>

Rename the 'tx_vec' field in struct net_devmem_dmabuf_binding to 'vec'.
This field holds pointers to net_iov structures. The rename prepares for
reusing 'vec' for both TX and RX directions.

No functional change intended.

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
---
 net/core/devmem.c | 22 +++++++++++-----------
 net/core/devmem.h |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/core/devmem.c b/net/core/devmem.c
index d9de31a6cc7f..b4c570d4f37a 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -74,7 +74,7 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
 	dma_buf_detach(binding->dmabuf, binding->attachment);
 	dma_buf_put(binding->dmabuf);
 	xa_destroy(&binding->bound_rxqs);
-	kvfree(binding->tx_vec);
+	kvfree(binding->vec);
 	kfree(binding);
 }
 
@@ -231,10 +231,10 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 	}
 
 	if (direction == DMA_TO_DEVICE) {
-		binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
-						 sizeof(struct net_iov *),
-						 GFP_KERNEL);
-		if (!binding->tx_vec) {
+		binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+					      sizeof(struct net_iov *),
+					      GFP_KERNEL);
+		if (!binding->vec) {
 			err = -ENOMEM;
 			goto err_unmap;
 		}
@@ -248,7 +248,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 					      dev_to_node(&dev->dev));
 	if (!binding->chunk_pool) {
 		err = -ENOMEM;
-		goto err_tx_vec;
+		goto err_vec;
 	}
 
 	virtual = 0;
@@ -294,7 +294,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 			page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
 						      net_devmem_get_dma_addr(niov));
 			if (direction == DMA_TO_DEVICE)
-				binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
+				binding->vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
 		}
 
 		virtual += len;
@@ -314,8 +314,8 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 	gen_pool_for_each_chunk(binding->chunk_pool,
 				net_devmem_dmabuf_free_chunk_owner, NULL);
 	gen_pool_destroy(binding->chunk_pool);
-err_tx_vec:
-	kvfree(binding->tx_vec);
+err_vec:
+	kvfree(binding->vec);
 err_unmap:
 	dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
 					  direction);
@@ -361,7 +361,7 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
 	int err = 0;
 
 	binding = net_devmem_lookup_dmabuf(dmabuf_id);
-	if (!binding || !binding->tx_vec) {
+	if (!binding || !binding->vec) {
 		err = -EINVAL;
 		goto out_err;
 	}
@@ -393,7 +393,7 @@ net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
 	*off = virt_addr % PAGE_SIZE;
 	*size = PAGE_SIZE - *off;
 
-	return binding->tx_vec[virt_addr / PAGE_SIZE];
+	return binding->vec[virt_addr / PAGE_SIZE];
 }
 
 /*** "Dmabuf devmem memory provider" ***/
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 101150d761af..2ada54fb63d7 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -63,7 +63,7 @@ struct net_devmem_dmabuf_binding {
 	 * address. This array is convenient to map the virtual addresses to
 	 * net_iovs in the TX path.
 	 */
-	struct net_iov **tx_vec;
+	struct net_iov **vec;
 
 	struct work_struct unbind_w;
 };

-- 
2.47.3



* [PATCH net-next 2/2] net: devmem: use niov array for token management
  2025-09-02 21:36 [PATCH net-next 0/2] net: devmem: improve cpu cost of RX token management Bobby Eshleman
  2025-09-02 21:36 ` [PATCH net-next 1/2] net: devmem: rename tx_vec to vec in dmabuf binding Bobby Eshleman
@ 2025-09-02 21:36 ` Bobby Eshleman
  2025-09-03 10:50   ` kernel test robot
  2025-09-03 20:20   ` Mina Almasry
  2025-09-03 17:46 ` [syzbot ci] Re: net: devmem: improve cpu cost of RX " syzbot ci
  2 siblings, 2 replies; 7+ messages in thread
From: Bobby Eshleman @ 2025-09-02 21:36 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Kuniyuki Iwashima, Willem de Bruijn, Neal Cardwell,
	David Ahern
  Cc: netdev, linux-kernel, Stanislav Fomichev, Mina Almasry,
	Bobby Eshleman

From: Bobby Eshleman <bobbyeshleman@meta.com>

Improve the CPU performance of devmem token management by using dmabuf
page indices as tokens and looking them up with direct array accesses
instead of xarray lookups. Consequently, the xarray can be removed. The
result is an average 5% reduction in CPU cycles spent by devmem RX user
threads.

This patch changes the meaning of tokens. Tokens previously referred
to unique fragments of pages; they now represent references to pages,
not fragments. Because of this, multiple tokens may refer to the same
page and so have identical values (e.g., two small fragments may
coexist on the same page). The (token, offset) pair that the user
receives still uniquely identifies a fragment if needed. This assumes
that the user is not attempting to sort / uniq the token list using
tokens alone.
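
To illustrate (simplified; the token derivation matches this patch):
two fragments that land on the same dmabuf page produce the same
token, and only the (token, frag_offset) pair distinguishes them:

	u32 token = net_iov_virtual_addr(niov) >> PAGE_SHIFT;

	/* frag A: dmabuf offset N * 4096 + 0x000 -> token N, frag_offset ...000 */
	/* frag B: dmabuf offset N * 4096 + 0x100 -> token N, frag_offset ...100 */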

A new restriction is added to the implementation: devmem RX sockets
cannot switch dmabuf bindings. In practice this is a symptom of invalid
configuration, as the flow would have to be steered to a different
queue or device with a different binding, which is generally bad for
TCP flows. The restriction is necessary because the 32-bit dmabuf token
does not have enough bits to encode both the pages of a large dmabuf
and a binding or dmabuf ID. For example, a system with 8 NICs of 32
queues each requires 8 bits for a binding / queue ID (8 NICs * 32
queues == 256 queues total == 2^8), which leaves only 24 bits for
dmabuf pages (2^24 * 4096 / (1<<30) == 64GB). That is insufficient for
the device and queue counts on many current systems, or for systems
that need larger GPU dmabufs (as a point of reference, my current H100
has 80GB of GPU memory per device).
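
Spelling out that arithmetic (assuming 4KB pages and a 32-bit token):

	8 NICs * 32 queues            = 256 bindings  -> 8 bits of ID
	32 bits - 8 bits              = 24 bits left for the page index
	2^24 pages * 4096 bytes/page  = 64GB of addressable dmabuf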

Using kperf[1] with 4 flows and 4 workers, this patch improves receive
worker CPU utilization by ~4.9%, with slightly better throughput.

Before, mean CPU util for RX workers is ~83.6%:

Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
Average:       4    2.30    0.00   79.43    0.00    0.65    0.21    0.00    0.00    0.00   17.41
Average:       5    2.27    0.00   80.40    0.00    0.45    0.21    0.00    0.00    0.00   16.67
Average:       6    2.28    0.00   80.47    0.00    0.46    0.25    0.00    0.00    0.00   16.54
Average:       7    2.42    0.00   82.05    0.00    0.46    0.21    0.00    0.00    0.00   14.86

After, mean CPU util for RX workers is ~78.7%:

Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
Average:       4    2.61    0.00   73.31    0.00    0.76    0.11    0.00    0.00    0.00   23.20
Average:       5    2.95    0.00   74.24    0.00    0.66    0.22    0.00    0.00    0.00   21.94
Average:       6    2.81    0.00   73.38    0.00    0.97    0.11    0.00    0.00    0.00   22.73
Average:       7    3.05    0.00   78.76    0.00    0.76    0.11    0.00    0.00    0.00   17.32

Mean throughput improves, but the difference falls within one standard
deviation (~45GB/s for 4 flows on a 50GB/s NIC, one hop).

This patch adds an array of atomics for counting the tokens returned
to the user for a given page. There is one 4-byte atomic per dmabuf
page, per socket. Given a 2GB dmabuf, this array is 2MB.
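
For reference, the sizing works out as:

	2GB dmabuf / 4KB pages  = 524288 pages
	524288 pages * 4 bytes  = 2MB of urefs per socket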

[1]: https://github.com/facebookexperimental/kperf

Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
---
 include/net/sock.h       |   5 ++-
 net/core/devmem.c        |  17 ++++----
 net/core/devmem.h        |   2 +-
 net/core/sock.c          |  24 +++++++----
 net/ipv4/tcp.c           | 107 +++++++++++++++--------------------------------
 net/ipv4/tcp_ipv4.c      |  40 +++++++++++++++---
 net/ipv4/tcp_minisocks.c |   2 -
 7 files changed, 99 insertions(+), 98 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 1e7f124871d2..70c97880229d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -573,7 +573,10 @@ struct sock {
 #endif
 	struct rcu_head		sk_rcu;
 	netns_tracker		ns_tracker;
-	struct xarray		sk_user_frags;
+	struct {
+		struct net_devmem_dmabuf_binding	*binding;
+		atomic_t				*urefs;
+	} sk_user_frags;
 
 #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
 	struct module		*sk_owner;
diff --git a/net/core/devmem.c b/net/core/devmem.c
index b4c570d4f37a..50e92dcf5bf1 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -187,6 +187,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 	struct dma_buf *dmabuf;
 	unsigned int sg_idx, i;
 	unsigned long virtual;
+	gfp_t flags;
 	int err;
 
 	if (!dma_dev) {
@@ -230,14 +231,14 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 		goto err_detach;
 	}
 
-	if (direction == DMA_TO_DEVICE) {
-		binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
-					      sizeof(struct net_iov *),
-					      GFP_KERNEL);
-		if (!binding->vec) {
-			err = -ENOMEM;
-			goto err_unmap;
-		}
+	flags = (direction == DMA_FROM_DEVICE) ? __GFP_ZERO : 0;
+
+	binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+				      sizeof(struct net_iov *),
+				      GFP_KERNEL | flags);
+	if (!binding->vec) {
+		err = -ENOMEM;
+		goto err_unmap;
 	}
 
 	/* For simplicity we expect to make PAGE_SIZE allocations, but the
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 2ada54fb63d7..d4eb28d079bb 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -61,7 +61,7 @@ struct net_devmem_dmabuf_binding {
 
 	/* Array of net_iov pointers for this binding, sorted by virtual
 	 * address. This array is convenient to map the virtual addresses to
-	 * net_iovs in the TX path.
+	 * net_iovs.
 	 */
 	struct net_iov **vec;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 9a8290fcc35d..3a5cb4e10519 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -87,6 +87,7 @@
 
 #include <linux/unaligned.h>
 #include <linux/capability.h>
+#include <linux/dma-buf.h>
 #include <linux/errno.h>
 #include <linux/errqueue.h>
 #include <linux/types.h>
@@ -151,6 +152,7 @@
 #include <uapi/linux/pidfd.h>
 
 #include "dev.h"
+#include "devmem.h"
 
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
@@ -1100,32 +1102,40 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
 		return -EFAULT;
 	}
 
-	xa_lock_bh(&sk->sk_user_frags);
 	for (i = 0; i < num_tokens; i++) {
 		for (j = 0; j < tokens[i].token_count; j++) {
+			struct net_iov *niov;
+			unsigned int token;
+			netmem_ref netmem;
+
+			token = tokens[i].token_start + j;
+			if (WARN_ONCE(token >= sk->sk_user_frags.binding->dmabuf->size / PAGE_SIZE,
+				      "invalid token passed from user"))
+				break;
+
 			if (++num_frags > MAX_DONTNEED_FRAGS)
 				goto frag_limit_reached;
-
-			netmem_ref netmem = (__force netmem_ref)__xa_erase(
-				&sk->sk_user_frags, tokens[i].token_start + j);
+			niov = sk->sk_user_frags.binding->vec[token];
+			netmem = net_iov_to_netmem(niov);
 
 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
 				continue;
 
+			if (WARN_ONCE(atomic_dec_if_positive(&sk->sk_user_frags.urefs[token])
+						< 0, "user released token too many times"))
+				continue;
+
 			netmems[netmem_num++] = netmem;
 			if (netmem_num == ARRAY_SIZE(netmems)) {
-				xa_unlock_bh(&sk->sk_user_frags);
 				for (k = 0; k < netmem_num; k++)
 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
 				netmem_num = 0;
-				xa_lock_bh(&sk->sk_user_frags);
 			}
 			ret++;
 		}
 	}
 
 frag_limit_reached:
-	xa_unlock_bh(&sk->sk_user_frags);
 	for (k = 0; k < netmem_num; k++)
 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 40b774b4f587..585b50fa8c00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -261,6 +261,7 @@
 #include <linux/memblock.h>
 #include <linux/highmem.h>
 #include <linux/cache.h>
+#include <linux/dma-buf.h>
 #include <linux/err.h>
 #include <linux/time.h>
 #include <linux/slab.h>
@@ -475,7 +476,8 @@ void tcp_init_sock(struct sock *sk)
 
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 	sk_sockets_allocated_inc(sk);
-	xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
+	sk->sk_user_frags.binding = NULL;
+	sk->sk_user_frags.urefs = NULL;
 }
 EXPORT_IPV6_MOD(tcp_init_sock);
 
@@ -2386,68 +2388,6 @@ static int tcp_inq_hint(struct sock *sk)
 	return inq;
 }
 
-/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */
-struct tcp_xa_pool {
-	u8		max; /* max <= MAX_SKB_FRAGS */
-	u8		idx; /* idx <= max */
-	__u32		tokens[MAX_SKB_FRAGS];
-	netmem_ref	netmems[MAX_SKB_FRAGS];
-};
-
-static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
-{
-	int i;
-
-	/* Commit part that has been copied to user space. */
-	for (i = 0; i < p->idx; i++)
-		__xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY,
-			     (__force void *)p->netmems[i], GFP_KERNEL);
-	/* Rollback what has been pre-allocated and is no longer needed. */
-	for (; i < p->max; i++)
-		__xa_erase(&sk->sk_user_frags, p->tokens[i]);
-
-	p->max = 0;
-	p->idx = 0;
-}
-
-static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p)
-{
-	if (!p->max)
-		return;
-
-	xa_lock_bh(&sk->sk_user_frags);
-
-	tcp_xa_pool_commit_locked(sk, p);
-
-	xa_unlock_bh(&sk->sk_user_frags);
-}
-
-static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
-			      unsigned int max_frags)
-{
-	int err, k;
-
-	if (p->idx < p->max)
-		return 0;
-
-	xa_lock_bh(&sk->sk_user_frags);
-
-	tcp_xa_pool_commit_locked(sk, p);
-
-	for (k = 0; k < max_frags; k++) {
-		err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k],
-				 XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
-		if (err)
-			break;
-	}
-
-	xa_unlock_bh(&sk->sk_user_frags);
-
-	p->max = k;
-	p->idx = 0;
-	return k ? 0 : err;
-}
-
 /* On error, returns the -errno. On success, returns number of bytes sent to the
  * user. May not consume all of @remaining_len.
  */
@@ -2456,14 +2396,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			      int remaining_len)
 {
 	struct dmabuf_cmsg dmabuf_cmsg = { 0 };
-	struct tcp_xa_pool tcp_xa_pool;
 	unsigned int start;
 	int i, copy, n;
 	int sent = 0;
 	int err = 0;
 
-	tcp_xa_pool.max = 0;
-	tcp_xa_pool.idx = 0;
 	do {
 		start = skb_headlen(skb);
 
@@ -2510,8 +2447,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 		 */
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			struct net_devmem_dmabuf_binding *binding;
 			struct net_iov *niov;
 			u64 frag_offset;
+			size_t size;
+			u32 token;
 			int end;
 
 			/* !skb_frags_readable() should indicate that ALL the
@@ -2544,13 +2484,35 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 					      start;
 				dmabuf_cmsg.frag_offset = frag_offset;
 				dmabuf_cmsg.frag_size = copy;
-				err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
-							 skb_shinfo(skb)->nr_frags - i);
-				if (err)
+
+				binding = net_devmem_iov_binding(niov);
+
+				if (!sk->sk_user_frags.binding) {
+					sk->sk_user_frags.binding = binding;
+
+					size = binding->dmabuf->size / PAGE_SIZE;
+					sk->sk_user_frags.urefs = kzalloc(size,
+									  GFP_KERNEL);
+					if (!sk->sk_user_frags.urefs) {
+						sk->sk_user_frags.binding = NULL;
+						err = -ENOMEM;
+						goto out;
+					}
+
+					net_devmem_dmabuf_binding_get(binding);
+				}
+
+				if (WARN_ONCE(sk->sk_user_frags.binding != binding,
+					      "binding changed for devmem socket")) {
+					err = -EFAULT;
 					goto out;
+				}
+
+				token = net_iov_virtual_addr(niov) >> PAGE_SHIFT;
+				binding->vec[token] = niov;
+				dmabuf_cmsg.frag_token = token;
 
 				/* Will perform the exchange later */
-				dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
 				dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
 
 				offset += copy;
@@ -2563,8 +2525,9 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 				if (err)
 					goto out;
 
+				atomic_inc(&sk->sk_user_frags.urefs[token]);
+
 				atomic_long_inc(&niov->pp_ref_count);
-				tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
 
 				sent += copy;
 
@@ -2574,7 +2537,6 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			start = end;
 		}
 
-		tcp_xa_pool_commit(sk, &tcp_xa_pool);
 		if (!remaining_len)
 			goto out;
 
@@ -2592,7 +2554,6 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 	}
 
 out:
-	tcp_xa_pool_commit(sk, &tcp_xa_pool);
 	if (!sent)
 		sent = err;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e58a8a9ff7a..bdcb8cc003af 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -87,6 +87,9 @@
 #include <crypto/hash.h>
 #include <linux/scatterlist.h>
 
+#include <linux/dma-buf.h>
+#include "../core/devmem.h"
+
 #include <trace/events/tcp.h>
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -2529,11 +2532,38 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
 static void tcp_release_user_frags(struct sock *sk)
 {
 #ifdef CONFIG_PAGE_POOL
-	unsigned long index;
-	void *netmem;
+	struct net_devmem_dmabuf_binding *binding;
+	struct net_iov *niov;
+	unsigned int token;
+	netmem_ref netmem;
+
+	if (!sk->sk_user_frags.urefs)
+		return;
+
+	binding = sk->sk_user_frags.binding;
+	if (!binding || !binding->vec)
+		return;
+
+	for (token = 0; token < binding->dmabuf->size / PAGE_SIZE; token++) {
+		niov = binding->vec[token];
+
+		/* never used by recvmsg() */
+		if (!niov)
+			continue;
+
+		if (!net_is_devmem_iov(niov))
+			continue;
+
+		netmem = net_iov_to_netmem(niov);
 
-	xa_for_each(&sk->sk_user_frags, index, netmem)
-		WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
+		while (atomic_dec_return(&sk->sk_user_frags.urefs[token]) >= 0)
+			WARN_ON_ONCE(!napi_pp_put_page(netmem));
+	}
+
+	net_devmem_dmabuf_binding_put(binding);
+	sk->sk_user_frags.binding = NULL;
+	kvfree(sk->sk_user_frags.urefs);
+	sk->sk_user_frags.urefs = NULL;
 #endif
 }
 
@@ -2543,8 +2573,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	tcp_release_user_frags(sk);
 
-	xa_destroy(&sk->sk_user_frags);
-
 	trace_tcp_destroy_sock(sk);
 
 	tcp_clear_xmit_timers(sk);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d1c9e4088646..4e8ea73daab7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -639,8 +639,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 
-	xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);
-
 	return newsk;
 }
 EXPORT_SYMBOL(tcp_create_openreq_child);

-- 
2.47.3



* Re: [PATCH net-next 2/2] net: devmem: use niov array for token management
  2025-09-02 21:36 ` [PATCH net-next 2/2] net: devmem: use niov array for token management Bobby Eshleman
@ 2025-09-03 10:50   ` kernel test robot
  2025-09-03 20:20   ` Mina Almasry
  1 sibling, 0 replies; 7+ messages in thread
From: kernel test robot @ 2025-09-03 10:50 UTC (permalink / raw)
  To: Bobby Eshleman, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Kuniyuki Iwashima, Willem de Bruijn,
	Neal Cardwell, David Ahern
  Cc: oe-kbuild-all, netdev, linux-kernel, Stanislav Fomichev,
	Mina Almasry, Bobby Eshleman

Hi Bobby,

kernel test robot noticed the following build errors:

[auto build test ERROR on cd8a4cfa6bb43a441901e82f5c222dddc75a18a3]

url:    https://github.com/intel-lab-lkp/linux/commits/Bobby-Eshleman/net-devmem-rename-tx_vec-to-vec-in-dmabuf-binding/20250903-054553
base:   cd8a4cfa6bb43a441901e82f5c222dddc75a18a3
patch link:    https://lore.kernel.org/r/20250902-scratch-bobbyeshleman-devmem-tcp-token-upstream-v1-2-d946169b5550%40meta.com
patch subject: [PATCH net-next 2/2] net: devmem: use niov array for token management
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20250903/202509031855.54vuvsX1-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250903/202509031855.54vuvsX1-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509031855.54vuvsX1-lkp@intel.com/

All errors (new ones prefixed by >>):

   net/ipv4/tcp.c: In function 'tcp_recvmsg_dmabuf':
>> net/ipv4/tcp.c:2502:41: error: implicit declaration of function 'net_devmem_dmabuf_binding_get'; did you mean 'net_devmem_dmabuf_binding_put'? [-Werror=implicit-function-declaration]
    2502 |                                         net_devmem_dmabuf_binding_get(binding);
         |                                         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         |                                         net_devmem_dmabuf_binding_put
   cc1: some warnings being treated as errors


vim +2502 net/ipv4/tcp.c

  2390	
  2391	/* On error, returns the -errno. On success, returns number of bytes sent to the
  2392	 * user. May not consume all of @remaining_len.
  2393	 */
  2394	static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
  2395				      unsigned int offset, struct msghdr *msg,
  2396				      int remaining_len)
  2397	{
  2398		struct dmabuf_cmsg dmabuf_cmsg = { 0 };
  2399		unsigned int start;
  2400		int i, copy, n;
  2401		int sent = 0;
  2402		int err = 0;
  2403	
  2404		do {
  2405			start = skb_headlen(skb);
  2406	
  2407			if (skb_frags_readable(skb)) {
  2408				err = -ENODEV;
  2409				goto out;
  2410			}
  2411	
  2412			/* Copy header. */
  2413			copy = start - offset;
  2414			if (copy > 0) {
  2415				copy = min(copy, remaining_len);
  2416	
  2417				n = copy_to_iter(skb->data + offset, copy,
  2418						 &msg->msg_iter);
  2419				if (n != copy) {
  2420					err = -EFAULT;
  2421					goto out;
  2422				}
  2423	
  2424				offset += copy;
  2425				remaining_len -= copy;
  2426	
  2427				/* First a dmabuf_cmsg for # bytes copied to user
  2428				 * buffer.
  2429				 */
  2430				memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
  2431				dmabuf_cmsg.frag_size = copy;
  2432				err = put_cmsg_notrunc(msg, SOL_SOCKET,
  2433						       SO_DEVMEM_LINEAR,
  2434						       sizeof(dmabuf_cmsg),
  2435						       &dmabuf_cmsg);
  2436				if (err)
  2437					goto out;
  2438	
  2439				sent += copy;
  2440	
  2441				if (remaining_len == 0)
  2442					goto out;
  2443			}
  2444	
  2445			/* after that, send information of dmabuf pages through a
  2446			 * sequence of cmsg
  2447			 */
  2448			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
  2449				skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
  2450				struct net_devmem_dmabuf_binding *binding;
  2451				struct net_iov *niov;
  2452				u64 frag_offset;
  2453				size_t size;
  2454				u32 token;
  2455				int end;
  2456	
  2457				/* !skb_frags_readable() should indicate that ALL the
  2458				 * frags in this skb are dmabuf net_iovs. We're checking
  2459				 * for that flag above, but also check individual frags
  2460				 * here. If the tcp stack is not setting
  2461				 * skb_frags_readable() correctly, we still don't want
  2462				 * to crash here.
  2463				 */
  2464				if (!skb_frag_net_iov(frag)) {
  2465					net_err_ratelimited("Found non-dmabuf skb with net_iov");
  2466					err = -ENODEV;
  2467					goto out;
  2468				}
  2469	
  2470				niov = skb_frag_net_iov(frag);
  2471				if (!net_is_devmem_iov(niov)) {
  2472					err = -ENODEV;
  2473					goto out;
  2474				}
  2475	
  2476				end = start + skb_frag_size(frag);
  2477				copy = end - offset;
  2478	
  2479				if (copy > 0) {
  2480					copy = min(copy, remaining_len);
  2481	
  2482					frag_offset = net_iov_virtual_addr(niov) +
  2483						      skb_frag_off(frag) + offset -
  2484						      start;
  2485					dmabuf_cmsg.frag_offset = frag_offset;
  2486					dmabuf_cmsg.frag_size = copy;
  2487	
  2488					binding = net_devmem_iov_binding(niov);
  2489	
  2490					if (!sk->sk_user_frags.binding) {
  2491						sk->sk_user_frags.binding = binding;
  2492	
  2493						size = binding->dmabuf->size / PAGE_SIZE;
  2494						sk->sk_user_frags.urefs = kzalloc(size,
  2495										  GFP_KERNEL);
  2496						if (!sk->sk_user_frags.urefs) {
  2497							sk->sk_user_frags.binding = NULL;
  2498							err = -ENOMEM;
  2499							goto out;
  2500						}
  2501	
> 2502						net_devmem_dmabuf_binding_get(binding);
  2503					}
  2504	
  2505					if (WARN_ONCE(sk->sk_user_frags.binding != binding,
  2506						      "binding changed for devmem socket")) {
  2507						err = -EFAULT;
  2508						goto out;
  2509					}
  2510	
  2511					token = net_iov_virtual_addr(niov) >> PAGE_SHIFT;
  2512					binding->vec[token] = niov;
  2513					dmabuf_cmsg.frag_token = token;
  2514	
  2515					/* Will perform the exchange later */
  2516					dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
  2517	
  2518					offset += copy;
  2519					remaining_len -= copy;
  2520	
  2521					err = put_cmsg_notrunc(msg, SOL_SOCKET,
  2522							       SO_DEVMEM_DMABUF,
  2523							       sizeof(dmabuf_cmsg),
  2524							       &dmabuf_cmsg);
  2525					if (err)
  2526						goto out;
  2527	
  2528					atomic_inc(&sk->sk_user_frags.urefs[token]);
  2529	
  2530					atomic_long_inc(&niov->pp_ref_count);
  2531	
  2532					sent += copy;
  2533	
  2534					if (remaining_len == 0)
  2535						goto out;
  2536				}
  2537				start = end;
  2538			}
  2539	
  2540			if (!remaining_len)
  2541				goto out;
  2542	
  2543			/* if remaining_len is not satisfied yet, we need to go to the
  2544			 * next frag in the frag_list to satisfy remaining_len.
  2545			 */
  2546			skb = skb_shinfo(skb)->frag_list ?: skb->next;
  2547	
  2548			offset = offset - start;
  2549		} while (skb);
  2550	
  2551		if (remaining_len) {
  2552			err = -EFAULT;
  2553			goto out;
  2554		}
  2555	
  2556	out:
  2557		if (!sent)
  2558			sent = err;
  2559	
  2560		return sent;
  2561	}
  2562	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


* [syzbot ci] Re: net: devmem: improve cpu cost of RX token management
  2025-09-02 21:36 [PATCH net-next 0/2] net: devmem: improve cpu cost of RX token management Bobby Eshleman
  2025-09-02 21:36 ` [PATCH net-next 1/2] net: devmem: rename tx_vec to vec in dmabuf binding Bobby Eshleman
  2025-09-02 21:36 ` [PATCH net-next 2/2] net: devmem: use niov array for token management Bobby Eshleman
@ 2025-09-03 17:46 ` syzbot ci
  2 siblings, 0 replies; 7+ messages in thread
From: syzbot ci @ 2025-09-03 17:46 UTC (permalink / raw)
  To: almasrymina, bobbyeshleman, bobbyeshleman, davem, dsahern,
	edumazet, horms, kuba, kuniyu, linux-kernel, ncardwell, netdev,
	pabeni, sdf, willemb
  Cc: syzbot, syzkaller-bugs

syzbot ci has tested the following series

[v1] net: devmem: improve cpu cost of RX token management
https://lore.kernel.org/all/20250902-scratch-bobbyeshleman-devmem-tcp-token-upstream-v1-0-d946169b5550@meta.com
* [PATCH net-next 1/2] net: devmem: rename tx_vec to vec in dmabuf binding
* [PATCH net-next 2/2] net: devmem: use niov array for token management

and found the following issue:
general protection fault in sock_devmem_dontneed

Full report is available here:
https://ci.syzbot.org/series/c0dc7223-4222-461c-b04b-b6f0004c7509

***

general protection fault in sock_devmem_dontneed

tree:      net-next
URL:       https://kernel.googlesource.com/pub/scm/linux/kernel/git/netdev/net-next.git
base:      864ecc4a6dade82d3f70eab43dad0e277aa6fc78
arch:      amd64
compiler:  Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
config:    https://ci.syzbot.org/builds/6c521908-0a73-48bc-a0fd-2576cf868361/config
C repro:   https://ci.syzbot.org/findings/9374f388-d643-42ea-831b-872eb5000b3c/c_repro
syz repro: https://ci.syzbot.org/findings/9374f388-d643-42ea-831b-872eb5000b3c/syz_repro

Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN PTI
KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
CPU: 0 UID: 0 PID: 5993 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:sock_devmem_dontneed+0x3fd/0x960 net/core/sock.c:1112
Code: 8b 44 24 40 44 8b 28 44 03 6c 24 14 48 8b 44 24 20 42 80 3c 20 00 74 08 4c 89 ff e8 4d eb c9 f8 4d 8b 3f 4c 89 f8 48 c1 e8 03 <42> 80 3c 20 00 74 08 4c 89 ff e8 34 eb c9 f8 4d 8b 3f 4c 89 f8 48
RSP: 0018:ffffc90001c4fac0 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 1ffff11004ddcfe0
RDX: ffff88801f6a8000 RSI: 0000000000000400 RDI: 0000000000000000
RBP: ffffc90001c4fc50 R08: ffffc90001c4fbdf R09: 0000000000000000
R10: ffffc90001c4fb60 R11: fffff52000389f7c R12: dffffc0000000000
R13: 00000000ffffffff R14: 0000000000000000 R15: 0000000000000000
FS:  000055558634d500(0000) GS:ffff8880b8614000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b30e63fff CR3: 0000000026b68000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 sk_setsockopt+0x682/0x2dc0 net/core/sock.c:1302
 do_sock_setsockopt+0x11b/0x1b0 net/socket.c:2340
 __sys_setsockopt net/socket.c:2369 [inline]
 __do_sys_setsockopt net/socket.c:2375 [inline]
 __se_sys_setsockopt net/socket.c:2372 [inline]
 __x64_sys_setsockopt+0x13f/0x1b0 net/socket.c:2372
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fea2bb8ebe9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffc81c63348 EFLAGS: 00000246 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 00007fea2bdc5fa0 RCX: 00007fea2bb8ebe9
RDX: 0000000000000050 RSI: 0000000000000001 RDI: 0000000000000003
RBP: 00007fea2bc11e19 R08: 0000000000000048 R09: 0000000000000000
R10: 0000200000000100 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fea2bdc5fa0 R14: 00007fea2bdc5fa0 R15: 0000000000000005
 </TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:sock_devmem_dontneed+0x3fd/0x960 net/core/sock.c:1112
Code: 8b 44 24 40 44 8b 28 44 03 6c 24 14 48 8b 44 24 20 42 80 3c 20 00 74 08 4c 89 ff e8 4d eb c9 f8 4d 8b 3f 4c 89 f8 48 c1 e8 03 <42> 80 3c 20 00 74 08 4c 89 ff e8 34 eb c9 f8 4d 8b 3f 4c 89 f8 48
RSP: 0018:ffffc90001c4fac0 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 1ffff11004ddcfe0
RDX: ffff88801f6a8000 RSI: 0000000000000400 RDI: 0000000000000000
RBP: ffffc90001c4fc50 R08: ffffc90001c4fbdf R09: 0000000000000000
R10: ffffc90001c4fb60 R11: fffff52000389f7c R12: dffffc0000000000
R13: 00000000ffffffff R14: 0000000000000000 R15: 0000000000000000
FS:  000055558634d500(0000) GS:ffff8880b8614000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000001b30e63fff CR3: 0000000026b68000 CR4: 00000000000006f0
----------------
Code disassembly (best guess):
   0:	8b 44 24 40          	mov    0x40(%rsp),%eax
   4:	44 8b 28             	mov    (%rax),%r13d
   7:	44 03 6c 24 14       	add    0x14(%rsp),%r13d
   c:	48 8b 44 24 20       	mov    0x20(%rsp),%rax
  11:	42 80 3c 20 00       	cmpb   $0x0,(%rax,%r12,1)
  16:	74 08                	je     0x20
  18:	4c 89 ff             	mov    %r15,%rdi
  1b:	e8 4d eb c9 f8       	call   0xf8c9eb6d
  20:	4d 8b 3f             	mov    (%r15),%r15
  23:	4c 89 f8             	mov    %r15,%rax
  26:	48 c1 e8 03          	shr    $0x3,%rax
* 2a:	42 80 3c 20 00       	cmpb   $0x0,(%rax,%r12,1) <-- trapping instruction
  2f:	74 08                	je     0x39
  31:	4c 89 ff             	mov    %r15,%rdi
  34:	e8 34 eb c9 f8       	call   0xf8c9eb6d
  39:	4d 8b 3f             	mov    (%r15),%r15
  3c:	4c 89 f8             	mov    %r15,%rax
  3f:	48                   	rex.W


***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
  Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.


* Re: [PATCH net-next 2/2] net: devmem: use niov array for token management
  2025-09-02 21:36 ` [PATCH net-next 2/2] net: devmem: use niov array for token management Bobby Eshleman
  2025-09-03 10:50   ` kernel test robot
@ 2025-09-03 20:20   ` Mina Almasry
  2025-09-04  0:15     ` Bobby Eshleman
  1 sibling, 1 reply; 7+ messages in thread
From: Mina Almasry @ 2025-09-03 20:20 UTC (permalink / raw)
  To: Bobby Eshleman
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Kuniyuki Iwashima, Willem de Bruijn, Neal Cardwell,
	David Ahern, netdev, linux-kernel, Stanislav Fomichev,
	Bobby Eshleman

On Tue, Sep 2, 2025 at 2:36 PM Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
>
> From: Bobby Eshleman <bobbyeshleman@meta.com>
>
> Improve CPU performance of devmem token management by using page offsets
> as dmabuf tokens and using them for direct array access lookups instead
> of xarray lookups. Consequently, the xarray can be removed. The result
> is an average 5% reduction in CPU cycles spent by devmem RX user
> threads.
>

Great!

> This patch changes the meaning of tokens. Tokens previously referred to
> unique fragments of pages. In this patch tokens instead represent
> references to pages, not fragments.  Because of this, multiple tokens
> may refer to the same page and so have identical value (e.g., two small
> fragments may coexist on the same page). The token and offset pair that
> the user receives uniquely identifies fragments if needed.  This assumes
> that the user is not attempting to sort / uniq the token list using
> tokens alone.
>
> A new restriction is added to the implementation: devmem RX sockets
> cannot switch dmabuf bindings. In practice, this is a symptom of invalid
> configuration as a flow would have to be steered to a different queue or
> device where there is a different binding, which is generally bad for
> TCP flows.

Please do not assume configurations you don't use/care about are
invalid. Reconfiguring flow steering while a flow is active works as
intended today. This is a regression that needs to be resolved. But
more importantly, looking at your code, I don't think this is a
restriction you need to introduce?

> This restriction is necessary because the 32-bit dmabuf token
> does not have enough bits to represent both the pages in a large dmabuf
> and also a binding or dmabuf ID. For example, a system with 8 NICs and
> 32 queues requires 8 bits for a binding / queue ID (8 NICs * 32 queues
> == 256 queues total == 2^8), which leaves only 24 bits for dmabuf pages
> (2^24 * 4096 / (1<<30) == 64GB). This is insufficient for the device and
> queue numbers on many current systems or systems that may need larger
> GPU dmabufs (as for hard limits, my current H100 has 80GB GPU memory per
> device).
>
> Using kperf[1] with 4 flows and workers, this patch improves receive
> worker CPU util by ~4.9% with slightly better throughput.
>
> Before, mean cpu util for rx workers ~83.6%:
>
> Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
> Average:       4    2.30    0.00   79.43    0.00    0.65    0.21    0.00    0.00    0.00   17.41
> Average:       5    2.27    0.00   80.40    0.00    0.45    0.21    0.00    0.00    0.00   16.67
> Average:       6    2.28    0.00   80.47    0.00    0.46    0.25    0.00    0.00    0.00   16.54
> Average:       7    2.42    0.00   82.05    0.00    0.46    0.21    0.00    0.00    0.00   14.86
>
> After, mean cpu util % for rx workers ~78.7%:
>
> Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
> Average:       4    2.61    0.00   73.31    0.00    0.76    0.11    0.00    0.00    0.00   23.20
> Average:       5    2.95    0.00   74.24    0.00    0.66    0.22    0.00    0.00    0.00   21.94
> Average:       6    2.81    0.00   73.38    0.00    0.97    0.11    0.00    0.00    0.00   22.73
> Average:       7    3.05    0.00   78.76    0.00    0.76    0.11    0.00    0.00    0.00   17.32
>

I think effectively all you're doing in this patch is replacing the
xarray with a regular array, right? I'm surprised an xarray accounts
for 5% of cpu utilization. I wonder if you have debug configs turned
on during these experiments. Can you perf trace what about the xarray
is taking so long? I wonder if we're just using xarrays improperly
(maybe hitting constant resizing slow paths or something), and whether
a similar improvement can be gotten by adjusting the xarray flags or
what not.

> Mean throughput improves, but falls within a standard deviation (~45GB/s
> for 4 flows on a 50GB/s NIC, one hop).
>
> This patch adds an array of atomics for counting the tokens returned to
> the user for a given page. There is a 4-byte atomic per page in the
> dmabuf per socket. Given a 2GB dmabuf, this array is 2MB.
>
> [1]: https://github.com/facebookexperimental/kperf
>
> Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
> ---
>  include/net/sock.h       |   5 ++-
>  net/core/devmem.c        |  17 ++++----
>  net/core/devmem.h        |   2 +-
>  net/core/sock.c          |  24 +++++++----
>  net/ipv4/tcp.c           | 107 +++++++++++++++--------------------------------
>  net/ipv4/tcp_ipv4.c      |  40 +++++++++++++++---
>  net/ipv4/tcp_minisocks.c |   2 -
>  7 files changed, 99 insertions(+), 98 deletions(-)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 1e7f124871d2..70c97880229d 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -573,7 +573,10 @@ struct sock {
>  #endif
>         struct rcu_head         sk_rcu;
>         netns_tracker           ns_tracker;
> -       struct xarray           sk_user_frags;
> +       struct {
> +               struct net_devmem_dmabuf_binding        *binding;
> +               atomic_t                                *urefs;
> +       } sk_user_frags;
>

AFAIU, if you made sk_user_frags an array of (uref, binding) tuples
instead of just an array of urefs, then you could remove the
single-binding restriction.
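
A rough sketch of such a per-token tuple, purely hypothetical and not
part of this series, might look like:

	struct sk_user_frag_ref {
		struct net_devmem_dmabuf_binding	*binding;
		atomic_t				uref;
	};

	/* per socket: one entry per page index (token) */
	struct sk_user_frag_ref *refs;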

Although, I wonder what happens if the socket receives the netmem at
the same index on 2 different dmabufs. At that point I assume the
wrong uref gets incremented? :(

One way or another the single-binding restriction needs to be removed
I think. It's regressing a UAPI that currently works.

>  #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
>         struct module           *sk_owner;
> diff --git a/net/core/devmem.c b/net/core/devmem.c
> index b4c570d4f37a..50e92dcf5bf1 100644
> --- a/net/core/devmem.c
> +++ b/net/core/devmem.c
> @@ -187,6 +187,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
>         struct dma_buf *dmabuf;
>         unsigned int sg_idx, i;
>         unsigned long virtual;
> +       gfp_t flags;
>         int err;
>
>         if (!dma_dev) {
> @@ -230,14 +231,14 @@ net_devmem_bind_dmabuf(struct net_device *dev,
>                 goto err_detach;
>         }
>
> -       if (direction == DMA_TO_DEVICE) {
> -               binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
> -                                             sizeof(struct net_iov *),
> -                                             GFP_KERNEL);
> -               if (!binding->vec) {
> -                       err = -ENOMEM;
> -                       goto err_unmap;
> -               }
> +       flags = (direction == DMA_FROM_DEVICE) ? __GFP_ZERO : 0;
> +

Why not pass __GFP_ZERO unconditionally?

> +       binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
> +                                     sizeof(struct net_iov *),
> +                                     GFP_KERNEL | flags);
> +       if (!binding->vec) {
> +               err = -ENOMEM;
> +               goto err_unmap;
>         }
>
>         /* For simplicity we expect to make PAGE_SIZE allocations, but the
> diff --git a/net/core/devmem.h b/net/core/devmem.h
> index 2ada54fb63d7..d4eb28d079bb 100644
> --- a/net/core/devmem.h
> +++ b/net/core/devmem.h
> @@ -61,7 +61,7 @@ struct net_devmem_dmabuf_binding {
>
>         /* Array of net_iov pointers for this binding, sorted by virtual
>          * address. This array is convenient to map the virtual addresses to
> -        * net_iovs in the TX path.
> +        * net_iovs.
>          */
>         struct net_iov **vec;
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 9a8290fcc35d..3a5cb4e10519 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -87,6 +87,7 @@
>
>  #include <linux/unaligned.h>
>  #include <linux/capability.h>
> +#include <linux/dma-buf.h>
>  #include <linux/errno.h>
>  #include <linux/errqueue.h>
>  #include <linux/types.h>
> @@ -151,6 +152,7 @@
>  #include <uapi/linux/pidfd.h>
>
>  #include "dev.h"
> +#include "devmem.h"
>
>  static DEFINE_MUTEX(proto_list_mutex);
>  static LIST_HEAD(proto_list);
> @@ -1100,32 +1102,40 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
>                 return -EFAULT;
>         }
>
> -       xa_lock_bh(&sk->sk_user_frags);
>         for (i = 0; i < num_tokens; i++) {
>                 for (j = 0; j < tokens[i].token_count; j++) {
> +                       struct net_iov *niov;
> +                       unsigned int token;
> +                       netmem_ref netmem;
> +
> +                       token = tokens[i].token_start + j;
> +                       if (WARN_ONCE(token >= sk->sk_user_frags.binding->dmabuf->size / PAGE_SIZE,
> +                                     "invalid token passed from user"))
> +                               break;

WARNs on invalid user behavior are a non-starter AFAIU. For one,
syzbot trivially reproduces them and files a bug. Please remove all of
them. pr_err may be acceptable for extremely bad errors. Invalid user
input is not worthy of WARN or pr_err.

> +
>                         if (++num_frags > MAX_DONTNEED_FRAGS)
>                                 goto frag_limit_reached;
> -
> -                       netmem_ref netmem = (__force netmem_ref)__xa_erase(
> -                               &sk->sk_user_frags, tokens[i].token_start + j);
> +                       niov = sk->sk_user_frags.binding->vec[token];
> +                       netmem = net_iov_to_netmem(niov);

So the token is the index into both the urefs array and
sk->sk_user_frags.binding->vec?

An xarray is a resizable array. AFAIU what you're doing abstractly
here is replacing a resizable array with an array of max size, no? (I
didn't read too closely yet, I may be missing something).

Which makes me think either due to a bug or due to specifics of your
setup, xarray is unreasonably expensive. Without investigating the
details I wonder if we're constantly running into a resizing slowpath
in xarray code and I think this needs some investigation.

>
>                         if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
>                                 continue;
>
> +                       if (WARN_ONCE(atomic_dec_if_positive(&sk->sk_user_frags.urefs[token])
> +                                               < 0, "user released token too many times"))

Here and everywhere, please remove the WARNs for weird user behavior.

> +                               continue;
> +
>                         netmems[netmem_num++] = netmem;
>                         if (netmem_num == ARRAY_SIZE(netmems)) {
> -                               xa_unlock_bh(&sk->sk_user_frags);
>                                 for (k = 0; k < netmem_num; k++)
>                                         WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
>                                 netmem_num = 0;
> -                               xa_lock_bh(&sk->sk_user_frags);

You remove the locking but it's not clear to me why we don't need it
anymore. What's stopping 2 dontneeds from freeing at the same time?
I'm guessing it's because urefs are atomic so we don't need any extra
sync?

>                         }
>                         ret++;
>                 }
>         }
>
>  frag_limit_reached:
> -       xa_unlock_bh(&sk->sk_user_frags);
>         for (k = 0; k < netmem_num; k++)
>                 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
>
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 40b774b4f587..585b50fa8c00 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -261,6 +261,7 @@
>  #include <linux/memblock.h>
>  #include <linux/highmem.h>
>  #include <linux/cache.h>
> +#include <linux/dma-buf.h>
>  #include <linux/err.h>
>  #include <linux/time.h>
>  #include <linux/slab.h>
> @@ -475,7 +476,8 @@ void tcp_init_sock(struct sock *sk)
>
>         set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
>         sk_sockets_allocated_inc(sk);
> -       xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
> +       sk->sk_user_frags.binding = NULL;
> +       sk->sk_user_frags.urefs = NULL;
>  }
>  EXPORT_IPV6_MOD(tcp_init_sock);
>
> @@ -2386,68 +2388,6 @@ static int tcp_inq_hint(struct sock *sk)
>         return inq;
>  }
>
> -/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */
> -struct tcp_xa_pool {
> -       u8              max; /* max <= MAX_SKB_FRAGS */
> -       u8              idx; /* idx <= max */
> -       __u32           tokens[MAX_SKB_FRAGS];
> -       netmem_ref      netmems[MAX_SKB_FRAGS];
> -};
> -
> -static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
> -{
> -       int i;
> -
> -       /* Commit part that has been copied to user space. */
> -       for (i = 0; i < p->idx; i++)
> -               __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY,
> -                            (__force void *)p->netmems[i], GFP_KERNEL);
> -       /* Rollback what has been pre-allocated and is no longer needed. */
> -       for (; i < p->max; i++)
> -               __xa_erase(&sk->sk_user_frags, p->tokens[i]);
> -
> -       p->max = 0;
> -       p->idx = 0;
> -}
> -
> -static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p)
> -{
> -       if (!p->max)
> -               return;
> -
> -       xa_lock_bh(&sk->sk_user_frags);
> -
> -       tcp_xa_pool_commit_locked(sk, p);
> -
> -       xa_unlock_bh(&sk->sk_user_frags);
> -}
> -
> -static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
> -                             unsigned int max_frags)
> -{
> -       int err, k;
> -
> -       if (p->idx < p->max)
> -               return 0;
> -
> -       xa_lock_bh(&sk->sk_user_frags);
> -
> -       tcp_xa_pool_commit_locked(sk, p);
> -
> -       for (k = 0; k < max_frags; k++) {
> -               err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k],
> -                                XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
> -               if (err)
> -                       break;
> -       }
> -
> -       xa_unlock_bh(&sk->sk_user_frags);
> -
> -       p->max = k;
> -       p->idx = 0;
> -       return k ? 0 : err;
> -}
> -
>  /* On error, returns the -errno. On success, returns number of bytes sent to the
>   * user. May not consume all of @remaining_len.
>   */
> @@ -2456,14 +2396,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>                               int remaining_len)
>  {
>         struct dmabuf_cmsg dmabuf_cmsg = { 0 };
> -       struct tcp_xa_pool tcp_xa_pool;
>         unsigned int start;
>         int i, copy, n;
>         int sent = 0;
>         int err = 0;
>
> -       tcp_xa_pool.max = 0;
> -       tcp_xa_pool.idx = 0;
>         do {
>                 start = skb_headlen(skb);
>
> @@ -2510,8 +2447,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>                  */
>                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
>                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +                       struct net_devmem_dmabuf_binding *binding;
>                         struct net_iov *niov;
>                         u64 frag_offset;
> +                       size_t size;
> +                       u32 token;
>                         int end;
>
>                         /* !skb_frags_readable() should indicate that ALL the
> @@ -2544,13 +2484,35 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>                                               start;
>                                 dmabuf_cmsg.frag_offset = frag_offset;
>                                 dmabuf_cmsg.frag_size = copy;
> -                               err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
> -                                                        skb_shinfo(skb)->nr_frags - i);
> -                               if (err)
> +
> +                               binding = net_devmem_iov_binding(niov);
> +
> +                               if (!sk->sk_user_frags.binding) {
> +                                       sk->sk_user_frags.binding = binding;
> +
> +                                       size = binding->dmabuf->size / PAGE_SIZE;
> +                                       sk->sk_user_frags.urefs = kzalloc(size,
> +                                                                         GFP_KERNEL);
> +                                       if (!sk->sk_user_frags.urefs) {
> +                                               sk->sk_user_frags.binding = NULL;
> +                                               err = -ENOMEM;
> +                                               goto out;
> +                                       }
> +
> +                                       net_devmem_dmabuf_binding_get(binding);

It's not clear to me why we need to get the binding. AFAIR the way it
works is that we grab a reference on the net_iov, which guarantees
that the associated pp is alive, which in turn guarantees that the
binding remains alive and we don't need to get the binding for every
frag.

> +                               }
> +
> +                               if (WARN_ONCE(sk->sk_user_frags.binding != binding,
> +                                             "binding changed for devmem socket")) {

Remove the WARN_ONCE. It's not reasonable to splat in the kernel log
because the user reconfigured flow steering, methinks.

> +                                       err = -EFAULT;
>                                         goto out;
> +                               }
> +
> +                               token = net_iov_virtual_addr(niov) >> PAGE_SHIFT;
> +                               binding->vec[token] = niov;

I don't think you can do this? I thought vec[token] was already == niov.

binding->vec should be initialized when the binding is initialized,
not re-written on every recvmsg, no?

> +                               dmabuf_cmsg.frag_token = token;
>
>                                 /* Will perform the exchange later */
> -                               dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
>                                 dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
>
>                                 offset += copy;
> @@ -2563,8 +2525,9 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>                                 if (err)
>                                         goto out;
>
> +                               atomic_inc(&sk->sk_user_frags.urefs[token]);
> +
>                                 atomic_long_inc(&niov->pp_ref_count);
> -                               tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
>
>                                 sent += copy;
>
> @@ -2574,7 +2537,6 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>                         start = end;
>                 }
>
> -               tcp_xa_pool_commit(sk, &tcp_xa_pool);
>                 if (!remaining_len)
>                         goto out;
>
> @@ -2592,7 +2554,6 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
>         }
>
>  out:
> -       tcp_xa_pool_commit(sk, &tcp_xa_pool);
>         if (!sent)
>                 sent = err;
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 1e58a8a9ff7a..bdcb8cc003af 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -87,6 +87,9 @@
>  #include <crypto/hash.h>
>  #include <linux/scatterlist.h>
>
> +#include <linux/dma-buf.h>
> +#include "../core/devmem.h"
> +
>  #include <trace/events/tcp.h>
>
>  #ifdef CONFIG_TCP_MD5SIG
> @@ -2529,11 +2532,38 @@ static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
>  static void tcp_release_user_frags(struct sock *sk)
>  {
>  #ifdef CONFIG_PAGE_POOL
> -       unsigned long index;
> -       void *netmem;
> +       struct net_devmem_dmabuf_binding *binding;
> +       struct net_iov *niov;
> +       unsigned int token;
> +       netmem_ref netmem;
> +
> +       if (!sk->sk_user_frags.urefs)
> +               return;
> +
> +       binding = sk->sk_user_frags.binding;
> +       if (!binding || !binding->vec)
> +               return;
> +
> +       for (token = 0; token < binding->dmabuf->size / PAGE_SIZE; token++) {
> +               niov = binding->vec[token];
> +
> +               /* never used by recvmsg() */
> +               if (!niov)
> +                       continue;
> +
> +               if (!net_is_devmem_iov(niov))
> +                       continue;
> +
> +               netmem = net_iov_to_netmem(niov);
>
> -       xa_for_each(&sk->sk_user_frags, index, netmem)
> -               WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
> +               while (atomic_dec_return(&sk->sk_user_frags.urefs[token]) >= 0)
> +                       WARN_ON_ONCE(!napi_pp_put_page(netmem));
> +       }
> +
> +       net_devmem_dmabuf_binding_put(binding);
> +       sk->sk_user_frags.binding = NULL;
> +       kvfree(sk->sk_user_frags.urefs);
> +       sk->sk_user_frags.urefs = NULL;
>  #endif
>  }
>
> @@ -2543,8 +2573,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
>
>         tcp_release_user_frags(sk);
>
> -       xa_destroy(&sk->sk_user_frags);
> -
>         trace_tcp_destroy_sock(sk);
>
>         tcp_clear_xmit_timers(sk);
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index d1c9e4088646..4e8ea73daab7 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -639,8 +639,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
>
>         __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
>
> -       xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);
> -
>         return newsk;
>  }
>  EXPORT_SYMBOL(tcp_create_openreq_child);
>
> --
> 2.47.3
>


-- 
Thanks,
Mina

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net-next 2/2] net: devmem: use niov array for token management
  2025-09-03 20:20   ` Mina Almasry
@ 2025-09-04  0:15     ` Bobby Eshleman
  0 siblings, 0 replies; 7+ messages in thread
From: Bobby Eshleman @ 2025-09-04  0:15 UTC (permalink / raw)
  To: Mina Almasry
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Kuniyuki Iwashima, Willem de Bruijn, Neal Cardwell,
	David Ahern, netdev, linux-kernel, Stanislav Fomichev,
	Bobby Eshleman

On Wed, Sep 03, 2025 at 01:20:57PM -0700, Mina Almasry wrote:
> On Tue, Sep 2, 2025 at 2:36 PM Bobby Eshleman <bobbyeshleman@gmail.com> wrote:
> >
> > From: Bobby Eshleman <bobbyeshleman@meta.com>
> >
> > Improve CPU performance of devmem token management by using page offsets
> > as dmabuf tokens and using them for direct array access lookups instead
> > of xarray lookups. Consequently, the xarray can be removed. The result
> > is an average 5% reduction in CPU cycles spent by devmem RX user
> > threads.
> >
> 
> Great!
> 

Hey Mina, thanks for the feedback!

> > This patch changes the meaning of tokens. Tokens previously referred to
> > unique fragments of pages. In this patch tokens instead represent
> > references to pages, not fragments.  Because of this, multiple tokens
> > may refer to the same page and so have identical value (e.g., two small
> > fragments may coexist on the same page). The token and offset pair that
> > the user receives uniquely identifies fragments if needed.  This assumes
> > that the user is not attempting to sort / uniq the token list using
> > tokens alone.
> >
> > A new restriction is added to the implementation: devmem RX sockets
> > cannot switch dmabuf bindings. In practice, this is a symptom of invalid
> > configuration as a flow would have to be steered to a different queue or
> > device where there is a different binding, which is generally bad for
> > TCP flows.
> 
> Please do not assume configurations you don't use/care about are
> invalid. Currently reconfiguring flow steering while a flow is active
> works as intended today. This is a regression that needs to be
> resolved. But more importantly, looking at your code, I don't think
> this is a restriction you need to introduce?
> 

That's fair, let's see if we can lift it.

> > This restriction is necessary because the 32-bit dmabuf token
> > does not have enough bits to represent both the pages in a large dmabuf
> > and also a binding or dmabuf ID. For example, a system with 8 NICs and
> > 32 queues requires 8 bits for a binding / queue ID (8 NICs * 32 queues
> > == 256 queues total == 2^8), which leaves only 24 bits for dmabuf pages
> > (2^24 * 4096 / (1<<30) == 64GB). This is insufficient for the device and
> > queue numbers on many current systems or systems that may need larger
> > GPU dmabufs (as for hard limits, my current H100 has 80GB GPU memory per
> > device).
> >
> > Using kperf[1] with 4 flows and workers, this patch improves receive
> > worker CPU util by ~4.9% with slightly better throughput.
> >
> > Before, mean cpu util for rx workers ~83.6%:
> >
> > Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
> > Average:       4    2.30    0.00   79.43    0.00    0.65    0.21    0.00    0.00    0.00   17.41
> > Average:       5    2.27    0.00   80.40    0.00    0.45    0.21    0.00    0.00    0.00   16.67
> > Average:       6    2.28    0.00   80.47    0.00    0.46    0.25    0.00    0.00    0.00   16.54
> > Average:       7    2.42    0.00   82.05    0.00    0.46    0.21    0.00    0.00    0.00   14.86
> >
> > After, mean cpu util % for rx workers ~78.7%:
> >
> > Average:     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
> > Average:       4    2.61    0.00   73.31    0.00    0.76    0.11    0.00    0.00    0.00   23.20
> > Average:       5    2.95    0.00   74.24    0.00    0.66    0.22    0.00    0.00    0.00   21.94
> > Average:       6    2.81    0.00   73.38    0.00    0.97    0.11    0.00    0.00    0.00   22.73
> > Average:       7    3.05    0.00   78.76    0.00    0.76    0.11    0.00    0.00    0.00   17.32
> >
> 
> I think effectively all you're doing in this patch is removing xarray
> with a regular array, right? I'm surprised an xarray accounts for 5%
> cpu utilization. I wonder if you have debug configs turned on during
> these experiments. Can you perf trace what about the xarray is taking
> so long? I wonder if we're just using xarrays improperly (maybe
> hitting constant resizing slow paths or something), and a similar
> improvement can be gotten by adjusting the xarray flags or what not.
> 

That is right.

Here is some perf data gathered from:

	perf record -a -g -F 99 -C 0-7 -- sleep 5

RX queues pinned to 0-3 and kperf server pinned to 4-7.

    11.25%  server       [kernel.kallsyms]                                      [k] tcp_recvmsg
            |          
             --10.98%--tcp_recvmsg
                       bpf_trampoline_6442594803
                       tcp_recvmsg
                       inet6_recvmsg
                       ____sys_recvmsg
                       ___sys_recvmsg
                       __x64_sys_recvmsg
                       do_syscall_64
                       entry_SYSCALL_64_after_hwframe
                       __libc_recvmsg
                       |          
                        --2.74%--0x100000082

     5.65%  server       [kernel.kallsyms]                                      [k] xas_store
            |          
             --5.63%--xas_store
                       |          
                       |--3.92%--__xa_erase
                       |          sock_devmem_dontneed
                       |          sk_setsockopt
                       |          __x64_sys_setsockopt
                       |          do_syscall_64
                       |          entry_SYSCALL_64_after_hwframe
                       |          __GI___setsockopt
                       |          0x4f00000001
                       |          
                       |--0.94%--__xa_alloc
                       |          tcp_recvmsg
                       |          bpf_trampoline_6442594803
                       |          tcp_recvmsg
                       |          inet6_recvmsg
                       |          ____sys_recvmsg
                       |          ___sys_recvmsg
                       |          __x64_sys_recvmsg
                       |          do_syscall_64
                       |          entry_SYSCALL_64_after_hwframe
                       |          __libc_recvmsg
                       |          
                        --0.76%--__xa_cmpxchg
                                  tcp_xa_pool_commit_locked
                                  tcp_xa_pool_commit
                                  tcp_recvmsg
                                  bpf_trampoline_6442594803
                                  tcp_recvmsg
                                  inet6_recvmsg
                                  ____sys_recvmsg
                                  ___sys_recvmsg
                                  __x64_sys_recvmsg
                                  do_syscall_64
                                  entry_SYSCALL_64_after_hwframe
                                  __libc_recvmsg


     [...]

     1.22%  server       [kernel.kallsyms]                                      [k] xas_find_marked
            |          
             --1.19%--xas_find_marked
                       __xa_alloc
                       tcp_recvmsg
                       bpf_trampoline_6442594803
                       tcp_recvmsg
                       inet6_recvmsg
                       ____sys_recvmsg
                       ___sys_recvmsg
                       __x64_sys_recvmsg
                       do_syscall_64
                       entry_SYSCALL_64_after_hwframe
                       __libc_recvmsg


Here is the output from zcat /proc/config.gz | grep DEBUG | grep =y. I'm
not 100% sure which of these may be worth toggling, and I'm happy to rerun
the experiments if any look suspicious:

CONFIG_X86_DEBUGCTLMSR=y
CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_BLK_DEBUG_FS=y
CONFIG_BFQ_CGROUP_DEBUG=y
CONFIG_CMA_DEBUGFS=y
CONFIG_FW_LOADER_DEBUG=y
CONFIG_PNP_DEBUG_MESSAGES=y
CONFIG_SCSI_LPFC_DEBUG_FS=y
CONFIG_MLX4_DEBUG=y
CONFIG_INFINIBAND_MTHCA_DEBUG=y
CONFIG_NFS_DEBUG=y
CONFIG_SUNRPC_DEBUG=y
CONFIG_DYNAMIC_DEBUG=y
CONFIG_DYNAMIC_DEBUG_CORE=y
CONFIG_DEBUG_BUGVERBOSE=y
CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_MISC=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
CONFIG_DEBUG_INFO_COMPRESSED_NONE=y
CONFIG_DEBUG_INFO_BTF=y
CONFIG_DEBUG_INFO_BTF_MODULES=y
CONFIG_DEBUG_FS=y
CONFIG_DEBUG_FS_ALLOW_ALL=y
CONFIG_SLUB_DEBUG=y
CONFIG_DEBUG_PAGE_REF=y
CONFIG_ARCH_HAS_DEBUG_WX=y
CONFIG_HAVE_DEBUG_KMEMLEAK=y
CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y
CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_SCHED_DEBUG=y
CONFIG_LOCK_DEBUGGING_SUPPORT=y
CONFIG_CSD_LOCK_WAIT_DEBUG=y
CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT=y
CONFIG_DEBUG_CGROUP_REF=y
CONFIG_FAULT_INJECTION_DEBUG_FS=y


For more context, I did a few other experiments before eventually
landing on this patch:

1) hacky approach - don't do any token management at all, replace dmabuf
   token with 64-bit pointer to niov, teach recvmsg() /
   sock_devmem_dontneed() to inc/dec the niov reference directly... this
   actually reduced the CPU util by a very consistent 10%+ per worker
   (the largest delta of all my experiments).

2) keep xarray, but use RCU/lockless lookups in both recvmsg/dontneed.
   Use page indices + xa_insert instead of xa_alloc. Acquire lock only if
   lockless lookup returns null in recvmsg path. Don't erase in dontneed,
   only take rcu_read_lock() and do lookup. Let xarray grow according to
   usage and cleanup when the socket is destroyed. Surprisingly, this
   didn't offer noticeable improvement (a rough sketch of this approach
   follows the list).

3) use normal array but no atomics -- incorrect, but saw good improvement,
   despite possibility of leaked references.

4) use a hashmap + bucket locks instead of xarray -- performed way
   worse than xarray, no change to token
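
To make experiment 2 concrete, here is a rough reconstruction of the
lookup pattern it used (a sketch from memory, not the code that was
actually benchmarked; the helper names are made up):

#include <linux/xarray.h>

/* Token -> netmem map where the token is just the page index.  Entries
 * are published once with xa_insert() and then read locklessly;
 * xa_load() is RCU-safe and does not take the xarray spinlock.
 */
static int token_map_publish(struct xarray *xa, u32 token, netmem_ref netmem)
{
	/* Slow path: only reached when the lockless lookup misses. */
	return xa_insert(xa, token, (__force void *)netmem, GFP_KERNEL);
}

static netmem_ref token_map_lookup(struct xarray *xa, u32 token)
{
	/* Fast path: no xa_lock_bh()/xa_unlock_bh() in recvmsg/dontneed. */
	return (__force netmem_ref)xa_load(xa, token);
}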

> > Mean throughput improves, but falls within a standard deviation (~45GB/s
> > for 4 flows on a 50GB/s NIC, one hop).
> >
> > This patch adds an array of atomics for counting the tokens returned to
> > the user for a given page. There is a 4-byte atomic per page in the
> > dmabuf per socket. Given a 2GB dmabuf, this array is 2MB.
> >
> > [1]: https://github.com/facebookexperimental/kperf
> >
> > Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
> > ---
> >  include/net/sock.h       |   5 ++-
> >  net/core/devmem.c        |  17 ++++----
> >  net/core/devmem.h        |   2 +-
> >  net/core/sock.c          |  24 +++++++----
> >  net/ipv4/tcp.c           | 107 +++++++++++++++--------------------------------
> >  net/ipv4/tcp_ipv4.c      |  40 +++++++++++++++---
> >  net/ipv4/tcp_minisocks.c |   2 -
> >  7 files changed, 99 insertions(+), 98 deletions(-)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 1e7f124871d2..70c97880229d 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -573,7 +573,10 @@ struct sock {
> >  #endif
> >         struct rcu_head         sk_rcu;
> >         netns_tracker           ns_tracker;
> > -       struct xarray           sk_user_frags;
> > +       struct {
> > +               struct net_devmem_dmabuf_binding        *binding;
> > +               atomic_t                                *urefs;
> > +       } sk_user_frags;
> >
> 
> AFAIU, if you made sk_user_frags an array of (uref, binding) tuples
> instead of just an array of urefs then you can remove the
> single-binding restriction.
> 
> Although, I wonder what happens if the socket receives the netmem at
> the same index on 2 different dmabufs. At that point I assume the
> wrong uref gets incremented? :(
> 

Right. We need some bits to differentiate bindings. Here are some ideas
I've had about this; I wonder what your thoughts are on them:

1) Encode a subset of bindings in the token and wait for availability if
the encoding space becomes exhausted. For example, we could use 5 bits to
identify the binding (allowing up to 32 bindings with outstanding
references) and 27 bits for the page index (512 GB of dmabuf). If recvmsg
wants to return a reference to a 33rd binding, it waits until the user
returns enough tokens to release one of the binding slots (at which point
that slot can be reused for the new reference). A rough sketch of the
encoding is below, after idea 2.

2) opt into an extended token (dmabuf_token_v2) via sockopts, and add
the binding ID or other needed information there.
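
Something like the following is what I have in mind for the encoding in
idea 1 (purely illustrative, names invented; the binding-slot allocation
and the wait-for-availability logic are not shown):

/* 32-bit token split: top 5 bits pick one of up to 32 bindings with
 * outstanding references, low 27 bits are the page index within that
 * dmabuf (2^27 * 4096 bytes == 512 GB).
 */
#define DEVMEM_TOKEN_BINDING_BITS	5
#define DEVMEM_TOKEN_PAGE_BITS		27
#define DEVMEM_TOKEN_PAGE_MASK		((1u << DEVMEM_TOKEN_PAGE_BITS) - 1)

static inline u32 devmem_token_pack(u32 binding_slot, u32 page_idx)
{
	return (binding_slot << DEVMEM_TOKEN_PAGE_BITS) |
	       (page_idx & DEVMEM_TOKEN_PAGE_MASK);
}

static inline u32 devmem_token_binding_slot(u32 token)
{
	return token >> DEVMEM_TOKEN_PAGE_BITS;
}

static inline u32 devmem_token_page_idx(u32 token)
{
	return token & DEVMEM_TOKEN_PAGE_MASK;
}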

> One way or another the single-binding restriction needs to be removed
> I think. It's regressing a UAPI that currently works.
> 
> >  #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
> >         struct module           *sk_owner;
> > diff --git a/net/core/devmem.c b/net/core/devmem.c
> > index b4c570d4f37a..50e92dcf5bf1 100644
> > --- a/net/core/devmem.c
> > +++ b/net/core/devmem.c
> > @@ -187,6 +187,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
> >         struct dma_buf *dmabuf;
> >         unsigned int sg_idx, i;
> >         unsigned long virtual;
> > +       gfp_t flags;
> >         int err;
> >
> >         if (!dma_dev) {
> > @@ -230,14 +231,14 @@ net_devmem_bind_dmabuf(struct net_device *dev,
> >                 goto err_detach;
> >         }
> >
> > -       if (direction == DMA_TO_DEVICE) {
> > -               binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
> > -                                             sizeof(struct net_iov *),
> > -                                             GFP_KERNEL);
> > -               if (!binding->vec) {
> > -                       err = -ENOMEM;
> > -                       goto err_unmap;
> > -               }
> > +       flags = (direction == DMA_FROM_DEVICE) ? __GFP_ZERO : 0;
> > +
> 
> Why not pass __GFP_ZERO unconditionally?
> 

Seemed unnecessary for the TX side, though I see no problem adding it
for the next rev.

> > +       binding->vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
> > +                                     sizeof(struct net_iov *),
> > +                                     GFP_KERNEL | flags);
> > +       if (!binding->vec) {
> > +               err = -ENOMEM;
> > +               goto err_unmap;
> >         }
> >
> >         /* For simplicity we expect to make PAGE_SIZE allocations, but the
> > diff --git a/net/core/devmem.h b/net/core/devmem.h
> > index 2ada54fb63d7..d4eb28d079bb 100644
> > --- a/net/core/devmem.h
> > +++ b/net/core/devmem.h
> > @@ -61,7 +61,7 @@ struct net_devmem_dmabuf_binding {
> >
> >         /* Array of net_iov pointers for this binding, sorted by virtual
> >          * address. This array is convenient to map the virtual addresses to
> > -        * net_iovs in the TX path.
> > +        * net_iovs.
> >          */
> >         struct net_iov **vec;
> >
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 9a8290fcc35d..3a5cb4e10519 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -87,6 +87,7 @@
> >
> >  #include <linux/unaligned.h>
> >  #include <linux/capability.h>
> > +#include <linux/dma-buf.h>
> >  #include <linux/errno.h>
> >  #include <linux/errqueue.h>
> >  #include <linux/types.h>
> > @@ -151,6 +152,7 @@
> >  #include <uapi/linux/pidfd.h>
> >
> >  #include "dev.h"
> > +#include "devmem.h"
> >
> >  static DEFINE_MUTEX(proto_list_mutex);
> >  static LIST_HEAD(proto_list);
> > @@ -1100,32 +1102,40 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
> >                 return -EFAULT;
> >         }
> >
> > -       xa_lock_bh(&sk->sk_user_frags);
> >         for (i = 0; i < num_tokens; i++) {
> >                 for (j = 0; j < tokens[i].token_count; j++) {
> > +                       struct net_iov *niov;
> > +                       unsigned int token;
> > +                       netmem_ref netmem;
> > +
> > +                       token = tokens[i].token_start + j;
> > +                       if (WARN_ONCE(token >= sk->sk_user_frags.binding->dmabuf->size / PAGE_SIZE,
> > +                                     "invalid token passed from user"))
> > +                               break;
> 
> WARNs on invalid user behavior are a non-starter AFAIU. For one,
> syzbot trivially reproduces them and files a bug. Please remove all of
> them. pr_err may be acceptable for extremely bad errors. Invalid user
> input is not worthy of WARN or pr_err.
> 

Gotcha, I did not know that. Sounds good to me.

> > +
> >                         if (++num_frags > MAX_DONTNEED_FRAGS)
> >                                 goto frag_limit_reached;
> > -
> > -                       netmem_ref netmem = (__force netmem_ref)__xa_erase(
> > -                               &sk->sk_user_frags, tokens[i].token_start + j);
> > +                       niov = sk->sk_user_frags.binding->vec[token];
> > +                       netmem = net_iov_to_netmem(niov);
> 
> So token is the index into both sk->sk_user_frags.urefs and
> sk->sk_user_frags.binding->vec?
> 
> xarrays are resizable arrays. AFAIU what you're doing abstractly here
> is replacing a resizable array with an array of max size, no? (I
> didn't read too closely yet, I may be missing something).
> 

Yes, 100%.

> Which makes me think either due to a bug or due to specifics of your
> setup, xarray is unreasonably expensive. Without investigating the
> details I wonder if we're constantly running into a resizing slowpath
> in xarray code and I think this needs some investigation.
> 

Certainly possible, I found the result surprising myself, hence my
experiment with a more read-mostly usage of xarray.

> >
> >                         if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
> >                                 continue;
> >
> > +                       if (WARN_ONCE(atomic_dec_if_positive(&sk->sk_user_frags.urefs[token])
> > +                                               < 0, "user released token too many times"))
> 
> Here and everywhere, please remove the WARNs for weird user behavior.
> 
> > +                               continue;
> > +
> >                         netmems[netmem_num++] = netmem;
> >                         if (netmem_num == ARRAY_SIZE(netmems)) {
> > -                               xa_unlock_bh(&sk->sk_user_frags);
> >                                 for (k = 0; k < netmem_num; k++)
> >                                         WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
> >                                 netmem_num = 0;
> > -                               xa_lock_bh(&sk->sk_user_frags);
> 
> You remove the locking but it's not clear to me why we don't need it
> anymore. What's stopping 2 dontneeds from freeing at the same time?
> I'm guessing it's because urefs are atomic so we don't need any extra
> sync?
> 

Yes, atomic uref. The array itself doesn't change until the socket is
destroyed.
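
To spell out why the atomics alone are enough: recvmsg() does one
atomic_inc() on urefs[token] per token handed to the user, and the
release side pairs one napi_pp_put_page() with one successful decrement.
A hypothetical helper (not in the patch) for the release side:

static void sock_devmem_uref_put(struct sock *sk, u32 token,
				 netmem_ref netmem)
{
	/* atomic_dec_if_positive() never drops the count below zero, so
	 * at most one page-pool reference is released per reference
	 * recvmsg() handed out, even if two DONTNEED calls race on the
	 * same token.
	 */
	if (atomic_dec_if_positive(&sk->sk_user_frags.urefs[token]) < 0)
		return;

	napi_pp_put_page(netmem);
}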

> >                         }
> >                         ret++;
> >                 }
> >         }
> >
> >  frag_limit_reached:
> > -       xa_unlock_bh(&sk->sk_user_frags);
> >         for (k = 0; k < netmem_num; k++)
> >                 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
> >
> > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> > index 40b774b4f587..585b50fa8c00 100644
> > --- a/net/ipv4/tcp.c
> > +++ b/net/ipv4/tcp.c
> > @@ -261,6 +261,7 @@
> >  #include <linux/memblock.h>
> >  #include <linux/highmem.h>
> >  #include <linux/cache.h>
> > +#include <linux/dma-buf.h>
> >  #include <linux/err.h>
> >  #include <linux/time.h>
> >  #include <linux/slab.h>
> > @@ -475,7 +476,8 @@ void tcp_init_sock(struct sock *sk)
> >
> >         set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
> >         sk_sockets_allocated_inc(sk);
> > -       xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
> > +       sk->sk_user_frags.binding = NULL;
> > +       sk->sk_user_frags.urefs = NULL;
> >  }
> >  EXPORT_IPV6_MOD(tcp_init_sock);
> >
> > @@ -2386,68 +2388,6 @@ static int tcp_inq_hint(struct sock *sk)
> >         return inq;
> >  }
> >
> > -/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */
> > -struct tcp_xa_pool {
> > -       u8              max; /* max <= MAX_SKB_FRAGS */
> > -       u8              idx; /* idx <= max */
> > -       __u32           tokens[MAX_SKB_FRAGS];
> > -       netmem_ref      netmems[MAX_SKB_FRAGS];
> > -};
> > -
> > -static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
> > -{
> > -       int i;
> > -
> > -       /* Commit part that has been copied to user space. */
> > -       for (i = 0; i < p->idx; i++)
> > -               __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY,
> > -                            (__force void *)p->netmems[i], GFP_KERNEL);
> > -       /* Rollback what has been pre-allocated and is no longer needed. */
> > -       for (; i < p->max; i++)
> > -               __xa_erase(&sk->sk_user_frags, p->tokens[i]);
> > -
> > -       p->max = 0;
> > -       p->idx = 0;
> > -}
> > -
> > -static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p)
> > -{
> > -       if (!p->max)
> > -               return;
> > -
> > -       xa_lock_bh(&sk->sk_user_frags);
> > -
> > -       tcp_xa_pool_commit_locked(sk, p);
> > -
> > -       xa_unlock_bh(&sk->sk_user_frags);
> > -}
> > -
> > -static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
> > -                             unsigned int max_frags)
> > -{
> > -       int err, k;
> > -
> > -       if (p->idx < p->max)
> > -               return 0;
> > -
> > -       xa_lock_bh(&sk->sk_user_frags);
> > -
> > -       tcp_xa_pool_commit_locked(sk, p);
> > -
> > -       for (k = 0; k < max_frags; k++) {
> > -               err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k],
> > -                                XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
> > -               if (err)
> > -                       break;
> > -       }
> > -
> > -       xa_unlock_bh(&sk->sk_user_frags);
> > -
> > -       p->max = k;
> > -       p->idx = 0;
> > -       return k ? 0 : err;
> > -}
> > -
> >  /* On error, returns the -errno. On success, returns number of bytes sent to the
> >   * user. May not consume all of @remaining_len.
> >   */
> > @@ -2456,14 +2396,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
> >                               int remaining_len)
> >  {
> >         struct dmabuf_cmsg dmabuf_cmsg = { 0 };
> > -       struct tcp_xa_pool tcp_xa_pool;
> >         unsigned int start;
> >         int i, copy, n;
> >         int sent = 0;
> >         int err = 0;
> >
> > -       tcp_xa_pool.max = 0;
> > -       tcp_xa_pool.idx = 0;
> >         do {
> >                 start = skb_headlen(skb);
> >
> > @@ -2510,8 +2447,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
> >                  */
> >                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> >                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> > +                       struct net_devmem_dmabuf_binding *binding;
> >                         struct net_iov *niov;
> >                         u64 frag_offset;
> > +                       size_t size;
> > +                       u32 token;
> >                         int end;
> >
> >                         /* !skb_frags_readable() should indicate that ALL the
> > @@ -2544,13 +2484,35 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
> >                                               start;
> >                                 dmabuf_cmsg.frag_offset = frag_offset;
> >                                 dmabuf_cmsg.frag_size = copy;
> > -                               err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
> > -                                                        skb_shinfo(skb)->nr_frags - i);
> > -                               if (err)
> > +
> > +                               binding = net_devmem_iov_binding(niov);
> > +
> > +                               if (!sk->sk_user_frags.binding) {
> > +                                       sk->sk_user_frags.binding = binding;
> > +
> > +                                       size = binding->dmabuf->size / PAGE_SIZE;
> > +                                       sk->sk_user_frags.urefs = kzalloc(size,
> > +                                                                         GFP_KERNEL);
> > +                                       if (!sk->sk_user_frags.urefs) {
> > +                                               sk->sk_user_frags.binding = NULL;
> > +                                               err = -ENOMEM;
> > +                                               goto out;
> > +                                       }
> > +
> > +                                       net_devmem_dmabuf_binding_get(binding);
> 
> It's not clear to me why we need to get the binding. AFAIR the way it
> works is that we grab a reference on the net_iov, which guarantees
> that the associated pp is alive, which in turn guarantees that the
> binding remains alive and we don't need to get the binding for every
> frag.
> 

Oh you are right, the chain of references will keep it alive at least
until dontneed.

> > +                               }
> > +
> > +                               if (WARN_ONCE(sk->sk_user_frags.binding != binding,
> > +                                             "binding changed for devmem socket")) {
> 
> Remove the WARN_ONCE. It's not reasonable to emit a kernel splat just
> because the user reconfigured flow steering, methinks.
> 

Sounds good.

> > +                                       err = -EFAULT;
> >                                         goto out;
> > +                               }
> > +
> > +                               token = net_iov_virtual_addr(niov) >> PAGE_SHIFT;
> > +                               binding->vec[token] = niov;
> 
> I don't think you can do this? I thought vec[token] was already == niov.
> 
> binding->vec should be initialized when the binding is initialized,
> not re-written on every recvmsg, no?
> 

Currently binding->vec is only initialized like that if direction ==
DMA_TO_DEVICE, but now that you mention it I don't see why RX shouldn't
initialize it upfront as well.
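
For the next rev I'm thinking of something along these lines in
net_devmem_bind_dmabuf() (sketch only; the exact loop shape and field
names are from memory and may not match the tree):

/* In the per-chunk niov setup loop, fill vec for both directions so the
 * RX path can treat it as read-only and recvmsg() no longer needs the
 * binding->vec[token] = niov store.
 */
for (i = 0; i < owner->area.num_niovs; i++) {
	struct net_iov *niov = &owner->area.niovs[i];

	binding->vec[net_iov_virtual_addr(niov) >> PAGE_SHIFT] = niov;
}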

[...]

> Thanks,
> Mina

Thanks again for the review, much appreciated!

Best,
Bobby

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-09-04  0:15 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-02 21:36 [PATCH net-next 0/2] net: devmem: improve cpu cost of RX token management Bobby Eshleman
2025-09-02 21:36 ` [PATCH net-next 1/2] net: devmem: rename tx_vec to vec in dmabuf binding Bobby Eshleman
2025-09-02 21:36 ` [PATCH net-next 2/2] net: devmem: use niov array for token management Bobby Eshleman
2025-09-03 10:50   ` kernel test robot
2025-09-03 20:20   ` Mina Almasry
2025-09-04  0:15     ` Bobby Eshleman
2025-09-03 17:46 ` [syzbot ci] Re: net: devmem: improve cpu cost of RX " syzbot ci
