public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Bobby Eshleman <bobbyeshleman@gmail.com>
To: Andrew Lunn <andrew+netdev@lunn.ch>,
	 "David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	 Jakub Kicinski <kuba@kernel.org>,
	Paolo Abeni <pabeni@redhat.com>,  Simon Horman <horms@kernel.org>,
	Jonathan Corbet <corbet@lwn.net>,
	 Shuah Khan <skhan@linuxfoundation.org>,
	Alex Shi <alexs@kernel.org>,  Yanteng Si <si.yanteng@linux.dev>,
	Dongliang Mu <dzm91@hust.edu.cn>,
	 Michael Chan <michael.chan@broadcom.com>,
	 Pavan Chebbi <pavan.chebbi@broadcom.com>,
	 Joshua Washington <joshwash@google.com>,
	 Harshitha Ramamurthy <hramamurthy@google.com>,
	 Saeed Mahameed <saeedm@nvidia.com>,
	Tariq Toukan <tariqt@nvidia.com>,  Mark Bloch <mbloch@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>,
	 Alexander Duyck <alexanderduyck@fb.com>,
	kernel-team@meta.com,  Daniel Borkmann <daniel@iogearbox.net>,
	 Nikolay Aleksandrov <razor@blackwall.org>,
	Shuah Khan <shuah@kernel.org>
Cc: netdev@vger.kernel.org, linux-doc@vger.kernel.org,
	 linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
	 bpf@vger.kernel.org, linux-kselftest@vger.kernel.org,
	 Stanislav Fomichev <sdf@fomichev.me>,
	Mina Almasry <almasrymina@google.com>,
	 Bobby Eshleman <bobbyeshleman@meta.com>
Subject: [PATCH net-next 07/11] net: devmem: support TX over NETMEM_TX_NO_DMA devices
Date: Tue, 28 Apr 2026 15:42:04 -0700	[thread overview]
Message-ID: <20260428-tcp-dm-netkit-v1-7-719280eba4d2@meta.com> (raw)
In-Reply-To: <20260428-tcp-dm-netkit-v1-0-719280eba4d2@meta.com>

From: Bobby Eshleman <bobbyeshleman@meta.com>

When a netkit virtual device leases queues from a physical NIC, devmem
TX bindings created on the netkit device must still result in the dmabuf
being mapped for dma by the physical device. This patch accomplishes
this by teaching the bind handler to search for the underlying
DMA-capable device by looking it up via leased rx queues. The function
netdev_find_netmem_tx_dev(), used for finding the underlying DMA-capable
device, can be extended to support other non-netkit NETMEM_TX_NO_DMA
devices in the future if needed.

Additionally, this patch extends validate_xmit_unreadable_skb() to
support the netkit case, where the skb is validated twice: once on the
netkit guest device and again on the physical NIC after BPF redirect or
ip forwarding.

Assisted-by: Claude Code:claude-sonnet-4-6
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
---
 net/core/dev.c         | 24 ++++++++++++++++-------
 net/core/devmem.c      |  6 ++++--
 net/core/devmem.h      |  9 +++++++--
 net/core/netdev-genl.c | 53 +++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 06c195906231..f6575cf48287 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3990,22 +3990,32 @@ static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
 static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
 						    struct net_device *dev)
 {
+	struct net_devmem_dmabuf_binding *binding;
 	struct skb_shared_info *shinfo;
 	struct net_iov *niov;
 
 	if (likely(skb_frags_readable(skb)))
 		goto out;
 
-	if (!dev->netmem_tx)
-		goto out_free;
-
 	shinfo = skb_shinfo(skb);
+	if (shinfo->nr_frags == 0)
+		goto out;
 
-	if (shinfo->nr_frags > 0) {
-		niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
-		if (net_is_devmem_iov(niov) &&
-		    READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev)
+	niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+	if (!net_is_devmem_iov(niov))
+		goto out;
+
+	binding = net_devmem_iov_binding(niov);
+
+	switch (dev->netmem_tx) {
+	case NETMEM_TX_DMA:
+		if (READ_ONCE(binding->dev) != dev)
 			goto out_free;
+		break;
+	case NETMEM_TX_NO_DMA:
+		break;
+	default: /* NETMEM_TX_NONE */
+		goto out_free;
 	}
 
 out:
diff --git a/net/core/devmem.c b/net/core/devmem.c
index cde4c89bc146..644c286b778f 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -181,7 +181,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
 }
 
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *vdev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
 		       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
@@ -212,6 +212,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
 	}
 
 	binding->dev = dev;
+	binding->vdev = vdev;
 	xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
 
 	err = percpu_ref_init(&binding->ref,
@@ -397,7 +398,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
 	 */
 	dst_dev = dst_dev_rcu(dst);
 	if (unlikely(!dst_dev) ||
-	    unlikely(dst_dev != READ_ONCE(binding->dev))) {
+	    unlikely(dst_dev != READ_ONCE(binding->dev) &&
+		     dst_dev != READ_ONCE(binding->vdev))) {
 		err = -ENODEV;
 		goto out_unlock;
 	}
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 1c5c18581fcb..f399632b3c4b 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -19,7 +19,12 @@ struct net_devmem_dmabuf_binding {
 	struct dma_buf *dmabuf;
 	struct dma_buf_attachment *attachment;
 	struct sg_table *sgt;
+	/* Physical NIC that does the actual DMA for this binding. */
 	struct net_device *dev;
+	/* Virtual device (e.g. netkit) the user called bind-tx on. Must be
+	 * NETMEM_TX_NO_DMA.
+	 */
+	struct net_device *vdev;
 	struct gen_pool *chunk_pool;
 	/* Protect dev */
 	struct mutex lock;
@@ -84,7 +89,7 @@ struct dmabuf_genpool_chunk_owner {
 
 void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
 struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *vdev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
 		       unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
@@ -165,7 +170,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
 }
 
 static inline struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev,
+net_devmem_bind_dmabuf(struct net_device *dev, struct net_device *vdev,
 		       struct device *dma_dev,
 		       enum dma_data_direction direction,
 		       unsigned int dmabuf_fd,
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index b8f6076d8007..bc6057aee98e 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -1077,7 +1077,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 		goto err_rxq_bitmap;
 	}
 
-	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+	binding = net_devmem_bind_dmabuf(netdev, NULL, dma_dev, DMA_FROM_DEVICE,
 					 dmabuf_fd, priv, info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
@@ -1119,9 +1119,42 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
+/* Find the DMA-capable device for netmem TX binding.
+ * For NETMEM_TX_DMA devices, returns the device itself.
+ * For NETMEM_TX_NO_DMA devices (e.g. netkit), walks leased queues
+ * to find the underlying physical device.
+ * Returns NULL if no suitable device is found.
+ */
+static struct net_device *netdev_find_netmem_tx_dev(struct net_device *dev)
+{
+	struct netdev_rx_queue *lease_rxq;
+	struct net_device *phys_dev;
+	int i;
+
+	if (dev->netmem_tx == NETMEM_TX_DMA)
+		return dev;
+
+	if (dev->netmem_tx != NETMEM_TX_NO_DMA)
+		return NULL;
+
+	for (i = 0; i < dev->real_num_rx_queues; i++) {
+		lease_rxq = READ_ONCE(__netif_get_rx_queue(dev, i)->lease);
+		if (!lease_rxq)
+			continue;
+
+		phys_dev = lease_rxq->dev;
+		if (netif_device_present(phys_dev) &&
+		    phys_dev->netmem_tx == NETMEM_TX_DMA)
+			return phys_dev;
+	}
+
+	return NULL;
+}
+
 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 {
 	struct net_devmem_dmabuf_binding *binding;
+	struct net_device *bind_dev;
 	struct netdev_nl_sock *priv;
 	struct net_device *netdev;
 	struct device *dma_dev;
@@ -1164,16 +1197,26 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
 		goto err_unlock_netdev;
 	}
 
-	if (!netdev->netmem_tx) {
+	if (netdev->netmem_tx == NETMEM_TX_NONE) {
 		err = -EOPNOTSUPP;
 		NL_SET_ERR_MSG(info->extack,
 			       "Driver does not support netmem TX");
 		goto err_unlock_netdev;
 	}
 
-	dma_dev = netdev_queue_get_dma_dev(netdev, 0, NETDEV_QUEUE_TYPE_TX);
-	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
-					 dmabuf_fd, priv, info->extack);
+	bind_dev = netdev_find_netmem_tx_dev(netdev);
+	if (!bind_dev) {
+		err = -EOPNOTSUPP;
+		NL_SET_ERR_MSG(info->extack,
+			       "No DMA-capable device found for netmem TX");
+		goto err_unlock_netdev;
+	}
+
+	dma_dev = netdev_queue_get_dma_dev(bind_dev, 0, NETDEV_QUEUE_TYPE_TX);
+	binding = net_devmem_bind_dmabuf(bind_dev,
+					 bind_dev != netdev ? netdev : NULL,
+					 dma_dev, DMA_TO_DEVICE, dmabuf_fd,
+					 priv, info->extack);
 	if (IS_ERR(binding)) {
 		err = PTR_ERR(binding);
 		goto err_unlock_netdev;

-- 
2.52.0


  parent reply	other threads:[~2026-04-28 22:42 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-28 22:41 [PATCH net-next 00/11] net: devmem: support devmem with netkit devices Bobby Eshleman
2026-04-28 22:41 ` [PATCH net-next 01/11] net: add netmem_tx modes that indicate dma capability Bobby Eshleman
2026-04-28 22:41 ` [PATCH net-next 02/11] net: bnxt: convert netmem_tx from bool to NETMEM_TX_DMA enum Bobby Eshleman
2026-04-28 22:42 ` [PATCH net-next 03/11] gve: " Bobby Eshleman
2026-04-28 22:42 ` [PATCH net-next 04/11] net/mlx5e: " Bobby Eshleman
2026-04-28 22:42 ` [PATCH net-next 05/11] eth: fbnic: " Bobby Eshleman
2026-04-28 22:42 ` [PATCH net-next 06/11] netkit: set NETMEM_TX_NO_DMA for unreadable skb passthrough Bobby Eshleman
2026-04-28 22:42 ` Bobby Eshleman [this message]
2026-05-01  0:57   ` [PATCH net-next 07/11] net: devmem: support TX over NETMEM_TX_NO_DMA devices Jakub Kicinski
2026-05-01  1:07     ` Bobby Eshleman
2026-04-28 22:42 ` [PATCH net-next 08/11] selftests: drv-net: ncdevmem: add -n flag to skip NIC configuration Bobby Eshleman
2026-04-28 22:42 ` [PATCH net-next 09/11] selftests: drv-net: refactor devmem command builders into lib module Bobby Eshleman
2026-04-28 22:42 ` [PATCH net-next 10/11] selftests: drv-net: add primary_rx_redirect support to NetDrvContEnv Bobby Eshleman
2026-04-28 22:42 ` [PATCH net-next 11/11] selftests: drv-net: add netkit devmem tests Bobby Eshleman
2026-04-29 12:08 ` [PATCH net-next 00/11] net: devmem: support devmem with netkit devices Daniel Borkmann
2026-04-29 15:18   ` Bobby Eshleman
2026-04-29 15:33     ` Daniel Borkmann
2026-05-01  0:59 ` Jakub Kicinski
2026-05-01  1:04   ` Bobby Eshleman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260428-tcp-dm-netkit-v1-7-719280eba4d2@meta.com \
    --to=bobbyeshleman@gmail.com \
    --cc=alexanderduyck@fb.com \
    --cc=alexs@kernel.org \
    --cc=almasrymina@google.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=bobbyeshleman@meta.com \
    --cc=bpf@vger.kernel.org \
    --cc=corbet@lwn.net \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=dzm91@hust.edu.cn \
    --cc=edumazet@google.com \
    --cc=horms@kernel.org \
    --cc=hramamurthy@google.com \
    --cc=joshwash@google.com \
    --cc=kernel-team@meta.com \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=michael.chan@broadcom.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=pavan.chebbi@broadcom.com \
    --cc=razor@blackwall.org \
    --cc=saeedm@nvidia.com \
    --cc=sdf@fomichev.me \
    --cc=shuah@kernel.org \
    --cc=si.yanteng@linux.dev \
    --cc=skhan@linuxfoundation.org \
    --cc=tariqt@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox