All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jakub Kicinski <kuba@kernel.org>
To: Daniel Borkmann <daniel@iogearbox.net>
Cc: netdev@vger.kernel.org, bpf@vger.kernel.org, davem@davemloft.net,
	razor@blackwall.org, pabeni@redhat.com, willemb@google.com,
	sdf@fomichev.me, john.fastabend@gmail.com, martin.lau@kernel.org,
	jordan@jrife.io, maciej.fijalkowski@intel.com,
	magnus.karlsson@intel.com, dw@davidwei.uk, toke@redhat.com,
	yangzhenze@bytedance.com, wangdongdong.6@bytedance.com
Subject: Re: [PATCH net-next v8 02/16] net: Implement netdev_nl_queue_create_doit
Date: Sat, 31 Jan 2026 15:45:23 -0800	[thread overview]
Message-ID: <20260131154523.5e495380@kernel.org> (raw)
In-Reply-To: <20260129222830.439687-3-daniel@iogearbox.net>

On Thu, 29 Jan 2026 23:28:16 +0100 Daniel Borkmann wrote:
> Implement netdev_nl_queue_create_doit which creates a new rx queue in a
> virtual netdev and then leases it to a rx queue in a physical netdev.
> 
> Example with ynl client:
> 
>   # ./pyynl/cli.py \
>       --spec ~/netlink/specs/netdev.yaml \

nit: please use "ynl --family netdev" instead of ./pyynl/cli.py
--spec...

> Note that the netdevice locking order is always from the virtual to
> the physical device.

There is a big comment in netdevice.h (search for "netdev-scope lock")
documenting the instance lock, since you are adding technical deb^w^w 
an ordering rule we should mention it there concisely and also update
the "netdev instance lock" section of netdevices.rst

> +	int	(*ndo_queue_create)(struct net_device *dev);

please propagate extack to the driver

>  	unsigned int supported_params;
>  };
> @@ -185,7 +191,9 @@ struct netdev_queue_mgmt_ops {
>  void netdev_queue_config(struct net_device *dev, int rxq,
>  			 struct netdev_queue_config *qcfg);
>  
> -bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx);
> +bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx);
> +bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx);
> +bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx);

The new function is internal to the core, please add the declarations
in the appropriate place in net/core/dev.h instead.

Coincidentally we should probably delete netif_rxq_has_unreadable_mp()
completely and pass the "has unreadable mp" as a flag inside qcfg
instead. I'll clean that up after your patches.

>  /**
>   * DOC: Lockless queue stopping / waking helpers.
> @@ -374,5 +382,10 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq)
>  	})
>  
>  struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
> -
> -#endif
> +bool netdev_can_create_queue(const struct net_device *dev,
> +			     struct netlink_ext_ack *extack);
> +bool netdev_can_lease_queue(const struct net_device *dev,
> +			    struct netlink_ext_ack *extack);
> +bool netdev_queue_busy(struct net_device *dev, int idx,
> +		       struct netlink_ext_ack *extack);

ditto. FWIW it's fine for ethtool to include using a relative path,
it already does that in a few places

> +#endif /* _LINUX_NET_QUEUES_H */
> diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
> index cfa72c485387..967bec9b3c6a 100644
> --- a/include/net/netdev_rx_queue.h
> +++ b/include/net/netdev_rx_queue.h
> @@ -30,6 +30,8 @@ struct netdev_rx_queue {
>  	struct napi_struct		*napi;
>  	struct netdev_queue_config	qcfg;
>  	struct pp_memory_provider_params mp_params;

Could you add a comment here explaining whether this pointer is to 
or from the lease or both, depending whether the device is virt?

> +	struct netdev_rx_queue		*lease;
> +	netdevice_tracker		lease_tracker;
>  } ____cacheline_aligned_in_smp;
>  
>  /*
> @@ -59,5 +61,8 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue)
>  }
>  
>  int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
> -
> -#endif
> +void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
> +			   struct netdev_rx_queue *rxq_src);
> +void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
> +			     struct netdev_rx_queue *rxq_src);

dev.h

>  int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info)
>  {
> -	return -EOPNOTSUPP;
> +	const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
> +	const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1;
> +	int err, ifindex, ifindex_lease, queue_id, queue_id_lease;
> +	struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
> +	struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)];
> +	struct netdev_rx_queue *rxq, *rxq_lease;
> +	struct net_device *dev, *dev_lease;
> +	netdevice_tracker dev_tracker;
> +	s32 netns_lease = -1;
> +	struct nlattr *nest;
> +	struct sk_buff *rsp;
> +	struct net *net;
> +	void *hdr;
> +
> +	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) ||
> +	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
> +	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE))
> +		return -EINVAL;
> +	if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) !=
> +	    NETDEV_QUEUE_TYPE_RX) {
> +		NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]);
> +		return -EINVAL;
> +	}
> +
> +	ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
> +
> +	nest = info->attrs[NETDEV_A_QUEUE_LEASE];
> +	err = nla_parse_nested(ltb, lmaxtype, nest,
> +			       netdev_lease_nl_policy, info->extack);
> +	if (err < 0)
> +		return err;
> +	if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) ||
> +	    NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE))
> +		return -EINVAL;
> +	if (ltb[NETDEV_A_LEASE_NETNS_ID]) {
> +		netns_lease = nla_get_s32(ltb[NETDEV_A_LEASE_NETNS_ID]);
> +		if (netns_lease < 0) {

Let's add this to the spec / policy then?

checks:
  min: 0

> +			NL_SET_BAD_ATTR(info->extack, ltb[NETDEV_A_LEASE_NETNS_ID]);
> +			return -EINVAL;
> +		}
> +		if (!capable(CAP_NET_ADMIN))
> +			return -EPERM;
> +	}
> +
> +	ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]);
> +
> +	nest = ltb[NETDEV_A_LEASE_QUEUE];
> +	err = nla_parse_nested(qtb, qmaxtype, nest,
> +			       netdev_queue_id_nl_policy, info->extack);
> +	if (err < 0)
> +		return err;
> +	if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) ||
> +	    NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE))
> +		return -EINVAL;
> +	if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
> +		NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]);
> +		return -EINVAL;
> +	}
> +
> +	queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]);
> +
> +	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
> +	if (!rsp)
> +		return -ENOMEM;
> +
> +	hdr = genlmsg_iput(rsp, info);
> +	if (!hdr) {
> +		err = -EMSGSIZE;
> +		goto err_genlmsg_free;
> +	}
> +
> +	/* Locking order is always from the virtual to the physical device
> +	 * since this is also the same order when applications open the
> +	 * memory provider later on.
> +	 */
> +	dev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
> +	if (!dev) {
> +		err = -ENODEV;
> +		goto err_genlmsg_free;
> +	}
> +	if (!netdev_can_create_queue(dev, info->extack)) {
> +		err = -EINVAL;
> +		goto err_unlock_dev;
> +	}
> +
> +	net = genl_info_net(info);
> +	if (netns_lease >= 0) {
> +		net = get_net_ns_by_id(net, netns_lease);
> +		if (!net) {
> +			err = -ENONET;
> +			goto err_unlock_dev;
> +		}
> +	}
> +	if (net_eq(net, dev_net(dev)) &&
> +	    ifindex == ifindex_lease) {

Is this check actually needed? The device can't be physical and virtual
at once so the locking safety check would fail anyway, no?

> +		NL_SET_ERR_MSG(info->extack,
> +			"Lease ifindex cannot be the same as queue creation ifindex");
> +		err = -EINVAL;
> +		goto err_put_netns;
> +	}
> +
> +	dev_lease = netdev_get_by_index(net, ifindex_lease, &dev_tracker,
> +					GFP_KERNEL);
> +	if (!dev_lease) {
> +		err = -ENODEV;
> +		goto err_put_netns;
> +	}
> +	if (!netdev_can_lease_queue(dev_lease, info->extack)) {
> +		netdev_put(dev_lease, &dev_tracker);
> +		err = -EINVAL;
> +		goto err_put_netns;
> +	}
> +
> +	dev_lease = netdev_put_lock(dev_lease, &dev_tracker);
> +	if (!dev_lease) {
> +		err = -ENODEV;
> +		goto err_put_netns;
> +	}
> +	if (queue_id_lease >= dev_lease->real_num_rx_queues) {
> +		err = -ERANGE;
> +		NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]);
> +		goto err_unlock_dev_lease;
> +	}
> +	if (netdev_queue_busy(dev_lease, queue_id_lease, info->extack)) {
> +		err = -EBUSY;
> +		goto err_unlock_dev_lease;
> +	}
> +
> +	rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease);
> +	rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1);
> +
> +	if (rxq->lease && rxq->lease->dev != dev_lease) {

IIUC the simplification of having all leases from one devices is still
a netkit thing? I mean - there's nothing in the core that depends on
this, just the cleanup / notifier handling in netkit? If that's the
case let's move this check into netkit.

Sorry if you moved this here because I asked to move as much as
possible into the core.

> +		err = -EOPNOTSUPP;
> +		NL_SET_ERR_MSG(info->extack,
> +			       "Leasing multiple queues from different devices not supported");
> +		goto err_unlock_dev_lease;
> +	}
> +
> +	err = queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev);
> +	if (err < 0) {
> +		NL_SET_ERR_MSG(info->extack,
> +			       "Device is unable to create a new queue");

As flagged, if we pass extack to the driver it should be able to give us
a more accurate reason

> +		goto err_unlock_dev_lease;
> +	}
> +
> +	rxq = __netif_get_rx_queue(dev, queue_id);
> +	netdev_rx_queue_lease(rxq, rxq_lease);

nit: the call to __netif_get_rx_queue() could move into
netdev_rx_queue_lease()

> +	nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id);
> +	genlmsg_end(rsp, hdr);
> +
> +	netdev_unlock(dev_lease);
> +	netdev_unlock(dev);
> +	if (netns_lease >= 0)
> +		put_net(net);

  reply	other threads:[~2026-01-31 23:45 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-29 22:28 [PATCH net-next v8 00/16] netkit: Support for io_uring zero-copy and AF_XDP Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 01/16] net: Add queue-create operation Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 02/16] net: Implement netdev_nl_queue_create_doit Daniel Borkmann
2026-01-31 23:45   ` Jakub Kicinski [this message]
2026-03-05  4:38     ` Daniel Borkmann
2026-03-06  2:10       ` Jakub Kicinski
2026-03-06  5:49         ` Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 03/16] net: Add lease info to queue-get response Daniel Borkmann
2026-01-31 23:43   ` Jakub Kicinski
2026-02-01  0:15   ` Jakub Kicinski
2026-01-29 22:28 ` [PATCH net-next v8 04/16] net, ethtool: Disallow leased real rxqs to be resized Daniel Borkmann
2026-01-31 23:45   ` Jakub Kicinski
2026-01-29 22:28 ` [PATCH net-next v8 05/16] net: Slightly simplify net_mp_{open,close}_rxq Daniel Borkmann
2026-01-31 23:48   ` Jakub Kicinski
2026-01-29 22:28 ` [PATCH net-next v8 06/16] net: Proxy net_mp_{open,close}_rxq for leased queues Daniel Borkmann
2026-02-01  0:02   ` Jakub Kicinski
2026-02-01 22:09     ` David Wei
2026-01-29 22:28 ` [PATCH net-next v8 07/16] net: Proxy netdev_queue_get_dma_dev " Daniel Borkmann
2026-02-01  0:04   ` Jakub Kicinski
2026-02-01 22:23     ` David Wei
2026-01-29 22:28 ` [PATCH net-next v8 08/16] xsk: Extend xsk_rcv_check validation Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 09/16] xsk: Proxy pool management for leased queues Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 10/16] netkit: Add single device mode for netkit Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 11/16] netkit: Implement rtnl_link_ops->alloc and ndo_queue_create Daniel Borkmann
2026-02-01  0:19   ` Jakub Kicinski
2026-02-01 22:27     ` David Wei
2026-01-29 22:28 ` [PATCH net-next v8 12/16] netkit: Add netkit notifier to check for unregistering devices Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 13/16] netkit: Add xsk support for af_xdp applications Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 14/16] selftests/net: Add bpf skb forwarding program Daniel Borkmann
2026-01-29 22:28 ` [PATCH net-next v8 15/16] selftests/net: Add env for container based tests Daniel Borkmann
2026-02-01  0:38   ` Jakub Kicinski
2026-02-01 22:53     ` David Wei
2026-02-02 18:41       ` Jakub Kicinski
2026-02-10  0:25         ` David Wei
2026-02-05  2:08       ` Bobby Eshleman
2026-02-05  2:34         ` Bobby Eshleman
2026-02-10 17:30           ` David Wei
2026-01-29 22:28 ` [PATCH net-next v8 16/16] selftests/net: Add netkit container tests Daniel Borkmann
2026-02-01  0:24   ` Jakub Kicinski
2026-02-01 22:54     ` David Wei
2026-02-05  1:44   ` Bobby Eshleman

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260131154523.5e495380@kernel.org \
    --to=kuba@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=dw@davidwei.uk \
    --cc=john.fastabend@gmail.com \
    --cc=jordan@jrife.io \
    --cc=maciej.fijalkowski@intel.com \
    --cc=magnus.karlsson@intel.com \
    --cc=martin.lau@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=razor@blackwall.org \
    --cc=sdf@fomichev.me \
    --cc=toke@redhat.com \
    --cc=wangdongdong.6@bytedance.com \
    --cc=willemb@google.com \
    --cc=yangzhenze@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.