All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
	Jason Wang <jasowang@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
	linux-kselftest@vger.kernel.org, bpf@vger.kernel.org,
	davem@davemloft.net, kuba@kernel.org, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, kafai@fb.com,
	songliubraving@fb.com, yhs@fb.com, john.fastabend@gmail.com,
	kpsingh@kernel.org, rdunlap@infradead.org, willemb@google.com,
	gustavoars@kernel.org, herbert@gondor.apana.org.au,
	steffen.klassert@secunet.com, nogikh@google.com,
	pablo@netfilter.org, decui@microsoft.com, cai@lca.pw,
	jakub@cloudflare.com, elver@google.com, pabeni@redhat.com,
	Yuri Benditovich <yuri.benditovich@daynix.com>,
	Akihiko Odaki <akihiko.odaki@daynix.com>
Cc: xuanzhuo@linux.alibaba.com, shuah@kernel.org
Subject: Re: [RFC PATCH 5/7] tun: Introduce virtio-net hashing feature
Date: Mon, 9 Oct 2023 07:38:52 -0400	[thread overview]
Message-ID: <20231009071226-mutt-send-email-mst@kernel.org> (raw)
In-Reply-To: <20231008052101.144422-6-akihiko.odaki@daynix.com>

Akihiko Odaki sorry about reposts.
Having an email with two "@" in the CC list:
	 xuanzhuo@linux.alibaba.comshuah@kernel.org
tripped up mutt's reply-all for me and made it send to you only.

I am guessing you meant two addresses: xuanzhuo@linux.alibaba.com
and shuah@kernel.org.


On Sun, Oct 08, 2023 at 02:20:49PM +0900, Akihiko Odaki wrote:
> virtio-net have two usage of hashes: one is RSS and another is hash
> reporting. Conventionally the hash calculation was done by the VMM.
> However, computing the hash after the queue was chosen defeats the
> purpose of RSS.
> 
> Another approach is to use eBPF steering program. This approach has
> another downside: it cannot report the calculated hash due to the
> restrictive nature of eBPF.
> 
> Introduce the code to compute hashes to the kernel in order to overcome
> thse challenges. An alternative solution is to extend the eBPF steering
> program so that it will be able to report to the userspace, but it makes
> little sense to allow to implement different hashing algorithms with
> eBPF since the hash value reported by virtio-net is strictly defined by
> the specification.
> 
> The hash value already stored in sk_buff is not used and computed
> independently since it may have been computed in a way not conformant
> with the specification.
> 
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> ---
>  drivers/net/tun.c           | 187 ++++++++++++++++++++++++++++++++----
>  include/uapi/linux/if_tun.h |  16 +++
>  2 files changed, 182 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 89ab9efe522c..561a573cd008 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -171,6 +171,9 @@ struct tun_prog {
>  	struct bpf_prog *prog;
>  };
>  
> +#define TUN_VNET_HASH_MAX_KEY_SIZE 40
> +#define TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH 128
> +

where do these come from?

>  /* Since the socket were moved to tun_file, to preserve the behavior of persist
>   * device, socket filter, sndbuf and vnet header size were restore when the
>   * file were attached to a persist device.
> @@ -209,6 +212,9 @@ struct tun_struct {
>  	struct tun_prog __rcu *steering_prog;
>  	struct tun_prog __rcu *filter_prog;
>  	struct ethtool_link_ksettings link_ksettings;
> +	struct tun_vnet_hash vnet_hash;
> +	u16 vnet_hash_indirection_table[TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH];
> +	u32 vnet_hash_key[TUN_VNET_HASH_MAX_KEY_SIZE / 4];

That's quite a lot of data to add in this struct, and will be used
by a small minority of users. Are you pushing any hot data out of cache
with this? Why not allocate these as needed?

>  	/* init args */
>  	struct file *file;
>  	struct ifreq *ifr;
> @@ -219,6 +225,13 @@ struct veth {
>  	__be16 h_vlan_TCI;
>  };
>  
> +static const struct tun_vnet_hash_cap tun_vnet_hash_cap = {
> +	.max_indirection_table_length =
> +		TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH,
> +
> +	.types = VIRTIO_NET_SUPPORTED_HASH_TYPES
> +};
> +
>  static void tun_flow_init(struct tun_struct *tun);
>  static void tun_flow_uninit(struct tun_struct *tun);
>  
> @@ -320,10 +333,16 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
>  	if (get_user(be, argp))
>  		return -EFAULT;
>  
> -	if (be)
> +	if (be) {
> +		if (!(tun->flags & TUN_VNET_LE) &&
> +		    (tun->vnet_hash.flags & TUN_VNET_HASH_REPORT)) {
> +			return -EINVAL;
> +		}
> +
>  		tun->flags |= TUN_VNET_BE;
> -	else
> +	} else {
>  		tun->flags &= ~TUN_VNET_BE;
> +	}
>  
>  	return 0;
>  }
> @@ -558,15 +577,47 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>  	return ret % numqueues;
>  }
>  
> +static u16 tun_vnet_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> +{
> +	u32 value = qdisc_skb_cb(skb)->tun_vnet_hash_value;
> +	u32 numqueues;
> +	u32 index;
> +	u16 queue;
> +
> +	numqueues = READ_ONCE(tun->numqueues);
> +	if (!numqueues)
> +		return 0;
> +
> +	index = value & READ_ONCE(tun->vnet_hash.indirection_table_mask);
> +	queue = READ_ONCE(tun->vnet_hash_indirection_table[index]);
> +	if (!queue)
> +		queue = READ_ONCE(tun->vnet_hash.unclassified_queue);

Apparently 0 is an illegal queue value? You are making this part
of UAPI better document things like this.

> +
> +	return queue % numqueues;
> +}
> +
>  static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>  			    struct net_device *sb_dev)
>  {
>  	struct tun_struct *tun = netdev_priv(dev);
> +	u8 vnet_hash_flags = READ_ONCE(tun->vnet_hash.flags);
> +	struct virtio_net_hash hash;
>  	u16 ret;
>  
> +	if (vnet_hash_flags & (TUN_VNET_HASH_RSS | TUN_VNET_HASH_REPORT)) {
> +		virtio_net_hash(skb, READ_ONCE(tun->vnet_hash.types),
> +				tun->vnet_hash_key, &hash);
> +

What are all these READ_ONCE things doing?
E.g. you seem to be very content to have tun->vnet_hash.types read twice apparently?
This is volatile which basically breaks all compiler's attempts
to optimize code - needs to be used judiciously.



> +		skb->tun_vnet_hash = true;
> +		qdisc_skb_cb(skb)->tun_vnet_hash_value = hash.value;
> +		qdisc_skb_cb(skb)->tun_vnet_hash_report = hash.report;
> +	}
> +
>  	rcu_read_lock();
>  	if (rcu_dereference(tun->steering_prog))
>  		ret = tun_ebpf_select_queue(tun, skb);
> +	else if (vnet_hash_flags & TUN_VNET_HASH_RSS)
> +		ret = tun_vnet_select_queue(tun, skb);
>  	else
>  		ret = tun_automq_select_queue(tun, skb);
>  	rcu_read_unlock();
> @@ -2088,10 +2139,15 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>  			    struct iov_iter *iter)
>  {
>  	struct tun_pi pi = { 0, skb->protocol };
> +	struct virtio_net_hash vnet_hash = {
> +		.value = qdisc_skb_cb(skb)->tun_vnet_hash_value,
> +		.report = qdisc_skb_cb(skb)->tun_vnet_hash_report
> +	};
>  	ssize_t total;
>  	int vlan_offset = 0;
>  	int vlan_hlen = 0;
>  	int vnet_hdr_sz = 0;
> +	size_t vnet_hdr_content_sz;
>  
>  	if (skb_vlan_tag_present(skb))
>  		vlan_hlen = VLAN_HLEN;
> @@ -2116,31 +2172,49 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>  	}
>  
>  	if (vnet_hdr_sz) {
> -		struct virtio_net_hdr gso;
> +		union {
> +			struct virtio_net_hdr hdr;
> +			struct virtio_net_hdr_v1_hash v1_hash_hdr;
> +		} hdr;
> +		int ret;
>  
>  		if (iov_iter_count(iter) < vnet_hdr_sz)
>  			return -EINVAL;
>  
> -		if (virtio_net_hdr_from_skb(skb, &gso,
> -					    tun_is_little_endian(tun), true,
> -					    vlan_hlen)) {
> +		if ((READ_ONCE(tun->vnet_hash.flags) & TUN_VNET_HASH_REPORT) &&
> +		    vnet_hdr_sz >= sizeof(hdr.v1_hash_hdr) &&
> +		    skb->tun_vnet_hash) {
> +			vnet_hdr_content_sz = sizeof(hdr.v1_hash_hdr);
> +			ret = virtio_net_hdr_v1_hash_from_skb(skb,
> +							      &hdr.v1_hash_hdr,
> +							      true,
> +							      vlan_hlen,
> +							      &vnet_hash);
> +		} else {
> +			vnet_hdr_content_sz = sizeof(hdr.hdr);
> +			ret = virtio_net_hdr_from_skb(skb, &hdr.hdr,
> +						      tun_is_little_endian(tun),
> +						      true, vlan_hlen);
> +		}
> +
> +		if (ret) {
>  			struct skb_shared_info *sinfo = skb_shinfo(skb);
>  			pr_err("unexpected GSO type: "
>  			       "0x%x, gso_size %d, hdr_len %d\n",
> -			       sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
> -			       tun16_to_cpu(tun, gso.hdr_len));
> +			       sinfo->gso_type, tun16_to_cpu(tun, hdr.hdr.gso_size),
> +			       tun16_to_cpu(tun, hdr.hdr.hdr_len));
>  			print_hex_dump(KERN_ERR, "tun: ",
>  				       DUMP_PREFIX_NONE,
>  				       16, 1, skb->head,
> -				       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
> +				       min((int)tun16_to_cpu(tun, hdr.hdr.hdr_len), 64), true);
>  			WARN_ON_ONCE(1);
>  			return -EINVAL;
>  		}
>  
> -		if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
> +		if (copy_to_iter(&hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz)
>  			return -EFAULT;
>  
> -		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> +		iov_iter_advance(iter, vnet_hdr_sz - vnet_hdr_content_sz);
>  	}
>  
>  	if (vlan_hlen) {
> @@ -3007,24 +3081,27 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>  	return ret;
>  }
>  
> -static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
> -			void __user *data)
> +static struct bpf_prog *tun_set_ebpf(struct tun_struct *tun,
> +				     struct tun_prog __rcu **prog_p,
> +				     void __user *data)
>  {
>  	struct bpf_prog *prog;
>  	int fd;
> +	int ret;
>  
>  	if (copy_from_user(&fd, data, sizeof(fd)))
> -		return -EFAULT;
> +		return ERR_PTR(-EFAULT);
>  
>  	if (fd == -1) {
>  		prog = NULL;
>  	} else {
>  		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
>  		if (IS_ERR(prog))
> -			return PTR_ERR(prog);
> +			return prog;
>  	}
>  
> -	return __tun_set_ebpf(tun, prog_p, prog);
> +	ret = __tun_set_ebpf(tun, prog_p, prog);
> +	return ret ? ERR_PTR(ret) : prog;
>  }
>  
>  /* Return correct value for tun->dev->addr_len based on tun->dev->type. */
> @@ -3082,6 +3159,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  	int le;
>  	int ret;
>  	bool do_notify = false;
> +	struct bpf_prog *bpf_ret;
> +	struct tun_vnet_hash vnet_hash;
> +	u16 vnet_hash_indirection_table[TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH];
> +	u8 vnet_hash_key[TUN_VNET_HASH_MAX_KEY_SIZE];
> +	size_t len;
>  
>  	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
>  	    (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
> @@ -3295,7 +3377,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  			ret = -EFAULT;
>  			break;
>  		}
> -		if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
> +		if (vnet_hdr_sz <
> +		    (int)((tun->vnet_hash.flags & TUN_VNET_HASH_REPORT) ?
> +			  sizeof(struct virtio_net_hdr_v1_hash) :
> +			  sizeof(struct virtio_net_hdr))) {
>  			ret = -EINVAL;
>  			break;
>  		}
> @@ -3314,10 +3399,16 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  			ret = -EFAULT;
>  			break;
>  		}
> -		if (le)
> +		if (le) {
>  			tun->flags |= TUN_VNET_LE;
> -		else
> +		} else {
> +			if (!tun_legacy_is_little_endian(tun)) {
> +				ret = -EINVAL;
> +				break;
> +			}
> +
>  			tun->flags &= ~TUN_VNET_LE;
> +		}
>  		break;
>  
>  	case TUNGETVNETBE:
> @@ -3360,11 +3451,17 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		break;
>  
>  	case TUNSETSTEERINGEBPF:
> -		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> +		bpf_ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> +		if (IS_ERR(bpf_ret))
> +			ret = PTR_ERR(bpf_ret);
> +		else if (bpf_ret)
> +			tun->vnet_hash.flags &= ~TUN_VNET_HASH_RSS;

what is this doing?

>  		break;
>  
>  	case TUNSETFILTEREBPF:
> -		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> +		bpf_ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> +		if (IS_ERR(bpf_ret))
> +			ret = PTR_ERR(bpf_ret);
>  		break;
>  
>  	case TUNSETCARRIER:
> @@ -3382,6 +3479,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		ret = open_related_ns(&net->ns, get_net_ns);
>  		break;
>  
> +	case TUNGETVNETHASHCAP:
> +		if (copy_to_user(argp, &tun_vnet_hash_cap,
> +				 sizeof(tun_vnet_hash_cap)))
> +			ret = -EFAULT;
> +		break;
> +
> +	case TUNSETVNETHASH:
> +		len = sizeof(vnet_hash);
> +		if (copy_from_user(&vnet_hash, argp, len)) {
> +			ret = -EFAULT;
> +			break;
> +		}


what if flags has some bits set you don't know how to handle?
should these be ignored as now or cause a failure?
these decisions all affect uapi.

> +
> +		if (((vnet_hash.flags & TUN_VNET_HASH_REPORT) &&
> +		     (tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) ||
> +		      !tun_is_little_endian(tun))) ||
> +		     vnet_hash.indirection_table_mask >=
> +		     TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH) {
> +			ret = -EINVAL;
> +			break;
> +		}

Given this is later used to index within an array one has to
be very careful about spectre things here, which this code isn't.


> +
> +		argp = (u8 __user *)argp + len;
> +		len = (vnet_hash.indirection_table_mask + 1) * 2;

comment pointer math tricks like this extensively please.

> +		if (copy_from_user(vnet_hash_indirection_table, argp, len)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		argp = (u8 __user *)argp + len;
> +		len = virtio_net_hash_key_length(vnet_hash.types);
> +
> +		if (copy_from_user(vnet_hash_key, argp, len)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		tun->vnet_hash = vnet_hash;
> +		memcpy(tun->vnet_hash_indirection_table,
> +		       vnet_hash_indirection_table,
> +		       (vnet_hash.indirection_table_mask + 1) * 2);
> +		memcpy(tun->vnet_hash_key, vnet_hash_key, len);
> +
> +		if (vnet_hash.flags & TUN_VNET_HASH_RSS)
> +			__tun_set_ebpf(tun, &tun->steering_prog, NULL);
> +
> +		break;
> +
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 287cdc81c939..dc591cd897c8 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -61,6 +61,8 @@
>  #define TUNSETFILTEREBPF _IOR('T', 225, int)
>  #define TUNSETCARRIER _IOW('T', 226, int)
>  #define TUNGETDEVNETNS _IO('T', 227)
> +#define TUNGETVNETHASHCAP _IO('T', 228)
> +#define TUNSETVNETHASH _IOW('T', 229, unsigned int)
>  
>  /* TUNSETIFF ifr flags */
>  #define IFF_TUN		0x0001
> @@ -115,4 +117,18 @@ struct tun_filter {
>  	__u8   addr[][ETH_ALEN];
>  };
>  
> +struct tun_vnet_hash_cap {
> +	__u16 max_indirection_table_length;
> +	__u32 types;
> +};
> +

There's hidden padding in this struct - not good, copy
will leak kernel info out.



> +#define TUN_VNET_HASH_RSS	0x01
> +#define TUN_VNET_HASH_REPORT	0x02


Do you intend to add more flags down the road?
How will userspace know what is supported?

> +struct tun_vnet_hash {
> +	__u8 flags;
> +	__u32 types;
> +	__u16 indirection_table_mask;
> +	__u16 unclassified_queue;
> +};
> +

Padding here too. Best avoided.

In any case, document UAPI please.


>  #endif /* _UAPI__IF_TUN_H */
> -- 
> 2.42.0


WARNING: multiple messages have this Message-ID (diff)
From: "Michael S. Tsirkin" <mst@redhat.com>
To: Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
	Jason Wang <jasowang@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
	linux-kselftest@vger.kernel.org, bpf@vger.kernel.org,
	davem@davemloft.net, kuba@kernel.org, ast@kernel.org,
	daniel@iogearbox.net, andrii@kernel.org, kafai@fb.com,
	songliubraving@fb.com, yhs@fb.com, john.fastabend@gmail.com,
	kpsingh@kernel.org, rdunlap@infradead.org, willemb@google.com,
	gustavoars@kernel.org, herbert@gondor.apana.org.au,
	steffen.klassert@secunet.com, nogikh@google.com,
	pablo@netfilter.org, decui@microsoft.com, cai@lca.pw,
	jakub@cloudflare.com, elver@google.com, pabeni@redhat.com,
	Yuri Benditovich <yuri.benditovich@daynix.com>,
	Akihiko Odaki <akihiko.odaki@daynix.com>
Cc: xuanzhuo@linux.alibaba.com, shuah@kernel.org
Subject: Re: [RFC PATCH 5/7] tun: Introduce virtio-net hashing feature
Date: Mon, 9 Oct 2023 07:38:52 -0400	[thread overview]
Message-ID: <20231009071226-mutt-send-email-mst@kernel.org> (raw)
In-Reply-To: <20231008052101.144422-6-akihiko.odaki@daynix.com>

Akihiko Odaki sorry about reposts.
Having an email with two "@" in the CC list:
	 xuanzhuo@linux.alibaba.comshuah@kernel.org
tripped up mutt's reply-all for me and made it send to you only.

I am guessing you meant two addresses: xuanzhuo@linux.alibaba.com
and shuah@kernel.org.


On Sun, Oct 08, 2023 at 02:20:49PM +0900, Akihiko Odaki wrote:
> virtio-net have two usage of hashes: one is RSS and another is hash
> reporting. Conventionally the hash calculation was done by the VMM.
> However, computing the hash after the queue was chosen defeats the
> purpose of RSS.
> 
> Another approach is to use eBPF steering program. This approach has
> another downside: it cannot report the calculated hash due to the
> restrictive nature of eBPF.
> 
> Introduce the code to compute hashes to the kernel in order to overcome
> thse challenges. An alternative solution is to extend the eBPF steering
> program so that it will be able to report to the userspace, but it makes
> little sense to allow to implement different hashing algorithms with
> eBPF since the hash value reported by virtio-net is strictly defined by
> the specification.
> 
> The hash value already stored in sk_buff is not used and computed
> independently since it may have been computed in a way not conformant
> with the specification.
> 
> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
> ---
>  drivers/net/tun.c           | 187 ++++++++++++++++++++++++++++++++----
>  include/uapi/linux/if_tun.h |  16 +++
>  2 files changed, 182 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 89ab9efe522c..561a573cd008 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -171,6 +171,9 @@ struct tun_prog {
>  	struct bpf_prog *prog;
>  };
>  
> +#define TUN_VNET_HASH_MAX_KEY_SIZE 40
> +#define TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH 128
> +

where do these come from?

>  /* Since the socket were moved to tun_file, to preserve the behavior of persist
>   * device, socket filter, sndbuf and vnet header size were restore when the
>   * file were attached to a persist device.
> @@ -209,6 +212,9 @@ struct tun_struct {
>  	struct tun_prog __rcu *steering_prog;
>  	struct tun_prog __rcu *filter_prog;
>  	struct ethtool_link_ksettings link_ksettings;
> +	struct tun_vnet_hash vnet_hash;
> +	u16 vnet_hash_indirection_table[TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH];
> +	u32 vnet_hash_key[TUN_VNET_HASH_MAX_KEY_SIZE / 4];

That's quite a lot of data to add in this struct, and will be used
by a small minority of users. Are you pushing any hot data out of cache
with this? Why not allocate these as needed?

>  	/* init args */
>  	struct file *file;
>  	struct ifreq *ifr;
> @@ -219,6 +225,13 @@ struct veth {
>  	__be16 h_vlan_TCI;
>  };
>  
> +static const struct tun_vnet_hash_cap tun_vnet_hash_cap = {
> +	.max_indirection_table_length =
> +		TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH,
> +
> +	.types = VIRTIO_NET_SUPPORTED_HASH_TYPES
> +};
> +
>  static void tun_flow_init(struct tun_struct *tun);
>  static void tun_flow_uninit(struct tun_struct *tun);
>  
> @@ -320,10 +333,16 @@ static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
>  	if (get_user(be, argp))
>  		return -EFAULT;
>  
> -	if (be)
> +	if (be) {
> +		if (!(tun->flags & TUN_VNET_LE) &&
> +		    (tun->vnet_hash.flags & TUN_VNET_HASH_REPORT)) {
> +			return -EINVAL;
> +		}
> +
>  		tun->flags |= TUN_VNET_BE;
> -	else
> +	} else {
>  		tun->flags &= ~TUN_VNET_BE;
> +	}
>  
>  	return 0;
>  }
> @@ -558,15 +577,47 @@ static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
>  	return ret % numqueues;
>  }
>  
> +static u16 tun_vnet_select_queue(struct tun_struct *tun, struct sk_buff *skb)
> +{
> +	u32 value = qdisc_skb_cb(skb)->tun_vnet_hash_value;
> +	u32 numqueues;
> +	u32 index;
> +	u16 queue;
> +
> +	numqueues = READ_ONCE(tun->numqueues);
> +	if (!numqueues)
> +		return 0;
> +
> +	index = value & READ_ONCE(tun->vnet_hash.indirection_table_mask);
> +	queue = READ_ONCE(tun->vnet_hash_indirection_table[index]);
> +	if (!queue)
> +		queue = READ_ONCE(tun->vnet_hash.unclassified_queue);

Apparently 0 is an illegal queue value? You are making this part
of UAPI better document things like this.

> +
> +	return queue % numqueues;
> +}
> +
>  static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
>  			    struct net_device *sb_dev)
>  {
>  	struct tun_struct *tun = netdev_priv(dev);
> +	u8 vnet_hash_flags = READ_ONCE(tun->vnet_hash.flags);
> +	struct virtio_net_hash hash;
>  	u16 ret;
>  
> +	if (vnet_hash_flags & (TUN_VNET_HASH_RSS | TUN_VNET_HASH_REPORT)) {
> +		virtio_net_hash(skb, READ_ONCE(tun->vnet_hash.types),
> +				tun->vnet_hash_key, &hash);
> +

What are all these READ_ONCE things doing?
E.g. you seem to be very content to have tun->vnet_hash.types read twice apparently?
This is volatile which basically breaks all compiler's attempts
to optimize code - needs to be used judiciously.



> +		skb->tun_vnet_hash = true;
> +		qdisc_skb_cb(skb)->tun_vnet_hash_value = hash.value;
> +		qdisc_skb_cb(skb)->tun_vnet_hash_report = hash.report;
> +	}
> +
>  	rcu_read_lock();
>  	if (rcu_dereference(tun->steering_prog))
>  		ret = tun_ebpf_select_queue(tun, skb);
> +	else if (vnet_hash_flags & TUN_VNET_HASH_RSS)
> +		ret = tun_vnet_select_queue(tun, skb);
>  	else
>  		ret = tun_automq_select_queue(tun, skb);
>  	rcu_read_unlock();
> @@ -2088,10 +2139,15 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>  			    struct iov_iter *iter)
>  {
>  	struct tun_pi pi = { 0, skb->protocol };
> +	struct virtio_net_hash vnet_hash = {
> +		.value = qdisc_skb_cb(skb)->tun_vnet_hash_value,
> +		.report = qdisc_skb_cb(skb)->tun_vnet_hash_report
> +	};
>  	ssize_t total;
>  	int vlan_offset = 0;
>  	int vlan_hlen = 0;
>  	int vnet_hdr_sz = 0;
> +	size_t vnet_hdr_content_sz;
>  
>  	if (skb_vlan_tag_present(skb))
>  		vlan_hlen = VLAN_HLEN;
> @@ -2116,31 +2172,49 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>  	}
>  
>  	if (vnet_hdr_sz) {
> -		struct virtio_net_hdr gso;
> +		union {
> +			struct virtio_net_hdr hdr;
> +			struct virtio_net_hdr_v1_hash v1_hash_hdr;
> +		} hdr;
> +		int ret;
>  
>  		if (iov_iter_count(iter) < vnet_hdr_sz)
>  			return -EINVAL;
>  
> -		if (virtio_net_hdr_from_skb(skb, &gso,
> -					    tun_is_little_endian(tun), true,
> -					    vlan_hlen)) {
> +		if ((READ_ONCE(tun->vnet_hash.flags) & TUN_VNET_HASH_REPORT) &&
> +		    vnet_hdr_sz >= sizeof(hdr.v1_hash_hdr) &&
> +		    skb->tun_vnet_hash) {
> +			vnet_hdr_content_sz = sizeof(hdr.v1_hash_hdr);
> +			ret = virtio_net_hdr_v1_hash_from_skb(skb,
> +							      &hdr.v1_hash_hdr,
> +							      true,
> +							      vlan_hlen,
> +							      &vnet_hash);
> +		} else {
> +			vnet_hdr_content_sz = sizeof(hdr.hdr);
> +			ret = virtio_net_hdr_from_skb(skb, &hdr.hdr,
> +						      tun_is_little_endian(tun),
> +						      true, vlan_hlen);
> +		}
> +
> +		if (ret) {
>  			struct skb_shared_info *sinfo = skb_shinfo(skb);
>  			pr_err("unexpected GSO type: "
>  			       "0x%x, gso_size %d, hdr_len %d\n",
> -			       sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
> -			       tun16_to_cpu(tun, gso.hdr_len));
> +			       sinfo->gso_type, tun16_to_cpu(tun, hdr.hdr.gso_size),
> +			       tun16_to_cpu(tun, hdr.hdr.hdr_len));
>  			print_hex_dump(KERN_ERR, "tun: ",
>  				       DUMP_PREFIX_NONE,
>  				       16, 1, skb->head,
> -				       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
> +				       min((int)tun16_to_cpu(tun, hdr.hdr.hdr_len), 64), true);
>  			WARN_ON_ONCE(1);
>  			return -EINVAL;
>  		}
>  
> -		if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
> +		if (copy_to_iter(&hdr, vnet_hdr_content_sz, iter) != vnet_hdr_content_sz)
>  			return -EFAULT;
>  
> -		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> +		iov_iter_advance(iter, vnet_hdr_sz - vnet_hdr_content_sz);
>  	}
>  
>  	if (vlan_hlen) {
> @@ -3007,24 +3081,27 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>  	return ret;
>  }
>  
> -static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
> -			void __user *data)
> +static struct bpf_prog *tun_set_ebpf(struct tun_struct *tun,
> +				     struct tun_prog __rcu **prog_p,
> +				     void __user *data)
>  {
>  	struct bpf_prog *prog;
>  	int fd;
> +	int ret;
>  
>  	if (copy_from_user(&fd, data, sizeof(fd)))
> -		return -EFAULT;
> +		return ERR_PTR(-EFAULT);
>  
>  	if (fd == -1) {
>  		prog = NULL;
>  	} else {
>  		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
>  		if (IS_ERR(prog))
> -			return PTR_ERR(prog);
> +			return prog;
>  	}
>  
> -	return __tun_set_ebpf(tun, prog_p, prog);
> +	ret = __tun_set_ebpf(tun, prog_p, prog);
> +	return ret ? ERR_PTR(ret) : prog;
>  }
>  
>  /* Return correct value for tun->dev->addr_len based on tun->dev->type. */
> @@ -3082,6 +3159,11 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  	int le;
>  	int ret;
>  	bool do_notify = false;
> +	struct bpf_prog *bpf_ret;
> +	struct tun_vnet_hash vnet_hash;
> +	u16 vnet_hash_indirection_table[TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH];
> +	u8 vnet_hash_key[TUN_VNET_HASH_MAX_KEY_SIZE];
> +	size_t len;
>  
>  	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
>  	    (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
> @@ -3295,7 +3377,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  			ret = -EFAULT;
>  			break;
>  		}
> -		if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
> +		if (vnet_hdr_sz <
> +		    (int)((tun->vnet_hash.flags & TUN_VNET_HASH_REPORT) ?
> +			  sizeof(struct virtio_net_hdr_v1_hash) :
> +			  sizeof(struct virtio_net_hdr))) {
>  			ret = -EINVAL;
>  			break;
>  		}
> @@ -3314,10 +3399,16 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  			ret = -EFAULT;
>  			break;
>  		}
> -		if (le)
> +		if (le) {
>  			tun->flags |= TUN_VNET_LE;
> -		else
> +		} else {
> +			if (!tun_legacy_is_little_endian(tun)) {
> +				ret = -EINVAL;
> +				break;
> +			}
> +
>  			tun->flags &= ~TUN_VNET_LE;
> +		}
>  		break;
>  
>  	case TUNGETVNETBE:
> @@ -3360,11 +3451,17 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		break;
>  
>  	case TUNSETSTEERINGEBPF:
> -		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> +		bpf_ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
> +		if (IS_ERR(bpf_ret))
> +			ret = PTR_ERR(bpf_ret);
> +		else if (bpf_ret)
> +			tun->vnet_hash.flags &= ~TUN_VNET_HASH_RSS;

what is this doing?

>  		break;
>  
>  	case TUNSETFILTEREBPF:
> -		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> +		bpf_ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
> +		if (IS_ERR(bpf_ret))
> +			ret = PTR_ERR(bpf_ret);
>  		break;
>  
>  	case TUNSETCARRIER:
> @@ -3382,6 +3479,54 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
>  		ret = open_related_ns(&net->ns, get_net_ns);
>  		break;
>  
> +	case TUNGETVNETHASHCAP:
> +		if (copy_to_user(argp, &tun_vnet_hash_cap,
> +				 sizeof(tun_vnet_hash_cap)))
> +			ret = -EFAULT;
> +		break;
> +
> +	case TUNSETVNETHASH:
> +		len = sizeof(vnet_hash);
> +		if (copy_from_user(&vnet_hash, argp, len)) {
> +			ret = -EFAULT;
> +			break;
> +		}


what if flags has some bits set you don't know how to handle?
should these be ignored as now or cause a failure?
these decisions all affect uapi.

> +
> +		if (((vnet_hash.flags & TUN_VNET_HASH_REPORT) &&
> +		     (tun->vnet_hdr_sz < sizeof(struct virtio_net_hdr_v1_hash) ||
> +		      !tun_is_little_endian(tun))) ||
> +		     vnet_hash.indirection_table_mask >=
> +		     TUN_VNET_HASH_MAX_INDIRECTION_TABLE_LENGTH) {
> +			ret = -EINVAL;
> +			break;
> +		}

Given this is later used to index within an array one has to
be very careful about spectre things here, which this code isn't.


> +
> +		argp = (u8 __user *)argp + len;
> +		len = (vnet_hash.indirection_table_mask + 1) * 2;

comment pointer math tricks like this extensively please.

> +		if (copy_from_user(vnet_hash_indirection_table, argp, len)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		argp = (u8 __user *)argp + len;
> +		len = virtio_net_hash_key_length(vnet_hash.types);
> +
> +		if (copy_from_user(vnet_hash_key, argp, len)) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +
> +		tun->vnet_hash = vnet_hash;
> +		memcpy(tun->vnet_hash_indirection_table,
> +		       vnet_hash_indirection_table,
> +		       (vnet_hash.indirection_table_mask + 1) * 2);
> +		memcpy(tun->vnet_hash_key, vnet_hash_key, len);
> +
> +		if (vnet_hash.flags & TUN_VNET_HASH_RSS)
> +			__tun_set_ebpf(tun, &tun->steering_prog, NULL);
> +
> +		break;
> +
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
> index 287cdc81c939..dc591cd897c8 100644
> --- a/include/uapi/linux/if_tun.h
> +++ b/include/uapi/linux/if_tun.h
> @@ -61,6 +61,8 @@
>  #define TUNSETFILTEREBPF _IOR('T', 225, int)
>  #define TUNSETCARRIER _IOW('T', 226, int)
>  #define TUNGETDEVNETNS _IO('T', 227)
> +#define TUNGETVNETHASHCAP _IO('T', 228)
> +#define TUNSETVNETHASH _IOW('T', 229, unsigned int)
>  
>  /* TUNSETIFF ifr flags */
>  #define IFF_TUN		0x0001
> @@ -115,4 +117,18 @@ struct tun_filter {
>  	__u8   addr[][ETH_ALEN];
>  };
>  
> +struct tun_vnet_hash_cap {
> +	__u16 max_indirection_table_length;
> +	__u32 types;
> +};
> +

There's hidden padding in this struct - not good, copy
will leak kernel info out.



> +#define TUN_VNET_HASH_RSS	0x01
> +#define TUN_VNET_HASH_REPORT	0x02


Do you intend to add more flags down the road?
How will userspace know what is supported?

> +struct tun_vnet_hash {
> +	__u8 flags;
> +	__u32 types;
> +	__u16 indirection_table_mask;
> +	__u16 unclassified_queue;
> +};
> +

Padding here too. Best avoided.

In any case, document UAPI please.


>  #endif /* _UAPI__IF_TUN_H */
> -- 
> 2.42.0

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

  parent reply	other threads:[~2023-10-09 11:39 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-10-08  5:20 [RFC PATCH 0/7] tun: Introduce virtio-net hashing feature Akihiko Odaki
2023-10-08  5:20 ` [RFC PATCH 1/7] net: skbuff: Add tun_vnet_hash flag Akihiko Odaki
2023-10-08 18:39   ` Willem de Bruijn
2023-10-08 18:39     ` Willem de Bruijn
2023-10-08 19:52     ` Akihiko Odaki
2023-10-08  5:20 ` [RFC PATCH 2/7] net/core: Ensure qdisc_skb_cb will not be overwritten Akihiko Odaki
2023-10-08  5:20 ` [RFC PATCH 3/7] net: sched: Add members to qdisc_skb_cb Akihiko Odaki
2023-10-15 15:04   ` kernel test robot
2023-10-08  5:20 ` [RFC PATCH 4/7] virtio_net: Add functions for hashing Akihiko Odaki
2023-10-08  5:20 ` [RFC PATCH 5/7] tun: Introduce virtio-net hashing feature Akihiko Odaki
2023-10-08 19:07   ` Willem de Bruijn
2023-10-08 19:07     ` Willem de Bruijn
2023-10-08 20:04     ` Akihiko Odaki
2023-10-08 20:08       ` Willem de Bruijn
2023-10-08 20:08         ` Willem de Bruijn
2023-10-08 20:46         ` Akihiko Odaki
2023-10-09  8:04           ` Willem de Bruijn
2023-10-09  8:04             ` Willem de Bruijn
2023-10-09  8:57             ` Akihiko Odaki
2023-10-09  9:57               ` Willem de Bruijn
2023-10-09  9:57                 ` Willem de Bruijn
2023-10-09 10:01                 ` Akihiko Odaki
2023-10-09 10:06                   ` Willem de Bruijn
2023-10-09 10:06                     ` Willem de Bruijn
2023-10-09 10:12                     ` Akihiko Odaki
2023-10-09 10:44                       ` Willem de Bruijn
2023-10-09 10:44                         ` Willem de Bruijn
2023-10-10  1:52                         ` Akihiko Odaki
2023-10-10  5:45                           ` Jason Wang
2023-10-10  5:45                             ` Jason Wang
2023-10-10  5:51                             ` Akihiko Odaki
2023-10-10  6:00                               ` Jason Wang
2023-10-10  6:00                                 ` Jason Wang
2023-10-10  6:19                                 ` Akihiko Odaki
2023-10-11  3:18                                   ` Jason Wang
2023-10-11  3:18                                     ` Jason Wang
2023-10-11  3:57                                     ` Akihiko Odaki
2023-10-09  8:13   ` Willem de Bruijn
2023-10-09  8:13     ` Willem de Bruijn
2023-10-09  8:44     ` Akihiko Odaki
2023-10-09  9:54       ` Willem de Bruijn
2023-10-09  9:54         ` Willem de Bruijn
2023-10-09 10:05         ` Akihiko Odaki
2023-10-09 10:07           ` Willem de Bruijn
2023-10-09 10:07             ` Willem de Bruijn
2023-10-09 10:11             ` Akihiko Odaki
2023-10-09 10:32               ` Willem de Bruijn
2023-10-09 10:32                 ` Willem de Bruijn
2023-10-09 11:50       ` Michael S. Tsirkin
2023-10-09 11:50         ` Michael S. Tsirkin
2023-10-10  2:34         ` Akihiko Odaki
2023-10-09 11:38   ` Michael S. Tsirkin [this message]
2023-10-09 11:38     ` Michael S. Tsirkin
2023-10-10  2:32     ` Akihiko Odaki
2023-10-08  5:20 ` [RFC PATCH 6/7] selftest: tun: Add tests for virtio-net hashing Akihiko Odaki
2023-10-20  1:52   ` kernel test robot
2023-10-08  5:20 ` [RFC PATCH 7/7] vhost_net: Support VIRTIO_NET_F_HASH_REPORT Akihiko Odaki
2023-10-08 18:36 ` [RFC PATCH 0/7] tun: Introduce virtio-net hashing feature Willem de Bruijn
2023-10-08 18:36   ` Willem de Bruijn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231009071226-mutt-send-email-mst@kernel.org \
    --to=mst@redhat.com \
    --cc=akihiko.odaki@daynix.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=cai@lca.pw \
    --cc=daniel@iogearbox.net \
    --cc=davem@davemloft.net \
    --cc=decui@microsoft.com \
    --cc=elver@google.com \
    --cc=gustavoars@kernel.org \
    --cc=herbert@gondor.apana.org.au \
    --cc=jakub@cloudflare.com \
    --cc=jasowang@redhat.com \
    --cc=john.fastabend@gmail.com \
    --cc=kafai@fb.com \
    --cc=kpsingh@kernel.org \
    --cc=kuba@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=nogikh@google.com \
    --cc=pabeni@redhat.com \
    --cc=pablo@netfilter.org \
    --cc=rdunlap@infradead.org \
    --cc=shuah@kernel.org \
    --cc=songliubraving@fb.com \
    --cc=steffen.klassert@secunet.com \
    --cc=virtualization@lists.linux-foundation.org \
    --cc=willemb@google.com \
    --cc=willemdebruijn.kernel@gmail.com \
    --cc=xuanzhuo@linux.alibaba.com \
    --cc=yhs@fb.com \
    --cc=yuri.benditovich@daynix.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.