From: Akihiko Odaki <akihiko.odaki@daynix.com>
To: Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
Jonathan Corbet <corbet@lwn.net>,
Jason Wang <jasowang@redhat.com>,
"David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
"Michael S. Tsirkin" <mst@redhat.com>,
Xuan Zhuo <xuanzhuo@linux.alibaba.com>,
Shuah Khan <shuah@kernel.org>,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
netdev@vger.kernel.org, kvm@vger.kernel.org,
virtualization@lists.linux-foundation.org,
linux-kselftest@vger.kernel.org,
Yuri Benditovich <yuri.benditovich@daynix.com>,
Andrew Melnychenko <andrew@daynix.com>
Subject: Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS
Date: Tue, 24 Sep 2024 10:57:49 +0200 [thread overview]
Message-ID: <cc2bc65b-d75a-4232-a0bd-9c759aba8aba@daynix.com> (raw)
In-Reply-To: <694a8f81-616e-47d0-8185-5b73626c4109@daynix.com>
On 2024/09/24 10:56, Akihiko Odaki wrote:
> On 2024/09/18 15:28, Willem de Bruijn wrote:
>> Akihiko Odaki wrote:
>>> RSS is a receive steering algorithm that can be negotiated to use with
>>> virtio_net. Conventionally the hash calculation was done by the VMM.
>>> However, computing the hash after the queue was chosen defeats the
>>> purpose of RSS.
>>>
>>> Another approach is to use eBPF steering program. This approach has
>>> another downside: it cannot report the calculated hash due to the
>>> restrictive nature of eBPF steering program.
>>>
>>> Introduce the code to perform RSS to the kernel in order to overcome
>>> these challenges. An alternative solution is to extend the eBPF steering
>>> program so that it will be able to report to the userspace, but I didn't
>>> opt for it because the current mechanism of eBPF steering program is
>>> not practical to extend as is, since it relies on legacy context rewriting, and
>>> introducing kfunc-based eBPF will result in non-UAPI dependency while
>>> the other relevant virtualization APIs such as KVM and vhost_net are
>>> UAPIs.
>>>
>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>>> ---
>>> drivers/net/tun.c | 119
>>> +++++++++++++++++++++++++++++++++++++++-----
>>> include/uapi/linux/if_tun.h | 27 ++++++++++
>>> 2 files changed, 133 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index b8fcd71becac..5a429b391144 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -175,6 +175,9 @@ struct tun_prog {
>>> struct tun_vnet_hash_container {
>>> struct tun_vnet_hash common;
>>> + struct tun_vnet_hash_rss rss;
>>> + __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
>>> + u16 rss_indirection_table[];
>>> };
>>> /* Since the socket were moved to tun_file, to preserve the
>>> behavior of persist
>>> @@ -227,7 +230,7 @@ struct veth {
>>> };
>>> static const struct tun_vnet_hash tun_vnet_hash_cap = {
>>> - .flags = TUN_VNET_HASH_REPORT,
>>> + .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>>> .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>>> };
>>> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct
>>> tun_struct *tun, struct sk_buff *skb)
>>> return ret % numqueues;
>>> }
>>> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
>>> + struct sk_buff *skb,
>>> + const struct tun_vnet_hash_container *vnet_hash)
>>> +{
>>> + struct tun_vnet_hash_ext *ext;
>>> + struct virtio_net_hash hash;
>>> + u32 numqueues = READ_ONCE(tun->numqueues);
>>> + u16 txq, index;
>>> +
>>> + if (!numqueues)
>>> + return 0;
>>> +
>>> + if (!virtio_net_hash_rss(skb, vnet_hash->common.types,
>>> vnet_hash->rss_key,
>>> + &hash))
>>> + return vnet_hash->rss.unclassified_queue % numqueues;
>>> +
>>> + if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
>>> + ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
>>> + if (ext) {
>>> + ext->value = hash.value;
>>> + ext->report = hash.report;
>>> + }
>>> + }
>>> +
>>> + index = hash.value & vnet_hash->rss.indirection_table_mask;
>>> + txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
>>> +
>>> + return txq % numqueues;
>>> +}
>>> +
>>> static u16 tun_select_queue(struct net_device *dev, struct sk_buff
>>> *skb,
>>> struct net_device *sb_dev)
>>> {
>>> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device
>>> *dev, struct sk_buff *skb,
>>> } else {
>>> struct tun_vnet_hash_container *vnet_hash =
>>> rcu_dereference(tun->vnet_hash);
>>> - ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>> + if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
>>> + ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
>>> + else
>>> + ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>> }
>>> rcu_read_unlock();
>>> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file,
>>> struct ifreq *ifr)
>>> }
>>> static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog
>>> __rcu **prog_p,
>>> - void __user *data)
>>> + int fd)
>>> {
>>> struct bpf_prog *prog;
>>> - int fd;
>>> -
>>> - if (copy_from_user(&fd, data, sizeof(fd)))
>>> - return -EFAULT;
>>> if (fd == -1) {
>>> prog = NULL;
>>> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file,
>>> unsigned int cmd,
>>> int ifindex;
>>> int sndbuf;
>>> int vnet_hdr_sz;
>>> + int fd;
>>> int le;
>>> int ret;
>>> bool do_notify = false;
>>> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file
>>> *file, unsigned int cmd,
>>> break;
>>> case TUNSETSTEERINGEBPF:
>>> - ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
>>> + if (get_user(fd, (int __user *)argp)) {
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> +
>>> + vnet_hash = rtnl_dereference(tun->vnet_hash);
>>> + if (fd != -1 && vnet_hash && (vnet_hash->common.flags &
>>> TUN_VNET_HASH_RSS)) {
>>> + ret = -EBUSY;
>>> + break;
>>> + }
>>> +
>>> + ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>>> break;
>>> case TUNSETFILTEREBPF:
>>> - ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
>>> + if (get_user(fd, (int __user *)argp)) {
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> +
>>> + ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>>> break;
>>> case TUNSETCARRIER:
>>> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file
>>> *file, unsigned int cmd,
>>> break;
>>> }
>>> - vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>>> - if (!vnet_hash) {
>>> - ret = -ENOMEM;
>>> - break;
>>> + if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
>>> + struct tun_vnet_hash_rss rss;
>>> + size_t indirection_table_size;
>>> + size_t key_size;
>>> + size_t size;
>>> +
>>> + if (tun->steering_prog) {
>>> + ret = -EBUSY;
>>> + break;
>>> + }
>>> +
>>> + if (copy_from_user(&rss, argp, sizeof(rss))) {
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> + argp = (struct tun_vnet_hash_rss __user *)argp + 1;
>>> +
>>> + indirection_table_size =
>>> ((size_t)rss.indirection_table_mask + 1) * 2;
>>
>> Why make uapi a mask rather than a length?
>
> It follows the virtio specification. It is actually used as a mask in
> tun_vnet_rss_select_queue().
>
>>
>> Also, is there an upper-bound sanity check on the length of this input from
>> userspace?
>
> No, but the maximum size is limited to 128 bytes because the
> indirection_table_mask is 16-bit and it indexes an array of 16-bit
> integers.
Not 128 bytes but 128 KiB.
>
>>
>>> + key_size =
>>> virtio_net_hash_key_length(vnet_hash_common.types);
>>> + size = sizeof(*vnet_hash) + indirection_table_size +
>>> key_size;
>>
>> key_size is included in sizeof(*vnet_hash), always
>> VIRTIO_NET_RSS_MAX_KEY_SIZE.
>
> I will fix this by replacing it with:
> struct_size(vnet_hash, rss_indirection_table,
> (size_t)rss.indirection_table_mask + 1)
>
> Regards,
> Akihiko Odaki
>
>>> +
>>> + vnet_hash = kmalloc(size, GFP_KERNEL);
>>> + if (!vnet_hash) {
>>> + ret = -ENOMEM;
>>> + break;
>>> + }
>>> +
>>> + if (copy_from_user(vnet_hash->rss_indirection_table,
>>> + argp, indirection_table_size)) {
>>> + kfree(vnet_hash);
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> + argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
>>> +
>>> + if (copy_from_user(vnet_hash->rss_key, argp, key_size)) {
>>> + kfree(vnet_hash);
>>> + ret = -EFAULT;
>>> + break;
>>> + }
>>> +
>>> + vnet_hash->rss = rss;
>>> + } else {
>>> + vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>>> + if (!vnet_hash) {
>>> + ret = -ENOMEM;
>>> + break;
>>> + }
>>> }
>>> vnet_hash->common = vnet_hash_common;
>>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>>> index 1561e8ce0a0a..1c130409db5d 100644
>>> --- a/include/uapi/linux/if_tun.h
>>> +++ b/include/uapi/linux/if_tun.h
>>> @@ -75,6 +75,14 @@
>>> *
>>> * The argument is a pointer to &struct tun_vnet_hash.
>>> *
>>> + * The argument is a pointer to the compound of the following in
>>> order if
>>> + * %TUN_VNET_HASH_RSS is set:
>>> + *
>>> + * 1. &struct tun_vnet_hash
>>> + * 2. &struct tun_vnet_hash_rss
>>> + * 3. Indirection table
>>> + * 4. Key
>>> + *
>>> * %TUNSETVNETHDRSZ ioctl must be called with a number greater than
>>> or equal to
>>> * the size of &struct virtio_net_hdr_v1_hash before calling this
>>> ioctl with
>>> * %TUN_VNET_HASH_REPORT.
>>> @@ -144,6 +152,13 @@ struct tun_filter {
>>> */
>>> #define TUN_VNET_HASH_REPORT 0x0001
>>> +/**
>>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
>>> + *
>>> + * This is mutually exclusive with eBPF steering program.
>>> + */
>>> +#define TUN_VNET_HASH_RSS 0x0002
>>> +
>>> /**
>>> * struct tun_vnet_hash - virtio_net hashing configuration
>>> * @flags:
>>> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
>>> __u32 types;
>>> };
>>> +/**
>>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
>>> + * @indirection_table_mask:
>>> + * Bitmask to be applied to the indirection table index
>>> + * @unclassified_queue:
>>> + * The index of the queue to place unclassified packets in
>>> + */
>>> +struct tun_vnet_hash_rss {
>>> + __u16 indirection_table_mask;
>>> + __u16 unclassified_queue;
>>> +};
>>> +
>>> #endif /* _UAPI__IF_TUN_H */
>>>
>>> --
>>> 2.46.0
>>>
>>
>>
next prev parent reply other threads:[~2024-09-24 8:57 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-15 1:17 [PATCH RFC v3 0/9] tun: Introduce virtio-net hashing feature Akihiko Odaki
2024-09-15 1:17 ` [PATCH RFC v3 1/9] skbuff: Introduce SKB_EXT_TUN_VNET_HASH Akihiko Odaki
2024-09-18 12:46 ` Willem de Bruijn
2024-09-15 1:17 ` [PATCH RFC v3 2/9] virtio_net: Add functions for hashing Akihiko Odaki
2024-09-16 7:12 ` gur.stavi
2024-09-16 8:01 ` gur.stavi
2024-09-19 12:51 ` Akihiko Odaki
2024-09-16 8:46 ` gur.stavi
2024-09-18 12:50 ` Willem de Bruijn
2024-09-23 18:15 ` Akihiko Odaki
2024-09-15 1:17 ` [PATCH RFC v3 3/9] net: flow_dissector: Export flow_keys_dissector_symmetric Akihiko Odaki
2024-09-15 1:17 ` [PATCH RFC v3 4/9] tap: Pad virtio header with zero Akihiko Odaki
2024-09-18 12:52 ` Willem de Bruijn
2024-09-15 1:17 ` [PATCH RFC v3 5/9] tun: " Akihiko Odaki
2024-09-15 1:17 ` [PATCH RFC v3 6/9] tun: Introduce virtio-net hash reporting feature Akihiko Odaki
2024-09-18 13:17 ` Willem de Bruijn
2024-09-23 18:35 ` Akihiko Odaki
2024-09-15 1:17 ` [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS Akihiko Odaki
2024-09-18 13:28 ` Willem de Bruijn
2024-09-24 8:56 ` Akihiko Odaki
2024-09-24 8:57 ` Akihiko Odaki [this message]
2024-09-15 1:17 ` [PATCH RFC v3 8/9] selftest: tun: Add tests for virtio-net hashing Akihiko Odaki
2024-09-15 1:17 ` [PATCH RFC v3 9/9] vhost/net: Support VIRTIO_NET_F_HASH_REPORT Akihiko Odaki
2024-09-15 19:48 ` [PATCH RFC v3 0/9] tun: Introduce virtio-net hashing feature Stephen Hemminger
2024-09-23 17:57 ` Akihiko Odaki
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=cc2bc65b-d75a-4232-a0bd-9c759aba8aba@daynix.com \
--to=akihiko.odaki@daynix.com \
--cc=andrew@daynix.com \
--cc=corbet@lwn.net \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=jasowang@redhat.com \
--cc=kuba@kernel.org \
--cc=kvm@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-kselftest@vger.kernel.org \
--cc=mst@redhat.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=shuah@kernel.org \
--cc=virtualization@lists.linux-foundation.org \
--cc=willemdebruijn.kernel@gmail.com \
--cc=xuanzhuo@linux.alibaba.com \
--cc=yuri.benditovich@daynix.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox