Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Akihiko Odaki <akihiko.odaki@daynix.com>
To: Willem de Bruijn <willemdebruijn.kernel@gmail.com>,
	Jonathan Corbet <corbet@lwn.net>,
	Jason Wang <jasowang@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	Xuan Zhuo <xuanzhuo@linux.alibaba.com>,
	Shuah Khan <shuah@kernel.org>,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, kvm@vger.kernel.org,
	virtualization@lists.linux-foundation.org,
	linux-kselftest@vger.kernel.org,
	Yuri Benditovich <yuri.benditovich@daynix.com>,
	Andrew Melnychenko <andrew@daynix.com>
Subject: Re: [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS
Date: Tue, 24 Sep 2024 10:57:49 +0200	[thread overview]
Message-ID: <cc2bc65b-d75a-4232-a0bd-9c759aba8aba@daynix.com> (raw)
In-Reply-To: <694a8f81-616e-47d0-8185-5b73626c4109@daynix.com>



On 2024/09/24 10:56, Akihiko Odaki wrote:
> On 2024/09/18 15:28, Willem de Bruijn wrote:
>> Akihiko Odaki wrote:
>>> RSS is a receive steering algorithm that can be negotiated to use with
>>> virtio_net. Conventionally the hash calculation was done by the VMM.
>>> However, computing the hash after the queue was chosen defeats the
>>> purpose of RSS.
>>>
>>> Another approach is to use an eBPF steering program. This approach has
>>> another downside: it cannot report the calculated hash due to the
>>> restrictive nature of eBPF steering programs.
>>>
>>> Introduce the code to perform RSS in the kernel in order to overcome
>>> these challenges. An alternative solution is to extend the eBPF steering
>>> program so that it will be able to report to the userspace, but I didn't
>>> opt for it because extending the current mechanism of eBPF steering
>>> program as is would be problematic, since it relies on legacy context
>>> rewriting, and introducing kfunc-based eBPF will result in non-UAPI
>>> dependency while the other relevant virtualization APIs such as KVM and
>>> vhost_net are UAPIs.
>>>
>>> Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
>>> ---
>>>   drivers/net/tun.c           | 119 
>>> +++++++++++++++++++++++++++++++++++++++-----
>>>   include/uapi/linux/if_tun.h |  27 ++++++++++
>>>   2 files changed, 133 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index b8fcd71becac..5a429b391144 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -175,6 +175,9 @@ struct tun_prog {
>>>   struct tun_vnet_hash_container {
>>>       struct tun_vnet_hash common;
>>> +    struct tun_vnet_hash_rss rss;
>>> +    __be32 rss_key[VIRTIO_NET_RSS_MAX_KEY_SIZE];
>>> +    u16 rss_indirection_table[];
>>>   };
>>>   /* Since the socket were moved to tun_file, to preserve the 
>>> behavior of persist
>>> @@ -227,7 +230,7 @@ struct veth {
>>>   };
>>>   static const struct tun_vnet_hash tun_vnet_hash_cap = {
>>> -    .flags = TUN_VNET_HASH_REPORT,
>>> +    .flags = TUN_VNET_HASH_REPORT | TUN_VNET_HASH_RSS,
>>>       .types = VIRTIO_NET_SUPPORTED_HASH_TYPES
>>>   };
>>> @@ -591,6 +594,36 @@ static u16 tun_ebpf_select_queue(struct 
>>> tun_struct *tun, struct sk_buff *skb)
>>>       return ret % numqueues;
>>>   }
>>> +static u16 tun_vnet_rss_select_queue(struct tun_struct *tun,
>>> +                     struct sk_buff *skb,
>>> +                     const struct tun_vnet_hash_container *vnet_hash)
>>> +{
>>> +    struct tun_vnet_hash_ext *ext;
>>> +    struct virtio_net_hash hash;
>>> +    u32 numqueues = READ_ONCE(tun->numqueues);
>>> +    u16 txq, index;
>>> +
>>> +    if (!numqueues)
>>> +        return 0;
>>> +
>>> +    if (!virtio_net_hash_rss(skb, vnet_hash->common.types, 
>>> vnet_hash->rss_key,
>>> +                 &hash))
>>> +        return vnet_hash->rss.unclassified_queue % numqueues;
>>> +
>>> +    if (vnet_hash->common.flags & TUN_VNET_HASH_REPORT) {
>>> +        ext = skb_ext_add(skb, SKB_EXT_TUN_VNET_HASH);
>>> +        if (ext) {
>>> +            ext->value = hash.value;
>>> +            ext->report = hash.report;
>>> +        }
>>> +    }
>>> +
>>> +    index = hash.value & vnet_hash->rss.indirection_table_mask;
>>> +    txq = READ_ONCE(vnet_hash->rss_indirection_table[index]);
>>> +
>>> +    return txq % numqueues;
>>> +}
>>> +
>>>   static u16 tun_select_queue(struct net_device *dev, struct sk_buff 
>>> *skb,
>>>                   struct net_device *sb_dev)
>>>   {
>>> @@ -603,7 +636,10 @@ static u16 tun_select_queue(struct net_device 
>>> *dev, struct sk_buff *skb,
>>>       } else {
>>>           struct tun_vnet_hash_container *vnet_hash = 
>>> rcu_dereference(tun->vnet_hash);
>>> -        ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>> +        if (vnet_hash && (vnet_hash->common.flags & TUN_VNET_HASH_RSS))
>>> +            ret = tun_vnet_rss_select_queue(tun, skb, vnet_hash);
>>> +        else
>>> +            ret = tun_automq_select_queue(tun, skb, vnet_hash);
>>>       }
>>>       rcu_read_unlock();
>>> @@ -3085,13 +3121,9 @@ static int tun_set_queue(struct file *file, 
>>> struct ifreq *ifr)
>>>   }
>>>   static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog 
>>> __rcu **prog_p,
>>> -            void __user *data)
>>> +            int fd)
>>>   {
>>>       struct bpf_prog *prog;
>>> -    int fd;
>>> -
>>> -    if (copy_from_user(&fd, data, sizeof(fd)))
>>> -        return -EFAULT;
>>>       if (fd == -1) {
>>>           prog = NULL;
>>> @@ -3157,6 +3189,7 @@ static long __tun_chr_ioctl(struct file *file, 
>>> unsigned int cmd,
>>>       int ifindex;
>>>       int sndbuf;
>>>       int vnet_hdr_sz;
>>> +    int fd;
>>>       int le;
>>>       int ret;
>>>       bool do_notify = false;
>>> @@ -3460,11 +3493,27 @@ static long __tun_chr_ioctl(struct file 
>>> *file, unsigned int cmd,
>>>           break;
>>>       case TUNSETSTEERINGEBPF:
>>> -        ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
>>> +        if (get_user(fd, (int __user *)argp)) {
>>> +            ret = -EFAULT;
>>> +            break;
>>> +        }
>>> +
>>> +        vnet_hash = rtnl_dereference(tun->vnet_hash);
>>> +        if (fd != -1 && vnet_hash && (vnet_hash->common.flags & 
>>> TUN_VNET_HASH_RSS)) {
>>> +            ret = -EBUSY;
>>> +            break;
>>> +        }
>>> +
>>> +        ret = tun_set_ebpf(tun, &tun->steering_prog, fd);
>>>           break;
>>>       case TUNSETFILTEREBPF:
>>> -        ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
>>> +        if (get_user(fd, (int __user *)argp)) {
>>> +            ret = -EFAULT;
>>> +            break;
>>> +        }
>>> +
>>> +        ret = tun_set_ebpf(tun, &tun->filter_prog, fd);
>>>           break;
>>>       case TUNSETCARRIER:
>>> @@ -3496,10 +3545,54 @@ static long __tun_chr_ioctl(struct file 
>>> *file, unsigned int cmd,
>>>               break;
>>>           }
>>> -        vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>>> -        if (!vnet_hash) {
>>> -            ret = -ENOMEM;
>>> -            break;
>>> +        if (vnet_hash_common.flags & TUN_VNET_HASH_RSS) {
>>> +            struct tun_vnet_hash_rss rss;
>>> +            size_t indirection_table_size;
>>> +            size_t key_size;
>>> +            size_t size;
>>> +
>>> +            if (tun->steering_prog) {
>>> +                ret = -EBUSY;
>>> +                break;
>>> +            }
>>> +
>>> +            if (copy_from_user(&rss, argp, sizeof(rss))) {
>>> +                ret = -EFAULT;
>>> +                break;
>>> +            }
>>> +            argp = (struct tun_vnet_hash_rss __user *)argp + 1;
>>> +
>>> +            indirection_table_size = 
>>> ((size_t)rss.indirection_table_mask + 1) * 2;
>>
>> Why make uapi a mask rather than a length?
> 
> It follows the virtio specification. It is actually used as a mask in 
> tun_vnet_rss_select_queue().
> 
>>
>> Also is there an upper bound sanity check on the length of this input
>> from userspace?
> 
> No, but the maximum size is limited to 128 bytes because the 
> indirection_table_mask is 16-bit and it indexes an array of 16-bit 
> integers.

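Correction: the maximum size is not 128 bytes but 128 KiB, since a 16-bit
indirection_table_mask allows up to 65536 entries of 16 bits each.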

> 
>>
>>> +            key_size = 
>>> virtio_net_hash_key_length(vnet_hash_common.types);
>>> +            size = sizeof(*vnet_hash) + indirection_table_size + 
>>> key_size;
>>
>> key_size is included in sizeof(*vnet_hash), always
>> VIRTIO_NET_RSS_MAX_KEY_SIZE.
> 
> I will fix this by replacing it with:
> struct_size(vnet_hash, rss_indirection_table,
>              (size_t)rss.indirection_table_mask + 1)
> 
> Regards,
> Akihiko Odaki
> 
>>> +
>>> +            vnet_hash = kmalloc(size, GFP_KERNEL);
>>> +            if (!vnet_hash) {
>>> +                ret = -ENOMEM;
>>> +                break;
>>> +            }
>>> +
>>> +            if (copy_from_user(vnet_hash->rss_indirection_table,
>>> +                       argp, indirection_table_size)) {
>>> +                kfree(vnet_hash);
>>> +                ret = -EFAULT;
>>> +                break;
>>> +            }
>>> +            argp = (u16 __user *)argp + rss.indirection_table_mask + 1;
>>> +
>>> +            if (copy_from_user(vnet_hash->rss_key, argp, key_size)) {
>>> +                kfree(vnet_hash);
>>> +                ret = -EFAULT;
>>> +                break;
>>> +            }
>>> +
>>> +            vnet_hash->rss = rss;
>>> +        } else {
>>> +            vnet_hash = kmalloc(sizeof(vnet_hash->common), GFP_KERNEL);
>>> +            if (!vnet_hash) {
>>> +                ret = -ENOMEM;
>>> +                break;
>>> +            }
>>>           }
>>>           vnet_hash->common = vnet_hash_common;
>>> diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
>>> index 1561e8ce0a0a..1c130409db5d 100644
>>> --- a/include/uapi/linux/if_tun.h
>>> +++ b/include/uapi/linux/if_tun.h
>>> @@ -75,6 +75,14 @@
>>>    *
>>>    * The argument is a pointer to &struct tun_vnet_hash.
>>>    *
>>> + * The argument is a pointer to the compound of the following in 
>>> order if
>>> + * %TUN_VNET_HASH_RSS is set:
>>> + *
>>> + * 1. &struct tun_vnet_hash
>>> + * 2. &struct tun_vnet_hash_rss
>>> + * 3. Indirection table
>>> + * 4. Key
>>> + *
>>>    * %TUNSETVNETHDRSZ ioctl must be called with a number greater than 
>>> or equal to
>>>    * the size of &struct virtio_net_hdr_v1_hash before calling this 
>>> ioctl with
>>>    * %TUN_VNET_HASH_REPORT.
>>> @@ -144,6 +152,13 @@ struct tun_filter {
>>>    */
>>>   #define TUN_VNET_HASH_REPORT    0x0001
>>> +/**
>>> + * define TUN_VNET_HASH_RSS - Request virtio_net RSS
>>> + *
>>> + * This is mutually exclusive with eBPF steering program.
>>> + */
>>> +#define TUN_VNET_HASH_RSS    0x0002
>>> +
>>>   /**
>>>    * struct tun_vnet_hash - virtio_net hashing configuration
>>>    * @flags:
>>> @@ -159,4 +174,16 @@ struct tun_vnet_hash {
>>>       __u32 types;
>>>   };
>>> +/**
>>> + * struct tun_vnet_hash_rss - virtio_net RSS configuration
>>> + * @indirection_table_mask:
>>> + *        Bitmask to be applied to the indirection table index
>>> + * @unclassified_queue:
>>> + *        The index of the queue to place unclassified packets in
>>> + */
>>> +struct tun_vnet_hash_rss {
>>> +    __u16 indirection_table_mask;
>>> +    __u16 unclassified_queue;
>>> +};
>>> +
>>>   #endif /* _UAPI__IF_TUN_H */
>>>
>>> -- 
>>> 2.46.0
>>>
>>
>>

next prev parent reply	other threads:[~2024-09-24  8:57 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-15  1:17 [PATCH RFC v3 0/9] tun: Introduce virtio-net hashing feature Akihiko Odaki
2024-09-15  1:17 ` [PATCH RFC v3 1/9] skbuff: Introduce SKB_EXT_TUN_VNET_HASH Akihiko Odaki
2024-09-18 12:46   ` Willem de Bruijn
2024-09-15  1:17 ` [PATCH RFC v3 2/9] virtio_net: Add functions for hashing Akihiko Odaki
2024-09-16  7:12   ` gur.stavi
2024-09-16  8:01     ` gur.stavi
2024-09-19 12:51       ` Akihiko Odaki
2024-09-16  8:46   ` gur.stavi
2024-09-18 12:50   ` Willem de Bruijn
2024-09-23 18:15     ` Akihiko Odaki
2024-09-15  1:17 ` [PATCH RFC v3 3/9] net: flow_dissector: Export flow_keys_dissector_symmetric Akihiko Odaki
2024-09-15  1:17 ` [PATCH RFC v3 4/9] tap: Pad virtio header with zero Akihiko Odaki
2024-09-18 12:52   ` Willem de Bruijn
2024-09-15  1:17 ` [PATCH RFC v3 5/9] tun: " Akihiko Odaki
2024-09-15  1:17 ` [PATCH RFC v3 6/9] tun: Introduce virtio-net hash reporting feature Akihiko Odaki
2024-09-18 13:17   ` Willem de Bruijn
2024-09-23 18:35     ` Akihiko Odaki
2024-09-15  1:17 ` [PATCH RFC v3 7/9] tun: Introduce virtio-net RSS Akihiko Odaki
2024-09-18 13:28   ` Willem de Bruijn
2024-09-24  8:56     ` Akihiko Odaki
2024-09-24  8:57       ` Akihiko Odaki [this message]
2024-09-15  1:17 ` [PATCH RFC v3 8/9] selftest: tun: Add tests for virtio-net hashing Akihiko Odaki
2024-09-15  1:17 ` [PATCH RFC v3 9/9] vhost/net: Support VIRTIO_NET_F_HASH_REPORT Akihiko Odaki
2024-09-15 19:48 ` [PATCH RFC v3 0/9] tun: Introduce virtio-net hashing feature Stephen Hemminger
2024-09-23 17:57   ` Akihiko Odaki

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=cc2bc65b-d75a-4232-a0bd-9c759aba8aba@daynix.com \
    --to=akihiko.odaki@daynix.com \
    --cc=andrew@daynix.com \
    --cc=corbet@lwn.net \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=jasowang@redhat.com \
    --cc=kuba@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=shuah@kernel.org \
    --cc=virtualization@lists.linux-foundation.org \
    --cc=willemdebruijn.kernel@gmail.com \
    --cc=xuanzhuo@linux.alibaba.com \
    --cc=yuri.benditovich@daynix.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox