* Re: [PATCH 3/6] vduse: Add sysfs interface for irq affinity setup
[not found] ` <20221114070233.248-4-xieyongji@bytedance.com>
@ 2022-11-14 7:58 ` Jason Wang
[not found] ` <CACycT3s8CbO1YD3AAzN-iXEkf6MKM7ihK+=NFik+33HDjanBJQ@mail.gmail.com>
From: Jason Wang @ 2022-11-14 7:58 UTC
To: Xie Yongji; +Cc: virtualization, mst
On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> Add a sysfs interface for each vduse virtqueue to set up
> irq affinity. This would be useful for performance
> tuning, e.g., mitigating the virtqueue lock contention
> in the virtio block driver.
Do we have any performance numbers for this?
Btw, I wonder if "irq" is the best name, since we don't actually
use an IRQ at all. I guess using "callback" might be better?
Thanks
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 113 ++++++++++++++++++++++++++---
> 1 file changed, 102 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index 9303942c2e64..3a0922fa7eb2 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -57,6 +57,7 @@ struct vduse_virtqueue {
> struct vdpa_callback cb;
> struct work_struct inject;
> struct work_struct kick;
> + struct kobject kobj;
> int irq_affinity;
> };
>
> @@ -1347,6 +1348,88 @@ static const struct file_operations vduse_dev_fops = {
> .llseek = noop_llseek,
> };
>
> +static ssize_t irq_affinity_show(struct vduse_virtqueue *vq, char *buf)
> +{
> + return sprintf(buf, "%d\n", vq->irq_affinity);
> +}
> +
> +static ssize_t irq_affinity_store(struct vduse_virtqueue *vq,
> + const char *buf, size_t count)
> +{
> + int val;
> +
> + if (kstrtoint(buf, 0, &val) < 0)
> + return -EINVAL;
> +
> + if (!(val == -1 || (val < nr_cpu_ids && val >= 0 && cpu_online(val))))
> + return -EINVAL;
> +
> + vq->irq_affinity = val;
> +
> + return count;
> +}
> +
> +struct vq_sysfs_entry {
> + struct attribute attr;
> + ssize_t (*show)(struct vduse_virtqueue *vq, char *buf);
> + ssize_t (*store)(struct vduse_virtqueue *vq, const char *buf,
> + size_t count);
> +};
> +
> +static struct vq_sysfs_entry irq_affinity_attr = __ATTR_RW(irq_affinity);
> +
> +static struct attribute *vq_attrs[] = {
> + &irq_affinity_attr.attr,
> + NULL,
> +};
> +ATTRIBUTE_GROUPS(vq);
> +
> +static ssize_t vq_attr_show(struct kobject *kobj, struct attribute *attr,
> + char *buf)
> +{
> + struct vduse_virtqueue *vq = container_of(kobj,
> + struct vduse_virtqueue, kobj);
> + struct vq_sysfs_entry *entry = container_of(attr,
> + struct vq_sysfs_entry, attr);
> +
> + if (!entry->show)
> + return -EIO;
> +
> + return entry->show(vq, buf);
> +}
> +
> +static ssize_t vq_attr_store(struct kobject *kobj, struct attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct vduse_virtqueue *vq = container_of(kobj,
> + struct vduse_virtqueue, kobj);
> + struct vq_sysfs_entry *entry = container_of(attr,
> + struct vq_sysfs_entry, attr);
> +
> + if (!entry->store)
> + return -EIO;
> +
> + return entry->store(vq, buf, count);
> +}
> +
> +static const struct sysfs_ops vq_sysfs_ops = {
> + .show = vq_attr_show,
> + .store = vq_attr_store,
> +};
> +
> +static void vq_release(struct kobject *kobj)
> +{
> + struct vduse_virtqueue *vq = container_of(kobj,
> + struct vduse_virtqueue, kobj);
> + kfree(vq);
> +}
> +
> +static struct kobj_type vq_type = {
> + .release = vq_release,
> + .sysfs_ops = &vq_sysfs_ops,
> + .default_groups = vq_groups,
> +};
> +
> static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
> {
> int i;
> @@ -1355,13 +1438,13 @@ static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
> return;
>
> for (i = 0; i < dev->vq_num; i++)
> - kfree(dev->vqs[i]);
> + kobject_put(&dev->vqs[i]->kobj);
> kfree(dev->vqs);
> }
>
> static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
> {
> - int i;
> + int ret, i;
>
> dev->vq_align = vq_align;
> dev->vq_num = vq_num;
> @@ -1371,8 +1454,10 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
>
> for (i = 0; i < vq_num; i++) {
> dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
> - if (!dev->vqs[i])
> + if (!dev->vqs[i]) {
> + ret = -ENOMEM;
> goto err;
> + }
>
> dev->vqs[i]->index = i;
> dev->vqs[i]->irq_affinity = -1;
> @@ -1380,15 +1465,20 @@ static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
> INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
> spin_lock_init(&dev->vqs[i]->kick_lock);
> spin_lock_init(&dev->vqs[i]->irq_lock);
> + kobject_init(&dev->vqs[i]->kobj, &vq_type);
> + ret = kobject_add(&dev->vqs[i]->kobj,
> + &dev->dev->kobj, "vq%d", i);
> + if (ret)
> + goto err;
> }
>
> return 0;
> err:
> while (i--)
> - kfree(dev->vqs[i]);
> + kobject_put(&dev->vqs[i]->kobj);
> kfree(dev->vqs);
> dev->vqs = NULL;
> - return -ENOMEM;
> + return ret;
> }
>
> static struct vduse_dev *vduse_dev_create(void)
> @@ -1563,10 +1653,6 @@ static int vduse_create_dev(struct vduse_dev_config *config,
> dev->config = config_buf;
> dev->config_size = config->config_size;
>
> - ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
> - if (ret)
> - goto err_vqs;
> -
> ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
> if (ret < 0)
> goto err_idr;
> @@ -1580,14 +1666,19 @@ static int vduse_create_dev(struct vduse_dev_config *config,
> ret = PTR_ERR(dev->dev);
> goto err_dev;
> }
> +
> + ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
> + if (ret)
> + goto err_vqs;
> +
> __module_get(THIS_MODULE);
>
> return 0;
> +err_vqs:
> + device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
> err_dev:
> idr_remove(&vduse_idr, dev->minor);
> err_idr:
> - vduse_dev_deinit_vqs(dev);
> -err_vqs:
> vduse_domain_destroy(dev->domain);
> err_domain:
> kfree(dev->name);
> --
> 2.20.1
>
* Re: [PATCH 3/6] vduse: Add sysfs interface for irq affinity setup
[not found] ` <CACycT3s8CbO1YD3AAzN-iXEkf6MKM7ihK+=NFik+33HDjanBJQ@mail.gmail.com>
@ 2022-11-14 8:55 ` Jason Wang
[not found] ` <CACycT3uwqNb=+9P=Ta7pw5qUCRfJXveMUX==CYPrtE=+OQBCrg@mail.gmail.com>
From: Jason Wang @ 2022-11-14 8:55 UTC
To: Yongji Xie; +Cc: virtualization, Michael S. Tsirkin
On Mon, Nov 14, 2022 at 4:20 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Mon, Nov 14, 2022 at 3:58 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > >
> > > Add a sysfs interface for each vduse virtqueue to set up
> > > irq affinity. This would be useful for performance
> > > tuning, e.g., mitigating the virtqueue lock contention
> > > in the virtio block driver.
> >
> > Do we have any performance numbers for this?
> >
>
> Almost 50% improvement (600k IOPS -> 900k IOPS) in high-IOPS
> workloads. I mentioned it in the cover letter.
For some reason, I missed that.
I also wonder if we can do this automatically; then there would be no
need to play with sysfs, which is kind of a burden for the management
layer.
Thanks
>
> > Btw, I wonder if "irq" is the best name, since we don't actually
> > use an IRQ at all. I guess using "callback" might be better?
> >
>
> Looks good to me.
>
> Thanks,
> Yongji
>
* Re: [PATCH 3/6] vduse: Add sysfs interface for irq affinity setup
[not found] ` <CACycT3uwqNb=+9P=Ta7pw5qUCRfJXveMUX==CYPrtE=+OQBCrg@mail.gmail.com>
@ 2022-11-16 7:11 ` Jason Wang
From: Jason Wang @ 2022-11-16 7:11 UTC
To: Yongji Xie; +Cc: virtualization, Michael S. Tsirkin
On Tue, Nov 15, 2022 at 10:49 AM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Mon, Nov 14, 2022 at 4:55 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Mon, Nov 14, 2022 at 4:20 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> > >
> > > On Mon, Nov 14, 2022 at 3:58 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > > >
> > > > > Add a sysfs interface for each vduse virtqueue to set up
> > > > > irq affinity. This would be useful for performance
> > > > > tuning, e.g., mitigating the virtqueue lock contention
> > > > > in the virtio block driver.
> > > >
> > > > Do we have any performance numbers for this?
> > > >
> > >
> > > Almost 50% improvement (600k IOPS -> 900k IOPS) in high-IOPS
> > > workloads. I mentioned it in the cover letter.
> >
> > For some reason, I missed that.
> >
> > I also wonder if we can do this automatically; then there would be no
> > need to play with sysfs, which is kind of a burden for the management
> > layer.
> >
>
> This is hard to do since vduse doesn't know which CPU a certain
> virtqueue should be bound to.
Probably via the kick_vq()? It probably won't work when notification
is disabled. But we need to think a little bit more about this.
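Something like the following, as a rough sketch of the idea (the
helper and where to call it from are purely illustrative, not part of
this series):

static void vduse_vq_update_affinity_hint(struct vduse_virtqueue *vq)
{
	/*
	 * Record the CPU that kicked the queue; the irq injection
	 * path could then schedule the callback work on that CPU.
	 */
	WRITE_ONCE(vq->irq_affinity, raw_smp_processor_id());
}

kick_vq() would call this before signalling the eventfd, so the hint
would track wherever the driver last submitted from.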
Requiring management software to do this kind of ad-hoc tuning just
for VDUSE doesn't seem easy.
Thanks
>
> Thanks,
> Yongji
>
* Re: [PATCH 3/6] vduse: Add sysfs interface for irq affinity setup
[not found] <CACycT3siOCZv+u+-Xcto9BEdY1c8t_ivw-DF73bmuLqBRxF7=A@mail.gmail.com>
@ 2022-11-17 3:36 ` Jason Wang
[not found] ` <CACycT3vPyEuQcJEEPJE2Dv-49f=w8xLoOXsoLb5Fjtx9uqQoyQ@mail.gmail.com>
From: Jason Wang @ 2022-11-17 3:36 UTC
To: Yongji Xie; +Cc: virtualization, Michael S. Tsirkin
On Wed, Nov 16, 2022 at 3:46 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Wed, Nov 16, 2022 at 3:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Tue, Nov 15, 2022 at 10:49 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> > >
> > > On Mon, Nov 14, 2022 at 4:55 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Mon, Nov 14, 2022 at 4:20 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> > > > >
> > > > > On Mon, Nov 14, 2022 at 3:58 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > > > > >
> > > > > > > Add a sysfs interface for each vduse virtqueue to set up
> > > > > > > irq affinity. This would be useful for performance
> > > > > > > tuning, e.g., mitigating the virtqueue lock contention
> > > > > > > in the virtio block driver.
> > > > > >
> > > > > > Do we have any performance numbers for this?
> > > > > >
> > > > >
> > > > > Almost 50% improvement (600k IOPS -> 900k IOPS) in high-IOPS
> > > > > workloads. I mentioned it in the cover letter.
> > > >
> > > > For some reason, I missed that.
> > > >
> > > > I also wonder if we can do this automatically; then there would be no
> > > > need to play with sysfs, which is kind of a burden for the management
> > > > layer.
> > > >
> > >
> > > This is hard to do since vduse doesn't know which CPU a certain
> > > virtqueue should be bound to.
> >
> > Probably via the kick_vq()? It probably won't work when notification
> > is disabled. But we need to think a little bit more about this.
>
> Yes, another problem is that this way can only work when the CPU and
> virtqueue have a 1:1 mapping. It's still hard to decide which CPU to
> bind in the N:1 mapping case.
This is the same situation as what you propose here. I think it would
be better to use cpumask instead of cpu id here.
>
> So I think it could be an optimization, but the sysfs interface is still needed.
>
> > Requiring management software to do this kind of ad-hoc tuning just
> > for VDUSE doesn't seem easy.
> >
>
> I'm not sure. In a Kubernetes environment, something like a CSI/CNI
> plugin can do it.
That only works when the process is bound to a specific CPU. If a
process is migrated to another CPU, it would be hard to track.
Thanks
>
> Thanks,
> Yongji
>
* Re: [PATCH 1/6] vduse: Refactor allocation for vduse virtqueues
[not found] ` <20221114070233.248-2-xieyongji@bytedance.com>
@ 2022-11-17 5:17 ` Jason Wang
From: Jason Wang @ 2022-11-17 5:17 UTC
To: Xie Yongji; +Cc: virtualization, mst
On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> Allocate memory for vduse virtqueues one by one instead of
> doing one allocation for all of them.
>
> This is a preparation for adding a sysfs interface for the virtqueues.
The code looks fine, but I think it's not a must, since each kobject
could be allocated independently?
Thanks
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 98 ++++++++++++++++++++----------
> 1 file changed, 66 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index 35dceee3ed56..37809bfcb7ef 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -76,7 +76,7 @@ struct vduse_umem {
> struct vduse_dev {
> struct vduse_vdpa *vdev;
> struct device *dev;
> - struct vduse_virtqueue *vqs;
> + struct vduse_virtqueue **vqs;
> struct vduse_iova_domain *domain;
> char *name;
> struct mutex lock;
> @@ -434,7 +434,7 @@ static void vduse_dev_reset(struct vduse_dev *dev)
> flush_work(&dev->inject);
>
> for (i = 0; i < dev->vq_num; i++) {
> - struct vduse_virtqueue *vq = &dev->vqs[i];
> + struct vduse_virtqueue *vq = dev->vqs[i];
>
> vq->ready = false;
> vq->desc_addr = 0;
> @@ -466,7 +466,7 @@ static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
> u64 device_area)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> vq->desc_addr = desc_area;
> vq->driver_addr = driver_area;
> @@ -500,7 +500,7 @@ static void vduse_vq_kick_work(struct work_struct *work)
> static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> if (!eventfd_signal_allowed()) {
> schedule_work(&vq->kick);
> @@ -513,7 +513,7 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
> struct vdpa_callback *cb)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> spin_lock(&vq->irq_lock);
> vq->cb.callback = cb->callback;
> @@ -524,7 +524,7 @@ static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
> static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> vq->num = num;
> }
> @@ -533,7 +533,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
> u16 idx, bool ready)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> vq->ready = ready;
> }
> @@ -541,7 +541,7 @@ static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa,
> static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> return vq->ready;
> }
> @@ -550,7 +550,7 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx,
> const struct vdpa_vq_state *state)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) {
> vq->state.packed.last_avail_counter =
> @@ -569,7 +569,7 @@ static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx,
> struct vdpa_vq_state *state)
> {
> struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> - struct vduse_virtqueue *vq = &dev->vqs[idx];
> + struct vduse_virtqueue *vq = dev->vqs[idx];
>
> if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED))
> return vduse_dev_get_vq_state_packed(dev, vq, &state->packed);
> @@ -624,8 +624,8 @@ static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa)
> int i;
>
> for (i = 0; i < dev->vq_num; i++)
> - if (num_max < dev->vqs[i].num_max)
> - num_max = dev->vqs[i].num_max;
> + if (num_max < dev->vqs[i]->num_max)
> + num_max = dev->vqs[i]->num_max;
>
> return num_max;
> }
> @@ -863,7 +863,7 @@ static int vduse_kickfd_setup(struct vduse_dev *dev,
> return -EINVAL;
>
> index = array_index_nospec(eventfd->index, dev->vq_num);
> - vq = &dev->vqs[index];
> + vq = dev->vqs[index];
> if (eventfd->fd >= 0) {
> ctx = eventfd_ctx_fdget(eventfd->fd);
> if (IS_ERR(ctx))
> @@ -889,7 +889,7 @@ static bool vduse_dev_is_ready(struct vduse_dev *dev)
> int i;
>
> for (i = 0; i < dev->vq_num; i++)
> - if (!dev->vqs[i].num_max)
> + if (!dev->vqs[i]->num_max)
> return false;
>
> return true;
> @@ -1130,7 +1130,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> break;
>
> index = array_index_nospec(config.index, dev->vq_num);
> - dev->vqs[index].num_max = config.max_size;
> + dev->vqs[index]->num_max = config.max_size;
> ret = 0;
> break;
> }
> @@ -1148,7 +1148,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> break;
>
> index = array_index_nospec(vq_info.index, dev->vq_num);
> - vq = &dev->vqs[index];
> + vq = dev->vqs[index];
> vq_info.desc_addr = vq->desc_addr;
> vq_info.driver_addr = vq->driver_addr;
> vq_info.device_addr = vq->device_addr;
> @@ -1198,7 +1198,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
> break;
>
> index = array_index_nospec(index, dev->vq_num);
> - ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
> + ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index]->inject);
> break;
> }
> case VDUSE_IOTLB_REG_UMEM: {
> @@ -1339,6 +1339,49 @@ static const struct file_operations vduse_dev_fops = {
> .llseek = noop_llseek,
> };
>
> +static void vduse_dev_deinit_vqs(struct vduse_dev *dev)
> +{
> + int i;
> +
> + if (!dev->vqs)
> + return;
> +
> + for (i = 0; i < dev->vq_num; i++)
> + kfree(dev->vqs[i]);
> + kfree(dev->vqs);
> +}
> +
> +static int vduse_dev_init_vqs(struct vduse_dev *dev, u32 vq_align, u32 vq_num)
> +{
> + int i;
> +
> + dev->vq_align = vq_align;
> + dev->vq_num = vq_num;
> + dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
> + if (!dev->vqs)
> + return -ENOMEM;
> +
> + for (i = 0; i < vq_num; i++) {
> + dev->vqs[i] = kzalloc(sizeof(*dev->vqs[i]), GFP_KERNEL);
> + if (!dev->vqs[i])
> + goto err;
> +
> + dev->vqs[i]->index = i;
> + INIT_WORK(&dev->vqs[i]->inject, vduse_vq_irq_inject);
> + INIT_WORK(&dev->vqs[i]->kick, vduse_vq_kick_work);
> + spin_lock_init(&dev->vqs[i]->kick_lock);
> + spin_lock_init(&dev->vqs[i]->irq_lock);
> + }
> +
> + return 0;
> +err:
> + while (i--)
> + kfree(dev->vqs[i]);
> + kfree(dev->vqs);
> + dev->vqs = NULL;
> + return -ENOMEM;
> +}
> +
> static struct vduse_dev *vduse_dev_create(void)
> {
> struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> @@ -1396,7 +1439,7 @@ static int vduse_destroy_dev(char *name)
> device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor));
> idr_remove(&vduse_idr, dev->minor);
> kvfree(dev->config);
> - kfree(dev->vqs);
> + vduse_dev_deinit_vqs(dev);
> vduse_domain_destroy(dev->domain);
> kfree(dev->name);
> vduse_dev_destroy(dev);
> @@ -1483,7 +1526,7 @@ ATTRIBUTE_GROUPS(vduse_dev);
> static int vduse_create_dev(struct vduse_dev_config *config,
> void *config_buf, u64 api_version)
> {
> - int i, ret;
> + int ret;
> struct vduse_dev *dev;
>
> ret = -EEXIST;
> @@ -1510,19 +1553,10 @@ static int vduse_create_dev(struct vduse_dev_config *config,
>
> dev->config = config_buf;
> dev->config_size = config->config_size;
> - dev->vq_align = config->vq_align;
> - dev->vq_num = config->vq_num;
> - dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL);
> - if (!dev->vqs)
> - goto err_vqs;
>
> - for (i = 0; i < dev->vq_num; i++) {
> - dev->vqs[i].index = i;
> - INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject);
> - INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work);
> - spin_lock_init(&dev->vqs[i].kick_lock);
> - spin_lock_init(&dev->vqs[i].irq_lock);
> - }
> + ret = vduse_dev_init_vqs(dev, config->vq_align, config->vq_num);
> + if (ret)
> + goto err_vqs;
>
> ret = idr_alloc(&vduse_idr, dev, 1, VDUSE_DEV_MAX, GFP_KERNEL);
> if (ret < 0)
> @@ -1543,7 +1577,7 @@ static int vduse_create_dev(struct vduse_dev_config *config,
> err_dev:
> idr_remove(&vduse_idr, dev->minor);
> err_idr:
> - kfree(dev->vqs);
> + vduse_dev_deinit_vqs(dev);
> err_vqs:
> vduse_domain_destroy(dev->domain);
> err_domain:
> --
> 2.20.1
>
* Re: [PATCH 6/6] vduse: Support specifying bounce buffer size via sysfs
[not found] ` <20221114070233.248-7-xieyongji@bytedance.com>
@ 2022-11-17 5:26 ` Jason Wang
[not found] ` <CACycT3vSbwhsrM6R8Sd23e=AfkYh18rx-4FscnWEVGiWiu4rQQ@mail.gmail.com>
From: Jason Wang @ 2022-11-17 5:26 UTC
To: Xie Yongji; +Cc: virtualization, mst
On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
>
> Add a sysfs interface to support specifying the bounce
> buffer size in the virtio-vdpa case. This is a performance
> tuning parameter for high-throughput workloads.
I wonder what's the reason for not having this in VDUSE_CREATE_DEV?
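For reference, doing that would presumably mean extending the UAPI,
e.g. something like this (hypothetical, reusing one of the reserved
words in struct vduse_dev_config so the layout stays compatible):

struct vduse_dev_config {
	char name[VDUSE_NAME_MAX];
	__u32 vendor_id;
	__u32 device_id;
	__u64 features;
	__u32 vq_num;
	__u32 vq_align;
	__u32 bounce_size;	/* hypothetical new field */
	__u32 reserved[12];	/* shrunk from 13 to keep the size */
	__u32 config_size;
	__u8 config[];
};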
Thanks
>
> Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
> ---
> drivers/vdpa/vdpa_user/vduse_dev.c | 45 +++++++++++++++++++++++++++++-
> 1 file changed, 44 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index 428615093c28..3f97e2d7f7d7 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -37,8 +37,11 @@
> #define DRV_LICENSE "GPL v2"
>
> #define VDUSE_DEV_MAX (1U << MINORBITS)
> +#define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024)
> +#define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024)
> #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024)
> -#define VDUSE_IOVA_SIZE (128 * 1024 * 1024)
> +/* 128 MB reserved for virtqueue creation */
> +#define VDUSE_IOVA_SIZE (VDUSE_MAX_BOUNCE_SIZE + 128 * 1024 * 1024)
> #define VDUSE_MSG_DEFAULT_TIMEOUT 30
>
> struct vduse_virtqueue {
> @@ -1687,8 +1690,48 @@ static ssize_t msg_timeout_store(struct device *device,
>
> static DEVICE_ATTR_RW(msg_timeout);
>
> +static ssize_t bounce_size_show(struct device *device,
> + struct device_attribute *attr, char *buf)
> +{
> + struct vduse_dev *dev = dev_get_drvdata(device);
> +
> + return sysfs_emit(buf, "%u\n", dev->bounce_size);
> +}
> +
> +static ssize_t bounce_size_store(struct device *device,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + struct vduse_dev *dev = dev_get_drvdata(device);
> + unsigned int bounce_size;
> + int ret;
> +
> + ret = -EPERM;
> + mutex_lock(&dev->domain_lock);
> + if (dev->domain)
> + goto unlock;
> +
> + ret = kstrtouint(buf, 10, &bounce_size);
> + if (ret < 0)
> + goto unlock;
> +
> + ret = -EINVAL;
> + if (bounce_size > VDUSE_MAX_BOUNCE_SIZE ||
> + bounce_size < VDUSE_MIN_BOUNCE_SIZE)
> + goto unlock;
> +
> + dev->bounce_size = bounce_size;
> + ret = count;
> +unlock:
> + mutex_unlock(&dev->domain_lock);
> + return ret;
> +}
> +
> +static DEVICE_ATTR_RW(bounce_size);
> +
> static struct attribute *vduse_dev_attrs[] = {
> &dev_attr_msg_timeout.attr,
> + &dev_attr_bounce_size.attr,
> NULL
> };
>
> --
> 2.20.1
>
* Re: [PATCH 3/6] vduse: Add sysfs interface for irq affinity setup
[not found] ` <CACycT3vPyEuQcJEEPJE2Dv-49f=w8xLoOXsoLb5Fjtx9uqQoyQ@mail.gmail.com>
@ 2022-11-17 6:07 ` Jason Wang
[not found] ` <CACycT3uH-OPZ9BUR1OnXmY8Lx0vhYTq=Gb3S9i6thGApa5GHFQ@mail.gmail.com>
From: Jason Wang @ 2022-11-17 6:07 UTC
To: Yongji Xie; +Cc: virtualization, Michael S. Tsirkin
On Thu, Nov 17, 2022 at 1:48 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Thu, Nov 17, 2022 at 11:37 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Wed, Nov 16, 2022 at 3:46 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> > >
> > > On Wed, Nov 16, 2022 at 3:11 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Tue, Nov 15, 2022 at 10:49 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> > > > >
> > > > > On Mon, Nov 14, 2022 at 4:55 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, Nov 14, 2022 at 4:20 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> > > > > > >
> > > > > > > On Mon, Nov 14, 2022 at 3:58 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > > > > > > >
> > > > > > > > > Add sysfs interface for each vduse virtqueue to setup
> > > > > > > > > irq affinity. This would be useful for performance
> > > > > > > > > tuning, e.g., mitigate the virtqueue lock contention
> > > > > > > > > in virtio block driver.
> > > > > > > >
> > > > > > > > Do we have any performance numbers for this?
> > > > > > > >
> > > > > > >
> > > > > > > Almost 50% improvement (600k IOPS -> 900k IOPS) in high-IOPS
> > > > > > > workloads. I mentioned it in the cover letter.
> > > > > >
> > > > > > For some reason, I missed that.
> > > > > >
> > > > > > I also wonder if we can do this automatically; then there would be no
> > > > > > need to play with sysfs, which is kind of a burden for the management
> > > > > > layer.
> > > > > >
> > > > >
> > > > > This is hard to do since vduse doesn't know which CPU a certain
> > > > > virtqueue should be bound to.
> > > >
> > > > Probably via the kick_vq()? It probably won't work when notification
> > > > is disabled. But we need to think a little bit more about this.
> > >
> > > Yes, another problem is that this way can only work when the CPU and
> > > virtqueue have a 1:1 mapping. It's still hard to decide which CPU to
> > > bind in the N:1 mapping case.
> >
> > This is the same situation as what you propose here. I think it would
> > be better to use cpumask instead of cpu id here.
> >
>
> If so, we need to know which CPU to bind for a given virtqueue. Do you
> mean using the CPU that kicks the virtqueue?
I meant you're using:
int irq_affinity;
This seems to assume that the callback can only be delivered to a
specific CPU. It would make more sense to use cpumask_t, which may have
broader use cases.
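A minimal sketch of what the store side could look like with a mask
(illustrative only; assumes irq_affinity becomes a cpumask_t):

static ssize_t irq_affinity_store(struct vduse_virtqueue *vq,
				  const char *buf, size_t count)
{
	cpumask_t mask;
	int ret;

	ret = cpumask_parse(buf, &mask);
	if (ret)
		return ret;

	/* Require at least one online CPU in the new mask. */
	if (!cpumask_intersects(&mask, cpu_online_mask))
		return -EINVAL;

	cpumask_copy(&vq->irq_affinity, &mask);

	return count;
}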
>
> > >
> > > So I think it could be an optimization, but the sysfs interface is still needed.
> > >
> > > > Requiring management software to do this kind of ad-hoc tuning just
> > > > for VDUSE doesn't seem easy.
> > > >
> > >
> > > I'm not sure. In a Kubernetes environment, something like a CSI/CNI
> > > plugin can do it.
> >
> > That only works when the process is bound to a specific CPU. If a
> > process is migrated to another CPU, it would be hard to track.
> >
>
> OK, I see. Seems like there's no good way to handle this case.
Yes, using cpumask_t might improve things a little bit.
> Maybe
> it's better to leave it as it is.
It would be better to think of an automatic method to do this, like
the affinity-managed irqs used by virtio-pci (not sure how hard that
is, though).
Thanks
>
> Thanks,
> Yongji
>
* Re: [PATCH 6/6] vduse: Support specifying bounce buffer size via sysfs
[not found] ` <CACycT3vSbwhsrM6R8Sd23e=AfkYh18rx-4FscnWEVGiWiu4rQQ@mail.gmail.com>
@ 2022-11-17 6:12 ` Jason Wang
From: Jason Wang @ 2022-11-17 6:12 UTC
To: Yongji Xie; +Cc: virtualization, Michael S. Tsirkin
On Thu, Nov 17, 2022 at 2:08 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Thu, Nov 17, 2022 at 1:26 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > >
> > > Add a sysfs interface to support specifying the bounce
> > > buffer size in the virtio-vdpa case. This is a performance
> > > tuning parameter for high-throughput workloads.
> >
> > I wonder what's the reason for not having this in VDUSE_CREATE_DEV?
> >
>
> This was discussed [1] before.
>
> [1] https://lore.kernel.org/netdev/20210713084656.232-3-xieyongji@bytedance.com/T/#m48ccd5bb514c40345d476bac80a59a99eeb9e9a7
>
> And the device's sysfs interface should be better than module parameters.
Right, so let's document this somewhere (probably the changelog).
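Something along these lines as a Documentation/ABI entry, perhaps
(path and wording assumed, not final):

What:		/sys/class/vduse/<device-name>/bounce_size
Date:		November 2022
Contact:	Xie Yongji <xieyongji@bytedance.com>
Description:
		Size in bytes of the bounce buffer used in the
		virtio-vdpa case. Only writable before the device's
		IOVA domain is created; accepts values between
		1 MB and 1 GB.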
Thanks
>
> Thanks,
> Yongji
>
* Re: [PATCH 3/6] vduse: Add sysfs interface for irq affinity setup
[not found] ` <CACycT3uH-OPZ9BUR1OnXmY8Lx0vhYTq=Gb3S9i6thGApa5GHFQ@mail.gmail.com>
@ 2022-11-18 7:23 ` Jason Wang
From: Jason Wang @ 2022-11-18 7:23 UTC
To: Yongji Xie; +Cc: virtualization, Michael S. Tsirkin
On Thu, Nov 17, 2022 at 4:54 PM Yongji Xie <xieyongji@bytedance.com> wrote:
>
> On Thu, Nov 17, 2022 at 2:07 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Nov 17, 2022 at 1:48 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> > >
> > > On Thu, Nov 17, 2022 at 11:37 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Wed, Nov 16, 2022 at 3:46 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> > > > >
> > > > > On Wed, Nov 16, 2022 at 3:11 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Tue, Nov 15, 2022 at 10:49 AM Yongji Xie <xieyongji@bytedance.com> wrote:
> > > > > > >
> > > > > > > On Mon, Nov 14, 2022 at 4:55 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, Nov 14, 2022 at 4:20 PM Yongji Xie <xieyongji@bytedance.com> wrote:
> > > > > > > > >
> > > > > > > > > On Mon, Nov 14, 2022 at 3:58 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Mon, Nov 14, 2022 at 3:16 PM Xie Yongji <xieyongji@bytedance.com> wrote:
> > > > > > > > > > >
> > > > > > > > > > > Add a sysfs interface for each vduse virtqueue to set up
> > > > > > > > > > > irq affinity. This would be useful for performance
> > > > > > > > > > > tuning, e.g., mitigating the virtqueue lock contention
> > > > > > > > > > > in the virtio block driver.
> > > > > > > > > >
> > > > > > > > > > Do we have any performance numbers for this?
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > Almost 50% improvement (600k IOPS -> 900k IOPS) in high-IOPS
> > > > > > > > > workloads. I mentioned it in the cover letter.
> > > > > > > >
> > > > > > > > For some reason, I missed that.
> > > > > > > >
> > > > > > > > I also wonder if we can do this automatically; then there would be no
> > > > > > > > need to play with sysfs, which is kind of a burden for the management
> > > > > > > > layer.
> > > > > > > >
> > > > > > >
> > > > > > > This is hard to do since vduse doesn't know which CPU a certain
> > > > > > > virtqueue should be bound to.
> > > > > >
> > > > > > Probably via the kick_vq()? It probably won't work when notification
> > > > > > is disabled. But we need to think a little bit more about this.
> > > > >
> > > > > Yes, another problem is that this way can only work when the CPU and
> > > > > virtqueue have a 1:1 mapping. It's still hard to decide which CPU to
> > > > > bind in the N:1 mapping case.
> > > >
> > > > This is the same situation as what you propose here. I think it would
> > > > be better to use cpumask instead of cpu id here.
> > > >
> > >
> > > If so, we need to know which CPU to bind for a given virtqueue. Do you
> > > mean using the CPU that kicks the virtqueue?
> >
> > I meant you're using:
> >
> > int irq_affinity;
> >
> > This seems to assume that the callback can only be delivered to a
> > specific CPU. It would make more sense to use cpumask_t, which may have
> > broader use cases.
> >
>
> Yes, I see. I meant we need to know how to choose the CPU to run the
> irq callback on if we use cpumask_t, e.g., round-robin or choosing the
> CPU that kicked the virtqueue before.
>
> > >
> > > > >
> > > > > So I think it could be an optimization, but the sysfs interface is still needed.
> > > > >
> > > > > > Requiring management software to do this kind of ad-hoc tuning just
> > > > > > for VDUSE doesn't seem easy.
> > > > > >
> > > > >
> > > > > I'm not sure. In a Kubernetes environment, something like a CSI/CNI
> > > > > plugin can do it.
> > > >
> > > > That only works when the process is bound to a specific CPU. If a
> > > > process is migrated to another CPU, it would be hard to track.
> > > >
> > >
> > > OK, I see. Seems like there's no good way to handle this case.
> >
> > Yes, using cpumask_t might improve things a little bit.
> >
> > > Maybe
> > > it's better to leave it as it is.
> >
> > It would be better to think of an automatic method to do this, like
> > the affinity-managed irqs used by virtio-pci (not sure how hard that
> > is, though).
> >
>
> Do you mean making use of .set_vq_affinity and .get_vq_affinity callbacks?
This works for net but not for block.
I know little about block, but it looks like block uses an affinity
descriptor to let blk-mq do proper irq steering. Maybe we can do
something similar.
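For reference, that mechanism looks roughly like this on the
virtio-blk side (abridged sketch, not vduse code): the driver passes
an irq_affinity descriptor so the transport spreads the vq interrupts,
and blk-mq then maps hw queues onto the same CPUs:

static int example_init_vqs(struct virtio_device *vdev, unsigned int num_vqs,
			    struct virtqueue **vqs, vq_callback_t **callbacks,
			    const char * const *names)
{
	/* Ask the transport to spread the per-vq interrupts across CPUs. */
	struct irq_affinity desc = { 0, };

	return virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
}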
Thanks
>
> Thanks,
> Yongji
>