* [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
@ 2019-08-09 6:04 piaojun
2019-08-16 5:57 ` piaojun
2019-08-21 15:38 ` Stefan Hajnoczi
0 siblings, 2 replies; 11+ messages in thread
From: piaojun @ 2019-08-09 6:04 UTC (permalink / raw)
To: virtio-fs
Set cpu affinity for each queue in multiqueue mode to improve the iops
performance.
From my tests, the IOPS increases when multiple queues are added, as shown
below, but it has not yet reached what I expected, for some reason. So I'm
considering whether we could drop some locks when operating on a vq, since it
is bound to one vCPU. I would be very glad to discuss this with other
developers.
Furthermore, I modified virtiofsd to support multiqueue, but only for
testing.
Test Environment:
Guest configuration:
8 vCPU
8GB RAM
Linux 5.1 (vivek-aug-06-2019)
Host configuration:
Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (8 cores x 4 threads)
32GB RAM
Linux 3.10.0
EXT4 + 4G Ramdisk
---
Single-queue:
# fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
...
fio-2.13
Starting 8 processes
Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/316.5MB/0KB /s] [0/81.2K/0 iops] [eta 00m:00s]
file: (groupid=0, jobs=8): err= 0: pid=5808: Fri Aug 9 20:35:22 2019
write: io=9499.9MB, bw=324251KB/s, iops=81062, runt= 30001msec
Multi-queues:
# fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
...
fio-2.13
Starting 8 processes
Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/444.6MB/0KB /s] [0/114K/0 iops] [eta 00m:00s]
file: (groupid=0, jobs=8): err= 0: pid=5704: Fri Aug 9 20:38:47 2019
write: io=12967MB, bw=442582KB/s, iops=110645, runt= 30001msec
---
Signed-off-by: Jun Piao <piaojun@huawei.com>
---
fs/fuse/virtio_fs.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 66 insertions(+), 2 deletions(-)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index a04c320..7ba36fc 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -12,6 +12,7 @@
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
#include <linux/delay.h>
+#include <linux/cpu.h>
#include "fuse_i.h"
/* List of virtio-fs device instances and a lock for the list */
@@ -61,6 +62,9 @@ struct virtio_fs {
void *window_kaddr;
phys_addr_t window_phys_addr;
size_t window_len;
+
+ /* Does the affinity hint is set for virtqueues? */
+ bool affinity_hint_set;
};
struct virtio_fs_forget {
@@ -378,6 +382,44 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
schedule_work(&fsvq->done_work);
}
+static void virtio_fs_clean_affinity(struct virtio_fs *fs)
+{
+ int i;
+
+ if (fs->affinity_hint_set) {
+ for (i = 0; i < fs->num_queues; i++)
+ virtqueue_set_affinity(fs->vqs[i].vq, NULL);
+
+ fs->affinity_hint_set = false;
+ }
+}
+
+static void virtio_fs_set_affinity(struct virtio_fs *fs)
+{
+ int i = 0, cpu;
+
+ /*
+ * In single queue mode, we don't set the cpu affinity.
+ */
+ if (fs->num_queues == 1) {
+ virtio_fs_clean_affinity(fs);
+ fs->affinity_hint_set = false;
+ return;
+ }
+
+ /*
+ * In multiqueue mode, we let the queue to be private to one cpu
+ * by setting the affinity hint to eliminate the contention.
+ */
+ for_each_online_cpu(cpu) {
+ virtqueue_set_affinity(fs->vqs[i].vq, cpumask_of(cpu));
+ if (++i >= fs->num_queues)
+ break;
+ }
+
+ fs->affinity_hint_set = true;
+}
+
/* Initialize virtqueues */
static int virtio_fs_setup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
@@ -440,6 +482,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
fs->vqs[i].vq = vqs[i];
fs->vqs[i].connected = true;
}
+
+ /* set affinity for vqs */
+ get_online_cpus();
+ virtio_fs_set_affinity(fs);
+ put_online_cpus();
out:
kfree(names);
kfree(callbacks);
@@ -451,6 +498,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
{
+ virtio_fs_clean_affinity(fs);
vdev->config->del_vqs(vdev);
}
@@ -954,10 +1002,22 @@ static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req)
return ret;
}
+static unsigned virtio_fs_pick_vq_mq(struct virtio_fs *fs)
+{
+ unsigned queue_id;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ queue_id = (smp_processor_id() % fs->num_queues) + VQ_REQUEST;
+ local_irq_restore(flags);
+
+ return queue_id;
+}
+
static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
__releases(fiq->waitq.lock)
{
- unsigned queue_id = VQ_REQUEST; /* TODO multiqueue */
+ unsigned queue_id = VQ_REQUEST;
struct virtio_fs *fs;
struct fuse_conn *fc;
struct fuse_req *req;
@@ -972,6 +1032,8 @@ static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
spin_unlock(&fiq->waitq.lock);
fs = fiq->priv;
+ if (fs->num_queues > 1)
+ queue_id = virtio_fs_pick_vq_mq(fs);
fc = fs->vqs[queue_id].fud->fc;
dev_dbg(&fs->vqs[queue_id].vq->vdev->dev,
@@ -1066,9 +1128,11 @@ static int virtio_fs_fill_super(struct super_block *sb, char *opts,
err = -ENOMEM;
/* Allocate fuse_dev for hiprio and notification queues */
- for (i = 0; i < VQ_REQUEST; i++) {
+ for (i = 0; i < VQ_REQUEST + fs->num_queues; i++) {
struct virtio_fs_vq *fsvq = &fs->vqs[i];
+ if (i == VQ_REQUEST)
+ continue; /* will be allocated in fuse_fill_super_common */
fsvq->fud = fuse_dev_alloc();
if (!fsvq->fud)
goto err_free_fuse_devs;
--
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2019-08-09 6:04 [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity piaojun
@ 2019-08-16 5:57 ` piaojun
2019-08-21 15:38 ` Stefan Hajnoczi
1 sibling, 0 replies; 11+ messages in thread
From: piaojun @ 2019-08-16 5:57 UTC (permalink / raw)
To: virtio-fs
Ping, any comments on this patch?
On 2019/8/9 14:04, piaojun wrote:
> Set cpu affinity for each queue in multiqueue mode to improve the iops
> performance.
>
>>From my test, the iops is increased by adding multiqueues as below,
> but it has not achieved my expect yet due to some reason. So I'm
> considering if we could drop some locks when operating vq as it is
> binded to one vCPU. I'm very glad to have a discuss with other
> developers.
>
> Further more, I modified virtiofsd to support multiqueue which just for
> testing.
>
> Test Environment:
> Guest configuration:
> 8 vCPU
> 8GB RAM
> Linux 5.1 (vivek-aug-06-2019)
>
> Host configuration:
> Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (8 cores x 4 threads)
> 32GB RAM
> Linux 3.10.0
> EXT4 + 4G Ramdisk
>
> ---
> Single-queue:
> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
> ...
> fio-2.13
> Starting 8 processes
> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/316.5MB/0KB /s] [0/81.2K/0 iops] [eta 00m:00s]
> file: (groupid=0, jobs=8): err= 0: pid=5808: Fri Aug 9 20:35:22 2019
> write: io=9499.9MB, bw=324251KB/s, iops=81062, runt= 30001msec
>
> Multi-queues:
> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
> ...
> fio-2.13
> Starting 8 processes
> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/444.6MB/0KB /s] [0/114K/0 iops] [eta 00m:00s]
> file: (groupid=0, jobs=8): err= 0: pid=5704: Fri Aug 9 20:38:47 2019
> write: io=12967MB, bw=442582KB/s, iops=110645, runt= 30001msec
> ---
>
> Signed-off-by: Jun Piao <piaojun@huawei.com>
> ---
> fs/fuse/virtio_fs.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 66 insertions(+), 2 deletions(-)
>
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index a04c320..7ba36fc 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -12,6 +12,7 @@
> #include <linux/virtio.h>
> #include <linux/virtio_fs.h>
> #include <linux/delay.h>
> +#include <linux/cpu.h>
> #include "fuse_i.h"
>
> /* List of virtio-fs device instances and a lock for the list */
> @@ -61,6 +62,9 @@ struct virtio_fs {
> void *window_kaddr;
> phys_addr_t window_phys_addr;
> size_t window_len;
> +
> + /* Does the affinity hint is set for virtqueues? */
> + bool affinity_hint_set;
> };
>
> struct virtio_fs_forget {
> @@ -378,6 +382,44 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
> schedule_work(&fsvq->done_work);
> }
>
> +static void virtio_fs_clean_affinity(struct virtio_fs *fs)
> +{
> + int i;
> +
> + if (fs->affinity_hint_set) {
> + for (i = 0; i < fs->num_queues; i++)
> + virtqueue_set_affinity(fs->vqs[i].vq, NULL);
> +
> + fs->affinity_hint_set = false;
> + }
> +}
> +
> +static void virtio_fs_set_affinity(struct virtio_fs *fs)
> +{
> + int i = 0, cpu;
> +
> + /*
> + * In single queue mode, we don't set the cpu affinity.
> + */
> + if (fs->num_queues == 1) {
> + virtio_fs_clean_affinity(fs);
> + fs->affinity_hint_set = false;
> + return;
> + }
> +
> + /*
> + * In multiqueue mode, we let the queue to be private to one cpu
> + * by setting the affinity hint to eliminate the contention.
> + */
> + for_each_online_cpu(cpu) {
> + virtqueue_set_affinity(fs->vqs[i].vq, cpumask_of(cpu));
> + if (++i >= fs->num_queues)
> + break;
> + }
> +
> + fs->affinity_hint_set = true;
> +}
> +
> /* Initialize virtqueues */
> static int virtio_fs_setup_vqs(struct virtio_device *vdev,
> struct virtio_fs *fs)
> @@ -440,6 +482,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
> fs->vqs[i].vq = vqs[i];
> fs->vqs[i].connected = true;
> }
> +
> + /* set affinity for vqs */
> + get_online_cpus();
> + virtio_fs_set_affinity(fs);
> + put_online_cpus();
> out:
> kfree(names);
> kfree(callbacks);
> @@ -451,6 +498,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
> static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
> struct virtio_fs *fs)
> {
> + virtio_fs_clean_affinity(fs);
> vdev->config->del_vqs(vdev);
> }
>
> @@ -954,10 +1002,22 @@ static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req)
> return ret;
> }
>
> +static unsigned virtio_fs_pick_vq_mq(struct virtio_fs *fs)
> +{
> + unsigned queue_id;
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + queue_id = (smp_processor_id() % fs->num_queues) + VQ_REQUEST;
> + local_irq_restore(flags);
> +
> + return queue_id;
> +}
> +
> static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
> __releases(fiq->waitq.lock)
> {
> - unsigned queue_id = VQ_REQUEST; /* TODO multiqueue */
> + unsigned queue_id = VQ_REQUEST;
> struct virtio_fs *fs;
> struct fuse_conn *fc;
> struct fuse_req *req;
> @@ -972,6 +1032,8 @@ static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
> spin_unlock(&fiq->waitq.lock);
>
> fs = fiq->priv;
> + if (fs->num_queues > 1)
> + queue_id = virtio_fs_pick_vq_mq(fs);
> fc = fs->vqs[queue_id].fud->fc;
>
> dev_dbg(&fs->vqs[queue_id].vq->vdev->dev,
> @@ -1066,9 +1128,11 @@ static int virtio_fs_fill_super(struct super_block *sb, char *opts,
>
> err = -ENOMEM;
> /* Allocate fuse_dev for hiprio and notification queues */
> - for (i = 0; i < VQ_REQUEST; i++) {
> + for (i = 0; i < VQ_REQUEST + fs->num_queues; i++) {
> struct virtio_fs_vq *fsvq = &fs->vqs[i];
>
> + if (i == VQ_REQUEST)
> + continue; /* will be allocated in fuse_fill_super_common */
> fsvq->fud = fuse_dev_alloc();
> if (!fsvq->fud)
> goto err_free_fuse_devs;
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2019-08-09 6:04 [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity piaojun
2019-08-16 5:57 ` piaojun
@ 2019-08-21 15:38 ` Stefan Hajnoczi
2019-08-22 5:18 ` piaojun
2019-08-26 1:08 ` piaojun
1 sibling, 2 replies; 11+ messages in thread
From: Stefan Hajnoczi @ 2019-08-21 15:38 UTC (permalink / raw)
To: piaojun; +Cc: virtio-fs
[-- Attachment #1: Type: text/plain, Size: 6490 bytes --]
On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
> Set cpu affinity for each queue in multiqueue mode to improve the iops
> performance.
>
> >From my test, the iops is increased by adding multiqueues as below,
> but it has not achieved my expect yet due to some reason. So I'm
> considering if we could drop some locks when operating vq as it is
> binded to one vCPU. I'm very glad to have a discuss with other
> developers.
>
> Further more, I modified virtiofsd to support multiqueue which just for
> testing.
>
> Test Environment:
> Guest configuration:
> 8 vCPU
> 8GB RAM
> Linux 5.1 (vivek-aug-06-2019)
>
> Host configuration:
> Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (8 cores x 4 threads)
> 32GB RAM
> Linux 3.10.0
> EXT4 + 4G Ramdisk
>
> ---
> Single-queue:
> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
> ...
> fio-2.13
> Starting 8 processes
> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/316.5MB/0KB /s] [0/81.2K/0 iops] [eta 00m:00s]
> file: (groupid=0, jobs=8): err= 0: pid=5808: Fri Aug 9 20:35:22 2019
> write: io=9499.9MB, bw=324251KB/s, iops=81062, runt= 30001msec
>
> Multi-queues:
> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
> ...
> fio-2.13
> Starting 8 processes
> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/444.6MB/0KB /s] [0/114K/0 iops] [eta 00m:00s]
> file: (groupid=0, jobs=8): err= 0: pid=5704: Fri Aug 9 20:38:47 2019
> write: io=12967MB, bw=442582KB/s, iops=110645, runt= 30001msec
> ---
How does the same fio command-line perform on the host when bound to 8
CPUs?
What about the virtiofsd changes? Did you implement host CPU affinity
for the virtqueue processing threads and their workqueues?
I wonder if numbers are better if you use 8 files instead of 1 file.
> Signed-off-by: Jun Piao <piaojun@huawei.com>
> ---
> fs/fuse/virtio_fs.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 66 insertions(+), 2 deletions(-)
>
> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
> index a04c320..7ba36fc 100644
> --- a/fs/fuse/virtio_fs.c
> +++ b/fs/fuse/virtio_fs.c
> @@ -12,6 +12,7 @@
> #include <linux/virtio.h>
> #include <linux/virtio_fs.h>
> #include <linux/delay.h>
> +#include <linux/cpu.h>
> #include "fuse_i.h"
>
> /* List of virtio-fs device instances and a lock for the list */
> @@ -61,6 +62,9 @@ struct virtio_fs {
> void *window_kaddr;
> phys_addr_t window_phys_addr;
> size_t window_len;
> +
> + /* Does the affinity hint is set for virtqueues? */
> + bool affinity_hint_set;
> };
>
> struct virtio_fs_forget {
> @@ -378,6 +382,44 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
> schedule_work(&fsvq->done_work);
> }
>
> +static void virtio_fs_clean_affinity(struct virtio_fs *fs)
> +{
> + int i;
> +
> + if (fs->affinity_hint_set) {
> + for (i = 0; i < fs->num_queues; i++)
> + virtqueue_set_affinity(fs->vqs[i].vq, NULL);
> +
> + fs->affinity_hint_set = false;
> + }
> +}
> +
> +static void virtio_fs_set_affinity(struct virtio_fs *fs)
> +{
> + int i = 0, cpu;
> +
> + /*
> + * In single queue mode, we don't set the cpu affinity.
> + */
> + if (fs->num_queues == 1) {
> + virtio_fs_clean_affinity(fs);
> + fs->affinity_hint_set = false;
> + return;
> + }
> +
> + /*
> + * In multiqueue mode, we let the queue to be private to one cpu
> + * by setting the affinity hint to eliminate the contention.
> + */
> + for_each_online_cpu(cpu) {
> + virtqueue_set_affinity(fs->vqs[i].vq, cpumask_of(cpu));
> + if (++i >= fs->num_queues)
> + break;
> + }
> +
> + fs->affinity_hint_set = true;
> +}
> +
> /* Initialize virtqueues */
> static int virtio_fs_setup_vqs(struct virtio_device *vdev,
> struct virtio_fs *fs)
> @@ -440,6 +482,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
> fs->vqs[i].vq = vqs[i];
> fs->vqs[i].connected = true;
> }
> +
> + /* set affinity for vqs */
> + get_online_cpus();
> + virtio_fs_set_affinity(fs);
> + put_online_cpus();
> out:
> kfree(names);
> kfree(callbacks);
> @@ -451,6 +498,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
> static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
> struct virtio_fs *fs)
> {
> + virtio_fs_clean_affinity(fs);
> vdev->config->del_vqs(vdev);
> }
>
> @@ -954,10 +1002,22 @@ static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req)
> return ret;
> }
>
> +static unsigned virtio_fs_pick_vq_mq(struct virtio_fs *fs)
> +{
> + unsigned queue_id;
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + queue_id = (smp_processor_id() % fs->num_queues) + VQ_REQUEST;
> + local_irq_restore(flags);
> +
> + return queue_id;
> +}
> +
> static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
> __releases(fiq->waitq.lock)
> {
> - unsigned queue_id = VQ_REQUEST; /* TODO multiqueue */
> + unsigned queue_id = VQ_REQUEST;
> struct virtio_fs *fs;
> struct fuse_conn *fc;
> struct fuse_req *req;
> @@ -972,6 +1032,8 @@ static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
> spin_unlock(&fiq->waitq.lock);
>
> fs = fiq->priv;
> + if (fs->num_queues > 1)
> + queue_id = virtio_fs_pick_vq_mq(fs);
> fc = fs->vqs[queue_id].fud->fc;
>
> dev_dbg(&fs->vqs[queue_id].vq->vdev->dev,
> @@ -1066,9 +1128,11 @@ static int virtio_fs_fill_super(struct super_block *sb, char *opts,
>
> err = -ENOMEM;
> /* Allocate fuse_dev for hiprio and notification queues */
> - for (i = 0; i < VQ_REQUEST; i++) {
> + for (i = 0; i < VQ_REQUEST + fs->num_queues; i++) {
> struct virtio_fs_vq *fsvq = &fs->vqs[i];
>
> + if (i == VQ_REQUEST)
> + continue; /* will be allocated in fuse_fill_super_common */
> fsvq->fud = fuse_dev_alloc();
> if (!fsvq->fud)
> goto err_free_fuse_devs;
> --
>
> _______________________________________________
> Virtio-fs mailing list
> Virtio-fs@redhat.com
> https://www.redhat.com/mailman/listinfo/virtio-fs
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2019-08-21 15:38 ` Stefan Hajnoczi
@ 2019-08-22 5:18 ` piaojun
2019-08-26 1:08 ` piaojun
1 sibling, 0 replies; 11+ messages in thread
From: piaojun @ 2019-08-22 5:18 UTC (permalink / raw)
To: Stefan Hajnoczi; +Cc: virtio-fs
On 2019/8/21 23:38, Stefan Hajnoczi wrote:
> On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
>> Set cpu affinity for each queue in multiqueue mode to improve the iops
>> performance.
>>
>> >From my test, the iops is increased by adding multiqueues as below,
>> but it has not achieved my expect yet due to some reason. So I'm
>> considering if we could drop some locks when operating vq as it is
>> binded to one vCPU. I'm very glad to have a discuss with other
>> developers.
>>
>> Further more, I modified virtiofsd to support multiqueue which just for
>> testing.
>>
>> Test Environment:
>> Guest configuration:
>> 8 vCPU
>> 8GB RAM
>> Linux 5.1 (vivek-aug-06-2019)
>>
>> Host configuration:
>> Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (8 cores x 4 threads)
>> 32GB RAM
>> Linux 3.10.0
>> EXT4 + 4G Ramdisk
>>
>> ---
>> Single-queue:
>> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
>> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
>> ...
>> fio-2.13
>> Starting 8 processes
>> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/316.5MB/0KB /s] [0/81.2K/0 iops] [eta 00m:00s]
>> file: (groupid=0, jobs=8): err= 0: pid=5808: Fri Aug 9 20:35:22 2019
>> write: io=9499.9MB, bw=324251KB/s, iops=81062, runt= 30001msec
>>
>> Multi-queues:
>> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
>> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
>> ...
>> fio-2.13
>> Starting 8 processes
>> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/444.6MB/0KB /s] [0/114K/0 iops] [eta 00m:00s]
>> file: (groupid=0, jobs=8): err= 0: pid=5704: Fri Aug 9 20:38:47 2019
>> write: io=12967MB, bw=442582KB/s, iops=110645, runt= 30001msec
>> ---
>
> How does the same fio command-line perform on the host when bound to 8
> CPUs?
Not tested yet.
>
> What about the virtiofsd changes? Did you implement host CPU affinity
> for the virtqueue processing threads and their workqueues?
I just deleted the code that limits virtiofsd to a single request queue:
fv_queue_set_started()
if (qidx > 1) // delete this check
And I have not implemented host CPU affinity yet, but I'm interested in it.
>
> I wonder if numbers are better if you use 8 files instead of 1 file.
I will test as you suggest, and share the result again.
Jun
>
>> Signed-off-by: Jun Piao <piaojun@huawei.com>
>> ---
>> fs/fuse/virtio_fs.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>> 1 file changed, 66 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>> index a04c320..7ba36fc 100644
>> --- a/fs/fuse/virtio_fs.c
>> +++ b/fs/fuse/virtio_fs.c
>> @@ -12,6 +12,7 @@
>> #include <linux/virtio.h>
>> #include <linux/virtio_fs.h>
>> #include <linux/delay.h>
>> +#include <linux/cpu.h>
>> #include "fuse_i.h"
>>
>> /* List of virtio-fs device instances and a lock for the list */
>> @@ -61,6 +62,9 @@ struct virtio_fs {
>> void *window_kaddr;
>> phys_addr_t window_phys_addr;
>> size_t window_len;
>> +
>> + /* Does the affinity hint is set for virtqueues? */
>> + bool affinity_hint_set;
>> };
>>
>> struct virtio_fs_forget {
>> @@ -378,6 +382,44 @@ static void virtio_fs_vq_done(struct virtqueue *vq)
>> schedule_work(&fsvq->done_work);
>> }
>>
>> +static void virtio_fs_clean_affinity(struct virtio_fs *fs)
>> +{
>> + int i;
>> +
>> + if (fs->affinity_hint_set) {
>> + for (i = 0; i < fs->num_queues; i++)
>> + virtqueue_set_affinity(fs->vqs[i].vq, NULL);
>> +
>> + fs->affinity_hint_set = false;
>> + }
>> +}
>> +
>> +static void virtio_fs_set_affinity(struct virtio_fs *fs)
>> +{
>> + int i = 0, cpu;
>> +
>> + /*
>> + * In single queue mode, we don't set the cpu affinity.
>> + */
>> + if (fs->num_queues == 1) {
>> + virtio_fs_clean_affinity(fs);
>> + fs->affinity_hint_set = false;
>> + return;
>> + }
>> +
>> + /*
>> + * In multiqueue mode, we let the queue to be private to one cpu
>> + * by setting the affinity hint to eliminate the contention.
>> + */
>> + for_each_online_cpu(cpu) {
>> + virtqueue_set_affinity(fs->vqs[i].vq, cpumask_of(cpu));
>> + if (++i >= fs->num_queues)
>> + break;
>> + }
>> +
>> + fs->affinity_hint_set = true;
>> +}
>> +
>> /* Initialize virtqueues */
>> static int virtio_fs_setup_vqs(struct virtio_device *vdev,
>> struct virtio_fs *fs)
>> @@ -440,6 +482,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
>> fs->vqs[i].vq = vqs[i];
>> fs->vqs[i].connected = true;
>> }
>> +
>> + /* set affinity for vqs */
>> + get_online_cpus();
>> + virtio_fs_set_affinity(fs);
>> + put_online_cpus();
>> out:
>> kfree(names);
>> kfree(callbacks);
>> @@ -451,6 +498,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
>> static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
>> struct virtio_fs *fs)
>> {
>> + virtio_fs_clean_affinity(fs);
>> vdev->config->del_vqs(vdev);
>> }
>>
>> @@ -954,10 +1002,22 @@ static int virtio_fs_enqueue_req(struct virtqueue *vq, struct fuse_req *req)
>> return ret;
>> }
>>
>> +static unsigned virtio_fs_pick_vq_mq(struct virtio_fs *fs)
>> +{
>> + unsigned queue_id;
>> + unsigned long flags;
>> +
>> + local_irq_save(flags);
>> + queue_id = (smp_processor_id() % fs->num_queues) + VQ_REQUEST;
>> + local_irq_restore(flags);
>> +
>> + return queue_id;
>> +}
>> +
>> static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
>> __releases(fiq->waitq.lock)
>> {
>> - unsigned queue_id = VQ_REQUEST; /* TODO multiqueue */
>> + unsigned queue_id = VQ_REQUEST;
>> struct virtio_fs *fs;
>> struct fuse_conn *fc;
>> struct fuse_req *req;
>> @@ -972,6 +1032,8 @@ static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq)
>> spin_unlock(&fiq->waitq.lock);
>>
>> fs = fiq->priv;
>> + if (fs->num_queues > 1)
>> + queue_id = virtio_fs_pick_vq_mq(fs);
>> fc = fs->vqs[queue_id].fud->fc;
>>
>> dev_dbg(&fs->vqs[queue_id].vq->vdev->dev,
>> @@ -1066,9 +1128,11 @@ static int virtio_fs_fill_super(struct super_block *sb, char *opts,
>>
>> err = -ENOMEM;
>> /* Allocate fuse_dev for hiprio and notification queues */
>> - for (i = 0; i < VQ_REQUEST; i++) {
>> + for (i = 0; i < VQ_REQUEST + fs->num_queues; i++) {
>> struct virtio_fs_vq *fsvq = &fs->vqs[i];
>>
>> + if (i == VQ_REQUEST)
>> + continue; /* will be allocated in fuse_fill_super_common */
>> fsvq->fud = fuse_dev_alloc();
>> if (!fsvq->fud)
>> goto err_free_fuse_devs;
>> --
>>
>> _______________________________________________
>> Virtio-fs mailing list
>> Virtio-fs@redhat.com
>> https://www.redhat.com/mailman/listinfo/virtio-fs
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2019-08-21 15:38 ` Stefan Hajnoczi
2019-08-22 5:18 ` piaojun
@ 2019-08-26 1:08 ` piaojun
2019-08-27 14:42 ` Stefan Hajnoczi
1 sibling, 1 reply; 11+ messages in thread
From: piaojun @ 2019-08-26 1:08 UTC (permalink / raw)
To: Stefan Hajnoczi; +Cc: virtio-fs
On 2019/8/21 23:38, Stefan Hajnoczi wrote:
> On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
>> Set cpu affinity for each queue in multiqueue mode to improve the iops
>> performance.
>>
>> >From my test, the iops is increased by adding multiqueues as below,
>> but it has not achieved my expect yet due to some reason. So I'm
>> considering if we could drop some locks when operating vq as it is
>> binded to one vCPU. I'm very glad to have a discuss with other
>> developers.
>>
>> Further more, I modified virtiofsd to support multiqueue which just for
>> testing.
>>
>> Test Environment:
>> Guest configuration:
>> 8 vCPU
>> 8GB RAM
>> Linux 5.1 (vivek-aug-06-2019)
>>
>> Host configuration:
>> Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (8 cores x 4 threads)
>> 32GB RAM
>> Linux 3.10.0
>> EXT4 + 4G Ramdisk
>>
>> ---
>> Single-queue:
>> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
>> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
>> ...
>> fio-2.13
>> Starting 8 processes
>> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/316.5MB/0KB /s] [0/81.2K/0 iops] [eta 00m:00s]
>> file: (groupid=0, jobs=8): err= 0: pid=5808: Fri Aug 9 20:35:22 2019
>> write: io=9499.9MB, bw=324251KB/s, iops=81062, runt= 30001msec
>>
>> Multi-queues:
>> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
>> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
>> ...
>> fio-2.13
>> Starting 8 processes
>> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/444.6MB/0KB /s] [0/114K/0 iops] [eta 00m:00s]
>> file: (groupid=0, jobs=8): err= 0: pid=5704: Fri Aug 9 20:38:47 2019
>> write: io=12967MB, bw=442582KB/s, iops=110645, runt= 30001msec
>> ---
>
> How does the same fio command-line perform on the host when bound to 8
> CPUs?
fio performs very well on the host side, so the bottleneck should be in virtiofsd.
---
Run status group 0 (all jobs):
WRITE: bw=12.7GiB/s (13.6GB/s), 12.7GiB/s-12.7GiB/s (13.6GB/s-13.6GB/s), io=381GiB (409GB), run=30001-30001msec
>
> What about the virtiofsd changes? Did you implement host CPU affinity
> for the virtqueue processing threads and their workqueues?
>
> I wonder if numbers are better if you use 8 files instead of 1 file.
>
I implemented host CPU affinity and redesigned the test case to use 8 files;
the results look better:
---
[global]
runtime=30
time_based
group_reporting
direct=1
bs=1M
size=1G
ioengine=libaio
rw=write
numjobs=8
iodepth=128
thread=1
[file1]
filename=/mnt/virtiofs/file1
numjobs=1
[file2]
filename=/mnt/virtiofs/file2
numjobs=1
[file3]
filename=/mnt/virtiofs/file3
numjobs=1
[file4]
filename=/mnt/virtiofs/file4
numjobs=1
[file5]
filename=/mnt/virtiofs/file5
numjobs=1
[file6]
filename=/mnt/virtiofs/file6
numjobs=1
[file7]
filename=/mnt/virtiofs/file7
numjobs=1
[file8]
filename=/mnt/virtiofs/file8
numjobs=1
Single-Queue:
Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/1594MB/0KB /s] [0/1594/0 iops] [eta 00m:00s]
file1: (groupid=0, jobs=8): err= 0: pid=6379: Mon Aug 26 16:24:10 2019
write: io=46676MB, bw=1555.6MB/s, iops=1555, runt= 30007msec
Multi-Queues(8):
Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/4064MB/0KB /s] [0/4064/0 iops] [eta 00m:00s]
file1: (groupid=0, jobs=8): err= 0: pid=5785: Mon Aug 26 16:26:46 2019
write: io=115421MB, bw=3847.2MB/s, iops=3847, runt= 30002msec
I wrote a draft patch for virtiofsd, but the sandbox makes it hard to
set affinity for each vq, as _SC_NPROCESSORS_ONLN always equals 1. So I
just removed the related code for testing. Maybe we could create a
thread pool before setup_sandbox(), or find some other effective way. I'm
glad to help find a solution.
Thanks,
Jun
---
contrib/virtiofsd/fuse_virtio.c | 23 ++++++++++++++++++-----
contrib/virtiofsd/passthrough_ll.c | 4 ++--
contrib/virtiofsd/seccomp.c | 2 ++
3 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/contrib/virtiofsd/fuse_virtio.c b/contrib/virtiofsd/fuse_virtio.c
index bd50723..efc4ba7 100644
--- a/contrib/virtiofsd/fuse_virtio.c
+++ b/contrib/virtiofsd/fuse_virtio.c
@@ -748,8 +748,11 @@ static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
{
struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
struct fv_QueueInfo *ourqi;
+ cpu_set_t mask;
+ int num = sysconf(_SC_NPROCESSORS_ONLN);
- fuse_info("%s: qidx=%d started=%d\n", __func__, qidx, started);
+ fuse_info("%s: nqueues %lu, qidx=%d, started=%d, cpunum %d\n",
+ __func__, vud->nqueues, qidx, started, num);
assert(qidx>=0);
/*
@@ -759,9 +762,9 @@ static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
* races yet.
*/
if (qidx > 1) {
- fuse_err("%s: multiple request queues not yet implemented, please only configure 1 request queue\n",
- __func__);
- exit(EXIT_FAILURE);
+ //fuse_err("%s: multiple request queues not yet implemented, please only configure 1 request queue\n",
+ // __func__);
+ //exit(EXIT_FAILURE);
}
if (started) {
@@ -798,6 +801,16 @@ static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
__func__, qidx);
assert(0);
}
+ if (qidx > 0) {
+ fuse_info("%s: thread[%ld], set CPU[%d] affinity for vq[%d]\n", __func__, ourqi->thread, qidx, qidx);
+ /* set CPU affinity for vqs */
+ CPU_ZERO(&mask);
+ CPU_SET(qidx, &mask);
+ if (pthread_setaffinity_np(ourqi->thread, sizeof(mask), &mask) < 0) {
+ fuse_err("%s: Failed to setaffinity for vq[%d]\n", __func__, qidx);
+ assert(0);
+ }
+ }
} else {
int ret;
assert(qidx < vud->nqueues);
@@ -962,7 +975,7 @@ int virtio_session_mount(struct fuse_session *se)
se->virtio_dev = calloc(sizeof(struct fv_VuDev), 1);
se->virtio_dev->se = se;
pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
- vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd,
+ vu_init(&se->virtio_dev->dev, 16, se->vu_socketfd,
fv_panic,
fv_set_watch, fv_remove_watch,
&fv_iface);
diff --git a/contrib/virtiofsd/passthrough_ll.c b/contrib/virtiofsd/passthrough_ll.c
index ca11764..7eabe73 100644
--- a/contrib/virtiofsd/passthrough_ll.c
+++ b/contrib/virtiofsd/passthrough_ll.c
@@ -2773,7 +2773,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root)
int fd, res;
struct stat stat;
- fd = open("/", O_PATH);
+ fd = open(lo->source, O_PATH);
if (fd == -1)
err(1, "open(%s, O_PATH)", lo->source);
@@ -2990,7 +2990,7 @@ int main(int argc, char *argv[])
/* Must be after daemonize to get the right /proc/self/fd */
setup_proc_self_fd(&lo);
- setup_sandbox(&lo, opts.syslog);
+ //setup_sandbox(&lo, opts.syslog);
setup_root(&lo, &lo.root);
diff --git a/contrib/virtiofsd/seccomp.c b/contrib/virtiofsd/seccomp.c
index 3b92c6e..e9f0737 100644
--- a/contrib/virtiofsd/seccomp.c
+++ b/contrib/virtiofsd/seccomp.c
@@ -82,6 +82,8 @@ static const int syscall_whitelist[] = {
SCMP_SYS(writev),
SCMP_SYS(capget),
SCMP_SYS(capset),
+ SCMP_SYS(sched_setaffinity),
+ SCMP_SYS(sched_getaffinity),
};
/* Syscalls used when --syslog is enabled */
--
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2019-08-26 1:08 ` piaojun
@ 2019-08-27 14:42 ` Stefan Hajnoczi
2019-08-28 7:05 ` piaojun
0 siblings, 1 reply; 11+ messages in thread
From: Stefan Hajnoczi @ 2019-08-27 14:42 UTC (permalink / raw)
To: piaojun; +Cc: virtio-fs
[-- Attachment #1: Type: text/plain, Size: 4914 bytes --]
On Mon, Aug 26, 2019 at 09:08:20AM +0800, piaojun wrote:
> On 2019/8/21 23:38, Stefan Hajnoczi wrote:
> > On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
> >> Set cpu affinity for each queue in multiqueue mode to improve the iops
> >> performance.
> >>
> >> >From my test, the iops is increased by adding multiqueues as below,
> >> but it has not achieved my expect yet due to some reason. So I'm
> >> considering if we could drop some locks when operating vq as it is
> >> binded to one vCPU. I'm very glad to have a discuss with other
> >> developers.
> >>
> >> Further more, I modified virtiofsd to support multiqueue which just for
> >> testing.
> >>
> >> Test Environment:
> >> Guest configuration:
> >> 8 vCPU
> >> 8GB RAM
> >> Linux 5.1 (vivek-aug-06-2019)
> >>
> >> Host configuration:
> >> Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (8 cores x 4 threads)
> >> 32GB RAM
> >> Linux 3.10.0
> >> EXT4 + 4G Ramdisk
> >>
> >> ---
> >> Single-queue:
> >> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
> >> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
> >> ...
> >> fio-2.13
> >> Starting 8 processes
> >> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/316.5MB/0KB /s] [0/81.2K/0 iops] [eta 00m:00s]
> >> file: (groupid=0, jobs=8): err= 0: pid=5808: Fri Aug 9 20:35:22 2019
> >> write: io=9499.9MB, bw=324251KB/s, iops=81062, runt= 30001msec
> >>
> >> Multi-queues:
> >> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
> >> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
> >> ...
> >> fio-2.13
> >> Starting 8 processes
> >> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/444.6MB/0KB /s] [0/114K/0 iops] [eta 00m:00s]
> >> file: (groupid=0, jobs=8): err= 0: pid=5704: Fri Aug 9 20:38:47 2019
> >> write: io=12967MB, bw=442582KB/s, iops=110645, runt= 30001msec
> >> ---
> >
> > How does the same fio command-line perform on the host when bound to 8
> > CPUs?
>
> fio has great performance on host side, so the bottleneck should be at virtiofsd.
>
> ---
> Run status group 0 (all jobs):
> WRITE: bw=12.7GiB/s (13.6GB/s), 12.7GiB/s-12.7GiB/s (13.6GB/s-13.6GB/s), io=381GiB (409GB), run=30001-30001msec
Using just one file?
> >
> > What about the virtiofsd changes? Did you implement host CPU affinity
> > for the virtqueue processing threads and their workqueues?
> >
> > I wonder if numbers are better if you use 8 files instead of 1 file.
> >
> I implement host CPU affinity and re-design the testcase with 8 files,
> the result looks better:
>
> ---
> [global]
> runtime=30
> time_based
> group_reporting
> direct=1
> bs=1M
> size=1G
> ioengine=libaio
> rw=write
> numjobs=8
> iodepth=128
> thread=1
>
> [file1]
> filename=/mnt/virtiofs/file1
> numjobs=1
> [file2]
> filename=/mnt/virtiofs/file2
> numjobs=1
> [file3]
> filename=/mnt/virtiofs/file3
> numjobs=1
> [file4]
> filename=/mnt/virtiofs/file4
> numjobs=1
> [file5]
> filename=/mnt/virtiofs/file5
> numjobs=1
> [file6]
> filename=/mnt/virtiofs/file6
> numjobs=1
> [file7]
> filename=/mnt/virtiofs/file7
> numjobs=1
> [file8]
> filename=/mnt/virtiofs/file8
> numjobs=1
>
> Single-Queue:
> Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/1594MB/0KB /s] [0/1594/0 iops] [eta 00m:00s]
> file1: (groupid=0, jobs=8): err= 0: pid=6379: Mon Aug 26 16:24:10 2019
> write: io=46676MB, bw=1555.6MB/s, iops=1555, runt= 30007msec
The result improves greatly when using separate files. I wonder what
the bottleneck is, maybe serialization in the guest kernel?
>
> Multi-Queues(8):
> Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/4064MB/0KB /s] [0/4064/0 iops] [eta 00m:00s]
> file1: (groupid=0, jobs=8): err= 0: pid=5785: Mon Aug 26 16:26:46 2019
> write: io=115421MB, bw=3847.2MB/s, iops=3847, runt= 30002msec
>
> I write a draft patch for virtiofsd, but the sandbox make it hard to
> set affinity for each vq, as _SC_NPROCESSORS_ONLN always equals 1. So I
> just delete the related code for testing. Maybe we could create a
> thread pool before setup_sandbox() or some effective way. I'm glad to
> help finding out the solution.
Doing the setup before entering the sandbox sounds like a good idea.
That way the sandbox does not need to whitelist the required syscalls.
Will you add an option similar to:
--request-queues N
--request-queue-cpu-affinity N=CPU_A[,CPU_B][-CPU_C]
?
For example, with 2 request queues where queue#1 is bound to CPUs 0-4
and queue#2 is bound to CPUs 5, 6, and 8:
--request-queues 2
--request-queue-cpu-affinity 1=0-4
--request-queue-cpu-affinity 2=5,6,8
Stefan
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2019-08-27 14:42 ` Stefan Hajnoczi
@ 2019-08-28 7:05 ` piaojun
2019-08-28 11:39 ` Stefan Hajnoczi
2020-04-24 2:15 ` Eryu Guan
0 siblings, 2 replies; 11+ messages in thread
From: piaojun @ 2019-08-28 7:05 UTC (permalink / raw)
To: Stefan Hajnoczi; +Cc: virtio-fs
On 2019/8/27 22:42, Stefan Hajnoczi wrote:
> On Mon, Aug 26, 2019 at 09:08:20AM +0800, piaojun wrote:
>> On 2019/8/21 23:38, Stefan Hajnoczi wrote:
>>> On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
>>>> Set cpu affinity for each queue in multiqueue mode to improve the iops
>>>> performance.
>>>>
>>>> >From my test, the iops is increased by adding multiqueues as below,
>>>> but it has not achieved my expect yet due to some reason. So I'm
>>>> considering if we could drop some locks when operating vq as it is
>>>> binded to one vCPU. I'm very glad to have a discuss with other
>>>> developers.
>>>>
>>>> Further more, I modified virtiofsd to support multiqueue which just for
>>>> testing.
>>>>
>>>> Test Environment:
>>>> Guest configuration:
>>>> 8 vCPU
>>>> 8GB RAM
>>>> Linux 5.1 (vivek-aug-06-2019)
>>>>
>>>> Host configuration:
>>>> Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (8 cores x 4 threads)
>>>> 32GB RAM
>>>> Linux 3.10.0
>>>> EXT4 + 4G Ramdisk
>>>>
>>>> ---
>>>> Single-queue:
>>>> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
>>>> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
>>>> ...
>>>> fio-2.13
>>>> Starting 8 processes
>>>> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/316.5MB/0KB /s] [0/81.2K/0 iops] [eta 00m:00s]
>>>> file: (groupid=0, jobs=8): err= 0: pid=5808: Fri Aug 9 20:35:22 2019
>>>> write: io=9499.9MB, bw=324251KB/s, iops=81062, runt= 30001msec
>>>>
>>>> Multi-queues:
>>>> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
>>>> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
>>>> ...
>>>> fio-2.13
>>>> Starting 8 processes
>>>> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/444.6MB/0KB /s] [0/114K/0 iops] [eta 00m:00s]
>>>> file: (groupid=0, jobs=8): err= 0: pid=5704: Fri Aug 9 20:38:47 2019
>>>> write: io=12967MB, bw=442582KB/s, iops=110645, runt= 30001msec
>>>> ---
>>>
>>> How does the same fio command-line perform on the host when bound to 8
>>> CPUs?
>>
>> fio has great performance on host side, so the bottleneck should be at virtiofsd.
>>
>> ---
>> Run status group 0 (all jobs):
>> WRITE: bw=12.7GiB/s (13.6GB/s), 12.7GiB/s-12.7GiB/s (13.6GB/s-13.6GB/s), io=381GiB (409GB), run=30001-30001msec
>
> Using just one file?
Also 8 files.
>
>>>
>>> What about the virtiofsd changes? Did you implement host CPU affinity
>>> for the virtqueue processing threads and their workqueues?
>>>
>>> I wonder if numbers are better if you use 8 files instead of 1 file.
>>>
>> I implement host CPU affinity and re-design the testcase with 8 files,
>> the result looks better:
>>
>> ---
>> [global]
>> runtime=30
>> time_based
>> group_reporting
>> direct=1
>> bs=1M
>> size=1G
>> ioengine=libaio
>> rw=write
>> numjobs=8
>> iodepth=128
>> thread=1
>>
>> [file1]
>> filename=/mnt/virtiofs/file1
>> numjobs=1
>> [file2]
>> filename=/mnt/virtiofs/file2
>> numjobs=1
>> [file3]
>> filename=/mnt/virtiofs/file3
>> numjobs=1
>> [file4]
>> filename=/mnt/virtiofs/file4
>> numjobs=1
>> [file5]
>> filename=/mnt/virtiofs/file5
>> numjobs=1
>> [file6]
>> filename=/mnt/virtiofs/file6
>> numjobs=1
>> [file7]
>> filename=/mnt/virtiofs/file7
>> numjobs=1
>> [file8]
>> filename=/mnt/virtiofs/file8
>> numjobs=1
>>
>> Single-Queue:
>> Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/1594MB/0KB /s] [0/1594/0 iops] [eta 00m:00s]
>> file1: (groupid=0, jobs=8): err= 0: pid=6379: Mon Aug 26 16:24:10 2019
>> write: io=46676MB, bw=1555.6MB/s, iops=1555, runt= 30007msec
>
> The result improves greatly when using separate files. I wonder what
> the bottleneck is, maybe serialization in the guest kernel?
I ran the fio test cases again and found that the bottleneck is not
serialization in the guest kernel:
---
8 Files:
Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/1559MB/0KB /s] [0/1558/0 iops] [eta 00m:00s]
file1: (groupid=0, jobs=8): err= 0: pid=6540: Wed Aug 28 22:49:51 2019
write: io=46367MB, bw=1545.3MB/s, iops=1545, runt= 30006msec
Single File:
Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/1567MB/0KB /s] [0/1566/0 iops] [eta 00m:00s]
file1: (groupid=0, jobs=8): err= 0: pid=6569: Wed Aug 28 22:50:33 2019
write: io=47315MB, bw=1576.9MB/s, iops=1576, runt= 30006msec
>
>>
>> Multi-Queues(8):
>> Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/4064MB/0KB /s] [0/4064/0 iops] [eta 00m:00s]
>> file1: (groupid=0, jobs=8): err= 0: pid=5785: Mon Aug 26 16:26:46 2019
>> write: io=115421MB, bw=3847.2MB/s, iops=3847, runt= 30002msec
>>
>> I write a draft patch for virtiofsd, but the sandbox make it hard to
>> set affinity for each vq, as _SC_NPROCESSORS_ONLN always equals 1. So I
>> just delete the related code for testing. Maybe we could create a
>> thread pool before setup_sandbox() or some effective way. I'm glad to
>> help finding out the solution.
>
> Doing the setup before entering the sandbox sounds like a good idea.
> That way the sandbox does not need to whitelist the required syscalls.
>
> Will you add an option similar to:
>
> --request-queues N
> --request-queue-cpu-affinity N=CPU_A[,CPU_B][-CPU_C]
>
> ?
I'm writing the multi-queue code for virtiofsd according to your
suggestion, but the final shape may look a bit different.
Thanks,
Jun
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2019-08-28 7:05 ` piaojun
@ 2019-08-28 11:39 ` Stefan Hajnoczi
2020-04-24 2:15 ` Eryu Guan
1 sibling, 0 replies; 11+ messages in thread
From: Stefan Hajnoczi @ 2019-08-28 11:39 UTC (permalink / raw)
To: piaojun; +Cc: virtio-fs
[-- Attachment #1: Type: text/plain, Size: 4642 bytes --]
On Wed, Aug 28, 2019 at 03:05:01PM +0800, piaojun wrote:
>
>
> On 2019/8/27 22:42, Stefan Hajnoczi wrote:
> > On Mon, Aug 26, 2019 at 09:08:20AM +0800, piaojun wrote:
> >> On 2019/8/21 23:38, Stefan Hajnoczi wrote:
> >>> On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
> >>>> Set cpu affinity for each queue in multiqueue mode to improve the iops
> >>>> performance.
> >>>>
> >>>> >From my test, the iops is increased by adding multiqueues as below,
> >>>> but it has not achieved my expect yet due to some reason. So I'm
> >>>> considering if we could drop some locks when operating vq as it is
> >>>> binded to one vCPU. I'm very glad to have a discuss with other
> >>>> developers.
> >>>>
> >>>> Further more, I modified virtiofsd to support multiqueue which just for
> >>>> testing.
> >>>>
> >>>> Test Environment:
> >>>> Guest configuration:
> >>>> 8 vCPU
> >>>> 8GB RAM
> >>>> Linux 5.1 (vivek-aug-06-2019)
> >>>>
> >>>> Host configuration:
> >>>> Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (8 cores x 4 threads)
> >>>> 32GB RAM
> >>>> Linux 3.10.0
> >>>> EXT4 + 4G Ramdisk
> >>>>
> >>>> ---
> >>>> Single-queue:
> >>>> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
> >>>> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
> >>>> ...
> >>>> fio-2.13
> >>>> Starting 8 processes
> >>>> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/316.5MB/0KB /s] [0/81.2K/0 iops] [eta 00m:00s]
> >>>> file: (groupid=0, jobs=8): err= 0: pid=5808: Fri Aug 9 20:35:22 2019
> >>>> write: io=9499.9MB, bw=324251KB/s, iops=81062, runt= 30001msec
> >>>>
> >>>> Multi-queues:
> >>>> # fio -direct=1 -time_based -iodepth=128 -rw=randwrite -ioengine=libaio -bs=4k -size=1G -numjob=8 -runtime=30 -group_reporting -name=file -filename=/mnt/virtiofs/file
> >>>> file: (g=0): rw=randwrite, bs=4K-4K/4K-4K/4K-4K, ioengine=libaio, iodepth=128
> >>>> ...
> >>>> fio-2.13
> >>>> Starting 8 processes
> >>>> Jobs: 8 (f=8): [w(8)] [100.0% done] [0KB/444.6MB/0KB /s] [0/114K/0 iops] [eta 00m:00s]
> >>>> file: (groupid=0, jobs=8): err= 0: pid=5704: Fri Aug 9 20:38:47 2019
> >>>> write: io=12967MB, bw=442582KB/s, iops=110645, runt= 30001msec
> >>>> ---
> >>>
> >>> How does the same fio command-line perform on the host when bound to 8
> >>> CPUs?
> >>
> >> fio has great performance on host side, so the bottleneck should be at virtiofsd.
> >>
> >> ---
> >> Run status group 0 (all jobs):
> >> WRITE: bw=12.7GiB/s (13.6GB/s), 12.7GiB/s-12.7GiB/s (13.6GB/s-13.6GB/s), io=381GiB (409GB), run=30001-30001msec
> >
> > Using just one file?
>
> Also 8 files.
Great, this makes me a little happier :). I was worried that the host
could achieve very high throughput on just a single file while the guest
cannot.
> I run the fio testcases again, and find out the bottleneck is not the
> serialization in guest kernel:
>
> ---
> 8 Files:
> Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/1559MB/0KB /s] [0/1558/0 iops] [eta 00m:00s]
> file1: (groupid=0, jobs=8): err= 0: pid=6540: Wed Aug 28 22:49:51 2019
> write: io=46367MB, bw=1545.3MB/s, iops=1545, runt= 30006msec
>
> Single File:
> Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/1567MB/0KB /s] [0/1566/0 iops] [eta 00m:00s]
> file1: (groupid=0, jobs=8): err= 0: pid=6569: Wed Aug 28 22:50:33 2019
> write: io=47315MB, bw=1576.9MB/s, iops=1576, runt= 30006msec
Also good news.
> >> Multi-Queues(8):
> >> Jobs: 8 (f=8): [W(8)] [100.0% done] [0KB/4064MB/0KB /s] [0/4064/0 iops] [eta 00m:00s]
> >> file1: (groupid=0, jobs=8): err= 0: pid=5785: Mon Aug 26 16:26:46 2019
> >> write: io=115421MB, bw=3847.2MB/s, iops=3847, runt= 30002msec
> >>
> >> I write a draft patch for virtiofsd, but the sandbox make it hard to
> >> set affinity for each vq, as _SC_NPROCESSORS_ONLN always equals 1. So I
> >> just delete the related code for testing. Maybe we could create a
> >> thread pool before setup_sandbox() or some effective way. I'm glad to
> >> help finding out the solution.
> >
> > Doing the setup before entering the sandbox sounds like a good idea.
> > That way the sandbox does not need to whitelist the required syscalls.
> >
> > Will you add an option similar to:
> >
> > --request-queues N
> > --request-queue-cpu-affinity N=CPU_A[,CPU_B][-CPU_C]
> >
> > ?
>
> I'm writing the multi-queue code for virtiofsd according to your
> suggestion, but the final shape may look a bit different.
Excellent, I look forward to your patches.
Stefan
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2019-08-28 7:05 ` piaojun
2019-08-28 11:39 ` Stefan Hajnoczi
@ 2020-04-24 2:15 ` Eryu Guan
2020-04-26 2:12 ` [Virtio-fs] 答复: " piaojun
1 sibling, 1 reply; 11+ messages in thread
From: Eryu Guan @ 2020-04-24 2:15 UTC (permalink / raw)
To: piaojun; +Cc: virtio-fs
Hi Jun,
On Wed, Aug 28, 2019 at 03:05:01PM +0800, piaojun wrote:
>
>
> On 2019/8/27 22:42, Stefan Hajnoczi wrote:
> > On Mon, Aug 26, 2019 at 09:08:20AM +0800, piaojun wrote:
> >> On 2019/8/21 23:38, Stefan Hajnoczi wrote:
> >>> On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
> >>>> Set cpu affinity for each queue in multiqueue mode to improve the iops
> >>>> performance.
[snip]
> >
> > Doing the setup before entering the sandbox sounds like a good idea.
> > That way the sandbox does not need to whitelist the required syscalls.
> >
> > Will you add an option similar to:
> >
> > --request-queues N
> > --request-queue-cpu-affinity N=CPU_A[,CPU_B][-CPU_C]
> >
> > ?
>
> I'm writing the multi-queue code for virtiofsd according to your
> suggestion, but the final shape may look a bit different.
Just curious, what's the status of this multi-queue support? I'd like to
try it out :)
Thanks,
Eryu
^ permalink raw reply [flat|nested] 11+ messages in thread
* [Virtio-fs] 答复: [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2020-04-24 2:15 ` Eryu Guan
@ 2020-04-26 2:12 ` piaojun
2020-04-29 11:39 ` Stefan Hajnoczi
0 siblings, 1 reply; 11+ messages in thread
From: piaojun @ 2020-04-26 2:12 UTC (permalink / raw)
To: Eryu Guan; +Cc: virtio-fs@redhat.com
-----邮件原件-----
发件人: Eryu Guan [mailto:eguan@linux.alibaba.com]
发送时间: 2020年4月24日 10:15
收件人: piaojun <piaojun@huawei.com>
抄送: Stefan Hajnoczi <stefanha@redhat.com>; virtio-fs@redhat.com
主题: Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
Hi Jun,
On Wed, Aug 28, 2019 at 03:05:01PM +0800, piaojun wrote:
>
>
> On 2019/8/27 22:42, Stefan Hajnoczi wrote:
> > On Mon, Aug 26, 2019 at 09:08:20AM +0800, piaojun wrote:
> >> On 2019/8/21 23:38, Stefan Hajnoczi wrote:
> >>> On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
> >>>> Set cpu affinity for each queue in multiqueue mode to improve the
> >>>> iops performance.
[snip]
> >
> > Doing the setup before entering the sandbox sounds like a good idea.
> > That way the sandbox does not need to whitelist the required syscalls.
> >
> > Will you add an option similar to:
> >
> > --request-queues N
> > --request-queue-cpu-affinity N=CPU_A[,CPU_B][-CPU_C]
> >
> > ?
>
> I'm writing the multi-queue code for virtiofsd according to your
> suggestion, but the final shape may look a bit different.
Just curious, what's the status of this multi-queue support? I'd like to try it out :)
OK, I wrote a demo a few months ago, but it showed little performance improvement even with CPU affinity set.
So, I'd be glad for you to try it.
Jun
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [Virtio-fs] 答复: [PATCH][RFC] Support multiqueue mode by setting cpu affinity
2020-04-26 2:12 ` [Virtio-fs] 答复: " piaojun
@ 2020-04-29 11:39 ` Stefan Hajnoczi
0 siblings, 0 replies; 11+ messages in thread
From: Stefan Hajnoczi @ 2020-04-29 11:39 UTC (permalink / raw)
To: piaojun; +Cc: virtio-fs@redhat.com
[-- Attachment #1: Type: text/plain, Size: 2012 bytes --]
On Sun, Apr 26, 2020 at 02:12:15AM +0000, piaojun wrote:
>
>
> -----邮件原件-----
> 发件人: Eryu Guan [mailto:eguan@linux.alibaba.com]
> 发送时间: 2020年4月24日 10:15
> 收件人: piaojun <piaojun@huawei.com>
> 抄送: Stefan Hajnoczi <stefanha@redhat.com>; virtio-fs@redhat.com
> 主题: Re: [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity
>
> Hi Jun,
>
> On Wed, Aug 28, 2019 at 03:05:01PM +0800, piaojun wrote:
> >
> >
> > On 2019/8/27 22:42, Stefan Hajnoczi wrote:
> > > On Mon, Aug 26, 2019 at 09:08:20AM +0800, piaojun wrote:
> > >> On 2019/8/21 23:38, Stefan Hajnoczi wrote:
> > >>> On Fri, Aug 09, 2019 at 02:04:54PM +0800, piaojun wrote:
> > >>>> Set cpu affinity for each queue in multiqueue mode to improve the
> > >>>> iops performance.
>
> [snip]
>
> > >
> > > Doing the setup before entering the sandbox sounds like a good idea.
> > > That way the sandbox does not need to whitelist the required syscalls.
> > >
> > > Will you add an option similar to:
> > >
> > > --request-queues N
> > > --request-queue-cpu-affinity N=CPU_A[,CPU_B][-CPU_C]
> > >
> > > ?
> >
> > I'm writing the multi-queue code for virtiofsd according to your
> > suggestion, but the final shape may look a bit different.
>
> Just curious, what's the status of this multi-queue support? I'd like to try it out :)
>
> OK, I wrote a demo a few months ago, but it has little performance promote even setting CPU affinity.
> So, I'm glad you can try it.
Do you want to send RFC patches to virtio-fs@redhat.com so they can be
discussed?
A bug in the virtiofs.ko guest driver that prevented multiqueue from
working has just been discussed:
https://www.redhat.com/archives/virtio-fs/2020-April/msg00030.html
or
https://www.redhat.com/archives/virtio-fs/2020-April/msg00021.html
There is also discussion about multiqueue and performance here:
https://www.redhat.com/archives/virtio-fs/2020-April/msg00026.html
Stefan
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2020-04-29 11:39 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-09 6:04 [Virtio-fs] [PATCH][RFC] Support multiqueue mode by setting cpu affinity piaojun
2019-08-16 5:57 ` piaojun
2019-08-21 15:38 ` Stefan Hajnoczi
2019-08-22 5:18 ` piaojun
2019-08-26 1:08 ` piaojun
2019-08-27 14:42 ` Stefan Hajnoczi
2019-08-28 7:05 ` piaojun
2019-08-28 11:39 ` Stefan Hajnoczi
2020-04-24 2:15 ` Eryu Guan
2020-04-26 2:12 ` [Virtio-fs] 答复: " piaojun
2020-04-29 11:39 ` Stefan Hajnoczi
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.