* [patch 1/3 v2] MD: add a specific workqueue to do dispatch
2012-05-28 1:13 [patch 0/3 v2] MD: improve raid1/10 write performance for fast storage Shaohua Li
@ 2012-05-28 1:13 ` Shaohua Li
2012-05-28 1:13 ` [patch 2/3 v2] raid1: percpu dispatch for write request if bitmap supported Shaohua Li
2012-05-28 1:13 ` [patch 3/3 v2] raid10: " Shaohua Li
2 siblings, 0 replies; 4+ messages in thread
From: Shaohua Li @ 2012-05-28 1:13 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: md-workqueue.patch --]
[-- Type: text/plain, Size: 2510 bytes --]
Add a dedicated workqueue for dispatching requests. Later patches will use it
to do per-cpu request dispatch.
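For illustration, a minimal sketch of how a personality added in a later patch
might use the helper; the struct and function names below are made up, only
md_schedule_work_on() comes from this patch:

	/* Hypothetical per-cpu dispatch context (names illustrative only). */
	struct example_ctx {
		struct bio_list bios;
		struct work_struct work;	/* runs the actual submission */
	};

	static void example_kick(struct example_ctx __percpu *ctxs, int cpu)
	{
		struct example_ctx *ctx = per_cpu_ptr(ctxs, cpu);

		/* Queue the dispatch work on the md_run workqueue, pinned to 'cpu'. */
		md_schedule_work_on(cpu, &ctx->work);
	}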
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/md.c | 14 ++++++++++++++
drivers/md/md.h | 1 +
2 files changed, 15 insertions(+)
Index: linux/drivers/md/md.c
===================================================================
--- linux.orig/drivers/md/md.c 2012-05-24 15:42:56.892251685 +0800
+++ linux/drivers/md/md.c 2012-05-24 15:44:55.618759412 +0800
@@ -71,6 +71,7 @@ static void md_print_devices(void);
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
+static struct workqueue_struct *md_run_wq;
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
@@ -8446,6 +8447,12 @@ static void md_geninit(void)
proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}
+int md_schedule_work_on(int cpu, struct work_struct *work)
+{
+ return queue_work_on(cpu, md_run_wq, work);
+}
+EXPORT_SYMBOL(md_schedule_work_on);
+
static int __init md_init(void)
{
int ret = -ENOMEM;
@@ -8458,6 +8465,10 @@ static int __init md_init(void)
if (!md_misc_wq)
goto err_misc_wq;
+ md_run_wq = alloc_workqueue("md_run", WQ_MEM_RECLAIM, 0);
+ if (!md_run_wq)
+ goto err_run_wq;
+
if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
goto err_md;
@@ -8479,6 +8490,8 @@ static int __init md_init(void)
err_mdp:
unregister_blkdev(MD_MAJOR, "md");
err_md:
+ destroy_workqueue(md_run_wq);
+err_run_wq:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
@@ -8571,6 +8584,7 @@ static __exit void md_exit(void)
export_array(mddev);
mddev->hold_active = 0;
}
+ destroy_workqueue(md_run_wq);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
}
Index: linux/drivers/md/md.h
===================================================================
--- linux.orig/drivers/md/md.h 2012-05-24 15:42:56.896251635 +0800
+++ linux/drivers/md/md.h 2012-05-24 15:44:55.622759274 +0800
@@ -616,6 +616,7 @@ extern int md_integrity_register(struct
extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void restore_bitmap_write_access(struct file *file);
+extern int md_schedule_work_on(int cpu, struct work_struct *work);
extern void mddev_init(struct mddev *mddev);
extern int md_run(struct mddev *mddev);
* [patch 2/3 v2] raid1: percpu dispatch for write request if bitmap supported
2012-05-28 1:13 [patch 0/3 v2] MD: improve raid1/10 write performance for fast storage Shaohua Li
2012-05-28 1:13 ` [patch 1/3 v2] MD: add a specific workqueue to do dispatch Shaohua Li
@ 2012-05-28 1:13 ` Shaohua Li
2012-05-28 1:13 ` [patch 3/3 v2] raid10: " Shaohua Li
2 siblings, 0 replies; 4+ messages in thread
From: Shaohua Li @ 2012-05-28 1:13 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: raid1-write-percpulist.patch --]
[-- Type: text/plain, Size: 6584 bytes --]
In raid1, all write requests are dispatched by the raid1d thread. On fast
storage, the raid1d thread becomes a bottleneck because it cannot dispatch
requests quickly enough. The raid1d thread also migrates freely, so the CPU
completing a request does not match the CPU that submitted it, even when the
driver/block layer supports such affinity. This causes poor cache behavior.
If bitmap support is enabled, write requests can only be dispatched after the
dirty bitmap has been flushed out. Once the bitmap is flushed, how the write
requests are dispatched does not affect correctness, so a natural idea is to
spread the dispatching across several threads. With this patch, requests are
first added to a per-cpu list; after the bitmap is flushed, each per-cpu list
is dispatched from a workqueue. This removes the bottleneck described above.
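A rough sketch of the resulting flush flow, simplified from the code in this
patch (locking, plugging and the offline-CPU fallback are omitted):

	static void flush_pending_writes_sketch(struct r1conf *conf)
	{
		int cpu;

		/* 1. Move each CPU's pending bios to its temporary list. */
		for_each_present_cpu(cpu) {
			struct write_list *list = per_cpu_ptr(conf->write_list, cpu);
			bio_list_merge(&list->tmp_bio_list, &list->pending_bio_list);
			bio_list_init(&list->pending_bio_list);
		}

		/* 2. Flush the dirty bitmap once for everything collected so far. */
		bitmap_unplug(conf->mddev->bitmap);

		/* 3. Hand each CPU's bios to its worker for actual submission. */
		for_each_present_cpu(cpu) {
			struct write_list *list = per_cpu_ptr(conf->write_list, cpu);
			bio_list_merge(&list->running_bio_list, &list->tmp_bio_list);
			bio_list_init(&list->tmp_bio_list);
			md_schedule_work_on(cpu, &list->work);
		}
	}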
In a 4k random-write test on a 2-disk setup, this patch provides a 10% ~ 50%
performance improvement depending on NUMA binding.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid1.c | 97 +++++++++++++++++++++++++++++++++++++++++++----------
drivers/md/raid1.h | 10 ++++-
2 files changed, 88 insertions(+), 19 deletions(-)
Index: linux/drivers/md/raid1.h
===================================================================
--- linux.orig/drivers/md/raid1.h 2012-05-24 15:41:58.940980194 +0800
+++ linux/drivers/md/raid1.h 2012-05-24 15:45:03.074656334 +0800
@@ -22,6 +22,14 @@ struct pool_info {
int raid_disks;
};
+struct write_list {
+ struct bio_list pending_bio_list;
+ struct bio_list running_bio_list;
+ struct bio_list tmp_bio_list;
+ struct work_struct work;
+ struct r1conf *conf;
+};
+
struct r1conf {
struct mddev *mddev;
struct mirror_info *mirrors; /* twice 'raid_disks' to
@@ -50,7 +58,7 @@ struct r1conf {
struct list_head retry_list;
/* queue pending writes to be submitted on unplug */
- struct bio_list pending_bio_list;
+ struct write_list __percpu *write_list;
int pending_count;
/* for use when syncing mirrors:
Index: linux/drivers/md/raid1.c
===================================================================
--- linux.orig/drivers/md/raid1.c 2012-05-24 15:42:56.896251635 +0800
+++ linux/drivers/md/raid1.c 2012-05-24 15:45:03.074656334 +0800
@@ -687,22 +687,21 @@ static int raid1_congested(void *data, i
md_raid1_congested(mddev, bits);
}
-static void flush_pending_writes(struct r1conf *conf)
+static void raid1_write_work(struct work_struct *work)
{
- /* Any writes that have been queued but are awaiting
- * bitmap updates get flushed here.
- */
- spin_lock_irq(&conf->device_lock);
+ struct write_list *list = container_of(work, struct write_list, work);
+ struct bio *bio;
+ struct blk_plug plug;
+ bool try_again = true;
- if (conf->pending_bio_list.head) {
- struct bio *bio;
- bio = bio_list_get(&conf->pending_bio_list);
- conf->pending_count = 0;
- spin_unlock_irq(&conf->device_lock);
- /* flush any pending bitmap writes to
- * disk before proceeding w/ I/O */
- bitmap_unplug(conf->mddev->bitmap);
- wake_up(&conf->wait_barrier);
+ blk_start_plug(&plug);
+
+ while (try_again) {
+ spin_lock_irq(&list->conf->device_lock);
+ bio = bio_list_get(&list->running_bio_list);
+ spin_unlock_irq(&list->conf->device_lock);
+
+ try_again = (bio != NULL);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
@@ -710,8 +709,53 @@ static void flush_pending_writes(struct
generic_make_request(bio);
bio = next;
}
- } else
- spin_unlock_irq(&conf->device_lock);
+ }
+ blk_finish_plug(&plug);
+}
+
+static void flush_pending_writes(struct r1conf *conf)
+{
+ int c;
+ struct write_list *list;
+
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ */
+ spin_lock_irq(&conf->device_lock);
+
+ for_each_present_cpu(c) {
+ list = per_cpu_ptr(conf->write_list, c);
+ if (!bio_list_empty(&list->pending_bio_list)) {
+ bio_list_merge(&list->tmp_bio_list,
+ &list->pending_bio_list);
+ bio_list_init(&list->pending_bio_list);
+ }
+ }
+
+ conf->pending_count = 0;
+ spin_unlock_irq(&conf->device_lock);
+
+ /* flush any pending bitmap writes to disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ spin_lock_irq(&conf->device_lock);
+ for_each_present_cpu(c) {
+ list = per_cpu_ptr(conf->write_list, c);
+ if (!bio_list_empty(&list->tmp_bio_list)) {
+ bio_list_merge(&list->running_bio_list,
+ &list->tmp_bio_list);
+ bio_list_init(&list->tmp_bio_list);
+ if (likely(cpu_online(c)))
+ md_schedule_work_on(c, &list->work);
+ else {
+ int cpu = cpumask_any(cpu_online_mask);
+ md_schedule_work_on(cpu, &list->work);
+ }
+ }
+ }
+
+ spin_unlock_irq(&conf->device_lock);
}
/* Barriers....
@@ -1137,6 +1181,7 @@ read_again:
first_clone = 1;
for (i = 0; i < disks; i++) {
struct bio *mbio;
+ struct write_list *list;
if (!r1_bio->bios[i])
continue;
@@ -1188,7 +1233,8 @@ read_again:
atomic_inc(&r1_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_add(&conf->pending_bio_list, mbio);
+ list = this_cpu_ptr(conf->write_list);
+ bio_list_add(&list->pending_bio_list, mbio);
conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags);
}
@@ -2572,7 +2618,6 @@ static struct r1conf *setup_conf(struct
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
- bio_list_init(&conf->pending_bio_list);
conf->pending_count = 0;
conf->recovery_disabled = mddev->recovery_disabled - 1;
@@ -2617,6 +2662,19 @@ static struct r1conf *setup_conf(struct
goto abort;
}
err = -ENOMEM;
+
+ conf->write_list = alloc_percpu(struct write_list);
+ if (!conf->write_list)
+ goto abort;
+ for_each_present_cpu(i) {
+ struct write_list *list = per_cpu_ptr(conf->write_list, i);
+ bio_list_init(&list->pending_bio_list);
+ bio_list_init(&list->running_bio_list);
+ bio_list_init(&list->tmp_bio_list);
+ INIT_WORK(&list->work, raid1_write_work);
+ list->conf = conf;
+ }
+
conf->thread = md_register_thread(raid1d, mddev, NULL);
if (!conf->thread) {
printk(KERN_ERR
@@ -2629,6 +2687,7 @@ static struct r1conf *setup_conf(struct
abort:
if (conf) {
+ free_percpu(conf->write_list);
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
@@ -2735,6 +2794,8 @@ static int stop(struct mddev *mddev)
lower_barrier(conf);
md_unregister_thread(&mddev->thread);
+ free_percpu(conf->write_list);
+
if (conf->r1bio_pool)
mempool_destroy(conf->r1bio_pool);
kfree(conf->mirrors);
* [patch 3/3 v2] raid10: percpu dispatch for write request if bitmap supported
2012-05-28 1:13 [patch 0/3 v2] MD: improve raid1/10 write performance for fast storage Shaohua Li
2012-05-28 1:13 ` [patch 1/3 v2] MD: add a specific workqueue to do dispatch Shaohua Li
2012-05-28 1:13 ` [patch 2/3 v2] raid1: percpu dispatch for write request if bitmap supported Shaohua Li
@ 2012-05-28 1:13 ` Shaohua Li
2 siblings, 0 replies; 4+ messages in thread
From: Shaohua Li @ 2012-05-28 1:13 UTC (permalink / raw)
To: linux-raid; +Cc: neilb, axboe, shli
[-- Attachment #1: raid10-write-percpulist.patch --]
[-- Type: text/plain, Size: 6986 bytes --]
In raid10, all write requests are dispatched by the raid10d thread. On fast
storage, the raid10d thread becomes a bottleneck because it cannot dispatch
requests quickly enough. The raid10d thread also migrates freely, so the CPU
completing a request does not match the CPU that submitted it, even when the
driver/block layer supports such affinity. This causes poor cache behavior.
If bitmap support is enabled, write requests can only be dispatched after the
dirty bitmap has been flushed out. Once the bitmap is flushed, how the write
requests are dispatched does not affect correctness, so a natural idea is to
spread the dispatching across several threads. With this patch, requests are
first added to a per-cpu list; after the bitmap is flushed, each per-cpu list
is dispatched from a workqueue. This removes the bottleneck described above.
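The submission side is the same as in raid1: the make_request path queues each
mirrored write bio on the submitting CPU's pending list instead of the single
conf->pending_bio_list. A simplified sketch of that change:

	static void queue_write_bio_sketch(struct r10conf *conf, struct bio *mbio)
	{
		struct write_list *list;
		unsigned long flags;

		spin_lock_irqsave(&conf->device_lock, flags);
		/* Queue on the local CPU's list; flush_pending_writes collects it later. */
		list = this_cpu_ptr(conf->write_list);
		bio_list_add(&list->pending_bio_list, mbio);
		conf->pending_count++;
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}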
In a 4k random-write test on a 4-disk setup, this patch provides about a 95%
performance improvement depending on NUMA binding.
Signed-off-by: Shaohua Li <shli@fusionio.com>
---
drivers/md/raid10.c | 98 ++++++++++++++++++++++++++++++++++++++++++----------
drivers/md/raid10.h | 10 ++++-
2 files changed, 89 insertions(+), 19 deletions(-)
Index: linux/drivers/md/raid10.h
===================================================================
--- linux.orig/drivers/md/raid10.h 2012-05-24 15:56:50.053778924 +0800
+++ linux/drivers/md/raid10.h 2012-05-24 16:01:10.094489173 +0800
@@ -11,6 +11,14 @@ struct mirror_info {
*/
};
+struct write_list {
+ struct bio_list pending_bio_list;
+ struct bio_list running_bio_list;
+ struct bio_list tmp_bio_list;
+ struct work_struct work;
+ struct r10conf *conf;
+};
+
struct r10conf {
struct mddev *mddev;
struct mirror_info *mirrors;
@@ -49,7 +57,7 @@ struct r10conf {
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
- struct bio_list pending_bio_list;
+ struct write_list __percpu *write_list;
int pending_count;
spinlock_t resync_lock;
Index: linux/drivers/md/raid10.c
===================================================================
--- linux.orig/drivers/md/raid10.c 2012-05-24 15:56:50.065778328 +0800
+++ linux/drivers/md/raid10.c 2012-05-24 16:02:01.237859524 +0800
@@ -867,22 +867,21 @@ static int raid10_congested(void *data,
return ret;
}
-static void flush_pending_writes(struct r10conf *conf)
+static void raid10_write_work(struct work_struct *work)
{
- /* Any writes that have been queued but are awaiting
- * bitmap updates get flushed here.
- */
- spin_lock_irq(&conf->device_lock);
+ struct write_list *list = container_of(work, struct write_list, work);
+ struct bio *bio;
+ struct blk_plug plug;
+ bool try_again = true;
- if (conf->pending_bio_list.head) {
- struct bio *bio;
- bio = bio_list_get(&conf->pending_bio_list);
- conf->pending_count = 0;
- spin_unlock_irq(&conf->device_lock);
- /* flush any pending bitmap writes to disk
- * before proceeding w/ I/O */
- bitmap_unplug(conf->mddev->bitmap);
- wake_up(&conf->wait_barrier);
+ blk_start_plug(&plug);
+
+ while (try_again) {
+ spin_lock_irq(&list->conf->device_lock);
+ bio = bio_list_get(&list->running_bio_list);
+ spin_unlock_irq(&list->conf->device_lock);
+
+ try_again = (bio != NULL);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
@@ -890,8 +889,53 @@ static void flush_pending_writes(struct
generic_make_request(bio);
bio = next;
}
- } else
- spin_unlock_irq(&conf->device_lock);
+ }
+ blk_finish_plug(&plug);
+}
+
+static void flush_pending_writes(struct r10conf *conf)
+{
+ int c;
+ struct write_list *list;
+
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ */
+ spin_lock_irq(&conf->device_lock);
+
+ for_each_present_cpu(c) {
+ list = per_cpu_ptr(conf->write_list, c);
+ if (!bio_list_empty(&list->pending_bio_list)) {
+ bio_list_merge(&list->tmp_bio_list,
+ &list->pending_bio_list);
+ bio_list_init(&list->pending_bio_list);
+ }
+ }
+
+ conf->pending_count = 0;
+ spin_unlock_irq(&conf->device_lock);
+
+ /* flush any pending bitmap writes to disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ spin_lock_irq(&conf->device_lock);
+ for_each_present_cpu(c) {
+ list = per_cpu_ptr(conf->write_list, c);
+ if (!bio_list_empty(&list->tmp_bio_list)) {
+ bio_list_merge(&list->running_bio_list,
+ &list->tmp_bio_list);
+ bio_list_init(&list->tmp_bio_list);
+ if (likely(cpu_online(c)))
+ md_schedule_work_on(c, &list->work);
+ else {
+ int cpu = cpumask_any(cpu_online_mask);
+ md_schedule_work_on(cpu, &list->work);
+ }
+ }
+ }
+
+ spin_unlock_irq(&conf->device_lock);
}
/* Barriers....
@@ -1374,6 +1418,7 @@ retry_write:
for (i = 0; i < conf->copies; i++) {
struct bio *mbio;
+ struct write_list *list;
int d = r10_bio->devs[i].devnum;
if (!r10_bio->devs[i].bio)
continue;
@@ -1393,7 +1438,8 @@ retry_write:
atomic_inc(&r10_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_add(&conf->pending_bio_list, mbio);
+ list = this_cpu_ptr(conf->write_list);
+ bio_list_add(&list->pending_bio_list, mbio);
conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -1420,7 +1466,8 @@ retry_write:
atomic_inc(&r10_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
- bio_list_add(&conf->pending_bio_list, mbio);
+ list = this_cpu_ptr(conf->write_list);
+ bio_list_add(&list->pending_bio_list, mbio);
conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags);
}
@@ -3360,6 +3407,7 @@ static struct r10conf *setup_conf(struct
int err = -EINVAL;
struct geom geo;
int copies;
+ int cpu;
copies = setup_geo(&geo, mddev, geo_new);
@@ -3421,6 +3469,18 @@ static struct r10conf *setup_conf(struct
spin_lock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier);
+ conf->write_list = alloc_percpu(struct write_list);
+ if (!conf->write_list)
+ goto out;
+ for_each_present_cpu(cpu) {
+ struct write_list *list = per_cpu_ptr(conf->write_list, cpu);
+ bio_list_init(&list->pending_bio_list);
+ bio_list_init(&list->running_bio_list);
+ bio_list_init(&list->tmp_bio_list);
+ INIT_WORK(&list->work, raid10_write_work);
+ list->conf = conf;
+ }
+
conf->thread = md_register_thread(raid10d, mddev, NULL);
if (!conf->thread)
goto out;
@@ -3433,6 +3493,7 @@ static struct r10conf *setup_conf(struct
printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
mdname(mddev));
if (conf) {
+ free_percpu(conf->write_list);
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
@@ -3632,6 +3693,7 @@ static int stop(struct mddev *mddev)
md_unregister_thread(&mddev->thread);
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+ free_percpu(conf->write_list);
if (conf->r10bio_pool)
mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);