From: NeilBrown <neilb@suse.de>
To: "Yucong Sun (叶雨飞)" <sunyucong@gmail.com>
Cc: linux-raid@vger.kernel.org
Subject: Re: Raid10 and page cache
Date: Thu, 8 Dec 2011 11:10:09 +1100 [thread overview]
Message-ID: <20111208111009.6ca1ca19@notabene.brown> (raw)
In-Reply-To: <CAJygYd1+cVGeRwZXq_WLZr89Q9GX_yPEDvFBuagSG4D9neyoqQ@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 7881 bytes --]
On Wed, 7 Dec 2011 15:37:30 -0800 Yucong Sun (叶雨飞) <sunyucong@gmail.com>
wrote:
> Neil, I can't compile latest MD against 2.6.32, and that commit can't
> be patched into 2.6.32 directly either, can you help me on this?
>
This should do it.
NeilBrown
commit ef54b7cf955dc3b7d33248e8591b1a00b4fa998c
Author: NeilBrown <neilb@suse.de>
Date: Tue Oct 11 16:50:01 2011 +1100
md: add proper write-congestion reporting to RAID1 and RAID10.
RAID1 and RAID10 handle write requests by queuing them for handling by
a separate thread. This is because when a write-intent-bitmap is
active we might need to update the bitmap first, so it is good to
queue a lot of writes, then do one big bitmap update for them all.
However writeback requires devices to appear to be congested after a
while so it can make some guesstimate of throughput. The infinite
queue defeats that (note that RAID5 already has a finite queue so
it doesn't suffer from this problem).
So impose a limit on the number of pending write requests. By default
it is 1024 which seems to be generally suitable. Make it configurable
via module option just in case someone finds a regression.
Signed-off-by: NeilBrown <neilb@suse.de>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e07ce2e..fe7ae3c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -50,6 +50,11 @@
*/
#define NR_RAID1_BIOS 256
+/* When there are this many requests queued to be written by
+ * the raid1 thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
static void unplug_slaves(mddev_t *mddev);
@@ -576,7 +581,8 @@ static int raid1_congested(void *data, int bits)
conf_t *conf = mddev->private;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
+ if (mddev_congested(mddev, bits) &&
+ conf->pending_count >= max_queued_requests)
return 1;
rcu_read_lock();
@@ -613,10 +619,12 @@ static int flush_pending_writes(conf_t *conf)
struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
blk_remove_plug(conf->mddev->queue);
+ conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
/* flush any pending bitmap writes to
* disk before proceeding w/ I/O */
bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
@@ -789,6 +797,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
int cpu;
bool do_barriers;
mdk_rdev_t *blocked_rdev;
+ int cnt = 0;
/*
* Register the new request and wait if the reconstruction
@@ -864,6 +873,11 @@ static int make_request(struct request_queue *q, struct bio * bio)
/*
* WRITE:
*/
+ if (conf->pending_count >= max_queued_requests) {
+ md_wakeup_thread(mddev->thread);
+ wait_event(conf->wait_barrier,
+ conf->pending_count < max_queued_requests);
+ }
/* first select target devices under spinlock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
@@ -970,6 +984,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
atomic_inc(&r1_bio->remaining);
bio_list_add(&bl, mbio);
+ cnt++;
}
kfree(behind_pages); /* the behind pages are attached to the bios now */
@@ -978,6 +993,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_merge(&conf->pending_bio_list, &bl);
bio_list_init(&bl);
+ conf->pending_count += cnt;
blk_plug_device(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2021,7 +2037,7 @@ static int run(mddev_t *mddev)
bio_list_init(&conf->pending_bio_list);
bio_list_init(&conf->flushing_bio_list);
-
+ conf->pending_count = 0;
mddev->degraded = 0;
for (i = 0; i < conf->raid_disks; i++) {
@@ -2317,3 +2333,5 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-3"); /* RAID1 */
MODULE_ALIAS("md-raid1");
MODULE_ALIAS("md-level-1");
+
+module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e87b84d..520288c 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -38,6 +38,7 @@ struct r1_private_data_s {
/* queue of writes that have been unplugged */
struct bio_list flushing_bio_list;
+ int pending_count;
/* for use when syncing mirrors: */
spinlock_t resync_lock;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c2cb7b8..4c7d9b5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -59,6 +59,11 @@ static void unplug_slaves(mddev_t *mddev);
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
+/* When there are this many requests queued to be written by
+ * the raid10 thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
@@ -631,6 +636,10 @@ static int raid10_congested(void *data, int bits)
conf_t *conf = mddev->private;
int i, ret = 0;
+ if ((bits & (1 << BDI_async_congested)) &&
+ conf->pending_count >= max_queued_requests)
+ return 1;
+
if (mddev_congested(mddev, bits))
return 1;
rcu_read_lock();
@@ -660,10 +669,12 @@ static int flush_pending_writes(conf_t *conf)
struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
blk_remove_plug(conf->mddev->queue);
+ conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
/* flush any pending bitmap writes to disk
* before proceeding w/ I/O */
bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
@@ -802,6 +813,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
struct bio_list bl;
unsigned long flags;
mdk_rdev_t *blocked_rdev;
+ int cnt = 0;
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
@@ -894,6 +906,11 @@ static int make_request(struct request_queue *q, struct bio * bio)
/*
* WRITE:
*/
+ if (conf->pending_count >= max_queued_requests) {
+ md_wakeup_thread(mddev->thread);
+ wait_event(conf->wait_barrier,
+ conf->pending_count < max_queued_requests);
+ }
/* first select target devices under rcu_lock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
@@ -957,6 +974,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
atomic_inc(&r10_bio->remaining);
bio_list_add(&bl, mbio);
+ cnt++;
}
if (unlikely(!atomic_read(&r10_bio->remaining))) {
@@ -970,6 +988,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_merge(&conf->pending_bio_list, &bl);
blk_plug_device(mddev->queue);
+ conf->pending_count += cnt;
spin_unlock_irqrestore(&conf->device_lock, flags);
/* In case raid10d snuck in to freeze_array */
@@ -2318,3 +2337,5 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");
+
+module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 59cd1ef..e6e1613 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -39,7 +39,7 @@ struct r10_private_data_s {
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
-
+ int pending_count;
spinlock_t resync_lock;
int nr_pending;
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]
next prev parent reply other threads:[~2011-12-08 0:10 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2011-12-06 21:29 Raid10 and page cache Yucong Sun (叶雨飞)
2011-12-06 22:01 ` Yucong Sun (叶雨飞)
2011-12-06 22:26 ` NeilBrown
2011-12-06 23:13 ` Yucong Sun (叶雨飞)
2011-12-06 23:22 ` Marcus Sorensen
2011-12-07 1:01 ` NeilBrown
2011-12-07 4:04 ` Yucong Sun (叶雨飞)
2011-12-07 4:28 ` NeilBrown
2011-12-07 4:50 ` Yucong Sun (叶雨飞)
2011-12-07 5:10 ` NeilBrown
2011-12-07 6:14 ` Yucong Sun (叶雨飞)
2011-12-07 9:21 ` Yucong Sun (叶雨飞)
2011-12-07 23:37 ` Yucong Sun (叶雨飞)
2011-12-08 0:10 ` NeilBrown [this message]
2011-12-08 6:31 ` Yucong Sun (叶雨飞)
[not found] ` <CAJygYd16PWfKe8fK-b150N46CEwzBUqJn1N6dfsGR4yyTgGbTQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2011-12-06 22:01 ` Yucong Sun (叶雨飞)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20111208111009.6ca1ca19@notabene.brown \
--to=neilb@suse.de \
--cc=linux-raid@vger.kernel.org \
--cc=sunyucong@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.