From: NeilBrown <neilb@suse.de>
To: "Yucong Sun (叶雨飞)" <sunyucong@gmail.com>
Cc: linux-raid@vger.kernel.org
Subject: Re: Raid10 and page cache
Date: Thu, 8 Dec 2011 11:10:09 +1100 [thread overview]
Message-ID: <20111208111009.6ca1ca19@notabene.brown> (raw)
In-Reply-To: <CAJygYd1+cVGeRwZXq_WLZr89Q9GX_yPEDvFBuagSG4D9neyoqQ@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 7881 bytes --]
On Wed, 7 Dec 2011 15:37:30 -0800 Yucong Sun (叶雨飞) <sunyucong@gmail.com>
wrote:
> Neil, I can't compile latest MD against 2.6.32, and that commit can't
> be patched into 2.6.32 directly either, can you help me on this?
>
This should do it.
NeilBrown
commit ef54b7cf955dc3b7d33248e8591b1a00b4fa998c
Author: NeilBrown <neilb@suse.de>
Date: Tue Oct 11 16:50:01 2011 +1100
md: add proper write-congestion reporting to RAID1 and RAID10.
RAID1 and RAID10 handle write requests by queuing them for handling by
a separate thread. This is because when a write-intent-bitmap is
active we might need to update the bitmap first, so it is good to
queue a lot of writes, then do one big bitmap update for them all.
However writeback requires devices to appear to be congested after a
while so it can make some guesstimate of throughput. The infinite
queue defeats that (note that RAID5 already has a finite queue so
it doesn't suffer from this problem).
So impose a limit on the number of pending write requests. By default
it is 1024 which seems to be generally suitable. Make it configurable
via module option just in case someone finds a regression.
Signed-off-by: NeilBrown <neilb@suse.de>
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e07ce2e..fe7ae3c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -50,6 +50,11 @@
*/
#define NR_RAID1_BIOS 256
+/* When there are this many requests queued to be written by
+ * the raid1 thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
static void unplug_slaves(mddev_t *mddev);
@@ -576,7 +581,8 @@ static int raid1_congested(void *data, int bits)
conf_t *conf = mddev->private;
int i, ret = 0;
- if (mddev_congested(mddev, bits))
+ if (mddev_congested(mddev, bits) || ((bits & (1 << BDI_async_congested)) &&
+ conf->pending_count >= max_queued_requests))
return 1;
rcu_read_lock();
@@ -613,10 +619,12 @@ static int flush_pending_writes(conf_t *conf)
struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
blk_remove_plug(conf->mddev->queue);
+ conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
/* flush any pending bitmap writes to
* disk before proceeding w/ I/O */
bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
@@ -789,6 +797,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
int cpu;
bool do_barriers;
mdk_rdev_t *blocked_rdev;
+ int cnt = 0;
/*
* Register the new request and wait if the reconstruction
@@ -864,6 +873,11 @@ static int make_request(struct request_queue *q, struct bio * bio)
/*
* WRITE:
*/
+ if (conf->pending_count >= max_queued_requests) {
+ md_wakeup_thread(mddev->thread);
+ wait_event(conf->wait_barrier,
+ conf->pending_count < max_queued_requests);
+ }
/* first select target devices under spinlock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
@@ -970,6 +984,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
atomic_inc(&r1_bio->remaining);
bio_list_add(&bl, mbio);
+ cnt++;
}
kfree(behind_pages); /* the behind pages are attached to the bios now */
@@ -978,6 +993,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_merge(&conf->pending_bio_list, &bl);
bio_list_init(&bl);
+ conf->pending_count += cnt;
blk_plug_device(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
@@ -2021,7 +2037,7 @@ static int run(mddev_t *mddev)
bio_list_init(&conf->pending_bio_list);
bio_list_init(&conf->flushing_bio_list);
-
+ conf->pending_count = 0;
mddev->degraded = 0;
for (i = 0; i < conf->raid_disks; i++) {
@@ -2317,3 +2333,5 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-3"); /* RAID1 */
MODULE_ALIAS("md-raid1");
MODULE_ALIAS("md-level-1");
+
+module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e87b84d..520288c 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -38,6 +38,7 @@ struct r1_private_data_s {
/* queue of writes that have been unplugged */
struct bio_list flushing_bio_list;
+ int pending_count;
/* for use when syncing mirrors: */
spinlock_t resync_lock;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c2cb7b8..4c7d9b5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -59,6 +59,11 @@ static void unplug_slaves(mddev_t *mddev);
static void allow_barrier(conf_t *conf);
static void lower_barrier(conf_t *conf);
+/* When there are this many requests queued to be written by
+ * the raid10 thread, we become 'congested' to provide back-pressure
+ * for writeback.
+ */
+static int max_queued_requests = 1024;
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
@@ -631,6 +636,10 @@ static int raid10_congested(void *data, int bits)
conf_t *conf = mddev->private;
int i, ret = 0;
+ if ((bits & (1 << BDI_async_congested)) &&
+ conf->pending_count >= max_queued_requests)
+ return 1;
+
if (mddev_congested(mddev, bits))
return 1;
rcu_read_lock();
@@ -660,10 +669,12 @@ static int flush_pending_writes(conf_t *conf)
struct bio *bio;
bio = bio_list_get(&conf->pending_bio_list);
blk_remove_plug(conf->mddev->queue);
+ conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
/* flush any pending bitmap writes to disk
* before proceeding w/ I/O */
bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
@@ -802,6 +813,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
struct bio_list bl;
unsigned long flags;
mdk_rdev_t *blocked_rdev;
+ int cnt = 0;
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
bio_endio(bio, -EOPNOTSUPP);
@@ -894,6 +906,11 @@ static int make_request(struct request_queue *q, struct bio * bio)
/*
* WRITE:
*/
+ if (conf->pending_count >= max_queued_requests) {
+ md_wakeup_thread(mddev->thread);
+ wait_event(conf->wait_barrier,
+ conf->pending_count < max_queued_requests);
+ }
/* first select target devices under rcu_lock and
* inc refcount on their rdev. Record them by setting
* bios[x] to bio
@@ -957,6 +974,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
atomic_inc(&r10_bio->remaining);
bio_list_add(&bl, mbio);
+ cnt++;
}
if (unlikely(!atomic_read(&r10_bio->remaining))) {
@@ -970,6 +988,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_merge(&conf->pending_bio_list, &bl);
blk_plug_device(mddev->queue);
+ conf->pending_count += cnt;
spin_unlock_irqrestore(&conf->device_lock, flags);
/* In case raid10d snuck in to freeze_array */
@@ -2318,3 +2337,5 @@ MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");
+
+module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 59cd1ef..e6e1613 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -39,7 +39,7 @@ struct r10_private_data_s {
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
-
+ int pending_count;
spinlock_t resync_lock;
int nr_pending;
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]
next prev parent reply other threads:[~2011-12-08 0:10 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CAJygYd16PWfKe8fK-b150N46CEwzBUqJn1N6dfsGR4yyTgGbTQ@mail.gmail.com>
2011-12-06 22:01 ` Raid10 and page cache Yucong Sun (叶雨飞)
2011-12-06 22:26 ` NeilBrown
2011-12-06 23:13 ` Yucong Sun (叶雨飞)
2011-12-06 23:22 ` Marcus Sorensen
2011-12-07 1:01 ` NeilBrown
2011-12-07 4:04 ` Yucong Sun (叶雨飞)
2011-12-07 4:28 ` NeilBrown
2011-12-07 4:50 ` Yucong Sun (叶雨飞)
2011-12-07 5:10 ` NeilBrown
2011-12-07 6:14 ` Yucong Sun (叶雨飞)
2011-12-07 9:21 ` Yucong Sun (叶雨飞)
2011-12-07 23:37 ` Yucong Sun (叶雨飞)
2011-12-08 0:10 ` NeilBrown [this message]
2011-12-08 6:31 ` Yucong Sun (叶雨飞)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20111208111009.6ca1ca19@notabene.brown \
--to=neilb@suse.de \
--cc=linux-raid@vger.kernel.org \
--cc=sunyucong@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox