From: Dan Williams <dan.j.williams@intel.com>
To: neilb@suse.de, akpm@linux-foundation.org
Cc: linux-raid@vger.kernel.org
Subject: [PATCH -mm 3/4] raid5: convert add_stripe_bio to add_queue_bio
Date: Sat, 06 Oct 2007 10:06:54 -0700
Message-ID: <20071006170654.23741.84018.stgit@dwillia2-linux.ch.intel.com>
In-Reply-To: <20071006170538.23741.75193.stgit@dwillia2-linux.ch.intel.com>

The stripe_queue object collects i/o requests before they are handled by
the stripe-cache (via the stripe_head object). add_stripe_bio currently
looks at the state of the stripe-cache to implement bitmap support;
reimplement this using stripe_queue attributes instead.
Introduce the STRIPE_QUEUE_FIRSTWRITE flag to track when a stripe is first
written. When a stripe_head is available, record the bitmap batch sequence
number and set STRIPE_BIT_DELAY. For now a stripe_head will always be
available at 'add_queue_bio' time; going forward, the 'sh' field of the
stripe_queue will indicate whether a stripe_head is attached.
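
In other words, the bitmap bookkeeping is split into two phases. A
condensed sketch, lifted from the hunks below (no code beyond what the
patch itself adds):

	/* phase 1: add_queue_bio() sees the first write to this
	 * stripe; only stripe_queue state is touched here
	 */
	if (*bip == NULL && sq->dev[dd_idx].written == NULL) {
		set_bit(STRIPE_QUEUE_FIRSTWRITE, &sq->state);
		firstwrite = 1;
	}

	...

	/* phase 2: get_active_stripe() consumes the flag once a
	 * stripe_head is attached and records the target batch
	 * (bm_flush+1)
	 */
	if (test_and_clear_bit(STRIPE_QUEUE_FIRSTWRITE, &sh->sq->state)) {
		sh->bm_seq = conf->seq_flush + 1;
		set_bit(STRIPE_BIT_DELAY, &sh->state);
	}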
Tested-by: Mr. James W. Laferriere <babydr@baby-dragons.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/md/raid5.c         |   53 ++++++++++++++++++++++++++++----------------
 include/linux/raid/raid5.h |    6 +++++
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7bc206c..d566fc9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -31,8 +31,10 @@
* conf->bm_flush is the number of the last batch that was closed to
* new additions.
* When we discover that we will need to write to any block in a stripe
- * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is bm_flush+1.
+ * (in add_queue_bio) we update the in-memory bitmap and record in the
+ * stripe_queue that a bitmap write was started. Then, in handle_stripe when
+ * we have a stripe_head available, we update sh->bm_seq to record the
+ * sequence number (target batch number) of this request. This is bm_flush+1.
* When we are ready to do a write, if that batch hasn't been written yet,
* we plug the array and queue the stripe for later.
* When an unplug happens, we increment bm_flush, thus closing the current
@@ -360,8 +362,14 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
}
} while (sh == NULL);
- if (sh)
+ if (sh) {
atomic_inc(&sh->count);
+ if (test_and_clear_bit(STRIPE_QUEUE_FIRSTWRITE,
+ &sh->sq->state)) {
+ sh->bm_seq = conf->seq_flush+1;
+ set_bit(STRIPE_BIT_DELAY, &sh->state);
+ }
+ }
spin_unlock_irq(&conf->device_lock);
return sh;
@@ -1991,26 +1999,34 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
* toread/towrite point to the first in a chain.
* The bi_next chain must be in order.
*/
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_queue_bio(struct stripe_queue *sq, struct bio *bi, int dd_idx,
+ int forwrite)
{
struct bio **bip;
- struct stripe_queue *sq = sh->sq;
raid5_conf_t *conf = sq->raid_conf;
int firstwrite=0;
- pr_debug("adding bh b#%llu to stripe s#%llu\n",
+ pr_debug("adding bio (%llu) to queue (%llu)\n",
(unsigned long long)bi->bi_sector,
- (unsigned long long)sh->sector);
-
+ (unsigned long long)sq->sector);
spin_lock(&sq->lock);
spin_lock_irq(&conf->device_lock);
if (forwrite) {
bip = &sq->dev[dd_idx].towrite;
- if (*bip == NULL && sq->dev[dd_idx].written == NULL)
+ set_bit(dd_idx, sq->to_write);
+ if (*bip == NULL && sq->dev[dd_idx].written == NULL) {
+ /* flag the queue to be assigned a bitmap
+ * sequence number
+ */
+ set_bit(STRIPE_QUEUE_FIRSTWRITE, &sq->state);
firstwrite = 1;
- } else
+ }
+ } else {
bip = &sq->dev[dd_idx].toread;
+ set_bit(dd_idx, sq->to_read);
+ }
+
while (*bip && (*bip)->bi_sector < bi->bi_sector) {
if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
goto overlap;
@@ -2024,19 +2040,17 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
bi->bi_next = *bip;
*bip = bi;
bi->bi_phys_segments ++;
+
spin_unlock_irq(&conf->device_lock);
spin_unlock(&sq->lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
(unsigned long long)bi->bi_sector,
- (unsigned long long)sh->sector, dd_idx);
+ (unsigned long long)sq->sector, dd_idx);
- if (conf->mddev->bitmap && firstwrite) {
- bitmap_startwrite(conf->mddev->bitmap, sh->sector,
+ if (conf->mddev->bitmap && firstwrite)
+ bitmap_startwrite(conf->mddev->bitmap, sq->sector,
STRIPE_SECTORS, 0);
- sh->bm_seq = conf->seq_flush+1;
- set_bit(STRIPE_BIT_DELAY, &sh->state);
- }
if (forwrite) {
/* check if page is covered */
@@ -2049,7 +2063,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
sector = bi->bi_sector + (bi->bi_size>>9);
}
if (sector >= sq->dev[dd_idx].sector + STRIPE_SECTORS)
- set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+ set_bit(dd_idx, sq->overwrite);
}
return 1;
@@ -3827,7 +3841,8 @@ static int make_request(struct request_queue *q, struct bio * bi)
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+ !add_queue_bio(sh->sq, bi, dd_idx,
+ bi->bi_rw & RW_MASK)) {
/* Stripe is busy expanding or
* add failed due to overlap. Flush everything
* and wait a while
@@ -4128,7 +4143,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
}
set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
- if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+ if (!add_queue_bio(sh->sq, raid_bio, dd_idx, 0)) {
release_stripe(sh);
raid_bio->bi_hw_segments = scnt;
conf->retry_read_aligned = raid_bio;
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index fbe622c..3d4938c 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -218,6 +218,7 @@ struct stripe_queue {
unsigned long *overlap; /* There is a pending overlapping request */
spinlock_t lock; /* protect bio lists and stripe_head state */
struct raid5_private_data *raid_conf;
+ unsigned long state;
struct list_head list_node;
int pd_idx; /* parity disk index */
int disks; /* disks in stripe */
@@ -288,6 +289,11 @@ struct stripe_queue {
#define STRIPE_OP_MOD_DMA_CHECK 8
/*
+ * Stripe-queue state
+ */
+#define STRIPE_QUEUE_FIRSTWRITE 0
+
+/*
* Plugging:
*
* To improve write throughput, we need to delay the handling of some
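
For context on the raid5.c header comment above: the consumer of
STRIPE_BIT_DELAY is the existing release path, which this patch does not
touch. Roughly, from memory of the current __release_stripe() (a sketch
for illustration, not part of the diff):

	} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
		   sh->bm_seq - conf->seq_write > 0) {
		/* the stripe's bitmap batch has not hit disk yet:
		 * park it on bitmap_list and plug the device until
		 * the batch is written out
		 */
		list_add_tail(&sh->lru, &conf->bitmap_list);
		blk_plug_device(conf->mddev->queue);
	}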