* [md PATCH 05/10] md/raid1: factor out flush_bio_list()
From: NeilBrown @ 2017-04-05 4:05 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>
flush_pending_writes() and raid1_unplug() each contain identical
copies of a fairly large slab of code. So factor that out into
new flush_bio_list() to simplify maintenance.
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/raid1.c | 67 +++++++++++++++++++++-------------------------------
1 file changed, 27 insertions(+), 40 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d15268fff052..a70283753a35 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -787,6 +787,31 @@ static int raid1_congested(struct mddev *mddev, int bits)
return ret;
}
+static void flush_bio_list(struct r1conf *conf, struct bio *bio)
+{
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+ wake_up(&conf->wait_barrier);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ struct md_rdev *rdev = (void*)bio->bi_bdev;
+ bio->bi_next = NULL;
+ bio->bi_bdev = rdev->bdev;
+ if (test_bit(Faulty, &rdev->flags)) {
+ bio->bi_error = -EIO;
+ bio_endio(bio);
+ } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+ /* Just ignore it */
+ bio_endio(bio);
+ else
+ generic_make_request(bio);
+ bio = next;
+ }
+}
+
static void flush_pending_writes(struct r1conf *conf)
{
/* Any writes that have been queued but are awaiting
@@ -799,27 +824,7 @@ static void flush_pending_writes(struct r1conf *conf)
bio = bio_list_get(&conf->pending_bio_list);
conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock);
- /* flush any pending bitmap writes to
- * disk before proceeding w/ I/O */
- bitmap_unplug(conf->mddev->bitmap);
- wake_up(&conf->wait_barrier);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio->bi_bdev = rdev->bdev;
- if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
- bio_endio(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
- /* Just ignore it */
- bio_endio(bio);
- else
- generic_make_request(bio);
- bio = next;
- }
+ flush_bio_list(conf, bio);
} else
spin_unlock_irq(&conf->device_lock);
}
@@ -1152,25 +1157,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
- bitmap_unplug(mddev->bitmap);
- wake_up(&conf->wait_barrier);
-
- while (bio) { /* submit pending writes */
- struct bio *next = bio->bi_next;
- struct md_rdev *rdev = (void*)bio->bi_bdev;
- bio->bi_next = NULL;
- bio->bi_bdev = rdev->bdev;
- if (test_bit(Faulty, &rdev->flags)) {
- bio->bi_error = -EIO;
- bio_endio(bio);
- } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
- /* Just ignore it */
- bio_endio(bio);
- else
- generic_make_request(bio);
- bio = next;
- }
+ flush_bio_list(conf, bio);
kfree(plug);
}
^ permalink raw reply related
* [md PATCH 06/10] md/raid10: simplify the splitting of requests.
From: NeilBrown @ 2017-04-05 4:05 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>
raid10 splits requests in two different ways for two different
reasons.
First, bio_split() is used to ensure the bio fits with a chunk.
Second, multiple r10bio structures are allocated to represent the
different sections that need to go to different devices, to avoid
known bad blocks.
This can be simplified to just use bio_split() once, and not to use
multiple r10bios.
We delay the split until we know a maximum bio size that can
be handled with a single r10bio, and then split the bio and queue
the remainder for later handling.
As with raid1, we allocate a new bio_set to help with the splitting.
It is not correct to use fs_bio_set in a device driver.
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/raid10.c | 164 ++++++++++++++++-----------------------------------
drivers/md/raid10.h | 1
2 files changed, 51 insertions(+), 114 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0f13d57ef646..ff02c058dbdd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1127,7 +1127,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct bio *read_bio;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
- int sectors_handled;
int max_sectors;
sector_t sectors;
struct md_rdev *rdev;
@@ -1140,7 +1139,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = bio_sectors(bio);
+ sectors = r10_bio->sectors;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1157,17 +1156,23 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
wait_barrier(conf);
}
-read_again:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
raid_end_bio_io(r10_bio);
return;
}
+ if (max_sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, max_sectors,
+ GFP_NOIO, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r10_bio->master_bio = bio;
+ r10_bio->sectors = max_sectors;
+ }
slot = r10_bio->read_slot;
read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
- max_sectors);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1186,40 +1191,13 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
- if (max_sectors < r10_bio->sectors) {
- /*
- * Could not read all from this device, so we will need another
- * r10_bio.
- */
- sectors_handled = (r10_bio->sector + max_sectors
- - bio->bi_iter.bi_sector);
- r10_bio->sectors = max_sectors;
- inc_pending(conf);
- bio_inc_remaining(bio);
- /*
- * Cannot call generic_make_request directly as that will be
- * queued in __generic_make_request and subsequent
- * mempool_alloc might block waiting for it. so hand bio over
- * to raid10d.
- */
- reschedule_retry(r10_bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
- r10_bio->state = 0;
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- goto read_again;
- } else
- generic_make_request(read_bio);
+ generic_make_request(read_bio);
return;
}
static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
struct bio *bio, bool replacement,
- int n_copy, int max_sectors)
+ int n_copy)
{
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
@@ -1243,7 +1221,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
rdev = conf->mirrors[devnum].rdev;
mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
- bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
if (replacement)
r10_bio->devs[n_copy].repl_bio = mbio;
else
@@ -1294,7 +1271,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int i;
struct md_rdev *blocked_rdev;
sector_t sectors;
- int sectors_handled;
int max_sectors;
md_write_start(mddev, bio);
@@ -1306,7 +1282,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
*/
wait_barrier(conf);
- sectors = bio_sectors(bio);
+ sectors = r10_bio->sectors;
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1476,44 +1452,29 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < r10_bio->sectors)
r10_bio->sectors = max_sectors;
- sectors_handled = r10_bio->sector + max_sectors -
- bio->bi_iter.bi_sector;
+
+ if (r10_bio->sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, r10_bio->sectors,
+ GFP_NOIO, conf->bio_split);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ r10_bio->master_bio = bio;
+ }
atomic_set(&r10_bio->remaining, 1);
bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
- raid10_write_one_disk(mddev, r10_bio, bio, false,
- i, max_sectors);
+ raid10_write_one_disk(mddev, r10_bio, bio, false, i);
if (r10_bio->devs[i].repl_bio)
- raid10_write_one_disk(mddev, r10_bio, bio, true,
- i, max_sectors);
- }
-
- /* Don't remove the bias on 'remaining' (one_write_done) until
- * after checking if we need to go around again.
- */
-
- if (sectors_handled < bio_sectors(bio)) {
- /* We need another r10_bio and it needs to be counted */
- inc_pending(conf);
- bio_inc_remaining(bio);
- one_write_done(r10_bio);
- r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
- r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-
- r10_bio->mddev = mddev;
- r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
- r10_bio->state = 0;
- goto retry_write;
+ raid10_write_one_disk(mddev, r10_bio, bio, true, i);
}
one_write_done(r10_bio);
}
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
@@ -1521,7 +1482,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
r10_bio->master_bio = bio;
- r10_bio->sectors = bio_sectors(bio);
+ r10_bio->sectors = sectors;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
@@ -1538,54 +1499,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
struct r10conf *conf = mddev->private;
sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
int chunk_sects = chunk_mask + 1;
-
- struct bio *split;
+ int sectors = bio_sectors(bio);
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- do {
-
- /*
- * If this request crosses a chunk boundary, we need to split
- * it.
- */
- if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
- bio_sectors(bio) > chunk_sects
- && (conf->geo.near_copies < conf->geo.raid_disks
- || conf->prev.near_copies <
- conf->prev.raid_disks))) {
- split = bio_split(bio, chunk_sects -
- (bio->bi_iter.bi_sector &
- (chunk_sects - 1)),
- GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
-
- /*
- * If a bio is splitted, the first part of bio will pass
- * barrier but the bio is queued in current->bio_list (see
- * generic_make_request). If there is a raise_barrier() called
- * here, the second part of bio can't pass barrier. But since
- * the first part bio isn't dispatched to underlaying disks
- * yet, the barrier is never released, hence raise_barrier will
- * alays wait. We have a deadlock.
- * Note, this only happens in read path. For write path, the
- * first part of bio is dispatched in a schedule() call
- * (because of blk plug) or offloaded to raid10d.
- * Quitting from the function immediately can change the bio
- * order queued in bio_list and avoid the deadlock.
- */
- __make_request(mddev, split);
- if (split != bio && bio_data_dir(bio) == READ) {
- generic_make_request(bio);
- break;
- }
- } while (split != bio);
+ /*
+ * If this request crosses a chunk boundary, we need to split
+ * it.
+ */
+ if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+ sectors > chunk_sects
+ && (conf->geo.near_copies < conf->geo.raid_disks
+ || conf->prev.near_copies <
+ conf->prev.raid_disks)))
+ sectors = chunk_sects -
+ (bio->bi_iter.bi_sector &
+ (chunk_sects - 1));
+ __make_request(mddev, bio, sectors);
/* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
@@ -2873,13 +2806,8 @@ static void raid10d(struct md_thread *thread)
recovery_request_write(mddev, r10_bio);
else if (test_bit(R10BIO_ReadError, &r10_bio->state))
handle_read_error(mddev, r10_bio);
- else {
- /* just a partial read to be scheduled from a
- * separate context
- */
- int slot = r10_bio->read_slot;
- generic_make_request(r10_bio->devs[slot].bio);
- }
+ else
+ WARN_ON_ONCE(1);
cond_resched();
if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3652,6 +3580,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
if (!conf->r10bio_pool)
goto out;
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!conf->bio_split)
+ goto out;
+
calc_sectors(conf, mddev->dev_sectors);
if (mddev->reshape_position == MaxSector) {
conf->prev = conf->geo;
@@ -3689,6 +3621,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
mempool_destroy(conf->r10bio_pool);
kfree(conf->mirrors);
safe_put_page(conf->tmppage);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
return ERR_PTR(err);
@@ -3898,6 +3832,8 @@ static void raid10_free(struct mddev *mddev, void *priv)
kfree(conf->mirrors);
kfree(conf->mirrors_old);
kfree(conf->mirrors_new);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf);
}
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3162615e57bd..735ce1a3d260 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -82,6 +82,7 @@ struct r10conf {
mempool_t *r10bio_pool;
mempool_t *r10buf_pool;
struct page *tmppage;
+ struct bio_set *bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
^ permalink raw reply related
* [md PATCH 07/10] md/raid10: simplify handle_read_error()
From: NeilBrown @ 2017-04-05 4:05 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>
handle_read_error() duplicates a lot of the work that raid10_read_request()
does, so it makes sense to just use that function.
handle_read_error() relies on the same r10bio being re-used so that,
in the case of a read-only array, setting IO_BLOCKED in r1bio->devs[].bio
ensures read_balance() won't re-use that device.
So when called from raid10_make_request() we clear that array, but not
when called from handle_read_error().
Two parts of handle_read_error() that need to be preserved are the warning
message it prints, so they are conditionally added to
raid10_read_request(). If the failing rdev can be found, messages
are printed. Otherwise they aren't.
Not that as rdev_dec_pending() has already been called on the failing
rdev, we need to use rcu_read_lock() to get a new reference from
the conf. We only use this to get the name of the failing block device.
With this change, we no longer need inc_pending().
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/raid10.c | 120 +++++++++++++++++++--------------------------------
1 file changed, 45 insertions(+), 75 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ff02c058dbdd..ca722570bd38 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1008,15 +1008,6 @@ static void wait_barrier(struct r10conf *conf)
spin_unlock_irq(&conf->resync_lock);
}
-static void inc_pending(struct r10conf *conf)
-{
- /* The current request requires multiple r10_bio, so
- * we need to increment the pending count.
- */
- WARN_ON(!atomic_read(&conf->nr_pending));
- atomic_inc(&conf->nr_pending);
-}
-
static void allow_barrier(struct r10conf *conf)
{
if ((atomic_dec_and_test(&conf->nr_pending)) ||
@@ -1130,8 +1121,36 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
int max_sectors;
sector_t sectors;
struct md_rdev *rdev;
- int slot;
+ char b[BDEVNAME_SIZE];
+ int slot = r10_bio->read_slot;
+ struct md_rdev *err_rdev = NULL;
+ gfp_t gfp = GFP_NOIO;
+
+ if (r10_bio->devs[slot].rdev) {
+ /* This is an error retry, but we cannot
+ * safely dereference the rdev in the r10_bio,
+ * we must use the one in conf.
+ * If it has already been disconnected (unlikely)
+ * we lose the device name in error messages.
+ */
+ int disk;
+ /* As we are blocking raid10, it is a little safer to
+ * use __GFP_HIGH.
+ */
+ gfp = GFP_NOIO | __GFP_HIGH;
+ rcu_read_lock();
+ disk = r10_bio->devs[slot].devnum;
+ err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (err_rdev)
+ bdevname(err_rdev->bdev, b);
+ else {
+ strcpy(b, "???");
+ /* This never gets dereferenced */
+ err_rdev = r10_bio->devs[slot].rdev;
+ }
+ rcu_read_unlock();
+ }
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -1158,12 +1177,22 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
+ if (err_rdev) {
+ pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
+ mdname(mddev), b,
+ (unsigned long long)r10_bio->sector);
+ }
raid_end_bio_io(r10_bio);
return;
}
+ if (err_rdev)
+ pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
+ mdname(mddev),
+ bdevname(rdev->bdev, b),
+ (unsigned long long)r10_bio->sector);
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
- GFP_NOIO, conf->bio_split);
+ gfp, conf->bio_split);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
@@ -1172,7 +1201,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
- read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+ read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;
@@ -1487,6 +1516,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
+ memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
@@ -2556,9 +2586,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *bio;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev = r10_bio->devs[slot].rdev;
- char b[BDEVNAME_SIZE];
- unsigned long do_sync;
- int max_sectors;
dev_t bio_dev;
sector_t bio_last_sector;
@@ -2571,7 +2598,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* frozen.
*/
bio = r10_bio->devs[slot].bio;
- bdevname(bio->bi_bdev, b);
bio_dev = bio->bi_bdev->bd_dev;
bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
bio_put(bio);
@@ -2587,65 +2613,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
md_error(mddev, rdev);
rdev_dec_pending(rdev, mddev);
-
-read_more:
- rdev = read_balance(conf, r10_bio, &max_sectors);
- if (rdev == NULL) {
- pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
- mdname(mddev), b,
- (unsigned long long)r10_bio->sector);
- raid_end_bio_io(r10_bio);
- return;
- }
-
- do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
- slot = r10_bio->read_slot;
- pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- (unsigned long long)r10_bio->sector);
- bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
- bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
- r10_bio->devs[slot].bio = bio;
- r10_bio->devs[slot].rdev = rdev;
- bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
- + choose_data_offset(r10_bio, rdev);
- bio->bi_bdev = rdev->bdev;
- bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
- if (test_bit(FailFast, &rdev->flags) &&
- test_bit(R10BIO_FailFast, &r10_bio->state))
- bio->bi_opf |= MD_FAILFAST;
- bio->bi_private = r10_bio;
- bio->bi_end_io = raid10_end_read_request;
- trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
- bio, bio_dev,
- bio_last_sector - r10_bio->sectors);
-
- if (max_sectors < r10_bio->sectors) {
- /* Drat - have to split this up more */
- struct bio *mbio = r10_bio->master_bio;
- int sectors_handled =
- r10_bio->sector + max_sectors
- - mbio->bi_iter.bi_sector;
- r10_bio->sectors = max_sectors;
- bio_inc_remaining(mbio);
- inc_pending(conf);
- generic_make_request(bio);
-
- r10_bio = mempool_alloc(conf->r10bio_pool,
- GFP_NOIO);
- r10_bio->master_bio = mbio;
- r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
- r10_bio->state = 0;
- set_bit(R10BIO_ReadError,
- &r10_bio->state);
- r10_bio->mddev = mddev;
- r10_bio->sector = mbio->bi_iter.bi_sector
- + sectors_handled;
-
- goto read_more;
- } else
- generic_make_request(bio);
+ allow_barrier(conf);
+ r10_bio->state = 0;
+ raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
}
static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
^ permalink raw reply related
* [md PATCH 08/10] md/raid5: make chunk_aligned_read() split bios more cleanly.
From: NeilBrown @ 2017-04-05 4:05 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>
chunk_aligned_read() currently uses fs_bio_set - which is meant for
filesystems to use - and loops if multiple splits are needed, which is
not best practice.
As this is only used for READ requests, not writes, it is unlikely
to cause a problem. However it is best to be consistent in how
we split bios, and to follow the pattern used in raid1/raid10.
So create a private bioset, bio_split, and use it to perform a single
split, submitting the remainder to generic_make_request() for later
processing.
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/raid5.c | 33 +++++++++++++++++----------------
drivers/md/raid5.h | 1 +
2 files changed, 18 insertions(+), 16 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6036d5e41ddd..c523fd69a7bc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5234,24 +5234,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
{
struct bio *split;
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
- do {
- sector_t sector = raid_bio->bi_iter.bi_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
- if (sectors < bio_sectors(raid_bio)) {
- split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, raid_bio);
- } else
- split = raid_bio;
+ if (sectors < bio_sectors(raid_bio)) {
+ struct r5conf *conf = mddev->private;
+ split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+ bio_chain(split, raid_bio);
+ generic_make_request(raid_bio);
+ raid_bio = split;
+ }
- if (!raid5_read_one_chunk(mddev, split)) {
- if (split != raid_bio)
- generic_make_request(raid_bio);
- return split;
- }
- } while (split != raid_bio);
+ if (!raid5_read_one_chunk(mddev, raid_bio))
+ return split;
return NULL;
}
@@ -6735,6 +6731,8 @@ static void free_conf(struct r5conf *conf)
if (conf->disks[i].extra_page)
put_page(conf->disks[i].extra_page);
kfree(conf->disks);
+ if (conf->bio_split)
+ bioset_free(conf->bio_split);
kfree(conf->stripe_hashtbl);
kfree(conf->pending_data);
kfree(conf);
@@ -6910,6 +6908,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
goto abort;
}
+ conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+ if (!conf->bio_split)
+ goto abort;
conf->mddev = mddev;
if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cdc7f92e1806..625c7f16fd6b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -646,6 +646,7 @@ struct r5conf {
int pool_size; /* number of disks in stripeheads in pool */
spinlock_t device_lock;
struct disk_info *disks;
+ struct bio_set *bio_split;
/* When taking over an array from a different personality, we store
* the new thread here until we fully activate the array.
^ permalink raw reply related
* [md PATCH 09/10] md/linear: improve bio splitting.
From: NeilBrown @ 2017-04-05 4:05 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>
linear_make_request() uses fs_bio_set, which is meant for filesystems
to use, and loops, possible allocating from the same bio set multiple
times.
These behaviors can theoretically cause deadlocks, though as
linear requests are hardly ever split, it is unlikely in practice.
Change to use mddev->bio_set - otherwise unused for linear, and submit
the tail of a split request to generic_make_request() for it to
handle.
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/linear.c | 75 ++++++++++++++++++++++++---------------------------
1 file changed, 36 insertions(+), 39 deletions(-)
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3e38e0207a3e..d67105a15a39 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -249,53 +249,50 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
{
char b[BDEVNAME_SIZE];
struct dev_info *tmp_dev;
- struct bio *split;
sector_t start_sector, end_sector, data_offset;
+ sector_t bio_sector = bio->bi_iter.bi_sector;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- do {
- sector_t bio_sector = bio->bi_iter.bi_sector;
- tmp_dev = which_dev(mddev, bio_sector);
- start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
- end_sector = tmp_dev->end_sector;
- data_offset = tmp_dev->rdev->data_offset;
- bio->bi_bdev = tmp_dev->rdev->bdev;
-
- if (unlikely(bio_sector >= end_sector ||
- bio_sector < start_sector))
- goto out_of_bounds;
-
- if (unlikely(bio_end_sector(bio) > end_sector)) {
- /* This bio crosses a device boundary, so we have to
- * split it.
- */
- split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
+ tmp_dev = which_dev(mddev, bio_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+
+ if (unlikely(bio_sector >= end_sector ||
+ bio_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to
+ * split it.
+ */
+ struct bio *split = bio_split(bio, end_sector - bio_sector,
+ GFP_NOIO, mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ }
- split->bi_iter.bi_sector = split->bi_iter.bi_sector -
- start_sector + data_offset;
-
- if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(split);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
- split, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_writesame(mddev, split);
- generic_make_request(split);
- }
- } while (split != bio);
+ bio->bi_bdev = tmp_dev->rdev->bdev;
+ bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, bio);
+ generic_make_request(bio);
+ }
return;
out_of_bounds:
^ permalink raw reply related
* [md PATCH 10/10] md/raid0: fix up bio splitting.
From: NeilBrown @ 2017-04-05 4:05 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>
raid0_make_request() should use a private bio_set rather than the
shared fs_bio_set, which is only meant for filesystems to use.
raid0_make_request() shouldn't loop around using the bio_set
multiple times as that can deadlock.
So use mddev->bio_set and pass the tail to generic_make_request()
instead of looping on it.
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/raid0.c | 73 ++++++++++++++++++++++++++--------------------------
1 file changed, 37 insertions(+), 36 deletions(-)
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 56f70c3ad37c..e777e48f55f6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -462,52 +462,53 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct strip_zone *zone;
struct md_rdev *tmp_dev;
- struct bio *split;
+ sector_t bio_sector;
+ sector_t sector;
+ unsigned chunk_sects;
+ unsigned sectors;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- do {
- sector_t bio_sector = bio->bi_iter.bi_sector;
- sector_t sector = bio_sector;
- unsigned chunk_sects = mddev->chunk_sectors;
+ bio_sector = bio->bi_iter.bi_sector;
+ sector = bio_sector;
+ chunk_sects = mddev->chunk_sectors;
- unsigned sectors = chunk_sects -
- (likely(is_power_of_2(chunk_sects))
- ? (sector & (chunk_sects-1))
- : sector_div(sector, chunk_sects));
+ sectors = chunk_sects -
+ (likely(is_power_of_2(chunk_sects))
+ ? (sector & (chunk_sects-1))
+ : sector_div(sector, chunk_sects));
- /* Restore due to sector_div */
- sector = bio_sector;
+ /* Restore due to sector_div */
+ sector = bio_sector;
- if (sectors < bio_sectors(bio)) {
- split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
+ if (sectors < bio_sectors(bio)) {
+ struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ }
- zone = find_zone(mddev->private, §or);
- tmp_dev = map_sector(mddev, zone, sector, §or);
- split->bi_bdev = tmp_dev->bdev;
- split->bi_iter.bi_sector = sector + zone->dev_start +
- tmp_dev->data_offset;
-
- if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(split);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
- split, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_writesame(mddev, split);
- generic_make_request(split);
- }
- } while (split != bio);
+ zone = find_zone(mddev->private, §or);
+ tmp_dev = map_sector(mddev, zone, sector, §or);
+ bio->bi_bdev = tmp_dev->bdev;
+ bio->bi_iter.bi_sector = sector + zone->dev_start +
+ tmp_dev->data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, bio);
+ generic_make_request(bio);
+ }
}
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
^ permalink raw reply related
* mdadm - dmraid
From: gmx @ 2017-04-05 6:56 UTC (permalink / raw)
To: linux-raid
there exist a problem between packages dmraid and mdadm
incompatibility between both is known. but both packages can be
installed simultan without message
(result: problems - perhaps there are no problems in both packages but
together - even no kernel-problem)
greetings hj
^ permalink raw reply
* Re: [RFC PATCH] raid1: reset 'bi_next' before reuse the bio
From: Michael Wang @ 2017-04-05 7:40 UTC (permalink / raw)
To: NeilBrown, linux-raid, linux-kernel@vger.kernel.org
Cc: Shaohua Li, Jinpu Wang
In-Reply-To: <87shlnizqn.fsf@notabene.neil.brown.name>
On 04/05/2017 12:17 AM, NeilBrown wrote:
[snip]
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 7d67235..0554110 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1986,11 +1986,13 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
>> /* Don't try recovering from here - just fail it
>> * ... unless it is the last working device of course */
>> md_error(mddev, rdev);
>> - if (test_bit(Faulty, &rdev->flags))
>> + if (test_bit(Faulty, &rdev->flags)) {
>> /* Don't try to read from here, but make sure
>> * put_buf does it's thing
>> */
>> bio->bi_end_io = end_sync_write;
>> + bio->bi_next = NULL;
>> + }
>> }
>>
>> while(sectors) {
>
>
> Ah - I see what is happening now. I was looking at the vanilla 4.4
> code, which doesn't have the failfast changes.
My bad to forgot mention... yes our md stuff is very much close to the
upstream.
>
> I don't think your patch is correct though. We really shouldn't be
> re-using that bio, and setting bi_next to NULL just hides the bug. It
> doesn't fix it.
> As the rdev is now Faulty, it doesn't make sense for
> sync_request_write() to submit a write request to it.
Make sense, while still have concerns regarding the design:
* in this case since the read_disk already abandoned, is it fine to
keep r1_bio->read_disk recording the faulty device index?
* we assign the 'end_sync_write' to the original read bio in this
case, but when is this supposed to be called?
>
> Can you confirm that this works please.
Yes, it works.
Tested-by: Michael Wang <yun.wang@profitbricks.com>
Regards,
Michael Wang
>
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index d2d8b8a5bd56..219f1e1f1d1d 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -2180,6 +2180,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
> (i == r1_bio->read_disk ||
> !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
> continue;
> + if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
> + continue;
>
> bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
> if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
>
>
> Thanks,
> NeilBrown
>
^ permalink raw reply
* Re: always use REQ_OP_WRITE_ZEROES for zeroing offload
From: Martin K. Petersen @ 2017-04-05 11:40 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-1-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
Christoph,
> This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
> supported by the block layer, and switches existing implementations
> of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
> removes incorrect discard_zeroes_data, and also switches WRITE SAME
> based zeroing in SCSI to this new method.
Very, very nice. I think this is the correct approach.
I'm going to send two follow-up patches that allow us to use UNMAP for
discards and WRITE SAME w/ UNMAP for zeroout. That appears to be the
preferred configuration for most storage devices.
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 01/25] ѕd: split sd_setup_discard_cmnd
From: Martin K. Petersen @ 2017-04-05 11:42 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
lars.ellenberg, linux-block, linux-scsi, drbd-dev, dm-devel,
linux-raid
In-Reply-To: <20170331163313.31821-2-hch@lst.de>
Christoph Hellwig <hch@lst.de> writes:
> Split sd_setup_discard_cmnd into one function per provisioning type. While
> this creates some very slight duplication of boilerplate code it keeps the
> code modular for additions of new provisioning types, and for reusing the
> write same functions for the upcoming scsi implementation of the Write Zeroes
> operation.
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Minor nit: Patch header should be "sd: ..." instead of " d: ...".
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 02/25] block: renumber REQ_OP_WRITE_ZEROES
From: Martin K. Petersen @ 2017-04-05 11:43 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
lars.ellenberg, linux-block, linux-scsi, drbd-dev, dm-devel,
linux-raid
In-Reply-To: <20170331163313.31821-3-hch@lst.de>
Christoph Hellwig <hch@lst.de> writes:
> Make life easy for implementations that needs to send a data buffer
> to the device (e.g. SCSI) by numbering it as a data out command.
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 03/25] block: implement splitting of REQ_OP_WRITE_ZEROES bios
From: Martin K. Petersen @ 2017-04-05 11:43 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-4-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> Copy and past the REQ_OP_WRITE_SAME code to prepare to implementations
> that limit the write zeroes size.
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 04/25] sd: implement REQ_OP_WRITE_ZEROES
From: Martin K. Petersen @ 2017-04-05 11:44 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-5-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 09/25] block: stop using blkdev_issue_write_same for zeroing
From: Martin K. Petersen @ 2017-04-05 11:45 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-10-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> We'll always use the WRITE ZEROES code for zeroing now.
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 10/25] block: add a flags argument to (__)blkdev_issue_zeroout
From: Martin K. Petersen @ 2017-04-05 11:46 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-11-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch@lst.de> writes:
> Turn the existin discard flag into a new BLKDEV_ZERO_UNMAP flag with
g
> similar semantics, but without referring to diѕcard.
s
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
--
Martin K. Petersen Oracle Linux Engineering
_______________________________________________
drbd-dev mailing list
drbd-dev@lists.linbit.com
http://lists.linbit.com/mailman/listinfo/drbd-dev
^ permalink raw reply
* Re: [PATCH 11/25] block: add a REQ_UNMAP flag for REQ_OP_WRITE_ZEROES
From: Martin K. Petersen @ 2017-04-05 11:46 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-12-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> If this flag is set logical provisioning capable device should
> release space for the zeroed blocks if possible, if it is not set
> devices should keep the blocks anchored.
>
> Also remove an out of sync kerneldoc comment for a static function
> that would have become even more out of data with this change.
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 12/25] block: add a new BLKDEV_ZERO_NOFALLBACK flag
From: Martin K. Petersen @ 2017-04-05 11:47 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-13-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> This avoids fallbacks to explicit zeroing in (__)blkdev_issue_zeroout
> if the caller doesn't want them.
>
> Also clean up the convoluted check for the return condition that this
> new flag is added to.
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 13/25] block_dev: use blkdev_issue_zerout for hole punches
From: Martin K. Petersen @ 2017-04-05 11:48 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-14-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> This gets us support for non-discard efficient write of zeroes
> (e.g. NVMe) and prepare for removing the discard_zeroes_data flag.
s
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 14/25] sd: implement unmapping Write Zeroes
From: Martin K. Petersen @ 2017-04-05 11:48 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-15-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> Try to use a write same with unmap bit variant if the device supports it
> and the caller allows for it.
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 15/25] nvme: implement REQ_OP_WRITE_ZEROES
From: Martin K. Petersen @ 2017-04-05 11:49 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-16-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> But now for the real NVMe Write Zeroes yet, just to get rid of the
> discard abuse for zeroing. Also rename the quirk flag to be a bit
> more self-explanatory.
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 22/25] block: stop using discards for zeroing
From: Martin K. Petersen @ 2017-04-05 11:55 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-23-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> Now that we have REQ_OP_WRITE_ZEROES implemented for all devices that
> support efficient zeroing of devices we can remove the call to
s/of devices/
> blkdev_issue_discard. This means we only have two ways of zeroing
> left and can simply the code.
simplify
> + * Note that this function may fail with -EOPNOTSUPP if the driver supports
> + * efficient zeroing operation, but the device capabilities can only be
> + * discovered by trial and error. In this case the caller should call the
> + * function again, and it will use the fallback path.
Maybe:
"Note that this function may fail with -EOPNOTSUPP if the driver signals
zeroing offload support but the device fails to process the command (for
some devices there is no non-destructive way to verify whether this
operation is actually supported). If -EOPNOTSUPP is returned, the caller
should retry the blkdev_issue_zeroout() and the fallback path will be
used."
Otherwise OK.
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: [PATCH 25/25] block: remove the discard_zeroes_data flag
From: Martin K. Petersen @ 2017-04-05 11:56 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-26-hch-jcswGhMUV9g@public.gmane.org>
Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:
> Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere we
> can kill this hack.
Oh yeah!
Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply
* Re: always use REQ_OP_WRITE_ZEROES for zeroing offload
From: Christoph Hellwig @ 2017-04-05 12:08 UTC (permalink / raw)
To: Martin K. Petersen
Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA, snitzer-H+wXaHxf7aLQT0dZR+AlfA,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
linux-block-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A, Christoph Hellwig,
agk-H+wXaHxf7aLQT0dZR+AlfA, drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <yq1vaqjt74b.fsf-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
On Wed, Apr 05, 2017 at 07:40:36AM -0400, Martin K. Petersen wrote:
> > This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
> > supported by the block layer, and switches existing implementations
> > of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
> > removes incorrect discard_zeroes_data, and also switches WRITE SAME
> > based zeroing in SCSI to this new method.
>
> Very, very nice. I think this is the correct approach.
>
> I'm going to send two follow-up patches that allow us to use UNMAP for
> discards and WRITE SAME w/ UNMAP for zeroout. That appears to be the
> preferred configuration for most storage devices.
Thanks, I'll resend it with your nitpicks fixed, your follow on
patches tacked on and the various reviewed-by tags in a bit.
^ permalink raw reply
* 4.10 + 765d704db: no improvemtn in write rates with md/raid5 group_thread_cnt > 0
From: Nix @ 2017-04-05 14:13 UTC (permalink / raw)
To: linux-raid; +Cc: Shaohua Li
So you'd expect write rates on a RAID-5 array to be higher than write rates on a
single spinning-rust disk, right? Because, even with Shaohua's commit
765d704db1f583630d52 applied atop 4.10, I see little sign of it. Does this
commit depend upon something else to stop death by seeking with
group_thread_cnt > 0? It didn't look like it to me...
The results Shaohua showed in the original commit were very impressive, but for
the life of me I can't figure out how to get anything like them.
With group_thread_cnt 0, I max out at a bit higher than the 240MiB/s one disk in
this array can manage on its own, for obvious reasons: md_raid5 CPU saturation.
(This is with a 512KiB chunksize, stripe_cache_size of 512: yes, I know that's
small, it's just a random slice taken out of a much larger test series: the
array is a smallish non-degraded unjournalled four-element md5 initialized with
--assume-clean for benchmarking). Similar results are seen with ext4 and xfs.
Trimmed-down iozone -a output, so only one serial writer, but still:
stride
kB reclen write rewrite read reread read
64 4 6752 15647 26489 30145 26678
64 8 6639 25236 45101 56289 43158
64 16 6014 9799 67364 89009 60900
64 32 35200 48781 7374 177207 7336
64 64 32420 70551 109395 229470 97868
[...]
32768 64 28181 30576 265403 178438 299889
32768 128 41659 39989 319709 320689 330949
32768 256 45402 44555 320689 357564 451256
32768 512 42559 40556 177862 299744 466529
32768 1024 68005 52814 415747 391507 706177
32768 2048 91701 103918 520689 540128 1061339
32768 4096 177716 169486 487277 514111 683463
32768 8192 218923 233152 539853 616869 453021
32768 16384 199068 198872 569353 619913 535240
[...]
262144 64 25148 32423 385802 378681 27762
262144 128 42510 41626 436994 380669 48004
262144 256 43415 44004 436209 418971 76697
262144 512 41408 40399 342862 401145 116781
262144 1024 68870 59341 465737 507454 265154
262144 2048 101994 91693 589277 582836 296474
262144 4096 176852 166200 581922 649215 421253
262144 8192 226696 221838 601174 633347 569766
262144 16384 307843 297985 644679 659060 569302
524288 64 25155 24527 392401 401908 21461
524288 128 41422 41525 433156 464331 35360
524288 256 42059 43742 443281 415799 70171
524288 512 41253 39360 414306 428993 75387
524288 1024 66081 61151 498880 517952 186959
524288 2048 101272 90418 610467 623258 274331
524288 4096 171489 173381 601689 576333 314290
524288 8192 220943 215226 641713 607459 444827
524288 16384 289055 296340 651010 671623 503633
Read rates are as high as I'd expect for a four-disk RAID-5 array, and the
sequential write output rates, while higher than one one disk can manage, are
thresholded here by the performance of the md I/O thread, as expected.
If I boost group_thread_cnt to, say, 2, I see:
64 4 3677 14565 27936 36056 29629
64 8 6670 21608 53422 69187 32045
64 16 6682 26209 70329 103891 66662
64 32 28624 40048 7312 154556 7345
64 64 38327 43213 89127 260160 90540
[...]
32768 64 14328 18580 265136 282946 308082
32768 128 26310 24803 265762 323414 354685
32768 256 29115 27073 238659 308974 345723
32768 512 21572 21345 293312 314086 345365
32768 1024 43978 38071 395715 345161 545821
32768 2048 82898 70840 293151 470398 922082
32768 4096 143350 124658 391980 659819 617984
32768 8192 164297 227661 570423 645141 515009
32768 16384 157701 171804 568484 451448 350715
[...]
262144 64 17150 17693 391561 382751 28374
262144 128 25385 26498 423685 410359 47148
262144 256 29219 30244 392992 421748 80403
262144 512 24303 24686 399453 371882 122861
262144 1024 42296 42535 403020 508195 261339
262144 2048 75740 63125 606979 589329 296124
262144 4096 134646 137543 562749 590893 392938
262144 8192 237800 239847 631752 620766 475791
262144 16384 267889 304517 635674 612164 598521
524288 64 17691 17776 403333 374628 21673
524288 128 25575 25609 396568 439018 34526
524288 256 29984 29990 412587 437099 71650
524288 512 24971 25599 403074 431581 75471
524288 1024 42545 43657 505740 519112 200811
524288 2048 72519 75604 559987 589069 257654
524288 4096 135122 140745 622450 499336 331273
524288 8192 232848 231307 592729 604849 432296
524288 16384 280105 271252 647725 664868 472363
Larger writes are clearly still thresholded.
Boost the thread count more, here, to 8:
64 4 7834 14388 30346 40300 6124
64 8 17236 21282 6984 37644 6842
64 16 21100 24720 7208 120277 7199
64 32 29411 45553 7374 162411 7357
64 64 3671 59588 78128 256923 82804
[...]
32768 64 14261 17866 261303 289135 294245
32768 128 25832 27639 298172 324766 342822
32768 256 26477 27196 277318 339353 352967
32768 512 17848 19875 339424 272225 387746
32768 1024 36017 38945 482068 464194 110825
32768 2048 64240 67976 551762 505772 76629
32768 4096 71022 117680 578561 696507 752493
32768 8192 161080 207790 564343 556796 546488
32768 16384 172937 233103 521368 603562 418679
[...]
262144 64 17170 17452 352337 351258 27824
262144 128 25318 25522 418977 424859 47112
262144 256 26405 27092 426170 419684 79047
262144 512 20185 20271 398733 411974 135554
262144 1024 39013 38238 497919 438150 180384
262144 2048 71054 70921 586634 535676 258955
262144 4096 113222 121554 616548 604177 293088
262144 8192 184086 187845 551395 586126 496147
262144 16384 286319 272419 645900 659103 589384
524288 64 16980 16756 385746 381476 21462
524288 128 24993 25482 428855 438250 34889
524288 256 26517 26134 448088 395352 70225
524288 512 19534 19484 418764 416630 76975
524288 1024 37645 38370 514030 511638 177818
524288 2048 68469 72200 602688 542627 251162
524288 4096 115467 121220 598738 629120 289589
524288 8192 185093 182044 621233 586919 437162
524288 16384 250990 266257 620428 660663 494770
Still thresholded. Yes, this is only one serial writer, but nonetheless this
seems a bit od.
--
NULL && (void)
^ permalink raw reply
* always use REQ_OP_WRITE_ZEROES for zeroing offload V2
From: Christoph Hellwig @ 2017-04-05 14:21 UTC (permalink / raw)
To: axboe-tSWWG44O7X1aa/9Udqfwiw,
martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
agk-H+wXaHxf7aLQT0dZR+AlfA, snitzer-H+wXaHxf7aLQT0dZR+AlfA,
shli-DgEjT+Ai2ygdnm+yROfE0A,
philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA
Cc: linux-block-u79uwXL29TY76Z2rM5mHXA,
linux-raid-u79uwXL29TY76Z2rM5mHXA,
dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
linux-scsi-u79uwXL29TY76Z2rM5mHXA,
drbd-dev-cunTk1MwBs8qoQakbn7OcQ
This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
supported by the block layer, and switches existing implementations
of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
removes incorrect discard_zeroes_data, and also switches WRITE SAME
based zeroing in SCSI to this new method.
The series is against the block for-next tree.
A git tree is also avaiable at:
git://git.infradead.org/users/hch/block.git discard-rework.2
Gitweb:
http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/discard-rework.2
Changes since V2:
- various spelling fixes
- various reviews captured
- two new patches from Martin at the end
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox