Linux RAID subsystem development
 help / color / mirror / Atom feed
* [md PATCH 05/10] md/raid1: factor out flush_bio_list()
From: NeilBrown @ 2017-04-05  4:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>

flush_pending_writes() and raid1_unplug() each contain identical
copies of a fairly large slab of code.  So factor that out into
new flush_bio_list() to simplify maintenance.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid1.c |   67 +++++++++++++++++++++-------------------------------
 1 file changed, 27 insertions(+), 40 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d15268fff052..a70283753a35 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -787,6 +787,31 @@ static int raid1_congested(struct mddev *mddev, int bits)
 	return ret;
 }
 
+static void flush_bio_list(struct r1conf *conf, struct bio *bio)
+{
+	/* flush any pending bitmap writes to
+	 * disk before proceeding w/ I/O */
+	bitmap_unplug(conf->mddev->bitmap);
+	wake_up(&conf->wait_barrier);
+
+	while (bio) { /* submit pending writes */
+		struct bio *next = bio->bi_next;
+		struct md_rdev *rdev = (void*)bio->bi_bdev;
+		bio->bi_next = NULL;
+		bio->bi_bdev = rdev->bdev;
+		if (test_bit(Faulty, &rdev->flags)) {
+			bio->bi_error = -EIO;
+			bio_endio(bio);
+		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+				    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+			/* Just ignore it */
+			bio_endio(bio);
+		else
+			generic_make_request(bio);
+		bio = next;
+	}
+}
+
 static void flush_pending_writes(struct r1conf *conf)
 {
 	/* Any writes that have been queued but are awaiting
@@ -799,27 +824,7 @@ static void flush_pending_writes(struct r1conf *conf)
 		bio = bio_list_get(&conf->pending_bio_list);
 		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
-		/* flush any pending bitmap writes to
-		 * disk before proceeding w/ I/O */
-		bitmap_unplug(conf->mddev->bitmap);
-		wake_up(&conf->wait_barrier);
-
-		while (bio) { /* submit pending writes */
-			struct bio *next = bio->bi_next;
-			struct md_rdev *rdev = (void*)bio->bi_bdev;
-			bio->bi_next = NULL;
-			bio->bi_bdev = rdev->bdev;
-			if (test_bit(Faulty, &rdev->flags)) {
-				bio->bi_error = -EIO;
-				bio_endio(bio);
-			} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-					    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
-				/* Just ignore it */
-				bio_endio(bio);
-			else
-				generic_make_request(bio);
-			bio = next;
-		}
+		flush_bio_list(conf, bio);
 	} else
 		spin_unlock_irq(&conf->device_lock);
 }
@@ -1152,25 +1157,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 
 	/* we aren't scheduling, so we can do the write-out directly. */
 	bio = bio_list_get(&plug->pending);
-	bitmap_unplug(mddev->bitmap);
-	wake_up(&conf->wait_barrier);
-
-	while (bio) { /* submit pending writes */
-		struct bio *next = bio->bi_next;
-		struct md_rdev *rdev = (void*)bio->bi_bdev;
-		bio->bi_next = NULL;
-		bio->bi_bdev = rdev->bdev;
-		if (test_bit(Faulty, &rdev->flags)) {
-			bio->bi_error = -EIO;
-			bio_endio(bio);
-		} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-				    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
-			/* Just ignore it */
-			bio_endio(bio);
-		else
-			generic_make_request(bio);
-		bio = next;
-	}
+	flush_bio_list(conf, bio);
 	kfree(plug);
 }
 



^ permalink raw reply related

* [md PATCH 06/10] md/raid10: simplify the splitting of requests.
From: NeilBrown @ 2017-04-05  4:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>

raid10 splits requests in two different ways for two different
reasons.

First, bio_split() is used to ensure the bio fits with a chunk.
Second, multiple r10bio structures are allocated to represent the
different sections that need to go to different devices, to avoid
known bad blocks.

This can be simplified to just use bio_split() once, and not to use
multiple r10bios.
We delay the split until we know a maximum bio size that can
be handled with a single r10bio, and then split the bio and queue
the remainder for later handling.

As with raid1, we allocate a new bio_set to help with the splitting.
It is not correct to use fs_bio_set in a device driver.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid10.c |  164 ++++++++++++++++-----------------------------------
 drivers/md/raid10.h |    1 
 2 files changed, 51 insertions(+), 114 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0f13d57ef646..ff02c058dbdd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1127,7 +1127,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	struct bio *read_bio;
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
-	int sectors_handled;
 	int max_sectors;
 	sector_t sectors;
 	struct md_rdev *rdev;
@@ -1140,7 +1139,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	 */
 	wait_barrier(conf);
 
-	sectors = bio_sectors(bio);
+	sectors = r10_bio->sectors;
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
 	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1157,17 +1156,23 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		wait_barrier(conf);
 	}
 
-read_again:
 	rdev = read_balance(conf, r10_bio, &max_sectors);
 	if (!rdev) {
 		raid_end_bio_io(r10_bio);
 		return;
 	}
+	if (max_sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, max_sectors,
+					      GFP_NOIO, conf->bio_split);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		r10_bio->master_bio = bio;
+		r10_bio->sectors = max_sectors;
+	}
 	slot = r10_bio->read_slot;
 
 	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-	bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
-		 max_sectors);
 
 	r10_bio->devs[slot].bio = read_bio;
 	r10_bio->devs[slot].rdev = rdev;
@@ -1186,40 +1191,13 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	        trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
 	                              read_bio, disk_devt(mddev->gendisk),
 	                              r10_bio->sector);
-	if (max_sectors < r10_bio->sectors) {
-		/*
-		 * Could not read all from this device, so we will need another
-		 * r10_bio.
-		 */
-		sectors_handled = (r10_bio->sector + max_sectors
-				   - bio->bi_iter.bi_sector);
-		r10_bio->sectors = max_sectors;
-		inc_pending(conf);
-		bio_inc_remaining(bio);
-		/*
-		 * Cannot call generic_make_request directly as that will be
-		 * queued in __generic_make_request and subsequent
-		 * mempool_alloc might block waiting for it.  so hand bio over
-		 * to raid10d.
-		 */
-		reschedule_retry(r10_bio);
-
-		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-		r10_bio->master_bio = bio;
-		r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-		r10_bio->state = 0;
-		r10_bio->mddev = mddev;
-		r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-		goto read_again;
-	} else
-		generic_make_request(read_bio);
+	generic_make_request(read_bio);
 	return;
 }
 
 static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 				  struct bio *bio, bool replacement,
-				  int n_copy, int max_sectors)
+				  int n_copy)
 {
 	const int op = bio_op(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
@@ -1243,7 +1221,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 		rdev = conf->mirrors[devnum].rdev;
 
 	mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
-	bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
 	if (replacement)
 		r10_bio->devs[n_copy].repl_bio = mbio;
 	else
@@ -1294,7 +1271,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	int i;
 	struct md_rdev *blocked_rdev;
 	sector_t sectors;
-	int sectors_handled;
 	int max_sectors;
 
 	md_write_start(mddev, bio);
@@ -1306,7 +1282,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 	wait_barrier(conf);
 
-	sectors = bio_sectors(bio);
+	sectors = r10_bio->sectors;
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
 	    bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
@@ -1476,44 +1452,29 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
 	if (max_sectors < r10_bio->sectors)
 		r10_bio->sectors = max_sectors;
-	sectors_handled = r10_bio->sector + max_sectors -
-		bio->bi_iter.bi_sector;
+
+	if (r10_bio->sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, r10_bio->sectors,
+					      GFP_NOIO, conf->bio_split);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		r10_bio->master_bio = bio;
+	}
 
 	atomic_set(&r10_bio->remaining, 1);
 	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
 	for (i = 0; i < conf->copies; i++) {
 		if (r10_bio->devs[i].bio)
-			raid10_write_one_disk(mddev, r10_bio, bio, false,
-					      i, max_sectors);
+			raid10_write_one_disk(mddev, r10_bio, bio, false, i);
 		if (r10_bio->devs[i].repl_bio)
-			raid10_write_one_disk(mddev, r10_bio, bio, true,
-					      i, max_sectors);
-	}
-
-	/* Don't remove the bias on 'remaining' (one_write_done) until
-	 * after checking if we need to go around again.
-	 */
-
-	if (sectors_handled < bio_sectors(bio)) {
-		/* We need another r10_bio and it needs to be counted */
-		inc_pending(conf);
-		bio_inc_remaining(bio);
-		one_write_done(r10_bio);
-		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-		r10_bio->master_bio = bio;
-		r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-
-		r10_bio->mddev = mddev;
-		r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
-		r10_bio->state = 0;
-		goto retry_write;
+			raid10_write_one_disk(mddev, r10_bio, bio, true, i);
 	}
 	one_write_done(r10_bio);
 }
 
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 {
 	struct r10conf *conf = mddev->private;
 	struct r10bio *r10_bio;
@@ -1521,7 +1482,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 	r10_bio->master_bio = bio;
-	r10_bio->sectors = bio_sectors(bio);
+	r10_bio->sectors = sectors;
 
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_iter.bi_sector;
@@ -1538,54 +1499,26 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 	struct r10conf *conf = mddev->private;
 	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
 	int chunk_sects = chunk_mask + 1;
-
-	struct bio *split;
+	int sectors = bio_sectors(bio);
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
 
-	do {
-
-		/*
-		 * If this request crosses a chunk boundary, we need to split
-		 * it.
-		 */
-		if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
-			     bio_sectors(bio) > chunk_sects
-			     && (conf->geo.near_copies < conf->geo.raid_disks
-				 || conf->prev.near_copies <
-				 conf->prev.raid_disks))) {
-			split = bio_split(bio, chunk_sects -
-					  (bio->bi_iter.bi_sector &
-					   (chunk_sects - 1)),
-					  GFP_NOIO, fs_bio_set);
-			bio_chain(split, bio);
-		} else {
-			split = bio;
-		}
-
-		/*
-		 * If a bio is splitted, the first part of bio will pass
-		 * barrier but the bio is queued in current->bio_list (see
-		 * generic_make_request). If there is a raise_barrier() called
-		 * here, the second part of bio can't pass barrier. But since
-		 * the first part bio isn't dispatched to underlaying disks
-		 * yet, the barrier is never released, hence raise_barrier will
-		 * alays wait. We have a deadlock.
-		 * Note, this only happens in read path. For write path, the
-		 * first part of bio is dispatched in a schedule() call
-		 * (because of blk plug) or offloaded to raid10d.
-		 * Quitting from the function immediately can change the bio
-		 * order queued in bio_list and avoid the deadlock.
-		 */
-		__make_request(mddev, split);
-		if (split != bio && bio_data_dir(bio) == READ) {
-			generic_make_request(bio);
-			break;
-		}
-	} while (split != bio);
+	/*
+	 * If this request crosses a chunk boundary, we need to split
+	 * it.
+	 */
+	if (unlikely((bio->bi_iter.bi_sector & chunk_mask) +
+		     sectors > chunk_sects
+		     && (conf->geo.near_copies < conf->geo.raid_disks
+			 || conf->prev.near_copies <
+			 conf->prev.raid_disks)))
+		sectors = chunk_sects -
+			(bio->bi_iter.bi_sector &
+			 (chunk_sects - 1));
+	__make_request(mddev, bio, sectors);
 
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
@@ -2873,13 +2806,8 @@ static void raid10d(struct md_thread *thread)
 			recovery_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_ReadError, &r10_bio->state))
 			handle_read_error(mddev, r10_bio);
-		else {
-			/* just a partial read to be scheduled from a
-			 * separate context
-			 */
-			int slot = r10_bio->read_slot;
-			generic_make_request(r10_bio->devs[slot].bio);
-		}
+		else
+			WARN_ON_ONCE(1);
 
 		cond_resched();
 		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
@@ -3652,6 +3580,10 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	if (!conf->r10bio_pool)
 		goto out;
 
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!conf->bio_split)
+		goto out;
+
 	calc_sectors(conf, mddev->dev_sectors);
 	if (mddev->reshape_position == MaxSector) {
 		conf->prev = conf->geo;
@@ -3689,6 +3621,8 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		mempool_destroy(conf->r10bio_pool);
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
+		if (conf->bio_split)
+			bioset_free(conf->bio_split);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
@@ -3898,6 +3832,8 @@ static void raid10_free(struct mddev *mddev, void *priv)
 	kfree(conf->mirrors);
 	kfree(conf->mirrors_old);
 	kfree(conf->mirrors_new);
+	if (conf->bio_split)
+		bioset_free(conf->bio_split);
 	kfree(conf);
 }
 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 3162615e57bd..735ce1a3d260 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -82,6 +82,7 @@ struct r10conf {
 	mempool_t		*r10bio_pool;
 	mempool_t		*r10buf_pool;
 	struct page		*tmppage;
+	struct bio_set		*bio_split;
 
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.



^ permalink raw reply related

* [md PATCH 07/10] md/raid10: simplify handle_read_error()
From: NeilBrown @ 2017-04-05  4:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>

handle_read_error() duplicates a lot of the work that raid10_read_request()
does, so it makes sense to just use that function.

handle_read_error() relies on the same r10bio being re-used so that,
in the case of a read-only array, setting IO_BLOCKED in r1bio->devs[].bio
ensures read_balance() won't re-use that device.
So when called from raid10_make_request() we clear that array, but not
when called from handle_read_error().

Two parts of handle_read_error() that need to be preserved are the warning
message it prints, so they are conditionally added to
raid10_read_request().  If the failing rdev can be found, messages
are printed.  Otherwise they aren't.

Not that as rdev_dec_pending() has already been called on the failing
rdev, we need to use rcu_read_lock() to get a new reference from
the conf.  We only use this to get the name of the failing block device.

With this change, we no longer need inc_pending().

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid10.c |  120 +++++++++++++++++++--------------------------------
 1 file changed, 45 insertions(+), 75 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ff02c058dbdd..ca722570bd38 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1008,15 +1008,6 @@ static void wait_barrier(struct r10conf *conf)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
-static void inc_pending(struct r10conf *conf)
-{
-	/* The current request requires multiple r10_bio, so
-	 * we need to increment the pending count.
-	 */
-	WARN_ON(!atomic_read(&conf->nr_pending));
-	atomic_inc(&conf->nr_pending);
-}
-
 static void allow_barrier(struct r10conf *conf)
 {
 	if ((atomic_dec_and_test(&conf->nr_pending)) ||
@@ -1130,8 +1121,36 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	int max_sectors;
 	sector_t sectors;
 	struct md_rdev *rdev;
-	int slot;
+	char b[BDEVNAME_SIZE];
+	int slot = r10_bio->read_slot;
+	struct md_rdev *err_rdev = NULL;
+	gfp_t gfp = GFP_NOIO;
+
+	if (r10_bio->devs[slot].rdev) {
+		/* This is an error retry, but we cannot
+		 * safely dereference the rdev in the r10_bio,
+		 * we must use the one in conf.
+		 * If it has already been disconnected (unlikely)
+		 * we lose the device name in error messages.
+		 */
+		int disk;
+		/* As we are blocking raid10, it is a little safer to
+		 * use __GFP_HIGH.
+		 */
+		gfp = GFP_NOIO | __GFP_HIGH;
 
+		rcu_read_lock();
+		disk = r10_bio->devs[slot].devnum;
+		err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (err_rdev)
+			bdevname(err_rdev->bdev, b);
+		else {
+			strcpy(b, "???");
+			/* This never gets dereferenced */
+			err_rdev = r10_bio->devs[slot].rdev;
+		}
+		rcu_read_unlock();
+	}
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
@@ -1158,12 +1177,22 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 
 	rdev = read_balance(conf, r10_bio, &max_sectors);
 	if (!rdev) {
+		if (err_rdev) {
+			pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
+					    mdname(mddev), b,
+					    (unsigned long long)r10_bio->sector);
+		}
 		raid_end_bio_io(r10_bio);
 		return;
 	}
+	if (err_rdev)
+		pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
+				   mdname(mddev),
+				   bdevname(rdev->bdev, b),
+				   (unsigned long long)r10_bio->sector);
 	if (max_sectors < bio_sectors(bio)) {
 		struct bio *split = bio_split(bio, max_sectors,
-					      GFP_NOIO, conf->bio_split);
+					      gfp, conf->bio_split);
 		bio_chain(split, bio);
 		generic_make_request(bio);
 		bio = split;
@@ -1172,7 +1201,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	}
 	slot = r10_bio->read_slot;
 
-	read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
+	read_bio = bio_clone_fast(bio, gfp, mddev->bio_set);
 
 	r10_bio->devs[slot].bio = read_bio;
 	r10_bio->devs[slot].rdev = rdev;
@@ -1487,6 +1516,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
 
 	if (bio_data_dir(bio) == READ)
 		raid10_read_request(mddev, bio, r10_bio);
@@ -2556,9 +2586,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	struct bio *bio;
 	struct r10conf *conf = mddev->private;
 	struct md_rdev *rdev = r10_bio->devs[slot].rdev;
-	char b[BDEVNAME_SIZE];
-	unsigned long do_sync;
-	int max_sectors;
 	dev_t bio_dev;
 	sector_t bio_last_sector;
 
@@ -2571,7 +2598,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	 * frozen.
 	 */
 	bio = r10_bio->devs[slot].bio;
-	bdevname(bio->bi_bdev, b);
 	bio_dev = bio->bi_bdev->bd_dev;
 	bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
 	bio_put(bio);
@@ -2587,65 +2613,9 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		md_error(mddev, rdev);
 
 	rdev_dec_pending(rdev, mddev);
-
-read_more:
-	rdev = read_balance(conf, r10_bio, &max_sectors);
-	if (rdev == NULL) {
-		pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n",
-				    mdname(mddev), b,
-				    (unsigned long long)r10_bio->sector);
-		raid_end_bio_io(r10_bio);
-		return;
-	}
-
-	do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
-	slot = r10_bio->read_slot;
-	pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
-			   mdname(mddev),
-			   bdevname(rdev->bdev, b),
-			   (unsigned long long)r10_bio->sector);
-	bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set);
-	bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors);
-	r10_bio->devs[slot].bio = bio;
-	r10_bio->devs[slot].rdev = rdev;
-	bio->bi_iter.bi_sector = r10_bio->devs[slot].addr
-		+ choose_data_offset(r10_bio, rdev);
-	bio->bi_bdev = rdev->bdev;
-	bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
-	if (test_bit(FailFast, &rdev->flags) &&
-	    test_bit(R10BIO_FailFast, &r10_bio->state))
-		bio->bi_opf |= MD_FAILFAST;
-	bio->bi_private = r10_bio;
-	bio->bi_end_io = raid10_end_read_request;
-	trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
-			      bio, bio_dev,
-			      bio_last_sector - r10_bio->sectors);
-
-	if (max_sectors < r10_bio->sectors) {
-		/* Drat - have to split this up more */
-		struct bio *mbio = r10_bio->master_bio;
-		int sectors_handled =
-			r10_bio->sector + max_sectors
-			- mbio->bi_iter.bi_sector;
-		r10_bio->sectors = max_sectors;
-		bio_inc_remaining(mbio);
-		inc_pending(conf);
-		generic_make_request(bio);
-
-		r10_bio = mempool_alloc(conf->r10bio_pool,
-					GFP_NOIO);
-		r10_bio->master_bio = mbio;
-		r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
-		r10_bio->state = 0;
-		set_bit(R10BIO_ReadError,
-			&r10_bio->state);
-		r10_bio->mddev = mddev;
-		r10_bio->sector = mbio->bi_iter.bi_sector
-			+ sectors_handled;
-
-		goto read_more;
-	} else
-		generic_make_request(bio);
+	allow_barrier(conf);
+	r10_bio->state = 0;
+	raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
 }
 
 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)



^ permalink raw reply related

* [md PATCH 08/10] md/raid5: make chunk_aligned_read() split bios more cleanly.
From: NeilBrown @ 2017-04-05  4:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>

chunk_aligned_read() currently uses fs_bio_set - which is meant for
filesystems to use - and loops if multiple splits are needed, which is
not best practice.
As this is only used for READ requests, not writes, it is unlikely
to cause a problem.  However it is best to be consistent in how
we split bios, and to follow the pattern used in raid1/raid10.

So create a private bioset, bio_split, and use it to perform a single
split, submitting the remainder to generic_make_request() for later
processing.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |   33 +++++++++++++++++----------------
 drivers/md/raid5.h |    1 +
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6036d5e41ddd..c523fd69a7bc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5234,24 +5234,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
 {
 	struct bio *split;
+	sector_t sector = raid_bio->bi_iter.bi_sector;
+	unsigned chunk_sects = mddev->chunk_sectors;
+	unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
 
-	do {
-		sector_t sector = raid_bio->bi_iter.bi_sector;
-		unsigned chunk_sects = mddev->chunk_sectors;
-		unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
-		if (sectors < bio_sectors(raid_bio)) {
-			split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
-			bio_chain(split, raid_bio);
-		} else
-			split = raid_bio;
+	if (sectors < bio_sectors(raid_bio)) {
+		struct r5conf *conf = mddev->private;
+		split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+		bio_chain(split, raid_bio);
+		generic_make_request(raid_bio);
+		raid_bio = split;
+	}
 
-		if (!raid5_read_one_chunk(mddev, split)) {
-			if (split != raid_bio)
-				generic_make_request(raid_bio);
-			return split;
-		}
-	} while (split != raid_bio);
+	if (!raid5_read_one_chunk(mddev, raid_bio))
+		return split;
 
 	return NULL;
 }
@@ -6735,6 +6731,8 @@ static void free_conf(struct r5conf *conf)
 		if (conf->disks[i].extra_page)
 			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
+	if (conf->bio_split)
+		bioset_free(conf->bio_split);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf->pending_data);
 	kfree(conf);
@@ -6910,6 +6908,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 			goto abort;
 	}
 
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!conf->bio_split)
+		goto abort;
 	conf->mddev = mddev;
 
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cdc7f92e1806..625c7f16fd6b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -646,6 +646,7 @@ struct r5conf {
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
 	struct disk_info	*disks;
+	struct bio_set		*bio_split;
 
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.



^ permalink raw reply related

* [md PATCH 09/10] md/linear: improve bio splitting.
From: NeilBrown @ 2017-04-05  4:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>

linear_make_request() uses fs_bio_set, which is meant for filesystems
to use, and loops, possible allocating  from the same bio set multiple
times.
These behaviors can theoretically cause deadlocks, though as
linear requests are hardly ever split, it is unlikely in practice.

Change to use mddev->bio_set - otherwise unused for linear, and submit
the tail of a split request to generic_make_request() for it to
handle.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/linear.c |   75 ++++++++++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 39 deletions(-)

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3e38e0207a3e..d67105a15a39 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -249,53 +249,50 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 {
 	char b[BDEVNAME_SIZE];
 	struct dev_info *tmp_dev;
-	struct bio *split;
 	sector_t start_sector, end_sector, data_offset;
+	sector_t bio_sector = bio->bi_iter.bi_sector;
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
 
-	do {
-		sector_t bio_sector = bio->bi_iter.bi_sector;
-		tmp_dev = which_dev(mddev, bio_sector);
-		start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
-		end_sector = tmp_dev->end_sector;
-		data_offset = tmp_dev->rdev->data_offset;
-		bio->bi_bdev = tmp_dev->rdev->bdev;
-
-		if (unlikely(bio_sector >= end_sector ||
-			     bio_sector < start_sector))
-			goto out_of_bounds;
-
-		if (unlikely(bio_end_sector(bio) > end_sector)) {
-			/* This bio crosses a device boundary, so we have to
-			 * split it.
-			 */
-			split = bio_split(bio, end_sector - bio_sector,
-					  GFP_NOIO, fs_bio_set);
-			bio_chain(split, bio);
-		} else {
-			split = bio;
-		}
+	tmp_dev = which_dev(mddev, bio_sector);
+	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+	end_sector = tmp_dev->end_sector;
+	data_offset = tmp_dev->rdev->data_offset;
+
+	if (unlikely(bio_sector >= end_sector ||
+		     bio_sector < start_sector))
+		goto out_of_bounds;
+
+	if (unlikely(bio_end_sector(bio) > end_sector)) {
+		/* This bio crosses a device boundary, so we have to
+		 * split it.
+		 */
+		struct bio *split = bio_split(bio, end_sector - bio_sector,
+					      GFP_NOIO, mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+	}
 
-		split->bi_iter.bi_sector = split->bi_iter.bi_sector -
-			start_sector + data_offset;
-
-		if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
-			 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
-			/* Just ignore it */
-			bio_endio(split);
-		} else {
-			if (mddev->gendisk)
-				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
-						      split, disk_devt(mddev->gendisk),
-						      bio_sector);
-			mddev_check_writesame(mddev, split);
-			generic_make_request(split);
-		}
-	} while (split != bio);
+	bio->bi_bdev = tmp_dev->rdev->bdev;
+	bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+		start_sector + data_offset;
+
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+		/* Just ignore it */
+		bio_endio(bio);
+	} else {
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+					      bio, disk_devt(mddev->gendisk),
+					      bio_sector);
+		mddev_check_writesame(mddev, bio);
+		generic_make_request(bio);
+	}
 	return;
 
 out_of_bounds:



^ permalink raw reply related

* [md PATCH 10/10] md/raid0: fix up bio splitting.
From: NeilBrown @ 2017-04-05  4:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <149136485390.25893.1797855041954158826.stgit@noble>

raid0_make_request() should use a private bio_set rather than the
shared fs_bio_set, which is only meant for filesystems to use.

raid0_make_request() shouldn't loop around using the bio_set
multiple times as that can deadlock.

So use mddev->bio_set and pass the tail to generic_make_request()
instead of looping on it.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid0.c |   73 ++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 56f70c3ad37c..e777e48f55f6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -462,52 +462,53 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
 	struct md_rdev *tmp_dev;
-	struct bio *split;
+	sector_t bio_sector;
+	sector_t sector;
+	unsigned chunk_sects;
+	unsigned sectors;
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
 
-	do {
-		sector_t bio_sector = bio->bi_iter.bi_sector;
-		sector_t sector = bio_sector;
-		unsigned chunk_sects = mddev->chunk_sectors;
+	bio_sector = bio->bi_iter.bi_sector;
+	sector = bio_sector;
+	chunk_sects = mddev->chunk_sectors;
 
-		unsigned sectors = chunk_sects -
-			(likely(is_power_of_2(chunk_sects))
-			 ? (sector & (chunk_sects-1))
-			 : sector_div(sector, chunk_sects));
+	sectors = chunk_sects -
+		(likely(is_power_of_2(chunk_sects))
+		 ? (sector & (chunk_sects-1))
+		 : sector_div(sector, chunk_sects));
 
-		/* Restore due to sector_div */
-		sector = bio_sector;
+	/* Restore due to sector_div */
+	sector = bio_sector;
 
-		if (sectors < bio_sectors(bio)) {
-			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
-			bio_chain(split, bio);
-		} else {
-			split = bio;
-		}
+	if (sectors < bio_sectors(bio)) {
+		struct bio *split = bio_split(bio, sectors, GFP_NOIO, mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+	}
 
-		zone = find_zone(mddev->private, &sector);
-		tmp_dev = map_sector(mddev, zone, sector, &sector);
-		split->bi_bdev = tmp_dev->bdev;
-		split->bi_iter.bi_sector = sector + zone->dev_start +
-			tmp_dev->data_offset;
-
-		if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
-			 !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
-			/* Just ignore it */
-			bio_endio(split);
-		} else {
-			if (mddev->gendisk)
-				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
-						      split, disk_devt(mddev->gendisk),
-						      bio_sector);
-			mddev_check_writesame(mddev, split);
-			generic_make_request(split);
-		}
-	} while (split != bio);
+	zone = find_zone(mddev->private, &sector);
+	tmp_dev = map_sector(mddev, zone, sector, &sector);
+	bio->bi_bdev = tmp_dev->bdev;
+	bio->bi_iter.bi_sector = sector + zone->dev_start +
+		tmp_dev->data_offset;
+
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+		/* Just ignore it */
+		bio_endio(bio);
+	} else {
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+					      bio, disk_devt(mddev->gendisk),
+					      bio_sector);
+		mddev_check_writesame(mddev, bio);
+		generic_make_request(bio);
+	}
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)



^ permalink raw reply related

* mdadm - dmraid
From: gmx @ 2017-04-05  6:56 UTC (permalink / raw)
  To: linux-raid

there exist a problem between packages dmraid and mdadm

incompatibility between both is known. but both packages can be 
installed simultan without message

(result: problems - perhaps there are no problems in both packages but 
together - even no kernel-problem)

greetings hj


^ permalink raw reply

* Re: [RFC PATCH] raid1: reset 'bi_next' before reuse the bio
From: Michael Wang @ 2017-04-05  7:40 UTC (permalink / raw)
  To: NeilBrown, linux-raid, linux-kernel@vger.kernel.org
  Cc: Shaohua Li, Jinpu Wang
In-Reply-To: <87shlnizqn.fsf@notabene.neil.brown.name>



On 04/05/2017 12:17 AM, NeilBrown wrote:
[snip]
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index 7d67235..0554110 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -1986,11 +1986,13 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
>>  		/* Don't try recovering from here - just fail it
>>  		 * ... unless it is the last working device of course */
>>  		md_error(mddev, rdev);
>> -		if (test_bit(Faulty, &rdev->flags))
>> +		if (test_bit(Faulty, &rdev->flags)) {
>>  			/* Don't try to read from here, but make sure
>>  			 * put_buf does it's thing
>>  			 */
>>  			bio->bi_end_io = end_sync_write;
>> +			bio->bi_next = NULL;
>> +		}
>>  	}
>>  
>>  	while(sectors) {
> 
> 
> Ah - I see what is happening now.  I was looking at the vanilla 4.4
> code, which doesn't have the failfast changes.

My bad to forgot mention... yes our md stuff is very much close to the
upstream.

> 
> I don't think your patch is correct though.  We really shouldn't be
> re-using that bio, and setting bi_next to NULL just hides the bug.  It
> doesn't fix it.
> As the rdev is now Faulty, it doesn't make sense for
> sync_request_write() to submit a write request to it.

Make sense, while still have concerns regarding the design:
  * in this case since the read_disk already abandoned, is it fine to
    keep r1_bio->read_disk recording the faulty device index?
  * we assign the 'end_sync_write' to the original read bio in this
    case, but when is this supposed to be called?

> 
> Can you confirm that this works please.

Yes, it works.

Tested-by: Michael Wang <yun.wang@profitbricks.com>

Regards,
Michael Wang

> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index d2d8b8a5bd56..219f1e1f1d1d 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -2180,6 +2180,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
>  		     (i == r1_bio->read_disk ||
>  		      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
>  			continue;
> +		if (test_bit(Faulty, &conf->mirrors[i].rdev->flags))
> +			continue;
>  
>  		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
>  		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
> 
> 
> Thanks,
> NeilBrown
> 

^ permalink raw reply

* Re: always use REQ_OP_WRITE_ZEROES for zeroing offload
From: Martin K. Petersen @ 2017-04-05 11:40 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-1-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

Christoph,

> This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
> supported by the block layer, and switches existing implementations
> of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
> removes incorrect discard_zeroes_data, and also switches WRITE SAME
> based zeroing in SCSI to this new method.

Very, very nice. I think this is the correct approach.

I'm going to send two follow-up patches that allow us to use UNMAP for
discards and WRITE SAME w/ UNMAP for zeroout. That appears to be the
preferred configuration for most storage devices.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 01/25] ѕd: split sd_setup_discard_cmnd
From: Martin K. Petersen @ 2017-04-05 11:42 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg, linux-block, linux-scsi, drbd-dev, dm-devel,
	linux-raid
In-Reply-To: <20170331163313.31821-2-hch@lst.de>

Christoph Hellwig <hch@lst.de> writes:

> Split sd_setup_discard_cmnd into one function per provisioning type.  While
> this creates some very slight duplication of boilerplate code it keeps the
> code modular for additions of new provisioning types, and for reusing the
> write same functions for the upcoming scsi implementation of the Write Zeroes
> operation.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>

Minor nit: Patch header should be "sd: ..." instead of " d: ...".

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 02/25] block: renumber REQ_OP_WRITE_ZEROES
From: Martin K. Petersen @ 2017-04-05 11:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg, linux-block, linux-scsi, drbd-dev, dm-devel,
	linux-raid
In-Reply-To: <20170331163313.31821-3-hch@lst.de>

Christoph Hellwig <hch@lst.de> writes:

> Make life easy for implementations that needs to send a data buffer
> to the device (e.g. SCSI) by numbering it as a data out command.

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 03/25] block: implement splitting of REQ_OP_WRITE_ZEROES bios
From: Martin K. Petersen @ 2017-04-05 11:43 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-4-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> Copy and past the REQ_OP_WRITE_SAME code to prepare to implementations
> that limit the write zeroes size.

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 04/25] sd: implement REQ_OP_WRITE_ZEROES
From: Martin K. Petersen @ 2017-04-05 11:44 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-5-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 09/25] block: stop using blkdev_issue_write_same for zeroing
From: Martin K. Petersen @ 2017-04-05 11:45 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-10-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> We'll always use the WRITE ZEROES code for zeroing now.

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 10/25] block: add a flags argument to (__)blkdev_issue_zeroout
From: Martin K. Petersen @ 2017-04-05 11:46 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-11-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch@lst.de> writes:

> Turn the existin discard flag into a new BLKDEV_ZERO_UNMAP flag with
                  g

> similar semantics, but without referring to diѕcard.
                                                s

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>

-- 
Martin K. Petersen	Oracle Linux Engineering
_______________________________________________
drbd-dev mailing list
drbd-dev@lists.linbit.com
http://lists.linbit.com/mailman/listinfo/drbd-dev

^ permalink raw reply

* Re: [PATCH 11/25] block: add a REQ_UNMAP flag for REQ_OP_WRITE_ZEROES
From: Martin K. Petersen @ 2017-04-05 11:46 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-12-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> If this flag is set logical provisioning capable device should
> release space for the zeroed blocks if possible, if it is not set
> devices should keep the blocks anchored.
>
> Also remove an out of sync kerneldoc comment for a static function
> that would have become even more out of data with this change.

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 12/25] block: add a new BLKDEV_ZERO_NOFALLBACK flag
From: Martin K. Petersen @ 2017-04-05 11:47 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-13-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> This avoids fallbacks to explicit zeroing in (__)blkdev_issue_zeroout
> if the caller doesn't want them.
>
> Also clean up the convoluted check for the return condition that this
> new flag is added to.

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 13/25] block_dev: use blkdev_issue_zerout for hole punches
From: Martin K. Petersen @ 2017-04-05 11:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-14-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> This gets us support for non-discard efficient write of zeroes
> (e.g. NVMe) and prepare for removing the discard_zeroes_data flag.
                         s

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 14/25] sd: implement unmapping Write Zeroes
From: Martin K. Petersen @ 2017-04-05 11:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-15-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> Try to use a write same with unmap bit variant if the device supports it
> and the caller allows for it.

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 15/25] nvme: implement REQ_OP_WRITE_ZEROES
From: Martin K. Petersen @ 2017-04-05 11:49 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-16-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> But now for the real NVMe Write Zeroes yet, just to get rid of the
> discard abuse for zeroing.  Also rename the quirk flag to be a bit
> more self-explanatory.

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 22/25] block: stop using discards for zeroing
From: Martin K. Petersen @ 2017-04-05 11:55 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-23-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> Now that we have REQ_OP_WRITE_ZEROES implemented for all devices that
> support efficient zeroing of devices we can remove the call to

s/of devices/

> blkdev_issue_discard.  This means we only have two ways of zeroing
> left and can simply the code.

simplify

> + *  Note that this function may fail with -EOPNOTSUPP if the driver supports
> + *  efficient zeroing operation, but the device capabilities can only be
> + *  discovered by trial and error.  In this case the caller should call the
> + *  function again, and it will use the fallback path.

Maybe:

"Note that this function may fail with -EOPNOTSUPP if the driver signals
zeroing offload support but the device fails to process the command (for
some devices there is no non-destructive way to verify whether this
operation is actually supported). If -EOPNOTSUPP is returned, the caller
should retry the blkdev_issue_zeroout() and the fallback path will be
used."

Otherwise OK.

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: [PATCH 25/25] block: remove the discard_zeroes_data flag
From: Martin K. Petersen @ 2017-04-05 11:56 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, agk-H+wXaHxf7aLQT0dZR+AlfA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <20170331163313.31821-26-hch-jcswGhMUV9g@public.gmane.org>

Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org> writes:

> Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere we
> can kill this hack.

Oh yeah!

Reviewed-by: Martin K. Petersen <martin.petersen-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: always use REQ_OP_WRITE_ZEROES for zeroing offload
From: Christoph Hellwig @ 2017-04-05 12:08 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: axboe-tSWWG44O7X1aa/9Udqfwiw, linux-raid-u79uwXL29TY76Z2rM5mHXA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA, snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	linux-block-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A, Christoph Hellwig,
	agk-H+wXaHxf7aLQT0dZR+AlfA, drbd-dev-cunTk1MwBs8qoQakbn7OcQ
In-Reply-To: <yq1vaqjt74b.fsf-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>

On Wed, Apr 05, 2017 at 07:40:36AM -0400, Martin K. Petersen wrote:
> > This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
> > supported by the block layer, and switches existing implementations
> > of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
> > removes incorrect discard_zeroes_data, and also switches WRITE SAME
> > based zeroing in SCSI to this new method.
> 
> Very, very nice. I think this is the correct approach.
> 
> I'm going to send two follow-up patches that allow us to use UNMAP for
> discards and WRITE SAME w/ UNMAP for zeroout. That appears to be the
> preferred configuration for most storage devices.

Thanks, I'll resend it with your nitpicks fixed, your follow on
patches tacked on and the various reviewed-by tags in a bit.

^ permalink raw reply

* 4.10 + 765d704db: no improvemtn in write rates with md/raid5 group_thread_cnt > 0
From: Nix @ 2017-04-05 14:13 UTC (permalink / raw)
  To: linux-raid; +Cc: Shaohua Li

So you'd expect write rates on a RAID-5 array to be higher than write rates on a
single spinning-rust disk, right? Because, even with Shaohua's commit
765d704db1f583630d52 applied atop 4.10, I see little sign of it. Does this
commit depend upon something else to stop death by seeking with
group_thread_cnt > 0? It didn't look like it to me...

The results Shaohua showed in the original commit were very impressive, but for
the life of me I can't figure out how to get anything like them.


With group_thread_cnt 0, I max out at a bit higher than the 240MiB/s one disk in
this array can manage on its own, for obvious reasons: md_raid5 CPU saturation.
(This is with a 512KiB chunksize, stripe_cache_size of 512: yes, I know that's
small, it's just a random slice taken out of a much larger test series: the
array is a smallish non-degraded unjournalled four-element md5 initialized with
--assume-clean for benchmarking). Similar results are seen with ext4 and xfs.
Trimmed-down iozone -a output, so only one serial writer, but still:

                                                       stride
     kB  reclen    write  rewrite    read    reread      read
     64       4     6752    15647    26489    30145     26678
     64       8     6639    25236    45101    56289     43158
     64      16     6014     9799    67364    89009     60900
     64      32    35200    48781     7374   177207      7336
     64      64    32420    70551   109395   229470     97868
[...]
  32768      64    28181    30576   265403   178438    299889
  32768     128    41659    39989   319709   320689    330949
  32768     256    45402    44555   320689   357564    451256
  32768     512    42559    40556   177862   299744    466529
  32768    1024    68005    52814   415747   391507    706177
  32768    2048    91701   103918   520689   540128   1061339
  32768    4096   177716   169486   487277   514111    683463
  32768    8192   218923   233152   539853   616869    453021
  32768   16384   199068   198872   569353   619913    535240
[...]
 262144      64    25148    32423   385802   378681     27762
 262144     128    42510    41626   436994   380669     48004
 262144     256    43415    44004   436209   418971     76697
 262144     512    41408    40399   342862   401145    116781
 262144    1024    68870    59341   465737   507454    265154
 262144    2048   101994    91693   589277   582836    296474
 262144    4096   176852   166200   581922   649215    421253
 262144    8192   226696   221838   601174   633347    569766
 262144   16384   307843   297985   644679   659060    569302
 524288      64    25155    24527   392401   401908     21461
 524288     128    41422    41525   433156   464331     35360
 524288     256    42059    43742   443281   415799     70171
 524288     512    41253    39360   414306   428993     75387
 524288    1024    66081    61151   498880   517952    186959
 524288    2048   101272    90418   610467   623258    274331
 524288    4096   171489   173381   601689   576333    314290
 524288    8192   220943   215226   641713   607459    444827
 524288   16384   289055   296340   651010   671623    503633

Read rates are as high as I'd expect for a four-disk RAID-5 array, and the
sequential write output rates, while higher than one one disk can manage, are
thresholded here by the performance of the md I/O thread, as expected.

If I boost group_thread_cnt to, say, 2, I see:

     64       4     3677    14565    27936    36056     29629
     64       8     6670    21608    53422    69187     32045
     64      16     6682    26209    70329   103891     66662
     64      32    28624    40048     7312   154556      7345
     64      64    38327    43213    89127   260160     90540
[...]
  32768      64    14328    18580   265136   282946    308082
  32768     128    26310    24803   265762   323414    354685
  32768     256    29115    27073   238659   308974    345723
  32768     512    21572    21345   293312   314086    345365
  32768    1024    43978    38071   395715   345161    545821
  32768    2048    82898    70840   293151   470398    922082
  32768    4096   143350   124658   391980   659819    617984
  32768    8192   164297   227661   570423   645141    515009
  32768   16384   157701   171804   568484   451448    350715
[...]
 262144      64    17150    17693   391561   382751     28374
 262144     128    25385    26498   423685   410359     47148
 262144     256    29219    30244   392992   421748     80403
 262144     512    24303    24686   399453   371882    122861
 262144    1024    42296    42535   403020   508195    261339
 262144    2048    75740    63125   606979   589329    296124
 262144    4096   134646   137543   562749   590893    392938
 262144    8192   237800   239847   631752   620766    475791
 262144   16384   267889   304517   635674   612164    598521
 524288      64    17691    17776   403333   374628     21673
 524288     128    25575    25609   396568   439018     34526
 524288     256    29984    29990   412587   437099     71650
 524288     512    24971    25599   403074   431581     75471
 524288    1024    42545    43657   505740   519112    200811
 524288    2048    72519    75604   559987   589069    257654
 524288    4096   135122   140745   622450   499336    331273
 524288    8192   232848   231307   592729   604849    432296
 524288   16384   280105   271252   647725   664868    472363

Larger writes are clearly still thresholded.

Boost the thread count more, here, to 8:

     64       4     7834    14388    30346    40300      6124
     64       8    17236    21282     6984    37644      6842
     64      16    21100    24720     7208   120277      7199
     64      32    29411    45553     7374   162411      7357
     64      64     3671    59588    78128   256923     82804
[...]
  32768      64    14261    17866   261303   289135    294245
  32768     128    25832    27639   298172   324766    342822
  32768     256    26477    27196   277318   339353    352967
  32768     512    17848    19875   339424   272225    387746
  32768    1024    36017    38945   482068   464194    110825
  32768    2048    64240    67976   551762   505772     76629
  32768    4096    71022   117680   578561   696507    752493
  32768    8192   161080   207790   564343   556796    546488
  32768   16384   172937   233103   521368   603562    418679
[...]
 262144      64    17170    17452   352337   351258     27824
 262144     128    25318    25522   418977   424859     47112
 262144     256    26405    27092   426170   419684     79047
 262144     512    20185    20271   398733   411974    135554
 262144    1024    39013    38238   497919   438150    180384
 262144    2048    71054    70921   586634   535676    258955
 262144    4096   113222   121554   616548   604177    293088
 262144    8192   184086   187845   551395   586126    496147
 262144   16384   286319   272419   645900   659103    589384
 524288      64    16980    16756   385746   381476     21462
 524288     128    24993    25482   428855   438250     34889
 524288     256    26517    26134   448088   395352     70225
 524288     512    19534    19484   418764   416630     76975
 524288    1024    37645    38370   514030   511638    177818
 524288    2048    68469    72200   602688   542627    251162
 524288    4096   115467   121220   598738   629120    289589
 524288    8192   185093   182044   621233   586919    437162
 524288   16384   250990   266257   620428   660663    494770

Still thresholded. Yes, this is only one serial writer, but nonetheless this
seems a bit od.

-- 
NULL && (void)

^ permalink raw reply

* always use REQ_OP_WRITE_ZEROES for zeroing offload V2
From: Christoph Hellwig @ 2017-04-05 14:21 UTC (permalink / raw)
  To: axboe-tSWWG44O7X1aa/9Udqfwiw,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA,
	agk-H+wXaHxf7aLQT0dZR+AlfA, snitzer-H+wXaHxf7aLQT0dZR+AlfA,
	shli-DgEjT+Ai2ygdnm+yROfE0A,
	philipp.reisner-63ez5xqkn6DQT0dZR+AlfA,
	lars.ellenberg-63ez5xqkn6DQT0dZR+AlfA
  Cc: linux-block-u79uwXL29TY76Z2rM5mHXA,
	linux-raid-u79uwXL29TY76Z2rM5mHXA,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA,
	drbd-dev-cunTk1MwBs8qoQakbn7OcQ

This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
supported by the block layer, and switches existing implementations
of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
removes incorrect discard_zeroes_data, and also switches WRITE SAME
based zeroing in SCSI to this new method.

The series is against the block for-next tree.

A git tree is also avaiable at:

    git://git.infradead.org/users/hch/block.git discard-rework.2

Gitweb:

    http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/discard-rework.2

Changes since V2:
 - various spelling fixes
 - various reviews captured
 - two new patches from Martin at the end

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox