Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* [PATCH 18/23] rsxx: remove the discard_zeroes_data flag
From: Christoph Hellwig @ 2017-03-23 14:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170323143341.31549-1-hch@lst.de>

rsxx only supports discarding on large alignments, so the zeroing code
would always fall back to explicit writes of zeroes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/rsxx/dev.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index f81d70b39d10..9c566364ac9c 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -300,7 +300,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
 						RSXX_HW_BLK_SIZE >> 9);
 		card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;
 		card->queue->limits.discard_alignment   = RSXX_HW_BLK_SIZE;
-		card->queue->limits.discard_zeroes_data = 1;
 	}
 
 	card->queue->queuedata = card;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 19/23] mmc: remove the discard_zeroes_data flag
From: Christoph Hellwig @ 2017-03-23 14:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170323143341.31549-1-hch@lst.de>

mmc only supports discarding on large alignments, so the zeroing code
would always fall back to explicit writes of zeroes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/mmc/core/queue.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 493eb10ce580..4c54ad34e17a 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -167,8 +167,6 @@ static void mmc_queue_setup_discard(struct request_queue *q,
 
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 	blk_queue_max_discard_sectors(q, max_discard);
-	if (card->erased_byte == 0 && !mmc_can_discard(card))
-		q->limits.discard_zeroes_data = 1;
 	q->limits.discard_granularity = card->pref_erase << 9;
 	/* granularity must not be greater than max. discard */
 	if (card->pref_erase > max_discard)
-- 
2.11.0

^ permalink raw reply related

* [PATCH 20/23] block: stop using discards for zeroing
From: Christoph Hellwig @ 2017-03-23 14:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170323143341.31549-1-hch@lst.de>

Now that we have REQ_OP_WRITE_ZEROES implemented for all devices that
support efficient zeroing of devices we can remove the call to
blkdev_issue_discard.  This means we only have two ways of zeroing left
and can simplify the code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 33c5bf373b7f..153ca59393a7 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -279,6 +279,11 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
  *  Zero-fill a block range, either using hardware offload or by explicitly
  *  writing zeroes to the device.
  *
+ *  Note that this function may fail with -EOPNOTSUPP if the driver supports
+ *  efficient zeroing operation, but the device capabilities can only be
+ *  discovered by trial and error.  In this case the caller should call the
+ *  function again, and it will use the fallback path.
+ *
  *  If a device is using logical block provisioning, the underlying space will
  *  only be release if %flags contains BLKDEV_ZERO_UNMAP.
  *
@@ -349,12 +354,6 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 
-	if (flags & BLKDEV_ZERO_UNMAP) {
-		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
-				BLKDEV_DISCARD_ZERO))
-			return 0;
-	}
-
 	blk_start_plug(&plug);
 	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
 			&bio, flags);
-- 
2.11.0

^ permalink raw reply related

* [PATCH 21/23] drbd: make intelligent use of blkdev_issue_zeroout
From: Christoph Hellwig @ 2017-03-23 14:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170323143341.31549-1-hch@lst.de>

drbd always wants its discard wire operations to zero the blocks, so
use blkdev_issue_zeroout with the BLKDEV_ZERO_UNMAP flag instead of
reinventing it poorly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/drbd/drbd_debugfs.c  |  3 --
 drivers/block/drbd/drbd_int.h      |  6 ---
 drivers/block/drbd/drbd_receiver.c | 99 ++------------------------------------
 drivers/block/drbd/drbd_req.c      |  6 +--
 4 files changed, 7 insertions(+), 107 deletions(-)

diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index de5c3ee8a790..494837e59f23 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -236,9 +236,6 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
 	seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
 	seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
 	seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
-
-	if (f & EE_IS_TRIM)
-		__seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
 	seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
 	seq_putc(m, '\n');
 }
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 724d1c50fc52..d5da45bb03a6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -437,9 +437,6 @@ enum {
 
 	/* is this a TRIM aka REQ_DISCARD? */
 	__EE_IS_TRIM,
-	/* our lower level cannot handle trim,
-	 * and we want to fall back to zeroout instead */
-	__EE_IS_TRIM_USE_ZEROOUT,
 
 	/* In case a barrier failed,
 	 * we need to resubmit without the barrier flag. */
@@ -482,7 +479,6 @@ enum {
 #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
 #define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
 #define EE_IS_TRIM             (1<<__EE_IS_TRIM)
-#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
 #define EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
 #define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
 #define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
@@ -1561,8 +1557,6 @@ extern void start_resync_timer_fn(unsigned long data);
 extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
 
 /* drbd_receiver.c */
-extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
-		sector_t start, unsigned int nr_sectors, bool discard);
 extern int drbd_receiver(struct drbd_thread *thi);
 extern int drbd_ack_receiver(struct drbd_thread *thi);
 extern void drbd_send_ping_wf(struct work_struct *ws);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index aa6bf9692eff..8641776f043a 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1448,105 +1448,14 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
-/*
- * We *may* ignore the discard-zeroes-data setting, if so configured.
- *
- * Assumption is that it "discard_zeroes_data=0" is only because the backend
- * may ignore partial unaligned discards.
- *
- * LVM/DM thin as of at least
- *   LVM version:     2.02.115(2)-RHEL7 (2015-01-28)
- *   Library version: 1.02.93-RHEL7 (2015-01-28)
- *   Driver version:  4.29.0
- * still behaves this way.
- *
- * For unaligned (wrt. alignment and granularity) or too small discards,
- * we zero-out the initial (and/or) trailing unaligned partial chunks,
- * but discard all the aligned full chunks.
- *
- * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
- */
-int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
-{
-	struct block_device *bdev = device->ldev->backing_bdev;
-	struct request_queue *q = bdev_get_queue(bdev);
-	sector_t tmp, nr;
-	unsigned int max_discard_sectors, granularity;
-	int alignment;
-	int err = 0;
-
-	if (!discard)
-		goto zero_out;
-
-	/* Zero-sector (unknown) and one-sector granularities are the same.  */
-	granularity = max(q->limits.discard_granularity >> 9, 1U);
-	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
-
-	max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
-	max_discard_sectors -= max_discard_sectors % granularity;
-	if (unlikely(!max_discard_sectors))
-		goto zero_out;
-
-	if (nr_sectors < granularity)
-		goto zero_out;
-
-	tmp = start;
-	if (sector_div(tmp, granularity) != alignment) {
-		if (nr_sectors < 2*granularity)
-			goto zero_out;
-		/* start + gran - (start + gran - align) % gran */
-		tmp = start + granularity - alignment;
-		tmp = start + granularity - sector_div(tmp, granularity);
-
-		nr = tmp - start;
-		err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
-		nr_sectors -= nr;
-		start = tmp;
-	}
-	while (nr_sectors >= granularity) {
-		nr = min_t(sector_t, nr_sectors, max_discard_sectors);
-		err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
-		nr_sectors -= nr;
-		start += nr;
-	}
- zero_out:
-	if (nr_sectors) {
-		err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
-	}
-	return err != 0;
-}
-
-static bool can_do_reliable_discards(struct drbd_device *device)
-{
-	struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
-	struct disk_conf *dc;
-	bool can_do;
-
-	if (!blk_queue_discard(q))
-		return false;
-
-	if (q->limits.discard_zeroes_data)
-		return true;
-
-	rcu_read_lock();
-	dc = rcu_dereference(device->ldev->disk_conf);
-	can_do = dc->discard_zeroes_if_aligned;
-	rcu_read_unlock();
-	return can_do;
-}
-
 static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
 {
-	/* If the backend cannot discard, or does not guarantee
-	 * read-back zeroes in discarded ranges, we fall back to
-	 * zero-out.  Unless configuration specifically requested
-	 * otherwise. */
-	if (!can_do_reliable_discards(device))
-		peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+	struct block_device *bdev = device->ldev->backing_bdev;
 
-	if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
-	    peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
+	if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9,
+			GFP_NOIO, BLKDEV_ZERO_UNMAP))
 		peer_req->flags |= EE_WAS_ERROR;
+
 	drbd_endio_write_sec_final(peer_req);
 }
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 652114ae1a8a..bdd42b8360cc 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1148,10 +1148,10 @@ static int drbd_process_write_request(struct drbd_request *req)
 
 static void drbd_process_discard_req(struct drbd_request *req)
 {
-	int err = drbd_issue_discard_or_zero_out(req->device,
-				req->i.sector, req->i.size >> 9, true);
+	struct block_device *bdev = req->device->ldev->backing_bdev;
 
-	if (err)
+	if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9,
+			GFP_NOIO, BLKDEV_ZERO_UNMAP))
 		req->private_bio->bi_error = -EIO;
 	bio_endio(req->private_bio);
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH 22/23] drbd: implement REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-23 14:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170323143341.31549-1-hch@lst.de>

It seems like DRBD assumes its on-the-wire TRIM request always zeroes data.
Use that fact to implement REQ_OP_WRITE_ZEROES.

XXX: will need a careful audit from the drbd team!

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/drbd/drbd_main.c     | 3 ++-
 drivers/block/drbd/drbd_nl.c       | 2 ++
 drivers/block/drbd/drbd_receiver.c | 6 +++---
 drivers/block/drbd/drbd_req.c      | 7 +++++--
 drivers/block/drbd/drbd_worker.c   | 4 +++-
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 92c60cbd04ee..8e62d9f65510 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1668,7 +1668,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
 			(bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
 			(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
 			(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
-			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
+			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
+			(bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0);
 	else
 		return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 908c704e20aa..e4516d3b971d 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1217,10 +1217,12 @@ static void decide_on_discard_support(struct drbd_device *device,
 		blk_queue_discard_granularity(q, 512);
 		q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+		q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection);
 	} else {
 		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
 		blk_queue_discard_granularity(q, 0);
 		q->limits.max_discard_sectors = 0;
+		q->limits.max_write_zeroes_sectors = 0;
 	}
 }
 
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 8641776f043a..e64e01ca4d1a 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -2285,7 +2285,7 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf)
 static unsigned long wire_flags_to_bio_op(u32 dpf)
 {
 	if (dpf & DP_DISCARD)
-		return REQ_OP_DISCARD;
+		return REQ_OP_WRITE_ZEROES;
 	else
 		return REQ_OP_WRITE;
 }
@@ -2476,7 +2476,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	op_flags = wire_flags_to_bio_flags(dp_flags);
 	if (pi->cmd == P_TRIM) {
 		D_ASSERT(peer_device, peer_req->i.size > 0);
-		D_ASSERT(peer_device, op == REQ_OP_DISCARD);
+		D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
 		D_ASSERT(peer_device, peer_req->pages == NULL);
 	} else if (peer_req->pages == NULL) {
 		D_ASSERT(device, peer_req->i.size == 0);
@@ -4789,7 +4789,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
 
 	if (get_ldev(device)) {
 		struct drbd_peer_request *peer_req;
-		const int op = REQ_OP_DISCARD;
+		const int op = REQ_OP_WRITE_ZEROES;
 
 		peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
 					       size, 0, GFP_NOIO);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index bdd42b8360cc..d76ee53ce528 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,6 +59,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
 	drbd_req_make_private_bio(req, bio_src);
 	req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
 		      | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
+		      | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_UNMAP : 0)
 		      | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
 	req->device = device;
 	req->master_bio = bio_src;
@@ -1180,7 +1181,8 @@ drbd_submit_req_private_bio(struct drbd_request *req)
 	if (get_ldev(device)) {
 		if (drbd_insert_fault(device, type))
 			bio_io_error(bio);
-		else if (bio_op(bio) == REQ_OP_DISCARD)
+		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+			 bio_op(bio) == REQ_OP_DISCARD)
 			drbd_process_discard_req(req);
 		else
 			generic_make_request(bio);
@@ -1234,7 +1236,8 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
 	_drbd_start_io_acct(device, req);
 
 	/* process discards always from our submitter thread */
-	if (bio_op(bio) & REQ_OP_DISCARD)
+	if ((bio_op(bio) & REQ_OP_WRITE_ZEROES) ||
+	    (bio_op(bio) & REQ_OP_DISCARD))
 		goto queue_for_submitter_thread;
 
 	if (rw == WRITE && req->private_bio && req->i.size
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3bff33f21435..1afcb4e02d8d 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -174,7 +174,8 @@ void drbd_peer_request_endio(struct bio *bio)
 	struct drbd_peer_request *peer_req = bio->bi_private;
 	struct drbd_device *device = peer_req->peer_device->device;
 	bool is_write = bio_data_dir(bio) == WRITE;
-	bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
+	bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+			  bio_op(bio) == REQ_OP_DISCARD;
 
 	if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
 		drbd_warn(device, "%s: error=%d s=%llus\n",
@@ -249,6 +250,7 @@ void drbd_request_endio(struct bio *bio)
 	/* to avoid recursion in __req_mod */
 	if (unlikely(bio->bi_error)) {
 		switch (bio_op(bio)) {
+		case REQ_OP_WRITE_ZEROES:
 		case REQ_OP_DISCARD:
 			if (bio->bi_error == -EOPNOTSUPP)
 				what = DISCARD_COMPLETED_NOTSUPP;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 23/23] block: remove the discard_zeroes_data flag
From: Christoph Hellwig @ 2017-03-23 14:33 UTC (permalink / raw)
  To: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	lars.ellenberg
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170323143341.31549-1-hch@lst.de>

Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere we can
kill this hack.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 Documentation/ABI/testing/sysfs-block | 10 ++--------
 Documentation/block/queue-sysfs.txt   |  5 -----
 block/blk-lib.c                       |  7 +------
 block/blk-settings.c                  |  3 ---
 block/blk-sysfs.c                     |  2 +-
 block/compat_ioctl.c                  |  2 +-
 block/ioctl.c                         |  2 +-
 drivers/ata/libata-scsi.c             |  2 +-
 drivers/block/drbd/drbd_main.c        |  2 --
 drivers/block/drbd/drbd_nl.c          |  7 +------
 drivers/block/loop.c                  |  2 --
 drivers/block/mtip32xx/mtip32xx.c     |  1 -
 drivers/block/nbd.c                   |  1 -
 drivers/md/dm-cache-target.c          |  1 -
 drivers/md/dm-crypt.c                 |  1 -
 drivers/md/dm-raid.c                  |  6 +++---
 drivers/md/dm-raid1.c                 |  1 -
 drivers/md/dm-table.c                 | 19 -------------------
 drivers/md/dm-thin.c                  |  2 --
 drivers/md/raid5.c                    | 12 +-----------
 drivers/scsi/sd.c                     |  7 -------
 drivers/target/target_core_device.c   |  2 +-
 include/linux/blkdev.h                | 15 ---------------
 include/linux/device-mapper.h         |  5 -----
 24 files changed, 13 insertions(+), 104 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 2da04ce6aeef..dea212db9df3 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -213,14 +213,8 @@ What:		/sys/block/<disk>/queue/discard_zeroes_data
 Date:		May 2011
 Contact:	Martin K. Petersen <martin.petersen@oracle.com>
 Description:
-		Devices that support discard functionality may return
-		stale or random data when a previously discarded block
-		is read back. This can cause problems if the filesystem
-		expects discarded blocks to be explicitly cleared. If a
-		device reports that it deterministically returns zeroes
-		when a discarded area is read the discard_zeroes_data
-		parameter will be set to one. Otherwise it will be 0 and
-		the result of reading a discarded area is undefined.
+		Will always return 0.  Don't rely on any specific behavior
+		for discards, and don't read this file.
 
 What:		/sys/block/<disk>/queue/write_same_max_bytes
 Date:		January 2012
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index c0a3bb5a6e4e..009150ed7db8 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue
 smaller discards and potentially help reduce latencies induced by large
 discard operations.
 
-discard_zeroes_data (RO)
-------------------------
-When read, this file will show if the discarded block are zeroed by the
-device or not. If its value is '1' the blocks are zeroed otherwise not.
-
 hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 153ca59393a7..be44d2725ede 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		return -ENXIO;
 
 	if (flags & BLKDEV_DISCARD_SECURE) {
-		if (flags & BLKDEV_DISCARD_ZERO)
-			return -EOPNOTSUPP;
 		if (!blk_queue_secure_erase(q))
 			return -EOPNOTSUPP;
 		op = REQ_OP_SECURE_ERASE;
 	} else {
 		if (!blk_queue_discard(q))
 			return -EOPNOTSUPP;
-		if ((flags & BLKDEV_DISCARD_ZERO) &&
-		    !q->limits.discard_zeroes_data)
-			return -EOPNOTSUPP;
 		op = REQ_OP_DISCARD;
 	}
 
@@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			&bio);
 	if (!ret && bio) {
 		ret = submit_bio_wait(bio);
-		if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
+		if (ret == -EOPNOTSUPP)
 			ret = 0;
 		bio_put(bio);
 	}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9d515ae3a405..fe2794986eb9 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -104,7 +104,6 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
-	lim->discard_zeroes_data = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -128,7 +127,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	blk_set_default_limits(lim);
 
 	/* Inherit limits from component devices */
-	lim->discard_zeroes_data = 1;
 	lim->max_segments = USHRT_MAX;
 	lim->max_discard_segments = 1;
 	lim->max_hw_sectors = UINT_MAX;
@@ -623,7 +621,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
 
 	t->cluster &= b->cluster;
-	t->discard_zeroes_data &= b->discard_zeroes_data;
 
 	/* Physical block size a multiple of the logical block size? */
 	if (t->physical_block_size & (t->logical_block_size - 1)) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c44b321335f3..ca88a533b735 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q,
 
 static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(queue_discard_zeroes_data(q), page);
+	return 0;
 }
 
 static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 570021a0dc1c..04325b81c2b4 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKALIGNOFF:
 		return compat_put_int(arg, bdev_alignment_offset(bdev));
 	case BLKDISCARDZEROES:
-		return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
+		return compat_put_uint(arg, 0);
 	case BLKFLSBUF:
 	case BLKROSET:
 	case BLKDISCARD:
diff --git a/block/ioctl.c b/block/ioctl.c
index 304685022d9f..1a1586f3baa1 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -546,7 +546,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKALIGNOFF:
 		return put_int(arg, bdev_alignment_offset(bdev));
 	case BLKDISCARDZEROES:
-		return put_uint(arg, bdev_discard_zeroes_data(bdev));
+		return put_uint(arg, 0);
 	case BLKSECTGET:
 		max_sectors = min_t(unsigned int, USHRT_MAX,
 				    queue_max_sectors(bdev_get_queue(bdev)));
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 965b9e7dbb7d..6fb870cac932 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1327,7 +1327,7 @@ static int ata_scsi_dev_config(struct scsi_device *sdev,
 		sdev->ata_trim = 1;
 		if (ata_id_has_zero_after_trim(dev->id) &&
 		    (dev->horkage & ATA_HORKAGE_ZERO_AFTER_TRIM)) {
-			ata_dev_info(dev, "Enabling discard_zeroes_data\n");
+			ata_dev_info(dev, "Enabling zeroing using TRIM\n");
 			sdev->ata_trim_zeroes_data = 1;
 		}
 	}
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8e62d9f65510..84455c365f57 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -931,7 +931,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
 		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 		p->qlim->discard_enabled = blk_queue_discard(q);
-		p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
 		p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
 	} else {
 		q = device->rq_queue;
@@ -941,7 +940,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
 		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 		p->qlim->discard_enabled = 0;
-		p->qlim->discard_zeroes_data = 0;
 		p->qlim->write_same_capable = 0;
 	}
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e4516d3b971d..02255a0d68b9 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1199,10 +1199,6 @@ static void decide_on_discard_support(struct drbd_device *device,
 	struct drbd_connection *connection = first_peer_device(device)->connection;
 	bool can_do = b ? blk_queue_discard(b) : true;
 
-	if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
-		can_do = false;
-		drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
-	}
 	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
 		can_do = false;
 		drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
@@ -1484,8 +1480,7 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis
 	if (disk_conf->al_extents > drbd_al_extents_max(nbc))
 		disk_conf->al_extents = drbd_al_extents_max(nbc);
 
-	if (!blk_queue_discard(q)
-	    || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
+	if (!blk_queue_discard(q)) {
 		if (disk_conf->rs_discard_granularity) {
 			disk_conf->rs_discard_granularity = 0; /* disable feature */
 			drbd_info(device, "rs_discard_granularity feature disabled\n");
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 265cd2e33ff0..57f68f4ee886 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -828,7 +828,6 @@ static void loop_config_discard(struct loop_device *lo)
 		q->limits.discard_alignment = 0;
 		blk_queue_max_discard_sectors(q, 0);
 		blk_queue_max_write_zeroes_sectors(q, 0);
-		q->limits.discard_zeroes_data = 0;
 		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
 		return;
 	}
@@ -837,7 +836,6 @@ static void loop_config_discard(struct loop_device *lo)
 	q->limits.discard_alignment = 0;
 	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
 	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
-	q->limits.discard_zeroes_data = 1;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index f96ab717534c..7efac4ee26b2 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4025,7 +4025,6 @@ static int mtip_block_initialize(struct driver_data *dd)
 		dd->queue->limits.discard_granularity = 4096;
 		blk_queue_max_discard_sectors(dd->queue,
 			MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
-		dd->queue->limits.discard_zeroes_data = 0;
 	}
 
 	/* Set the capacity of the device in 512 byte sectors. */
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7e4287bc19e5..616e5c6d3ebd 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1040,7 +1040,6 @@ static int nbd_dev_add(int index)
 	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
 	disk->queue->limits.discard_granularity = 512;
 	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
-	disk->queue->limits.discard_zeroes_data = 0;
 	blk_queue_max_hw_sectors(disk->queue, 65536);
 	disk->queue->limits.max_sectors = 256;
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9c689b34e6e7..975922c8f231 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
-	ti->discard_zeroes_data_unsupported = true;
 	ti->split_discard_bios = false;
 
 	cache->features = ca->features;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 389a3637ffcc..ef1d836bd81b 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	wake_up_process(cc->write_thread);
 
 	ti->num_flush_bios = 1;
-	ti->discard_zeroes_data_unsupported = true;
 
 	return 0;
 
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f8564d63982f..468f1380de1d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs)
 	/* Assume discards not supported until after checks below. */
 	ti->discards_supported = false;
 
-	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+	/*
+	 * XXX: RAID level 4,5,6 require zeroing for safety.
+	 */
 	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
 
 	for (i = 0; i < rs->raid_disks; i++) {
@@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs)
 			return;
 
 		if (raid456) {
-			if (!q->limits.discard_zeroes_data)
-				return;
 			if (!devices_handle_discard_safely) {
 				DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
 				DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 2ddc2d20e62d..a95cbb80fb34 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
-	ti->discard_zeroes_data_unsupported = true;
 
 	ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
 	if (!ms->kmirrord_wq) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5cd665c91ead..958275aca008 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
 	return false;
 }
 
-static bool dm_table_discard_zeroes_data(struct dm_table *t)
-{
-	struct dm_target *ti;
-	unsigned i = 0;
-
-	/* Ensure that all targets supports discard_zeroes_data. */
-	while (i < dm_table_get_num_targets(t)) {
-		ti = dm_table_get_target(t, i++);
-
-		if (ti->discard_zeroes_data_unsupported)
-			return false;
-	}
-
-	return true;
-}
-
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
 			    sector_t start, sector_t len, void *data)
 {
@@ -1620,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	}
 	blk_queue_write_cache(q, wc, fua);
 
-	if (!dm_table_discard_zeroes_data(t))
-		q->limits.discard_zeroes_data = 0;
-
 	/* Ensure that all underlying devices are non-rotational. */
 	if (dm_table_all_devices_attribute(t, device_is_nonrot))
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2b266a2b5035..a5f1916f621a 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 * them down to the data device.  The thin device's discard
 	 * processing will cause mappings to be removed from the btree.
 	 */
-	ti->discard_zeroes_data_unsupported = true;
 	if (pf.discard_enabled && pf.discard_passdown) {
 		ti->num_discard_bios = 1;
 
@@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
 
 	/* In case the pool supports discards, pass them on. */
-	ti->discard_zeroes_data_unsupported = true;
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8cf1f86dcd05..a4b04d0dc829 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7265,12 +7265,6 @@ static int raid5_run(struct mddev *mddev)
 		blk_queue_max_discard_sectors(mddev->queue,
 					      0xfffe * STRIPE_SECTORS);
 
-		/*
-		 * unaligned part of discard request will be ignored, so can't
-		 * guarantee discard_zeroes_data
-		 */
-		mddev->queue->limits.discard_zeroes_data = 0;
-
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 
@@ -7280,7 +7274,7 @@ static int raid5_run(struct mddev *mddev)
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->new_data_offset << 9);
 			/*
-			 * discard_zeroes_data is required, otherwise data
+			 * zeroing is required, otherwise data
 			 * could be lost. Consider a scenario: discard a stripe
 			 * (the stripe could be inconsistent if
 			 * discard_zeroes_data is 0); write one disk of the
@@ -7289,10 +7283,6 @@ static int raid5_run(struct mddev *mddev)
 			 * parity); the disk is broken; The stripe data of this
 			 * disk is lost.
 			 */
-			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
-			    !bdev_get_queue(rdev->bdev)->
-						limits.discard_zeroes_data)
-				discard_supported = false;
 			/* Unfortunately, discard_zeroes_data is not currently
 			 * a guarantee - just a hint.  So we only allow DISCARD
 			 * if the sysadmin has confirmed that only safe devices
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index ca96bb33471b..65ed02438565 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -645,8 +645,6 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	unsigned int logical_block_size = sdkp->device->sector_size;
 	unsigned int max_blocks = 0, max_ranges = 0, max_range_size = 0;
 
-	q->limits.discard_zeroes_data = 0;
-
 	/*
 	 * When LBPRZ is reported, discard alignment and granularity
 	 * must be fixed to the logical block size. Otherwise the block
@@ -682,27 +680,22 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	case SD_LBP_WS16:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS16_BLOCKS);
-		q->limits.discard_zeroes_data = sdkp->lbprz;
 		break;
 
 	case SD_LBP_WS10:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS10_BLOCKS);
-		q->limits.discard_zeroes_data = sdkp->lbprz;
 		break;
 
 	case SD_LBP_ZERO:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS10_BLOCKS);
-		q->limits.discard_zeroes_data = 1;
 		break;
 
 	case SD_LBP_ATA_TRIM:
 		max_ranges = 512 / sizeof(__le64);
 		max_range_size = USHRT_MAX;
 		max_blocks = max_ranges * max_range_size;
-		if (sdkp->device->ata_trim_zeroes_data)
-			q->limits.discard_zeroes_data = 1;
 		break;
 	}
 
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index c754ae33bf7b..d2f089cfa9ae 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -851,7 +851,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
 	attrib->unmap_granularity = q->limits.discard_granularity / block_size;
 	attrib->unmap_granularity_alignment = q->limits.discard_alignment /
 								block_size;
-	attrib->unmap_zeroes_data = q->limits.discard_zeroes_data;
+	attrib->unmap_zeroes_data = 0;
 	return true;
 }
 EXPORT_SYMBOL(target_configure_unmap_from_queue);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1bddcfc631a4..aa1ac2c0f393 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -338,7 +338,6 @@ struct queue_limits {
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		cluster;
-	unsigned char		discard_zeroes_data;
 	unsigned char		raid_partial_stripes_expensive;
 	enum blk_zoned_model	zoned;
 };
@@ -1338,7 +1337,6 @@ extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 
 #define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
-#define BLKDEV_DISCARD_ZERO	(1 << 1)	/* must reliably zero data */
 
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
@@ -1543,19 +1541,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
 	return q->limits.discard_alignment;
 }
 
-static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
-{
-	if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
-		return 1;
-
-	return 0;
-}
-
-static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
-{
-	return queue_discard_zeroes_data(bdev_get_queue(bdev));
-}
-
 static inline unsigned int bdev_write_same(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 3829bee2302a..c7ea33e38fb9 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -296,11 +296,6 @@ struct dm_target {
 	 * on max_io_len boundary.
 	 */
 	bool split_discard_bios:1;
-
-	/*
-	 * Set if this target does not return zeroes on discarded blocks.
-	 */
-	bool discard_zeroes_data_unsupported:1;
 };
 
 /* Each target can link one of these into the table */
-- 
2.11.0

^ permalink raw reply related

* Re: support ranges TRIM for libata
From: James Bottomley @ 2017-03-23 14:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Tejun Heo, martin.petersen, axboe, linux-ide, linux-scsi,
	linux-block
In-Reply-To: <20170323135521.GA30361@lst.de>

On Thu, 2017-03-23 at 14:55 +0100, Christoph Hellwig wrote:
> On Thu, Mar 23, 2017 at 09:47:41AM -0400, James Bottomley wrote:
> > > The current implementation already has the issue that it
> > > reliably corrupts user data when using SG_IO for WRITE SAME
> > > commands.
> > 
> > That does need fixing.
> 
> I don't think it's fixable as long as we translate the data payload.
> 
> > Why can't we do what the t10 sat document recommends: if the ATA 
> > device doesn't support the XL version (32 bit ranges) then 
> > translate unmap to multiple non-XL commands?
> 
> Because libata sits underneath the tag allocator we'd be getting into
> a giant set of problems.  Where do you expect the new commands to
> magically come from?  And adhere to the queueing limits and not 
> actually deadlock in one way or another?

I'm certainly not saying we blindly follow t10, but I believe their
intent is to issue the next command from the completion of the first
(we can do this using qc->complete_fn, like atapi_request_sense).  That
way we don't get any tag problems because there's only one command
outstanding at once; reusing the qc means no allocation issues either.

The t10 approach does mean the SG_IO problem is actually fixable rather
than simply erroring out.

> > I don't necessarily object to the vendor specific 1<->1 approach, 
> > it's just it won't fix the problem you cited above (SG_IO WRITE 
> > SAME), its just that now we error the command, which may cause some
> > surprise.
> 
> We now error WRITE SAME for passthrough consistently.  Before we
> accepted it only with the unmap bit set, and even did so incorrectly
> (not checking that the payload was all zeros).

If it never worked for anyone, I'm OK with changing it to error out.

> > I also wonder if we couldn't simply do an ATA_16 TRIM if we're
> > already going to all the trouble of recognising ATA devices in the 
> > sd discard path?
> 
> We probably could do that as well.  But that would drag a lot more
> ATA-specific code into sd.c than just formatting the ranges.

That's up to you ... from the point of view of code documenting itself,
forming the ATA_16 TRIM in sd and not doing any satl transformation is
easier for others to follow, but if it's going to cause more code, I'm
only marginal on the advantages of easier to follow code.

James

^ permalink raw reply

* Re: support ranges TRIM for libata
From: Christoph Hellwig @ 2017-03-23 14:43 UTC (permalink / raw)
  To: James Bottomley
  Cc: Christoph Hellwig, Tejun Heo, martin.petersen, axboe, linux-ide,
	linux-scsi, linux-block
In-Reply-To: <1490279706.2202.17.camel@HansenPartnership.com>

On Thu, Mar 23, 2017 at 10:35:06AM -0400, James Bottomley wrote:
> I'm certainly not saying we blindly follow t10, but I believe their
> intent is to issue the next command from the completion of the first
> (we can do this using qc->complete_fn, like atapi_request_sense).  That
> way we don't get any tag problems because there's only one command
> outstanding at once; reusing the qc means no allocation issues either.
> 
> The t10 approach does mean the SG_IO problem is actually fixable rather
> than simply erroring out.

It would be sort of fixable, but with a lot of hackery.

> That's up to you ... from the point of view of code documenting itself,
> forming the ATA_16 TRIM in sd and not doing any satl transformation is
> easier for others to follow, but if it's going to cause more code, I'm
> only marginal on the advantages of easier to follow code.

I tried this earlier before giving up on it because it looked too ugly.
But I can complete that version of it and post it for people to compare.

^ permalink raw reply

* Re: [PATCH 06/23] dm-kcopyd: switch to use REQ_OP_WRITE_ZEROES
From: Mike Snitzer @ 2017-03-23 14:55 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, martin.petersen, agk, shli, philipp.reisner,
	lars.ellenberg, linux-block, linux-scsi, drbd-dev, dm-devel,
	linux-raid
In-Reply-To: <20170323143341.31549-7-hch@lst.de>

On Thu, Mar 23 2017 at 10:33am -0400,
Christoph Hellwig <hch@lst.de> wrote:

> It seems like the code currently passes whatever it was using for writes
> to WRITE SAME.  Just switch it to WRITE ZEROES, although that doesn't
> need any payload.
> 
> Untested, and confused by the code, maybe someone who understands it
> better than me can help..
> 
> Not-yet-signed-off-by: Christoph Hellwig <hch@lst.de>

See commit 70d6c400a ("dm kcopyd: add WRITE SAME support to dm_kcopyd_zero")
drivers/md/dm-io.c:do_region() adjusts the WRITE SAME payload to be a
single page.

So you'd want to tweak dm-io.c accordingly for WRITE ZEROES (presumably
no payload?)

Mike

> ---
>  drivers/md/dm-kcopyd.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
> index 9e9d04cb7d51..f85846741d50 100644
> --- a/drivers/md/dm-kcopyd.c
> +++ b/drivers/md/dm-kcopyd.c
> @@ -733,11 +733,11 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
>  		job->pages = &zero_page_list;
>  
>  		/*
> -		 * Use WRITE SAME to optimize zeroing if all dests support it.
> +		 * Use WRITE ZEROES to optimize zeroing if all dests support it.
>  		 */
> -		job->rw = REQ_OP_WRITE_SAME;
> +		job->rw = REQ_OP_WRITE_ZEROES;
>  		for (i = 0; i < job->num_dests; i++)
> -			if (!bdev_write_same(job->dests[i].bdev)) {
> +			if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
>  				job->rw = WRITE;
>  				break;
>  			}
> -- 
> 2.11.0
> 

^ permalink raw reply

* Re: [PATCH 06/23] dm-kcopyd: switch to use REQ_OP_WRITE_ZEROES
From: Christoph Hellwig @ 2017-03-23 14:56 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Christoph Hellwig, axboe, martin.petersen, agk, shli,
	philipp.reisner, lars.ellenberg, linux-block, linux-scsi,
	drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170323145522.GB17127@redhat.com>

On Thu, Mar 23, 2017 at 10:55:22AM -0400, Mike Snitzer wrote:
> See commit 70d6c400a ("dm kcopyd: add WRITE SAME support to dm_kcopyd_zero")
> drivers/md/dm-io.c:do_region() adjusts the WRITE SAME payload to be a
> single page.
> 
> So you'd want to tweak dm-io.c accordingly for WRITE ZEROES (presumably
> no payload?)

Yeah, I'll look into it.  Any good test case for verifying this code
while I've got your attention?

^ permalink raw reply

* Re: support ranges TRIM for libata
From: Tejun Heo @ 2017-03-23 15:04 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: James Bottomley, martin.petersen, axboe, linux-ide, linux-scsi,
	linux-block
In-Reply-To: <20170323144330.GA31447@lst.de>

Hello, Christoph.

On Thu, Mar 23, 2017 at 03:43:30PM +0100, Christoph Hellwig wrote:
> > That's up to you ... from the point of view of code documenting itself,
> > forming the ATA_16 TRIM in sd and not doing any satl transformation is
> > easier for others to follow, but if it's going to cause more code, I'm
> > only marginal on the advantages of easier to follow code.
> 
> I tried this earlier before giving up on it because it looked too ugly.
> But I can complete that version of it and post it for people to compare.

I kinda like the idea of sticking with satl as that's how libata has
been doing most things even if the implementation is uglier.  It'd be
great to find out whether the ugliness would be acceptable or too
much.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH 06/23] dm-kcopyd: switch to use REQ_OP_WRITE_ZEROES
From: Mike Snitzer @ 2017-03-23 15:10 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, martin.petersen, agk, shli, philipp.reisner,
	lars.ellenberg, linux-block, linux-scsi, drbd-dev, dm-devel,
	linux-raid
In-Reply-To: <20170323145651.GA31771@lst.de>

On Thu, Mar 23 2017 at 10:56am -0400,
Christoph Hellwig <hch@lst.de> wrote:

> On Thu, Mar 23, 2017 at 10:55:22AM -0400, Mike Snitzer wrote:
> > See commit 70d6c400a ("dm kcopyd: add WRITE SAME support to dm_kcopyd_zero")
> > drivers/md/dm-io.c:do_region() adjusts the WRITE SAME payload to be a
> > single page.
> > 
> > So you'd want to tweak dm-io.c accordingly for WRITE ZEROES (presumably
> > no payload?)
> 
> Yeah, I'll look into it.

Ah, your previous 05/23 patch re-uses the discard branch in
dm-io.c:do_region.  Looks correct.  So you should be good.

Not sure why you've split out the dm-kcopyd patch, likely best to just
fold it into the previous dm support patch.

I'll try your code out on my testbed, probably tomorrow, but in general
the DM code looks solid (thanks for doing it!).  But I'll take a closer
look and will report back with my Reviewed-by if all looks (and tests
out) good.

> Any good test case for verifying this code while I've got your
> attention?

DM thinp is the only consumer of dm_kcopyd_zero().  DM thinp
conservatively defaults to zeroing (but we recommend
'skip_block_zeroing' for most setups).  So anyway, just using a DM thin
device with partial thin blocksize IO (without 'skip_block_zeroing')
should get you coverage.

The device-mapper-test-suite thin tests have thinp w/ zeroing coverage
so I'll be sure to run those.

^ permalink raw reply

* Re: support ranges TRIM for libata
From: Christoph Hellwig @ 2017-03-23 15:27 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Christoph Hellwig, James Bottomley, martin.petersen, axboe,
	linux-ide, linux-scsi, linux-block
In-Reply-To: <20170323150458.GA3241@htj.duckdns.org>

On Thu, Mar 23, 2017 at 11:04:58AM -0400, Tejun Heo wrote:
> I kinda like the idea of sticking with satl as that's how libata has
> been doing most things even if the implementation is uglier.  It'd be
> great to find out whether the ugliness would be acceptable or too
> much.

The SATL way is simply broken.  I'll just leave the code corrupting
user data and 5 times slower than it could be then.

^ permalink raw reply

* Re: support ranges TRIM for libata
From: Christoph Hellwig @ 2017-03-23 15:30 UTC (permalink / raw)
  To: James Bottomley
  Cc: Christoph Hellwig, Tejun Heo, martin.petersen, axboe, linux-ide,
	linux-scsi, linux-block
In-Reply-To: <20170323144330.GA31447@lst.de>

On Thu, Mar 23, 2017 at 03:43:30PM +0100, Christoph Hellwig wrote:
> I tried this earlier before giving up on it because it looked too ugly.
> But I can complete that version of it and post it for people to compare.

Meh, I remember why I gave up on it - to support queued trim passthrough
we'd need to implement ATA 32 for the auxiliary fields and thus support
32-bit CDBs.  I don't really want to go there..

^ permalink raw reply

* Re: [PATCH 02/11] block: Fix race of bdev open with gendisk shutdown
From: Tejun Heo @ 2017-03-23 15:38 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Dan Williams,
	Thiago Jung Bauermann, Tahsin Erdogan, Omar Sandoval
In-Reply-To: <20170323002105.GB18402@quack2.suse.cz>

Hello, Jan.

On Thu, Mar 23, 2017 at 01:21:05AM +0100, Jan Kara wrote:
> > * A bdi is the object which knows which request_queue it's associated
> >   and when that association dies.
> 
> So bdi is associated with a request_queue but it does not have any
> direct pointer or reference to it. It is the request_queue that points to
> bdi. However you are right that bdi_unregister() call is telling bdi that
> the device is going away.
> 
> > * bdev is permanently associated with a bdi.
> 
> Yes.
> 
> > So, it's kinda weird to look up the request_queue again on each open
> > and then verify that the bd_bdi and request_queue match.  I think it
> > would make more sense to skip disk look up and just go through bdi to
> > determine the associated request_queue as that's the primary nexus
> > object tying everything up now.
> 
> So I was thinking about this as well. The problem is that you cannot really
> keep a reference to request_queue (or gendisk) from block device inode or
> bdi as that could unexpectedly pin the driver in memory after user thinks
> everything should be cleaned up. So I think we really do have to establish
> the connection between block device inode and request_queue / gendisk on
> opening of the block device and drop the reference when the block device is
> closed.

Hmmm... but for bdi to be able to serve as the sever point, it must
know when it gets disassociated (del_gendisk) and be able to safely
clear its pointer and reject further access to the request_queue.  I
think we're trying to shift that synchronization to the open path but
no matter where we do that we have to do that anyway and it'll likely
be more straightforward to do it at the sever point.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: support ranges TRIM for libata
From: Martin K. Petersen @ 2017-03-23 15:39 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: James Bottomley, Tejun Heo, martin.petersen, axboe, linux-ide,
	linux-scsi, linux-block
In-Reply-To: <20170323153001.GA32484@lst.de>

Christoph Hellwig <hch@lst.de> writes:

> Meh, I remember why I gave up on it - to support queued trim passthrough
> we'd need to implement ATA 32 for the auxiliary fields and thus support
> 32-bit CDBs.  I don't really want to go there..

I wish we could stick with SATL. However, I have attempted this a few
times over the years and I am with Christoph here. Due to the
discrepancies between ACS and SBC it's a twisted mess. And the iterative
approach has not worked well for HBA SATLs either. Quite the contrary.

So I think sticking the DSM TRIM payload onto a SCSI command is the path
of least resistance. And then NAK'ing attempts to issue these commands
through sg.

I'll take a closer look at the entire series tomorrow or Monday. I want to
test multiple ranges on my "Little Shop of SSD Horrors" when I get back
home from LSF/MM.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply

* Re: RFC: always use REQ_OP_WRITE_ZEROES for zeroing offload
From: Lars Ellenberg @ 2017-03-23 15:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, martin.petersen, agk, snitzer, shli, philipp.reisner,
	linux-block, linux-scsi, drbd-dev, dm-devel, linux-raid
In-Reply-To: <20170323143341.31549-1-hch@lst.de>

On Thu, Mar 23, 2017 at 10:33:18AM -0400, Christoph Hellwig wrote:
> This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
> supported by the block layer, and switches existing implementations
> of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
> removes incorrect discard_zeroes_data, and also switches WRITE SAME
> based zeroing in SCSI to this new method.
> 
> I've done testing with ATA, SCSI and NVMe setups, but there are
> a few things that will need more attention:
> 

>  - The DRBD code in this area was very odd,

DRBD wants all replicas to give back identical data.
If what comes back after a discard is "undefined",
we cannot really use that.

We used to "stack" discard only if our local backend claimed
"discard_zeroes_data". We replicate that IO request to the peer
as discard, and if the peer cannot do discards itself, or has
discard_zeroes_data == 0, the peer will use zeroout instead.

One use-case for this is the device mapper "thin provisioning".
At the time I wrote those "odd" hacks, dm thin targets
would set discard_zeroes_data=0, NOT change discard granularity,
but only actually discard (drop from the tree) whole "chunks",
leaving partial start/end chunks in the mapping tree unchanged.

The logic of "only stack discard, if backend discard_zeroes_data"
would mean that we would not be able to accept and pass down discards
to dm-thin targets. But with data on dm-thin, you would really like
to do the occasional fstrim.

Also, IO backends on the peers do not have to have the same
characteristics.  You could have the DRBD Primary on some SSD,
and the Secondary on some thin-pool LV,
scheduling thin snapshots in intervals or on demand.

With the logic of "use zero-out instead", fstrim would cause it to
fully allocate what was supposed to be thinly provisioned :-(

So what I did there was optionally tell DRBD that
"discard_zeroes_data == 0" on that peer would actually mean
"discard_zeroes_data == 1,
 IF you zero-out the partial chunks of this granularity yourself".

And implemented this "discard aligned chunks of that granularity,
and zero-out partial start/end chunks, if any".

And then claim to upper layers that, yes, discard_zeroes_data=1,
in that case, if so configured, even if our backend (dm-thin)
would say discard_zeroes_data=0.

Does that make sense?  Can we still do that?  Has something like that
been done in block core or device mapper meanwhile?

> and will need an audit from the maintainers.

Will need to make some time for review and testing.

Thanks,

    Lars

^ permalink raw reply

* Re: RFC: always use REQ_OP_WRITE_ZEROES for zeroing offload
From: Mike Snitzer @ 2017-03-23 17:02 UTC (permalink / raw)
  To: Lars Ellenberg
  Cc: Christoph Hellwig, axboe, martin.petersen, agk, shli,
	philipp.reisner, linux-block, linux-scsi, drbd-dev, dm-devel,
	linux-raid
In-Reply-To: <20170323155410.GD1138@soda.linbit>

On Thu, Mar 23 2017 at 11:54am -0400,
Lars Ellenberg <lars.ellenberg@linbit.com> wrote:

> On Thu, Mar 23, 2017 at 10:33:18AM -0400, Christoph Hellwig wrote:
> > This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
> > supported by the block layer, and switches existing implementations
> > of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
> > removes incorrect discard_zeroes_data, and also switches WRITE SAME
> > based zeroing in SCSI to this new method.
> > 
> > I've done testing with ATA, SCSI and NVMe setups, but there are
> > a few things that will need more attention:
> > 
> 
> >  - The DRBD code in this area was very odd,
> 
> DRBD wants all replicas to give back identical data.
> If what comes back after a discard is "undefined",
> we cannot really use that.
> 
> We used to "stack" discard only if our local backend claimed
> "discard_zeroes_data". We replicate that IO request to the peer
> as discard, and if the peer cannot do discards itself, or has
> discard_zeroes_data == 0, the peer will use zeroout instead.
> 
> One use-case for this is the device mapper "thin provisioning".
> At the time I wrote those "odd" hacks, dm thin targets
> would set discard_zeroes_data=0, NOT change discard granularity,
> but only actually discard (drop from the tree) whole "chunks",
> leaving partial start/end chunks in the mapping tree unchanged.
> 
> The logic of "only stack discard, if backend discard_zeroes_data"
> would mean that we would not be able to accept and pass down discards
> to dm-thin targets. But with data on dm-thin, you would really like
> to do the occasional fstrim.

Are you sure you aren't thinking of MD raid?  E.g. raid5's
"devices_handle_discard_safely":
parm:           devices_handle_discard_safely:Set to Y if all devices in each array reliably return zeroes on reads from discarded regions (bool)

I don't recall DM thinp's discard support ever having a requirement for
discard_zeroes_data.

In fact, see header from commit b60ab990ccdf3 ("dm thin: do not expose
non-zero discard limits if discards disabled"):

    Also, always set discard_zeroes_data_unsupported in targets because they
    should never advertise the 'discard_zeroes_data' capability (even if the
    pool's data device supports it).

To this day, dm-thin.c has: ti->discard_zeroes_data_unsupported = true

^ permalink raw reply

* Re: [RFC PATCH 1/1] nbd: replace kill_bdev() with __invalidate_device()
From: Josef Bacik @ 2017-03-23 17:51 UTC (permalink / raw)
  To: Ming Lin, nbd-general@lists.sourceforge.net, Ratna Manoj Bolla
  Cc: linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	jianshu.ljs@alibaba-inc.com, xiongwei.jiang@alibaba-inc.com,
	james.liu@alibaba-inc.com
In-Reply-To: <1490050729-3578-2-git-send-email-mlin@kernel.org>

WWVhaCBJIHRoaW5rIHRoaXMgaXMgb2ssIEnigJlsbCB0aHJvdyBpdCBvbiBteSBxdWV1ZSBmb3Ig
Zml4ZXMgZm9yIHRoaXMgY3ljbGUuICBUaGFua3MsDQoNCkpvc2VmDQoNCk9uIDMvMjAvMTcsIDY6
NTggUE0sICJNaW5nIExpbiIgPG1saW5Aa2VybmVsLm9yZz4gd3JvdGU6DQoNCkZyb206IFJhdG5h
IE1hbm9qIEJvbGxhIDxtYW5vai5ickBnbWFpbC5jb20+DQoNCldoZW4gYSBmaWxlc3lzdGVtIGlz
IG1vdW50ZWQgb24gYSBuYmQgZGV2aWNlIGFuZCBvbiBhIGRpc2Nvbm5lY3QsIGJlY2F1c2UNCm9m
IGtpbGxfYmRldigpLCBhbmQgcmVzZXR0aW5nIGJkZXYgc2l6ZSB0byB6ZXJvLCBidWZmZXJfaGVh
ZCBtYXBwaW5ncyBhcmUNCmdldHRpbmcgZGVzdHJveWVkIHVuZGVyIG1vdW50ZWQgZmlsZXN5c3Rl
bS4NCg0KQWZ0ZXIgYSBiZGV2IHNpemUgcmVzZXQoaS5lIGJkZXYtPmJkX2lub2RlLT5pX3NpemUg
PSAwKSBvbiBhIGRpc2Nvbm5lY3QsDQpmb2xsb3dlZCBieSBhIHN5c191bW91bnQoKSwNCiAgICAg
ICAgZ2VuZXJpY19zaHV0ZG93bl9zdXBlcigpLT4uLi4NCiAgICAgICAgLT5fX3N5bmNfYmxvY2tk
ZXYoKS0+Li4uDQogICAgICAgIC1ibGtkZXZfd3JpdGVwYWdlcygpLT4uLi4NCiAgICAgICAgLT5k
b19pbnZhbGlkYXRlcGFnZSgpLT4uLi4NCiAgICAgICAgLWRpc2NhcmRfYnVmZmVyKCkgICBpcyBk
aXNjYXJkaW5nIHN1cGVyYmxvY2sgYnVmZmVyX2hlYWQgYXNzdW1lZA0KdG8gYmUgaW4gbWFwcGVk
IHN0YXRlIGJ5IGV4dDRfY29tbWl0X3N1cGVyKCkuDQoNClttbGluOiBwb3J0ZWQgdG8gNC4xMS1y
YzJdDQpTaWduZWQtb2ZmLWJ5OiBSYXRuYSBNYW5vaiBCb2xsYSA8bWFub2ouYnJAZ21haWwuY29t
DQotLS0NCiBkcml2ZXJzL2Jsb2NrL25iZC5jIHwgOCArKysrKystLQ0KIDEgZmlsZSBjaGFuZ2Vk
LCA2IGluc2VydGlvbnMoKyksIDIgZGVsZXRpb25zKC0pDQoNCmRpZmYgLS1naXQgYS9kcml2ZXJz
L2Jsb2NrL25iZC5jIGIvZHJpdmVycy9ibG9jay9uYmQuYw0KaW5kZXggY2I0Y2NmYy4uYTZhMzY0
MyAxMDA2NDQNCi0tLSBhL2RyaXZlcnMvYmxvY2svbmJkLmMNCisrKyBiL2RyaXZlcnMvYmxvY2sv
bmJkLmMNCkBAIC0xMjUsNyArMTI1LDggQEAgc3RhdGljIGNvbnN0IGNoYXIgKm5iZGNtZF90b19h
c2NpaShpbnQgY21kKQ0KIA0KIHN0YXRpYyBpbnQgbmJkX3NpemVfY2xlYXIoc3RydWN0IG5iZF9k
ZXZpY2UgKm5iZCwgc3RydWN0IGJsb2NrX2RldmljZSAqYmRldikNCiB7DQotCWJkX3NldF9zaXpl
KGJkZXYsIDApOw0KKwlpZiAoYmRldi0+YmRfb3BlbmVycyA8PSAxKQ0KKwkJYmRfc2V0X3NpemUo
YmRldiwgMCk7DQogCXNldF9jYXBhY2l0eShuYmQtPmRpc2ssIDApOw0KIAlrb2JqZWN0X3VldmVu
dCgmbmJkX3RvX2RldihuYmQpLT5rb2JqLCBLT0JKX0NIQU5HRSk7DQogDQpAQCAtNjAzLDYgKzYw
NCw4IEBAIHN0YXRpYyB2b2lkIG5iZF9yZXNldChzdHJ1Y3QgbmJkX2RldmljZSAqbmJkKQ0KIA0K
IHN0YXRpYyB2b2lkIG5iZF9iZGV2X3Jlc2V0KHN0cnVjdCBibG9ja19kZXZpY2UgKmJkZXYpDQog
ew0KKwlpZiAoYmRldi0+YmRfb3BlbmVycyA+IDEpDQorCQlyZXR1cm47DQogCXNldF9kZXZpY2Vf
cm8oYmRldiwgZmFsc2UpOw0KIAliZGV2LT5iZF9pbm9kZS0+aV9zaXplID0gMDsNCiAJaWYgKG1h
eF9wYXJ0ID4gMCkgew0KQEAgLTY2Niw3ICs2NjksOCBAQCBzdGF0aWMgaW50IG5iZF9jbGVhcl9z
b2NrKHN0cnVjdCBuYmRfZGV2aWNlICpuYmQsIHN0cnVjdCBibG9ja19kZXZpY2UgKmJkZXYpDQog
ew0KIAlzb2NrX3NodXRkb3duKG5iZCk7DQogCW5iZF9jbGVhcl9xdWUobmJkKTsNCi0Ja2lsbF9i
ZGV2KGJkZXYpOw0KKw0KKwlfX2ludmFsaWRhdGVfZGV2aWNlKGJkZXYsIHRydWUpOw0KIAluYmRf
YmRldl9yZXNldChiZGV2KTsNCiAJLyoNCiAJICogV2Ugd2FudCB0byBnaXZlIHRoZSBydW4gdGhy
ZWFkIGEgY2hhbmNlIHRvIHdhaXQgZm9yIGV2ZXJ5Ym9keQ0KLS0gDQoxLjguMy4xDQoNCg0KDQo=

^ permalink raw reply

* Re: [RFC PATCH 1/1] nbd: replace kill_bdev() with __invalidate_device()
From: Ming Lin @ 2017-03-23 20:58 UTC (permalink / raw)
  To: Josef Bacik
  Cc: nbd-general@lists.sourceforge.net, Ratna Manoj Bolla,
	linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	jianshu.ljs@alibaba-inc.com, xiongwei.jiang@alibaba-inc.com,
	james.liu@alibaba-inc.com
In-Reply-To: <8AA3DAC1-5E9C-438E-904B-D4C708A4F624@fb.com>

On Thu, Mar 23, 2017 at 10:51 AM, Josef Bacik <jbacik@fb.com> wrote:
> Yeah I think this is ok, I’ll throw it on my queue for fixes for this cycle.  Thanks,

Great. Thanks.

>
> Josef

^ permalink raw reply

* Re: RFC: always use REQ_OP_WRITE_ZEROES for zeroing offload
From: Lars Ellenberg @ 2017-03-23 22:53 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Christoph Hellwig, axboe, martin.petersen, agk, shli,
	philipp.reisner, linux-block, linux-scsi, drbd-dev, dm-devel,
	linux-raid
In-Reply-To: <20170323170221.GA20854@redhat.com>

On Thu, Mar 23, 2017 at 01:02:22PM -0400, Mike Snitzer wrote:
> On Thu, Mar 23 2017 at 11:54am -0400,
> Lars Ellenberg <lars.ellenberg@linbit.com> wrote:
> 
> > On Thu, Mar 23, 2017 at 10:33:18AM -0400, Christoph Hellwig wrote:
> > > This series makes REQ_OP_WRITE_ZEROES the only zeroing offload
> > > supported by the block layer, and switches existing implementations
> > > of REQ_OP_DISCARD that correctly set discard_zeroes_data to it,
> > > removes incorrect discard_zeroes_data, and also switches WRITE SAME
> > > based zeroing in SCSI to this new method.
> > > 
> > > I've done testing with ATA, SCSI and NVMe setups, but there are
> > > a few things that will need more attention:
> > > 
> > 
> > >  - The DRBD code in this area was very odd,
> > 
> > DRBD wants all replicas to give back identical data.
> > If what comes back after a discard is "undefined",
> > we cannot really use that.
> > 
> > We used to "stack" discard only if our local backend claimed
> > "discard_zeroes_data". We replicate that IO request to the peer
> > as discard, and if the peer cannot do discards itself, or has
> > discard_zeroes_data == 0, the peer will use zeroout instead.
> > 
> > One use-case for this is the device mapper "thin provisioning".
> > At the time I wrote those "odd" hacks, dm thin targets
> > would set discard_zeroes_data=0, NOT change discard granularity,
> > but only actually discard (drop from the tree) whole "chunks",
> > leaving partial start/end chunks in the mapping tree unchanged.
> > 
> > The logic of "only stack discard, if backend discard_zeroes_data"

That is DRBD's logic I just explained above.
And the "backend" (to DRBD) in that sentence would be thin, and not
the "real" hardware below thin, which may not even support discard.

> > would mean that we would not be able to accept and pass down discards
> > to dm-thin targets. But with data on dm-thin, you would really like
> > to do the occasional fstrim.
> 
> Are you sure you aren't thinking of MD raid?

Yes.

> To this day, dm-thin.c has: ti->discard_zeroes_data_unsupported = true

That is exactly what I was saying.

Thin does not claim to zero data on discard, which is ok, and correct,
because it only punches holes on full chunks (or whatever you call
them), and leaves the rest in the mapping tree as is.

And that behaviour would prevent DRBD from exposing discards if
configured on top of thin. (see above)

But thin *could* easily guarantee zeroing, by simply punching holes
where it can, and zeroing out the not fully-aligned partial start and
end of the range.

Which is what I added as an option between DRBD and whatever is below,
with the use-case of dm-thin in mind.

And that made it possible for DRBD to
 a) expose "discard" to upper layers, even if we would usually only do
    so if the DRBD Primary sits on top of a device that guarantees discard
    zeros data,
 b) still use discards on a secondary, without falling back to zero-out,
    which would unexpectedly fully allocate, instead of trim, a thinly
    provisioned device-mapper target.


Thanks,

    Lars

^ permalink raw reply

* Re: [PATCH v2] block: trace completion of all bios.
From: NeilBrown @ 2017-03-24  0:06 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, linux-block, linux-raid, dm-devel,
	Alasdair Kergon, Mike Snitzer, Shaohua Li, linux-kernel,
	Martin K . Petersen
In-Reply-To: <20170323104331.GA16903@ming.t460p>

[-- Attachment #1: Type: text/plain, Size: 1533 bytes --]

On Thu, Mar 23 2017, Ming Lei wrote:

> On Thu, Mar 23, 2017 at 05:29:02PM +1100, NeilBrown wrote:
>> 
>>  /**
>> + * bio_endio_notrace - end I/O on a bio without tracing
>> + * @bio:	bio
>> + *
>> + * Description:
>> + *   bio_endio_notrace() will end I/O on the whole bio.
>> + *   bio_endio_notrace() should only be call if a completion trace
>> + *   event is not needed.  This can be the case if a request-level
>> + *   completion event has already been generated, if the bio is
>> + *   being completed early, before it was even queued.
>> + *
>> + **/
>> +void bio_endio_notrace(struct bio *bio)
>> +{
>> +again:
...
>> +
>> +	if (bio->bi_bdev)
>> +		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
>> +					 bio, bio->bi_error);
>
> The notrace version still traces?

Ugh.  Thanks :-(

>
>> +	if (bio->bi_end_io)
>> +		bio->bi_end_io(bio);
>> +}
>> +EXPORT_SYMBOL(bio_endio_notrace);
>
> It isn't a good idea to duplicate bio_endio here, and any change on
> bio_endio() may be needed for this _notrace version too in future.

I uhmed and arhhed about that.  The function is so small....
But I've had a change of heart.  I don't think that having separate
bio_endio() and bio_endio_notrace() is such a good idea. It is too easy
to use the wrong one.  It is much better to make it automatically do the
right thing.
So following is a new patch - more thoroughly tested - which handles
more cases, and doesn't need any follow-up changes for filesystems.

Thanks,
Neilbrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* [PATCH v3] block: trace completion of all bios.
From: NeilBrown @ 2017-03-24  0:07 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig, Jens Axboe
  Cc: linux-block, linux-raid, dm-devel, Alasdair Kergon, Mike Snitzer,
	Shaohua Li, linux-kernel, Martin K . Petersen
In-Reply-To: <20170323104331.GA16903@ming.t460p>

[-- Attachment #1: Type: text/plain, Size: 7627 bytes --]


Currently only dm and md/raid5 bios trigger
trace_block_bio_complete().  Now that we have bio_chain() and
bio_inc_remaining(), it is not possible, in general, for a driver to
know when the bio is really complete.  Only bio_endio() knows that.

So move the trace_block_bio_complete() call to bio_endio().

Now trace_block_bio_complete() pairs with trace_block_bio_queue().
Any bio for which a 'queue' event is traced, will subsequently
generate a 'complete' event.

There are a few cases where completion tracing is not wanted.
1/ If blk_update_request() has already generated a completion
   trace event at the 'request' level, there is no point generating
   one at the bio level too.  In this case the bi_sector and bi_size
   will have changed, so the bio level event would be wrong.

2/ If the bio hasn't actually been queued yet, but is being aborted
   early, then a trace event could be confusing.  Some filesystems
   call bio_endio() but do not want tracing.

3/ The bio_integrity code interposes itself by replacing bi_end_io,
   then restoring it and calling bio_endio() again.  This would produce
   two identical trace events if left like that.

To handle these, we introduce a flag BIO_TRACE_COMPLETION and only
produce the trace event when this is set.
We address point 1 above by clearing the flag in blk_update_request().
We address point 2 above by only setting the flag when
generic_make_request() is called.
We address point 3 above by clearing the flag after generating a
completion event.

When bio_split() is used on a bio, particularly in blk_queue_split(),
there is an extra complication.  A new bio is split off the front, and
may be handled directly without going through generic_make_request().
The old bio, which has been advanced, is passed to
generic_make_request(), so it will trigger a trace event a second
time.
Probably the best result when a split happens is to see a single
'queue' event for the whole bio, then multiple 'complete' events - one
for each component.  To achieve this we can:
- copy the BIO_TRACE_COMPLETION flag to the new bio in bio_split()
- avoid generating a 'queue' event if BIO_TRACE_COMPLETION is already set.
This way, the split-off bio won't create a queue event, the original
won't either even if it is re-submitted to generic_make_request(),
but both will produce completion events, each for their own range.

So if generic_make_request() is called (which generates a QUEUED
event), then bio_endio() will create a single COMPLETE event for each
range that the bio is split into, unless the driver has explicitly
requested it not to.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/bio.c               | 13 +++++++++++++
 block/blk-core.c          | 10 +++++++++-
 drivers/md/dm.c           |  1 -
 drivers/md/raid5.c        |  8 --------
 include/linux/blk_types.h |  4 +++-
 5 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 5eec5e08417f..c1272986133e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1818,6 +1818,11 @@ static inline bool bio_remaining_done(struct bio *bio)
  *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
  *   way to end I/O on a bio. No one should call bi_end_io() directly on a
  *   bio unless they own it and thus know that it has an end_io function.
+ *
+ *   bio_endio() can be called several times on a bio that has been chained
+ *   using bio_chain().  The ->bi_end_io() function will only be call the
+ *   last time.  At this point the BLK_TA_COMPLETE tracing event will be
+ *   generated if BIO_TRACE_COMPLETION is set.
  **/
 void bio_endio(struct bio *bio)
 {
@@ -1838,6 +1843,11 @@ void bio_endio(struct bio *bio)
 		goto again;
 	}
 
+	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
+					 bio, bio->bi_error);
+		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+	}
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
@@ -1876,6 +1886,9 @@ struct bio *bio_split(struct bio *bio, int sectors,
 
 	bio_advance(bio, split->bi_iter.bi_size);
 
+	if (bio_flagged(bio, BIO_TRACE_COMPLETION))
+		bio_set_flag(bio, BIO_TRACE_COMPLETION);
+
 	return split;
 }
 EXPORT_SYMBOL(bio_split);
diff --git a/block/blk-core.c b/block/blk-core.c
index 0eeb99ef654f..b34b5b1b1bbf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1936,7 +1936,13 @@ generic_make_request_checks(struct bio *bio)
 	if (!blkcg_bio_issue_check(q, bio))
 		return false;
 
-	trace_block_bio_queue(q, bio);
+	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+		trace_block_bio_queue(q, bio);
+		/* Now that enqueuing has been traced, we need to trace
+		 * completion as well.
+		 */
+		bio_set_flag(bio, BIO_TRACE_COMPLETION);
+	}
 	return true;
 
 not_supported:
@@ -2595,6 +2601,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		if (bio_bytes == bio->bi_iter.bi_size)
 			req->bio = bio->bi_next;
 
+		/* Completion has already been traced */
+		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 		req_bio_endio(req, bio, bio_bytes, error);
 
 		total_bytes += bio_bytes;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f4ffd1eb8f44..f5f09ace690a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
-			trace_block_bio_complete(md->queue, bio, io_error);
 			bio->bi_error = io_error;
 			bio_endio(bio);
 		}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9a3b7da34137..f684cb566721 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5141,8 +5141,6 @@ static void raid5_align_endio(struct bio *bi)
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error) {
-		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
-					 raid_bi, 0);
 		bio_endio(raid_bi);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_quiescent);
@@ -5727,10 +5725,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 		md_write_end(mddev);
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
-
-
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
 		bio_endio(bi);
 	}
 }
@@ -6138,8 +6132,6 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	}
 	remaining = raid5_dec_bi_active_stripes(raid_bio);
 	if (remaining == 0) {
-		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
-					 raid_bio, 0);
 		bio_endio(raid_bio);
 	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d703acb55d0f..db7a57ee0e58 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -29,7 +29,7 @@ struct bio {
 						 * top bits REQ_OP. Use
 						 * accessors.
 						 */
-	unsigned short		bi_flags;	/* status, command, etc */
+	unsigned short		bi_flags;	/* status, etc */
 	unsigned short		bi_ioprio;
 
 	struct bvec_iter	bi_iter;
@@ -102,6 +102,8 @@ struct bio {
 #define BIO_REFFED	8	/* bio has elevated ->bi_cnt */
 #define BIO_THROTTLED	9	/* This bio has already been subjected to
 				 * throttling rules. Don't do it again. */
+#define BIO_TRACE_COMPLETION 10	/* bio_endio() should trace the final completion
+				 * of this bio. */
 
 /*
  * Flags starting here get preserved by bio_reset() - this includes
-- 
2.12.0


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply related

* Re: [PATCH v3 08/14] block: introduce bio_copy_data_partial
From: Shaohua Li @ 2017-03-24  5:34 UTC (permalink / raw)
  To: Ming Lei, axboe; +Cc: Jens Axboe, linux-raid, linux-block, Christoph Hellwig
In-Reply-To: <20170316161235.27110-9-tom.leiming@gmail.com>

Jens,
can you look at this patch? If it's ok, I'd like to route it through md tree.

Thanks,
Shaohua

On Fri, Mar 17, 2017 at 12:12:29AM +0800, Ming Lei wrote:
> Turns out we can use bio_copy_data in raid1's write behind,
> and we can make alloc_behind_pages() more clean/efficient,
> but we need a partial version of bio_copy_data().
> 
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  block/bio.c         | 60 +++++++++++++++++++++++++++++++++++++++++------------
>  include/linux/bio.h |  2 ++
>  2 files changed, 49 insertions(+), 13 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index e75878f8b14a..1ccff0dace89 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1025,19 +1025,8 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL(bio_alloc_pages);
>  
> -/**
> - * bio_copy_data - copy contents of data buffers from one chain of bios to
> - * another
> - * @src: source bio list
> - * @dst: destination bio list
> - *
> - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
> - * @src and @dst as linked lists of bios.
> - *
> - * Stops when it reaches the end of either @src or @dst - that is, copies
> - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
> - */
> -void bio_copy_data(struct bio *dst, struct bio *src)
> +static void __bio_copy_data(struct bio *dst, struct bio *src,
> +			    int offset, int size)
>  {
>  	struct bvec_iter src_iter, dst_iter;
>  	struct bio_vec src_bv, dst_bv;
> @@ -1047,6 +1036,12 @@ void bio_copy_data(struct bio *dst, struct bio *src)
>  	src_iter = src->bi_iter;
>  	dst_iter = dst->bi_iter;
>  
> +	/* for supporting partial copy */
> +	if (offset || size != src->bi_iter.bi_size) {
> +		bio_advance_iter(src, &src_iter, offset);
> +		src_iter.bi_size = size;
> +	}
> +
>  	while (1) {
>  		if (!src_iter.bi_size) {
>  			src = src->bi_next;
> @@ -1083,8 +1078,47 @@ void bio_copy_data(struct bio *dst, struct bio *src)
>  		bio_advance_iter(dst, &dst_iter, bytes);
>  	}
>  }
> +
> +/**
> + * bio_copy_data - copy contents of data buffers from one chain of bios to
> + * another
> + * @src: source bio list
> + * @dst: destination bio list
> + *
> + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
> + * @src and @dst as linked lists of bios.
> + *
> + * Stops when it reaches the end of either @src or @dst - that is, copies
> + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
> + */
> +void bio_copy_data(struct bio *dst, struct bio *src)
> +{
> +	__bio_copy_data(dst, src, 0, src->bi_iter.bi_size);
> +}
>  EXPORT_SYMBOL(bio_copy_data);
>  
> +/**
> + * bio_copy_data_partial - copy partial contents of data buffers from one
> + * chain of bios to another
> + * @dst: destination bio list
> + * @src: source bio list
> + * @offset: starting copy from the offset
> + * @size: how many bytes to copy
> + *
> + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
> + * @src and @dst as linked lists of bios.
> + *
> + * Stops when it reaches the end of either @src or @dst - that is, copies
> + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
> + */
> +void bio_copy_data_partial(struct bio *dst, struct bio *src,
> +			   int offset, int size)
> +{
> +	__bio_copy_data(dst, src, offset, size);
> +
> +}
> +EXPORT_SYMBOL(bio_copy_data_partial);
> +
>  struct bio_map_data {
>  	int is_our_pages;
>  	struct iov_iter iter;
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 8e521194f6fc..42b62a0288b0 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -468,6 +468,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
>  #endif
>  
>  extern void bio_copy_data(struct bio *dst, struct bio *src);
> +extern void bio_copy_data_partial(struct bio *dst, struct bio *src,
> +				  int offset, int size);
>  extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
>  extern void bio_free_pages(struct bio *bio);
>  
> -- 
> 2.9.3
> 

^ permalink raw reply

* Re: [PATCH v3 02/14] md: move two macros into md.h
From: NeilBrown @ 2017-03-24  5:57 UTC (permalink / raw)
  To: Ming Lei, Shaohua Li, Jens Axboe, linux-raid, linux-block,
	Christoph Hellwig
  Cc: Ming Lei
In-Reply-To: <20170316161235.27110-3-tom.leiming@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2750 bytes --]

On Fri, Mar 17 2017, Ming Lei wrote:

> Both raid1 and raid10 share common resync
> block size and page count, so move them into md.h.

I don't think this is necessary.
These are just "magic" numbers.  They don't have any real
meaning and so don't belong in md.h, or any .h file.

Possibly we should find more meaningful numbers, or make them auto-size
or something.  I'm also happy for them to stay as they are for now.
But I don't think we should pretend that they are meaningful.

Thanks,
NeilBrown


>
> Signed-off-by: Ming Lei <tom.leiming@gmail.com>
> ---
>  drivers/md/md.h     | 5 +++++
>  drivers/md/raid1.c  | 2 --
>  drivers/md/raid10.c | 3 ---
>  3 files changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index b8859cbf84b6..1d63239a1be4 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -715,4 +715,9 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
>  	    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
>  		mddev->queue->limits.max_write_same_sectors = 0;
>  }
> +
> +/* Maximum size of each resync request */
> +#define RESYNC_BLOCK_SIZE (64*1024)
> +#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
> +
>  #endif /* _MD_MD_H */
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 4a0b2ad5025e..908e2caeb704 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -94,10 +94,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
>  	kfree(r1_bio);
>  }
>  
> -#define RESYNC_BLOCK_SIZE (64*1024)
>  #define RESYNC_DEPTH 32
>  #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
> -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
>  #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
>  #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
>  #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index b4a56a488668..2b40420299e3 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -125,9 +125,6 @@ static void r10bio_pool_free(void *r10_bio, void *data)
>  	kfree(r10_bio);
>  }
>  
> -/* Maximum size of each resync request */
> -#define RESYNC_BLOCK_SIZE (64*1024)
> -#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
>  /* amount of memory to reserve for resync requests */
>  #define RESYNC_WINDOW (1024*1024)
>  /* maximum number of concurrent requests, memory permitting */
> -- 
> 2.9.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox