Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* [PATCH 8/8] block: use bio_has_data to check if a bio has bvecs
From: Christoph Hellwig @ 2017-04-10 16:08 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

Now that Write Same is gone and discard bios never have a payload, we
can simply use bio_has_data() as an indicator that the bio has bvecs
that need to be handled.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c         |  8 +-------
 block/blk-merge.c   |  9 +--------
 include/linux/bio.h | 21 +++++----------------
 3 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index b310e7ef3fbf..1c9f04c30ba9 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -679,15 +679,9 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
 	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
 
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		break;
-	default:
+	if (bio_has_data(bio)) {
 		__bio_for_each_segment(bv, bio_src, iter, iter_src)
 			bio->bi_io_vec[bio->bi_vcnt++] = bv;
-		break;
 	}
 
 	if (bio_integrity(bio_src)) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d6c86bfc5722..549d060097f1 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -232,16 +232,9 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	struct bio *fbio, *bbio;
 	struct bvec_iter iter;
 
-	if (!bio)
+	if (!bio || !bio_has_data(bio))
 		return 0;
 
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		return 0;
-	}
-
 	fbio = bio;
 	cluster = blk_queue_cluster(q);
 	seg_size = 0;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7a24a1a24967..86bf531f97aa 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -178,26 +178,15 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
 {
 	unsigned segs = 0;
-	struct bio_vec bv;
-	struct bvec_iter iter;
 
-	/*
-	 * We special case discard/write same/write zeroes, because they
-	 * interpret bi_size differently:
-	 */
+	if (bio_has_data(bio)) {
+		struct bio_vec bv;
+		struct bvec_iter iter;
 
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		return 0;
-	default:
-		break;
+		__bio_for_each_segment(bv, bio, iter, *bvec)
+			segs++;
 	}
 
-	__bio_for_each_segment(bv, bio, iter, *bvec)
-		segs++;
-
 	return segs;
 }
 
-- 
2.11.0

^ permalink raw reply related

* [PATCH 7/8] block: remove bio_no_advance_iter
From: Christoph Hellwig @ 2017-04-10 16:08 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

Now that we don't have to support the odd Write Same special case,
we can simply advance the iter if the bio has data, and otherwise
just manipulate bi_size directly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/bio.h | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 96a20afb8575..7a24a1a24967 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -83,13 +83,6 @@ static inline bool bio_has_data(struct bio *bio)
 	return false;
 }
 
-static inline bool bio_no_advance_iter(struct bio *bio)
-{
-	return bio_op(bio) == REQ_OP_DISCARD ||
-	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
-	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
-}
-
 static inline bool bio_mergeable(struct bio *bio)
 {
 	if (bio->bi_opf & REQ_NOMERGE_FLAGS)
@@ -165,10 +158,10 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 {
 	iter->bi_sector += bytes >> 9;
 
-	if (bio_no_advance_iter(bio))
-		iter->bi_size -= bytes;
-	else
+	if (bio_has_data(bio))
 		bvec_iter_advance(bio->bi_io_vec, iter, bytes);
+	else
+		iter->bi_size -= bytes;
 }
 
 #define __bio_for_each_segment(bvl, bio, iter, start)			\
-- 
2.11.0

^ permalink raw reply related

* [PATCH 6/8] block: remove REQ_OP_WRITE_SAME support
From: Christoph Hellwig @ 2017-04-10 16:08 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c                 |  3 --
 block/blk-core.c            | 11 +-----
 block/blk-lib.c             | 90 ---------------------------------------------
 block/blk-merge.c           | 32 ----------------
 block/blk-settings.c        | 16 --------
 block/blk-sysfs.c           | 12 ------
 include/linux/bio.h         |  3 --
 include/linux/blk_types.h   |  4 +-
 include/linux/blkdev.h      | 26 -------------
 include/trace/events/f2fs.h |  1 -
 kernel/trace/blktrace.c     |  1 -
 11 files changed, 2 insertions(+), 197 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index f4d207180266..b310e7ef3fbf 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -684,9 +684,6 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
 		break;
-	case REQ_OP_WRITE_SAME:
-		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-		break;
 	default:
 		__bio_for_each_segment(bv, bio_src, iter, iter_src)
 			bio->bi_io_vec[bio->bi_vcnt++] = bv;
diff --git a/block/blk-core.c b/block/blk-core.c
index 8654aa0cef6d..92336bc8495c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1929,10 +1929,6 @@ generic_make_request_checks(struct bio *bio)
 		if (!blk_queue_secure_erase(q))
 			goto not_supported;
 		break;
-	case REQ_OP_WRITE_SAME:
-		if (!bdev_write_same(bio->bi_bdev))
-			goto not_supported;
-		break;
 	case REQ_OP_ZONE_REPORT:
 	case REQ_OP_ZONE_RESET:
 		if (!bdev_is_zoned(bio->bi_bdev))
@@ -2100,12 +2096,7 @@ blk_qc_t submit_bio(struct bio *bio)
 	 * go through the normal accounting stuff before submission.
 	 */
 	if (bio_has_data(bio)) {
-		unsigned int count;
-
-		if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
-			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
-		else
-			count = bio_sectors(bio);
+		unsigned int count = bio_sectors(bio);
 
 		if (op_is_write(bio_op(bio))) {
 			count_vm_events(PGPGOUT, count);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index e8caecd71688..57c99b9b3b78 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -131,96 +131,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
 
-/**
- * __blkdev_issue_write_same - generate number of bios with same page
- * @bdev:	target blockdev
- * @sector:	start sector
- * @nr_sects:	number of sectors to write
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @page:	page containing data to write
- * @biop:	pointer to anchor bio
- *
- * Description:
- *  Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page.
- */
-static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, struct page *page,
-		struct bio **biop)
-{
-	struct request_queue *q = bdev_get_queue(bdev);
-	unsigned int max_write_same_sectors;
-	struct bio *bio = *biop;
-	sector_t bs_mask;
-
-	if (!q)
-		return -ENXIO;
-
-	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
-	if ((sector | nr_sects) & bs_mask)
-		return -EINVAL;
-
-	if (!bdev_write_same(bdev))
-		return -EOPNOTSUPP;
-
-	/* Ensure that max_write_same_sectors doesn't overflow bi_size */
-	max_write_same_sectors = UINT_MAX >> 9;
-
-	while (nr_sects) {
-		bio = next_bio(bio, 1, gfp_mask);
-		bio->bi_iter.bi_sector = sector;
-		bio->bi_bdev = bdev;
-		bio->bi_vcnt = 1;
-		bio->bi_io_vec->bv_page = page;
-		bio->bi_io_vec->bv_offset = 0;
-		bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
-		bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0);
-
-		if (nr_sects > max_write_same_sectors) {
-			bio->bi_iter.bi_size = max_write_same_sectors << 9;
-			nr_sects -= max_write_same_sectors;
-			sector += max_write_same_sectors;
-		} else {
-			bio->bi_iter.bi_size = nr_sects << 9;
-			nr_sects = 0;
-		}
-		cond_resched();
-	}
-
-	*biop = bio;
-	return 0;
-}
-
-/**
- * blkdev_issue_write_same - queue a write same operation
- * @bdev:	target blockdev
- * @sector:	start sector
- * @nr_sects:	number of sectors to write
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @page:	page containing data
- *
- * Description:
- *    Issue a write same request for the sectors in question.
- */
-int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
-				sector_t nr_sects, gfp_t gfp_mask,
-				struct page *page)
-{
-	struct bio *bio = NULL;
-	struct blk_plug plug;
-	int ret;
-
-	blk_start_plug(&plug);
-	ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page,
-			&bio);
-	if (ret == 0 && bio) {
-		ret = submit_bio_wait(bio);
-		bio_put(bio);
-	}
-	blk_finish_plug(&plug);
-	return ret;
-}
-EXPORT_SYMBOL(blkdev_issue_write_same);
-
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
 		struct bio **biop, unsigned flags)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3990ae406341..d6c86bfc5722 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -68,22 +68,6 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
 	return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
 }
 
-static struct bio *blk_bio_write_same_split(struct request_queue *q,
-					    struct bio *bio,
-					    struct bio_set *bs,
-					    unsigned *nsegs)
-{
-	*nsegs = 1;
-
-	if (!q->limits.max_write_same_sectors)
-		return NULL;
-
-	if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
-		return NULL;
-
-	return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
-}
-
 static inline unsigned get_max_io_size(struct request_queue *q,
 				       struct bio *bio)
 {
@@ -216,9 +200,6 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
 	case REQ_OP_WRITE_ZEROES:
 		split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs);
 		break;
-	case REQ_OP_WRITE_SAME:
-		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
-		break;
 	default:
 		split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs);
 		break;
@@ -259,8 +240,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
 		return 0;
-	case REQ_OP_WRITE_SAME:
-		return 1;
 	}
 
 	fbio = bio;
@@ -454,8 +433,6 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 
 	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
 		nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg);
-	else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
-		nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg);
 	else if (rq->bio)
 		nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
 
@@ -688,10 +665,6 @@ static struct request *attempt_merge(struct request_queue *q,
 	    || req_no_special_merge(next))
 		return NULL;
 
-	if (req_op(req) == REQ_OP_WRITE_SAME &&
-	    !blk_write_same_mergeable(req->bio, next->bio))
-		return NULL;
-
 	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
@@ -806,11 +779,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
 		return false;
 
-	/* must be using the same buffer */
-	if (req_op(rq) == REQ_OP_WRITE_SAME &&
-	    !blk_write_same_mergeable(rq->bio, bio))
-		return false;
-
 	return true;
 }
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 4fa81ed383ca..aea05adfd6b4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,7 +96,6 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
 	lim->max_dev_sectors = 0;
 	lim->chunk_sectors = 0;
-	lim->max_write_same_sectors = 0;
 	lim->max_write_zeroes_sectors = 0;
 	lim->max_discard_sectors = 0;
 	lim->max_hw_discard_sectors = 0;
@@ -132,7 +131,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	lim->max_segment_size = UINT_MAX;
 	lim->max_sectors = UINT_MAX;
 	lim->max_dev_sectors = UINT_MAX;
-	lim->max_write_same_sectors = UINT_MAX;
 	lim->max_write_zeroes_sectors = UINT_MAX;
 }
 EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -291,18 +289,6 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
 EXPORT_SYMBOL(blk_queue_max_discard_sectors);
 
 /**
- * blk_queue_max_write_same_sectors - set max sectors for a single write same
- * @q:  the request queue for the device
- * @max_write_same_sectors: maximum number of sectors to write per command
- **/
-void blk_queue_max_write_same_sectors(struct request_queue *q,
-				      unsigned int max_write_same_sectors)
-{
-	q->limits.max_write_same_sectors = max_write_same_sectors;
-}
-EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
-
-/**
  * blk_queue_max_write_zeroes_sectors - set max sectors for a single
  *                                      write zeroes
  * @q:  the request queue for the device
@@ -557,8 +543,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
-	t->max_write_same_sectors = min(t->max_write_same_sectors,
-					b->max_write_same_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
 	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fc20489f0d2b..2ea4aca4ec1c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -211,12 +211,6 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
 	return queue_var_show(0, page);
 }
 
-static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
-{
-	return sprintf(page, "%llu\n",
-		(unsigned long long)q->limits.max_write_same_sectors << 9);
-}
-
 static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
 {
 	return sprintf(page, "%llu\n",
@@ -603,11 +597,6 @@ static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
 	.show = queue_discard_zeroes_data_show,
 };
 
-static struct queue_sysfs_entry queue_write_same_max_entry = {
-	.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
-	.show = queue_write_same_max_show,
-};
-
 static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
 	.attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
 	.show = queue_write_zeroes_max_show,
@@ -705,7 +694,6 @@ static struct attribute *default_attrs[] = {
 	&queue_discard_max_entry.attr,
 	&queue_discard_max_hw_entry.attr,
 	&queue_discard_zeroes_data_entry.attr,
-	&queue_write_same_max_entry.attr,
 	&queue_write_zeroes_max_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_zoned_entry.attr,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 4931756d86d9..96a20afb8575 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -87,7 +87,6 @@ static inline bool bio_no_advance_iter(struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
 	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
-	       bio_op(bio) == REQ_OP_WRITE_SAME ||
 	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
 }
 
@@ -199,8 +198,6 @@ static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec)
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
 		return 0;
-	case REQ_OP_WRITE_SAME:
-		return 1;
 	default:
 		break;
 	}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 61339bc44400..fc4fc927dcc4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -171,10 +171,8 @@ enum req_opf {
 	REQ_OP_SECURE_ERASE	= 5,
 	/* seset a zone write pointer */
 	REQ_OP_ZONE_RESET	= 6,
-	/* write the same sector many times */
-	REQ_OP_WRITE_SAME	= 7,
 	/* write the zero filled sector many times */
-	REQ_OP_WRITE_ZEROES	= 9,
+	REQ_OP_WRITE_ZEROES	= 7,
 
 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ec993573e0a8..1f066f246dd7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -326,7 +326,6 @@ struct queue_limits {
 	unsigned int		io_opt;
 	unsigned int		max_discard_sectors;
 	unsigned int		max_hw_discard_sectors;
-	unsigned int		max_write_same_sectors;
 	unsigned int		max_write_zeroes_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
@@ -806,14 +805,6 @@ static inline bool rq_mergeable(struct request *rq)
 	return true;
 }
 
-static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
-{
-	if (bio_data(a) == bio_data(b))
-		return true;
-
-	return false;
-}
-
 static inline unsigned int blk_queue_depth(struct request_queue *q)
 {
 	if (q->queue_depth)
@@ -1035,9 +1026,6 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 	if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
 		return min(q->limits.max_discard_sectors, UINT_MAX >> 9);
 
-	if (unlikely(op == REQ_OP_WRITE_SAME))
-		return q->limits.max_write_same_sectors;
-
 	if (unlikely(op == REQ_OP_WRITE_ZEROES))
 		return q->limits.max_write_zeroes_sectors;
 
@@ -1157,8 +1145,6 @@ extern void blk_queue_max_discard_segments(struct request_queue *,
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
-extern void blk_queue_max_write_same_sectors(struct request_queue *q,
-		unsigned int max_write_same_sectors);
 extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
 		unsigned int max_write_same_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
@@ -1336,8 +1322,6 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
-extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 
 #define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
 
@@ -1539,16 +1523,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
 	return q->limits.discard_alignment;
 }
 
-static inline unsigned int bdev_write_same(struct block_device *bdev)
-{
-	struct request_queue *q = bdev_get_queue(bdev);
-
-	if (q)
-		return q->limits.max_write_same_sectors;
-
-	return 0;
-}
-
 static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index c80fcad0a6c9..da1b542ef8d6 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -71,7 +71,6 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 		{ REQ_OP_ZONE_REPORT,		"ZONE_REPORT" },	\
 		{ REQ_OP_SECURE_ERASE,		"SECURE_ERASE" },	\
 		{ REQ_OP_ZONE_RESET,		"ZONE_RESET" },		\
-		{ REQ_OP_WRITE_SAME,		"WRITE_SAME" },		\
 		{ REQ_OP_WRITE_ZEROES,		"WRITE_ZEROES" })
 
 #define show_bio_op_flags(flags)					\
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd..99060c96a4bd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1750,7 +1750,6 @@ void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes)
 
 	switch (op & REQ_OP_MASK) {
 	case REQ_OP_WRITE:
-	case REQ_OP_WRITE_SAME:
 		rwbs[i++] = 'W';
 		break;
 	case REQ_OP_DISCARD:
-- 
2.11.0

^ permalink raw reply related

* [PATCH 5/8] dm: remove write same support
From: Christoph Hellwig @ 2017-04-10 16:08 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/dm-core.h          |  1 -
 drivers/md/dm-io.c            | 21 +--------------------
 drivers/md/dm-linear.c        |  1 -
 drivers/md/dm-mpath.c         |  1 -
 drivers/md/dm-rq.c            |  3 ---
 drivers/md/dm-stripe.c        |  4 +---
 drivers/md/dm-table.c         | 29 -----------------------------
 drivers/md/dm.c               | 23 -----------------------
 include/linux/device-mapper.h |  6 ------
 9 files changed, 2 insertions(+), 87 deletions(-)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index fea5bd52ada8..d661801d72e7 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -131,7 +131,6 @@ struct mapped_device {
 void dm_init_md_queue(struct mapped_device *md);
 void dm_init_normal_md_queue(struct mapped_device *md);
 int md_in_flight(struct mapped_device *md);
-void disable_write_same(struct mapped_device *md);
 void disable_write_zeroes(struct mapped_device *md);
 
 static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3702e502466d..105e68dabd3e 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -303,7 +303,6 @@ static void do_region(int op, int op_flags, unsigned region,
 	unsigned num_bvecs;
 	sector_t remaining = where->count;
 	struct request_queue *q = bdev_get_queue(where->bdev);
-	unsigned short logical_block_size = queue_logical_block_size(q);
 	sector_t num_sectors;
 	unsigned int uninitialized_var(special_cmd_max_sectors);
 
@@ -314,10 +313,7 @@ static void do_region(int op, int op_flags, unsigned region,
 		special_cmd_max_sectors = q->limits.max_discard_sectors;
 	else if (op == REQ_OP_WRITE_ZEROES)
 		special_cmd_max_sectors = q->limits.max_write_zeroes_sectors;
-	else if (op == REQ_OP_WRITE_SAME)
-		special_cmd_max_sectors = q->limits.max_write_same_sectors;
-	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
-	     op == REQ_OP_WRITE_SAME)  &&
+	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) &&
 	    special_cmd_max_sectors == 0) {
 		dec_count(io, region, -EOPNOTSUPP);
 		return;
@@ -336,9 +332,6 @@ static void do_region(int op, int op_flags, unsigned region,
 		case REQ_OP_WRITE_ZEROES:
 			num_bvecs = 0;
 			break;
-		case REQ_OP_WRITE_SAME:
-			num_bvecs = 1;
-			break;
 		default:
 			num_bvecs = min_t(int, BIO_MAX_PAGES,
 					  dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
@@ -355,18 +348,6 @@ static void do_region(int op, int op_flags, unsigned region,
 			num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
 			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
 			remaining -= num_sectors;
-		} else if (op == REQ_OP_WRITE_SAME) {
-			/*
-			 * WRITE SAME only uses a single page.
-			 */
-			dp->get_page(dp, &page, &len, &offset);
-			bio_add_page(bio, page, logical_block_size, offset);
-			num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
-			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
-
-			offset = 0;
-			remaining -= num_sectors;
-			dp->next_page(dp);
 		} else while (remaining) {
 			/*
 			 * Try and add as many pages as possible.
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index e17fd44ceef5..f928f7e9ee4a 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -58,7 +58,6 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
-	ti->num_write_same_bios = 1;
 	ti->num_write_zeroes_bios = 1;
 	ti->private = lc;
 	return 0;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index ab55955ed704..ece53947b99d 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1102,7 +1102,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
-	ti->num_write_same_bios = 1;
 	ti->num_write_zeroes_bios = 1;
 	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		ti->per_io_data_size = multipath_per_bio_data_size();
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index e60f1b6845be..6f8dc99685f2 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -299,9 +299,6 @@ static void dm_done(struct request *clone, int error, bool mapped)
 	}
 
 	if (unlikely(r == -EREMOTEIO)) {
-		if (req_op(clone) == REQ_OP_WRITE_SAME &&
-		    !clone->q->limits.max_write_same_sectors)
-			disable_write_same(tio->md);
 		if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
 		    !clone->q->limits.max_write_zeroes_sectors)
 			disable_write_zeroes(tio->md);
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 5ef49c121d99..cc5a00f2e2de 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -168,7 +168,6 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->num_flush_bios = stripes;
 	ti->num_discard_bios = stripes;
-	ti->num_write_same_bios = stripes;
 	ti->num_write_zeroes_bios = stripes;
 
 	sc->chunk_size = chunk_size;
@@ -294,8 +293,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_REMAPPED;
 	}
 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
-	    unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
-	    unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
+	    unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) {
 		target_bio_nr = dm_bio_get_target_bio_nr(bio);
 		BUG_ON(target_bio_nr >= sc->stripes);
 		return stripe_map_range(sc, bio, target_bio_nr);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 958275aca008..8bbc3d57fcc7 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1490,33 +1490,6 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
 	return true;
 }
 
-static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
-					 sector_t start, sector_t len, void *data)
-{
-	struct request_queue *q = bdev_get_queue(dev->bdev);
-
-	return q && !q->limits.max_write_same_sectors;
-}
-
-static bool dm_table_supports_write_same(struct dm_table *t)
-{
-	struct dm_target *ti;
-	unsigned i = 0;
-
-	while (i < dm_table_get_num_targets(t)) {
-		ti = dm_table_get_target(t, i++);
-
-		if (!ti->num_write_same_bios)
-			return false;
-
-		if (!ti->type->iterate_devices ||
-		    ti->type->iterate_devices(ti, device_not_write_same_capable, NULL))
-			return false;
-	}
-
-	return true;
-}
-
 static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
 					   sector_t start, sector_t len, void *data)
 {
@@ -1610,8 +1583,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	else
 		queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
 
-	if (!dm_table_supports_write_same(t))
-		q->limits.max_write_same_sectors = 0;
 	if (!dm_table_supports_write_zeroes(t))
 		q->limits.max_write_zeroes_sectors = 0;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8bf397729bbd..8259aa76839e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -816,14 +816,6 @@ static void dec_pending(struct dm_io *io, int error)
 	}
 }
 
-void disable_write_same(struct mapped_device *md)
-{
-	struct queue_limits *limits = dm_get_queue_limits(md);
-
-	/* device doesn't really support WRITE SAME, disable it */
-	limits->max_write_same_sectors = 0;
-}
-
 void disable_write_zeroes(struct mapped_device *md)
 {
 	struct queue_limits *limits = dm_get_queue_limits(md);
@@ -859,9 +851,6 @@ static void clone_endio(struct bio *bio)
 	}
 
 	if (unlikely(r == -EREMOTEIO)) {
-		if (bio_op(bio) == REQ_OP_WRITE_SAME &&
-		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
-			disable_write_same(md);
 		if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
 		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
 			disable_write_zeroes(md);
@@ -1209,11 +1198,6 @@ static unsigned get_num_discard_bios(struct dm_target *ti)
 	return ti->num_discard_bios;
 }
 
-static unsigned get_num_write_same_bios(struct dm_target *ti)
-{
-	return ti->num_write_same_bios;
-}
-
 static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
 {
 	return ti->num_write_zeroes_bios;
@@ -1268,11 +1252,6 @@ static int __send_discard(struct clone_info *ci)
 					   is_split_required_for_discard);
 }
 
-static int __send_write_same(struct clone_info *ci)
-{
-	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
-}
-
 static int __send_write_zeroes(struct clone_info *ci)
 {
 	return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
@@ -1290,8 +1269,6 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 
 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
 		return __send_discard(ci);
-	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
-		return __send_write_same(ci);
 	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
 		return __send_write_zeroes(ci);
 
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index c7ea33e38fb9..58f451ba9b75 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -249,12 +249,6 @@ struct dm_target {
 	unsigned num_discard_bios;
 
 	/*
-	 * The number of WRITE SAME bios that will be submitted to the target.
-	 * The bio number can be accessed with dm_bio_get_target_bio_nr.
-	 */
-	unsigned num_write_same_bios;
-
-	/*
 	 * The number of WRITE ZEROES bios that will be submitted to the target.
 	 * The bio number can be accessed with dm_bio_get_target_bio_nr.
 	 */
-- 
2.11.0

^ permalink raw reply related

* [PATCH 4/8] md: drop WRITE_SAME support
From: Christoph Hellwig @ 2017-04-10 16:08 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/linear.c    | 1 -
 drivers/md/md.h        | 7 -------
 drivers/md/multipath.c | 1 -
 drivers/md/raid0.c     | 2 --
 drivers/md/raid1.c     | 4 +---
 drivers/md/raid10.c    | 1 -
 drivers/md/raid5.c     | 1 -
 7 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 377a8a3672e3..da363f5d54b0 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -292,7 +292,6 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
 						      split, disk_devt(mddev->gendisk),
 						      bio_sector);
-			mddev_check_writesame(mddev, split);
 			mddev_check_write_zeroes(mddev, split);
 			generic_make_request(split);
 		}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1e76d64ce180..d82b11b5ae5a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -703,13 +703,6 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
 	mddev->flags &= ~unsupported_flags;
 }
 
-static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
-{
-	if (bio_op(bio) == REQ_OP_WRITE_SAME &&
-	    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
-		mddev->queue->limits.max_write_same_sectors = 0;
-}
-
 static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
 {
 	if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e95d521d93e9..68d67a404aab 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -138,7 +138,6 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
 	mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
 	mp_bh->bio.bi_end_io = multipath_end_request;
 	mp_bh->bio.bi_private = mp_bh;
-	mddev_check_writesame(mddev, &mp_bh->bio);
 	mddev_check_write_zeroes(mddev, &mp_bh->bio);
 	generic_make_request(&mp_bh->bio);
 	return;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ce7a6a56cf73..c094749c11e5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -382,7 +382,6 @@ static int raid0_run(struct mddev *mddev)
 		bool discard_supported = false;
 
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
 
@@ -504,7 +503,6 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
 						      split, disk_devt(mddev->gendisk),
 						      bio_sector);
-			mddev_check_writesame(mddev, split);
 			mddev_check_write_zeroes(mddev, split);
 			generic_make_request(split);
 		}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b59cc100320a..ac9ef686e625 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3177,10 +3177,8 @@ static int raid1_run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
-	if (mddev->queue) {
-		blk_queue_max_write_same_sectors(mddev->queue, 0);
+	if (mddev->queue)
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-	}
 
 	rdev_for_each(rdev, mddev) {
 		if (!mddev->gendisk)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 28ec3a93acee..79988908f862 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3748,7 +3748,6 @@ static int raid10_run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
-		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2efdb0d67460..04fd6a946825 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7262,7 +7262,6 @@ static int raid5_run(struct mddev *mddev)
 		blk_queue_max_discard_sectors(mddev->queue,
 					      0xfffe * STRIPE_SECTORS);
 
-		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 
 		rdev_for_each(rdev, mddev) {
-- 
2.11.0

^ permalink raw reply related

* [PATCH 3/8] sd: remove write same support
From: Christoph Hellwig @ 2017-04-10 16:08 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

There are no more end-users of REQ_OP_WRITE_SAME left, so we can start
deleting it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/scsi/sd.c     | 70 ---------------------------------------------------
 drivers/scsi/sd_zbc.c |  1 -
 2 files changed, 71 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 8cf34a8e3eea..a905802e927e 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -878,77 +878,10 @@ static void sd_config_write_same(struct scsi_disk *sdkp)
 		sdkp->zeroing_mode = SD_ZERO_WRITE;
 
 out:
-	blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
-					 (logical_block_size >> 9));
 	blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
 					 (logical_block_size >> 9));
 }
 
-/**
- * sd_setup_write_same_cmnd - write the same data to multiple blocks
- * @cmd: command to prepare
- *
- * Will issue either WRITE SAME(10) or WRITE SAME(16) depending on
- * preference indicated by target device.
- **/
-static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
-{
-	struct request *rq = cmd->request;
-	struct scsi_device *sdp = cmd->device;
-	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
-	struct bio *bio = rq->bio;
-	sector_t sector = blk_rq_pos(rq);
-	unsigned int nr_sectors = blk_rq_sectors(rq);
-	unsigned int nr_bytes = blk_rq_bytes(rq);
-	int ret;
-
-	if (sdkp->device->no_write_same)
-		return BLKPREP_INVALID;
-
-	BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
-
-	if (sd_is_zoned(sdkp)) {
-		ret = sd_zbc_setup_write_cmnd(cmd);
-		if (ret != BLKPREP_OK)
-			return ret;
-	}
-
-	sector >>= ilog2(sdp->sector_size) - 9;
-	nr_sectors >>= ilog2(sdp->sector_size) - 9;
-
-	rq->timeout = SD_WRITE_SAME_TIMEOUT;
-
-	if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff) {
-		cmd->cmd_len = 16;
-		cmd->cmnd[0] = WRITE_SAME_16;
-		put_unaligned_be64(sector, &cmd->cmnd[2]);
-		put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
-	} else {
-		cmd->cmd_len = 10;
-		cmd->cmnd[0] = WRITE_SAME;
-		put_unaligned_be32(sector, &cmd->cmnd[2]);
-		put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
-	}
-
-	cmd->transfersize = sdp->sector_size;
-	cmd->allowed = SD_MAX_RETRIES;
-
-	/*
-	 * For WRITE SAME the data transferred via the DATA OUT buffer is
-	 * different from the amount of data actually written to the target.
-	 *
-	 * We set up __data_len to the amount of data transferred via the
-	 * DATA OUT buffer so that blk_rq_map_sg sets up the proper S/G list
-	 * to transfer a single sector of data first, but then reset it to
-	 * the amount of data to be written right after so that the I/O path
-	 * knows how much to actually write.
-	 */
-	rq->__data_len = sdp->sector_size;
-	ret = scsi_init_io(cmd);
-	rq->__data_len = nr_bytes;
-	return ret;
-}
-
 static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
 {
 	struct request *rq = cmd->request;
@@ -1232,8 +1165,6 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 		}
 	case REQ_OP_WRITE_ZEROES:
 		return sd_setup_write_zeroes_cmnd(cmd);
-	case REQ_OP_WRITE_SAME:
-		return sd_setup_write_same_cmnd(cmd);
 	case REQ_OP_FLUSH:
 		return sd_setup_flush_cmnd(cmd);
 	case REQ_OP_READ:
@@ -1872,7 +1803,6 @@ static int sd_done(struct scsi_cmnd *SCpnt)
 	switch (req_op(req)) {
 	case REQ_OP_DISCARD:
 	case REQ_OP_WRITE_ZEROES:
-	case REQ_OP_WRITE_SAME:
 	case REQ_OP_ZONE_RESET:
 		if (!result) {
 			good_bytes = blk_rq_bytes(req);
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 1994f7799fce..8af6c9cd30ca 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -330,7 +330,6 @@ void sd_zbc_complete(struct scsi_cmnd *cmd,
 	switch (req_op(rq)) {
 	case REQ_OP_WRITE:
 	case REQ_OP_WRITE_ZEROES:
-	case REQ_OP_WRITE_SAME:
 	case REQ_OP_ZONE_RESET:
 
 		/* Unlock the zone */
-- 
2.11.0

^ permalink raw reply related

* [PATCH 2/8] target: remove iblock WRITE_SAME passthrough support
From: Christoph Hellwig @ 2017-04-10 16:08 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

Use the pscsi driver to support arbitrary command passthrough
instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/target/target_core_iblock.c | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c
index d316ed537d59..9da31970a004 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -415,39 +415,8 @@ iblock_execute_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb)
 }
 
 static sense_reason_t
-iblock_execute_write_same_direct(struct block_device *bdev, struct se_cmd *cmd)
-{
-	struct se_device *dev = cmd->se_dev;
-	struct scatterlist *sg = &cmd->t_data_sg[0];
-	struct page *page = NULL;
-	int ret;
-
-	if (sg->offset) {
-		page = alloc_page(GFP_KERNEL);
-		if (!page)
-			return TCM_OUT_OF_RESOURCES;
-		sg_copy_to_buffer(sg, cmd->t_data_nents, page_address(page),
-				  dev->dev_attrib.block_size);
-	}
-
-	ret = blkdev_issue_write_same(bdev,
-				target_to_linux_sector(dev, cmd->t_task_lba),
-				target_to_linux_sector(dev,
-					sbc_get_write_same_sectors(cmd)),
-				GFP_KERNEL, page ? page : sg_page(sg));
-	if (page)
-		__free_page(page);
-	if (ret)
-		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
-
-	target_complete_cmd(cmd, GOOD);
-	return 0;
-}
-
-static sense_reason_t
 iblock_execute_write_same(struct se_cmd *cmd)
 {
-	struct block_device *bdev = IBLOCK_DEV(cmd->se_dev)->ibd_bd;
 	struct iblock_req *ibr;
 	struct scatterlist *sg;
 	struct bio *bio;
@@ -472,9 +441,6 @@ iblock_execute_write_same(struct se_cmd *cmd)
 		return TCM_INVALID_CDB_FIELD;
 	}
 
-	if (bdev_write_same(bdev))
-		return iblock_execute_write_same_direct(bdev, cmd);
-
 	ibr = kzalloc(sizeof(struct iblock_req), GFP_KERNEL);
 	if (!ibr)
 		goto fail;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 1/8] drbd: drop REQ_OP_WRITE_SAME support
From: Christoph Hellwig @ 2017-04-10 16:08 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel
In-Reply-To: <20170410160807.23674-1-hch@lst.de>

Linux only used it for zeroing, for which we have better methods now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/drbd/drbd_main.c     | 28 ++----------------
 drivers/block/drbd/drbd_nl.c       | 60 --------------------------------------
 drivers/block/drbd/drbd_receiver.c | 38 +++---------------------
 drivers/block/drbd/drbd_req.c      |  1 -
 drivers/block/drbd/drbd_worker.c   |  4 ---
 5 files changed, 7 insertions(+), 124 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 84455c365f57..183468e0b959 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -931,7 +931,7 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
 		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 		p->qlim->discard_enabled = blk_queue_discard(q);
-		p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
+		p->qlim->write_same_capable = 0;
 	} else {
 		q = device->rq_queue;
 		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
@@ -1610,9 +1610,6 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
 					 ? 0 : MSG_MORE);
 		if (err)
 			return err;
-		/* REQ_OP_WRITE_SAME has only one segment */
-		if (bio_op(bio) == REQ_OP_WRITE_SAME)
-			break;
 	}
 	return 0;
 }
@@ -1631,9 +1628,6 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
 				      bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
 		if (err)
 			return err;
-		/* REQ_OP_WRITE_SAME has only one segment */
-		if (bio_op(bio) == REQ_OP_WRITE_SAME)
-			break;
 	}
 	return 0;
 }
@@ -1665,7 +1659,6 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
 		return  (bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0) |
 			(bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
 			(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
-			(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
 			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
 			(bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0);
 	else
@@ -1680,7 +1673,6 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 	struct drbd_device *device = peer_device->device;
 	struct drbd_socket *sock;
 	struct p_data *p;
-	struct p_wsame *wsame = NULL;
 	void *digest_out;
 	unsigned int dp_flags = 0;
 	int digest_size;
@@ -1717,27 +1709,13 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
 		err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
 		goto out;
 	}
-	if (dp_flags & DP_WSAME) {
-		/* this will only work if DRBD_FF_WSAME is set AND the
-		 * handshake agreed that all nodes and backend devices are
-		 * WRITE_SAME capable and agree on logical_block_size */
-		wsame = (struct p_wsame*)p;
-		digest_out = wsame + 1;
-		wsame->size = cpu_to_be32(req->i.size);
-	} else
-		digest_out = p + 1;
+	digest_out = p + 1;
 
 	/* our digest is still only over the payload.
 	 * TRIM does not carry any payload. */
 	if (digest_size)
 		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
-	if (wsame) {
-		err =
-		    __send_command(peer_device->connection, device->vnr, sock, P_WSAME,
-				   sizeof(*wsame) + digest_size, NULL,
-				   bio_iovec(req->master_bio).bv_len);
-	} else
-		err =
+	err =
 		    __send_command(peer_device->connection, device->vnr, sock, P_DATA,
 				   sizeof(*p) + digest_size, NULL, req->i.size);
 	if (!err) {
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 02255a0d68b9..53aeed040eb4 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1234,65 +1234,6 @@ static void fixup_discard_if_not_supported(struct request_queue *q)
 	}
 }
 
-static void decide_on_write_same_support(struct drbd_device *device,
-			struct request_queue *q,
-			struct request_queue *b, struct o_qlim *o)
-{
-	struct drbd_peer_device *peer_device = first_peer_device(device);
-	struct drbd_connection *connection = peer_device->connection;
-	bool can_do = b ? b->limits.max_write_same_sectors : true;
-
-	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) {
-		can_do = false;
-		drbd_info(peer_device, "peer does not support WRITE_SAME\n");
-	}
-
-	if (o) {
-		/* logical block size; queue_logical_block_size(NULL) is 512 */
-		unsigned int peer_lbs = be32_to_cpu(o->logical_block_size);
-		unsigned int me_lbs_b = queue_logical_block_size(b);
-		unsigned int me_lbs = queue_logical_block_size(q);
-
-		if (me_lbs_b != me_lbs) {
-			drbd_warn(device,
-				"logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n",
-				me_lbs, me_lbs_b);
-			/* rather disable write same than trigger some BUG_ON later in the scsi layer. */
-			can_do = false;
-		}
-		if (me_lbs_b != peer_lbs) {
-			drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n",
-				me_lbs, peer_lbs);
-			if (can_do) {
-				drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n");
-				can_do = false;
-			}
-			me_lbs = max(me_lbs, me_lbs_b);
-			/* We cannot change the logical block size of an in-use queue.
-			 * We can only hope that access happens to be properly aligned.
-			 * If not, the peer will likely produce an IO error, and detach. */
-			if (peer_lbs > me_lbs) {
-				if (device->state.role != R_PRIMARY) {
-					blk_queue_logical_block_size(q, peer_lbs);
-					drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs);
-				} else {
-					drbd_warn(peer_device,
-						"current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n",
-						me_lbs, peer_lbs);
-				}
-			}
-		}
-		if (can_do && !o->write_same_capable) {
-			/* If we introduce an open-coded write-same loop on the receiving side,
-			 * the peer would present itself as "capable". */
-			drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n");
-			can_do = false;
-		}
-	}
-
-	blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0);
-}
-
 static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
 				   unsigned int max_bio_size, struct o_qlim *o)
 {
@@ -1321,7 +1262,6 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
 	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
 	blk_queue_segment_boundary(q, PAGE_SIZE-1);
 	decide_on_discard_support(device, q, b, discard_zeroes_if_aligned);
-	decide_on_write_same_support(device, q, b, o);
 
 	if (b) {
 		blk_queue_stack_limits(q, b);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 1b0a2be24f39..980d53793007 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1459,18 +1459,6 @@ static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer
 	drbd_endio_write_sec_final(peer_req);
 }
 
-static void drbd_issue_peer_wsame(struct drbd_device *device,
-				  struct drbd_peer_request *peer_req)
-{
-	struct block_device *bdev = device->ldev->backing_bdev;
-	sector_t s = peer_req->i.sector;
-	sector_t nr = peer_req->i.size >> 9;
-	if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
-		peer_req->flags |= EE_WAS_ERROR;
-	drbd_endio_write_sec_final(peer_req);
-}
-
-
 /**
  * drbd_submit_peer_request()
  * @device:	DRBD device.
@@ -1508,7 +1496,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
 	 * Correctness first, performance later.  Next step is to code an
 	 * asynchronous variant of the same.
 	 */
-	if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
+	if (peer_req->flags & EE_IS_TRIM) {
 		/* wait for all pending IO completions, before we start
 		 * zeroing things out. */
 		conn_wait_active_ee_empty(peer_req->peer_device->connection);
@@ -1527,8 +1515,6 @@ int drbd_submit_peer_request(struct drbd_device *device,
 
 		if (peer_req->flags & EE_IS_TRIM)
 			drbd_issue_peer_discard(device, peer_req);
-		else /* EE_WRITE_SAME */
-			drbd_issue_peer_wsame(device, peer_req);
 		return 0;
 	}
 
@@ -1723,7 +1709,6 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 	void *dig_vv = peer_device->connection->int_dig_vv;
 	unsigned long *data;
 	struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
-	struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
 
 	digest_size = 0;
 	if (!trim && peer_device->connection->peer_integrity_tfm) {
@@ -1738,29 +1723,17 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 		data_size -= digest_size;
 	}
 
-	/* assume request_size == data_size, but special case trim and wsame. */
+	/* assume request_size == data_size, but special case trim. */
 	ds = data_size;
 	if (trim) {
 		if (!expect(data_size == 0))
 			return NULL;
 		ds = be32_to_cpu(trim->size);
-	} else if (wsame) {
-		if (data_size != queue_logical_block_size(device->rq_queue)) {
-			drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
-				data_size, queue_logical_block_size(device->rq_queue));
-			return NULL;
-		}
-		if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
-			drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
-				data_size, bdev_logical_block_size(device->ldev->backing_bdev));
-			return NULL;
-		}
-		ds = be32_to_cpu(wsame->size);
 	}
 
 	if (!expect(IS_ALIGNED(ds, 512)))
 		return NULL;
-	if (trim || wsame) {
+	if (trim) {
 		if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
 			return NULL;
 	} else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
@@ -1788,8 +1761,6 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 		peer_req->flags |= EE_IS_TRIM;
 		return peer_req;
 	}
-	if (wsame)
-		peer_req->flags |= EE_WRITE_SAME;
 
 	/* receive payload size bytes into page chain */
 	ds = data_size;
@@ -2545,7 +2516,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	 * we wait for all pending requests, respectively wait for
 	 * active_ee to become empty in drbd_submit_peer_request();
 	 * better not add ourselves here. */
-	if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
+	if ((peer_req->flags & EE_IS_TRIM) == 0)
 		list_add_tail(&peer_req->w.list, &device->active_ee);
 	spin_unlock_irq(&device->resource->req_lock);
 
@@ -4869,7 +4840,6 @@ static struct data_cmd drbd_cmd_handler[] = {
 	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
 	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
 	[P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
-	[P_WSAME]	    = { 1, sizeof(struct p_wsame), receive_Data },
 };
 
 static void drbdd(struct drbd_connection *connection)
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index b5730e17b455..3d3fba937a92 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -58,7 +58,6 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
 
 	drbd_req_make_private_bio(req, bio_src);
 	req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
-		      | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
 		      | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_UNMAP : 0)
 		      | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
 	req->device = device;
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 1afcb4e02d8d..9fad03491250 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -330,10 +330,6 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest)
 		sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
 		ahash_request_set_crypt(req, &sg, NULL, sg.length);
 		crypto_ahash_update(req);
-		/* REQ_OP_WRITE_SAME has only one segment,
-		 * checksum the payload only once. */
-		if (bio_op(bio) == REQ_OP_WRITE_SAME)
-			break;
 	}
 	ahash_request_set_crypt(req, NULL, digest, 0);
 	crypto_ahash_final(req);
-- 
2.11.0

^ permalink raw reply related

* RFC: remove REQ_OP_WRITE_SAME
From: Christoph Hellwig @ 2017-04-10 16:07 UTC (permalink / raw)
  To: axboe, martin.petersen, philipp.reisner, lars.ellenberg,
	target-devel
  Cc: linux-block, linux-scsi, drbd-dev, dm-devel

Now that we are using REQ_OP_WRITE_ZEROES for all zeroing needs in the
kernel there is very little use left for REQ_OP_WRITE_SAME.  We only
have two callers left, and both just export optional protocol features
to remote systems: DRBD and the target code.

Do we have any major users of those?  If not, removing it will clean up
a few warts in the block layer.

    git://git.infradead.org/users/hch/block.git delete-write-same

Gitweb:

    http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/delete-write-same

^ permalink raw reply

* [PATCH 3/3] blk-mq: unify hctx delay_work and run_work
From: Jens Axboe @ 2017-04-10 15:54 UTC (permalink / raw)
  To: linux-block; +Cc: osandov, hch, bart.vanassche, Jens Axboe
In-Reply-To: <1491839696-24783-1-git-send-email-axboe@fb.com>

The only difference between ->run_work and ->delay_work is that
the latter is used to defer running a queue. This is done by
marking the queue stopped, and scheduling ->delay_work to run
sometime in the future. While the queue is stopped, direct runs
or runs through ->run_work will not run the queue.

If we combine the handlers, then we need to handle two things:

1) If a delayed/stopped run is scheduled, then we should not run
   the queue before that has been completed.
2) If a queue is delayed/stopped, the handler needs to restart
   the queue. Normally a run of a queue with the stopped bit set
   would be a no-op.

Case 1 is handled by modifying a currently pending queue run
to the deadline set by the caller of blk_mq_delay_queue().
Subsequent attempts to queue a queue run will find the work
item already pending, and direct runs will see a stopped queue
as before.

Case 2 is handled by adding a new bit, BLK_MQ_S_START_ON_RUN,
that tells the work handler that it should clear a stopped
queue and run the handler.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  4 +---
 block/blk-mq.c         | 34 ++++++++++++++++++++++------------
 include/linux/blk-mq.h |  3 +--
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index bffb8640346b..4f0104afa848 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -268,10 +268,8 @@ void blk_sync_queue(struct request_queue *q)
 		struct blk_mq_hw_ctx *hctx;
 		int i;
 
-		queue_for_each_hw_ctx(q, hctx, i) {
+		queue_for_each_hw_ctx(q, hctx, i)
 			cancel_delayed_work_sync(&hctx->run_work);
-			cancel_delayed_work_sync(&hctx->delay_work);
-		}
 	} else {
 		cancel_delayed_work_sync(&q->delay_work);
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7afba6ab5a96..e97ed8e7f359 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1223,7 +1223,6 @@ EXPORT_SYMBOL(blk_mq_queue_stopped);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	cancel_delayed_work(&hctx->run_work);
-	cancel_delayed_work(&hctx->delay_work);
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -1281,27 +1280,39 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx;
 
 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
-	__blk_mq_run_hw_queue(hctx);
-}
 
-static void blk_mq_delay_work_fn(struct work_struct *work)
-{
-	struct blk_mq_hw_ctx *hctx;
+	/*
+	 * If we are stopped, don't run the queue. The exception is if
+	 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
+	 * the STOPPED bit and run it.
+	 */
+	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
+		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
+			return;
 
-	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
+		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+	}
 
-	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
-		__blk_mq_run_hw_queue(hctx);
+	__blk_mq_run_hw_queue(hctx);
 }
 
+
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
 	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
 		return;
 
+	/*
+	 * Stop the hw queue, then modify currently delayed work.
+	 * This should prevent us from running the queue prematurely.
+	 * Mark the queue as auto-clearing STOPPED when it runs.
+	 */
 	blk_mq_stop_hw_queue(hctx);
-	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-			&hctx->delay_work, msecs_to_jiffies(msecs));
+	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+					&hctx->run_work,
+					msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1886,7 +1897,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
 		node = hctx->numa_node = set->numa_node;
 
 	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
-	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
 	hctx->queue = q;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2b4573a9ccf4..7a114b7b943c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,8 +51,6 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
-	struct delayed_work	delay_work;
-
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
 
@@ -160,6 +158,7 @@ enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 	BLK_MQ_S_TAG_WAITING	= 3,
+	BLK_MQ_S_START_ON_RUN	= 4,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH 0/3] blk-mq: unify hardware queue run handlers
From: Jens Axboe @ 2017-04-10 15:54 UTC (permalink / raw)
  To: linux-block; +Cc: osandov, hch, bart.vanassche

We currently have three of them: one normal work queue item, and two
delayed work queue items. The two delayed items differ only in that one
of them runs the queue only if it was previously stopped. The
non-delayed one is identical to the delayed variant that does not check
for a stopped queue.

Sending this out for early review, as I'll be heading on vacation
shortly. This is untested, just compiled.

This shrinks the size of a hardware queue from 832 bytes (13 cachelines)
to 704 bytes (11 cachelines) on my setup. That's quite a substantial
win.

Patches are against my 4.12 branch.

-- 
Jens Axboe

^ permalink raw reply

* [PATCH 2/3] block: add kblockd_mod_delayed_work_on()
From: Jens Axboe @ 2017-04-10 15:54 UTC (permalink / raw)
  To: linux-block; +Cc: osandov, hch, bart.vanassche, Jens Axboe
In-Reply-To: <1491839696-24783-1-git-send-email-axboe@fb.com>

This modifies (or adds, if not currently pending) an existing
delayed work item.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 7 +++++++
 include/linux/blkdev.h | 1 +
 2 files changed, 8 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index d58541e4dc7b..bffb8640346b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3135,6 +3135,13 @@ int kblockd_schedule_work_on(int cpu, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work_on);
 
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
+				unsigned long delay)
+{
+	return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
+
 int kblockd_schedule_delayed_work(struct delayed_work *dwork,
 				  unsigned long delay)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ec993573e0a8..71b978dedbbc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1699,6 +1699,7 @@ int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_work_on(int cpu, struct work_struct *work);
 int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
 int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
 /*
-- 
2.7.4

^ permalink raw reply related

* [PATCH 1/3] blk-mq: unify hctx delayed_run_work and run_work
From: Jens Axboe @ 2017-04-10 15:54 UTC (permalink / raw)
  To: linux-block; +Cc: osandov, hch, bart.vanassche, Jens Axboe
In-Reply-To: <1491839696-24783-1-git-send-email-axboe@fb.com>

They serve the exact same purpose. Get rid of the non-delayed
work variant, and just run it without delay for the normal case.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  2 +-
 block/blk-mq.c         | 27 ++++++---------------------
 include/linux/blk-mq.h |  3 +--
 3 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8654aa0cef6d..d58541e4dc7b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -269,7 +269,7 @@ void blk_sync_queue(struct request_queue *q)
 		int i;
 
 		queue_for_each_hw_ctx(q, hctx, i) {
-			cancel_work_sync(&hctx->run_work);
+			cancel_delayed_work_sync(&hctx->run_work);
 			cancel_delayed_work_sync(&hctx->delay_work);
 		}
 	} else {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e2ef7b460924..7afba6ab5a96 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1168,13 +1168,9 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
 		put_cpu();
 	}
 
-	if (msecs == 0)
-		kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx),
-					 &hctx->run_work);
-	else
-		kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-						 &hctx->delayed_run_work,
-						 msecs_to_jiffies(msecs));
+	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+					 &hctx->run_work,
+					 msecs_to_jiffies(msecs));
 }
 
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1226,7 +1222,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-	cancel_work(&hctx->run_work);
+	cancel_delayed_work(&hctx->run_work);
 	cancel_delayed_work(&hctx->delay_work);
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
@@ -1284,17 +1280,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 {
 	struct blk_mq_hw_ctx *hctx;
 
-	hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
-
-	__blk_mq_run_hw_queue(hctx);
-}
-
-static void blk_mq_delayed_run_work_fn(struct work_struct *work)
-{
-	struct blk_mq_hw_ctx *hctx;
-
-	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work);
-
+	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
 	__blk_mq_run_hw_queue(hctx);
 }
 
@@ -1899,8 +1885,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (node == NUMA_NO_NODE)
 		node = hctx->numa_node = set->numa_node;
 
-	INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
-	INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn);
+	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
 	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d75de612845d..2b4573a9ccf4 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -15,7 +15,7 @@ struct blk_mq_hw_ctx {
 		unsigned long		state;		/* BLK_MQ_S_* flags */
 	} ____cacheline_aligned_in_smp;
 
-	struct work_struct	run_work;
+	struct delayed_work	run_work;
 	cpumask_var_t		cpumask;
 	int			next_cpu;
 	int			next_cpu_batch;
@@ -51,7 +51,6 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
-	struct delayed_work	delayed_run_work;
 	struct delayed_work	delay_work;
 
 	struct hlist_node	cpuhp_dead;
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH v3] lightnvm: physical block device (pblk) target
From: Bart Van Assche @ 2017-04-10 15:55 UTC (permalink / raw)
  To: jg@lightnvm.io
  Cc: mb@lightnvm.io, linux-kernel@vger.kernel.org,
	linux-block@vger.kernel.org
In-Reply-To: <E42A544D-D09C-4DDB-800A-9FCED4CBBD28@lightnvm.io>

On Sun, 2017-04-09 at 11:15 +0200, Javier González wrote:
> On 8 Apr 2017, at 22.56, Bart Van Assche <bart.vanassche@sandisk.com> wrote:
> > On 04/07/17 11:50, Javier González wrote:
> struct ppa_addr, which is the physical address format is not affected,
> but pblk's internal L2P address representation (pblk_addr) is. You can
> see that the type either represents struct ppa_addr or ppa_addr_32. How
> would you define a type that can either be u64 or u32 with different bit
> offsets at run-time? Note that address conversions to this type is in
> the fast path and this format allows us to only use bit shifts.

Selecting the appropriate representation at run-time would require passing
pblk_addr by reference instead of by value to any function that expects a
pblk_addr. It would also require having two versions of every data structure
that depends on pblk_addr and using casts to convert to the appropriate
version. However, this approach is probably only worth implementing if
it doesn't introduce too much additional complexity.

> > > +#ifdef CONFIG_NVM_DEBUG
> > > +	atomic_add(nr_entries, &pblk->inflight_writes);
> > > +	atomic_add(nr_entries, &pblk->req_writes);
> > > +#endif
> >
> > Has it been considered to use the "static key" feature such that
> > consistency checks can be enabled at run-time instead of having to
> > rebuild the kernel to enable CONFIG_NVM_DEBUG?
>
> I haven't considered it. I'll look into it. I would like to have this
> counters and the corresponding sysfs entry only available on debug mode
> since it allows us to have a good picture of the FTL state.

If there are sysfs entries that depend on CONFIG_NVM_DEBUG then the static
key mechanism is probably not a good alternative for CONFIG_NVM_DEBUG.

> > Has it been considered to add support for keeping a subset of the L2P
> > translation table in memory instead of keeping it in memory in its entirety?
>
> Yes. L2P caching is on our roadmap and will be included in the future.

That's great. This will also help with reducing the time between discovery of
a lightnvm device and the time at which I/O can start since the L2P table must
be available before I/O can start.

Bart.

^ permalink raw reply

* Re: bfq-mq performance comparison to cfq
From: Bart Van Assche @ 2017-04-10 15:15 UTC (permalink / raw)
  To: aherrmann@suse.com, paolo.valente@linaro.org
  Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org,
	axboe@kernel.dk
In-Reply-To: <82BCEB46-8D05-42DA-AE06-3426895A7842@linaro.org>

On Mon, 2017-04-10 at 11:55 +0200, Paolo Valente wrote:
> That said, if you do always want maximum throughput, even at the
> expense of latency, then just switch off low-latency heuristics, i.e.,
> set low_latency to 0.  Depending on the device, setting slice_idle to
> 0 may help a lot too (as well as with CFQ).  If the throughput is
> still low also after forcing BFQ to an only-throughput mode, then you
> hit some bug, and I'll have a little more work to do ...

Hello Paolo,

Has it been considered to make applications tell the I/O scheduler
whether to optimize for latency or for throughput? It shouldn't be that
hard for window managers and shells to figure out whether or not a new
application that is being started is interactive or not. This would
require a mechanism that allows applications to provide such information
to the I/O scheduler. Wouldn't that be a better approach than the I/O
scheduler trying to guess whether or not an application is an interactive
application?

Bart.

^ permalink raw reply

* Re: [PATCH 6/8] nowait aio: ext4
From: Jan Kara @ 2017-04-10 15:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Goldwyn Rodrigues, linux-fsdevel, jack, linux-block,
	linux-btrfs, linux-ext4, linux-xfs, sagi, avi, axboe, linux-api,
	willy, tom.leiming, Goldwyn Rodrigues
In-Reply-To: <20170410143943.GA2930@infradead.org>

On Mon 10-04-17 07:39:43, Christoph Hellwig wrote:
> On Mon, Apr 10, 2017 at 02:37:50PM +0200, Jan Kara wrote:
> > I don't understand here. Do you want that all filesystems support NOWAIT
> > direct IO?
> 
> No.  Per-file_system_type is way too coarse grained.  All feature flags
> need to be per-file_operation at least for cases like ext4 with or
> without extents (or journal), XFS v4 vs v5, different NFS versions, etc.

Ah, I see your point now. Thanks for patience. I think we could make this
work by making generic_file_write/read_iter() refuse NOWAIT IO with
EOPNOTSUPP and then only modify those few filesystems that implement their
own iter helpers and will not initially support NOWAIT IO. Sounds easy
enough.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v4 4/6] blk-mq: Introduce blk_mq_delay_run_hw_queue()
From: Jens Axboe @ 2017-04-10 15:02 UTC (permalink / raw)
  To: Christoph Hellwig, Bart Van Assche
  Cc: linux-block, linux-scsi, Hannes Reinecke, Long Li,
	K . Y . Srinivasan
In-Reply-To: <20170410071219.GD5559@lst.de>

On 04/10/2017 01:12 AM, Christoph Hellwig wrote:
>> +	if (msecs == 0)
>> +		kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx),
>> +					 &hctx->run_work);
>> +	else
>> +		kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
>> +						 &hctx->delayed_run_work,
>> +						 msecs_to_jiffies(msecs));
>> +}
> 
> I'd rather make run_work a delayed_work (again) and use
> kblockd_schedule_delayed_work_on with a timeout of zero for the immediate
> run case instead of having two competing work structs.

Yeah that's a good point, it'd have to be an incremental patch at this
point though. Also note that blk_mq_stop_hw_queue() isn't currently
canceling the new ->delayed_run_work, that looks like a bug.

And looking at it, right now we have 3 (three!) work items in the
hardware queue. The two delayed items differ in that one of them only
runs the queue if it was previously stopped, that's it. The non-delayed one
is identical to the non stopped checking delayed variant.

I'll send out a patch.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH 6/8] nowait aio: ext4
From: Christoph Hellwig @ 2017-04-10 14:39 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Goldwyn Rodrigues, linux-fsdevel, jack,
	linux-block, linux-btrfs, linux-ext4, linux-xfs, sagi, avi, axboe,
	linux-api, willy, tom.leiming, Goldwyn Rodrigues
In-Reply-To: <20170410123750.GE3224@quack2.suse.cz>

On Mon, Apr 10, 2017 at 02:37:50PM +0200, Jan Kara wrote:
> I don't understand here. Do you want that all filesystems support NOWAIT
> direct IO?

No.  Per-file_system_type is way too coarse grained.  All feature flags
need to be per-file_operation at least for cases like ext4 with or
without extents (or journal), XFS v4 vs v5, different NFS versions, etc.

For RWF_* each file operation simply declares if the feature is
supported not by rejecting unknown ones.  FIEMAP does the same as do
a few other interfaces.

^ permalink raw reply

* Re: [PATCH 6/8] nowait aio: ext4
From: Jan Kara @ 2017-04-10 12:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Goldwyn Rodrigues, Jan Kara, linux-fsdevel, jack, linux-block,
	linux-btrfs, linux-ext4, linux-xfs, sagi, avi, axboe, linux-api,
	willy, tom.leiming, Goldwyn Rodrigues
In-Reply-To: <20170410074539.GA18250@infradead.org>

On Mon 10-04-17 00:45:39, Christoph Hellwig wrote:
> On Tue, Apr 04, 2017 at 01:41:09PM -0500, Goldwyn Rodrigues wrote:
> > I am working on incorporating RWF_* flags. However, I am not sure how
> > RWF_* flags would get rid of FS_NOWAIT/FS_NOWAIT_IO. Since most of
> > "blocking" information is with the filesystem, it is a per-filesystem
> > flag to block out (EOPNOTSUPP) the filesystems which do not support it.
> 
> You need to check the flag in the actual read/write methods as the
> support for features on Linux is not a per-file_system_type thing.

I don't understand here. Do you want that all filesystems support NOWAIT
direct IO? IMO that's not realistic and also not necessary. In reality
different filesystems support different sets of operations and we have a
precedent for that in various fallocate operations, rename exchange, or
O_TMPFILE support...

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: bfq-mq performance comparison to cfq
From: Paolo Valente @ 2017-04-10  9:55 UTC (permalink / raw)
  To: Andreas Herrmann; +Cc: Jens Axboe, linux-block, linux-kernel
In-Reply-To: <20170410090538.GA11473@suselix.suse.de>


> Il giorno 10 apr 2017, alle ore 11:05, Andreas Herrmann <aherrmann@suse.com> ha scritto:
>
> Hi Paolo,
>
> I've looked at your WIP branch as of 4.11.0-bfq-mq-rc4-00155-gbce0818
> and did some fio tests to compare the behavior to CFQ.
>
> My understanding is that bfq-mq is supposed to be merged sooner or
> later and then it will be the only reasonable I/O scheduler with
> blk-mq for rotational devices. Hence I think it is interesting to see
> what to expect performance-wise in comparison to CFQ which is usually
> used for such devices with the legacy block layer.
>
> I've just done simple tests iterating over number of jobs (1-8 as the
> test system had 8 CPUs) for all (random/sequential) read/write
> patterns. Fixed set of fio parameters used were '-size=5G
> --group_reporting --ioengine=libaio --direct=1 --iodepth=1
> --runtime=10'.
>
> I've done 10 runs for each such configuration. The device used was an
> older SAMSUNG HD103SJ 1TB disk, SATA attached. Results that stick out
> the most are those for sequential reads and sequential writes:
>
> * sequential reads
>  [0] - cfq, intel_pstate driver, powersave governor
>  [1] - bfq_mq, intel_pstate driver, powersave governor
>
> jo             [0]               [1]
> bs       mean     stddev    mean       stddev
>  1 & 17060.300 &  77.090 & 17657.500 &  69.602
>  2 & 15318.200 &  28.817 & 10678.000 & 279.070
>  3 & 15403.200 &  42.762 &  9874.600 &  93.436
>  4 & 14521.200 & 624.111 &  9918.700 & 226.425
>  5 & 13893.900 & 144.354 &  9485.000 & 109.291
>  6 & 13065.300 & 180.608 &  9419.800 &  75.043
>  7 & 12169.600 &  95.422 &  9863.800 & 227.662
>  8 & 12422.200 & 215.535 & 15335.300 & 245.764
>
> * sequential writes
>  [0] - cfq, intel_pstate driver, powersave governor
>  [1] - bfq_mq, intel_pstate driver, powersave governor
>
> jo            [0]               [1]
> bs      mean     stddev    mean       stddev
>  1 & 14171.300 & 80.796 & 14392.500 & 182.587
>  2 & 13520.000 & 88.967 &  9565.400 & 119.400
>  3 & 13396.100 & 44.936 &  9284.000 &  25.122
>  4 & 13139.800 & 62.325 &  8846.600 &  45.926
>  5 & 12942.400 & 45.729 &  8568.700 &  35.852
>  6 & 12650.600 & 41.283 &  8275.500 & 199.273
>  7 & 12475.900 & 43.565 &  8252.200 &  33.145
>  8 & 12307.200 & 43.594 & 13617.500 & 127.773
>
> With performance instead of powersave governor results were
> (expectedly) higher but the pattern was the same -- bfq-mq shows a
> "dent" for tests with 2-7 fio jobs. At the moment I have no
> explanation for this behavior.
>

I have :)

BFQ, by default, is configured to privilege latency over throughput.
In this respect, as various people and I happened to discuss a few
times, even on these mailing lists, the only way to provide strong
low-latency guarantees, at the moment, is through device idling.  The
throughput loss you see is very likely to be the consequence of that
idling.

Why does the throughput go back up at eight jobs?  Because, if many
processes are born in a very short time interval, then BFQ understands
that some multi-job task is being started.  And these parallel tasks
usually prefer overall high throughput to single-process low latency.
Then, BFQ does not idle the device for these processes.

That said, if you do always want maximum throughput, even at the
expense of latency, then just switch off low-latency heuristics, i.e.,
set low_latency to 0.  Depending on the device, setting slice_idle to
0 may help a lot too (as well as with CFQ).  If the throughput is
still low also after forcing BFQ to an only-throughput mode, then you
hit some bug, and I'll have a little more work to do ...

Thanks,
Paolo

> Regards,
> Andreas

^ permalink raw reply

* bfq-mq performance comparison to cfq
From: Andreas Herrmann @ 2017-04-10  9:05 UTC (permalink / raw)
  To: Paolo Valente; +Cc: Jens Axboe, linux-block, linux-kernel

Hi Paolo,

I've looked at your WIP branch as of 4.11.0-bfq-mq-rc4-00155-gbce0818
and did some fio tests to compare the behavior to CFQ.

My understanding is that bfq-mq is supposed to be merged sooner or
later and then it will be the only reasonable I/O scheduler with
blk-mq for rotational devices. Hence I think it is interesting to see
what to expect performance-wise in comparison to CFQ which is usually
used for such devices with the legacy block layer.

I've just done simple tests iterating over number of jobs (1-8 as the
test system had 8 CPUs) for all (random/sequential) read/write
patterns. Fixed set of fio parameters used were '-size=5G
--group_reporting --ioengine=libaio --direct=1 --iodepth=1
--runtime=10'.

I've done 10 runs for each such configuration. The device used was an
older SAMSUNG HD103SJ 1TB disk, SATA attached. Results that stick out
the most are those for sequential reads and sequential writes:

 * sequential reads
  [0] - cfq, intel_pstate driver, powersave governor
  [1] - bfq_mq, intel_pstate driver, powersave governor

 jo             [0]               [1]
 bs       mean     stddev    mean       stddev
  1 & 17060.300 &  77.090 & 17657.500 &  69.602
  2 & 15318.200 &  28.817 & 10678.000 & 279.070
  3 & 15403.200 &  42.762 &  9874.600 &  93.436
  4 & 14521.200 & 624.111 &  9918.700 & 226.425
  5 & 13893.900 & 144.354 &  9485.000 & 109.291
  6 & 13065.300 & 180.608 &  9419.800 &  75.043
  7 & 12169.600 &  95.422 &  9863.800 & 227.662
  8 & 12422.200 & 215.535 & 15335.300 & 245.764

 * sequential writes
  [0] - cfq, intel_pstate driver, powersave governor
  [1] - bfq_mq, intel_pstate driver, powersave governor

 jo            [0]               [1]
 bs      mean     stddev    mean       stddev
  1 & 14171.300 & 80.796 & 14392.500 & 182.587
  2 & 13520.000 & 88.967 &  9565.400 & 119.400
  3 & 13396.100 & 44.936 &  9284.000 &  25.122
  4 & 13139.800 & 62.325 &  8846.600 &  45.926
  5 & 12942.400 & 45.729 &  8568.700 &  35.852
  6 & 12650.600 & 41.283 &  8275.500 & 199.273
  7 & 12475.900 & 43.565 &  8252.200 &  33.145
  8 & 12307.200 & 43.594 & 13617.500 & 127.773

With performance instead of powersave governor results were
(expectedly) higher but the pattern was the same -- bfq-mq shows a
"dent" for tests with 2-7 fio jobs. At the moment I have no
explanation for this behavior.

Regards,
Andreas

^ permalink raw reply

* Re: [PATCH] lightnvm: don't check for failure from mempool_alloc()
From: Matias Bjørling @ 2017-04-10  8:02 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-block, linux-kernel
In-Reply-To: <87k26txbfa.fsf@notabene.neil.brown.name>

On 04/10/2017 04:07 AM, NeilBrown wrote:
>
> mempool_alloc() cannot fail if the gfp flags allow it to
> sleep, and both GFP_KERNEL and GFP_NOIO allows for sleeping.
>
> So rrpc_move_valid_pages() and rrpc_make_rq() don't need to
> test the return value.
>
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  drivers/lightnvm/rrpc.c | 9 ---------
>  1 file changed, 9 deletions(-)
>
> diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
> index e00b1d7b976f..34f5f1cc9452 100644
> --- a/drivers/lightnvm/rrpc.c
> +++ b/drivers/lightnvm/rrpc.c
> @@ -318,10 +318,6 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
>  	}
>
>  	page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
> -	if (!page) {
> -		bio_put(bio);
> -		return -ENOMEM;
> -	}
>
>  	while ((slot = find_first_zero_bit(rblk->invalid_pages,
>  					    nr_sec_per_blk)) < nr_sec_per_blk) {
> @@ -1007,11 +1003,6 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
>  	}
>
>  	rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
> -	if (!rqd) {
> -		pr_err_ratelimited("rrpc: not able to queue bio.");
> -		bio_io_error(bio);
> -		return BLK_QC_T_NONE;
> -	}
>  	memset(rqd, 0, sizeof(struct nvm_rq));
>
>  	err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE);
>

This is great! Thanks, Neil.  Applied for 4.12.

^ permalink raw reply

* Re: [PATCH 6/8] nowait aio: ext4
From: Christoph Hellwig @ 2017-04-10  7:45 UTC (permalink / raw)
  To: Goldwyn Rodrigues
  Cc: Christoph Hellwig, Jan Kara, linux-fsdevel, jack, linux-block,
	linux-btrfs, linux-ext4, linux-xfs, sagi, avi, axboe, linux-api,
	willy, tom.leiming, Goldwyn Rodrigues
In-Reply-To: <d6dac6ec-c71c-0082-972f-61e7299ed418@suse.de>

On Tue, Apr 04, 2017 at 01:41:09PM -0500, Goldwyn Rodrigues wrote:
> I am working on incorporating RWF_* flags. However, I am not sure how
> RWF_* flags would get rid of FS_NOWAIT/FS_NOWAIT_IO. Since most of
> "blocking" information is with the filesystem, it is a per-filesystem
> flag to block out (EOPNOTSUPP) the filesystems which do not support it.

You need to check the flag in the actual read/write methods as the
support for features on Linux is not a per-file_system_type thing.

^ permalink raw reply

* Re: [PATCH v4 5/6] scsi: Avoid that SCSI queues get stuck
From: Christoph Hellwig @ 2017-04-10  7:12 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Jens Axboe, linux-block, linux-scsi, Martin K . Petersen,
	James Bottomley, Christoph Hellwig, Hannes Reinecke,
	Sagi Grimberg, Long Li, K . Y . Srinivasan
In-Reply-To: <20170407181654.27836-6-bart.vanassche@sandisk.com>

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply

* Re: [PATCH v4 4/6] blk-mq: Introduce blk_mq_delay_run_hw_queue()
From: Christoph Hellwig @ 2017-04-10  7:12 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Jens Axboe, linux-block, linux-scsi, Christoph Hellwig,
	Hannes Reinecke, Long Li, K . Y . Srinivasan
In-Reply-To: <20170407181654.27836-5-bart.vanassche@sandisk.com>

> +	if (msecs == 0)
> +		kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx),
> +					 &hctx->run_work);
> +	else
> +		kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
> +						 &hctx->delayed_run_work,
> +						 msecs_to_jiffies(msecs));
> +}

I'd rather make run_work a delayed_work (again) and use
kblockd_schedule_delayed_work_on with a timeout of zero for the immediate
run case instead of having two competing work structs.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox