* [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES
@ 2016-11-17 22:17 Chaitanya Kulkarni
2016-11-18 2:22 ` Martin K. Petersen
` (2 more replies)
0 siblings, 3 replies; 16+ messages in thread
From: Chaitanya Kulkarni @ 2016-11-17 22:17 UTC (permalink / raw)
To: axboe
Cc: martin.petersen, keith.busch, linux-nvme, linux-block,
Chaitanya Kulkarni
From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
This adds a new block layer operation to zero out a range of
LBAs. This allows implementing zeroing for devices that don't use
either discard with a predictable zero pattern or WRITE SAME of zeroes.
The prominent example of that is NVMe with the Write Zeroes command,
but in the future this should also help with improving the way
zeroing discards work.
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
---
block/bio.c | 1 +
block/blk-core.c | 4 ++++
block/blk-lib.c | 58 +++++++++++++++++++++++++++++++++++++++++++++--
block/blk-merge.c | 17 ++++++++++----
block/blk-settings.c | 15 ++++++++++++
block/blk-wbt.c | 5 ++--
include/linux/bio.h | 25 +++++++++++---------
include/linux/blk_types.h | 2 ++
include/linux/blkdev.h | 19 ++++++++++++++++
9 files changed, 127 insertions(+), 19 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 2cf6eba..39fa10a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -670,6 +670,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
break;
case REQ_OP_WRITE_SAME:
bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
diff --git a/block/blk-core.c b/block/blk-core.c
index 473dd69..f1cb1b1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1945,6 +1945,10 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
+ case REQ_OP_WRITE_ZEROES:
+ if (!bdev_write_zeroes(bio->bi_bdev))
+ goto not_supported;
+ break;
default:
break;
}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index bfb28b0..bad64bb 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -227,6 +227,55 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
EXPORT_SYMBOL(blkdev_issue_write_same);
/**
+ * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
+ * @bdev: blockdev to issue
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @biop: pointer to anchor bio
+ *
+ * Description:
+ * Generate and issue a number of REQ_OP_WRITE_ZEROES bios without data pages.
+ */
+static int __blkdev_issue_write_zeroes(struct block_device *bdev,
+ sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+ struct bio **biop)
+{
+ struct bio *bio = *biop;
+ unsigned int max_write_zeroes_sectors;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (!q)
+ return -ENXIO;
+
+ /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
+ max_write_zeroes_sectors = bdev_write_zeroes(bdev);
+
+ if (max_write_zeroes_sectors == 0)
+ return -EOPNOTSUPP;
+
+ while (nr_sects) {
+ bio = next_bio(bio, 0, gfp_mask);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+
+ if (nr_sects > max_write_zeroes_sectors) {
+ bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
+ nr_sects -= max_write_zeroes_sectors;
+ sector += max_write_zeroes_sectors;
+ } else {
+ bio->bi_iter.bi_size = nr_sects << 9;
+ nr_sects = 0;
+ }
+ cond_resched();
+ }
+
+ *biop = bio;
+ return 0;
+}
+
+/**
* __blkdev_issue_zeroout - generate number of zero filed write bios
* @bdev: blockdev to issue
* @sector: start sector
@@ -259,6 +308,11 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
goto out;
}
+ ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
+ biop);
+ if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ goto out;
+
ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
ZERO_PAGE(0), biop);
if (ret == 0 || (ret && ret != -EOPNOTSUPP))
@@ -304,8 +358,8 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
* the discard request fail, if the discard flag is not set, or if
* discard_zeroes_data is not supported, this function will resort to
* zeroing the blocks manually, thus provisioning (allocating,
- * anchoring) them. If the block device supports the WRITE SAME command
- * blkdev_issue_zeroout() will use it to optimize the process of
+ * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
+ * command(s), blkdev_issue_zeroout() will use it to optimize the process of
* clearing the block range. Otherwise the zeroing will be performed
* using regular WRITE calls.
*/
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fda6a12..cf2848c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
case REQ_OP_SECURE_ERASE:
split = blk_bio_discard_split(q, *bio, bs, &nsegs);
break;
+ case REQ_OP_WRITE_ZEROES:
+ split = NULL;
+ nsegs = (*bio)->bi_phys_segments;
+ break;
case REQ_OP_WRITE_SAME:
split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
break;
@@ -241,11 +245,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
* This should probably be returning 0, but blk_add_request_payload()
* (Christoph!!!!)
*/
- if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
- return 1;
-
- if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE_ZEROES:
return 1;
+ default:
+ break;
+ }
fbio = bio;
cluster = blk_queue_cluster(q);
@@ -416,6 +424,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
/*
* This is a hack - drivers should be neither modifying the
* biovec, nor relying on bi_vcnt - but because of
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c7ccabc..3d1a494b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
+ lim->max_write_zeroes_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
@@ -132,6 +133,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
+ lim->max_write_zeroes_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -300,6 +302,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
/**
+ * blk_queue_max_write_zeroes_sectors - set max sectors for a single
+ * write zeroes
+ * @q: the request queue for the device
+ * @max_write_zeroes_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+ unsigned int max_write_zeroes_sectors)
+{
+ q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+
+/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 9f97594..0e34740 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
const int op = bio_op(bio);
/*
- * If not a WRITE (or a discard), do nothing
+ * If not a WRITE (or a discard or write zeroes), do nothing
*/
- if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
+ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
+ op == REQ_OP_WRITE_ZEROES))
return false;
/*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d367cd3..491c7e9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -76,7 +76,8 @@ static inline bool bio_has_data(struct bio *bio)
if (bio &&
bio->bi_iter.bi_size &&
bio_op(bio) != REQ_OP_DISCARD &&
- bio_op(bio) != REQ_OP_SECURE_ERASE)
+ bio_op(bio) != REQ_OP_SECURE_ERASE &&
+ bio_op(bio) != REQ_OP_WRITE_ZEROES)
return true;
return false;
@@ -86,7 +87,8 @@ static inline bool bio_no_advance_iter(struct bio *bio)
{
return bio_op(bio) == REQ_OP_DISCARD ||
bio_op(bio) == REQ_OP_SECURE_ERASE ||
- bio_op(bio) == REQ_OP_WRITE_SAME;
+ bio_op(bio) == REQ_OP_WRITE_SAME ||
+ bio_op(bio) == REQ_OP_WRITE_ZEROES;
}
static inline bool bio_mergeable(struct bio *bio)
@@ -188,18 +190,19 @@ static inline unsigned bio_segments(struct bio *bio)
struct bvec_iter iter;
/*
- * We special case discard/write same, because they interpret bi_size
- * differently:
+ * We special case discard/write same/write zeroes, because they
+ * interpret bi_size differently:
*/
- if (bio_op(bio) == REQ_OP_DISCARD)
- return 1;
-
- if (bio_op(bio) == REQ_OP_SECURE_ERASE)
- return 1;
-
- if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE_ZEROES:
return 1;
+ default:
+ break;
+ }
bio_for_each_segment(bv, bio, iter)
segs++;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4d0044d..2b0aebf 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -159,6 +159,8 @@ enum req_opf {
REQ_OP_ZONE_RESET = 6,
/* write the same sector many times */
REQ_OP_WRITE_SAME = 7,
+ /* write the zero filled sector many times */
+ REQ_OP_WRITE_ZEROES = 8,
REQ_OP_LAST,
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 13b2f2a..f3ee040 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -323,6 +323,7 @@ struct queue_limits {
unsigned int max_discard_sectors;
unsigned int max_hw_discard_sectors;
unsigned int max_write_same_sectors;
+ unsigned int max_write_zeroes_sectors;
unsigned int discard_granularity;
unsigned int discard_alignment;
@@ -773,6 +774,9 @@ static inline bool rq_mergeable(struct request *rq)
if (req_op(rq) == REQ_OP_FLUSH)
return false;
+ if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+ return false;
+
if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
return false;
if (rq->rq_flags & RQF_NOMERGE_FLAGS)
@@ -1003,6 +1007,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
if (unlikely(op == REQ_OP_WRITE_SAME))
return q->limits.max_write_same_sectors;
+ if (unlikely(op == REQ_OP_WRITE_ZEROES))
+ return q->limits.max_write_zeroes_sectors;
+
return q->limits.max_sectors;
}
@@ -1106,6 +1113,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors);
extern void blk_queue_max_write_same_sectors(struct request_queue *q,
unsigned int max_write_same_sectors);
+extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+ unsigned int max_write_same_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
@@ -1474,6 +1483,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev)
return 0;
}
+static inline unsigned int bdev_write_zeroes(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q)
+ return q->limits.max_write_zeroes_sectors;
+
+ return 0;
+}
+
static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES
2016-11-17 22:17 [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
@ 2016-11-18 2:22 ` Martin K. Petersen
2016-11-18 7:46 ` Christoph Hellwig
2016-11-18 15:41 ` Keith Busch
2 siblings, 0 replies; 16+ messages in thread
From: Martin K. Petersen @ 2016-11-18 2:22 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: axboe, martin.petersen, keith.busch, linux-nvme, linux-block,
Chaitanya Kulkarni
>>>>> "Chaitanya" == Chaitanya Kulkarni <ckulkarnilinux@gmail.com> writes:
Chaitanya> This adds a new block layer operation to zero out a range of
Chaitanya> LBAs. This allows to implement zeroing for devices that don't
Chaitanya> use either discard with a predictable zero pattern or WRITE
Chaitanya> SAME of zeroes. The prominent example of that is NVMe with
Chaitanya> the Write Zeroes command, but in the future this should also
Chaitanya> help with improving the way zeroing discards work.
Looks good. Please also export the queue limit in blk-sysfs.c and create
a suitable entry in Documentation/ABI/testing/sysfs-block.
--
Martin K. Petersen Oracle Linux Engineering
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES
2016-11-17 22:17 [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
2016-11-18 2:22 ` Martin K. Petersen
@ 2016-11-18 7:46 ` Christoph Hellwig
2016-11-18 8:25 ` chaitany kulkarni
2016-11-18 15:41 ` Keith Busch
2 siblings, 1 reply; 16+ messages in thread
From: Christoph Hellwig @ 2016-11-18 7:46 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: axboe, martin.petersen, keith.busch, linux-nvme, linux-block,
Chaitanya Kulkarni
On Thu, Nov 17, 2016 at 02:17:11PM -0800, Chaitanya Kulkarni wrote:
> From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
>
> This adds a new block layer operation to zero out a range of
> LBAs. This allows to implement zeroing for devices that don't use
> either discard with a predictable zero pattern or WRITE SAME of zeroes.
> The prominent example of that is NVMe with the Write Zeroes command,
> but in the future this should also help with improving the way
> zeroing discards work.
>
> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
I think you'll need to resend the whole series so that nvme can set
the maximum discard sectors value.
> @@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
> const int op = bio_op(bio);
>
> /*
> - * If not a WRITE (or a discard), do nothing
> + * If not a WRITE (or a discard or write zeroes), do nothing
> */
> - if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
> + if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
> + op == REQ_OP_WRITE_ZEROES))
> return false;
Jens: should we really throttle for discard or write zeroes here?
Those aren't really writeback driven..
> +static inline unsigned int bdev_write_zeroes(struct block_device *bdev)
> +{
> + struct request_queue *q = bdev_get_queue(bdev);
> +
> + if (q)
> + return q->limits.max_write_zeroes_sectors;
> +
> + return 0;
If this returns a sector value I'd name it bdev_write_zeroes_sectors.
Otherwise this looks great.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES
2016-11-18 7:46 ` Christoph Hellwig
@ 2016-11-18 8:25 ` chaitany kulkarni
0 siblings, 0 replies; 16+ messages in thread
From: chaitany kulkarni @ 2016-11-18 8:25 UTC (permalink / raw)
To: Christoph Hellwig
Cc: axboe, Martin K. Petersen, Keith Busch, linux-nvme, linux-block,
Chaitanya Kulkarni
Sounds good, I'll update the whole series and resend it with v2 prefix.
On Thu, Nov 17, 2016 at 11:46 PM, Christoph Hellwig <hch@infradead.org> wrote:
> On Thu, Nov 17, 2016 at 02:17:11PM -0800, Chaitanya Kulkarni wrote:
>> From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
>>
>> This adds a new block layer operation to zero out a range of
>> LBAs. This allows to implement zeroing for devices that don't use
>> either discard with a predictable zero pattern or WRITE SAME of zeroes.
>> The prominent example of that is NVMe with the Write Zeroes command,
>> but in the future this should also help with improving the way
>> zeroing discards work.
>>
>> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
>
> I think you'll need to resend the whole series so that nvme can set
> the maximum discard sectors value.
>
>> @@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
>> const int op = bio_op(bio);
>>
>> /*
>> - * If not a WRITE (or a discard), do nothing
>> + * If not a WRITE (or a discard or write zeroes), do nothing
>> */
>> - if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
>> + if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
>> + op == REQ_OP_WRITE_ZEROES))
>> return false;
>
> Jens: should we really throttle for discard or write zeroes here?
> Those aren't really writeback driven..
>
>> +static inline unsigned int bdev_write_zeroes(struct block_device *bdev)
>> +{
>> + struct request_queue *q = bdev_get_queue(bdev);
>> +
>> + if (q)
>> + return q->limits.max_write_zeroes_sectors;
>> +
>> + return 0;
>
> If this returns a sector value I'd name it bdev_write_zeroes_sectors.
>
> Otherwise this looks great.
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES
2016-11-17 22:17 [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
2016-11-18 2:22 ` Martin K. Petersen
2016-11-18 7:46 ` Christoph Hellwig
@ 2016-11-18 15:41 ` Keith Busch
2 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2016-11-18 15:41 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: axboe, martin.petersen, linux-nvme, linux-block,
Chaitanya Kulkarni
On Thu, Nov 17, 2016 at 02:17:11PM -0800, Chaitanya Kulkarni wrote:
> From: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
>
> This adds a new block layer operation to zero out a range of
> LBAs. This allows to implement zeroing for devices that don't use
> either discard with a predictable zero pattern or WRITE SAME of zeroes.
> The prominent example of that is NVMe with the Write Zeroes command,
> but in the future this should also help with improving the way
> zeroing discards work.
>
> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
> ---
I think we also need to assign queue_limits for stacked devices in
blk_stack_limits. Otherwise, looks good.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout
@ 2016-11-30 20:28 Chaitanya Kulkarni
2016-11-30 20:28 ` [PATCH V2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
` (5 more replies)
0 siblings, 6 replies; 16+ messages in thread
From: Chaitanya Kulkarni @ 2016-11-30 20:28 UTC (permalink / raw)
To: axboe
Cc: martin.petersen, keith.busch, linux-nvme, linux-block,
Chaitanya Kulkarni
Similar to __blkdev_issue_discard this variant allows submitting
the final bio asynchronously and chaining multiple ranges
into a single completion.
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
---
block/blk-lib.c | 115 ++++++++++++++++++++++++++++++++++---------------
include/linux/blkdev.h | 3 ++
2 files changed, 84 insertions(+), 34 deletions(-)
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 18abda8..bfb28b0 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -137,24 +137,24 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
EXPORT_SYMBOL(blkdev_issue_discard);
/**
- * blkdev_issue_write_same - queue a write same operation
+ * __blkdev_issue_write_same - generate number of bios with same page
* @bdev: target blockdev
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @page: page containing data to write
+ * @biop: pointer to anchor bio
*
* Description:
- * Issue a write same request for the sectors in question.
+ * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page.
*/
-int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask,
- struct page *page)
+static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct page *page,
+ struct bio **biop)
{
struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_write_same_sectors;
- struct bio *bio = NULL;
- int ret = 0;
+ struct bio *bio = *biop;
sector_t bs_mask;
if (!q)
@@ -164,6 +164,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
+ if (!bdev_write_same(bdev))
+ return -EOPNOTSUPP;
+
/* Ensure that max_write_same_sectors doesn't overflow bi_size */
max_write_same_sectors = UINT_MAX >> 9;
@@ -185,32 +188,63 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
+ cond_resched();
}
- if (bio) {
+ *biop = bio;
+ return 0;
+}
+
+/**
+ * blkdev_issue_write_same - queue a write same operation
+ * @bdev: target blockdev
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @page: page containing data
+ *
+ * Description:
+ * Issue a write same request for the sectors in question.
+ */
+int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask,
+ struct page *page)
+{
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int ret;
+
+ blk_start_plug(&plug);
+ ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page,
+ &bio);
+ if (ret == 0 && bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
}
+ blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(blkdev_issue_write_same);
/**
- * blkdev_issue_zeroout - generate number of zero filed write bios
+ * __blkdev_issue_zeroout - generate number of zero filed write bios
* @bdev: blockdev to issue
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
+ * @biop: pointer to anchor bio
+ * @discard: discard flag
*
* Description:
* Generate and issue number of bios with zerofiled pages.
*/
-
-static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask)
+int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
+ bool discard)
{
int ret;
- struct bio *bio = NULL;
+ int bi_size = 0;
+ struct bio *bio = *biop;
unsigned int sz;
sector_t bs_mask;
@@ -218,6 +252,19 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
if ((sector | nr_sects) & bs_mask)
return -EINVAL;
+ if (discard) {
+ ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
+ BLKDEV_DISCARD_ZERO, biop);
+ if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ goto out;
+ }
+
+ ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
+ ZERO_PAGE(0), biop);
+ if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ goto out;
+
+ ret = 0;
while (nr_sects != 0) {
bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES),
gfp_mask);
@@ -227,21 +274,20 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
- ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
- nr_sects -= ret >> 9;
- sector += ret >> 9;
- if (ret < (sz << 9))
+ bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+ nr_sects -= bi_size >> 9;
+ sector += bi_size >> 9;
+ if (bi_size < (sz << 9))
break;
}
+ cond_resched();
}
- if (bio) {
- ret = submit_bio_wait(bio);
- bio_put(bio);
- return ret;
- }
- return 0;
+ *biop = bio;
+out:
+ return ret;
}
+EXPORT_SYMBOL(__blkdev_issue_zeroout);
/**
* blkdev_issue_zeroout - zero-fill a block range
@@ -263,21 +309,22 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
* clearing the block range. Otherwise the zeroing will be performed
* using regular WRITE calls.
*/
-
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, bool discard)
{
- if (discard) {
- if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
- BLKDEV_DISCARD_ZERO))
- return 0;
- }
+ int ret;
+ struct bio *bio = NULL;
+ struct blk_plug plug;
- if (bdev_write_same(bdev) &&
- blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
- ZERO_PAGE(0)) == 0)
- return 0;
+ blk_start_plug(&plug);
+ ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
+ &bio, discard);
+ if (ret == 0 && bio) {
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ }
+ blk_finish_plug(&plug);
- return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
+ return ret;
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 541fdd8..7e9d8a0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1269,6 +1269,9 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
struct bio **biop);
extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct page *page);
+extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
+ bool discard);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, bool discard);
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
--
1.8.3.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH V2 2/5] block: add support for REQ_OP_WRITE_ZEROES
2016-11-30 20:28 [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Chaitanya Kulkarni
@ 2016-11-30 20:28 ` Chaitanya Kulkarni
2016-12-01 10:00 ` Christoph Hellwig
2016-11-30 20:29 ` [PATCH V2 3/5] nvme.h: add Write Zeroes definitions Chaitanya Kulkarni
` (4 subsequent siblings)
5 siblings, 1 reply; 16+ messages in thread
From: Chaitanya Kulkarni @ 2016-11-30 20:28 UTC (permalink / raw)
To: axboe
Cc: martin.petersen, keith.busch, linux-nvme, linux-block,
Chaitanya Kulkarni
This adds a new block layer operation to zero out a range of
LBAs. This allows implementing zeroing for devices that don't use
either discard with a predictable zero pattern or WRITE SAME of zeroes.
The prominent example of that is NVMe with the Write Zeroes command,
but in the future, this should also help with improving the way
zeroing discards work. For this operation, a suitable entry is exported in
sysfs which indicates the maximum number of bytes allowed in one
write zeroes operation by the device.
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
---
Documentation/ABI/testing/sysfs-block | 13 ++++++++
block/bio.c | 1 +
block/blk-core.c | 4 +++
block/blk-lib.c | 58 +++++++++++++++++++++++++++++++++--
block/blk-merge.c | 17 +++++++---
block/blk-settings.c | 17 ++++++++++
block/blk-sysfs.c | 11 +++++++
block/blk-wbt.c | 5 +--
include/linux/bio.h | 25 ++++++++-------
include/linux/blk_types.h | 2 ++
include/linux/blkdev.h | 19 ++++++++++++
11 files changed, 153 insertions(+), 19 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index ee2d5cd..2da04ce 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -235,6 +235,19 @@ Description:
write_same_max_bytes is 0, write same is not supported
by the device.
+What: /sys/block/<disk>/queue/write_zeroes_max_bytes
+Date: November 2016
+Contact: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
+Description:
+ Devices that support the write zeroes operation allow a
+ single request to be issued to zero out a range of
+ contiguous blocks on storage without any payload
+ in the request. This can be used to optimize writing zeroes
+ to the devices. write_zeroes_max_bytes indicates how many
+ bytes can be written in a single write zeroes command. If
+ write_zeroes_max_bytes is 0, write zeroes is not supported
+ by the device.
+
What: /sys/block/<disk>/queue/zoned
Date: September 2016
Contact: Damien Le Moal <damien.lemoal@hgst.com>
diff --git a/block/bio.c b/block/bio.c
index 2cf6eba..39fa10a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -670,6 +670,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
break;
case REQ_OP_WRITE_SAME:
bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
diff --git a/block/blk-core.c b/block/blk-core.c
index 473dd69..7438936 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1945,6 +1945,10 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
+ case REQ_OP_WRITE_ZEROES:
+ if (!bdev_write_zeroes_sectors(bio->bi_bdev))
+ goto not_supported;
+ break;
default:
break;
}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index bfb28b0..510a6fb 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -227,6 +227,55 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
EXPORT_SYMBOL(blkdev_issue_write_same);
/**
+ * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
+ * @bdev: blockdev to issue
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @biop: pointer to anchor bio
+ *
+ * Description:
+ * Generate and issue a number of REQ_OP_WRITE_ZEROES bios without data pages.
+ */
+static int __blkdev_issue_write_zeroes(struct block_device *bdev,
+ sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+ struct bio **biop)
+{
+ struct bio *bio = *biop;
+ unsigned int max_write_zeroes_sectors;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (!q)
+ return -ENXIO;
+
+ /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
+ max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
+
+ if (max_write_zeroes_sectors == 0)
+ return -EOPNOTSUPP;
+
+ while (nr_sects) {
+ bio = next_bio(bio, 0, gfp_mask);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+
+ if (nr_sects > max_write_zeroes_sectors) {
+ bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
+ nr_sects -= max_write_zeroes_sectors;
+ sector += max_write_zeroes_sectors;
+ } else {
+ bio->bi_iter.bi_size = nr_sects << 9;
+ nr_sects = 0;
+ }
+ cond_resched();
+ }
+
+ *biop = bio;
+ return 0;
+}
+
+/**
* __blkdev_issue_zeroout - generate number of zero filed write bios
* @bdev: blockdev to issue
* @sector: start sector
@@ -259,6 +308,11 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
goto out;
}
+ ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
+ biop);
+ if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ goto out;
+
ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
ZERO_PAGE(0), biop);
if (ret == 0 || (ret && ret != -EOPNOTSUPP))
@@ -304,8 +358,8 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
* the discard request fail, if the discard flag is not set, or if
* discard_zeroes_data is not supported, this function will resort to
* zeroing the blocks manually, thus provisioning (allocating,
- * anchoring) them. If the block device supports the WRITE SAME command
- * blkdev_issue_zeroout() will use it to optimize the process of
+ * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
+ * command(s), blkdev_issue_zeroout() will use it to optimize the process of
* clearing the block range. Otherwise the zeroing will be performed
* using regular WRITE calls.
*/
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fda6a12..cf2848c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
case REQ_OP_SECURE_ERASE:
split = blk_bio_discard_split(q, *bio, bs, &nsegs);
break;
+ case REQ_OP_WRITE_ZEROES:
+ split = NULL;
+ nsegs = (*bio)->bi_phys_segments;
+ break;
case REQ_OP_WRITE_SAME:
split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
break;
@@ -241,11 +245,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
* This should probably be returning 0, but blk_add_request_payload()
* (Christoph!!!!)
*/
- if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
- return 1;
-
- if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE_ZEROES:
return 1;
+ default:
+ break;
+ }
fbio = bio;
cluster = blk_queue_cluster(q);
@@ -416,6 +424,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
/*
* This is a hack - drivers should be neither modifying the
* biovec, nor relying on bi_vcnt - but because of
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c7ccabc..8a2bc12 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
+ lim->max_write_zeroes_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
@@ -132,6 +133,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
+ lim->max_write_zeroes_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -300,6 +302,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
/**
+ * blk_queue_max_write_zeroes_sectors - set max sectors for a single
+ * write zeroes
+ * @q: the request queue for the device
+ * @max_write_zeroes_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+		unsigned int max_write_zeroes_sectors)
+{
+	struct queue_limits *lim = &q->limits;
+
+	/* Record the per-command Write Zeroes cap in the queue limits. */
+	lim->max_write_zeroes_sectors = max_write_zeroes_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
+
+/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
@@ -527,6 +542,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_same_sectors = min(t->max_write_same_sectors,
b->max_write_same_sectors);
+ t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
+ b->max_write_zeroes_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1855c67..46f03b2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -211,6 +211,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
(unsigned long long)q->limits.max_write_same_sectors << 9);
}
+static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
+{
+	/* Widen first so the sectors-to-bytes shift is done in 64 bits. */
+	unsigned long long max_bytes = q->limits.max_write_zeroes_sectors;
+
+	max_bytes <<= 9;
+	return sprintf(page, "%llu\n", max_bytes);
+}
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
@@ -593,6 +598,11 @@ static ssize_t queue_stats_show(struct request_queue *q, char *page)
.show = queue_write_same_max_show,
};
+/*
+ * Read-only (S_IRUGO, no store handler) queue attribute named
+ * "write_zeroes_max_bytes"; reads are served by
+ * queue_write_zeroes_max_show().
+ */
+static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
+ .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO },
+ .show = queue_write_zeroes_max_show,
+};
+
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_nonrot,
@@ -682,6 +692,7 @@ static ssize_t queue_stats_show(struct request_queue *q, char *page)
&queue_discard_max_hw_entry.attr,
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
+ &queue_write_zeroes_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nomerges_entry.attr,
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 9f97594..0e34740 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
const int op = bio_op(bio);
/*
- * If not a WRITE (or a discard), do nothing
+ * If not a WRITE (or a discard or write zeroes), do nothing
*/
- if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
+ if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
+ op == REQ_OP_WRITE_ZEROES))
return false;
/*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d367cd3..491c7e9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -76,7 +76,8 @@ static inline bool bio_has_data(struct bio *bio)
if (bio &&
bio->bi_iter.bi_size &&
bio_op(bio) != REQ_OP_DISCARD &&
- bio_op(bio) != REQ_OP_SECURE_ERASE)
+ bio_op(bio) != REQ_OP_SECURE_ERASE &&
+ bio_op(bio) != REQ_OP_WRITE_ZEROES)
return true;
return false;
@@ -86,7 +87,8 @@ static inline bool bio_no_advance_iter(struct bio *bio)
{
return bio_op(bio) == REQ_OP_DISCARD ||
bio_op(bio) == REQ_OP_SECURE_ERASE ||
- bio_op(bio) == REQ_OP_WRITE_SAME;
+ bio_op(bio) == REQ_OP_WRITE_SAME ||
+ bio_op(bio) == REQ_OP_WRITE_ZEROES;
}
static inline bool bio_mergeable(struct bio *bio)
@@ -188,18 +190,19 @@ static inline unsigned bio_segments(struct bio *bio)
struct bvec_iter iter;
/*
- * We special case discard/write same, because they interpret bi_size
- * differently:
+ * We special case discard/write same/write zeroes, because they
+ * interpret bi_size differently:
*/
- if (bio_op(bio) == REQ_OP_DISCARD)
- return 1;
-
- if (bio_op(bio) == REQ_OP_SECURE_ERASE)
- return 1;
-
- if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_SAME:
+ case REQ_OP_WRITE_ZEROES:
return 1;
+ default:
+ break;
+ }
bio_for_each_segment(bv, bio, iter)
segs++;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4d0044d..2b0aebf 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -159,6 +159,8 @@ enum req_opf {
REQ_OP_ZONE_RESET = 6,
/* write the same sector many times */
REQ_OP_WRITE_SAME = 7,
+ /* write the zero filled sector many times */
+ REQ_OP_WRITE_ZEROES = 8,
REQ_OP_LAST,
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7e9d8a0..ebeef2b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -323,6 +323,7 @@ struct queue_limits {
unsigned int max_discard_sectors;
unsigned int max_hw_discard_sectors;
unsigned int max_write_same_sectors;
+ unsigned int max_write_zeroes_sectors;
unsigned int discard_granularity;
unsigned int discard_alignment;
@@ -774,6 +775,9 @@ static inline bool rq_mergeable(struct request *rq)
if (req_op(rq) == REQ_OP_FLUSH)
return false;
+ if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+ return false;
+
if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
return false;
if (rq->rq_flags & RQF_NOMERGE_FLAGS)
@@ -1004,6 +1008,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
if (unlikely(op == REQ_OP_WRITE_SAME))
return q->limits.max_write_same_sectors;
+ if (unlikely(op == REQ_OP_WRITE_ZEROES))
+ return q->limits.max_write_zeroes_sectors;
+
return q->limits.max_sectors;
}
@@ -1107,6 +1114,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors);
extern void blk_queue_max_write_same_sectors(struct request_queue *q,
unsigned int max_write_same_sectors);
+extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
+		unsigned int max_write_zeroes_sectors);
extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
extern void blk_queue_alignment_offset(struct request_queue *q,
@@ -1475,6 +1484,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev)
return 0;
}
+static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	/* No queue behind the bdev: report 0 sectors. */
+	return q ? q->limits.max_write_zeroes_sectors : 0;
+}
+
static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
--
1.8.3.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH V2 3/5] nvme.h: add Write Zeroes definitions
2016-11-30 20:28 [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Chaitanya Kulkarni
2016-11-30 20:28 ` [PATCH V2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
@ 2016-11-30 20:29 ` Chaitanya Kulkarni
2016-12-01 10:00 ` Christoph Hellwig
2016-11-30 20:29 ` [PATCH V2 4/5] nvme: add support for the Write Zeroes command Chaitanya Kulkarni
` (3 subsequent siblings)
5 siblings, 1 reply; 16+ messages in thread
From: Chaitanya Kulkarni @ 2016-11-30 20:29 UTC (permalink / raw)
To: axboe
Cc: martin.petersen, keith.busch, linux-nvme, linux-block,
Chaitanya Kulkarni
Add the command structure, optional command set support (ONCS) bit and
a new error code for the Write Zeroes command.
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
---
include/linux/nvme.h | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 9f3b488..5bf1d2d 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -242,6 +242,7 @@ enum {
NVME_CTRL_ONCS_COMPARE = 1 << 0,
NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1,
NVME_CTRL_ONCS_DSM = 1 << 2,
+ NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_VWC_PRESENT = 1 << 0,
};
@@ -558,6 +559,23 @@ struct nvme_dsm_range {
__le64 slba;
};
+/*
+ * Command structure for the NVMe Write Zeroes command; it is one of the
+ * members of the union inside struct nvme_command.
+ */
+struct nvme_write_zeroes_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __u64 rsvd2;
+ __le64 metadata;
+ union nvme_data_ptr dptr;
+ /* starting logical block; the host fills it from nvme_block_nr() */
+ __le64 slba;
+ /* number of logical blocks minus one (the host subtracts 1 when filling it) */
+ __le16 length;
+ __le16 control;
+ __le32 dsmgmt;
+ __le32 reftag;
+ __le16 apptag;
+ __le16 appmask;
+};
+
/* Admin commands */
enum nvme_admin_opcode {
@@ -857,6 +875,7 @@ struct nvme_command {
struct nvme_download_firmware dlfw;
struct nvme_format_cmd format;
struct nvme_dsm_cmd dsm;
+ struct nvme_write_zeroes_cmd write_zeroes;
struct nvme_abort_cmd abort;
struct nvme_get_log_page_command get_log_page;
struct nvmf_common_command fabrics;
@@ -947,6 +966,7 @@ enum {
NVME_SC_BAD_ATTRIBUTES = 0x180,
NVME_SC_INVALID_PI = 0x181,
NVME_SC_READ_ONLY = 0x182,
+ NVME_SC_ONCS_NOT_SUPPORTED = 0x183,
/*
* I/O Command Set Specific - Fabrics commands:
--
1.8.3.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH V2 4/5] nvme: add support for the Write Zeroes command
2016-11-30 20:28 [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Chaitanya Kulkarni
2016-11-30 20:28 ` [PATCH V2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
2016-11-30 20:29 ` [PATCH V2 3/5] nvme.h: add Write Zeroes definitions Chaitanya Kulkarni
@ 2016-11-30 20:29 ` Chaitanya Kulkarni
2016-12-01 10:01 ` Christoph Hellwig
2016-11-30 20:29 ` [PATCH V2 5/5] nvmet: " Chaitanya Kulkarni
` (2 subsequent siblings)
5 siblings, 1 reply; 16+ messages in thread
From: Chaitanya Kulkarni @ 2016-11-30 20:29 UTC (permalink / raw)
To: axboe
Cc: martin.petersen, keith.busch, linux-nvme, linux-block,
Chaitanya Kulkarni
Allow write zeroes operations (REQ_OP_WRITE_ZEROES) on the block
device if the controller sets the Write Zeroes bit in its optional
command support (ONCS) field. Add support for setting up the Write
Zeroes command, and set the maximum number of sectors per Write Zeroes
command according to the NVMe Write Zeroes command definition.
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
---
drivers/nvme/host/core.c | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 44f8a53..16a315d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -272,6 +272,21 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req,
return BLK_MQ_RQ_QUEUE_OK;
}
+/*
+ * Fill @cmnd with a Write Zeroes command covering the range of @req.
+ */
+static inline void nvme_setup_write_zeroes(struct nvme_ns *ns,
+		struct request *req, struct nvme_command *cmnd)
+{
+	/* Start from an all-zero command so every unset field stays 0. */
+	memset(cmnd, 0, sizeof(*cmnd));
+
+	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
+	cmnd->write_zeroes.nsid = cpu_to_le32(ns->ns_id);
+	cmnd->write_zeroes.slba =
+		cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+	/* The length field carries the logical block count minus one. */
+	cmnd->write_zeroes.length =
+		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+	cmnd->write_zeroes.control = 0;
+}
+
static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd)
{
@@ -325,6 +340,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
nvme_setup_flush(ns, cmd);
else if (req_op(req) == REQ_OP_DISCARD)
ret = nvme_setup_discard(ns, req, cmd);
+ else if (req_op(req) == REQ_OP_WRITE_ZEROES)
+ nvme_setup_write_zeroes(ns, req, cmd);
else
nvme_setup_rw(ns, req, cmd);
@@ -943,6 +960,10 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
nvme_config_discard(ns);
+ if (ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES)
+ blk_queue_max_write_zeroes_sectors(ns->queue,
+ ((u32)(USHRT_MAX + 1) * bs) >> 9);
+
blk_mq_unfreeze_queue(disk->queue);
}
--
1.8.3.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH V2 5/5] nvmet: add support for the Write Zeroes command
2016-11-30 20:28 [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Chaitanya Kulkarni
` (2 preceding siblings ...)
2016-11-30 20:29 ` [PATCH V2 4/5] nvme: add support for the Write Zeroes command Chaitanya Kulkarni
@ 2016-11-30 20:29 ` Chaitanya Kulkarni
2016-12-01 10:01 ` Christoph Hellwig
2016-12-01 10:00 ` [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Christoph Hellwig
2016-12-01 14:59 ` Jens Axboe
5 siblings, 1 reply; 16+ messages in thread
From: Chaitanya Kulkarni @ 2016-11-30 20:29 UTC (permalink / raw)
To: axboe
Cc: martin.petersen, keith.busch, linux-nvme, linux-block,
Chaitanya Kulkarni
Add support for handling write zeroes command on target.
Call into __blkdev_issue_zeroout, which the block layer expands into the
best suitable variant of zeroing the LBAs. Allow write zeroes operation
to deallocate the LBAs when calling __blkdev_issue_zeroout.
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
---
drivers/nvme/target/admin-cmd.c | 3 ++-
drivers/nvme/target/io-cmd.c | 29 +++++++++++++++++++++++++++++
2 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 6fe4c48..ec1ad2a 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -237,7 +237,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
id->nn = cpu_to_le32(ctrl->subsys->max_nsid);
- id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM);
+ id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
+ NVME_CTRL_ONCS_WRITE_ZEROES);
/* XXX: don't report vwc if the underlying device is write through */
id->vwc = NVME_CTRL_VWC_PRESENT;
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index ef52b1e..b3d21b0 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -172,6 +172,32 @@ static void nvmet_execute_dsm(struct nvmet_req *req)
}
}
+/*
+ * Execute a Write Zeroes command on the target by handing the LBA range
+ * to __blkdev_issue_zeroout().  If that produced a bio, it is submitted
+ * and completion is left to its end_io handler, nvmet_bio_done();
+ * otherwise the request is completed here with the collected status.
+ */
+static void nvmet_execute_write_zeroes(struct nvmet_req *req)
+{
+	struct nvme_write_zeroes_cmd *write_zeroes = &req->cmd->write_zeroes;
+	struct bio *bio = NULL;
+	u16 status = NVME_SC_SUCCESS;
+	sector_t sector;
+	sector_t nr_sector;
+
+	/* Convert from namespace logical blocks to 512-byte sectors. */
+	sector = le64_to_cpu(write_zeroes->slba) <<
+		(req->ns->blksize_shift - 9);
+	/*
+	 * length is a __le16 holding the block count minus one: decode it
+	 * as 16 bits and add the 1 back *before* scaling to sectors, so the
+	 * result is right for block sizes larger than 512 bytes too.
+	 */
+	nr_sector = ((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
+		(req->ns->blksize_shift - 9);
+
+	if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
+				GFP_KERNEL, &bio, true))
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+
+	if (bio) {
+		bio->bi_private = req;
+		bio->bi_end_io = nvmet_bio_done;
+		submit_bio(bio);
+	} else {
+		nvmet_req_complete(req, status);
+	}
+}
+
int nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@@ -209,6 +235,9 @@ int nvmet_parse_io_cmd(struct nvmet_req *req)
req->data_len = le32_to_cpu(cmd->dsm.nr + 1) *
sizeof(struct nvme_dsm_range);
return 0;
+ case nvme_cmd_write_zeroes:
+ req->execute = nvmet_execute_write_zeroes;
+ return 0;
default:
pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
--
1.8.3.1
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout
2016-11-30 20:28 [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Chaitanya Kulkarni
` (3 preceding siblings ...)
2016-11-30 20:29 ` [PATCH V2 5/5] nvmet: " Chaitanya Kulkarni
@ 2016-12-01 10:00 ` Christoph Hellwig
2016-12-01 14:59 ` Jens Axboe
5 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2016-12-01 10:00 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: axboe, keith.busch, linux-block, linux-nvme, martin.petersen
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
[although normally a cover letter for the series would be nice, like
the last time around]
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V2 2/5] block: add support for REQ_OP_WRITE_ZEROES
2016-11-30 20:28 ` [PATCH V2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
@ 2016-12-01 10:00 ` Christoph Hellwig
0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2016-12-01 10:00 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: axboe, keith.busch, linux-block, linux-nvme, martin.petersen
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V2 3/5] nvme.h: add Write Zeroes definitions
2016-11-30 20:29 ` [PATCH V2 3/5] nvme.h: add Write Zeroes definitions Chaitanya Kulkarni
@ 2016-12-01 10:00 ` Christoph Hellwig
0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2016-12-01 10:00 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: axboe, keith.busch, linux-block, linux-nvme, martin.petersen
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V2 4/5] nvme: add support for the Write Zeroes command
2016-11-30 20:29 ` [PATCH V2 4/5] nvme: add support for the Write Zeroes command Chaitanya Kulkarni
@ 2016-12-01 10:01 ` Christoph Hellwig
0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2016-12-01 10:01 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: axboe, keith.busch, linux-block, linux-nvme, martin.petersen
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V2 5/5] nvmet: add support for the Write Zeroes command
2016-11-30 20:29 ` [PATCH V2 5/5] nvmet: " Chaitanya Kulkarni
@ 2016-12-01 10:01 ` Christoph Hellwig
0 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2016-12-01 10:01 UTC (permalink / raw)
To: Chaitanya Kulkarni
Cc: axboe, keith.busch, linux-block, linux-nvme, martin.petersen
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout
2016-11-30 20:28 [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Chaitanya Kulkarni
` (4 preceding siblings ...)
2016-12-01 10:00 ` [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Christoph Hellwig
@ 2016-12-01 14:59 ` Jens Axboe
5 siblings, 0 replies; 16+ messages in thread
From: Jens Axboe @ 2016-12-01 14:59 UTC (permalink / raw)
To: Chaitanya Kulkarni; +Cc: martin.petersen, keith.busch, linux-nvme, linux-block
On 11/30/2016 01:28 PM, Chaitanya Kulkarni wrote:
> Similar to __blkdev_issue_discard this variant allows submitting
> the final bio asynchronously and chaining multiple ranges
> into a single completion.
Applied this, and the other 4, for 4.10. Thanks.
--
Jens Axboe
^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2016-12-01 14:59 UTC | newest]
Thread overview: 16+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2016-11-30 20:28 [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Chaitanya Kulkarni
2016-11-30 20:28 ` [PATCH V2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
2016-12-01 10:00 ` Christoph Hellwig
2016-11-30 20:29 ` [PATCH V2 3/5] nvme.h: add Write Zeroes definitions Chaitanya Kulkarni
2016-12-01 10:00 ` Christoph Hellwig
2016-11-30 20:29 ` [PATCH V2 4/5] nvme: add support for the Write Zeroes command Chaitanya Kulkarni
2016-12-01 10:01 ` Christoph Hellwig
2016-11-30 20:29 ` [PATCH V2 5/5] nvmet: " Chaitanya Kulkarni
2016-12-01 10:01 ` Christoph Hellwig
2016-12-01 10:00 ` [PATCH V2 1/5] block: add async variant of blkdev_issue_zeroout Christoph Hellwig
2016-12-01 14:59 ` Jens Axboe
-- strict thread matches above, loose matches on Subject: below --
2016-11-17 22:17 [PATCH v2 2/5] block: add support for REQ_OP_WRITE_ZEROES Chaitanya Kulkarni
2016-11-18 2:22 ` Martin K. Petersen
2016-11-18 7:46 ` Christoph Hellwig
2016-11-18 8:25 ` chaitany kulkarni
2016-11-18 15:41 ` Keith Busch
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).