* [PATCHv14 01/11] fs: add a write stream field to the kiocb
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 02/11] block: add a bi_write_stream field Keith Busch
` (10 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Hannes Reinecke,
Nitesh Shetty, Keith Busch
From: Christoph Hellwig <hch@lst.de>
Prepare for io_uring passthrough of write streams. The write stream
field in the kiocb structure fits into an existing 2-byte hole, so the
size of the structure is not changed.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
include/linux/fs.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2cc3d45da7b01..26940c451f319 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -373,6 +373,7 @@ struct kiocb {
void *private;
int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */
+ u8 ki_write_stream;
union {
/*
* Only used for async buffered reads, where it denotes the
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 02/11] block: add a bi_write_stream field
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
2024-12-11 18:35 ` [PATCHv14 01/11] fs: add a write stream field to the kiocb Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 03/11] block: introduce max_write_streams queue limit Keith Busch
` (9 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Hannes Reinecke,
Nitesh Shetty, Keith Busch
From: Christoph Hellwig <hch@lst.de>
Add the ability to pass a write stream for placement control in the bio.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/bio.c | 2 ++
block/blk-crypto-fallback.c | 1 +
block/blk-merge.c | 4 ++++
block/bounce.c | 1 +
include/linux/blk_types.h | 1 +
5 files changed, 9 insertions(+)
diff --git a/block/bio.c b/block/bio.c
index 699a78c85c756..2aa86edc7cd6f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
+ bio->bi_write_stream = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 29a205482617c..66762243a886b 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e01383c6e534b..1e5327fb6c45b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -866,6 +866,8 @@ static struct request *attempt_merge(struct request_queue *q,
if (req->bio->bi_write_hint != next->bio->bi_write_hint)
return NULL;
+ if (req->bio->bi_write_stream != next->bio->bi_write_stream)
+ return NULL;
if (req->bio->bi_ioprio != next->bio->bi_ioprio)
return NULL;
if (!blk_atomic_write_mergeable_rqs(req, next))
@@ -987,6 +989,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
if (rq->bio->bi_write_hint != bio->bi_write_hint)
return false;
+ if (rq->bio->bi_write_stream != bio->bi_write_stream)
+ return false;
if (rq->bio->bi_ioprio != bio->bi_ioprio)
return false;
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
diff --git a/block/bounce.c b/block/bounce.c
index 0d898cd5ec497..fb8f60f114d7d 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -170,6 +170,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dce7615c35e7e..4ca3449ce9c95 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -220,6 +220,7 @@ struct bio {
unsigned short bi_flags; /* BIO_* below */
unsigned short bi_ioprio;
enum rw_hint bi_write_hint;
+ u8 bi_write_stream;
blk_status_t bi_status;
atomic_t __bi_remaining;
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 03/11] block: introduce max_write_streams queue limit
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
2024-12-11 18:35 ` [PATCHv14 01/11] fs: add a write stream field to the kiocb Keith Busch
2024-12-11 18:35 ` [PATCHv14 02/11] block: add a bi_write_stream field Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 04/11] block: introduce a write_stream_granularity " Keith Busch
` (8 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Keith Busch,
Hannes Reinecke, Nitesh Shetty
From: Keith Busch <kbusch@kernel.org>
Drivers with hardware that supports write streams need a way to export how
many are available so applications can generically query this.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
[hch: renamed hints to streams, removed stacking]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
Documentation/ABI/stable/sysfs-block | 7 +++++++
block/blk-sysfs.c | 3 +++
include/linux/blkdev.h | 9 +++++++++
3 files changed, 19 insertions(+)
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 0cceb2badc836..f67139b8b8eff 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -506,6 +506,13 @@ Description:
[RO] Maximum size in bytes of a single element in a DMA
scatter/gather list.
+What: /sys/block/<disk>/queue/max_write_streams
+Date: November 2024
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Maximum number of write streams supported, 0 if not
+ supported. If supported, valid values are 1 through
+ max_write_streams, inclusive.
What: /sys/block/<disk>/queue/max_segments
Date: March 2010
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 4241aea84161c..c514c0cb5e93c 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -104,6 +104,7 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
+QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
@@ -446,6 +447,7 @@ QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_RO_ENTRY(queue_max_write_streams, "max_write_streams");
QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler");
QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
@@ -580,6 +582,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_discard_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
+ &queue_max_write_streams_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 08a727b408164..ce2c3ddda2411 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -399,6 +399,8 @@ struct queue_limits {
unsigned short max_integrity_segments;
unsigned short max_discard_segments;
+ unsigned short max_write_streams;
+
unsigned int max_open_zones;
unsigned int max_active_zones;
@@ -1240,6 +1242,13 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
return queue_max_segments(bdev_get_queue(bdev));
}
+static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
+{
+ if (bdev_is_partition(bdev))
+ return 0;
+ return bdev_limits(bdev)->max_write_streams;
+}
+
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
return q->limits.logical_block_size;
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 04/11] block: introduce a write_stream_granularity queue limit
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (2 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 03/11] block: introduce max_write_streams queue limit Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 05/11] block: expose write streams for block device nodes Keith Busch
` (7 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Hannes Reinecke,
Nitesh Shetty, Keith Busch
From: Christoph Hellwig <hch@lst.de>
Export the granularity that write streams should be discarded with,
as it is essential for making good use of them.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
Documentation/ABI/stable/sysfs-block | 8 ++++++++
block/blk-sysfs.c | 3 +++
include/linux/blkdev.h | 1 +
3 files changed, 12 insertions(+)
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index f67139b8b8eff..c454c68b68fe6 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -514,6 +514,14 @@ Description:
supported. If supported, valid values are 1 through
max_write_streams, inclusive.
+What: /sys/block/<disk>/queue/write_stream_granularity
+Date: November 2024
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] Granularity of a write stream in bytes. The granularity
+ of a write stream is the size that should be discarded or
+ overwritten together to avoid write amplification in the device.
+
What: /sys/block/<disk>/queue/max_segments
Date: March 2010
Contact: linux-block@vger.kernel.org
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c514c0cb5e93c..525f4fa132cd3 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -105,6 +105,7 @@ QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
+QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity)
QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
@@ -448,6 +449,7 @@ QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
QUEUE_RO_ENTRY(queue_max_write_streams, "max_write_streams");
+QUEUE_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity");
QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler");
QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
@@ -583,6 +585,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
&queue_max_write_streams_entry.attr,
+ &queue_write_stream_granularity_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ce2c3ddda2411..452e43e5735c5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -400,6 +400,7 @@ struct queue_limits {
unsigned short max_discard_segments;
unsigned short max_write_streams;
+ unsigned int write_stream_granularity;
unsigned int max_open_zones;
unsigned int max_active_zones;
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 05/11] block: expose write streams for block device nodes
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (3 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 04/11] block: introduce a write_stream_granularity " Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 06/11] io_uring: enable per-io write streams Keith Busch
` (6 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Hannes Reinecke,
Nitesh Shetty, Keith Busch
From: Christoph Hellwig <hch@lst.de>
Use the per-kiocb write stream if provided, or map temperature hints to
write streams (which is a bit questionable, but this shows how it is
done).
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
[kbusch: removed statx reporting]
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/fops.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/block/fops.c b/block/fops.c
index 6d5c4fc5a2168..f16aa39bf5bad 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio.bi_write_stream = iocb->ki_write_stream;
bio.bi_ioprio = iocb->ki_ioprio;
if (iocb->ki_flags & IOCB_ATOMIC)
bio.bi_opf |= REQ_ATOMIC;
@@ -206,6 +207,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -333,6 +335,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -398,6 +401,26 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
+ if (iov_iter_rw(iter) == WRITE) {
+ u16 max_write_streams = bdev_max_write_streams(bdev);
+
+ if (iocb->ki_write_stream) {
+ if (iocb->ki_write_stream > max_write_streams)
+ return -EINVAL;
+ } else if (max_write_streams) {
+ enum rw_hint write_hint =
+ file_inode(iocb->ki_filp)->i_write_hint;
+
+ /*
+ * Just use the write hint as write stream for block
+ * device writes. This assumes no file system is
+ * mounted that would use the streams differently.
+ */
+ if (write_hint <= max_write_streams)
+ iocb->ki_write_stream = write_hint;
+ }
+ }
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 06/11] io_uring: enable per-io write streams
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (4 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 05/11] block: expose write streams for block device nodes Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 07/11] nvme: add a nvme_get_log_lsi helper Keith Busch
` (5 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Keith Busch,
Hannes Reinecke, Nitesh Shetty
From: Keith Busch <kbusch@kernel.org>
Allow userspace to pass a per-I/O write stream in the SQE:
__u8 write_stream;
The __u8 type matches the size the filesystems and block layer support.
Applications can query the supported values from the block device's
max_write_streams sysfs attribute. Unsupported values are ignored by
file operations that do not support write streams or rejected with an
error by those that support them.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
include/uapi/linux/io_uring.h | 4 ++++
io_uring/io_uring.c | 2 ++
io_uring/rw.c | 1 +
3 files changed, 7 insertions(+)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 38f0d6b10eaf7..986a480e3b9c2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,6 +92,10 @@ struct io_uring_sqe {
__u16 addr_len;
__u16 __pad3[1];
};
+ struct {
+ __u8 write_stream;
+ __u8 __pad4[3];
+ };
};
union {
struct {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index ae36aa702f463..b561c5e8879ac 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3869,6 +3869,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
+ BUILD_BUG_SQE_ELEM(44, __u8, write_stream);
+ BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]);
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 5b24fd8b69f62..416ccd89a77ed 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -316,6 +316,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
rw->kiocb.dio_complete = NULL;
rw->kiocb.ki_flags = 0;
+ rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 07/11] nvme: add a nvme_get_log_lsi helper
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (5 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 06/11] io_uring: enable per-io write streams Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 08/11] nvme: pass a void pointer to nvme_get/set_features for the result Keith Busch
` (4 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Hannes Reinecke,
Nitesh Shetty, Keith Busch
From: Christoph Hellwig <hch@lst.de>
Add a helper for log pages that need to pass in an LSI value, without
touching all the existing nvme_get_log callers.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/nvme/host/core.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 571d4106d256d..36c44be98e38c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -150,6 +150,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
@@ -3074,8 +3076,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
return ret;
}
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
- void *log, size_t size, u64 offset)
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
{
struct nvme_command c = { };
u32 dwlen = nvme_bytes_to_numd(size);
@@ -3089,10 +3091,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
c.get_log_page.csi = csi;
+ c.get_log_page.lsi = cpu_to_le16(lsi);
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
+ void *log, size_t size, u64 offset)
+{
+ return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
+ offset, 0);
+}
+
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 08/11] nvme: pass a void pointer to nvme_get/set_features for the result
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (6 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 07/11] nvme: add a nvme_get_log_lsi helper Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 09/11] nvme: add FDP definitions Keith Busch
` (3 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Hannes Reinecke,
Nitesh Shetty, Keith Busch
From: Christoph Hellwig <hch@lst.de>
That allows passing in structures instead of the u32 result, and thus
reduces the amount of bit shifting and masking required to parse the
result.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/nvme/host/core.c | 4 ++--
drivers/nvme/host/nvme.h | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 36c44be98e38c..c2a3585a3fa59 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1678,7 +1678,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
buflen, result);
@@ -1687,7 +1687,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
buflen, result);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 611b02c8a8b37..c1995d89ffdb8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -890,10 +890,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
int qid, nvme_submit_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 09/11] nvme: add FDP definitions
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (7 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 08/11] nvme: pass a void pointer to nvme_get/set_features for the result Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-11 18:35 ` [PATCHv14 10/11] nvme: register fdp parameters with the block layer Keith Busch
` (2 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Hannes Reinecke,
Nitesh Shetty, Keith Busch
From: Christoph Hellwig <hch@lst.de>
Add the config feature result, config log page, and management receive
commands needed for FDP.
Partially based on a patch from Kanchan Joshi <joshi.k@samsung.com>.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
include/linux/nvme.h | 77 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 77 insertions(+)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 13377dde4527b..7680078fa67fd 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -275,6 +275,7 @@ enum nvme_ctrl_attr {
NVME_CTRL_ATTR_HID_128_BIT = (1 << 0),
NVME_CTRL_ATTR_TBKAS = (1 << 6),
NVME_CTRL_ATTR_ELBAS = (1 << 15),
+ NVME_CTRL_ATTR_FDPS = (1 << 19),
};
struct nvme_id_ctrl {
@@ -661,6 +662,44 @@ struct nvme_rotational_media_log {
__u8 rsvd24[488];
};
+struct nvme_fdp_config {
+ __u8 flags;
+#define FDPCFG_FDPE (1U << 0)
+ __u8 fdpcidx;
+ __le16 reserved;
+};
+
+struct nvme_fdp_ruh_desc {
+ __u8 ruht;
+ __u8 reserved[3];
+};
+
+struct nvme_fdp_config_desc {
+ __le16 dsze;
+ __u8 fdpa;
+ __u8 vss;
+ __le32 nrg;
+ __le16 nruh;
+ __le16 maxpids;
+ __le32 nns;
+ __le64 runs;
+ __le32 erutl;
+ __u8 rsvd28[36];
+ struct nvme_fdp_ruh_desc ruhs[];
+};
+
+struct nvme_fdp_config_log {
+ __le16 numfdpc;
+ __u8 ver;
+ __u8 rsvd3;
+ __le32 sze;
+ __u8 rsvd8[8];
+ /*
+ * This is followed by variable number of nvme_fdp_config_desc
+ * structures, but sparse doesn't like nested variable sized arrays.
+ */
+};
+
struct nvme_smart_log {
__u8 critical_warning;
__u8 temperature[2];
@@ -887,6 +926,7 @@ enum nvme_opcode {
nvme_cmd_resv_register = 0x0d,
nvme_cmd_resv_report = 0x0e,
nvme_cmd_resv_acquire = 0x11,
+ nvme_cmd_io_mgmt_recv = 0x12,
nvme_cmd_resv_release = 0x15,
nvme_cmd_zone_mgmt_send = 0x79,
nvme_cmd_zone_mgmt_recv = 0x7a,
@@ -908,6 +948,7 @@ enum nvme_opcode {
nvme_opcode_name(nvme_cmd_resv_register), \
nvme_opcode_name(nvme_cmd_resv_report), \
nvme_opcode_name(nvme_cmd_resv_acquire), \
+ nvme_opcode_name(nvme_cmd_io_mgmt_recv), \
nvme_opcode_name(nvme_cmd_resv_release), \
nvme_opcode_name(nvme_cmd_zone_mgmt_send), \
nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \
@@ -1059,6 +1100,7 @@ enum {
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
NVME_RW_DTYPE_STREAMS = 1 << 4,
+ NVME_RW_DTYPE_DPLCMT = 2 << 4,
NVME_WZ_DEAC = 1 << 9,
};
@@ -1146,6 +1188,38 @@ struct nvme_zone_mgmt_recv_cmd {
__le32 cdw14[2];
};
+struct nvme_io_mgmt_recv_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __le64 rsvd2[2];
+ union nvme_data_ptr dptr;
+ __u8 mo;
+ __u8 rsvd11;
+ __u16 mos;
+ __le32 numd;
+ __le32 cdw12[4];
+};
+
+enum {
+ NVME_IO_MGMT_RECV_MO_RUHS = 1,
+};
+
+struct nvme_fdp_ruh_status_desc {
+ __le16 pid;
+ __le16 ruhid;
+ __le32 earutr;
+ __le64 ruamw;
+ __u8 reserved[16];
+};
+
+struct nvme_fdp_ruh_status {
+ __u8 rsvd0[14];
+ __le16 nruhsd;
+ struct nvme_fdp_ruh_status_desc ruhsd[];
+};
+
enum {
NVME_ZRA_ZONE_REPORT = 0,
NVME_ZRASF_ZONE_REPORT_ALL = 0,
@@ -1281,6 +1355,7 @@ enum {
NVME_FEAT_PLM_WINDOW = 0x14,
NVME_FEAT_HOST_BEHAVIOR = 0x16,
NVME_FEAT_SANITIZE = 0x17,
+ NVME_FEAT_FDP = 0x1d,
NVME_FEAT_SW_PROGRESS = 0x80,
NVME_FEAT_HOST_ID = 0x81,
NVME_FEAT_RESV_MASK = 0x82,
@@ -1301,6 +1376,7 @@ enum {
NVME_LOG_ANA = 0x0c,
NVME_LOG_FEATURES = 0x12,
NVME_LOG_RMI = 0x16,
+ NVME_LOG_FDP_CONFIGS = 0x20,
NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
NVME_FWACT_REPL = (0 << 3),
@@ -1888,6 +1964,7 @@ struct nvme_command {
struct nvmf_auth_receive_command auth_receive;
struct nvme_dbbuf dbbuf;
struct nvme_directive_cmd directive;
+ struct nvme_io_mgmt_recv_cmd imr;
};
};
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCHv14 10/11] nvme: register fdp parameters with the block layer
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (8 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 09/11] nvme: add FDP definitions Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-12 13:14 ` Hannes Reinecke
` (2 more replies)
2024-12-11 18:35 ` [PATCHv14 11/11] nvme: use fdp streams if write stream is provided Keith Busch
2024-12-12 11:39 ` [PATCHv14 00/11] block write streams with nvme fdp Kanchan Joshi
11 siblings, 3 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Register the device data placement limits if supported. This is just
registering the limits with the block layer. Nothing beyond reporting
these attributes is happening in this patch.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/nvme/host/core.c | 145 +++++++++++++++++++++++++++++++++++++++
drivers/nvme/host/nvme.h | 2 +
2 files changed, 147 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2a3585a3fa59..2392373415fd6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -23,6 +23,7 @@
#include <linux/pm_qos.h>
#include <linux/ratelimit.h>
#include <linux/unaligned.h>
+#include <linux/vmalloc.h>
#include "nvme.h"
#include "fabrics.h"
@@ -38,6 +39,8 @@ struct nvme_ns_info {
u32 nsid;
__le32 anagrpid;
u8 pi_offset;
+ u16 endgid;
+ u64 runs;
bool is_shared;
bool is_readonly;
bool is_ready;
@@ -1613,6 +1616,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = true;
+ info->endgid = le16_to_cpu(id->endgid);
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
dev_info(ctrl->device,
"Ignoring bogus Namespace Identifiers\n");
@@ -1653,6 +1657,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
+ info->endgid = le16_to_cpu(id->endgid);
}
kfree(id);
return ret;
@@ -2147,6 +2152,132 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
return ret;
}
+/*
+ * Fetch the FDP configurations log page for the namespace's endurance group
+ * and record the granularity (the "runs" field) of configuration descriptor
+ * @fdp_idx in info->runs.
+ *
+ * @ctrl:    controller the log page is read from
+ * @info:    namespace info; ->endgid selects the log page, ->runs receives
+ *           the result
+ * @fdp_idx: index of the configuration descriptor to look up
+ *
+ * Returns the nonzero result of nvme_get_log_lsi() if the log page cannot be
+ * read and -ENOMEM if the buffer cannot be allocated. Returns 0 otherwise;
+ * when the log contents are unusable (bad size, index out of range, malformed
+ * descriptor list, more than one reclaim group) info->runs is left untouched
+ * so the caller proceeds without registering FDP streams.
+ */
+static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
+				      struct nvme_ns_info *info, u8 fdp_idx)
+{
+	struct nvme_fdp_config_log hdr, *h;
+	struct nvme_fdp_config_desc *desc;
+	size_t size = sizeof(hdr);
+	size_t off = sizeof(*h);
+	int i, n, ret;
+	u16 dsze;
+
+	/* Read only the header first: it reports the size of the whole log. */
+	ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
+			       NVME_CSI_NVM, &hdr, size, 0, info->endgid);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "FDP configs log header status:0x%x endgid:%d\n", ret,
+			 info->endgid);
+		return ret;
+	}
+
+	size = le32_to_cpu(hdr.sze);
+	if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
+		dev_warn(ctrl->device, "FDP config size too large:%zu\n",
+			 size);
+		return 0;
+	}
+
+	/* The header fields are read from the buffer below; it must hold them. */
+	if (size < sizeof(hdr)) {
+		dev_warn(ctrl->device, "FDP config size too small:%zu\n",
+			 size);
+		return 0;
+	}
+
+	/*
+	 * The log is usually tiny, so let kvmalloc() try the slab allocator
+	 * before falling back to vmalloc; the buffer is released with kvfree().
+	 */
+	h = kvmalloc(size, GFP_KERNEL);
+	if (!h)
+		return -ENOMEM;
+
+	ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
+			       NVME_CSI_NVM, h, size, 0, info->endgid);
+	if (ret) {
+		dev_warn(ctrl->device,
+			 "FDP configs log status:0x%x endgid:%d\n", ret,
+			 info->endgid);
+		goto out;
+	}
+
+	/* n is the number of descriptors, so valid indices are 0..n-1. */
+	n = le16_to_cpu(h->numfdpc) + 1;
+	if (fdp_idx >= n) {
+		dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
+			 fdp_idx, n);
+		/* Proceed without registering FDP streams */
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * Descriptors are variable sized and packed after the header. Step
+	 * over them one by one, checking that each descriptor lies fully
+	 * inside the buffer before any of its fields are read.
+	 */
+	for (i = 0; ; i++) {
+		if (off > size || size - off < sizeof(*desc))
+			goto invalid;
+
+		desc = (void *)h + off;
+		if (i == fdp_idx)
+			break;
+
+		/* A zero-sized descriptor would never advance the walk. */
+		dsze = le16_to_cpu(desc->dsze);
+		if (!dsze)
+			goto invalid;
+		off += dsze;
+	}
+
+	if (le32_to_cpu(desc->nrg) > 1) {
+		dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
+		ret = 0;
+		goto out;
+	}
+
+	info->runs = le64_to_cpu(desc->runs);
+	goto out;
+
+invalid:
+	dev_warn(ctrl->device, "FDP invalid config descriptor list\n");
+	ret = 0;
+out:
+	kvfree(h);
+	return ret;
+}
+
+/*
+ * Query the FDP state of a namespace and record the number of reclaim unit
+ * handle status descriptors in ns->head->nr_plids.
+ *
+ * Returns 0 without doing anything when head->nr_plids is already set, when
+ * the FDPCFG_FDPE flag is clear, or when no granularity was found and the
+ * lookup itself did not fail. Otherwise returns the result of the failing
+ * step: the nvme_get_features() / nvme_query_fdp_granularity() /
+ * nvme_submit_sync_cmd() return value, or -ENOMEM.
+ *
+ * NOTE(review): nruhsd is stored exactly as the device reports it, while the
+ * status buffer below is sized for S8_MAX - 1 descriptors. Anything that
+ * later indexes ruhsd[] by nr_plids has to bound the count first.
+ */
+static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+ struct nvme_ns_head *head = ns->head;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ struct nvme_fdp_ruh_status *ruhs;
+ struct nvme_fdp_config fdp;
+ struct nvme_command c = {};
+ size_t size;
+ int ret;
+
+ /*
+ * The FDP configuration is static for the lifetime of the namespace,
+ * so return immediately if we've already registered this namespace's
+ * streams.
+ */
+ if (head->nr_plids)
+ return 0;
+
+ /* Read the FDP feature for this endurance group; the result lands in fdp. */
+ ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
+ &fdp);
+ if (ret) {
+ dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
+ return ret;
+ }
+
+ /* Presumably FDPE means "FDP enabled"; nothing to register without it. */
+ if (!(fdp.flags & FDPCFG_FDPE))
+ return 0;
+
+ /*
+ * Without a granularity value there is nothing to register. Hand the
+ * lookup result (0 or its error) back to the caller.
+ */
+ ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx);
+ if (!info->runs)
+ return ret;
+
+ /* Room for the status header plus S8_MAX - 1 descriptors. */
+ size = struct_size(ruhs, ruhsd, S8_MAX - 1);
+ ruhs = kzalloc(size, GFP_KERNEL);
+ if (!ruhs)
+ return -ENOMEM;
+
+ /* I/O management receive command asking for the RUH status of this namespace. */
+ c.imr.opcode = nvme_cmd_io_mgmt_recv;
+ c.imr.nsid = cpu_to_le32(head->ns_id);
+ c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
+ c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
+ ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+ if (ret) {
+ dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
+ goto free;
+ }
+
+ head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+free:
+ kfree(ruhs);
+ return ret;
+}
+
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
@@ -2183,6 +2314,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
goto out;
}
+ if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
+ ret = nvme_query_fdp_info(ns, info);
+ if (ret < 0)
+ goto out;
+ }
+
blk_mq_freeze_queue(ns->disk->queue);
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
@@ -2216,6 +2353,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (!nvme_init_integrity(ns->head, &lim, info))
capacity = 0;
+	lim.max_write_streams = ns->head->nr_plids;
+	/*
+	 * info->runs is a 64-bit value reported by the device. Saturate it
+	 * at U32_MAX rather than using max(), which would always report at
+	 * least U32_MAX and throw the real granularity away.
+	 */
+	if (lim.max_write_streams)
+		lim.write_stream_granularity = min(info->runs, U32_MAX);
+	else
+		lim.write_stream_granularity = 0;
+
ret = queue_limits_commit_update(ns->disk->queue, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
@@ -2318,6 +2461,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
nvme_init_integrity(ns->head, &lim, info);
+ lim.max_write_streams = ns_lim->max_write_streams;
+ lim.write_stream_granularity = ns_lim->write_stream_granularity;
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c1995d89ffdb8..4b412cd8001f1 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -491,6 +491,8 @@ struct nvme_ns_head {
struct device cdev_device;
struct gendisk *disk;
+
+ u16 nr_plids;
#ifdef CONFIG_NVME_MULTIPATH
struct bio_list requeue_list;
spinlock_t requeue_lock;
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCHv14 10/11] nvme: register fdp parameters with the block layer
2024-12-11 18:35 ` [PATCHv14 10/11] nvme: register fdp parameters with the block layer Keith Busch
@ 2024-12-12 13:14 ` Hannes Reinecke
2024-12-13 5:53 ` Nitesh Shetty
2024-12-16 16:12 ` Christoph Hellwig
2 siblings, 0 replies; 16+ messages in thread
From: Hannes Reinecke @ 2024-12-12 13:14 UTC (permalink / raw)
To: Keith Busch, axboe, hch, linux-block, linux-nvme, linux-fsdevel,
io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Keith Busch
On 12/11/24 19:35, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Register the device data placement limits if supported. This is just
> registering the limits with the block layer. Nothing beyond reporting
> these attributes is happening in this patch.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> drivers/nvme/host/core.c | 145 +++++++++++++++++++++++++++++++++++++++
> drivers/nvme/host/nvme.h | 2 +
> 2 files changed, 147 insertions(+)
>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCHv14 10/11] nvme: register fdp parameters with the block layer
2024-12-11 18:35 ` [PATCHv14 10/11] nvme: register fdp parameters with the block layer Keith Busch
2024-12-12 13:14 ` Hannes Reinecke
@ 2024-12-13 5:53 ` Nitesh Shetty
2024-12-16 16:12 ` Christoph Hellwig
2 siblings, 0 replies; 16+ messages in thread
From: Nitesh Shetty @ 2024-12-13 5:53 UTC (permalink / raw)
To: Keith Busch
Cc: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring,
sagi, asml.silence, anuj20.g, joshi.k, Keith Busch
[-- Attachment #1: Type: text/plain, Size: 375 bytes --]
On 11/12/24 10:35AM, Keith Busch wrote:
>From: Keith Busch <kbusch@kernel.org>
>
>Register the device data placement limits if supported. This is just
>registering the limits with the block layer. Nothing beyond reporting
>these attributes is happening in this patch.
>
>Signed-off-by: Keith Busch <kbusch@kernel.org>
>---
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
[-- Attachment #2: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCHv14 10/11] nvme: register fdp parameters with the block layer
2024-12-11 18:35 ` [PATCHv14 10/11] nvme: register fdp parameters with the block layer Keith Busch
2024-12-12 13:14 ` Hannes Reinecke
2024-12-13 5:53 ` Nitesh Shetty
@ 2024-12-16 16:12 ` Christoph Hellwig
2 siblings, 0 replies; 16+ messages in thread
From: Christoph Hellwig @ 2024-12-16 16:12 UTC (permalink / raw)
To: Keith Busch
Cc: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring,
sagi, asml.silence, anuj20.g, joshi.k, Keith Busch
On Wed, Dec 11, 2024 at 10:35:13AM -0800, Keith Busch wrote:
> + size = le32_to_cpu(hdr.sze);
> + if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
> + dev_warn(ctrl->device, "FDP config size too large:%zu\n",
> + size);
> + return 0;
> + h = vmalloc(size);
> + if (!h)
> + return -ENOMEM;
Isn't an unconditional vmalloc here for something that usually should
have less than a handful of descriptors a little aggressive? I'd use
kvmalloc here to get the best of both worlds, and the free path seems
to already use kvfree anyway.
Otherwise the incremental changes vs the previous version for the entire
series look good to me.
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCHv14 11/11] nvme: use fdp streams if write stream is provided
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (9 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 10/11] nvme: register fdp parameters with the block layer Keith Busch
@ 2024-12-11 18:35 ` Keith Busch
2024-12-12 11:39 ` [PATCHv14 00/11] block write streams with nvme fdp Kanchan Joshi
11 siblings, 0 replies; 16+ messages in thread
From: Keith Busch @ 2024-12-11 18:35 UTC (permalink / raw)
To: axboe, hch, linux-block, linux-nvme, linux-fsdevel, io-uring
Cc: sagi, asml.silence, anuj20.g, joshi.k, Keith Busch,
Hannes Reinecke, Nitesh Shetty
From: Keith Busch <kbusch@kernel.org>
Maps a user requested write stream to an FDP placement ID if possible.
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Nitesh Shetty <nj.shetty@samsung.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/nvme/host/core.c | 31 ++++++++++++++++++++++++++++++-
drivers/nvme/host/nvme.h | 1 +
2 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2392373415fd6..b12d904dbc495 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -675,6 +675,7 @@ static void nvme_free_ns_head(struct kref *ref)
ida_free(&head->subsys->ns_ida, head->instance);
cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
+ kfree(head->plids);
kfree(head);
}
@@ -998,6 +999,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+ if (op == nvme_cmd_write && ns->head->nr_plids) {
+ u16 write_stream = req->bio->bi_write_stream;
+
+ if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
+ return BLK_STS_INVAL;
+
+ if (write_stream) {
+ dsmgmt |= ns->head->plids[write_stream - 1] << 16;
+ control |= NVME_RW_DTYPE_DPLCMT;
+ }
+ }
+
if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
return BLK_STS_INVAL;
@@ -2233,7 +2246,7 @@ static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
struct nvme_fdp_config fdp;
struct nvme_command c = {};
size_t size;
- int ret;
+ int i, ret;
/*
* The FDP configuration is static for the lifetime of the namespace,
@@ -2273,6 +2286,22 @@ static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
}
head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+	if (!head->nr_plids)
+		goto free;
+
+	/*
+	 * The count comes straight from the device, but the status buffer
+	 * was only sized for S8_MAX - 1 descriptors. Cap it so the copy
+	 * loop below never reads past the end of ruhs->ruhsd[].
+	 */
+	head->nr_plids = min_t(u16, head->nr_plids, S8_MAX - 1);
+
+	/* Size the elements by what plids points at, not by the pointer. */
+	head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
+			      GFP_KERNEL);
+	if (!head->plids) {
+		dev_warn(ctrl->device,
+			 "failed to allocate %u FDP placement IDs\n",
+			 head->nr_plids);
+		/* Leave no stream count behind without a backing table. */
+		head->nr_plids = 0;
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	/* Keep a CPU-endian copy of each placement ID for the write path. */
+	for (i = 0; i < head->nr_plids; i++)
+		head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
free:
kfree(ruhs);
return ret;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 4b412cd8001f1..50e7628296d1f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -493,6 +493,7 @@ struct nvme_ns_head {
struct gendisk *disk;
u16 nr_plids;
+ u16 *plids;
#ifdef CONFIG_NVME_MULTIPATH
struct bio_list requeue_list;
spinlock_t requeue_lock;
--
2.43.5
^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCHv14 00/11] block write streams with nvme fdp
2024-12-11 18:35 ` [PATCHv14 00/11] block write streams with nvme fdp Keith Busch
` (10 preceding siblings ...)
2024-12-11 18:35 ` [PATCHv14 11/11] nvme: use fdp streams if write stream is provided Keith Busch
@ 2024-12-12 11:39 ` Kanchan Joshi
11 siblings, 0 replies; 16+ messages in thread
From: Kanchan Joshi @ 2024-12-12 11:39 UTC (permalink / raw)
To: Keith Busch, axboe, hch, linux-block, linux-nvme, linux-fsdevel,
io-uring
Cc: sagi, asml.silence, anuj20.g, Keith Busch
For
> 16 files changed, 341 insertions(+), 6 deletions(-)
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
^ permalink raw reply [flat|nested] 16+ messages in thread