* [PATCH 00/12] block: support for partial sector reads
@ 2022-06-30 20:42 Keith Busch
2022-06-30 20:42 ` [PATCH 01/12] block: move direct io alignment check to common Keith Busch
` (11 more replies)
0 siblings, 12 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
At LSFMM nearly 2 months ago, I discussed how some storage hardware
supports the ability to read at granularities smaller than a sector, and
the nvme protocol feature that enables this capability, "bit buckets".
This is useful in scenarios where only parts of sectors are used by the
application, and the primary benefits to support this are:
* Improved link bandwidth usage
* Reduced memory utilization
This series enables the block layer and nvme to set up bit bucket
descriptors for read commands, then enables user space direct-io to make
use of this capability by allowing the user to specify an arbitrary
offset and length. This allows truncating an arbitrary number of bytes
off sectors from the front and end of the transfer.
There are no current in-kernel users beyond the direct-io cases, but
this could also be used to truncate bytes out of the middle of a
transfer as well. For example, if you wanted to read a page and knew you
were going to immediately dirty some number of bytes in the middle, you
could set up a read request to skip those in the data transfer.
Keith Busch (12):
block: move direct io alignment check to common
iomap: save copy of bdev for direct io
iomap: get logical block size directly
iomap: use common blkdev alignment check
block: add bit bucket capabilities
nvme: add support for bit buckets
block: allow copying pre-registered bvecs
block: add bio number of vecs helper for partials
block: add partial sector parameter helper
block: add direct-io partial sector read support
iomap: add direct io partial sector read support
block: export and document bit_bucket attribute
Documentation/ABI/stable/sysfs-block | 9 +++
block/bio.c | 42 +++++++++++-
block/blk-core.c | 5 ++
block/blk-merge.c | 3 +-
block/blk-mq.c | 2 +
block/blk-sysfs.c | 3 +
block/fops.c | 97 ++++++++++++++++++----------
drivers/nvme/host/core.c | 3 +
drivers/nvme/host/nvme.h | 6 ++
drivers/nvme/host/pci.c | 17 ++++-
fs/iomap/direct-io.c | 43 ++++++++----
include/linux/bio.h | 11 ++++
include/linux/blk-mq.h | 2 +
include/linux/blk_types.h | 1 +
include/linux/blkdev.h | 41 ++++++++++++
include/linux/nvme.h | 2 +
16 files changed, 236 insertions(+), 51 deletions(-)
--
2.30.2
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH 01/12] block: move direct io alignment check to common
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 02/12] iomap: save copy of bdev for direct io Keith Busch
` (10 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
All direct io has the same setup and alignment check, so just do it once
in common code.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/fops.c | 44 ++++++++++++++----------------------------
include/linux/blkdev.h | 7 +++++++
2 files changed, 22 insertions(+), 29 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index 86d3cab9bf93..f37af5924cef 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -42,28 +42,17 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
return op;
}
-static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
- struct iov_iter *iter)
-{
- return pos & (bdev_logical_block_size(bdev) - 1) ||
- !bdev_iter_is_aligned(bdev, iter);
-}
-
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
- struct iov_iter *iter, unsigned int nr_pages)
+ struct iov_iter *iter, unsigned int nr_pages,
+ struct block_device *bdev, loff_t pos)
{
- struct block_device *bdev = iocb->ki_filp->private_data;
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
- loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -168,20 +157,15 @@ static void blkdev_bio_end_io(struct bio *bio)
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages)
+ unsigned int nr_pages, struct block_device *bdev, loff_t pos)
{
- struct block_device *bdev = iocb->ki_filp->private_data;
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
- loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -292,20 +276,15 @@ static void blkdev_bio_end_io_async(struct bio *bio)
}
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
- struct iov_iter *iter,
- unsigned int nr_pages)
+ struct iov_iter *iter, unsigned int nr_pages,
+ struct block_device *bdev, loff_t pos)
{
- struct block_device *bdev = iocb->ki_filp->private_data;
bool is_read = iov_iter_rw(iter) == READ;
unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
struct bio *bio;
- loff_t pos = iocb->ki_pos;
int ret = 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
-
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -357,18 +336,25 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
+ struct block_device *bdev = iocb->ki_filp->private_data;
+ loff_t pos = iocb->ki_pos;
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
+ if (blkdev_dio_unaligned(bdev, pos, iter))
+ return -EINVAL;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
- return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
- return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_simple(iocb, iter, nr_pages,
+ bdev, pos);
+ return __blkdev_direct_IO_async(iocb, iter, nr_pages, bdev,
+ pos);
}
- return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+ return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages), bdev,
+ pos);
}
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 22b12531aeb7..9d676adfaaa1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1352,6 +1352,13 @@ static inline bool bdev_iter_is_aligned(struct block_device *bdev,
bdev_logical_block_size(bdev) - 1);
}
+static inline bool blkdev_dio_unaligned(struct block_device *bdev, loff_t p,
+ struct iov_iter *iter)
+{
+ return p & (bdev_logical_block_size(bdev) - 1) ||
+ !bdev_iter_is_aligned(bdev, iter);
+}
+
static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
unsigned int len)
{
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 02/12] iomap: save copy of bdev for direct io
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
2022-06-30 20:42 ` [PATCH 01/12] block: move direct io alignment check to common Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 03/12] iomap: get logical block size directly Keith Busch
` (9 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
The block_device is used three times already, so save a copy instead of
following the iomap pointer each time.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
fs/iomap/direct-io.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 5d098adba443..5d478a95efdf 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -240,7 +240,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
- unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
+ struct block_device *bdev = iomap->bdev;
+ unsigned int blkbits = blksize_bits(bdev_logical_block_size(bdev));
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
@@ -253,7 +254,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t orig_count;
if ((pos | length) & ((1 << blkbits) - 1) ||
- !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
+ !bdev_iter_is_aligned(bdev, dio->submit.iter))
return -EINVAL;
if (iomap->type == IOMAP_UNWRITTEN) {
@@ -275,7 +276,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* cache flushes on IO completion.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
+ (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(bdev))
use_fua = true;
}
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 03/12] iomap: get logical block size directly
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
2022-06-30 20:42 ` [PATCH 01/12] block: move direct io alignment check to common Keith Busch
2022-06-30 20:42 ` [PATCH 02/12] iomap: save copy of bdev for direct io Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 04/12] iomap: use common blkdev alignment check Keith Busch
` (8 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Don't transform the logical block size to a bit shift only to shift it
back out to the size. Just use the size.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
fs/iomap/direct-io.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 5d478a95efdf..40cbf2025386 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -241,7 +241,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
struct block_device *bdev = iomap->bdev;
- unsigned int blkbits = blksize_bits(bdev_logical_block_size(bdev));
+ unsigned int blksz = bdev_logical_block_size(bdev);
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
@@ -253,7 +253,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
- if ((pos | length) & ((1 << blkbits) - 1) ||
+ if ((pos | length) & (blksz - 1) ||
!bdev_iter_is_aligned(bdev, dio->submit.iter))
return -EINVAL;
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 04/12] iomap: use common blkdev alignment check
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (2 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 03/12] iomap: get logical block size directly Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 05/12] block: add bit bucket capabilities Keith Busch
` (7 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
The block layer provides a generic io alignment check, so use that.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
fs/iomap/direct-io.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 40cbf2025386..10a113358365 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -241,7 +241,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
struct block_device *bdev = iomap->bdev;
- unsigned int blksz = bdev_logical_block_size(bdev);
unsigned int fs_block_size = i_blocksize(inode), pad;
loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
@@ -253,8 +252,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
- if ((pos | length) & (blksz - 1) ||
- !bdev_iter_is_aligned(bdev, dio->submit.iter))
+ if (blkdev_dio_unaligned(bdev, pos | length, dio->submit.iter))
return -EINVAL;
if (iomap->type == IOMAP_UNWRITTEN) {
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 05/12] block: add bit bucket capabilities
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (3 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 04/12] iomap: use common blkdev alignment check Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 06/12] nvme: add support for bit buckets Keith Busch
` (6 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Recent storage protocol enhancements allow devices to support partial
sector read access. When used by a host, this can provide link bandwidth
savings and reduce memory utilization. Provide a way for drivers to
indicate support for this capability, and implement the framework to
submit bit-bucket read bio's.
The implementation indicates the unwanted data by using a special page.
The page can be used multiple times within the same bio to designate one
or more unwanted data gaps within the requested sector(s). Drivers that
subscribe to the capability must check for this page and set up their
protocol specific scatter-gather accordingly.
Requests with bit buckets need to be flagged specially for this since
NVMe needs to know before walking the segments if it should construct a
bit bucket SGL instead of a PRP.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/bio.c | 29 +++++++++++++++++++++++++++--
block/blk-core.c | 5 +++++
block/blk-merge.c | 3 ++-
block/blk-mq.c | 2 ++
include/linux/blk-mq.h | 2 ++
include/linux/blk_types.h | 1 +
include/linux/blkdev.h | 13 +++++++++++++
7 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 933ea3210954..b0c85778257a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1208,6 +1208,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
+ unsigned int lbas = bdev_logical_block_size(bio->bi_bdev);
ssize_t size, left;
unsigned len, i;
size_t offset;
@@ -1226,10 +1227,32 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
* result to ensure the bio's total size is correct. The remainder of
* the iov data will be picked up in the next bio iteration.
+ *
+ * Partial sector reads can break the iov length expectations by
+ * allowing dma_alignment granularities. The code enforces only 1
+ * segment in that case, which simplifies the following logic. We don't
+ * need to consider individual segment lengths since the skip and
+ * truncate bytes are guaranteed to align the total length to the block
+ * size.
*/
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
- if (size > 0)
- size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
+ if (size > 0) {
+ /*
+ * If size doesn't reach the end with bit buckets, align the
+ * total size down to the block size to avoid a bit-bucket
+ * truncation overlapping with the desired read data.
+ */
+ if (bio_flagged(bio, BIO_BIT_BUCKET)) {
+ if (size != iov_iter_count(iter)) {
+ size_t total_size = size + bio->bi_iter.bi_size;
+
+ total_size = ALIGN_DOWN(total_size, lbas);
+ size = total_size - bio->bi_iter.bi_size;
+ }
+ } else {
+ size = ALIGN_DOWN(size, lbas);
+ }
+ }
if (unlikely(size <= 0))
return size ? size : -EFAULT;
@@ -1602,6 +1625,8 @@ struct bio *bio_split(struct bio *bio, int sectors,
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
bio_set_flag(split, BIO_TRACE_COMPLETION);
+ if (bio_flagged(bio, BIO_BIT_BUCKET))
+ bio_set_flag(split, BIO_BIT_BUCKET);
return split;
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 5ad7bd93077c..d2e9fd42b732 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -73,6 +73,9 @@ struct kmem_cache *blk_requestq_srcu_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
+struct page *blk_bb_page;
+EXPORT_SYMBOL_GPL(blk_bb_page);
+
/**
* blk_queue_flag_set - atomically set a queue flag
* @flag: flag to be set
@@ -1228,5 +1231,7 @@ int __init blk_dev_init(void)
blk_debugfs_root = debugfs_create_dir("block", NULL);
+ blk_bb_page = ZERO_PAGE(0);
+
return 0;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 0f5f42ebd0bb..65b71114633f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -281,7 +281,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+ if (!bio_flagged(bio, BIO_BIT_BUCKET) && bvprvp &&
+ bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
goto split;
if (nsegs < max_segs &&
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 15c7c5c4ad22..efbe308d7ae5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2425,6 +2425,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK;
+ if (bio_flagged(bio, BIO_BIT_BUCKET))
+ rq->rq_flags |= RQF_BIT_BUCKET;
rq->__sector = bio->bi_iter.bi_sector;
blk_rq_bio_prep(rq, bio, nr_segs);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 43aad0da3305..05fa0b292223 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,6 +22,8 @@ typedef __u32 __bitwise req_flags_t;
/* drive already may have started this one */
#define RQF_STARTED ((__force req_flags_t)(1 << 1))
+/* request has bit bucket payload */
+#define RQF_BIT_BUCKET ((__force req_flags_t)(1 << 2))
/* may not be passed by ioscheduler */
#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3))
/* request for flush sequence */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a24d4078fb21..dc981d0232d1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -332,6 +332,7 @@ enum {
BIO_QOS_MERGED, /* but went through rq_qos merge path */
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
+ BIO_BIT_BUCKET, /* contains one or more bit bucket pages */
BIO_FLAG_LAST
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9d676adfaaa1..4396fcf04bb8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -44,6 +44,7 @@ struct blk_crypto_profile;
extern const struct device_type disk_type;
extern struct device_type part_type;
extern struct class block_class;
+extern struct page *blk_bb_page;
/* Must be consistent with blk_mq_poll_stats_bkt() */
#define BLK_MQ_POLL_STATS_BKTS 16
@@ -580,6 +581,7 @@ struct request_queue {
#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
+#define QUEUE_FLAG_BIT_BUCKET 31 /* device supports read bit buckets */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_SAME_COMP) | \
@@ -621,6 +623,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
#define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
+#define blk_queue_bb(q) test_bit(QUEUE_FLAG_BIT_BUCKET, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
@@ -1566,4 +1569,14 @@ struct io_comp_batch {
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
+static inline void blk_add_bb_page(struct bio *bio, int len)
+{
+ bio_set_flag(bio, BIO_BIT_BUCKET);
+ get_page(blk_bb_page);
+ bio_add_page(bio, blk_bb_page, len, 0);
+}
+static inline bool blk_is_bit_bucket(struct page *page)
+{
+ return page == blk_bb_page;
+}
#endif /* _LINUX_BLKDEV_H */
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 06/12] nvme: add support for bit buckets
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (4 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 05/12] block: add bit bucket capabilities Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 07/12] block: allow copying pre-registered bvecs Keith Busch
` (5 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Set the queue for bit bucket support if the hardware and driver support
it. The nvme pci driver will recognize the special bitbucket page for
read commands and set up an appropriate sg descriptor for it.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/nvme/host/core.c | 3 +++
drivers/nvme/host/nvme.h | 6 ++++++
drivers/nvme/host/pci.c | 17 +++++++++++++++--
include/linux/nvme.h | 2 ++
4 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b5b24998a5ab..211bc30bb707 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3999,6 +3999,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
+ if (nvme_ctrl_sgl_bb_supported(ctrl) && ctrl->ops->flags & NVME_F_BB)
+ blk_queue_flag_set(QUEUE_FLAG_BIT_BUCKET, ns->queue);
+
ns->ctrl = ctrl;
kref_init(&ns->kref);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0da94b233fed..7401f58fe534 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -496,6 +496,7 @@ struct nvme_ctrl_ops {
#define NVME_F_FABRICS (1 << 0)
#define NVME_F_METADATA_SUPPORTED (1 << 1)
#define NVME_F_PCI_P2PDMA (1 << 2)
+#define NVME_F_BB (1 << 3)
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@@ -991,6 +992,11 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
return ctrl->sgls & ((1 << 0) | (1 << 1));
}
+static inline bool nvme_ctrl_sgl_bb_supported(struct nvme_ctrl *ctrl)
+{
+ return ctrl->sgls & (1 << 16);
+}
+
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 opcode);
int nvme_execute_passthru_rq(struct request *rq);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 247a74aba336..32894e392142 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -535,6 +535,8 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+ if (req->rq_flags & RQF_BIT_BUCKET)
+ return true;
if (!nvme_ctrl_sgl_supported(&dev->ctrl))
return false;
if (!iod->nvmeq->qid)
@@ -724,6 +726,13 @@ static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
+static void nvme_pci_sgl_set_bb(struct nvme_sgl_desc *sge,
+ struct scatterlist *sg)
+{
+ sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->type = NVME_SGL_FMT_BB_DESC << 4;
+}
+
static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
dma_addr_t dma_addr, int entries)
{
@@ -789,7 +798,10 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
nvme_pci_sgl_set_seg(link, sgl_dma, entries);
}
- nvme_pci_sgl_set_data(&sg_list[i++], sg);
+ if (rq_data_dir(req) == READ && blk_is_bit_bucket(sg_page(sg)))
+ nvme_pci_sgl_set_bb(&sg_list[i++], sg);
+ else
+ nvme_pci_sgl_set_data(&sg_list[i++], sg);
sg = sg_next(sg);
} while (--entries > 0);
@@ -3003,7 +3015,8 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
.flags = NVME_F_METADATA_SUPPORTED |
- NVME_F_PCI_P2PDMA,
+ NVME_F_PCI_P2PDMA |
+ NVME_F_BB,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index e3934003f239..1fae005715fc 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -835,6 +835,7 @@ enum {
*
* For struct nvme_sgl_desc:
* @NVME_SGL_FMT_DATA_DESC: data block descriptor
+ * @NVME_SGL_FMT_BB_DESC: bit bucket descriptor
* @NVME_SGL_FMT_SEG_DESC: sgl segment descriptor
* @NVME_SGL_FMT_LAST_SEG_DESC: last sgl segment descriptor
*
@@ -846,6 +847,7 @@ enum {
*/
enum {
NVME_SGL_FMT_DATA_DESC = 0x00,
+ NVME_SGL_FMT_BB_DESC = 0x01,
NVME_SGL_FMT_SEG_DESC = 0x02,
NVME_SGL_FMT_LAST_SEG_DESC = 0x03,
NVME_KEY_SGL_FMT_DATA_DESC = 0x04,
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 07/12] block: allow copying pre-registered bvecs
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (5 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 06/12] nvme: add support for bit buckets Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-07-01 2:40 ` Keith Busch
2022-06-30 20:42 ` [PATCH 08/12] block: add bio number of vecs helper for partials Keith Busch
` (4 subsequent siblings)
11 siblings, 1 reply; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
If a bio was initialized with bi_max_vecs, then append the requested
bvec instead of overriding it. This will allow mixing bvecs from
multiple sources.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/bio.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/block/bio.c b/block/bio.c
index b0c85778257a..391cad726ff2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1130,12 +1130,25 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
+static void bio_copy_bvec(struct bio *bio, struct iov_iter *iter)
+{
+ memcpy(&bio->bi_io_vec[bio->bi_vcnt], iter->bvec,
+ iter->nr_segs * sizeof(struct bio_vec));
+ bio->bi_vcnt += iter->nr_segs;
+ bio->bi_iter.bi_size += iov_iter_count(iter);
+}
+
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
size_t size = iov_iter_count(iter);
WARN_ON_ONCE(bio->bi_max_vecs);
+ if (bio->bi_max_vecs) {
+ bio_copy_bvec(bio, iter);
+ return;
+ }
+
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
size_t max_sectors = queue_max_zone_append_sectors(q);
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 08/12] block: add bio number of vecs helper for partials
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (6 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 07/12] block: allow copying pre-registered bvecs Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 09/12] block: add partial sector parameter helper Keith Busch
` (3 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Bit buckets get their own vector, so need to allocate enough to hold
both the preregistered bvecs and the bit buckets.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
include/linux/bio.h | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 992ee987f273..ded38accf009 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -452,6 +452,17 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
return iov_iter_npages(iter, max_segs);
}
+static inline int bio_iov_vecs_to_alloc_partial(struct iov_iter *iter,
+ int max_segs, bool trunc,
+ bool skip)
+{
+ if (skip || trunc)
+ return min(iov_iter_npages(iter, max_segs) + skip + trunc,
+ max_segs);
+ else
+ return bio_iov_vecs_to_alloc(iter, max_segs);
+}
+
struct request_queue;
extern int submit_bio_wait(struct bio *bio);
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 09/12] block: add partial sector parameter helper
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (7 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 08/12] block: add bio number of vecs helper for partials Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 10/12] block: add direct-io partial sector read support Keith Busch
` (2 subsequent siblings)
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Check if an iov is a read, and aligned to a partial sector access. If so
set the skipped and truncated bytes.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
include/linux/blkdev.h | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4396fcf04bb8..e631cdd01df4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1362,6 +1362,27 @@ static inline bool blkdev_dio_unaligned(struct block_device *bdev, loff_t p,
!bdev_iter_is_aligned(bdev, iter);
}
+static inline bool blkdev_bit_bucket(struct block_device *bdev, loff_t pos,
+ loff_t len, struct iov_iter *iter, u16 *skip,
+ u16 *trunc)
+{
+ unsigned int blksz = bdev_logical_block_size(bdev);
+
+ if (iov_iter_rw(iter) != READ ||
+ !blk_queue_bb(bdev_get_queue(bdev)) ||
+ iter->nr_segs > 1)
+ return false;
+
+ if (!iov_iter_is_aligned(iter, bdev_dma_alignment(bdev),
+ bdev_dma_alignment(bdev)))
+ return false;
+
+ *skip = pos & (blksz - 1);
+ *trunc = blksz - ((pos + len) & (blksz - 1));
+
+ return true;
+}
+
static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
unsigned int len)
{
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 10/12] block: add direct-io partial sector read support
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (8 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 09/12] block: add partial sector parameter helper Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 11/12] iomap: " Keith Busch
2022-06-30 20:42 ` [PATCH 12/12] block: export and document bit_bucket attribute Keith Busch
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Enable direct io to read partial sectors if the block device supports bit
buckets.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
block/fops.c | 69 ++++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 56 insertions(+), 13 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index f37af5924cef..5eee8cef7ce0 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -46,9 +46,10 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct iov_iter *iter, unsigned int nr_pages,
- struct block_device *bdev, loff_t pos)
+ struct block_device *bdev, loff_t pos, u16 skip, u16 trunc)
{
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
+ u16 bucket_bytes = skip + trunc;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
@@ -72,10 +73,19 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_ioprio = iocb->ki_ioprio;
+ if (bucket_bytes) {
+ bio_set_flag(&bio, BIO_BIT_BUCKET);
+ if (skip)
+ blk_add_bb_page(&bio, skip);
+ }
+
ret = bio_iov_iter_get_pages(&bio, iter);
if (unlikely(ret))
goto out;
- ret = bio.bi_iter.bi_size;
+
+ if (trunc)
+ blk_add_bb_page(&bio, trunc);
+ ret = bio.bi_iter.bi_size - bucket_bytes;
if (iov_iter_rw(iter) == WRITE)
task_io_account_write(ret);
@@ -157,13 +167,15 @@ static void blkdev_bio_end_io(struct bio *bio)
}
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
- unsigned int nr_pages, struct block_device *bdev, loff_t pos)
+ unsigned int nr_pages, struct block_device *bdev, loff_t pos,
+ u16 skip, u16 trunc)
{
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+ u16 bucket_bytes = skip + trunc;
int ret = 0;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
@@ -199,6 +211,14 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
+ if (bucket_bytes) {
+ bio_set_flag(bio, BIO_BIT_BUCKET);
+ if (skip) {
+ blk_add_bb_page(bio, skip);
+ skip = 0;
+ }
+ }
+
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
bio->bi_status = BLK_STS_IOERR;
@@ -206,6 +226,11 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
break;
}
+ if (trunc && !iov_iter_count(iter)) {
+ blk_add_bb_page(bio, trunc);
+ trunc = 0;
+ }
+
if (is_read) {
if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio);
@@ -218,7 +243,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
- nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
+ nr_pages = bio_iov_vecs_to_alloc_partial(iter, BIO_MAX_VECS, 0,
+ trunc);
if (!nr_pages) {
submit_bio(bio);
break;
@@ -244,7 +270,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (!ret)
ret = blk_status_to_errno(dio->bio.bi_status);
if (likely(!ret))
- ret = dio->size;
+ ret = dio->size - bucket_bytes;
bio_put(&dio->bio);
return ret;
@@ -277,10 +303,11 @@ static void blkdev_bio_end_io_async(struct bio *bio)
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter, unsigned int nr_pages,
- struct block_device *bdev, loff_t pos)
+ struct block_device *bdev, loff_t pos, u16 skip, u16 trunc)
{
bool is_read = iov_iter_rw(iter) == READ;
unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+ u16 bucket_bytes = skip + trunc;
struct blkdev_dio *dio;
struct bio *bio;
int ret = 0;
@@ -296,6 +323,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
+ if (bucket_bytes) {
+ bio_set_flag(bio, BIO_BIT_BUCKET);
+ if (skip)
+ blk_add_bb_page(bio, skip);
+ }
+
if (iov_iter_is_bvec(iter)) {
/*
* Users don't rely on the iterator being in any particular
@@ -311,7 +344,11 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
return ret;
}
}
- dio->size = bio->bi_iter.bi_size;
+
+ if (trunc)
+ blk_add_bb_page(bio, trunc);
+
+ dio->size = bio->bi_iter.bi_size - bucket_bytes;
if (is_read) {
if (iter_is_iovec(iter)) {
@@ -338,23 +375,29 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct block_device *bdev = iocb->ki_filp->private_data;
loff_t pos = iocb->ki_pos;
+ u16 skip = 0, trunc = 0;
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
- if (blkdev_dio_unaligned(bdev, pos, iter))
- return -EINVAL;
+ if (blkdev_dio_unaligned(bdev, pos, iter)) {
+ if (!blkdev_bit_bucket(bdev, pos, iov_iter_count(iter), iter,
+ &skip, &trunc))
+ return -EINVAL;
+ nr_pages = bio_iov_vecs_to_alloc_partial(iter, BIO_MAX_VECS + 1,
+ skip, trunc);
+ } else
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
- nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, nr_pages,
- bdev, pos);
+ bdev, pos, skip, trunc);
return __blkdev_direct_IO_async(iocb, iter, nr_pages, bdev,
- pos);
+ pos, skip, trunc);
}
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages), bdev,
- pos);
+ pos, skip, trunc);
}
static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 11/12] iomap: add direct-io partial sector read support
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (9 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 10/12] block: add direct-io partial sector read support Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
2022-06-30 20:42 ` [PATCH 12/12] block: export and document bit_bucket attribute Keith Busch
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Enable direct io to read partial sectors if the block device supports bit
buckets.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
fs/iomap/direct-io.c | 36 ++++++++++++++++++++++++++++--------
1 file changed, 28 insertions(+), 8 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 10a113358365..212e63b78950 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -251,9 +251,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
int nr_pages, ret = 0;
size_t copied = 0;
size_t orig_count;
+ u16 skip = 0, trunc = 0;
if (blkdev_dio_unaligned(bdev, pos | length, dio->submit.iter))
- return -EINVAL;
+ if (!blkdev_bit_bucket(bdev, pos, length, dio->submit.iter,
+ &skip, &trunc))
+ return -EINVAL;
if (iomap->type == IOMAP_UNWRITTEN) {
dio->flags |= IOMAP_DIO_UNWRITTEN;
@@ -310,9 +313,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
*/
bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
- nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
+ nr_pages = bio_iov_vecs_to_alloc_partial(dio->submit.iter, BIO_MAX_VECS,
+ skip, trunc);
do {
- size_t n;
+ size_t n, bucket_bytes = 0;
if (dio->error) {
iov_iter_revert(dio->submit.iter, copied);
copied = ret = 0;
@@ -327,6 +331,15 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
+ if (skip || trunc) {
+ bio_set_flag(bio, BIO_BIT_BUCKET);
+ if (skip) {
+ bucket_bytes += skip;
+ blk_add_bb_page(bio, skip);
+ skip = 0;
+ }
+ }
+
ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
if (unlikely(ret)) {
/*
@@ -339,6 +352,12 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto zero_tail;
}
+ if (trunc && !iov_iter_count(dio->submit.iter)) {
+ blk_add_bb_page(bio, trunc);
+ bucket_bytes += trunc;
+ trunc = 0;
+ }
+
n = bio->bi_iter.bi_size;
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
@@ -347,18 +366,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
bio_set_pages_dirty(bio);
}
- dio->size += n;
- copied += n;
+ dio->size += n - bucket_bytes;
+ copied += n - bucket_bytes;
- nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
- BIO_MAX_VECS);
+ nr_pages = bio_iov_vecs_to_alloc_partial(dio->submit.iter,
+ BIO_MAX_VECS, skip,
+ trunc);
/*
* We can only poll for single bio I/Os.
*/
if (nr_pages)
dio->iocb->ki_flags &= ~IOCB_HIPRI;
iomap_dio_submit_bio(iter, dio, bio, pos);
- pos += n;
+ pos += n - bucket_bytes;
} while (nr_pages);
/*
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH 12/12] block: export and document bit_bucket attribute
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
` (10 preceding siblings ...)
2022-06-30 20:42 ` [PATCH 11/12] iomap: " Keith Busch
@ 2022-06-30 20:42 ` Keith Busch
11 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-06-30 20:42 UTC (permalink / raw)
To: linux-fsdevel, linux-block, linux-nvme
Cc: axboe, Kernel Team, hch, willy, sagi, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Now that user space applications can make use of partial sector reads,
export and document the queue attribute that indicates if the capability
is supported.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
Documentation/ABI/stable/sysfs-block | 9 +++++++++
block/blk-sysfs.c | 3 +++
2 files changed, 12 insertions(+)
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index cd14ecb3c9a5..defc056690d0 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -142,6 +142,15 @@ Description:
Default value of this file is '1'(on).
+What: /sys/block/<disk>/queue/bit_bucket
+Date: June 2022
+Contact: linux-block@vger.kernel.org
+Description:
+ [RO] This file indicates if the device supports partial sector
+ reads. If set to '1', user space can issue direct IO reads at
+ dma_alignment granularity.
+
+
What: /sys/block/<disk>/queue/chunk_sectors
Date: September 2016
Contact: Hannes Reinecke <hare@suse.com>
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 85ea43eff094..0c1f1c2fbb30 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -309,6 +309,7 @@ QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
+QUEUE_SYSFS_BIT_FNS(bit_bucket, BIT_BUCKET, 0);
#undef QUEUE_SYSFS_BIT_FNS
static ssize_t queue_zoned_show(struct request_queue *q, char *page)
@@ -627,6 +628,7 @@ QUEUE_RW_ENTRY(queue_nonrot, "rotational");
QUEUE_RW_ENTRY(queue_iostats, "iostats");
QUEUE_RW_ENTRY(queue_random, "add_random");
QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
+QUEUE_RW_ENTRY(queue_bit_bucket, "bit_bucket");
static struct attribute *queue_attrs[] = {
&queue_requests_entry.attr,
@@ -653,6 +655,7 @@ static struct attribute *queue_attrs[] = {
&queue_zone_append_max_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_nonrot_entry.attr,
+ &queue_bit_bucket_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
&queue_max_open_zones_entry.attr,
--
2.30.2
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 07/12] block: allow copying pre-registered bvecs
2022-06-30 20:42 ` [PATCH 07/12] block: allow copying pre-registered bvecs Keith Busch
@ 2022-07-01 2:40 ` Keith Busch
0 siblings, 0 replies; 14+ messages in thread
From: Keith Busch @ 2022-07-01 2:40 UTC (permalink / raw)
To: Keith Busch
Cc: linux-fsdevel, linux-block, linux-nvme, axboe, Kernel Team, hch,
willy, sagi
On Thu, Jun 30, 2022 at 01:42:07PM -0700, Keith Busch wrote:
> void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
> {
> size_t size = iov_iter_count(iter);
>
> WARN_ON_ONCE(bio->bi_max_vecs);
>
> + if (bio->bi_max_vecs) {
> + bio_copy_bvec(bio, iter);
> + return;
> + }
Obviously the WARN_ON_ONCE needs to go away with this.
And with the follow on users in this series, there's also a bug with putting
page references on these at bio_endio, so don't try testing pre-registered
buffers with this series. I'll send a fix in the v2 if we get that far.
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2022-07-01 2:41 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-30 20:42 [PATCH 00/12] block: support for partial sector reads Keith Busch
2022-06-30 20:42 ` [PATCH 01/12] block: move direct io alignment check to common Keith Busch
2022-06-30 20:42 ` [PATCH 02/12] iomap: save copy of bdev for direct io Keith Busch
2022-06-30 20:42 ` [PATCH 03/12] iomap: get logical block size directly Keith Busch
2022-06-30 20:42 ` [PATCH 04/12] iomap: use common blkdev alignment check Keith Busch
2022-06-30 20:42 ` [PATCH 05/12] block: add bit bucket capabilities Keith Busch
2022-06-30 20:42 ` [PATCH 06/12] nvme: add support for bit buckets Keith Busch
2022-06-30 20:42 ` [PATCH 07/12] block: allow copying pre-registered bvecs Keith Busch
2022-07-01 2:40 ` Keith Busch
2022-06-30 20:42 ` [PATCH 08/12] block: add bio number of vecs helper for partials Keith Busch
2022-06-30 20:42 ` [PATCH 09/12] block: add partial sector parameter helper Keith Busch
2022-06-30 20:42 ` [PATCH 10/12] block: add direct-io partial sector read support Keith Busch
2022-06-30 20:42 ` [PATCH 11/12] iomap: " Keith Busch
2022-06-30 20:42 ` [PATCH 12/12] block: export and document bit_bucket attribute Keith Busch
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox