- * [PATCHv2 1/7] blk-mq: add ops to dma map bvec
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
  To: linux-nvme, linux-block, io-uring, linux-fsdevel
  Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <kbusch@kernel.org>
The same buffer may be used for many subsequent IOs. Instead of setting
up the mapping per-IO, provide an interface that allows a buffer to be
premapped just once and referenced again later.
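For reference, the intended calling pattern is roughly the following
sketch; example_premap_io() is a hypothetical caller, only
block_dma_map() and block_dma_unmap() come from this patch:

static int example_premap_io(struct block_device *bdev,
			     struct bio_vec *bvec, int nr_vecs)
{
	void *dma_tag;

	/* map the registered bvec once; ERR_PTR means no ->dma_map() support */
	dma_tag = block_dma_map(bdev, bvec, nr_vecs);
	if (IS_ERR(dma_tag))
		return PTR_ERR(dma_tag);

	/* ... issue any number of IOs that reference dma_tag ... */

	block_dma_unmap(bdev, dma_tag);
	return 0;
}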
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bdev.c           | 20 ++++++++++++++++++++
 include/linux/blk-mq.h | 13 +++++++++++++
 include/linux/blkdev.h | 16 ++++++++++++++++
 3 files changed, 49 insertions(+)
diff --git a/block/bdev.c b/block/bdev.c
index ce05175e71ce..c3d73ad86fae 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1069,3 +1069,23 @@ void sync_bdevs(bool wait)
 	spin_unlock(&blockdev_superblock->s_inode_list_lock);
 	iput(old_inode);
 }
+
+#ifdef CONFIG_HAS_DMA
+void *block_dma_map(struct block_device *bdev, struct bio_vec *bvec,
+		    int nr_vecs)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q->mq_ops && q->mq_ops->dma_map)
+		return q->mq_ops->dma_map(q, bvec, nr_vecs);
+	return ERR_PTR(-EINVAL);
+}
+
+void block_dma_unmap(struct block_device *bdev, void *dma_tag)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q->mq_ops && q->mq_ops->dma_unmap)
+		return q->mq_ops->dma_unmap(q, dma_tag);
+}
+#endif
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index effee1dc715a..e10aabb36c2c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -639,6 +639,19 @@ struct blk_mq_ops {
 	 */
 	void (*show_rq)(struct seq_file *m, struct request *rq);
 #endif
+
+#ifdef CONFIG_HAS_DMA
+	/**
+	 * @dma_map: Create a dma mapping. On success, returns an opaque cookie
+	 * that can be referenced by the driver in future requests.
+	 */
+	void *(*dma_map)(struct request_queue *q, struct bio_vec *bvec, int nr_vecs);
+
+	/**
+	 * @dma_unmap: Tear down a previously created dma mapping.
+	 */
+	void (*dma_unmap)(struct request_queue *q, void *dma_tag);
+#endif
 };
 
 enum {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 49dcd31e283e..efc5e805a46e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1527,4 +1527,20 @@ struct io_comp_batch {
 
 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }
 
+#ifdef CONFIG_HAS_DMA
+void *block_dma_map(struct block_device *bdev, struct bio_vec *bvec,
+		    int nr_vecs);
+void block_dma_unmap(struct block_device *bdev, void *dma_tag);
+#else
+static inline void *block_dma_map(struct block_device *bdev,
+				  struct bio_vec *bvec, int nr_vecs)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
+
+static inline void block_dma_unmap(struct block_device *bdev, void *dma_tag)
+{
+}
+#endif
+
 #endif /* _LINUX_BLKDEV_H */
-- 
2.30.2
- * [PATCHv2 2/7] file: add ops to dma map bvec
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
  To: linux-nvme, linux-block, io-uring, linux-fsdevel
  Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <kbusch@kernel.org>
The same buffer may be used for many subsequent IOs. Instead of setting
up the mapping per-IO, provide an interface that allows a buffer to be
premapped just once and referenced again later, and implement it for the
block device file.
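Any other file implementation could opt in the same way by wiring up the
new file_operations hooks; a minimal sketch follows (the example_*
callbacks are hypothetical, only the .dma_map/.dma_unmap signatures come
from this patch):

#ifdef CONFIG_HAS_DMA
static void *example_dma_map(struct file *filp, struct bio_vec *bvec,
			     int nr_vecs)
{
	/* translate the bvec to the backing device's premapped representation */
	return ERR_PTR(-EOPNOTSUPP);
}

static void example_dma_unmap(struct file *filp, void *dma_tag)
{
	/* release whatever ->dma_map() returned */
}
#endif

static const struct file_operations example_fops = {
	/* ... usual read_iter/write_iter/etc ... */
#ifdef CONFIG_HAS_DMA
	.dma_map	= example_dma_map,
	.dma_unmap	= example_dma_unmap,
#endif
};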
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/fops.c       | 20 ++++++++++++++++++++
 fs/file.c          | 15 +++++++++++++++
 include/linux/fs.h | 20 ++++++++++++++++++++
 3 files changed, 55 insertions(+)
diff --git a/block/fops.c b/block/fops.c
index 29066ac5a2fa..db2d1e848f4b 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -670,6 +670,22 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 	return error;
 }
 
+#ifdef CONFIG_HAS_DMA
+void *blkdev_dma_map(struct file *filp, struct bio_vec *bvec, int nr_vecs)
+{
+	struct block_device *bdev = filp->private_data;
+
+	return block_dma_map(bdev, bvec, nr_vecs);
+}
+
+void blkdev_dma_unmap(struct file *filp, void *dma_tag)
+{
+	struct block_device *bdev = filp->private_data;
+
+	return block_dma_unmap(bdev, dma_tag);
+}
+#endif
+
 const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
@@ -686,6 +702,10 @@ const struct file_operations def_blk_fops = {
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= blkdev_fallocate,
+#ifdef CONFIG_HAS_DMA
+	.dma_map	= blkdev_dma_map,
+	.dma_unmap	= blkdev_dma_unmap,
+#endif
 };
 
 static __init int blkdev_init(void)
diff --git a/fs/file.c b/fs/file.c
index 3bcc1ecc314a..767bf9d3205e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1307,3 +1307,18 @@ int iterate_fd(struct files_struct *files, unsigned n,
 	return res;
 }
 EXPORT_SYMBOL(iterate_fd);
+
+#ifdef CONFIG_HAS_DMA
+void *file_dma_map(struct file *file, struct bio_vec *bvec, int nr_vecs)
+{
+	if (file->f_op->dma_map)
+		return file->f_op->dma_map(file, bvec, nr_vecs);
+	return ERR_PTR(-EINVAL);
+}
+
+void file_dma_unmap(struct file *file, void *dma_tag)
+{
+	if (file->f_op->dma_unmap)
+		return file->f_op->dma_unmap(file, dma_tag);
+}
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0d99b5a166b..befd8ea5821a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1964,6 +1964,10 @@ struct dir_context {
 struct iov_iter;
 struct io_uring_cmd;
 
+#ifdef CONFIG_HAS_DMA
+struct bio_vec;
+#endif
+
 struct file_operations {
 	struct module *owner;
 	loff_t (*llseek) (struct file *, loff_t, int);
@@ -2006,6 +2010,10 @@ struct file_operations {
 				   loff_t len, unsigned int remap_flags);
 	int (*fadvise)(struct file *, loff_t, loff_t, int);
 	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
+#ifdef CONFIG_HAS_DMA
+	void *(*dma_map)(struct file *, struct bio_vec *, int);
+	void (*dma_unmap)(struct file *, void *);
+#endif
 } __randomize_layout;
 
 struct inode_operations {
@@ -3467,4 +3475,16 @@ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
 extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 			   int advice);
 
+#ifdef CONFIG_HAS_DMA
+void *file_dma_map(struct file *file, struct bio_vec *bvec, int nr_vecs);
+void file_dma_unmap(struct file *file, void *dma_tag);
+#else
+static inline void *file_dma_map(struct file *file, struct bio_vec *bvec,
+				 int nr_vecs)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
+static inline void file_dma_unmap(struct file *file, void *dma_tag) {}
+#endif
+
 #endif /* _LINUX_FS_H */
-- 
2.30.2
- * [PATCHv2 3/7] iov_iter: introduce type for preregistered dma tags
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
  To: linux-nvme, linux-block, io-uring, linux-fsdevel
  Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Introduce a new iov_iter type representing a pre-registered DMA address
tag. The tag is an opaque cookie specific to the lower-level driver that
created it, and can be referenced at an arbitrary offset.
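A minimal sketch of how a consumer builds and detects the new iterator
type (the wrapper function and its arguments are illustrative;
iov_iter_dma_tag() and iov_iter_is_dma_tag() are defined below):

static void example_build_dma_iter(struct iov_iter *iter, void *dma_tag,
				   unsigned int offset, size_t len)
{
	/* wrap the opaque tag; no pages or bvecs are referenced */
	iov_iter_dma_tag(iter, WRITE, dma_tag, offset, len);

	/* lower layers branch on the type instead of walking segments */
	WARN_ON(!iov_iter_is_dma_tag(iter));
}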
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/uio.h |  9 +++++++++
 lib/iov_iter.c      | 24 +++++++++++++++++++++---
 2 files changed, 30 insertions(+), 3 deletions(-)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 34ba4a731179..a55e4b86413a 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -26,6 +26,7 @@ enum iter_type {
 	ITER_PIPE,
 	ITER_XARRAY,
 	ITER_DISCARD,
+	ITER_DMA_TAG,
 };
 
 struct iov_iter_state {
@@ -46,6 +47,7 @@ struct iov_iter {
 		const struct bio_vec *bvec;
 		struct xarray *xarray;
 		struct pipe_inode_info *pipe;
+		void *dma_tag;
 	};
 	union {
 		unsigned long nr_segs;
@@ -85,6 +87,11 @@ static inline bool iov_iter_is_bvec(const struct iov_iter *i)
 	return iov_iter_type(i) == ITER_BVEC;
 }
 
+static inline bool iov_iter_is_dma_tag(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_DMA_TAG;
+}
+
 static inline bool iov_iter_is_pipe(const struct iov_iter *i)
 {
 	return iov_iter_type(i) == ITER_PIPE;
@@ -229,6 +236,8 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec
 			unsigned long nr_segs, size_t count);
 void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
 			unsigned long nr_segs, size_t count);
+void iov_iter_dma_tag(struct iov_iter *i, unsigned int direction, void *dma_tag,
+			unsigned int dma_offset, size_t count);
 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe,
 			size_t count);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 507e732ef7cf..d370b45d7f1b 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1077,6 +1077,9 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		i->count -= size;
 	} else if (iov_iter_is_discard(i)) {
 		i->count -= size;
+	} else if (iov_iter_is_dma_tag(i)) {
+		i->iov_offset += size;
+		i->count -= size;
 	}
 }
 EXPORT_SYMBOL(iov_iter_advance);
@@ -1201,6 +1204,21 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_bvec);
 
+void iov_iter_dma_tag(struct iov_iter *i, unsigned int direction,
+			void *dma_tag, unsigned int dma_offset,
+			size_t count)
+{
+	WARN_ON(direction & ~(READ | WRITE));
+	*i = (struct iov_iter){
+		.iter_type = ITER_DMA_TAG,
+		.data_source = direction,
+		.dma_tag = dma_tag,
+		.iov_offset = dma_offset,
+		.count = count
+	};
+}
+EXPORT_SYMBOL(iov_iter_dma_tag);
+
 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
 			struct pipe_inode_info *pipe,
 			size_t count)
@@ -2124,8 +2142,8 @@ EXPORT_SYMBOL(import_single_range);
  */
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 {
-	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
-			 !iov_iter_is_kvec(i))
+	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
+			 !iov_iter_is_dma_tag(i)) && !iov_iter_is_kvec(i))
 		return;
 	i->iov_offset = state->iov_offset;
 	i->count = state->count;
@@ -2141,7 +2159,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
 	if (iov_iter_is_bvec(i))
 		i->bvec -= state->nr_segs - i->nr_segs;
-	else
+	else if (!iov_iter_is_dma_tag(i))
 		i->iov -= state->nr_segs - i->nr_segs;
 	i->nr_segs = state->nr_segs;
 }
-- 
2.30.2
- * [PATCHv2 4/7] block: add dma tag bio type
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
  To: linux-nvme, linux-block, io-uring, linux-fsdevel
  Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Premapped buffers don't require a generic bio_vec since they have already
been DMA mapped into the driver-specific data structure. Repurpose
bi_io_vec to carry the driver-specific tag, as the two are mutually
exclusive, and provide all the setup and split helpers needed to support
dma tags.
In order to use this, a driver must implement the .dma_map() blk-mq op. If
the driver provides this callback, it must be aware that any given bio may
be using a dma_tag instead of a bio_vec.
Note that this does not currently work with blk_integrity.
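A sketch of how a driver that implements ->dma_map() is expected to
branch (blk_rq_dma_tag() and blk_rq_dma_offset() are added below; the
surrounding function is illustrative):

static blk_status_t example_map_data(struct request *rq)
{
	void *dma_tag = blk_rq_dma_tag(rq);

	if (dma_tag) {
		/*
		 * Premapped: consume dma_tag plus blk_rq_dma_offset(rq);
		 * rq->bio->bi_io_vec is not a valid bio_vec list here.
		 */
		return BLK_STS_OK;
	}

	/* normal path: bi_io_vec is a real bio_vec list */
	return BLK_STS_OK;
}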
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/bio.c               | 25 ++++++++++++++++++++++++-
 block/blk-merge.c         | 19 +++++++++++++++++++
 include/linux/bio.h       | 21 ++++++++++++---------
 include/linux/blk-mq.h    | 11 +++++++++++
 include/linux/blk_types.h |  6 +++++-
 5 files changed, 71 insertions(+), 11 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index d6eb90d9b20b..3b7accae8996 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -229,7 +229,8 @@ static void bio_free(struct bio *bio)
 	WARN_ON_ONCE(!bs);
 
 	bio_uninit(bio);
-	bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
+	if (!bio_flagged(bio, BIO_DMA_TAGGED))
+		bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
 	mempool_free(p - bs->front_pad, &bs->bio_pool);
 }
 
@@ -762,6 +763,8 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
 	bio_set_flag(bio, BIO_CLONED);
 	if (bio_flagged(bio_src, BIO_THROTTLED))
 		bio_set_flag(bio, BIO_THROTTLED);
+	if (bio_flagged(bio_src, BIO_DMA_TAGGED))
+		bio_set_flag(bio, BIO_DMA_TAGGED);
 	bio->bi_ioprio = bio_src->bi_ioprio;
 	bio->bi_iter = bio_src->bi_iter;
 
@@ -1151,6 +1154,21 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
+static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
+{
+	size_t size = iov_iter_count(iter);
+
+	bio->bi_vcnt = iter->nr_segs;
+	bio->bi_dma_tag = iter->dma_tag;
+	bio->bi_iter.bi_bvec_done = iter->iov_offset;
+	bio->bi_iter.bi_size = size;
+	bio->bi_opf |= REQ_NOMERGE;
+	bio_set_flag(bio, BIO_NO_PAGE_REF);
+	bio_set_flag(bio, BIO_DMA_TAGGED);
+
+	iov_iter_advance(iter, bio->bi_iter.bi_size);
+}
+
 static int bio_iov_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int offset)
 {
@@ -1287,6 +1305,11 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 		return 0;
 	}
 
+	if (iov_iter_is_dma_tag(iter)) {
+		bio_iov_dma_tag_set(bio, iter);
+		return 0;
+	}
+
 	do {
 		ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ff04e9290715..d024885ad4c4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -274,6 +274,25 @@ static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
 	struct bvec_iter iter;
 	unsigned nsegs = 0, bytes = 0;
 
+	if (bio_flagged(bio, BIO_DMA_TAGGED)) {
+		int offset = offset_in_page(bio->bi_iter.bi_bvec_done);
+
+		nsegs = ALIGN(bio->bi_iter.bi_size + offset, PAGE_SIZE) >>
+								PAGE_SHIFT;
+		if (bio->bi_iter.bi_size > max_bytes) {
+			bytes = max_bytes;
+			nsegs = (bytes + offset) >> PAGE_SHIFT;
+		} else if (nsegs > lim->max_segments) {
+			nsegs = lim->max_segments;
+			bytes = PAGE_SIZE * nsegs - offset;
+		} else {
+			*segs = nsegs;
+			return NULL;
+		}
+
+		goto split;
+	}
+
 	bio_for_each_bvec(bv, bio, iter) {
 		/*
 		 * If the queue doesn't support SG gaps and adding this
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ca22b06700a9..649348bc03c2 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -61,11 +61,17 @@ static inline bool bio_has_data(struct bio *bio)
 	return false;
 }
 
+static inline bool bio_flagged(const struct bio *bio, unsigned int bit)
+{
+	return (bio->bi_flags & (1U << bit)) != 0;
+}
+
 static inline bool bio_no_advance_iter(const struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
 	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
-	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
+	       bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+	       bio_flagged(bio, BIO_DMA_TAGGED);
 }
 
 static inline void *bio_data(struct bio *bio)
@@ -98,9 +104,11 @@ static inline void bio_advance_iter(const struct bio *bio,
 {
 	iter->bi_sector += bytes >> 9;
 
-	if (bio_no_advance_iter(bio))
+	if (bio_no_advance_iter(bio)) {
 		iter->bi_size -= bytes;
-	else
+		if (bio_flagged(bio, BIO_DMA_TAGGED))
+			iter->bi_bvec_done += bytes;
+	} else
 		bvec_iter_advance(bio->bi_io_vec, iter, bytes);
 		/* TODO: It is reasonable to complete bio with error here. */
 }
@@ -225,11 +233,6 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
 	atomic_set(&bio->__bi_cnt, count);
 }
 
-static inline bool bio_flagged(struct bio *bio, unsigned int bit)
-{
-	return (bio->bi_flags & (1U << bit)) != 0;
-}
-
 static inline void bio_set_flag(struct bio *bio, unsigned int bit)
 {
 	bio->bi_flags |= (1U << bit);
@@ -447,7 +450,7 @@ static inline void bio_wouldblock_error(struct bio *bio)
  */
 static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 {
-	if (iov_iter_is_bvec(iter))
+	if (iov_iter_is_bvec(iter) || iov_iter_is_dma_tag(iter))
 		return 0;
 	return iov_iter_npages(iter, max_segs);
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e10aabb36c2c..f5e0aa61bf85 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1141,6 +1141,17 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 }
 void blk_dump_rq_flags(struct request *, char *);
 
+static inline void *blk_rq_dma_tag(struct request *rq)
+{
+	return rq->bio && bio_flagged(rq->bio, BIO_DMA_TAGGED) ?
+		rq->bio->bi_dma_tag : 0;
+}
+
+static inline size_t blk_rq_dma_offset(struct request *rq)
+{
+	return rq->bio->bi_iter.bi_bvec_done;
+}
+
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_rq_zone_no(struct request *rq)
 {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1ef99790f6ed..ea6db439acbe 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -299,7 +299,10 @@ struct bio {
 
 	atomic_t		__bi_cnt;	/* pin count */
 
-	struct bio_vec		*bi_io_vec;	/* the actual vec list */
+	union {
+		struct bio_vec		*bi_io_vec;	/* the actual vec list */
+		void			*bi_dma_tag;    /* driver specific tag */
+	};
 
 	struct bio_set		*bi_pool;
 
@@ -334,6 +337,7 @@ enum {
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
+	BIO_DMA_TAGGED,		/* Using premapped dma buffers */
 	BIO_FLAG_LAST
 };
 
-- 
2.30.2
- * [PATCHv2 5/7] io_uring: introduce file slot release helper
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
  To: linux-nvme, linux-block, io-uring, linux-fsdevel
  Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Releasing the pre-registered file follows a repeated pattern. Introduce
a helper to make it easier to add more complexity to this resource in
the future.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 io_uring/filetable.c | 33 +++++++++++++++++++++------------
 io_uring/filetable.h |  3 +++
 io_uring/rsrc.c      |  5 +----
 3 files changed, 25 insertions(+), 16 deletions(-)
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 7b473259f3f4..1b8db1918678 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -76,19 +76,13 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
 
 	if (file_slot->file_ptr) {
-		struct file *old_file;
-
 		ret = io_rsrc_node_switch_start(ctx);
 		if (ret)
 			goto err;
 
-		old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
-					    ctx->rsrc_node, old_file);
+		ret = io_file_slot_queue_removal(ctx, file_slot);
 		if (ret)
 			goto err;
-		file_slot->file_ptr = 0;
-		io_file_bitmap_clear(&ctx->file_table, slot_index);
 		needs_switch = true;
 	}
 
@@ -148,7 +142,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 {
 	struct io_fixed_file *file_slot;
-	struct file *file;
 	int ret;
 
 	if (unlikely(!ctx->file_data))
@@ -164,13 +157,10 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 	if (!file_slot->file_ptr)
 		return -EBADF;
 
-	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
+	ret = io_file_slot_queue_removal(ctx, file_slot);
 	if (ret)
 		return ret;
 
-	file_slot->file_ptr = 0;
-	io_file_bitmap_clear(&ctx->file_table, offset);
 	io_rsrc_node_switch(ctx, ctx->file_data);
 	return 0;
 }
@@ -191,3 +181,22 @@ int io_register_file_alloc_range(struct io_ring_ctx *ctx,
 	io_file_table_set_alloc_range(ctx, range.off, range.len);
 	return 0;
 }
+
+int io_file_slot_queue_removal(struct io_ring_ctx *ctx,
+			       struct io_fixed_file *file_slot)
+{
+	u32 slot_index = file_slot - ctx->file_table.files;
+	struct file *file;
+	int ret;
+
+	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+	ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
+				    ctx->rsrc_node, file);
+	if (ret)
+		return ret;
+
+	file_slot->file_ptr = 0;
+	io_file_bitmap_clear(&ctx->file_table, slot_index);
+
+	return 0;
+}
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index ff3a712e11bf..e52ecf359199 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -34,6 +34,9 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
 int io_register_file_alloc_range(struct io_ring_ctx *ctx,
 				 struct io_uring_file_index_range __user *arg);
 
+int io_file_slot_queue_removal(struct io_ring_ctx *ctx,
+			       struct io_fixed_file *file_slot);
+
 unsigned int io_file_get_flags(struct file *file);
 
 static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..1f10eecad4d7 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -469,12 +469,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		file_slot = io_fixed_file_slot(&ctx->file_table, i);
 
 		if (file_slot->file_ptr) {
-			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
+			err = io_file_slot_queue_removal(ctx, file_slot);
 			if (err)
 				break;
-			file_slot->file_ptr = 0;
-			io_file_bitmap_clear(&ctx->file_table, i);
 			needs_switch = true;
 		}
 		if (fd != -1) {
-- 
2.30.2
- * [PATCHv2 6/7] io_uring: add support for dma pre-mapping
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
  To: linux-nvme, linux-block, io-uring, linux-fsdevel
  Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Provide a new register operation that can request to pre-map a known
bvec to the requested fixed file's specific implementation. If
successful, io_uring will use the returned dma tag for future fixed
buffer requests to the same file.
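From userspace, the new opcodes are driven through io_uring_register(2)
with the struct added to the uapi header below; a rough sketch (the ring
setup and the registered files/buffers are assumed to exist already):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int example_map_buffers(int ring_fd, int fixed_file_index,
			       int buf_start, int buf_end)
{
	struct io_uring_map_buffers map = {
		.fd		= fixed_file_index,	/* index into registered files */
		.buf_start	= buf_start,		/* first registered buffer */
		.buf_end	= buf_end,		/* one past the last buffer */
	};

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_MAP_BUFFERS, &map, 1);
}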
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 include/linux/io_uring_types.h |   2 +
 include/uapi/linux/io_uring.h  |  12 +++
 io_uring/filetable.c           |   7 +-
 io_uring/filetable.h           |   7 +-
 io_uring/io_uring.c            | 137 +++++++++++++++++++++++++++++++++
 io_uring/net.c                 |   2 +-
 io_uring/rsrc.c                |  21 +++--
 io_uring/rsrc.h                |  10 ++-
 io_uring/rw.c                  |   2 +-
 9 files changed, 185 insertions(+), 15 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f7fab3758cb9..f62ea17cc480 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -23,6 +23,8 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
+struct io_mapped_ubuf;
+
 struct io_fixed_file {
 	/* file * with additional FFS_* flags */
 	unsigned long file_ptr;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
 	IORING_REGISTER_NOTIFIERS		= 26,
 	IORING_UNREGISTER_NOTIFIERS		= 27,
 
+	/* dma map registered buffers */
+	IORING_REGISTER_MAP_BUFFERS		= 28,
+	IORING_REGISTER_UNMAP_BUFFERS		= 29,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
 	__u32 flags;
 };
 
+struct io_uring_map_buffers {
+	__s32	fd;
+	__s32	buf_start;
+	__s32	buf_end;
+	__u32	flags;
+	__u64	rsvd[2];
+};
+
 #endif
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 1b8db1918678..5ca2f27f317f 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -189,9 +189,10 @@ int io_file_slot_queue_removal(struct io_ring_ctx *ctx,
 	struct file *file;
 	int ret;
 
-	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-	ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
-				    ctx->rsrc_node, file);
+	file = io_file_from_fixed(file_slot);
+	io_dma_unmap_file(ctx, file_slot);
+	ret = io_queue_rsrc_removal(ctx->file_data, slot_index, ctx->rsrc_node,
+				    file);
 	if (ret)
 		return ret;
 
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index e52ecf359199..3b2aae5bff76 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -58,12 +58,17 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i)
 	return &table->files[i];
 }
 
+static inline struct file *io_file_from_fixed(struct io_fixed_file *f)
+{
+	return (struct file *) (f->file_ptr & FFS_MASK);
+}
+
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
 	struct io_fixed_file *slot = io_fixed_file_slot(table, index);
 
-	return (struct file *) (slot->file_ptr & FFS_MASK);
+	return io_file_from_fixed(slot);
 }
 
 static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b54218da075c..f5be488eaf21 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3681,6 +3681,131 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+static int get_map_range(struct io_ring_ctx *ctx,
+			 struct io_uring_map_buffers *map, void __user *arg)
+{
+	int ret;
+
+	if (copy_from_user(map, arg, sizeof(*map)))
+		return -EFAULT;
+	if (map->flags || map->rsvd[0] || map->rsvd[1])
+		return -EINVAL;
+	if (map->fd >= ctx->nr_user_files)
+		return -EINVAL;
+	if (map->buf_start < 0)
+		return -EINVAL;
+	if (map->buf_start >= ctx->nr_user_bufs)
+		return -EINVAL;
+	if (map->buf_end > ctx->nr_user_bufs)
+		map->buf_end = ctx->nr_user_bufs;
+
+	ret = map->buf_end - map->buf_start;
+	if (ret <= 0)
+		return -EINVAL;
+
+	return ret;
+}
+
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+	struct file *file;
+
+	if (!imu->dma_tag)
+		return;
+
+	file = io_file_from_fixed(imu->dma_file);
+	file_dma_unmap(file, imu->dma_tag);
+	imu->dma_file = NULL;
+	imu->dma_tag = NULL;
+}
+
+void io_dma_unmap_file(struct io_ring_ctx *ctx, struct io_fixed_file *file_slot)
+{
+	int i;
+
+	for (i = 0; i < ctx->nr_user_bufs; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		if (!imu->dma_tag)
+			continue;
+
+		if (imu->dma_file == file_slot)
+			io_dma_unmap(imu);
+	}
+}
+
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	int i, ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+
+	return 0;
+}
+
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	struct io_fixed_file *file_slot;
+	struct file *file;
+	int ret, i;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	file_slot = io_fixed_file_slot(&ctx->file_table,
+			array_index_nospec(map.fd, ctx->nr_user_files));
+	if (!file_slot || !file_slot->file_ptr)
+		return -EBADF;
+
+	file = io_file_from_fixed(file_slot);
+	if (!(file->f_flags & O_DIRECT))
+		return -EOPNOTSUPP;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		void *tag;
+
+		if (imu->dma_tag) {
+			ret = -EBUSY;
+			goto err;
+		}
+
+		tag = file_dma_map(file, imu->bvec, imu->nr_bvecs);
+		if (IS_ERR(tag)) {
+			ret = PTR_ERR(tag);
+			goto err;
+		}
+
+		imu->dma_tag = tag;
+		imu->dma_file = file_slot;
+	}
+
+	return 0;
+err:
+	while (--i >= map.buf_start) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -3847,6 +3972,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_notif_unregister(ctx);
 		break;
+	case IORING_REGISTER_MAP_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_map_buffers(ctx, arg);
+		break;
+	case IORING_REGISTER_UNMAP_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_unmap_buffers(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 32fc3da04e41..2793fd7d99d5 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
 		ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
-					(u64)(uintptr_t)zc->buf, zc->len);
+					(u64)(uintptr_t)zc->buf, zc->len, NULL);
 		if (unlikely(ret))
 				return ret;
 	} else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 1f10eecad4d7..ee5e5284203d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 			unpin_user_page(imu->bvec[i].bv_page);
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
+		io_dma_unmap(imu);
 		kvfree(imu);
 	}
 	*slot = NULL;
@@ -809,12 +810,16 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	int i;
 
 	for (i = 0; i < ctx->nr_user_files; i++) {
-		struct file *file = io_file_from_index(&ctx->file_table, i);
+		struct io_fixed_file *f = io_fixed_file_slot(&ctx->file_table, i);
+		struct file *file;
 
-		if (!file)
+		if (!f)
 			continue;
-		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
+		if (f->file_ptr & FFS_SCM)
 			continue;
+
+		io_dma_unmap_file(ctx, f);
+		file = io_file_from_fixed(f);
 		io_file_bitmap_clear(&ctx->file_table, i);
 		fput(file);
 	}
@@ -1282,6 +1287,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
+	imu->dma_tag = NULL;
 	*pimu = imu;
 	ret = 0;
 done:
@@ -1356,9 +1362,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-			   struct io_mapped_ubuf *imu,
-			   u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file)
 {
 	u64 buf_end;
 	size_t offset;
@@ -1376,6 +1381,10 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
+	if (imu->dma_tag && file == io_file_from_fixed(imu->dma_file)) {
+		iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, len);
+		return 0;
+	}
 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
 
 	if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..47a2942aa537 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,8 @@ struct io_mapped_ubuf {
 	u64		ubuf_end;
 	unsigned int	nr_bvecs;
 	unsigned long	acct_pages;
+	void		*dma_tag;
+	struct io_fixed_file	*dma_file;
 	struct bio_vec	bvec[];
 };
 
@@ -64,9 +66,11 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 			 struct io_rsrc_data *data_to_kill);
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-			   struct io_mapped_ubuf *imu,
-			   u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file);
+
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+void io_dma_unmap_file(struct io_ring_ctx *ctx, struct io_fixed_file *file_slot);
 
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 	ssize_t ret;
 
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
 		if (ret)
 			return ERR_PTR(ret);
 		return NULL;
-- 
2.30.2
- * Re: [PATCHv2 6/7] io_uring: add support for dma pre-mapping
From: Ammar Faizi @ 2022-08-02 23:25 UTC (permalink / raw)
  To: Keith Busch
  Cc: Facebook Kernel Team, Jens Axboe, Christoph Hellwig, Al Viro,
	Keith Busch, Fernanda Ma'rouf, io-uring Mailing List,
	Linux Block Mailing List, Linux fsdevel Mailing List,
	Linux NVME Mailing List
On 8/3/22 2:36 AM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Provide a new register operation that can request to pre-map a known
> bvec to the requested fixed file's specific implementation. If
> successful, io_uring will use the returned dma tag for future fixed
> buffer requests to the same file.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
[...]
> +static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
> +{
> +	struct io_uring_map_buffers map;
> +	struct io_fixed_file *file_slot;
> +	struct file *file;
> +	int ret, i;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	ret = get_map_range(ctx, &map, arg);
> +	if (ret < 0)
> +		return ret;
> +
> +	file_slot = io_fixed_file_slot(&ctx->file_table,
> +			array_index_nospec(map.fd, ctx->nr_user_files));
> +	if (!file_slot || !file_slot->file_ptr)
> +		return -EBADF;
The @file_slot NULL-check doesn't make sense. The definition of
io_fixed_file_slot() is:
static inline struct io_fixed_file *
io_fixed_file_slot(struct io_file_table *table, unsigned i)
{
         return &table->files[i];
}
which takes the address of an element in the array. So @file_slot
should never be NULL; if it ever is, something has gone wrong.
If you ever had @ctx->file_table.files being NULL in this path, you
should NULL-check the @->files itself, *not* the return value of
io_fixed_file_slot().
IOW:
...
	// NULL check here.
         if (!ctx->file_table.files)
                 return -EBADF;
         file_slot = io_fixed_file_slot(&ctx->file_table,
                                        array_index_nospec(map.fd, ctx->nr_user_files));
         if (!file_slot->file_ptr)
                 return -EBADF;
...
>   	for (i = 0; i < ctx->nr_user_files; i++) {
> -		struct file *file = io_file_from_index(&ctx->file_table, i);
> +		struct io_fixed_file *f = io_fixed_file_slot(&ctx->file_table, i);
> +		struct file *file;
>   
> -		if (!file)
> +		if (!f)
>   			continue;
The same thing, this @f NULL-check is not needed.
> -		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
> +		if (f->file_ptr & FFS_SCM)
>   			continue;
> +
> +		io_dma_unmap_file(ctx, f);
> +		file = io_file_from_fixed(f);
>   		io_file_bitmap_clear(&ctx->file_table, i);
>   		fput(file);
>   	}
-- 
Ammar Faizi
- * [PATCHv2 7/7] nvme-pci: implement dma_map support
From: Keith Busch @ 2022-08-02 19:36 UTC (permalink / raw)
  To: linux-nvme, linux-block, io-uring, linux-fsdevel
  Cc: axboe, hch, Alexander Viro, Kernel Team, Keith Busch
From: Keith Busch <kbusch@kernel.org>
Implement callbacks to convert a registered bio_vec to a prp list, and
use this for each IO that uses the returned tag. This saves repeated IO
conversions and dma mapping/unmapping. In many cases, the driver can
skip per-IO pool allocations entirely, saving potentially significant CPU
cycles.
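A simplified sketch of the fast path this enables (condensed from
nvme_premapped() below): the request's starting PRP is just an index into
the premapped PRP array, so no scatterlist and no per-IO dma_map_sg() are
needed. The helper name here is illustrative:

static __le64 example_first_prp(struct request *req,
				struct nvme_dma_mapping *mapping)
{
	int offset = blk_rq_dma_offset(req) + mapping->offset;
	int i = offset >> NVME_CTRL_PAGE_SHIFT;

	offset &= NVME_CTRL_PAGE_SIZE - 1;
	return cpu_to_le64(le64_to_cpu(mapping->prps[i]) + offset);
}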
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 302 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 292 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 644664098ae7..2df2b9bde7d7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -110,6 +110,14 @@ struct nvme_queue;
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
 
+struct nvme_dma_mapping {
+	int nr_pages;
+	u16 offset;
+	u8  rsvd[2];
+	dma_addr_t prp_dma_addr;
+	__le64 *prps;
+};
+
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
@@ -544,6 +552,34 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
 	return true;
 }
 
+static void nvme_sync_dma(struct nvme_dev *dev, struct request *req,
+			  struct nvme_dma_mapping *mapping)
+{
+	int offset, i, j, length, nprps;
+	bool needs_sync;
+
+	offset = blk_rq_dma_offset(req) + mapping->offset;
+	i = offset >> NVME_CTRL_PAGE_SHIFT;
+	needs_sync = rq_data_dir(req) == READ &&
+		 dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
+
+	if (!needs_sync)
+		return;
+
+	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+
+	dma_sync_single_for_cpu(dev->dev,
+		le64_to_cpu(mapping->prps[i++]),
+		NVME_CTRL_PAGE_SIZE - offset, DMA_FROM_DEVICE);
+	for (j = 1; j < nprps; j++) {
+		dma_sync_single_for_cpu(dev->dev,
+			le64_to_cpu(mapping->prps[i++]),
+			NVME_CTRL_PAGE_SIZE, DMA_FROM_DEVICE);
+	}
+}
+
 static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
 {
 	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
@@ -576,10 +612,24 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
 	}
 }
 
+static void nvme_free_prp_chain(struct nvme_dev *dev, struct request *req,
+				struct nvme_iod *iod)
+{
+	if (iod->npages == 0)
+		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+			      iod->first_dma);
+	else if (iod->use_sgl)
+		nvme_free_sgls(dev, req);
+	else
+		nvme_free_prps(dev, req);
+	mempool_free(iod->sg, dev->iod_mempool);
+}
+
 static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
+	WARN_ON_ONCE(!iod->nents);
 	if (is_pci_p2pdma_page(sg_page(iod->sg)))
 		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
 				    rq_dma_dir(req));
@@ -589,25 +639,24 @@ static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
 
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
+	struct nvme_dma_mapping *mapping = blk_rq_dma_tag(req);
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
+	if (mapping) {
+		nvme_sync_dma(dev, req, mapping);
+		if (iod->npages >= 0)
+			nvme_free_prp_chain(dev, req, iod);
+		return;
+	}
+
 	if (iod->dma_len) {
 		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
 			       rq_dma_dir(req));
 		return;
 	}
 
-	WARN_ON_ONCE(!iod->nents);
-
 	nvme_unmap_sg(dev, req);
-	if (iod->npages == 0)
-		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
-			      iod->first_dma);
-	else if (iod->use_sgl)
-		nvme_free_sgls(dev, req);
-	else
-		nvme_free_prps(dev, req);
-	mempool_free(iod->sg, dev->iod_mempool);
+	nvme_free_prp_chain(dev, req, iod);
 }
 
 static void nvme_print_sgl(struct scatterlist *sgl, int nents)
@@ -835,13 +884,136 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
 	return BLK_STS_OK;
 }
 
+static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
+				   struct nvme_rw_command *cmnd,
+				   struct nvme_iod *iod,
+				   struct nvme_dma_mapping *mapping)
+{
+	static const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
+	dma_addr_t prp_list_start, prp_list_end, prp_dma;
+	int i, offset, j, length, nprps, nprps_left;
+	struct dma_pool *pool;
+	__le64 *prp_list;
+	bool needs_sync;
+	void **list;
+
+	offset = blk_rq_dma_offset(req) + mapping->offset;
+	i = offset >> NVME_CTRL_PAGE_SHIFT;
+	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+	needs_sync = rq_data_dir(req) == WRITE &&
+		 dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
+
+	if (needs_sync) {
+		dma_sync_single_for_device(dev->dev,
+			le64_to_cpu(mapping->prps[i]),
+			NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
+	}
+
+	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+	cmnd->dptr.prp1 = cpu_to_le64(le64_to_cpu(mapping->prps[i++]) + offset);
+
+	if (length <= 0)
+		return BLK_STS_OK;
+
+	if (length <= NVME_CTRL_PAGE_SIZE) {
+		if (needs_sync)
+			dma_sync_single_for_device(dev->dev,
+				le64_to_cpu(mapping->prps[i]),
+				NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+		cmnd->dptr.prp2 = mapping->prps[i];
+		return BLK_STS_OK;
+	}
+
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+	nprps_left = nprps;
+	prp_list_start = mapping->prp_dma_addr + 8 * i;
+	prp_list_end = prp_list_start + 8 * nprps;
+
+	/* Optimization when remaining list fits in one nvme page */
+	if ((prp_list_start >> NVME_CTRL_PAGE_SHIFT) ==
+	    (prp_list_end >> NVME_CTRL_PAGE_SHIFT)) {
+		cmnd->dptr.prp2 = cpu_to_le64(prp_list_start);
+		goto sync;
+	}
+
+	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+	if (!iod->sg)
+		return BLK_STS_RESOURCE;
+
+	if (nprps <= (256 / 8)) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+	if (!prp_list) {
+		iod->npages = -1;
+		goto out_free_sg;
+	}
+
+	list = nvme_pci_iod_list(req);
+	list[0] = prp_list;
+	iod->first_dma = prp_dma;
+
+	for (;;) {
+		dma_addr_t next_prp_dma;
+		__le64 *next_prp_list;
+
+		if (nprps_left <= last_prp + 1) {
+			memcpy(prp_list, &mapping->prps[i], nprps_left * 8);
+			break;
+		}
+
+		memcpy(prp_list, &mapping->prps[i],
+		       NVME_CTRL_PAGE_SIZE - 8);
+		nprps_left -= last_prp;
+		i += last_prp;
+
+		next_prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &next_prp_dma);
+		if (!next_prp_list)
+			goto free_prps;
+
+		prp_list[last_prp] = cpu_to_le64(next_prp_dma);
+		prp_list = next_prp_list;
+		prp_dma = next_prp_dma;
+		list[iod->npages++] = prp_list;
+	}
+	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+
+sync:
+	if (!needs_sync)
+		return BLK_STS_OK;
+
+	i = offset >> NVME_CTRL_PAGE_SHIFT;
+	for (j = 0; j < nprps; j++)
+		dma_sync_single_for_device(dev->dev,
+			le64_to_cpu(mapping->prps[i++]),
+			NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+	return BLK_STS_OK;
+
+free_prps:
+	nvme_free_prps(dev, req);
+out_free_sg:
+	mempool_free(iod->sg, dev->iod_mempool);
+	return BLK_STS_RESOURCE;
+}
+
 static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		struct nvme_command *cmnd)
 {
+	struct nvme_dma_mapping *mapping = blk_rq_dma_tag(req);
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	blk_status_t ret = BLK_STS_RESOURCE;
 	int nr_mapped;
 
+	if (mapping) {
+		iod->dma_len = 0;
+		iod->use_sgl = false;
+		return nvme_premapped(dev, req, &cmnd->rw, iod, mapping);
+	}
+
 	if (blk_rq_nr_phys_segments(req) == 1) {
 		struct bio_vec bv = req_bvec(req);
 
@@ -1732,6 +1904,112 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 	return result;
 }
 
+#ifdef CONFIG_HAS_DMA
+/*
+ * Important: bvec must be describing a virtually contiguous buffer.
+ */
+static void *nvme_pci_dma_map(struct request_queue *q,
+			       struct bio_vec *bvec, int nr_vecs)
+{
+	const int nvme_pages = 1 << (PAGE_SHIFT - NVME_CTRL_PAGE_SHIFT);
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct nvme_dma_mapping *mapping;
+	int i, j, k, size, ppv, ret = -ENOMEM;
+
+	if (!nr_vecs)
+		return ERR_PTR(-EINVAL);
+
+	mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+	if (!mapping)
+		return ERR_PTR(-ENOMEM);
+
+	mapping->nr_pages = nr_vecs * nvme_pages;
+	size = sizeof(*mapping->prps) * mapping->nr_pages;
+	mapping->prps = dma_alloc_coherent(dev->dev, size,
+				&mapping->prp_dma_addr, GFP_KERNEL);
+	if (!mapping->prps)
+		goto free_mapping;
+
+	for (i = 0, k = 0; i < nr_vecs; i++) {
+		struct bio_vec *bv = bvec + i;
+		dma_addr_t dma_addr;
+
+		ppv = nvme_pages;
+		if (i == 0) {
+			mapping->offset = bv->bv_offset;
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		} else if (bv->bv_offset) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		if (bv->bv_offset + bv->bv_len != PAGE_SIZE &&
+		    i < nr_vecs - 1) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		dma_addr = dma_map_bvec(dev->dev, bv, 0, 0);
+		if (dma_mapping_error(dev->dev, dma_addr)) {
+			ret = -EIO;
+			goto err;
+		}
+
+		if (i == 0)
+			dma_addr -= mapping->offset;
+
+		for (j = 0; j < ppv; j++)
+			mapping->prps[k++] = cpu_to_le64(dma_addr +
+						j * NVME_CTRL_PAGE_SIZE);
+	}
+
+	get_device(dev->dev);
+	return mapping;
+
+err:
+	for (i = 0; i < k; i += ppv) {
+		__u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+		ppv = nvme_pages;
+
+		if (i == 0)
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		dma_unmap_page(dev->dev, dma_addr,
+			       PAGE_SIZE - offset_in_page(dma_addr), 0);
+	}
+
+	dma_free_coherent(dev->dev, size, (void *)mapping->prps,
+			  mapping->prp_dma_addr);
+free_mapping:
+	kfree(mapping);
+	return ERR_PTR(ret);
+}
+
+static void nvme_pci_dma_unmap(struct request_queue *q, void *dma_tag)
+{
+	const int nvme_pages = 1 << (PAGE_SHIFT - NVME_CTRL_PAGE_SHIFT);
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct nvme_dma_mapping *mapping = dma_tag;
+	int i, ppv;
+
+	for (i = 0; i < mapping->nr_pages; i += nvme_pages) {
+		__u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+		ppv = nvme_pages;
+
+		if (i == 0)
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		dma_unmap_page(dev->dev, dma_addr,
+			       PAGE_SIZE - offset_in_page(dma_addr), 0);
+	}
+
+	dma_free_coherent(dev->dev, mapping->nr_pages * sizeof(*mapping->prps),
+			  (void *)mapping->prps, mapping->prp_dma_addr);
+	kfree(mapping);
+	put_device(dev->dev);
+}
+#endif
+
 static const struct blk_mq_ops nvme_mq_admin_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
@@ -1750,6 +2028,10 @@ static const struct blk_mq_ops nvme_mq_ops = {
 	.map_queues	= nvme_pci_map_queues,
 	.timeout	= nvme_timeout,
 	.poll		= nvme_poll,
+#ifdef CONFIG_HAS_DMA
+	.dma_map	= nvme_pci_dma_map,
+	.dma_unmap	= nvme_pci_dma_unmap,
+#endif
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
-- 
2.30.2
- * Re: [PATCHv2 0/7] dma mapping optimisations
From: Jens Axboe @ 2022-08-03 20:52 UTC (permalink / raw)
  To: Keith Busch, linux-nvme, linux-block, io-uring, linux-fsdevel
  Cc: hch, Alexander Viro, Kernel Team, Keith Busch
On 8/2/22 1:36 PM, Keith Busch wrote:
> device undergoes various representations for every IO. Each consumes
> memory and CPU cycles. When the backing storage is NVMe, the sequence
> looks something like the following:
> 
>   __user void *
>   struct iov_iter
>   struct pages[]
>   struct bio_vec[]
>   struct scatterlist[]
>   __le64[]
> 
> Applications will often use the same buffer for many IO, though, so
> these potentially costly per-IO transformations to reach the exact same
> hardware descriptor can be skipped.
> 
> The io_uring interface already provides a way for users to register
> buffers to get to the 'struct bio_vec[]'. That still leaves the
> scatterlist needed for the repeated dma_map_sg(), then transform to
> nvme's PRP list format.
> 
> This series takes the registered buffers a step further. A block driver
> can implement a new .dma_map() callback to complete the representation
> to the hardware's DMA mapped address, and return a cookie so a user can
> reference it later for any given IO. When used, the block stack can skip
> significant amounts of code, improving CPU utilization, and, if not
> bandwidth limited, IOPs.
> 
> The implementation is currently limited to mapping a registered buffer
> to a single file.
I ran this on my test box to see how we'd do. First the bad news:
smaller block size IO seems slower. I ran with QD=8 and used 24 drives,
and using t/io_uring (with registered buffers, polled, etc) and a 512b
block size I get:
IOPS=44.36M, BW=21.66GiB/s, IOS/call=1/1
IOPS=44.64M, BW=21.80GiB/s, IOS/call=2/2
IOPS=44.69M, BW=21.82GiB/s, IOS/call=1/1
IOPS=44.55M, BW=21.75GiB/s, IOS/call=2/2
IOPS=44.93M, BW=21.94GiB/s, IOS/call=1/1
IOPS=44.79M, BW=21.87GiB/s, IOS/call=1/2
and adding -D1 I get:
IOPS=43.74M, BW=21.36GiB/s, IOS/call=1/1
IOPS=44.04M, BW=21.50GiB/s, IOS/call=1/1
IOPS=43.63M, BW=21.30GiB/s, IOS/call=2/2
IOPS=43.67M, BW=21.32GiB/s, IOS/call=1/1
IOPS=43.57M, BW=21.28GiB/s, IOS/call=1/2
IOPS=43.53M, BW=21.25GiB/s, IOS/call=2/1
which does regress that workload. Since we avoid more expensive setup at
higher block sizes, I tested that too. Here's using 128k IOs with -D0:
IOPS=972.18K, BW=121.52GiB/s, IOS/call=31/31
IOPS=988.79K, BW=123.60GiB/s, IOS/call=31/31
IOPS=990.40K, BW=123.80GiB/s, IOS/call=31/31
IOPS=987.80K, BW=123.48GiB/s, IOS/call=31/31
IOPS=988.12K, BW=123.52GiB/s, IOS/call=31/31
and here with -D1:
IOPS=978.36K, BW=122.30GiB/s, IOS/call=31/31
IOPS=996.75K, BW=124.59GiB/s, IOS/call=31/31
IOPS=996.55K, BW=124.57GiB/s, IOS/call=31/31
IOPS=996.52K, BW=124.56GiB/s, IOS/call=31/31
IOPS=996.54K, BW=124.57GiB/s, IOS/call=31/31
IOPS=996.51K, BW=124.56GiB/s, IOS/call=31/31
which is a notable improvement. Then I checked CPU utilization,
switching to IRQ driven IO instead. And the good news there: for bs=128K
we end up using half the CPU to achieve better performance. So definite
win there!
Just a quick dump of some quick results; I didn't look further into this
just yet.
-- 
Jens Axboe
- * Re: [PATCHv2 0/7] dma mapping optimisations
From: Keith Busch @ 2022-08-04 16:28 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Keith Busch, linux-nvme, linux-block, io-uring, linux-fsdevel,
	hch, Alexander Viro, Kernel Team
On Wed, Aug 03, 2022 at 02:52:11PM -0600, Jens Axboe wrote:
> I ran this on my test box to see how we'd do. First the bad news:
> smaller block size IO seems slower. I ran with QD=8 and used 24 drives,
> and using t/io_uring (with registered buffers, polled, etc) and a 512b
> block size I get:
> 
> IOPS=44.36M, BW=21.66GiB/s, IOS/call=1/1
> IOPS=44.64M, BW=21.80GiB/s, IOS/call=2/2
> IOPS=44.69M, BW=21.82GiB/s, IOS/call=1/1
> IOPS=44.55M, BW=21.75GiB/s, IOS/call=2/2
> IOPS=44.93M, BW=21.94GiB/s, IOS/call=1/1
> IOPS=44.79M, BW=21.87GiB/s, IOS/call=1/2
> 
> and adding -D1 I get:
> 
> IOPS=43.74M, BW=21.36GiB/s, IOS/call=1/1
> IOPS=44.04M, BW=21.50GiB/s, IOS/call=1/1
> IOPS=43.63M, BW=21.30GiB/s, IOS/call=2/2
> IOPS=43.67M, BW=21.32GiB/s, IOS/call=1/1
> IOPS=43.57M, BW=21.28GiB/s, IOS/call=1/2
> IOPS=43.53M, BW=21.25GiB/s, IOS/call=2/1
> 
> which does regress that workload.
Bummer, I would expect -D1 to be no worse. My test isn't nearly as consistent
as yours, so I'm having some trouble measuring. I'm only coming up with a few
micro-optimizations that might help. A diff is below on top of this series. I
also created a branch with everything folded in here:
 git://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git io_uring/dma-register
 https://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git/log/?h=io_uring/dma-register
-- >8 --
diff --git a/block/bio.c b/block/bio.c
index 3b7accae8996..c1e97dff5e40 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1154,7 +1154,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
 	bio_set_flag(bio, BIO_CLONED);
 }
 
-static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
 {
 	size_t size = iov_iter_count(iter);
 
@@ -1165,8 +1165,6 @@ static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
 	bio->bi_opf |= REQ_NOMERGE;
 	bio_set_flag(bio, BIO_NO_PAGE_REF);
 	bio_set_flag(bio, BIO_DMA_TAGGED);
-
-	iov_iter_advance(iter, bio->bi_iter.bi_size);
 }
 
 static int bio_iov_add_page(struct bio *bio, struct page *page,
@@ -1307,6 +1305,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	if (iov_iter_is_dma_tag(iter)) {
 		bio_iov_dma_tag_set(bio, iter);
+		iov_iter_advance(iter, bio->bi_iter.bi_size);
 		return 0;
 	}
 
diff --git a/block/fops.c b/block/fops.c
index db2d1e848f4b..1b3649c7eb17 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -325,7 +325,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		 * bio_iov_iter_get_pages() and set the bvec directly.
 		 */
 		bio_iov_bvec_set(bio, iter);
-	} else {
+	} else if (iov_iter_is_dma_tag(iter)) {
+		bio_iov_dma_tag_set(bio, iter);
+	} else {
 		ret = bio_iov_iter_get_pages(bio, iter);
 		if (unlikely(ret)) {
 			bio_put(bio);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index dbf73ab0877e..511cae2b7ce9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -113,7 +113,8 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
 struct nvme_dma_mapping {
 	int nr_pages;
 	u16 offset;
-	u8  rsvd[2];
+	bool needs_sync;
+	u8  rsvd;
 	dma_addr_t prp_dma_addr;
 	__le64 *prps;
 };
@@ -556,16 +557,9 @@ static void nvme_sync_dma(struct nvme_dev *dev, struct request *req,
 			  struct nvme_dma_mapping *mapping)
 {
 	int offset, i, j, length, nprps;
-	bool needs_sync;
 
 	offset = blk_rq_dma_offset(req) + mapping->offset;
 	i = offset >> NVME_CTRL_PAGE_SHIFT;
-	needs_sync = rq_data_dir(req) == READ &&
-		 dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
-
-	if (!needs_sync)
-		return;
-
 	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
 	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
 	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
@@ -643,7 +637,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
 	if (mapping) {
-		nvme_sync_dma(dev, req, mapping);
+		if (mapping->needs_sync)
+			nvme_sync_dma(dev, req, mapping);
 		if (iod->npages >= 0)
 			nvme_free_prp_chain(dev, req, iod);
 		return;
@@ -894,16 +889,13 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
 	int i, offset, j, length, nprps, nprps_left;
 	struct dma_pool *pool;
 	__le64 *prp_list;
-	bool needs_sync;
 	void **list;
 
 	offset = blk_rq_dma_offset(req) + mapping->offset;
 	i = offset >> NVME_CTRL_PAGE_SHIFT;
 	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
-	needs_sync = rq_data_dir(req) == WRITE &&
-		 dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
 
-	if (needs_sync) {
+	if (mapping->needs_sync) {
 		dma_sync_single_for_device(dev->dev,
 			le64_to_cpu(mapping->prps[i]),
 			NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
@@ -916,7 +908,7 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
 		return BLK_STS_OK;
 
 	if (length <= NVME_CTRL_PAGE_SIZE) {
-		if (needs_sync)
+		if (mapping->needs_sync)
 			dma_sync_single_for_device(dev->dev,
 				le64_to_cpu(mapping->prps[i]),
 				NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
@@ -983,7 +975,7 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
 	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
 
 sync:
-	if (!needs_sync)
+	if (!mapping->needs_sync)
 		return BLK_STS_OK;
 
 	i = offset >> NVME_CTRL_PAGE_SHIFT;
@@ -1931,6 +1923,7 @@ static void *nvme_pci_dma_map(struct request_queue *q,
 	if (!mapping->prps)
 		goto free_mapping;
 
+	mapping->needs_sync = false;
 	for (i = 0, k = 0; i < nr_vecs; i++) {
 		struct bio_vec *bv = bvec + i;
 		dma_addr_t dma_addr;
@@ -1959,6 +1952,9 @@ static void *nvme_pci_dma_map(struct request_queue *q,
 		if (i == 0)
 			dma_addr -= mapping->offset;
 
+		if (dma_need_sync(dev->dev, dma_addr))
+			mapping->needs_sync = true;
+
 		for (j = 0; j < ppv; j++)
 			mapping->prps[k++] = cpu_to_le64(dma_addr +
 						j * NVME_CTRL_PAGE_SIZE);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 649348bc03c2..b5277ec189e0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -474,6 +474,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
 		unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
 void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
+void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter);
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index d370b45d7f1b..ebdf81473526 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1070,6 +1070,9 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		iov_iter_iovec_advance(i, size);
 	} else if (iov_iter_is_bvec(i)) {
 		iov_iter_bvec_advance(i, size);
+	} else if (iov_iter_is_dma_tag(i)) {
+		i->iov_offset += size;
+		i->count -= size;
 	} else if (iov_iter_is_pipe(i)) {
 		pipe_advance(i, size);
 	} else if (unlikely(iov_iter_is_xarray(i))) {
@@ -1077,9 +1080,6 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		i->count -= size;
 	} else if (iov_iter_is_discard(i)) {
 		i->count -= size;
-	} else if (iov_iter_is_dma_tag(i)) {
-		i->iov_offset += size;
-		i->count -= size;
 	}
 }
 EXPORT_SYMBOL(iov_iter_advance);
@@ -1353,6 +1353,9 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
 	if (iov_iter_is_bvec(i))
 		return iov_iter_aligned_bvec(i, addr_mask, len_mask);
 
+	if (iov_iter_is_dma_tag(i))
+		return !(i->iov_offset & addr_mask);
+
 	if (iov_iter_is_pipe(i)) {
 		unsigned int p_mask = i->pipe->ring_size - 1;
 		size_t size = i->count;
-- 8< --
- * Re: [PATCHv2 0/7] dma mapping optimisations
From: Jens Axboe @ 2022-08-04 16:42 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, linux-nvme, linux-block, io-uring, linux-fsdevel,
	hch, Alexander Viro, Kernel Team
On 8/4/22 10:28 AM, Keith Busch wrote:
> On Wed, Aug 03, 2022 at 02:52:11PM -0600, Jens Axboe wrote:
>> I ran this on my test box to see how we'd do. First the bad news:
>> smaller block size IO seems slower. I ran with QD=8 and used 24 drives,
>> and using t/io_uring (with registered buffers, polled, etc) and a 512b
>> block size I get:
>>
>> IOPS=44.36M, BW=21.66GiB/s, IOS/call=1/1
>> IOPS=44.64M, BW=21.80GiB/s, IOS/call=2/2
>> IOPS=44.69M, BW=21.82GiB/s, IOS/call=1/1
>> IOPS=44.55M, BW=21.75GiB/s, IOS/call=2/2
>> IOPS=44.93M, BW=21.94GiB/s, IOS/call=1/1
>> IOPS=44.79M, BW=21.87GiB/s, IOS/call=1/2
>>
>> and adding -D1 I get:
>>
>> IOPS=43.74M, BW=21.36GiB/s, IOS/call=1/1
>> IOPS=44.04M, BW=21.50GiB/s, IOS/call=1/1
>> IOPS=43.63M, BW=21.30GiB/s, IOS/call=2/2
>> IOPS=43.67M, BW=21.32GiB/s, IOS/call=1/1
>> IOPS=43.57M, BW=21.28GiB/s, IOS/call=1/2
>> IOPS=43.53M, BW=21.25GiB/s, IOS/call=2/1
>>
>> which does regress that workload.
> 
> Bummer, I would expect -D1 to be no worse. My test isn't nearly as consistent
> as yours, so I'm having some trouble measuring. I'm only coming up with a few
> micro-optimizations that might help. A diff is below on top of this series. I
> also created a branch with everything folded in here:
That seemed to do the trick! Don't pay any attention to the numbers
being slightly different than before for -D0; it's a slightly different
kernel. But same test, -d8 -s2 -c2, polled:
-D0 -B1
IOPS=45.39M, BW=22.16GiB/s, IOS/call=1/1
IOPS=46.06M, BW=22.49GiB/s, IOS/call=2/1
IOPS=45.70M, BW=22.31GiB/s, IOS/call=1/1
IOPS=45.71M, BW=22.32GiB/s, IOS/call=2/2
IOPS=45.83M, BW=22.38GiB/s, IOS/call=1/1
IOPS=45.64M, BW=22.29GiB/s, IOS/call=2/2
-D1 -B1
IOPS=45.94M, BW=22.43GiB/s, IOS/call=1/1
IOPS=46.08M, BW=22.50GiB/s, IOS/call=1/1
IOPS=46.27M, BW=22.59GiB/s, IOS/call=2/1
IOPS=45.88M, BW=22.40GiB/s, IOS/call=1/1
IOPS=46.18M, BW=22.55GiB/s, IOS/call=2/1
IOPS=46.13M, BW=22.52GiB/s, IOS/call=2/2
IOPS=46.40M, BW=22.66GiB/s, IOS/call=1/1
which is a smidge higher, and definitely not regressing now.
-- 
Jens Axboe