* [RFC PATCH 24/28] block: Remove PCI_P2PDMA queue flag
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
This flag has been superseded by the DMA_DIRECT functionality.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
include/linux/blkdev.h | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a5b856324276..9ea800645cf5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -615,8 +615,7 @@ struct request_queue {
#define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */
#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */
#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */
-#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */
-#define QUEUE_FLAG_DMA_DIRECT 26 /* device supports dma-addr requests */
+#define QUEUE_FLAG_DMA_DIRECT 25 /* device supports dma-addr requests */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_SAME_COMP))
@@ -641,8 +640,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
#define blk_queue_scsi_passthrough(q) \
test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
-#define blk_queue_pci_p2pdma(q) \
- test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
#define blk_queue_dma_direct(q) \
test_bit(QUEUE_FLAG_DMA_DIRECT, &(q)->queue_flags)
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 23/28] nvme-pci: Remove support for PCI_P2PDMA requests
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
These requests have been superseded by dma-direct requests and are
therefore no longer needed.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
drivers/nvme/host/core.c | 2 --
drivers/nvme/host/nvme.h | 3 +--
drivers/nvme/host/pci.c | 27 ++++++++++-----------------
3 files changed, 11 insertions(+), 21 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8e876417c44b..63d132c478b4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3257,8 +3257,6 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
- if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
- blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
if (ctrl->ops->flags & NVME_F_DMA_DIRECT)
blk_queue_flag_set(QUEUE_FLAG_DMA_DIRECT, ns->queue);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f1dddc95c6a8..d103cecc14dd 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -361,8 +361,7 @@ struct nvme_ctrl_ops {
unsigned int flags;
#define NVME_F_FABRICS (1 << 0)
#define NVME_F_METADATA_SUPPORTED (1 << 1)
-#define NVME_F_PCI_P2PDMA (1 << 2)
-#define NVME_F_DMA_DIRECT (1 << 3)
+#define NVME_F_DMA_DIRECT (1 << 2)
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5957f3a4f261..7f806e76230a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -564,9 +564,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
WARN_ON_ONCE(!iod->nents);
- /* P2PDMA requests do not need to be unmapped */
- if (!is_pci_p2pdma_page(sg_page(iod->sg)) &&
- !blk_rq_is_dma_direct(req))
+ /* DMA direct requests do not need to be unmapped */
+ if (!blk_rq_is_dma_direct(req))
dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
@@ -828,16 +827,14 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
if (blk_rq_nr_phys_segments(req) == 1 && !blk_rq_is_dma_direct(req)) {
struct bio_vec bv = req_bvec(req);
- if (!is_pci_p2pdma_page(bv.bv_page)) {
- if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
- return nvme_setup_prp_simple(dev, req,
- &cmnd->rw, &bv);
+ if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
+ return nvme_setup_prp_simple(dev, req,
+ &cmnd->rw, &bv);
- if (iod->nvmeq->qid &&
- dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
- return nvme_setup_sgl_simple(dev, req,
- &cmnd->rw, &bv);
- }
+ if (iod->nvmeq->qid &&
+ dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
+ return nvme_setup_sgl_simple(dev, req,
+ &cmnd->rw, &bv);
}
iod->dma_len = 0;
@@ -849,10 +846,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
if (!iod->nents)
goto out;
- if (is_pci_p2pdma_page(sg_page(iod->sg)))
- nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
- rq_dma_dir(req));
- else if (blk_rq_is_dma_direct(req))
+ if (blk_rq_is_dma_direct(req))
nr_mapped = iod->nents;
else
nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
@@ -2642,7 +2636,6 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
.flags = NVME_F_METADATA_SUPPORTED |
- NVME_F_PCI_P2PDMA |
NVME_F_DMA_DIRECT,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 22/28] nvmet: Use DMA addresses instead of struct pages for P2P
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Start using the dma-direct bios and DMA address RDMA CTX API.
This removes struct pages from all P2P transactions.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
drivers/nvme/target/core.c | 12 +++++----
drivers/nvme/target/io-cmd-bdev.c | 32 ++++++++++++++++++++---
drivers/nvme/target/nvmet.h | 5 +++-
drivers/nvme/target/rdma.c | 43 +++++++++++++++++++++++--------
4 files changed, 71 insertions(+), 21 deletions(-)
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 7734a6acff85..230e99b63320 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -420,7 +420,7 @@ static int nvmet_p2pmem_ns_enable(struct nvmet_ns *ns)
return -EINVAL;
}
- if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
+ if (!blk_queue_dma_direct(ns->bdev->bd_queue)) {
pr_err("peer-to-peer DMA is not supported by the driver of %s\n",
ns->device_path);
return -EINVAL;
@@ -926,9 +926,9 @@ int nvmet_req_alloc_sgl(struct nvmet_req *req)
req->p2p_dev = NULL;
if (req->sq->qid && p2p_dev) {
- req->sg = pci_p2pmem_alloc_sgl(p2p_dev, &req->sg_cnt,
- req->transfer_len);
- if (req->sg) {
+ req->p2p_dma_buf = pci_alloc_p2pmem(p2p_dev,
+ req->transfer_len);
+ if (req->p2p_dma_buf) {
req->p2p_dev = p2p_dev;
return 0;
}
@@ -951,10 +951,12 @@ EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl);
void nvmet_req_free_sgl(struct nvmet_req *req)
{
if (req->p2p_dev)
- pci_p2pmem_free_sgl(req->p2p_dev, req->sg);
+ pci_free_p2pmem(req->p2p_dev, req->p2p_dma_buf,
+ req->transfer_len);
else
sgl_free(req->sg);
+ req->p2p_dev = NULL;
req->sg = NULL;
req->sg_cnt = 0;
}
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 061d40b020c7..f5621aeb1d6c 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -6,6 +6,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/blkdev.h>
#include <linux/module.h>
+#include <linux/pci-p2pdma.h>
#include "nvmet.h"
int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
@@ -132,6 +133,24 @@ static void nvmet_submit_sg(struct nvmet_req *req, struct bio *bio,
submit_bio(bio);
}
+static void nvmet_submit_p2p(struct nvmet_req *req, struct bio *bio)
+{
+ dma_addr_t addr;
+ int ret;
+
+ addr = pci_p2pmem_virt_to_bus(req->p2p_dev, req->p2p_dma_buf);
+
+ ret = bio_add_dma_addr(req->ns->bdev->bd_queue, bio,
+ addr, req->transfer_len);
+ if (WARN_ON_ONCE(ret != req->transfer_len)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ nvmet_bio_done(bio);
+ return;
+ }
+
+ submit_bio(bio);
+}
+
static void nvmet_bdev_execute_rw(struct nvmet_req *req)
{
int sg_cnt = req->sg_cnt;
@@ -139,7 +158,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
sector_t sector;
int op, op_flags = 0;
- if (!req->sg_cnt) {
+ if (!req->sg_cnt && !req->p2p_dev) {
nvmet_req_complete(req, 0);
return;
}
@@ -153,8 +172,10 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
op = REQ_OP_READ;
}
- if (is_pci_p2pdma_page(sg_page(req->sg)))
- op_flags |= REQ_NOMERGE;
+ if (req->p2p_dev) {
+ op_flags |= REQ_DMA_DIRECT;
+ sg_cnt = 1;
+ }
sector = le64_to_cpu(req->cmd->rw.slba);
sector <<= (req->ns->blksize_shift - 9);
@@ -171,7 +192,10 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
bio->bi_end_io = nvmet_bio_done;
bio_set_op_attrs(bio, op, op_flags);
- nvmet_submit_sg(req, bio, sector);
+ if (req->p2p_dev)
+ nvmet_submit_p2p(req, bio);
+ else
+ nvmet_submit_sg(req, bio, sector);
}
static void nvmet_bdev_execute_flush(struct nvmet_req *req)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index c25d88fc9dec..5714e5b5ef04 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -288,7 +288,10 @@ struct nvmet_req {
struct nvmet_sq *sq;
struct nvmet_cq *cq;
struct nvmet_ns *ns;
- struct scatterlist *sg;
+ union {
+ struct scatterlist *sg;
+ void *p2p_dma_buf;
+ };
struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC];
union {
struct {
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 36d906a7f70d..92bfc7207814 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -15,6 +15,7 @@
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
+#include <linux/pci-p2pdma.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
@@ -495,6 +496,18 @@ static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
spin_unlock(&queue->rsp_wr_wait_lock);
}
+static void nvmet_rdma_ctx_destroy(struct nvmet_rdma_rsp *rsp)
+{
+ struct nvmet_rdma_queue *queue = rsp->queue;
+
+ if (rsp->req.p2p_dev)
+ rdma_rw_ctx_dma_destroy(&rsp->rw, queue->cm_id->qp,
+ queue->cm_id->port_num);
+ else
+ rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+ queue->cm_id->port_num, rsp->req.sg,
+ rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
+}
static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
@@ -502,11 +515,8 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
- if (rsp->n_rdma) {
- rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
- queue->cm_id->port_num, rsp->req.sg,
- rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
- }
+ if (rsp->n_rdma)
+ nvmet_rdma_ctx_destroy(rsp);
if (rsp->req.sg != rsp->cmd->inline_sg)
nvmet_req_free_sgl(&rsp->req);
@@ -587,9 +597,9 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
WARN_ON(rsp->n_rdma <= 0);
atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
- rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
- queue->cm_id->port_num, rsp->req.sg,
- rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
+
+ nvmet_rdma_ctx_destroy(rsp);
+
rsp->n_rdma = 0;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
@@ -663,6 +673,7 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
struct rdma_cm_id *cm_id = rsp->queue->cm_id;
u64 addr = le64_to_cpu(sgl->addr);
u32 key = get_unaligned_le32(sgl->key);
+ dma_addr_t dma_addr;
int ret;
rsp->req.transfer_len = get_unaligned_le24(sgl->length);
@@ -675,9 +686,19 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
if (ret < 0)
goto error_out;
- ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
- rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
- nvmet_data_dir(&rsp->req));
+ if (rsp->req.p2p_dev) {
+ dma_addr = pci_p2pmem_virt_to_bus(rsp->req.p2p_dev,
+ rsp->req.p2p_dma_buf);
+
+ ret = rdma_rw_ctx_dma_init(&rsp->rw, cm_id->qp,
+ cm_id->port_num, dma_addr,
+ rsp->req.transfer_len, addr, key,
+ nvmet_data_dir(&rsp->req));
+ } else {
+ ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
+ rsp->req.sg, rsp->req.sg_cnt, 0, addr,
+ key, nvmet_data_dir(&rsp->req));
+ }
if (ret < 0)
goto error_out;
rsp->n_rdma += ret;
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 21/28] nvmet: Split nvmet_bdev_execute_rw() into a helper function
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Move the mapping of the SG and submission of the bio
into a static helper function to reduce the complexity.
This will be useful in the next patch which submits dma-direct bios
for P2P requests.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
drivers/nvme/target/io-cmd-bdev.c | 52 ++++++++++++++++++-------------
1 file changed, 31 insertions(+), 21 deletions(-)
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 7a1cf6437a6a..061d40b020c7 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -103,13 +103,41 @@ static void nvmet_bio_done(struct bio *bio)
bio_put(bio);
}
+static void nvmet_submit_sg(struct nvmet_req *req, struct bio *bio,
+ sector_t sector)
+{
+ int sg_cnt = req->sg_cnt;
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(req->sg, sg, req->sg_cnt, i) {
+ while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
+ != sg->length) {
+ struct bio *prev = bio;
+
+ bio = bio_alloc(GFP_KERNEL,
+ min(sg_cnt, BIO_MAX_PAGES));
+ bio_set_dev(bio, req->ns->bdev);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_opf = prev->bi_opf;
+
+ bio_chain(bio, prev);
+ submit_bio(prev);
+ }
+
+ sector += sg->length >> 9;
+ sg_cnt--;
+ }
+
+ submit_bio(bio);
+}
+
static void nvmet_bdev_execute_rw(struct nvmet_req *req)
{
int sg_cnt = req->sg_cnt;
struct bio *bio;
- struct scatterlist *sg;
sector_t sector;
- int op, op_flags = 0, i;
+ int op, op_flags = 0;
if (!req->sg_cnt) {
nvmet_req_complete(req, 0);
@@ -143,25 +171,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
bio->bi_end_io = nvmet_bio_done;
bio_set_op_attrs(bio, op, op_flags);
- for_each_sg(req->sg, sg, req->sg_cnt, i) {
- while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset)
- != sg->length) {
- struct bio *prev = bio;
-
- bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES));
- bio_set_dev(bio, req->ns->bdev);
- bio->bi_iter.bi_sector = sector;
- bio_set_op_attrs(bio, op, op_flags);
-
- bio_chain(bio, prev);
- submit_bio(prev);
- }
-
- sector += sg->length >> 9;
- sg_cnt--;
- }
-
- submit_bio(bio);
+ nvmet_submit_sg(req, bio, sector);
}
static void nvmet_bdev_execute_flush(struct nvmet_req *req)
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 20/28] IB/core: Introduce API for initializing a RW ctx from a DMA address
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Introduce rdma_rw_ctx_dma_init() and rdma_rw_ctx_dma_destroy() which
perform the same operation as rdma_rw_ctx_init() and
rdma_rw_ctx_destroy() respectively except they operate on a DMA
address and length instead of an SGL.
This will be used for struct page-less P2PDMA, but opinions have also
been expressed in favor of migrating away from SGLs and struct
pages in the RDMA APIs, and this will likely fit with that
effort.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
drivers/infiniband/core/rw.c | 74 ++++++++++++++++++++++++++++++------
include/rdma/rw.h | 6 +++
2 files changed, 69 insertions(+), 11 deletions(-)
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 32ca8429eaae..cefa6b930bc8 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -319,6 +319,39 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
}
EXPORT_SYMBOL(rdma_rw_ctx_init);
+/**
+ * rdma_rw_ctx_dma_init - initialize a RDMA READ/WRITE context from a
+ * DMA address instead of SGL
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @addr: DMA address to READ/WRITE from/to
+ * @len: length of memory to operate on
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ *
+ * Returns the number of WQEs that will be needed on the workqueue if
+ * successful, or a negative error code.
+ */
+int rdma_rw_ctx_dma_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, dma_addr_t addr, u32 len, u64 remote_addr,
+ u32 rkey, enum dma_data_direction dir)
+{
+ struct scatterlist sg;
+
+ sg_dma_address(&sg) = addr;
+ sg_dma_len(&sg) = len;
+
+ if (rdma_rw_io_needs_mr(qp->device, port_num, dir, 1))
+ return rdma_rw_init_mr_wrs(ctx, qp, port_num, &sg, 1, 0,
+ remote_addr, rkey, dir);
+ else
+ return rdma_rw_init_single_wr(ctx, qp, &sg, 0, remote_addr,
+ rkey, dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_dma_init);
+
/**
* rdma_rw_ctx_signature_init - initialize a RW context with signature offload
* @ctx: context to initialize
@@ -566,17 +599,7 @@ int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
}
EXPORT_SYMBOL(rdma_rw_ctx_post);
-/**
- * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
- * @ctx: context to release
- * @qp: queue pair to operate on
- * @port_num: port num to which the connection is bound
- * @sg: scatterlist that was used for the READ/WRITE
- * @sg_cnt: number of entries in @sg
- * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
- */
-void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
- struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
+static void __rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp)
{
int i;
@@ -596,6 +619,21 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
BUG();
break;
}
+}
+
+/**
+ * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist that was used for the READ/WRITE
+ * @sg_cnt: number of entries in @sg
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ */
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir)
+{
+ __rdma_rw_ctx_destroy(ctx, qp);
/* P2PDMA contexts do not need to be unmapped */
if (!is_pci_p2pdma_page(sg_page(sg)))
@@ -603,6 +641,20 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
}
EXPORT_SYMBOL(rdma_rw_ctx_destroy);
+/**
+ * rdma_rw_ctx_dma_destroy - release all resources allocated by
+ * rdma_rw_ctx_dma_init
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ */
+void rdma_rw_ctx_dma_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num)
+{
+ __rdma_rw_ctx_destroy(ctx, qp);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_dma_destroy);
+
/**
* rdma_rw_ctx_destroy_signature - release all resources allocated by
* rdma_rw_ctx_init_signature
diff --git a/include/rdma/rw.h b/include/rdma/rw.h
index 494f79ca3e62..e47f8053af6e 100644
--- a/include/rdma/rw.h
+++ b/include/rdma/rw.h
@@ -58,6 +58,12 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
struct scatterlist *sg, u32 sg_cnt,
enum dma_data_direction dir);
+int rdma_rw_ctx_dma_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num, dma_addr_t addr, u32 len, u64 remote_addr,
+ u32 rkey, enum dma_data_direction dir);
+void rdma_rw_ctx_dma_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u8 port_num);
+
int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
u8 port_num, struct scatterlist *sg, u32 sg_cnt,
struct scatterlist *prot_sg, u32 prot_sg_cnt,
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 19/28] nvme-pci: Support dma-direct bios
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Adding support for dma-direct bios only requires putting a condition
around the call to dma_map_sg() so it is skipped when the request
has the REQ_DMA_DIRECT flag.
We then need to indicate support for the queue in much the same way
we did with PCI P2PDMA. Since this provides the same support as
PCI P2PDMA, those flags will be removed in a subsequent patch.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
drivers/nvme/host/core.c | 2 ++
drivers/nvme/host/nvme.h | 1 +
drivers/nvme/host/pci.c | 10 +++++++---
3 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 120fb593d1da..8e876417c44b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3259,6 +3259,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
+ if (ctrl->ops->flags & NVME_F_DMA_DIRECT)
+ blk_queue_flag_set(QUEUE_FLAG_DMA_DIRECT, ns->queue);
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 55553d293a98..f1dddc95c6a8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -362,6 +362,7 @@ struct nvme_ctrl_ops {
#define NVME_F_FABRICS (1 << 0)
#define NVME_F_METADATA_SUPPORTED (1 << 1)
#define NVME_F_PCI_P2PDMA (1 << 2)
+#define NVME_F_DMA_DIRECT (1 << 3)
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 524d6bd6d095..5957f3a4f261 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -565,7 +565,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
WARN_ON_ONCE(!iod->nents);
/* P2PDMA requests do not need to be unmapped */
- if (!is_pci_p2pdma_page(sg_page(iod->sg)))
+ if (!is_pci_p2pdma_page(sg_page(iod->sg)) &&
+ !blk_rq_is_dma_direct(req))
dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
@@ -824,7 +825,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
blk_status_t ret = BLK_STS_RESOURCE;
int nr_mapped;
- if (blk_rq_nr_phys_segments(req) == 1) {
+ if (blk_rq_nr_phys_segments(req) == 1 && !blk_rq_is_dma_direct(req)) {
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
@@ -851,6 +852,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
if (is_pci_p2pdma_page(sg_page(iod->sg)))
nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
rq_dma_dir(req));
+ else if (blk_rq_is_dma_direct(req))
+ nr_mapped = iod->nents;
else
nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
rq_dma_dir(req), DMA_ATTR_NO_WARN);
@@ -2639,7 +2642,8 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
.flags = NVME_F_METADATA_SUPPORTED |
- NVME_F_PCI_P2PDMA,
+ NVME_F_PCI_P2PDMA |
+ NVME_F_DMA_DIRECT,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 18/28] block: Introduce bio_add_dma_addr()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
bio_add_dma_addr() is analogous to bio_add_page() except it
adds a dma address to a dma-direct bio instead of a struct page.
It also checks to ensure that the queue supports dma address bios and
that we are not mixing dma addresses and struct pages.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/bio.c | 38 ++++++++++++++++++++++++++++++++++++++
include/linux/bio.h | 10 ++++++++++
2 files changed, 48 insertions(+)
diff --git a/block/bio.c b/block/bio.c
index 6998fceddd36..02ae72e3ccfa 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -874,6 +874,44 @@ static void bio_release_pages(struct bio *bio)
put_page(bvec->bv_page);
}
+/**
+ * bio_add_dma_addr - attempt to add a dma address to a bio
+ * @q: the target queue
+ * @bio: destination bio
+ * @dma_addr: dma address to add
+ * @len: vec entry length
+ *
+ * Attempt to add a dma address to the dma_vec maplist. This can
+ * fail for a number of reasons, such as the bio being full or
+ * target block device limitations. The target request queue must
+ * support dma-only bios and bios can not mix pages and dma_addresses.
+ */
+int bio_add_dma_addr(struct request_queue *q, struct bio *bio,
+ dma_addr_t dma_addr, unsigned int len)
+{
+ struct dma_vec *dv = &bio->bi_dma_vec[bio->bi_vcnt];
+
+ if (!blk_queue_dma_direct(q))
+ return -EINVAL;
+
+ if (!bio_is_dma_direct(bio))
+ return -EINVAL;
+
+ if (bio_dma_full(bio))
+ return 0;
+
+ WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+
+ dv->dv_addr = dma_addr;
+ dv->dv_len = len;
+
+ bio->bi_iter.bi_size += len;
+ bio->bi_vcnt++;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(bio_add_dma_addr);
+
static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
{
const struct bio_vec *bv = iter->bvec;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index df7973932525..d775f381ae00 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -112,6 +112,13 @@ static inline bool bio_full(struct bio *bio)
return bio->bi_vcnt >= bio->bi_max_vecs;
}
+static inline bool bio_dma_full(struct bio *bio)
+{
+ size_t vec_size = bio->bi_max_vecs * sizeof(struct bio_vec);
+
+ return bio->bi_vcnt >= (vec_size / sizeof(struct dma_vec));
+}
+
static inline bool bio_next_segment(const struct bio *bio,
struct bvec_iter_all *iter)
{
@@ -438,6 +445,9 @@ void bio_chain(struct bio *, struct bio *);
extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
unsigned int, unsigned int);
+extern int bio_add_dma_addr(struct request_queue *q, struct bio *bio,
+ dma_addr_t dma_addr, unsigned int len);
+
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool same_page);
void __bio_add_page(struct bio *bio, struct page *page,
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 17/28] block: Introduce queue flag to indicate support for dma-direct bios
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Queues will need to advertise support to accept dma-direct requests.
The existing PCI P2P support will be replaced by this, and thus
the P2P flag will be dropped in a subsequent patch.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
include/linux/blkdev.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ce70d5dded5f..a5b856324276 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -616,6 +616,7 @@ struct request_queue {
#define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */
#define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */
#define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */
+#define QUEUE_FLAG_DMA_DIRECT 26 /* device supports dma-addr requests */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_SAME_COMP))
@@ -642,6 +643,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
#define blk_queue_pci_p2pdma(q) \
test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
+#define blk_queue_dma_direct(q) \
+ test_bit(QUEUE_FLAG_DMA_DIRECT, &(q)->queue_flags)
#define blk_noretry_request(rq) \
((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 16/28] block: Implement mapping dma-direct requests to SGs in blk_rq_map_sg()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
blk_rq_map_sg() just needs to move the dma_vec into the dma_address
of the sgl. Callers will need to ensure they do not call dma_map_sg()
for dma-direct requests.
This will likely get less ugly with Christoph's proposed cleanup
to the DMA API. It will be much simpler if devices are just
calling a dma_map_bio() and don't have to worry about dma-direct
requests.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk-merge.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 65 insertions(+)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index a7a5453987f9..ccd6c44b9f6e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -545,6 +545,69 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
return nsegs;
}
+static unsigned blk_dvec_map_sg(struct request_queue *q,
+ struct dma_vec *dvec, struct scatterlist *sglist,
+ struct scatterlist **sg)
+{
+ unsigned nbytes = dvec->dv_len;
+ unsigned nsegs = 0, total = 0;
+
+ while (nbytes > 0) {
+ unsigned seg_size;
+
+ *sg = blk_next_sg(sg, sglist);
+
+ seg_size = get_max_segment_size(q, total);
+ seg_size = min(nbytes, seg_size);
+
+ (*sg)->dma_address = dvec->dv_addr + total;
+ sg_dma_len(*sg) = seg_size;
+
+ total += seg_size;
+ nbytes -= seg_size;
+ nsegs++;
+ }
+
+ return nsegs;
+}
+
+static inline void
+__blk_segment_dma_map_sg(struct request_queue *q, struct dma_vec *dvec,
+ struct scatterlist *sglist, struct dma_vec *dvprv,
+ struct scatterlist **sg, int *nsegs)
+{
+ int nbytes = dvec->dv_len;
+
+ if (*sg) {
+ if ((*sg)->length + nbytes > queue_max_segment_size(q))
+ goto new_segment;
+ if (!dmavec_phys_mergeable(q, dvprv, dvec))
+ goto new_segment;
+
+ (*sg)->length += nbytes;
+ } else {
+new_segment:
+ (*nsegs) += blk_dvec_map_sg(q, dvec, sglist, sg);
+ }
+ *dvprv = *dvec;
+}
+
+static int __blk_dma_bios_map_sg(struct request_queue *q, struct bio *bio,
+ struct scatterlist *sglist,
+ struct scatterlist **sg)
+{
+ struct dma_vec dvec, dvprv = {};
+ struct bvec_iter iter;
+ int nsegs = 0;
+
+ for_each_bio(bio)
+ bio_for_each_dvec(dvec, bio, iter)
+ __blk_segment_dma_map_sg(q, &dvec, sglist, &dvprv,
+ sg, &nsegs);
+
+ return nsegs;
+}
+
/*
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
@@ -559,6 +622,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, &sg);
else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, &sg);
+ else if (blk_rq_is_dma_direct(rq))
+ nsegs = __blk_dma_bios_map_sg(q, rq->bio, sglist, &sg);
else if (rq->bio)
nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 15/28] block: Support counting dma-direct bio segments
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Change __blk_recalc_rq_segments() to loop through dma_vecs when
appropriate. It calls vec_split_segs() for each dma_vec or bio_vec.
Once this is done the bvec_split_segs() helper is no longer used.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk-merge.c | 41 ++++++++++++++++++++++++++++++-----------
1 file changed, 30 insertions(+), 11 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c4c016f994f6..a7a5453987f9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -194,13 +194,6 @@ static bool vec_split_segs(struct request_queue *q, unsigned offset,
return !!len;
}
-static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
- unsigned *nsegs, unsigned *sectors, unsigned max_segs)
-{
- return vec_split_segs(q, bv->bv_offset, bv->bv_len, nsegs,
- sectors, max_segs);
-}
-
struct blk_segment_split_ctx {
unsigned nsegs;
unsigned sectors;
@@ -366,12 +359,36 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
}
EXPORT_SYMBOL(blk_queue_split);
+static unsigned int bio_calc_segs(struct request_queue *q, struct bio *bio)
+{
+ unsigned int nsegs = 0;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_bvec(bv, bio, iter)
+ vec_split_segs(q, bv.bv_offset, bv.bv_len, &nsegs,
+ NULL, UINT_MAX);
+
+ return nsegs;
+}
+
+static unsigned int bio_dma_calc_segs(struct request_queue *q, struct bio *bio)
+{
+ unsigned int nsegs = 0;
+ struct bvec_iter iter;
+ struct dma_vec dv;
+
+ bio_for_each_dvec(dv, bio, iter)
+ vec_split_segs(q, dv.dv_addr, dv.dv_len, &nsegs,
+ NULL, UINT_MAX);
+
+ return nsegs;
+}
+
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
struct bio *bio)
{
unsigned int nr_phys_segs = 0;
- struct bvec_iter iter;
- struct bio_vec bv;
if (!bio)
return 0;
@@ -386,8 +403,10 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
}
for_each_bio(bio) {
- bio_for_each_bvec(bv, bio, iter)
- bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX);
+ if (bio_is_dma_direct(bio)) /* dma-direct bios carry dma_vecs in bi_dma_vec */
+ nr_phys_segs += bio_dma_calc_segs(q, bio);
+ else /* ordinary bios carry bio_vecs in bi_io_vec */
+ nr_phys_segs += bio_calc_segs(q, bio);
}
return nr_phys_segs;
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 14/28] block: Support splitting dma-direct bios
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
If the bio is a dma-direct bio, loop through the dma_vecs instead
of the bio_vecs when calling vec_should_split().
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk-merge.c | 45 +++++++++++++++++++++++++++++++++++++--------
1 file changed, 37 insertions(+), 8 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 32653fca53ce..c4c016f994f6 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -257,14 +257,44 @@ static bool vec_should_split(struct request_queue *q, unsigned offset,
return false;
}
+static bool bio_should_split(struct request_queue *q, struct bio *bio,
+ struct blk_segment_split_ctx *ctx)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ bool ret;
+
+ bio_for_each_bvec(bv, bio, iter) {
+ ret = vec_should_split(q, bv.bv_offset, bv.bv_len, ctx);
+ if (ret)
+ return true;
+ }
+
+ return false;
+}
+
+static bool bio_dma_should_split(struct request_queue *q, struct bio *bio,
+ struct blk_segment_split_ctx *ctx)
+{
+ struct bvec_iter iter;
+ struct dma_vec dv;
+ bool ret;
+
+ bio_for_each_dvec(dv, bio, iter) {
+ ret = vec_should_split(q, dv.dv_addr, dv.dv_len, ctx);
+ if (ret)
+ return true;
+ }
+
+ return false;
+}
+
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
unsigned *segs)
{
- struct bio_vec bv;
- struct bvec_iter iter;
- bool do_split = false;
+ bool do_split;
struct bio *new = NULL;
struct blk_segment_split_ctx ctx = {
@@ -272,11 +302,10 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
.max_segs = queue_max_segments(q),
};
- bio_for_each_bvec(bv, bio, iter) {
- do_split = vec_should_split(q, bv.bv_offset, bv.bv_len, &ctx);
- if (do_split)
- break;
- }
+ if (bio_is_dma_direct(bio))
+ do_split = bio_dma_should_split(q, bio, &ctx);
+ else
+ do_split = bio_should_split(q, bio, &ctx);
*segs = ctx.nsegs;
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 13/28] block: Generalize bvec_should_split()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
bvec_should_split() will need to also operate on dma_vecs so
generalize it to take an offset and length instead of a bio_vec.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk-merge.c | 31 +++++++++++++++++--------------
1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d9e89c0ad40d..32653fca53ce 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -206,23 +206,25 @@ struct blk_segment_split_ctx {
unsigned sectors;
bool prv_valid;
- struct bio_vec bvprv;
+ unsigned prv_offset;
+ unsigned prv_len;
const unsigned max_sectors;
const unsigned max_segs;
};
-static bool bvec_should_split(struct request_queue *q, struct bio_vec *bv,
- struct blk_segment_split_ctx *ctx)
+static bool vec_should_split(struct request_queue *q, unsigned offset,
+ unsigned len, struct blk_segment_split_ctx *ctx)
{
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (ctx->prv_valid && bvec_gap_to_prev(q, &ctx->bvprv, bv->bv_offset))
+ if (ctx->prv_valid &&
+ vec_gap_to_prev(q, ctx->prv_offset, ctx->prv_len, offset))
return true;
- if (ctx->sectors + (bv->bv_len >> 9) > ctx->max_sectors) {
+ if (ctx->sectors + (len >> 9) > ctx->max_sectors) {
/*
* Consider this a new segment if we're splitting in
* the middle of this vector.
@@ -230,9 +232,9 @@ static bool bvec_should_split(struct request_queue *q, struct bio_vec *bv,
if (ctx->nsegs < ctx->max_segs &&
ctx->sectors < ctx->max_sectors) {
/* split in the middle of bvec */
- bv->bv_len = (ctx->max_sectors - ctx->sectors) << 9;
- bvec_split_segs(q, bv, &ctx->nsegs,
- &ctx->sectors, ctx->max_segs);
+ len = (ctx->max_sectors - ctx->sectors) << 9;
+ vec_split_segs(q, offset, len, &ctx->nsegs,
+ &ctx->sectors, ctx->max_segs);
}
return true;
}
@@ -240,14 +242,15 @@ static bool bvec_should_split(struct request_queue *q, struct bio_vec *bv,
if (ctx->nsegs == ctx->max_segs)
return true;
- ctx->bvprv = *bv;
+ ctx->prv_offset = offset;
+ ctx->prv_len = len;
ctx->prv_valid = true;
- if (bv->bv_offset + bv->bv_len <= PAGE_SIZE) {
+ if (offset + len <= PAGE_SIZE) {
ctx->nsegs++;
- ctx->sectors += bv->bv_len >> 9;
- } else if (bvec_split_segs(q, bv, &ctx->nsegs, &ctx->sectors,
- ctx->max_segs)) {
+ ctx->sectors += len >> 9;
+ } else if (vec_split_segs(q, offset, len, &ctx->nsegs, &ctx->sectors,
+ ctx->max_segs)) {
return true;
}
@@ -270,7 +273,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
};
bio_for_each_bvec(bv, bio, iter) {
- do_split = bvec_should_split(q, &bv, &ctx);
+ do_split = vec_should_split(q, bv.bv_offset, bv.bv_len, &ctx);
if (do_split)
break;
}
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 12/28] block: Create helper for bvec_should_split()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
In order to support dma-direct bios, blk_bio_segment_split() will
need to operate on both bio_vecs and dma_vecs. In order to do
this, the code inside bio_for_each_bvec() is moved into a generic
helper called bvec_should_split().
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk-merge.c | 86 +++++++++++++++++++++++++----------------------
1 file changed, 46 insertions(+), 40 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 414e61a714bf..d9e89c0ad40d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -212,6 +212,48 @@ struct blk_segment_split_ctx {
const unsigned max_segs;
};
+static bool bvec_should_split(struct request_queue *q, struct bio_vec *bv,
+ struct blk_segment_split_ctx *ctx)
+{
+ /*
+ * If the queue doesn't support SG gaps and adding this
+ * offset would create a gap, disallow it.
+ */
+ if (ctx->prv_valid && bvec_gap_to_prev(q, &ctx->bvprv, bv->bv_offset))
+ return true;
+
+ if (ctx->sectors + (bv->bv_len >> 9) > ctx->max_sectors) {
+ /*
+ * Consider this a new segment if we're splitting in
+ * the middle of this vector.
+ */
+ if (ctx->nsegs < ctx->max_segs &&
+ ctx->sectors < ctx->max_sectors) {
+ /* split in the middle of bvec */
+ bv->bv_len = (ctx->max_sectors - ctx->sectors) << 9;
+ bvec_split_segs(q, bv, &ctx->nsegs,
+ &ctx->sectors, ctx->max_segs);
+ }
+ return true;
+ }
+
+ if (ctx->nsegs == ctx->max_segs)
+ return true;
+
+ ctx->bvprv = *bv;
+ ctx->prv_valid = true;
+
+ if (bv->bv_offset + bv->bv_len <= PAGE_SIZE) {
+ ctx->nsegs++;
+ ctx->sectors += bv->bv_len >> 9;
+ } else if (bvec_split_segs(q, bv, &ctx->nsegs, &ctx->sectors,
+ ctx->max_segs)) {
+ return true;
+ }
+
+ return false;
+}
+
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -219,7 +261,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
{
struct bio_vec bv;
struct bvec_iter iter;
- bool do_split = true;
+ bool do_split = false;
struct bio *new = NULL;
struct blk_segment_split_ctx ctx = {
@@ -228,47 +270,11 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
};
bio_for_each_bvec(bv, bio, iter) {
- /*
- * If the queue doesn't support SG gaps and adding this
- * offset would create a gap, disallow it.
- */
- if (ctx.prv_valid && bvec_gap_to_prev(q, &ctx.bvprv,
- bv.bv_offset))
- goto split;
-
- if (ctx.sectors + (bv.bv_len >> 9) > ctx.max_sectors) {
- /*
- * Consider this a new segment if we're splitting in
- * the middle of this vector.
- */
- if (ctx.nsegs < ctx.max_segs &&
- ctx.sectors < ctx.max_sectors) {
- /* split in the middle of bvec */
- bv.bv_len =
- (ctx.max_sectors - ctx.sectors) << 9;
- bvec_split_segs(q, &bv, &ctx.nsegs,
- &ctx.sectors, ctx.max_segs);
- }
- goto split;
- }
-
- if (ctx.nsegs == ctx.max_segs)
- goto split;
-
- ctx.bvprv = bv;
- ctx.prv_valid = true;
-
- if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
- ctx.nsegs++;
- ctx.sectors += bv.bv_len >> 9;
- } else if (bvec_split_segs(q, &bv, &ctx.nsegs, &ctx.sectors,
- ctx.max_segs)) {
- goto split;
- }
+ do_split = bvec_should_split(q, &bv, &ctx);
+ if (do_split)
+ break;
}
- do_split = false;
-split:
*segs = ctx.nsegs;
if (do_split) {
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 11/28] block: Create blk_segment_split_ctx
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
In order to support dma-direct bios, blk_bio_segment_split() will
need to operate on both bio_vecs and dma_vecs. In order to do
this the code inside bio_for_each_bvec() needs to be moved into
a generic helper. Step one to do this is to put some of the
variables used inside the loop into a context structure so we
don't need to pass a dozen variables to this new function.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk-merge.c | 55 ++++++++++++++++++++++++++++++-----------------
1 file changed, 35 insertions(+), 20 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3581c7ac3c1b..414e61a714bf 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -201,63 +201,78 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
sectors, max_segs);
}
+struct blk_segment_split_ctx {
+ unsigned nsegs;
+ unsigned sectors;
+
+ bool prv_valid;
+ struct bio_vec bvprv;
+
+ const unsigned max_sectors;
+ const unsigned max_segs;
+};
+
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
unsigned *segs)
{
- struct bio_vec bv, bvprv, *bvprvp = NULL;
+ struct bio_vec bv;
struct bvec_iter iter;
- unsigned nsegs = 0, sectors = 0;
bool do_split = true;
struct bio *new = NULL;
- const unsigned max_sectors = get_max_io_size(q, bio);
- const unsigned max_segs = queue_max_segments(q);
+
+ struct blk_segment_split_ctx ctx = {
+ .max_sectors = get_max_io_size(q, bio),
+ .max_segs = queue_max_segments(q),
+ };
bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+ if (ctx.prv_valid && bvec_gap_to_prev(q, &ctx.bvprv,
+ bv.bv_offset))
goto split;
- if (sectors + (bv.bv_len >> 9) > max_sectors) {
+ if (ctx.sectors + (bv.bv_len >> 9) > ctx.max_sectors) {
/*
* Consider this a new segment if we're splitting in
* the middle of this vector.
*/
- if (nsegs < max_segs &&
- sectors < max_sectors) {
+ if (ctx.nsegs < ctx.max_segs &&
+ ctx.sectors < ctx.max_sectors) {
/* split in the middle of bvec */
- bv.bv_len = (max_sectors - sectors) << 9;
- bvec_split_segs(q, &bv, &nsegs,
- &sectors, max_segs);
+ bv.bv_len =
+ (ctx.max_sectors - ctx.sectors) << 9;
+ bvec_split_segs(q, &bv, &ctx.nsegs,
+ &ctx.sectors, ctx.max_segs);
}
goto split;
}
- if (nsegs == max_segs)
+ if (ctx.nsegs == ctx.max_segs)
goto split;
- bvprv = bv;
- bvprvp = &bvprv;
+ ctx.bvprv = bv;
+ ctx.prv_valid = true;
if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
- nsegs++;
- sectors += bv.bv_len >> 9;
- } else if (bvec_split_segs(q, &bv, &nsegs, &sectors,
- max_segs)) {
+ ctx.nsegs++;
+ ctx.sectors += bv.bv_len >> 9;
+ } else if (bvec_split_segs(q, &bv, &ctx.nsegs, &ctx.sectors,
+ ctx.max_segs)) {
goto split;
}
}
do_split = false;
split:
- *segs = nsegs;
+ *segs = ctx.nsegs;
if (do_split) {
- new = bio_split(bio, sectors, GFP_NOIO, bs);
+ new = bio_split(bio, ctx.sectors, GFP_NOIO, bs);
if (new)
bio = new;
}
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 10/28] block: Create generic vec_split_segs() from bvec_split_segs()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
bvec_split_segs() only requires the address and length of the
vector. In order to generalize it to work with dma_vecs, we just
take the address and length directly instead of the bio_vec.
The function is renamed to vec_split_segs() and a helper is added
to avoid having to adjust the existing callsites.
Note: the new bvec_split_segs() helper will be removed in a subsequent
patch.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk-merge.c | 21 ++++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 17713d7d98d5..3581c7ac3c1b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -158,13 +158,13 @@ static unsigned get_max_segment_size(struct request_queue *q,
}
/*
- * Split the bvec @bv into segments, and update all kinds of
- * variables.
+ * Split an address/offset and length into segments, and
+ * update all kinds of variables.
*/
-static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
- unsigned *nsegs, unsigned *sectors, unsigned max_segs)
+static bool vec_split_segs(struct request_queue *q, unsigned offset,
+ unsigned len, unsigned *nsegs, unsigned *sectors,
+ unsigned max_segs)
{
- unsigned len = bv->bv_len;
unsigned total_len = 0;
unsigned new_nsegs = 0, seg_size = 0;
@@ -173,14 +173,14 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
* current bvec has to be splitted as multiple segments.
*/
while (len && new_nsegs + *nsegs < max_segs) {
- seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
+ seg_size = get_max_segment_size(q, offset + total_len);
seg_size = min(seg_size, len);
new_nsegs++;
total_len += seg_size;
len -= seg_size;
- if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+ if ((offset + total_len) & queue_virt_boundary(q))
break;
}
@@ -194,6 +194,13 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
return !!len;
}
+static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
+ unsigned *nsegs, unsigned *sectors, unsigned max_segs)
+{
+ return vec_split_segs(q, bv->bv_offset, bv->bv_len, nsegs,
+ sectors, max_segs);
+}
+
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 09/28] block: Introduce vec_gap_to_prev()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Introduce vec_gap_to_prev() which is a more general
form of bvec_gap_to_prev().
In order to support splitting dma_vecs we will need to do a similar
calculation using the DMA address and length.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk.h | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/block/blk.h b/block/blk.h
index 4142383eed7a..c5512fefe703 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -98,11 +98,19 @@ static inline bool dmavec_phys_mergeable(struct request_queue *q,
vec2->dv_addr, vec2->dv_len);
}
+static inline bool __vec_gap_to_prev(struct request_queue *q,
+ unsigned int prv_offset, unsigned int prv_len,
+ unsigned int nxt_offset)
+{
+ return (nxt_offset & queue_virt_boundary(q)) ||
+ ((prv_offset + prv_len) & queue_virt_boundary(q));
+}
+
static inline bool __bvec_gap_to_prev(struct request_queue *q,
struct bio_vec *bprv, unsigned int offset)
{
- return (offset & queue_virt_boundary(q)) ||
- ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+ return __vec_gap_to_prev(q, bprv->bv_offset, bprv->bv_len,
+ offset);
}
/*
@@ -117,6 +125,15 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
return __bvec_gap_to_prev(q, bprv, offset);
}
+static inline bool vec_gap_to_prev(struct request_queue *q,
+ unsigned int prv_offset, unsigned int prv_len,
+ unsigned int nxt_offset)
+{
+ if (!queue_virt_boundary(q))
+ return false;
+ return __vec_gap_to_prev(q, prv_offset, prv_len, nxt_offset);
+}
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 08/28] block: Introduce dmavec_phys_mergeable()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Introduce a new helper which is an analog of biovec_phys_mergeable()
for dma-direct vectors.
This also provides a common helper vec_phys_mergeable() for use in
code that's general to both bio_vecs and dma_vecs.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/blk.h | 28 ++++++++++++++++++++++------
1 file changed, 22 insertions(+), 6 deletions(-)
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..4142383eed7a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -66,20 +66,36 @@ static inline void blk_queue_enter_live(struct request_queue *q)
percpu_ref_get(&q->q_usage_counter);
}
+static inline bool vec_phys_mergeable(struct request_queue *q,
+ unsigned long addr1, unsigned int len1,
+ unsigned long addr2, unsigned int len2)
+{
+ unsigned long mask = queue_segment_boundary(q);
+
+ if (addr1 + len1 != addr2)
+ return false;
+ if ((addr1 | mask) != ((addr2 + len2 - 1) | mask))
+ return false;
+ return true;
+}
+
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
{
- unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;
- if (addr1 + vec1->bv_len != addr2)
- return false;
if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page))
return false;
- if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask))
- return false;
- return true;
+
+ return vec_phys_mergeable(q, addr1, vec1->bv_len, addr2, vec2->bv_len);
+}
+
+static inline bool dmavec_phys_mergeable(struct request_queue *q,
+ struct dma_vec *vec1, struct dma_vec *vec2)
+{
+ return vec_phys_mergeable(q, vec1->dv_addr, vec1->dv_len,
+ vec2->dv_addr, vec2->dv_len);
}
static inline bool __bvec_gap_to_prev(struct request_queue *q,
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 07/28] block: Use dma_vec length in bio_cur_bytes() for dma-direct bios
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
For dma-direct bios, use the dv_len of the current vector,
since the bio_vecs are not valid in such a context.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
include/linux/bio.h | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index e212e5958a75..df7973932525 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -91,10 +91,12 @@ static inline bool bio_mergeable(struct bio *bio)
static inline unsigned int bio_cur_bytes(struct bio *bio)
{
- if (bio_has_data(bio))
- return bio_iovec(bio).bv_len;
- else /* dataless requests such as discard */
+ if (!bio_has_data(bio)) /* dataless requests such as discard */
return bio->bi_iter.bi_size;
+ else if (op_is_dma_direct(bio->bi_opf))
+ return bio_dma_vec(bio).dv_len;
+ else
+ return bio_iovec(bio).bv_len;
}
static inline void *bio_data(struct bio *bio)
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 06/28] block: Support dma-direct bios in bio_advance_iter()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
Dma-direct bio iterators need to be advanced using a similar
dvec_iter_advance helper.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
include/linux/bio.h | 2 ++
1 file changed, 2 insertions(+)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8180309123d7..e212e5958a75 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -134,6 +134,8 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
if (bio_no_advance_iter(bio))
iter->bi_size -= bytes;
+ else if (op_is_dma_direct(bio->bi_opf))
+ dvec_iter_advance(bio->bi_dma_vec, iter, bytes);
else
bvec_iter_advance(bio->bi_io_vec, iter, bytes);
/* TODO: It is reasonable to complete bio with error here. */
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 05/28] block: Skip dma-direct bios in bio_integrity_prep()
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
The block layer will not be able to handle integrity for dma-direct
bios seeing it does not have access to the underlying data.
If users of dma-direct require integrity, they will have to handle it
in the layer creating the bios. This is left as future work should
somebody care about handling such a case.
Thus, bio_integrity_prep() should ignore dma-direct bios.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/bio-integrity.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4db620849515..10fdf456fcd8 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -221,6 +221,10 @@ bool bio_integrity_prep(struct bio *bio)
if (bio_integrity(bio))
return true;
+ /* The block layer cannot handle integrity for dma-direct bios */
+ if (bio_is_dma_direct(bio))
+ return true;
+
if (bio_data_dir(bio) == READ) {
if (!bi->profile->verify_fn ||
!(bi->flags & BLK_INTEGRITY_VERIFY))
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 04/28] block: Never bounce dma-direct bios
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
It is expected the creator of the dma-direct bio will ensure the
target device can access the DMA address it's creating bios for.
It's also not possible to bounce a dma-direct bio seeing the block
layer doesn't have any way to access the underlying data behind
the DMA address.
Thus, never bounce dma-direct bios.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/bounce.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/block/bounce.c b/block/bounce.c
index f8ed677a1bf7..17e020a40cca 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -367,6 +367,14 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
if (!bio_has_data(*bio_orig))
return;
+ /*
+ * For DMA direct bios, upper layers are expected to ensure
+ * the device in question can access the DMA addresses. So
+ * it never makes sense to bounce a DMA direct bio.
+ */
+ if (bio_is_dma_direct(*bio_orig))
+ return;
+
/*
* for non-isa bounce case, just check if the bounce pfn is equal
* to or bigger than the highest pfn in the system -- in that case,
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 03/28] block: Warn on mis-use of dma-direct bios
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
This is a result of an audit of users of 'bi_io_vec'. A number of
warnings and blocking conditions are added to ensure dma-direct bios
are not incorrectly accessing the 'bi_io_vec' when they should access
the 'bi_dma_vec'. These are largely just protecting against mis-uses
in future development so depending on taste and public opinion some
or all of these checks may not be necessary.
A few other issues with dma-direct bios will be tackled in subsequent
patches.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
block/bio.c | 33 +++++++++++++++++++++++++++++++++
block/blk-core.c | 3 +++
2 files changed, 36 insertions(+)
diff --git a/block/bio.c b/block/bio.c
index 683cbb40f051..6998fceddd36 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -525,6 +525,9 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
struct bio_vec bv;
struct bvec_iter iter;
+ if (WARN_ON_ONCE(bio_is_dma_direct(bio)))
+ return;
+
__bio_for_each_segment(bv, bio, iter, start) {
char *data = bvec_kmap_irq(&bv, &flags);
memset(data, 0, bv.bv_len);
@@ -707,6 +710,8 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
*/
if (unlikely(bio_flagged(bio, BIO_CLONED)))
return 0;
+ if (unlikely(bio_is_dma_direct(bio)))
+ return 0;
if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
return 0;
@@ -783,6 +788,8 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
+ if (WARN_ON_ONCE(bio_is_dma_direct(bio)))
+ return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
@@ -814,6 +821,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
WARN_ON_ONCE(bio_full(bio));
+ WARN_ON_ONCE(bio_is_dma_direct(bio));
bv->bv_page = page;
bv->bv_offset = off;
@@ -851,6 +859,8 @@ static void bio_get_pages(struct bio *bio)
struct bvec_iter_all iter_all;
struct bio_vec *bvec;
+ WARN_ON_ONCE(bio_is_dma_direct(bio));
+
bio_for_each_segment_all(bvec, bio, iter_all)
get_page(bvec->bv_page);
}
@@ -956,6 +966,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
if (WARN_ON_ONCE(bio->bi_vcnt))
return -EINVAL;
+ if (WARN_ON_ONCE(bio_is_dma_direct(bio)))
+ return -EINVAL;
do {
if (is_bvec)
@@ -1029,6 +1041,9 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
void *src_p, *dst_p;
unsigned bytes;
+ if (WARN_ON_ONCE(bio_is_dma_direct(src) || bio_is_dma_direct(dst)))
+ return;
+
while (src_iter->bi_size && dst_iter->bi_size) {
src_bv = bio_iter_iovec(src, *src_iter);
dst_bv = bio_iter_iovec(dst, *dst_iter);
@@ -1143,6 +1158,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
+ if (WARN_ON_ONCE(bio_is_dma_direct(bio)))
+ return -EINVAL;
+
bio_for_each_segment_all(bvec, bio, iter_all) {
ssize_t ret;
@@ -1174,6 +1192,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
+ if (WARN_ON_ONCE(bio_is_dma_direct(bio)))
+ return -EINVAL;
+
bio_for_each_segment_all(bvec, bio, iter_all) {
ssize_t ret;
@@ -1197,6 +1218,9 @@ void bio_free_pages(struct bio *bio)
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
+ if (WARN_ON_ONCE(bio_is_dma_direct(bio)))
+ return;
+
bio_for_each_segment_all(bvec, bio, iter_all)
__free_page(bvec->bv_page);
}
@@ -1653,6 +1677,9 @@ void bio_set_pages_dirty(struct bio *bio)
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
+ if (unlikely(bio_is_dma_direct(bio)))
+ return;
+
bio_for_each_segment_all(bvec, bio, iter_all) {
if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
@@ -1704,6 +1731,9 @@ void bio_check_pages_dirty(struct bio *bio)
unsigned long flags;
struct bvec_iter_all iter_all;
+ if (unlikely(bio_is_dma_direct(bio)))
+ return;
+
bio_for_each_segment_all(bvec, bio, iter_all) {
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
goto defer;
@@ -1777,6 +1807,9 @@ void bio_flush_dcache_pages(struct bio *bi)
struct bio_vec bvec;
struct bvec_iter iter;
+ if (unlikely(bio_is_dma_direct(bi)))
+ return;
+
bio_for_each_segment(bvec, bi, iter)
flush_dcache_page(bvec.bv_page);
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 8340f69670d8..ea152d54c7ce 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1467,6 +1467,9 @@ void rq_flush_dcache_pages(struct request *rq)
struct req_iterator iter;
struct bio_vec bvec;
+ if (unlikely(blk_rq_is_dma_direct(rq)))
+ return;
+
rq_for_each_segment(bvec, rq, iter)
flush_dcache_page(bvec.bv_page);
}
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 02/28] block: Add dma_vec structure
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
The dma_vec structure is similar to the bio_vec structure except
it only stores DMA addresses instead of the struct page address.
struct bios will be able to make use of dma_vecs with a union and,
therefore, we need to ensure that struct dma_vec is no larger
than struct bio_vec, as they will share the allocated memory.
dma_vecs can make the same use of the bvec_iter structure
to iterate through the vectors.
This will be used for passing DMA addresses directly through the block
layer. I expect something like struct dma_vec will also be used in
Christoph's work to improve the dma_mapping layer and remove sgls.
At some point, these would use the same structure.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
include/linux/bio.h | 12 +++++++++++
include/linux/blk_types.h | 5 ++++-
include/linux/bvec.h | 43 +++++++++++++++++++++++++++++++++++++++
3 files changed, 59 insertions(+), 1 deletion(-)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0f23b5682640..8180309123d7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -28,6 +28,8 @@
#define bio_iter_iovec(bio, iter) \
bvec_iter_bvec((bio)->bi_io_vec, (iter))
+#define bio_iter_dma_vec(bio, iter) \
+ bvec_iter_dvec((bio)->bi_dma_vec, (iter))
#define bio_iter_page(bio, iter) \
bvec_iter_page((bio)->bi_io_vec, (iter))
@@ -39,6 +41,7 @@
#define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter)
#define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter)
#define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter)
+#define bio_dma_vec(bio) bio_iter_dma_vec((bio), (bio)->bi_iter)
#define bio_multiple_segments(bio) \
((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len)
@@ -155,6 +158,15 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
#define bio_for_each_bvec(bvl, bio, iter) \
__bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter)
+#define __bio_for_each_dvec(dvl, bio, iter, start) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((dvl = bvec_iter_dvec((bio)->bi_dma_vec, (iter))), 1); \
+ dvec_iter_advance((bio)->bi_dma_vec, &(iter), (dvl).dv_len))
+
+#define bio_for_each_dvec(dvl, bio, iter) \
+ __bio_for_each_dvec(dvl, bio, iter, (bio)->bi_iter)
+
#define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
static inline unsigned bio_segments(struct bio *bio)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f3cabfdb6774..7f76ea73b77d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -191,7 +191,10 @@ struct bio {
atomic_t __bi_cnt; /* pin count */
- struct bio_vec *bi_io_vec; /* the actual vec list */
+ union {
+ struct bio_vec *bi_io_vec; /* the actual vec list */
+ struct dma_vec *bi_dma_vec; /* for dma direct bios*/
+ };
struct bio_set *bi_pool;
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index a032f01e928c..f680e96132ef 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -21,6 +21,11 @@ struct bio_vec {
unsigned int bv_offset;
};
+struct dma_vec {
+ dma_addr_t dv_addr;
+ unsigned int dv_len;
+};
+
struct bvec_iter {
sector_t bi_sector; /* device address in 512 byte
sectors */
@@ -84,6 +89,18 @@ struct bvec_iter_all {
.bv_offset = bvec_iter_offset((bvec), (iter)), \
})
+#define bvec_iter_dvec_addr(dvec, iter) \
+ (__bvec_iter_bvec((dvec), (iter))->dv_addr + (iter).bi_bvec_done)
+#define bvec_iter_dvec_len(dvec, iter) \
+ min((iter).bi_size, \
+ __bvec_iter_bvec((dvec), (iter))->dv_len - (iter).bi_bvec_done)
+
+#define bvec_iter_dvec(dvec, iter) \
+((struct dma_vec) { \
+ .dv_addr = bvec_iter_dvec_addr((dvec), (iter)), \
+ .dv_len = bvec_iter_dvec_len((dvec), (iter)), \
+})
+
static inline bool bvec_iter_advance(const struct bio_vec *bv,
struct bvec_iter *iter, unsigned bytes)
{
@@ -110,6 +127,32 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
return true;
}
+static inline bool dvec_iter_advance(const struct dma_vec *dv,
+ struct bvec_iter *iter, unsigned bytes)
+{
+ if (WARN_ONCE(bytes > iter->bi_size,
+ "Attempted to advance past end of dvec iter\n")) {
+ iter->bi_size = 0;
+ return false;
+ }
+
+ while (bytes) {
+ const struct dma_vec *cur = dv + iter->bi_idx;
+ unsigned len = min3(bytes, iter->bi_size,
+ cur->dv_len - iter->bi_bvec_done);
+
+ bytes -= len;
+ iter->bi_size -= len;
+ iter->bi_bvec_done += len;
+
+ if (iter->bi_bvec_done == cur->dv_len) {
+ iter->bi_bvec_done = 0;
+ iter->bi_idx++;
+ }
+ }
+ return true;
+}
+
#define for_each_bvec(bvl, bio_vec, iter, start) \
for (iter = (start); \
(iter).bi_size && \
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 01/28] block: Introduce DMA direct request type
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
In-Reply-To: <20190620161240.22738-1-logang@deltatee.com>
A DMA direct request allows passing DMA addresses directly through
the block layer, instead of struct pages. This allows the calling
layer to take care of the mapping and unmapping and also creates
a path to doing peer-to-peer transactions without using struct pages.
Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
---
include/linux/blk_types.h | 9 ++++++++-
include/linux/blkdev.h | 10 ++++++++++
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 95202f80676c..f3cabfdb6774 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -322,6 +322,7 @@ enum req_flag_bits {
__REQ_NOUNMAP, /* do not free blocks when zeroing */
__REQ_HIPRI,
+ __REQ_DMA_DIRECT, /* DMA address direct request */
/* for driver use */
__REQ_DRV,
@@ -345,6 +346,7 @@ enum req_flag_bits {
#define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
#define REQ_HIPRI (1ULL << __REQ_HIPRI)
+#define REQ_DMA_DIRECT (1ULL << __REQ_DMA_DIRECT)
#define REQ_DRV (1ULL << __REQ_DRV)
#define REQ_SWAP (1ULL << __REQ_SWAP)
@@ -353,7 +355,7 @@ enum req_flag_bits {
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
#define REQ_NOMERGE_FLAGS \
- (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
+ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA | REQ_DMA_DIRECT)
enum stat_group {
STAT_READ,
@@ -412,6 +414,11 @@ static inline int op_stat_group(unsigned int op)
return op_is_write(op);
}
+static inline int op_is_dma_direct(unsigned int op)
+{
+ return op & REQ_DMA_DIRECT;
+}
+
typedef unsigned int blk_qc_t;
#define BLK_QC_T_NONE -1U
#define BLK_QC_T_SHIFT 16
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 592669bcc536..ce70d5dded5f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -271,6 +271,16 @@ static inline bool bio_is_passthrough(struct bio *bio)
return blk_op_is_scsi(op) || blk_op_is_private(op);
}
+static inline bool bio_is_dma_direct(struct bio *bio)
+{
+ return op_is_dma_direct(bio->bi_opf);
+}
+
+static inline bool blk_rq_is_dma_direct(struct request *rq)
+{
+ return op_is_dma_direct(rq->cmd_flags);
+}
+
static inline unsigned short req_get_ioprio(struct request *req)
{
return req->ioprio;
--
2.20.1
^ permalink raw reply related
* [RFC PATCH 00/28] Removing struct page from P2PDMA
From: Logan Gunthorpe @ 2019-06-20 16:12 UTC (permalink / raw)
For eons there has been a debate over whether or not to use
struct pages for peer-to-peer DMA transactions. Pro-pagers have
argued that struct pages are necessary for interacting with
existing code like scatterlists or the bio_vecs. Anti-pagers
assert that the tracking of the memory is unnecessary and
allocating the pages is a waste of memory. Both viewpoints are
valid, however developers working on GPUs and RDMA tend to be
able to do away with struct pages relatively easily compared to
those wanting to work with NVMe devices through the block layer.
So it would be of great value to be able to universally do P2PDMA
transactions without the use of struct pages.
Previously, there have been multiple attempts[1][2] to replace
struct page usage with pfn_t but this has been unpopular since
it creates dangerous edge cases where unsuspecting code might
run across pfn_t's it is not ready for.
Currently, we have P2PDMA using struct pages through the block layer
and the dangerous cases are avoided by using a queue flag that
indicates support for the special pages.
This RFC proposes a new solution: allow the block layer to take
DMA addresses directly for queues that indicate support. This will
provide a more general path for doing P2PDMA-like requests and will
allow us to remove the struct pages that back P2PDMA memory, thus paving
the way to build a more uniform P2PDMA ecosystem.
This is a fairly long patch set but most of the patches are quite
small. Patches 1 through 18 introduce the concept of a dma_vec that
is similar to a bio_vec (except it takes dma_addr_t's instead of pages
and offsets) as well as a special dma-direct bio/request. Most of these
patches just prevent the new type of bio from being mis-used and
also support splitting and mapping them in the same way that struct
page bios can be operated on. Patches 19 through 22 modify the existing
P2PDMA support in nvme-pci, ib-core and nvmet to use DMA addresses
directly. Patches 23 through 25 remove the P2PDMA specific
code from the block layer and ib-core. Finally, patches 26 through 28
remove the struct pages from the PCI P2PDMA code.
This RFC is based on v5.2-rc5 and a git branch is available here:
https://github.com/sbates130272/linux-p2pmem.git dma_direct_rfc1
[1] https://lwn.net/Articles/647404/
[2] https://lore.kernel.org/lkml/1495662147-18277-1-git-send-email-logang at deltatee.com/
--
Logan Gunthorpe (28):
block: Introduce DMA direct request type
block: Add dma_vec structure
block: Warn on mis-use of dma-direct bios
block: Never bounce dma-direct bios
block: Skip dma-direct bios in bio_integrity_prep()
block: Support dma-direct bios in bio_advance_iter()
block: Use dma_vec length in bio_cur_bytes() for dma-direct bios
block: Introduce dmavec_phys_mergeable()
block: Introduce vec_gap_to_prev()
block: Create generic vec_split_segs() from bvec_split_segs()
block: Create blk_segment_split_ctx
block: Create helper for bvec_should_split()
block: Generalize bvec_should_split()
block: Support splitting dma-direct bios
block: Support counting dma-direct bio segments
block: Implement mapping dma-direct requests to SGs in blk_rq_map_sg()
block: Introduce queue flag to indicate support for dma-direct bios
block: Introduce bio_add_dma_addr()
nvme-pci: Support dma-direct bios
IB/core: Introduce API for initializing a RW ctx from a DMA address
nvmet: Split nvmet_bdev_execute_rw() into a helper function
nvmet: Use DMA addresses instead of struct pages for P2P
nvme-pci: Remove support for PCI_P2PDMA requests
block: Remove PCI_P2PDMA queue flag
IB/core: Remove P2PDMA mapping support in rdma_rw_ctx
PCI/P2PDMA: Remove SGL helpers
PCI/P2PDMA: Remove struct pages that back P2PDMA memory
memremap: Remove PCI P2PDMA page memory type
Documentation/driver-api/pci/p2pdma.rst | 9 +-
block/bio-integrity.c | 4 +
block/bio.c | 71 +++++++
block/blk-core.c | 3 +
block/blk-merge.c | 256 ++++++++++++++++++------
block/blk.h | 49 ++++-
block/bounce.c | 8 +
drivers/infiniband/core/rw.c | 85 ++++++--
drivers/nvme/host/core.c | 4 +-
drivers/nvme/host/nvme.h | 2 +-
drivers/nvme/host/pci.c | 29 ++-
drivers/nvme/target/core.c | 12 +-
drivers/nvme/target/io-cmd-bdev.c | 82 +++++---
drivers/nvme/target/nvmet.h | 5 +-
drivers/nvme/target/rdma.c | 43 +++-
drivers/pci/p2pdma.c | 202 +++----------------
include/linux/bio.h | 32 ++-
include/linux/blk_types.h | 14 +-
include/linux/blkdev.h | 16 +-
include/linux/bvec.h | 43 ++++
include/linux/memremap.h | 5 -
include/linux/mm.h | 13 --
include/linux/pci-p2pdma.h | 19 --
include/rdma/rw.h | 6 +
24 files changed, 648 insertions(+), 364 deletions(-)
--
2.20.1
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.