* [PATCH 1/6] IB: New common API for draining a queue pair
From: Steve Wise @ 2016-02-04 21:51 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
From: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Add a provider-specific drain_qp function for providers that need special
drain logic.

Add the static function __ib_drain_qp(), which posts no-op WRs to the RQ and
SQ and blocks until their completions have been processed. This ensures that
all of the application's completions have been reaped.

Add the API function ib_drain_qp(), which calls the provider-specific drain
if one exists and otherwise falls back to __ib_drain_qp().
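
As an illustration only, here is a minimal teardown sketch in the spirit of
the nvme-rdma change in patch 3/6 (disconnect, drain, then free); struct
my_conn, its members, and my_free_request_pool() are hypothetical names and
not part of this series:

static void my_ulp_teardown(struct my_conn *conn)
{
	/* Stop the peer first so no new work is posted or completed. */
	rdma_disconnect(conn->cm_id);

	/*
	 * Block until every WR posted to the SQ and RQ has been flushed
	 * and its completion has been processed.
	 */
	ib_drain_qp(conn->cm_id->qp);

	/* Now it is safe to free per-request state: nothing is in flight. */
	my_free_request_pool(conn);

	rdma_destroy_qp(conn->cm_id);
	rdma_destroy_id(conn->cm_id);
}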
Reviewed-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
Signed-off-by: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
drivers/infiniband/core/verbs.c | 72 +++++++++++++++++++++++++++++++++++++++++
include/rdma/ib_verbs.h | 2 ++
2 files changed, 74 insertions(+)
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 5af6d02..31b82cd 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1657,3 +1657,75 @@ next_page:
return i;
}
EXPORT_SYMBOL(ib_sg_to_pages);
+
+struct ib_drain_cqe {
+ struct ib_cqe cqe;
+ struct completion done;
+};
+
+static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe,
+ cqe);
+
+ complete(&cqe->done);
+}
+
+/*
+ * Post a WR and block until its completion is reaped for both the RQ and SQ.
+ */
+static void __ib_drain_qp(struct ib_qp *qp)
+{
+ struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ struct ib_drain_cqe rdrain, sdrain;
+ struct ib_recv_wr rwr = {}, *bad_rwr;
+ struct ib_send_wr swr = {}, *bad_swr;
+ int ret;
+
+ rwr.wr_cqe = &rdrain.cqe;
+ rdrain.cqe.done = ib_drain_qp_done;
+ init_completion(&rdrain.done);
+
+ swr.wr_cqe = &sdrain.cqe;
+ sdrain.cqe.done = ib_drain_qp_done;
+ init_completion(&sdrain.done);
+
+ ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
+ return;
+ }
+
+ ret = ib_post_recv(qp, &rwr, &bad_rwr);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
+ return;
+ }
+
+ ret = ib_post_send(qp, &swr, &bad_swr);
+ if (ret) {
+ WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
+ return;
+ }
+
+ wait_for_completion(&rdrain.done);
+ wait_for_completion(&sdrain.done);
+}
+
+/**
+ * ib_drain_qp() - Block until all CQEs have been consumed by the
+ * application.
+ * @qp: queue pair to drain
+ *
+ * If the device has a provider-specific drain function, then
+ * call that. Otherwise call the generic drain function
+ * __ib_drain_qp().
+ */
+void ib_drain_qp(struct ib_qp *qp)
+{
+ if (qp->device->drain_qp)
+ qp->device->drain_qp(qp);
+ else
+ __ib_drain_qp(qp);
+}
+EXPORT_SYMBOL(ib_drain_qp);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 284b00c..d8533ab 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1846,6 +1846,7 @@ struct ib_device {
int (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
struct ib_mr_status *mr_status);
void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
+ void (*drain_qp)(struct ib_qp *qp);
struct ib_dma_mapping_ops *dma_ops;
@@ -3094,4 +3095,5 @@ int ib_sg_to_pages(struct ib_mr *mr,
int sg_nents,
int (*set_page)(struct ib_mr *, u64));
+void ib_drain_qp(struct ib_qp *qp);
#endif /* IB_VERBS_H */
--
2.7.0
* [PATCH 2/6] iw_cxgb4: add drain_qp function
From: Steve Wise @ 2016-02-04 21:52 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
From: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Reviewed-by: Sagi Grimberg <sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Reviewed-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
Signed-off-by: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
drivers/infiniband/hw/cxgb4/cq.c | 6 +++++-
drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 ++
drivers/infiniband/hw/cxgb4/provider.c | 1 +
drivers/infiniband/hw/cxgb4/qp.c | 8 ++++++++
4 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index cf21df4..6fdcf78 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -815,8 +815,12 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
}
}
out:
- if (wq)
+ if (wq) {
+ if (unlikely(qhp->attr.state != C4IW_QP_STATE_RTS &&
+ t4_sq_empty(wq) && t4_rq_empty(wq)))
+ complete(&qhp->qp_drained);
spin_unlock(&qhp->lock);
+ }
return ret;
}
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index fb2de75..fdb9d9a 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -476,6 +476,7 @@ struct c4iw_qp {
wait_queue_head_t wait;
struct timer_list timer;
int sq_sig_all;
+ struct completion qp_drained;
};
static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
@@ -1016,6 +1017,7 @@ extern int c4iw_wr_log;
extern int db_fc_threshold;
extern int db_coalescing_threshold;
extern int use_dsgl;
+void c4iw_drain_qp(struct ib_qp *qp);
#endif
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index ec04272..0ab942f 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -564,6 +564,7 @@ int c4iw_register_device(struct c4iw_dev *dev)
dev->ibdev.get_protocol_stats = c4iw_get_mib;
dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
dev->ibdev.get_port_immutable = c4iw_port_immutable;
+ dev->ibdev.drain_qp = c4iw_drain_qp;
dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
if (!dev->ibdev.iwcm)
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index e99345e..2e70c01 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -1697,6 +1697,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
qhp->attr.max_ird = 0;
qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR;
spin_lock_init(&qhp->lock);
+ init_completion(&qhp->qp_drained);
mutex_init(&qhp->mutex);
init_waitqueue_head(&qhp->wait);
atomic_set(&qhp->refcnt, 1);
@@ -1888,3 +1889,10 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0;
return 0;
}
+
+void c4iw_drain_qp(struct ib_qp *ibqp)
+{
+ struct c4iw_qp *qp = to_c4iw_qp(ibqp);
+
+ wait_for_completion(&qp->qp_drained);
+}
--
2.7.0
* [PATCH 3/6] nvme-rdma: use ib_drain_qp() function
From: Steve Wise @ 2016-02-04 21:52 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
From: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Reviewed-by: Sagi Grimberg <sagig-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Reviewed-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
Signed-off-by: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
drivers/nvme/host/rdma.c | 35 +++--------------------------------
drivers/nvme/target/rdma.c | 36 +++---------------------------------
2 files changed, 6 insertions(+), 65 deletions(-)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index a3e5c3a..613cc39 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -101,8 +101,6 @@ struct nvme_rdma_queue {
struct rdma_cm_id *cm_id;
int cm_error;
struct completion cm_done;
-
- struct completion drain_done;
};
enum nvme_rdma_ctrl_state {
@@ -283,7 +281,8 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
return -ENOMEM;
init_attr->event_handler = nvme_rdma_qp_event;
- init_attr->cap.max_send_wr = factor * queue->queue_size;
+ /* +1 for drain */
+ init_attr->cap.max_send_wr = factor * queue->queue_size + 1;
/* +1 for drain */
init_attr->cap.max_recv_wr = queue->queue_size + 1;
init_attr->cap.max_recv_sge = 1;
@@ -638,7 +637,6 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
queue = &ctrl->queues[idx];
queue->ctrl = ctrl;
init_completion(&queue->cm_done);
- init_completion(&queue->drain_done);
if (idx > 0) {
queue->cmnd_capsule_len =
@@ -682,40 +680,13 @@ out_destroy_cm_id:
return ret;
}
-static void nvme_rdma_drain_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct nvme_rdma_queue *queue = cq->cq_context;
-
- complete(&queue->drain_done);
-}
-
-static struct ib_cqe nvme_rdma_drain_cqe = {
- .done = nvme_rdma_drain_done,
-};
-
-static void nvme_rdma_drain_qp(struct nvme_rdma_queue *queue)
-{
- struct ib_qp *qp = queue->cm_id->qp;
- struct ib_recv_wr wr = { }, *bad_wr;
- int ret;
-
- wr.wr_cqe = &nvme_rdma_drain_cqe;
- ret = ib_post_recv(qp, &wr, &bad_wr);
- if (ret) {
- WARN_ONCE(ret, "ib_post_recv(returned %d\n", ret);
- return;
- }
-
- wait_for_completion(&queue->drain_done);
-}
-
static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
{
if (!test_and_clear_bit(NVME_RDMA_Q_CONNECTED, &queue->flags))
return;
rdma_disconnect(queue->cm_id);
- nvme_rdma_drain_qp(queue);
+ ib_drain_qp(queue->cm_id->qp);
nvme_rdma_destroy_queue_ib(queue);
rdma_destroy_id(queue->cm_id);
}
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 85fd44a..19137c3 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -87,7 +87,6 @@ struct nvmet_rdma_queue {
int recv_queue_size;
int send_queue_size;
- struct completion drain_done;
struct list_head queue_list;
};
@@ -901,7 +900,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
if (ndev->srq) {
qp_attr->srq = ndev->srq;
} else {
- qp_attr->cap.max_recv_wr = queue->recv_queue_size;
+ /* +1 for drain */
+ qp_attr->cap.max_recv_wr = 1 + queue->recv_queue_size;
qp_attr->cap.max_recv_sge = 2;
}
@@ -1165,7 +1165,6 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
ret = -ENOMEM;
goto put_device;
}
- init_completion(&queue->drain_done);
cm_id->context = queue;
ret = nvmet_rdma_cm_accept(cm_id, queue);
@@ -1214,35 +1213,6 @@ out_unlock:
spin_unlock_irqrestore(&queue->state_lock, flags);
}
-static void nvmet_rdma_drain_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct nvmet_rdma_queue *queue = cq->cq_context;
-
- complete(&queue->drain_done);
-}
-
-static struct ib_cqe nvmet_rdma_drain_cqe = {
- .done = nvmet_rdma_drain_done,
-};
-
-static void nvmet_rdma_queue_drain(struct nvmet_rdma_queue *queue)
-{
- struct ib_qp *qp = queue->cm_id->qp;
- struct ib_send_wr wr = { }, *bad_wr;
- int ret;
-
- wr.wr_cqe = &nvmet_rdma_drain_cqe;
- wr.opcode = IB_WR_RDMA_WRITE;
- wr.send_flags = IB_SEND_SIGNALED;
- ret = ib_post_send(qp, &wr, &bad_wr);
- if (ret) {
- WARN_ONCE(ret, "ib_post_send(returned %d\n", ret);
- return;
- }
-
- wait_for_completion(&queue->drain_done);
-}
-
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
bool disconnect = false;
@@ -1264,7 +1234,7 @@ static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
if (disconnect) {
rdma_disconnect(queue->cm_id);
- nvmet_rdma_queue_drain(queue);
+ ib_drain_qp(queue->cm_id->qp);
kref_put(&queue->ref, nvmet_rdma_queue_put);
}
}
--
2.7.0
* [PATCH 4/6] IB: add a simple MR pool
From: Steve Wise @ 2016-02-04 21:52 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
From: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
Signed-off-by: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
drivers/infiniband/core/Makefile | 2 +-
drivers/infiniband/core/mr_pool.c | 85 +++++++++++++++++++++++++++++++++++++++
drivers/infiniband/core/verbs.c | 4 ++
include/rdma/ib_verbs.h | 10 ++++-
include/rdma/mr_pool.h | 20 +++++++++
5 files changed, 119 insertions(+), 2 deletions(-)
create mode 100644 drivers/infiniband/core/mr_pool.c
create mode 100644 include/rdma/mr_pool.h
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index f818538..48bd9d8 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o \
- roce_gid_mgmt.o
+ roce_gid_mgmt.o mr_pool.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c
new file mode 100644
index 0000000..b0a04c8
--- /dev/null
+++ b/drivers/infiniband/core/mr_pool.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/mr_pool.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp)
+{
+ struct ib_mr *mr = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ mr = list_first_entry_or_null(&qp->free_mrs, struct ib_mr, qp_entry);
+ if (mr)
+ list_move(&mr->qp_entry, &qp->used_mrs);
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_mr_pool_get);
+
+void ib_mr_pool_put(struct ib_qp *qp, struct ib_mr *mr)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ list_move(&mr->qp_entry, &qp->free_mrs);
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_put);
+
+int ib_mr_pool_init(struct ib_qp *qp, int nr, enum ib_mr_type type,
+ u32 max_num_sg)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+ int ret, i;
+
+ for (i = 0; i < nr; i++) {
+ mr = ib_alloc_mr(qp->pd, type, max_num_sg);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto out;
+ }
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ list_add_tail(&mr->qp_entry, &qp->free_mrs);
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+ }
+
+ return 0;
+out:
+ ib_mr_pool_destroy(qp);
+ return ret;
+}
+EXPORT_SYMBOL(ib_mr_pool_init);
+
+void ib_mr_pool_destroy(struct ib_qp *qp)
+{
+ struct ib_mr *mr;
+ unsigned long flags;
+
+ WARN_ON_ONCE(!list_empty(&qp->used_mrs));
+
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ while (!list_empty(&qp->free_mrs)) {
+ mr = list_first_entry(&qp->free_mrs, struct ib_mr, qp_entry);
+ list_del(&mr->qp_entry);
+
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+ ib_dereg_mr(mr);
+ spin_lock_irqsave(&qp->mr_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp->mr_lock, flags);
+}
+EXPORT_SYMBOL(ib_mr_pool_destroy);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 31b82cd..879839f 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -738,6 +738,10 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
qp->uobject = NULL;
qp->qp_type = qp_init_attr->qp_type;
+ spin_lock_init(&qp->mr_lock);
+ INIT_LIST_HEAD(&qp->free_mrs);
+ INIT_LIST_HEAD(&qp->used_mrs);
+
atomic_set(&qp->usecnt, 0);
if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
qp->event_handler = __ib_shared_qp_event_handler;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d8533ab..380d03d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1408,6 +1408,11 @@ struct ib_qp {
struct ib_srq *srq;
struct ib_xrcd *xrcd; /* XRC TGT QPs only */
struct list_head xrcd_list;
+
+ spinlock_t mr_lock;
+ struct list_head free_mrs;
+ struct list_head used_mrs;
+
/* count times opened, mcast attaches, flow attaches */
atomic_t usecnt;
struct list_head open_list;
@@ -1422,12 +1427,15 @@ struct ib_qp {
struct ib_mr {
struct ib_device *device;
struct ib_pd *pd;
- struct ib_uobject *uobject;
u32 lkey;
u32 rkey;
u64 iova;
u32 length;
unsigned int page_size;
+ union {
+ struct ib_uobject *uobject; /* user */
+ struct list_head qp_entry; /* FR */
+ };
};
struct ib_mw {
diff --git a/include/rdma/mr_pool.h b/include/rdma/mr_pool.h
new file mode 100644
index 0000000..b4ee196
--- /dev/null
+++ b/include/rdma/mr_pool.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <rdma/ib_verbs.h>
+
+struct ib_mr *ib_mr_pool_get(struct ib_qp *qp);
+void ib_mr_pool_put(struct ib_qp *qp, struct ib_mr *mr);
+
+int ib_mr_pool_init(struct ib_qp *qp, int nr, enum ib_mr_type type,
+ u32 max_num_sg);
+void ib_mr_pool_destroy(struct ib_qp *qp);
--
2.7.0
* [PATCH 5/6] IB: generic RDMA READ/WRITE API
From: Steve Wise @ 2016-02-04 21:56 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
From: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
This supports both manual mapping of large SGE lists and using MRs from the
QP's MR pool, for iWARP or other cases where that is more efficient. For now,
MRs are only used for iWARP transports. The user of the RDMA-RW API must
allocate the QP's MR pool and size the SQ accordingly.
XXX: for now added under drivers/nvme. Should move to drivers/infiniband
before submitting upstream.
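
For illustration, a minimal sketch of the intended calling sequence for an
RDMA READ, loosely mirroring how patch 6/6 wires the API into nvmet_rdma.
Everything prefixed my_ is hypothetical, error handling is abbreviated, and
this is a sketch of the usage pattern rather than a definitive consumer:

/* On iWARP the QP's MR pool must be populated before any READ is posted. */
static int my_setup_qp(struct my_queue *queue, struct ib_device *dev)
{
	if (!rdma_protocol_iwarp(dev, queue->port_num))
		return 0;
	return ib_mr_pool_init(queue->qp, queue->queue_size,
			       IB_MR_TYPE_MEM_REG,
			       dev->attrs.max_fast_reg_page_list_len);
}

/* Completion handler for the last (signaled) WR of the READ chain. */
static void my_read_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct my_request *req =
		container_of(wc->wr_cqe, struct my_request, read_cqe);

	/* Frees the SGEs/WRs, or returns the MRs to the QP pool on iWARP. */
	rdma_rw_ctx_destroy(&req->rw, req->qp);
	my_execute_request(req);
}

static int my_start_read(struct my_request *req, u64 raddr, u32 rkey, u32 len)
{
	int ret;

	req->read_cqe.done = my_read_done;

	/* Map req->sg and build the READ (and, on iWARP, REG_MR/INV) WRs. */
	ret = rdma_rw_ctx_init(&req->rw, req->qp, req->port_num,
			       req->sg, req->sg_cnt, len, raddr, rkey,
			       DMA_FROM_DEVICE, 0);
	if (ret < 0)
		return ret;
	/* ret is the number of send-queue slots the post below consumes. */

	/* No chain_wr, so the last WR is signaled and reports to read_cqe. */
	return rdma_rw_post(&req->rw, req->qp, &req->read_cqe, NULL);
}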
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
Signed-off-by: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
drivers/nvme/target/Makefile | 2 +-
drivers/nvme/target/rw.c | 348 +++++++++++++++++++++++++++++++++++++++++++
drivers/nvme/target/rw.h | 81 ++++++++++
3 files changed, 430 insertions(+), 1 deletion(-)
create mode 100644 drivers/nvme/target/rw.c
create mode 100644 drivers/nvme/target/rw.h
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index e5b8680..a9b50b5 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -6,4 +6,4 @@ obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o
nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o
nvme-loop-y += loop.o
-nvmet-rdma-y += rdma.o
+nvmet-rdma-y += rdma.o rw.o
diff --git a/drivers/nvme/target/rw.c b/drivers/nvme/target/rw.c
new file mode 100644
index 0000000..0350207
--- /dev/null
+++ b/drivers/nvme/target/rw.c
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/slab.h>
+#include <rdma/mr_pool.h>
+#include "rw.h"
+
+static inline u32 rdma_max_sge(struct rdma_rw_ctx *ctx, struct ib_device *dev)
+{
+ return ctx->dma_dir == DMA_TO_DEVICE ? dev->attrs.max_sge : dev->attrs.max_sge_rd;
+}
+
+static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u64 remote_addr, u32 rkey)
+{
+ struct ib_device *dev = qp->pd->device;
+ struct ib_rdma_wr *rdma_wr = &ctx->single.wr;
+
+ ctx->nr_wrs = 1;
+
+ ctx->single.sge.lkey = qp->pd->local_dma_lkey;
+ ctx->single.sge.addr = ib_sg_dma_address(dev, ctx->sg);
+ ctx->single.sge.length = ib_sg_dma_len(dev, ctx->sg);
+
+ memset(rdma_wr, 0, sizeof(*rdma_wr));
+ rdma_wr->wr.opcode = ctx->dma_dir == DMA_TO_DEVICE ?
+ IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ rdma_wr->wr.sg_list = &ctx->single.sge;
+ rdma_wr->wr.num_sge = 1;
+ rdma_wr->remote_addr = remote_addr;
+ rdma_wr->rkey = rkey;
+
+ return 1;
+}
+
+static int rdma_rw_build_sg_list(struct rdma_rw_ctx *ctx, struct ib_pd *pd,
+ struct ib_sge *sge, u32 data_left, u32 offset)
+{
+ struct scatterlist *sg;
+ u32 sg_nents = min(ctx->dma_nents, rdma_max_sge(ctx, pd->device));
+ u32 page_off = offset % PAGE_SIZE;
+ int i;
+
+ for_each_sg(ctx->sg, sg, sg_nents, i) {
+ sge->addr = ib_sg_dma_address(pd->device, sg) + page_off;
+ sge->length = min_t(u32, data_left,
+ ib_sg_dma_len(pd->device, sg) - page_off);
+ sge->lkey = pd->local_dma_lkey;
+
+ page_off = 0;
+ data_left -= sge->length;
+ if (!data_left)
+ break;
+ sge++;
+ }
+
+ return i + 1;
+}
+
+static int rdma_rw_init_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u64 remote_addr, u32 rkey, u32 length, u32 offset)
+{
+ u32 max_sge = rdma_max_sge(ctx, qp->pd->device);
+ u32 rdma_write_max = max_sge * PAGE_SIZE;
+ struct ib_sge *sge;
+ u32 va_offset = 0, i;
+
+ ctx->map.sges = sge =
+ kcalloc(ctx->dma_nents, sizeof(*ctx->map.sges), GFP_KERNEL);
+ if (!ctx->map.sges)
+ goto out;
+
+ ctx->nr_wrs = DIV_ROUND_UP(ctx->dma_nents, max_sge);
+ ctx->map.wrs = kcalloc(ctx->nr_wrs, sizeof(*ctx->map.wrs), GFP_KERNEL);
+ if (!ctx->map.wrs)
+ goto out_free_sges;
+
+ for (i = 0; i < ctx->nr_wrs; i++) {
+ struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i];
+ u32 data_len = min(length - va_offset, rdma_write_max);
+
+ rdma_wr->wr.opcode = ctx->dma_dir == DMA_TO_DEVICE ?
+ IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ rdma_wr->wr.sg_list = sge;
+ rdma_wr->wr.num_sge = rdma_rw_build_sg_list(ctx, qp->pd, sge,
+ data_len, offset + va_offset);
+ rdma_wr->remote_addr = remote_addr + va_offset;
+ rdma_wr->rkey = rkey;
+
+ if (i + 1 != ctx->nr_wrs)
+ rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr;
+
+ sge += rdma_wr->wr.num_sge;
+ va_offset += data_len;
+ }
+
+ return ctx->nr_wrs;
+
+out_free_sges:
+ kfree(ctx->map.sges);
+out:
+ return -ENOMEM;
+}
+
+static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
+ u64 remote_addr, u32 rkey)
+{
+ int pages_per_mr = qp->pd->device->attrs.max_fast_reg_page_list_len;
+ int pages_left = ctx->dma_nents;
+ struct scatterlist *sg = ctx->sg;
+ bool use_read_w_invalidate = ctx->dma_dir == DMA_FROM_DEVICE &&
+ rdma_protocol_iwarp(qp->device, ctx->port_num);
+ u32 va_offset = 0;
+ int i, ret = 0, count = 0;
+
+ ctx->nr_wrs = (ctx->dma_nents + pages_per_mr - 1) / pages_per_mr;
+ ctx->reg = kcalloc(ctx->nr_wrs, sizeof(*ctx->reg), GFP_KERNEL);
+ if (!ctx->reg) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < ctx->nr_wrs; i++) {
+ struct rdma_rw_reg_ctx *reg = &ctx->reg[i];
+ int nents = min(pages_left, pages_per_mr);
+
+ reg->mr = ib_mr_pool_get(qp);
+ if (!reg->mr) {
+ pr_info("failed to allocate MR from pool\n");
+ ret = -EAGAIN;
+ goto out_free;
+ }
+
+ ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey));
+
+ /* XXX: what about a non page sized offset into the SG? */
+ ret = ib_map_mr_sg(reg->mr, sg, nents, PAGE_SIZE);
+ if (ret < nents) {
+ pr_info("failed to map MR\n");
+ ib_mr_pool_put(qp, reg->mr);
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ reg->reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg->reg_wr.mr = reg->mr;
+ reg->reg_wr.key = reg->mr->lkey;
+
+ reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE;
+ if (use_read_w_invalidate)
+ reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE;
+
+ reg->sge.lkey = reg->mr->lkey;
+ reg->sge.addr = reg->mr->iova;
+ reg->sge.length = reg->mr->length;
+
+ reg->wr.wr.sg_list = &reg->sge;
+ reg->wr.wr.num_sge = 1;
+ reg->wr.remote_addr = remote_addr + va_offset;
+ reg->wr.rkey = rkey;
+
+ if (use_read_w_invalidate) {
+ reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey;
+
+ count += 2; /* REG_MR + READ_W_INV */
+ } else {
+ if (ctx->dma_dir == DMA_TO_DEVICE)
+ reg->wr.wr.opcode = IB_WR_RDMA_WRITE;
+ else
+ reg->wr.wr.opcode = IB_WR_RDMA_READ;
+
+ reg->inv_wr.opcode = IB_WR_LOCAL_INV;
+ reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey;
+
+ /*
+ * FIXME: IB_SEND_FENCE can stall SQ processing.
+ * The LINV WR should be posted after the RDMA
+ * WR completes instead.
+ */
+ if (i == 0)
+ reg->inv_wr.send_flags |= IB_SEND_FENCE;
+
+ count += 3; /* REG_MR + READ + LOCAL_INV */
+ }
+
+ if (i + 1 == ctx->nr_wrs) {
+ struct rdma_rw_reg_ctx *first = &ctx->reg[0];
+
+ reg->reg_wr.wr.next = &first->wr.wr;
+ if (!use_read_w_invalidate)
+ reg->wr.wr.next = &first->inv_wr;
+ } else {
+ struct rdma_rw_reg_ctx *next = &ctx->reg[i + 1];
+
+ reg->reg_wr.wr.next = &next->reg_wr.wr;
+ reg->wr.wr.next = &next->wr.wr;
+ if (!use_read_w_invalidate)
+ reg->inv_wr.next = &next->inv_wr;
+ }
+
+ va_offset += reg->sge.length;
+ pages_left -= nents;
+ sg += nents; // XXX: use accessors for chained SGLs
+ }
+
+ return count;
+
+out_free:
+ while (--i >= 0)
+ ib_mr_pool_put(qp, ctx->reg[i].mr);
+ kfree(ctx->reg);
+out:
+ return ret;
+}
+
+/**
+ * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
+ * @ctx: context to initialize
+ * @qp: queue pair to operate on
+ * @port_num: port num to which the connection is bound
+ * @sg: scatterlist to READ/WRITE from/to
+ * @nents: number of entries in @sg
+ * @total_len: total length of @sg in bytes
+ * @remote_addr:remote address to read/write (relative to @rkey)
+ * @rkey: remote key to operate on
+ * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ
+ * @offset: current byte offset into @sg
+ *
+ * If we're going to use a FR to map this context @max_nents should be smaller
+ * or equal to the MR size.
+ *
+ * Returns the number of WRs that will be needed on the workqueue if successful,
+ * or a negative error code.
+ */
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct scatterlist *sg, u32 nents, u32 total_len,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir,
+ u32 offset)
+{
+ struct ib_device *dev = qp->pd->device;
+ u32 first_sg_index = offset / PAGE_SIZE;
+ int ret = -ENOMEM;
+
+ ctx->sg = sg + first_sg_index;
+ ctx->dma_dir = dir;
+
+ ctx->orig_nents = nents - first_sg_index;
+ ctx->dma_nents =
+ ib_dma_map_sg(dev, ctx->sg, ctx->orig_nents, ctx->dma_dir);
+ if (!ctx->dma_nents)
+ goto out;
+
+ ctx->port_num = port_num;
+ if (rdma_protocol_iwarp(qp->device, ctx->port_num))
+ ret = rdma_rw_init_mr_wrs(ctx, qp, remote_addr, rkey);
+ else if (ctx->dma_nents == 1)
+ ret = rdma_rw_init_single_wr(ctx, qp, remote_addr, rkey);
+ else
+ ret = rdma_rw_init_wrs(ctx, qp, remote_addr, rkey,
+ total_len - offset, offset);
+
+ if (ret < 0)
+ goto out_unmap_sg;
+
+ return ret;
+
+out_unmap_sg:
+ ib_dma_unmap_sg(dev, ctx->sg, ctx->orig_nents, ctx->dma_dir);
+out:
+ return ret;
+}
+EXPORT_SYMBOL(rdma_rw_ctx_init);
+
+/**
+ * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init
+ * @ctx: context to release
+ * @qp: queue pair to operate on
+ */
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp)
+{
+ if (rdma_protocol_iwarp(qp->device, ctx->port_num)) {
+ int i;
+
+ for (i = 0; i < ctx->nr_wrs; i++)
+ ib_mr_pool_put(qp, ctx->reg[i].mr);
+ kfree(ctx->reg);
+ } else if (ctx->nr_wrs > 1) {
+ kfree(ctx->map.wrs);
+ kfree(ctx->map.sges);
+ }
+
+ ib_dma_unmap_sg(qp->pd->device, ctx->sg, ctx->orig_nents, ctx->dma_dir);
+}
+EXPORT_SYMBOL(rdma_rw_ctx_destroy);
+
+/**
+ * rdma_rw_post - post a RDMA READ or RDMA WRITE operation
+ * @ctx: context to operate on
+ * @qp: queue pair to operate on
+ * @cqe: completion queue entry for the last WR
+ * @chain_wr: WR to append to the posted chain
+ *
+ * Post the set of RDMA READ/WRITE operations described by @ctx, as well as
+ * any memory registration operations needed. If @chain_wr is non-NULL the
+ * WR it points to will be appended to the chain of WRs posted. If @chain_wr
+ * is not set @cqe must be set so that the caller gets a completion
+ * notification.
+ */
+int rdma_rw_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct ib_cqe *cqe,
+ struct ib_send_wr *chain_wr)
+{
+ struct ib_send_wr *first_wr, *last_wr, *bad_wr;
+
+ if (rdma_protocol_iwarp(qp->device, ctx->port_num)) {
+ first_wr = &ctx->reg[0].reg_wr.wr;
+ if (ctx->dma_dir == DMA_FROM_DEVICE &&
+ rdma_protocol_iwarp(qp->device, ctx->port_num))
+ last_wr = &ctx->reg[ctx->nr_wrs - 1].wr.wr;
+ else
+ last_wr = &ctx->reg[ctx->nr_wrs - 1].inv_wr;
+ } else if (ctx->dma_nents == 1) {
+ first_wr = &ctx->single.wr.wr;
+ last_wr = &ctx->single.wr.wr;
+ } else {
+ first_wr = &ctx->map.wrs[0].wr;
+ last_wr = &ctx->map.wrs[ctx->nr_wrs - 1].wr;
+ }
+
+ if (chain_wr) {
+ last_wr->next = chain_wr;
+ } else {
+ last_wr->wr_cqe = cqe;
+ last_wr->send_flags |= IB_SEND_SIGNALED;
+ }
+
+ return ib_post_send(qp, first_wr, &bad_wr);
+}
+EXPORT_SYMBOL(rdma_rw_post);
diff --git a/drivers/nvme/target/rw.h b/drivers/nvme/target/rw.h
new file mode 100644
index 0000000..27e2bfe
--- /dev/null
+++ b/drivers/nvme/target/rw.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#ifndef _RDMA_RW_H
+#define _RDMA_RW_H
+
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/mr_pool.h>
+
+struct rdma_rw_ctx {
+ /*
+ * The scatterlist passed in, and the number of entries and total
+ * length operated on. Note that these might be smaller than the
+ * values originally passed in if an offset or max_nents value was
+ * passed to rdma_rw_ctx_init.
+ *
+ * dma_nents is the value returned from dma_map_sg, which might be
+ * smaller than the original value passed in.
+ */
+ struct scatterlist *sg;
+ u32 orig_nents;
+ u32 dma_nents;
+
+ /* data direction of the transfer */
+ enum dma_data_direction dma_dir;
+
+ /* number of RDMA READ/WRITE WRs (not counting MR WRs) */
+ int nr_wrs;
+
+ /*
+ * The device port number passed in for the connection. Needed to call
+ * rdma_protocol_iwarp() for enabling iwarp-specific features.
+ */
+ u8 port_num;
+
+ union {
+ /* for mapping a single SGE or registering a single WR: */
+ struct {
+ struct ib_sge sge;
+ struct ib_rdma_wr wr;
+ } single;
+
+ /* for mapping of multiple SGEs: */
+ struct {
+ struct ib_sge *sges;
+ struct ib_rdma_wr *wrs;
+ } map;
+
+ /* for registering multiple WRs: */
+ struct rdma_rw_reg_ctx {
+ struct ib_sge sge;
+ struct ib_rdma_wr wr;
+ struct ib_reg_wr reg_wr;
+ struct ib_send_wr inv_wr;
+ struct ib_mr *mr;
+ } *reg;
+ };
+};
+
+int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
+ struct scatterlist *sg, u32 nents, u32 length,
+ u64 remote_addr, u32 rkey, enum dma_data_direction dir,
+ u32 offset);
+void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp);
+
+int rdma_rw_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct ib_cqe *cqe,
+ struct ib_send_wr *chain_wr);
+
+#endif /* _RDMA_RW_H */
--
2.7.0
* [PATCH 6/6] nvmet_rdma: use generic RDMA READ/WRITE path
From: Steve Wise @ 2016-02-04 21:56 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
From: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
Signed-off-by: Christoph Hellwig <hch-jcswGhMUV9g@public.gmane.org>
Signed-off-by: Steve Wise <swise-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
drivers/nvme/target/rdma.c | 169 ++++++++++++++-------------------------------
1 file changed, 51 insertions(+), 118 deletions(-)
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 19137c3..8e870f9 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -28,6 +28,7 @@
#include <linux/nvme-rdma.h>
#include "nvmet.h"
+#include "rw.h"
/*
* Number of scatter/gather entries we support per WR. We only ever use two
@@ -47,8 +48,8 @@ struct nvmet_rdma_cmd {
struct nvmet_rdma_queue *queue;
- struct ib_rdma_wr *rdma_wr;
struct ib_cqe read_cqe;
+ struct rdma_rw_ctx rw;
struct scatterlist inline_sg;
void *inline_data;
@@ -98,6 +99,7 @@ struct nvmet_rdma_device {
size_t srq_size;
struct kref ref;
struct list_head entry;
+ bool need_rdma_read_mr;
};
static u16 nvmet_rdma_cm_port = 1023; // XXX
@@ -318,78 +320,6 @@ static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
kfree(cmds);
}
-static void nvmet_rdma_free_rdma_wrs(struct nvmet_rdma_cmd *cmd)
-{
- struct ib_rdma_wr *wr, *next;
-
- for (wr = cmd->rdma_wr; wr; wr = next) {
- if (wr->wr.next &&
- (wr->wr.next->opcode == IB_WR_RDMA_READ ||
- wr->wr.next->opcode == IB_WR_RDMA_WRITE))
- next = rdma_wr(wr->wr.next);
- else
- next = NULL;
-
- kfree(wr);
- }
-}
-
-static int nvmet_rdma_setup_rdma_wrs(struct ib_pd *pd, struct scatterlist *sgl,
- int sg_cnt, int hw_sge_cnt, struct ib_rdma_wr **first,
- struct ib_rdma_wr **last, u64 remote_addr, u32 len,
- u32 rkey, enum dma_data_direction dir)
-{
- struct ib_device *dev = pd->device;
- struct ib_rdma_wr *wr = NULL, *prev = NULL;
- int offset = 0, nr_wrs, i, j;
-
- nr_wrs = (sg_cnt + hw_sge_cnt - 1) / hw_sge_cnt;
- for (i = 0; i < nr_wrs; i++) {
- int nr_sge = min_t(int, sg_cnt, hw_sge_cnt);
- struct ib_sge *sge;
- struct scatterlist *s;
-
- wr = kzalloc(sizeof(*wr) + nr_sge * sizeof(*sge), GFP_KERNEL);
- if (!wr)
- return -ENOMEM;
-
- wr->wr.opcode = dir == DMA_FROM_DEVICE ?
- IB_WR_RDMA_READ : IB_WR_RDMA_WRITE;
- wr->wr.num_sge = nr_sge;
- wr->wr.sg_list = sge = (struct ib_sge *)(wr + 1);
-
- wr->remote_addr = remote_addr + offset;
- wr->rkey = rkey;
-
- for_each_sg(sgl, s, nr_sge, j) {
- sge->addr = ib_sg_dma_address(dev, s);
- sge->length = min_t(u32, ib_sg_dma_len(dev, s), len);
- sge->lkey = pd->local_dma_lkey;
-
- offset += sge->length;
- len -= sge->length;
- if (!len) {
- WARN_ON_ONCE(j + 1 != nr_sge);
- break;
- }
- sge++;
- }
-
- sg_cnt -= nr_sge;
- sgl = s;
-
- if (prev)
- prev->wr.next = &wr->wr;
- else
- *first = wr;
- prev = wr;
- }
-
- WARN_ON_ONCE(!wr);
- *last = wr;
- return nr_wrs;
-}
-
static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
struct nvmet_rdma_cmd *cmd)
{
@@ -429,16 +359,12 @@ static void nvmet_rdma_release_cmd(struct nvmet_rdma_cmd *cmd)
struct nvmet_rdma_queue *queue = cmd->queue;
atomic_add(1 + cmd->n_rdma, &queue->sq_wr_avail);
- if (cmd->n_rdma)
- nvmet_rdma_free_rdma_wrs(cmd);
- if (cmd->req.sg_cnt) {
- ib_dma_unmap_sg(queue->dev->device, cmd->req.sg,
- cmd->req.sg_cnt, nvmet_data_dir(&cmd->req));
+ if (cmd->n_rdma)
+ rdma_rw_ctx_destroy(&cmd->rw, queue->cm_id->qp);
- if (cmd->req.sg != &cmd->inline_sg)
- nvmet_rdma_free_sgl(cmd->req.sg, cmd->req.sg_cnt);
- }
+ if (cmd->req.sg != &cmd->inline_sg)
+ nvmet_rdma_free_sgl(cmd->req.sg, cmd->req.sg_cnt);
if (unlikely(!list_empty_careful(&queue->cmd_wr_wait_list)))
nvmet_rdma_process_wr_wait_list(queue);
@@ -457,7 +383,9 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
struct nvmet_rdma_cmd *cmd =
container_of(req, struct nvmet_rdma_cmd, req);
- struct ib_send_wr *first_wr, *bad_wr;
+ struct ib_qp *qp = cmd->queue->cm_id->qp;
+ struct ib_send_wr *bad_wr;
+ int ret;
if (cmd->req.flags & NVMET_REQ_INVALIDATE_RKEY) {
cmd->send_wr.opcode = IB_WR_SEND_WITH_INV;
@@ -467,12 +395,12 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
}
if (nvmet_rdma_need_data_out(req))
- first_wr = &cmd->rdma_wr->wr;
+ ret = rdma_rw_post(&cmd->rw, qp, NULL, &cmd->send_wr);
else
- first_wr = &cmd->send_wr;
+ ret = ib_post_send(qp, &cmd->send_wr, &bad_wr);
- if (ib_post_send(cmd->queue->cm_id->qp, first_wr, &bad_wr)) {
- pr_err("sending cmd response failed\n");
+ if (ret) {
+ pr_err("sending cmd response failed: %d\n", ret);
nvmet_rdma_release_cmd(cmd);
}
}
@@ -485,7 +413,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
WARN_ON(cmd->n_rdma <= 0);
atomic_add(cmd->n_rdma, &queue->sq_wr_avail);
- nvmet_rdma_free_rdma_wrs(cmd);
+ rdma_rw_ctx_destroy(&cmd->rw, queue->cm_id->qp);
cmd->n_rdma = 0;
if (unlikely(wc->status != IB_WC_SUCCESS &&
@@ -541,11 +469,8 @@ static u16 nvmet_rdma_map_inline_data(struct nvmet_rdma_cmd *cmd)
static u16 nvmet_rdma_map_sgl_data(struct nvmet_rdma_cmd *cmd,
struct nvme_rsgl_desc *rsgl)
{
- struct nvmet_rdma_device *ndev = cmd->queue->dev;
- enum dma_data_direction dir = nvmet_data_dir(&cmd->req);
- struct ib_rdma_wr *last_wr;
struct scatterlist *sg;
- int sg_cnt, count, hw_sge_cnt, ret;
+ int sg_cnt, ret;
u32 len;
u16 status;
@@ -569,17 +494,10 @@ static u16 nvmet_rdma_map_sgl_data(struct nvmet_rdma_cmd *cmd,
if (status)
return status;
- count = ib_dma_map_sg(ndev->device, sg, sg_cnt, dir);
- if (unlikely(!count))
- return NVME_SC_INTERNAL;
-
- hw_sge_cnt = dir == DMA_FROM_DEVICE ?
- ndev->device->attrs.max_sge_rd :
- ndev->device->attrs.max_sge;
-
- ret = nvmet_rdma_setup_rdma_wrs(ndev->pd, sg, sg_cnt, hw_sge_cnt,
- &cmd->rdma_wr, &last_wr, le64_to_cpu(rsgl->addr), len,
- get_unaligned_le32(rsgl->key), dir);
+ ret = rdma_rw_ctx_init(&cmd->rw, cmd->queue->cm_id->qp,
+ cmd->queue->cm_id->port_num, sg, sg_cnt, len,
+ le64_to_cpu(rsgl->addr), get_unaligned_le32(rsgl->key),
+ nvmet_data_dir(&cmd->req), 0);
if (ret < 0)
return NVME_SC_INTERNAL;
@@ -592,14 +510,6 @@ static u16 nvmet_rdma_map_sgl_data(struct nvmet_rdma_cmd *cmd,
*/
cmd->req.sg = sg;
cmd->req.sg_cnt = sg_cnt;
-
- if (nvme_is_write(cmd->req.cmd)) {
- last_wr->wr.wr_cqe = &cmd->read_cqe;
- last_wr->wr.send_flags = IB_SEND_SIGNALED;
- } else {
- last_wr->wr.next = &cmd->send_wr;
- }
-
return 0;
}
@@ -679,10 +589,8 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_cmd *cmd)
}
if (nvmet_rdma_need_data_in(&cmd->req)) {
- struct ib_send_wr *bad_wr;
-
- if (ib_post_send(cmd->queue->cm_id->qp, &cmd->rdma_wr->wr,
- &bad_wr))
+ if (rdma_rw_post(&cmd->rw, cmd->queue->cm_id->qp,
+ &cmd->read_cqe, NULL))
nvmet_req_complete(&cmd->req, NVME_SC_DATA_XFER_ERROR);
} else {
cmd->req.execute(&cmd->req);
@@ -698,7 +606,6 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
cmd->queue = queue;
cmd->n_rdma = 0;
- cmd->rdma_wr = NULL;
status = nvmet_req_init(&cmd->req, &queue->nvme_cq, &queue->nvme_sq,
nvmet_rdma_queue_response);
@@ -829,6 +736,9 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
ndev->device = cm_id->device;
kref_init(&ndev->ref);
+ if (rdma_protocol_iwarp(ndev->device, cm_id->port_num))
+ ndev->need_rdma_read_mr = true;
+
ndev->pd = ib_alloc_pd(ndev->device);
if (IS_ERR(ndev->pd))
goto out_free_dev;
@@ -858,8 +768,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
struct ib_qp_init_attr *qp_attr;
struct nvmet_rdma_device *ndev = queue->dev;
- int nr_cqe = queue->send_queue_size + queue->recv_queue_size;
- int comp_vector, ret, i;
+ int comp_vector, send_wrs, nr_cqe, ret, i;
/*
* The admin queue is barely used once the controller is live, so don't
@@ -871,6 +780,12 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
comp_vector =
queue->idx % ndev->device->num_comp_vectors;
+ send_wrs = queue->send_queue_size;
+ if (ndev->need_rdma_read_mr)
+ send_wrs *= 3; /* + REG_WR, INV_WR */
+
+ nr_cqe = send_wrs + queue->recv_queue_size;
+
ret = -ENOMEM;
qp_attr = kzalloc(sizeof(*qp_attr), GFP_KERNEL);
if (!qp_attr)
@@ -893,7 +808,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr->qp_type = IB_QPT_RC;
/* +1 for drain */
- qp_attr->cap.max_send_wr = 1 + queue->send_queue_size;
+ qp_attr->cap.max_send_wr = 1 + send_wrs;
qp_attr->cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
ndev->device->attrs.max_sge);
@@ -911,6 +826,20 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
goto err_destroy_cq;
}
+ if (ndev->need_rdma_read_mr) {
+ /*
+ * Allocate one MR per SQE as a start. For devices with very
+ * small MR sizes we will need a multiplier here.
+ */
+ ret = ib_mr_pool_init(queue->cm_id->qp, queue->send_queue_size,
+ IB_MR_TYPE_MEM_REG,
+ ndev->device->attrs.max_fast_reg_page_list_len);
+ if (ret) {
+ pr_err("failed to init MR pool ret= %d\n", ret);
+ goto err_destroy_qp;
+ }
+ }
+
atomic_set(&queue->sq_wr_avail, qp_attr->cap.max_send_wr);
pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
@@ -928,6 +857,8 @@ out:
kfree(qp_attr);
return ret;
+err_destroy_qp:
+ rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
ib_free_cq(queue->cq);
goto out;
@@ -935,6 +866,8 @@ err_destroy_cq:
static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
+ if (queue->dev->need_rdma_read_mr)
+ ib_mr_pool_destroy(queue->cm_id->qp);
rdma_destroy_qp(queue->cm_id);
ib_free_cq(queue->cq);
}
--
2.7.0
* Re: [PATCH 0/6] iWARP patches for the 4.5-rc2 rebase
From: Steve Wise @ 2016-02-05 21:54 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
Ignore these. I sent them to the wrong list :(
* [PATCH 0/6] iWARP patches for the 4.5-rc2 rebase
From: Steve Wise @ 2016-02-05 22:01 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
Here are the remaining iwarp-wip patches rebased on nvmf-4.5. I've pushed
this to branch nvmf-4.5-iwarp.
Sagi, it would be great if you could add these when you rebase.
Thanks!
Steve
---
Christoph Hellwig (2):
IB: add a simple MR pool
nvmet_rdma: use generic RDMA READ/WRITE path
Steve Wise (4):
IB: New common API for draining a queue pair
iw_cxgb4: add drain_qp function
nvme-rdma: use ib_drain_qp() function
IB: generic RDMA READ/WRITE API
drivers/infiniband/core/Makefile | 2 +-
drivers/infiniband/core/mr_pool.c | 85 ++++++++
drivers/infiniband/core/verbs.c | 76 +++++++
drivers/infiniband/hw/cxgb4/cq.c | 6 +-
drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 +
drivers/infiniband/hw/cxgb4/provider.c | 1 +
drivers/infiniband/hw/cxgb4/qp.c | 8 +
drivers/nvme/host/rdma.c | 35 +---
drivers/nvme/target/Makefile | 2 +-
drivers/nvme/target/rdma.c | 205 +++++--------------
drivers/nvme/target/rw.c | 348 +++++++++++++++++++++++++++++++++
drivers/nvme/target/rw.h | 81 ++++++++
include/rdma/ib_verbs.h | 12 +-
include/rdma/mr_pool.h | 20 ++
14 files changed, 696 insertions(+), 187 deletions(-)
create mode 100644 drivers/infiniband/core/mr_pool.c
create mode 100644 drivers/nvme/target/rw.c
create mode 100644 drivers/nvme/target/rw.h
create mode 100644 include/rdma/mr_pool.h
--
2.7.0