From: Leon Romanovsky <leon@kernel.org>
To: Doug Ledford <dledford@redhat.com>, Jason Gunthorpe <jgg@mellanox.com>
Cc: Leon Romanovsky <leonro@mellanox.com>,
RDMA mailing list <linux-rdma@vger.kernel.org>,
Max Gurtovoy <maxg@mellanox.com>,
Saeed Mahameed <saeedm@mellanox.com>,
Sagi Grimberg <sagi@grimberg.me>,
Yamin Friedman <yaminf@mellanox.com>
Subject: [PATCH rdma-next v4 2/3] RDMA/core: Provide RDMA DIM support for ULPs
Date: Thu, 4 Jul 2019 15:57:42 +0300 [thread overview]
Message-ID: <20190704125743.7814-3-leon@kernel.org> (raw)
In-Reply-To: <20190704125743.7814-1-leon@kernel.org>
From: Yamin Friedman <yaminf@mellanox.com>
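Add the interface in the InfiniBand core that applies the rdma_dim
adaptive moderation. CQs allocated through the core CQ allocation path now
use rdma_dim when the device sets use_cq_dim, provides a modify_cq
operation, and the CQ is not polled in IB_POLL_DIRECT context.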
Performance improvement (ConnectX-5 100GbE, x86) running FIO benchmark over
NVMf between two equal end-hosts with 56 cores across a Mellanox switch
using null_blk device:
IO READS without DIM:
blk size | BW | IOPS | 99th percentile latency | 99.99th latency
512B | 3.8GiB/s | 7.7M | 1401 usec | 2442 usec
4k | 7.0GiB/s | 1.8M | 4817 usec | 6587 usec
64k | 10.7GiB/s| 175k | 9896 usec | 10028 usec
IO WRITES without DIM:
blk size | BW | IOPS | 99th percentile latency | 99.99th latency
512B | 3.6GiB/s | 7.5M | 1434 usec | 2474 usec
4k | 6.3GiB/s | 1.6M | 938 usec | 1221 usec
64k | 10.7GiB/s| 175k | 8979 usec | 12780 usec
IO READS with DIM:
blk size | BW | IOPS | 99th percentile latency | 99.99th latency
512B | 4GiB/s | 8.2M | 816 usec | 889 usec
4k | 10.1GiB/s| 2.65M| 3359 usec | 5080 usec
64k | 10.7GiB/s| 175k | 9896 usec | 10028 usec
IO WRITES with DIM:
blk size | BW | IOPS | 99th percentile latency | 99.99th latency
512B | 3.9GiB/s | 8.1M | 799 usec | 922 usec
4k | 9.6GiB/s | 2.5M | 717 usec | 1004 usec
64k | 10.7GiB/s| 176k | 8586 usec | 12256 usec
The rdma_dim algorithm was designed to measure the effectiveness of
moderation on the flow in a general way and thus should be appropriate
for all RDMA storage protocols.
rdma_dim is configured to be the default option based on performance
improvement seen after extensive tests.
Signed-off-by: Yamin Friedman <yaminf@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/cq.c | 45 +++++++++++++++++++++++++++++++
drivers/infiniband/hw/mlx5/main.c | 2 ++
include/rdma/ib_verbs.h | 4 +++
3 files changed, 51 insertions(+)
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 00d70f166209..ffd6e24109d5 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -18,6 +18,40 @@
#define IB_POLL_FLAGS \
(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+static void ib_cq_rdma_dim_work(struct work_struct *w) /* Work handler: applies the dim's currently selected profile to its CQ. */
+{
+ struct dim *dim = container_of(w, struct dim, work); /* the dim embedding this work item */
+ struct ib_cq *cq = dim->priv; /* priv is set to the owning CQ in rdma_dim_init() */
+ /* Look up the usec/comps pair of the profile indexed by profile_ix. */
+ u16 usec = rdma_dim_prof[dim->profile_ix].usec;
+ u16 comps = rdma_dim_prof[dim->profile_ix].comps;
+ /* Put the dim back into DIM_START_MEASURE before the new settings are applied. */
+ dim->state = DIM_START_MEASURE;
+ /* Hand the profile's comps and usec values to the driver's modify_cq op. */
+ cq->device->ops.modify_cq(cq, comps, usec);
+}
+
+static void rdma_dim_init(struct ib_cq *cq) /* Allocate a dim and attach it to cq; returns without assigning cq->dim if DIM is not applicable or allocation fails. */
+{
+ struct dim *dim;
+ /* DIM needs a modify_cq op, the device's use_cq_dim flag, and a poll context other than IB_POLL_DIRECT. */
+ if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim ||
+ cq->poll_ctx == IB_POLL_DIRECT)
+ return;
+ /* Allocation failure is not reported to the caller; no dim is attached in that case. */
+ dim = kzalloc(sizeof(struct dim), GFP_KERNEL);
+ if (!dim)
+ return;
+ /* Initial state values; the dim and the CQ are linked to each other below. */
+ dim->state = DIM_START_MEASURE;
+ dim->tune_state = DIM_GOING_RIGHT;
+ dim->profile_ix = RDMA_DIM_START_PROFILE;
+ dim->priv = cq;
+ cq->dim = dim;
+ /* The dim's work item runs ib_cq_rdma_dim_work(). */
+ INIT_WORK(&dim->work, ib_cq_rdma_dim_work);
+}
+
static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs,
int batch)
{
@@ -78,6 +112,7 @@ static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
static int ib_poll_handler(struct irq_poll *iop, int budget)
{
struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
+ struct dim *dim = cq->dim;
int completed;
completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH);
@@ -87,6 +122,9 @@ static int ib_poll_handler(struct irq_poll *iop, int budget)
irq_poll_sched(&cq->iop);
}
+ if (dim)
+ rdma_dim(dim, completed);
+
return completed;
}
@@ -105,6 +143,8 @@ static void ib_cq_poll_work(struct work_struct *work)
if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
queue_work(cq->comp_wq, &cq->work);
+ else if (cq->dim)
+ rdma_dim(cq->dim, completed);
}
static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
@@ -161,6 +201,8 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
rdma_restrack_kadd(&cq->res);
+ rdma_dim_init(cq);
+
switch (cq->poll_ctx) {
case IB_POLL_DIRECT:
cq->comp_handler = ib_cq_completion_direct;
@@ -223,6 +265,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
rdma_restrack_del(&cq->res);
cq->device->ops.destroy_cq(cq, udata);
+ if (cq->dim)
+ cancel_work_sync(&cq->dim->work);
+ kfree(cq->dim);
kfree(cq->wc);
kfree(cq);
}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 7581571bd9cd..07a05b0b9e42 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -6424,6 +6424,8 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
mutex_init(&dev->lb.mutex);
+ dev->ib_dev.use_cq_dim = true;
+
return 0;
}
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 50806bef9f20..30eb68f36109 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -61,6 +61,7 @@
#include <linux/cgroup_rdma.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
+#include <linux/dim.h>
#include <uapi/rdma/ib_user_verbs.h>
#include <rdma/rdma_counter.h>
#include <rdma/restrack.h>
@@ -1509,6 +1510,7 @@ struct ib_cq {
struct work_struct work;
};
struct workqueue_struct *comp_wq;
+ struct dim *dim;
/*
* Implementation details of the RDMA core, don't use in drivers:
*/
@@ -2576,6 +2578,8 @@ struct ib_device {
u16 is_switch:1;
/* Indicates kernel verbs support, should not be used in drivers */
u16 kverbs_provider:1;
+ /* CQ adaptive moderation (RDMA DIM) */
+ u16 use_cq_dim:1;
u8 node_type;
u8 phys_port_cnt;
struct ib_device_attr attrs;
--
2.20.1
next prev parent reply other threads:[~2019-07-04 12:57 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2019-07-04 12:57 [PATCH rdma-next v4 0/3] Use RDMA adaptive moderation library Leon Romanovsky
2019-07-04 12:57 ` [PATCH rdma-next v4 1/3] linux/dim: Implement RDMA adaptive moderation (DIM) Leon Romanovsky
2019-07-04 12:57 ` Leon Romanovsky [this message]
2019-07-08 9:31 ` [PATCH rdma-next v4 2/3] RDMA/core: Provide RDMA DIM support for ULPs Sagi Grimberg
2019-07-08 10:44 ` Leon Romanovsky
2019-07-04 12:57 ` [PATCH rdma-next v4 3/3] RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink Leon Romanovsky
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20190704125743.7814-3-leon@kernel.org \
--to=leon@kernel.org \
--cc=dledford@redhat.com \
--cc=jgg@mellanox.com \
--cc=leonro@mellanox.com \
--cc=linux-rdma@vger.kernel.org \
--cc=maxg@mellanox.com \
--cc=saeedm@mellanox.com \
--cc=sagi@grimberg.me \
--cc=yaminf@mellanox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.