From: Leon Romanovsky <leon@kernel.org>
To: Jason Gunthorpe <jgg@nvidia.com>
Cc: Vlad Dumitrescu <vdumitrescu@nvidia.com>,
linux-rdma@vger.kernel.org, Sean Hefty <shefty@nvidia.com>
Subject: [PATCH rdma-next 7/9] IB/mad: Exponential backoff when retrying sends
Date: Thu, 5 Dec 2024 15:49:37 +0200 [thread overview]
Message-ID: <af348c70c47485235d7d6811b56ccf23e105bdad.1733405453.git.leon@kernel.org> (raw)
In-Reply-To: <cover.1733405453.git.leon@kernel.org>
From: Vlad Dumitrescu <vdumitrescu@nvidia.com>
When a receiver is overloaded, MAD requests time out and get retried in
a linear fashion. This could worsen congestion and reduce goodput. To
help reduce the load over time, use exponential backoff after a preset
number of retries. Cap delays between retries at 60s, even when in
exponential mode.
An MRA message from the recipient could request an even higher timeout, so
continue to respect that for the next retry. However, reset the backoff
algorithm to the beginning when an MRA is received.
Exclude RMPP and OPA from exponential backoff.
Signed-off-by: Vlad Dumitrescu <vdumitrescu@nvidia.com>
Reviewed-by: Sean Hefty <shefty@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/core/mad.c | 53 ++++++++++++++++++++++++++++--
drivers/infiniband/core/mad_priv.h | 3 ++
2 files changed, 53 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 5c255ee3db38..a3a8cf4bbc20 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -54,7 +54,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/ib_mad.h>
-#define IB_MAD_MAX_DEADLINE (jiffies + msecs_to_jiffies(5 * 60 * 1000))
+#define IB_MAD_LINEAR_TIMEOUTS_DEFAULT 4
+#define IB_MAD_MAX_TIMEOUT_MS (60 * MSEC_PER_SEC)
+#define IB_MAD_MAX_DEADLINE (jiffies + msecs_to_jiffies(5 * 60 * 1000))
#ifdef CONFIG_TRACEPOINTS
static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
@@ -1210,10 +1212,12 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
}
mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+ mad_send_wr->var_timeout_ms = send_buf->timeout_ms;
/* Timeout will be updated after send completes */
mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
mad_send_wr->max_retries = send_buf->retries;
mad_send_wr->retries_left = send_buf->retries;
+ mad_send_wr->backoff_retries = 0;
send_buf->retries = 0;
mad_send_wr->status = IB_WC_SUCCESS;
@@ -2662,18 +2666,34 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms)
return -EINVAL;
}
- if (!timeout_ms)
+ if (!timeout_ms) {
mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ goto apply;
+ }
+
+ /* CM MRA requesting a lower timeout than ours. Could be a delayed MRA
+ * (variable backoff increased in the meantime) or remote using a const.
+ */
+ if (timeout_ms < mad_send_wr->var_timeout_ms)
+ goto ignore;
+
+ /* Assume remote will no longer be overloaded after MRA Service Timeout
+ * passes and restart variable backoff algorithm.
+ */
+ mad_send_wr->var_timeout_ms = mad_send_wr->send_buf.timeout_ms;
+ mad_send_wr->backoff_retries = 0;
if (mad_send_wr->deadline)
mad_send_wr->deadline += msecs_to_jiffies(timeout_ms);
+apply:
if (mad_send_wr->state == IB_MAD_STATE_SEND_START ||
(mad_send_wr->state == IB_MAD_STATE_QUEUED && timeout_ms))
mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
else
ib_reset_mad_timeout(mad_send_wr, timeout_ms);
+ignore:
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
return 0;
}
@@ -2767,6 +2787,30 @@ static void local_completions(struct work_struct *work)
spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
}
+/*
+ * Applies a variable backoff to certain send MADs.
+ *
+ * Exists to scope down the initial variable backoff implementation.
+ */
+static void set_next_timeout(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ const struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+ const struct ib_mad_port_private *port = agent->qp_info->port_priv;
+ const struct ib_mad_hdr *hdr = mad_send_wr->send_buf.mad;
+
+ if (ib_mad_kernel_rmpp_agent(&agent->agent))
+ return;
+
+ if (hdr->base_version != IB_MGMT_BASE_VERSION)
+ return;
+
+ if (++mad_send_wr->backoff_retries < READ_ONCE(port->linear_timeouts))
+ return;
+
+ mad_send_wr->var_timeout_ms =
+ min(mad_send_wr->var_timeout_ms << 1, IB_MAD_MAX_TIMEOUT_MS);
+}
+
static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
{
int ret;
@@ -2778,7 +2822,8 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
mad_send_wr->retries_left--;
mad_send_wr->send_buf.retries++;
- mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ set_next_timeout(mad_send_wr);
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->var_timeout_ms);
if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
ret = ib_retry_rmpp(mad_send_wr);
@@ -3195,6 +3240,8 @@ static int ib_mad_port_open(struct ib_device *device,
goto error8;
}
+ port_priv->linear_timeouts = IB_MAD_LINEAR_TIMEOUTS_DEFAULT;
+
spin_lock_irqsave(&ib_mad_port_list_lock, flags);
list_add_tail(&port_priv->port_list, &ib_mad_port_list);
spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 24580ad2d428..076ebcea27b4 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -139,10 +139,12 @@ struct ib_mad_send_wr_private {
struct ib_ud_wr send_wr;
struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
__be64 tid;
+ unsigned int var_timeout_ms;
unsigned long timeout;
unsigned long deadline;
int max_retries;
int retries_left;
+ int backoff_retries;
int retry;
enum ib_wc_status status;
@@ -222,6 +224,7 @@ struct ib_mad_port_private {
struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
struct workqueue_struct *wq;
struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+ u8 linear_timeouts;
};
int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
--
2.47.0
next prev parent reply other threads:[~2024-12-05 13:50 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-12-05 13:49 [PATCH rdma-next 0/9] Rework retry algorithm used when sending MADs Leon Romanovsky
2024-12-05 13:49 ` [PATCH rdma-next 1/9] IB/mad: Apply timeout modification (CM MRA) only once Leon Romanovsky
2024-12-05 13:49 ` [PATCH rdma-next 2/9] IB/mad: Add deadline for send MADs Leon Romanovsky
2024-12-05 13:49 ` [PATCH rdma-next 3/9] RDMA/sa_query: Enforce min retry interval and deadline Leon Romanovsky
2024-12-05 13:49 ` [PATCH rdma-next 4/9] RDMA/nldev: Add sa-min-timeout management attribute Leon Romanovsky
2024-12-05 13:49 ` [PATCH rdma-next 5/9] IB/umad: Set deadline when sending non-RMPP MADs Leon Romanovsky
2024-12-05 13:49 ` [PATCH rdma-next 6/9] IB/cm: Set deadline when sending MADs Leon Romanovsky
2024-12-05 13:49 ` Leon Romanovsky [this message]
2024-12-05 13:49 ` [PATCH rdma-next 8/9] RDMA/nldev: Add mad-linear-timeouts management attribute Leon Romanovsky
2024-12-05 13:49 ` [PATCH rdma-next 9/9] IB/cma: Lower response timeout to roughly 1s Leon Romanovsky
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=af348c70c47485235d7d6811b56ccf23e105bdad.1733405453.git.leon@kernel.org \
--to=leon@kernel.org \
--cc=jgg@nvidia.com \
--cc=linux-rdma@vger.kernel.org \
--cc=shefty@nvidia.com \
--cc=vdumitrescu@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox