From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Jack Wang <jinpu.wang@ionos.com>,
Aleksei Marov <aleksei.marov@ionos.com>,
Md Haris Iqbal <haris.iqbal@ionos.com>,
Jason Gunthorpe <jgg@nvidia.com>, Sasha Levin <sashal@kernel.org>,
linux-rdma@vger.kernel.org
Subject: [PATCH AUTOSEL 5.17 026/149] RDMA/rtrs-clt: Do stop and failover outside reconnect work.
Date: Fri, 1 Apr 2022 10:23:33 -0400 [thread overview]
Message-ID: <20220401142536.1948161-26-sashal@kernel.org> (raw)
In-Reply-To: <20220401142536.1948161-1-sashal@kernel.org>
From: Jack Wang <jinpu.wang@ionos.com>
[ Upstream commit c1289d5d8502d62e5bc50ff066c9d6daabfc3264 ]
We can't do instant reconnect, not to DDoS server, but we should stop and
failover earlier, so there is less service interruption.
To avoid deadlock, as error_recovery is called from different callback
like rdma event or hb error handler, add a new err recovery_work.
Link: https://lore.kernel.org/r/20220114154753.983568-6-haris.iqbal@ionos.com
Signed-off-by: Jack Wang <jinpu.wang@ionos.com>
Reviewed-by: Aleksei Marov <aleksei.marov@ionos.com>
Reviewed-by: Md Haris Iqbal <haris.iqbal@ionos.com>
Signed-off-by: Md Haris Iqbal <haris.iqbal@ionos.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
drivers/infiniband/ulp/rtrs/rtrs-clt.c | 40 ++++++++++++++------------
drivers/infiniband/ulp/rtrs/rtrs-clt.h | 1 +
2 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
index 759b85f03331..df4d06d4d183 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c
@@ -297,6 +297,7 @@ static bool rtrs_clt_change_state_from_to(struct rtrs_clt_path *clt_path,
return changed;
}
+static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path);
static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
{
struct rtrs_clt_path *clt_path = to_clt_path(con->c.path);
@@ -304,16 +305,7 @@ static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con)
if (rtrs_clt_change_state_from_to(clt_path,
RTRS_CLT_CONNECTED,
RTRS_CLT_RECONNECTING)) {
- struct rtrs_clt_sess *clt = clt_path->clt;
- unsigned int delay_ms;
-
- /*
- * Normal scenario, reconnect if we were successfully connected
- */
- delay_ms = clt->reconnect_delay_sec * 1000;
- queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
- msecs_to_jiffies(delay_ms +
- prandom_u32() % RTRS_RECONNECT_SEED));
+ queue_work(rtrs_wq, &clt_path->err_recovery_work);
} else {
/*
* Error can happen just on establishing new connection,
@@ -1511,6 +1503,22 @@ static void rtrs_clt_init_hb(struct rtrs_clt_path *clt_path)
static void rtrs_clt_reconnect_work(struct work_struct *work);
static void rtrs_clt_close_work(struct work_struct *work);
+static void rtrs_clt_err_recovery_work(struct work_struct *work)
+{
+ struct rtrs_clt_path *clt_path;
+ struct rtrs_clt_sess *clt;
+ int delay_ms;
+
+ clt_path = container_of(work, struct rtrs_clt_path, err_recovery_work);
+ clt = clt_path->clt;
+ delay_ms = clt->reconnect_delay_sec * 1000;
+ rtrs_clt_stop_and_destroy_conns(clt_path);
+ queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
+ msecs_to_jiffies(delay_ms +
+ prandom_u32() %
+ RTRS_RECONNECT_SEED));
+}
+
static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
const struct rtrs_addr *path,
size_t con_num, u32 nr_poll_queues)
@@ -1562,6 +1570,7 @@ static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt,
clt_path->state = RTRS_CLT_CONNECTING;
atomic_set(&clt_path->connected_cnt, 0);
INIT_WORK(&clt_path->close_work, rtrs_clt_close_work);
+ INIT_WORK(&clt_path->err_recovery_work, rtrs_clt_err_recovery_work);
INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work);
rtrs_clt_init_hb(clt_path);
@@ -2326,6 +2335,7 @@ static void rtrs_clt_close_work(struct work_struct *work)
clt_path = container_of(work, struct rtrs_clt_path, close_work);
+ cancel_work_sync(&clt_path->err_recovery_work);
cancel_delayed_work_sync(&clt_path->reconnect_dwork);
rtrs_clt_stop_and_destroy_conns(clt_path);
rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL);
@@ -2638,7 +2648,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
{
struct rtrs_clt_path *clt_path;
struct rtrs_clt_sess *clt;
- unsigned int delay_ms;
int err;
clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path,
@@ -2655,8 +2664,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
}
clt_path->reconnect_attempts++;
- /* Stop everything */
- rtrs_clt_stop_and_destroy_conns(clt_path);
msleep(RTRS_RECONNECT_BACKOFF);
if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) {
err = init_path(clt_path);
@@ -2669,11 +2676,7 @@ static void rtrs_clt_reconnect_work(struct work_struct *work)
reconnect_again:
if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) {
clt_path->stats->reconnects.fail_cnt++;
- delay_ms = clt->reconnect_delay_sec * 1000;
- queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork,
- msecs_to_jiffies(delay_ms +
- prandom_u32() %
- RTRS_RECONNECT_SEED));
+ queue_work(rtrs_wq, &clt_path->err_recovery_work);
}
}
@@ -2908,6 +2911,7 @@ int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *clt_path)
&old_state);
if (changed) {
clt_path->reconnect_attempts = 0;
+ rtrs_clt_stop_and_destroy_conns(clt_path);
queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0);
}
if (changed || old_state == RTRS_CLT_RECONNECTING) {
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h
index d1b18a154ae0..f848c0392d98 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h
+++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h
@@ -134,6 +134,7 @@ struct rtrs_clt_path {
struct rtrs_clt_io_req *reqs;
struct delayed_work reconnect_dwork;
struct work_struct close_work;
+ struct work_struct err_recovery_work;
unsigned int reconnect_attempts;
bool established;
struct rtrs_rbuf *rbufs;
--
2.34.1
next prev parent reply other threads:[~2022-04-01 14:28 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20220401142536.1948161-1-sashal@kernel.org>
2022-04-01 14:23 ` [PATCH AUTOSEL 5.17 020/149] net/mlx5e: TC, Hold sample_attr on stack instead of pointer Sasha Levin
2022-04-01 14:23 ` Sasha Levin [this message]
2022-04-01 14:24 ` [PATCH AUTOSEL 5.17 074/149] net/mlx5e: Disable TX queues before registering the netdev Sasha Levin
2022-04-01 14:25 ` [PATCH AUTOSEL 5.17 132/149] net/mlx5e: Remove overzealous validations in netlink EEPROM query Sasha Levin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220401142536.1948161-26-sashal@kernel.org \
--to=sashal@kernel.org \
--cc=aleksei.marov@ionos.com \
--cc=haris.iqbal@ionos.com \
--cc=jgg@nvidia.com \
--cc=jinpu.wang@ionos.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).