From: Jason Gunthorpe <jgg@nvidia.com>
To: Leon Romanovsky <leon@kernel.org>
Cc: Doug Ledford <dledford@redhat.com>,
Mark Zhang <markzhang@nvidia.com>,
linux-rdma@vger.kernel.org
Subject: Re: [PATCH rdma-next v2 7/9] IB/cm: Clear all associated AV's ports when remove a cm device
Date: Thu, 22 Apr 2021 16:34:17 -0300 [thread overview]
Message-ID: <20210422193417.GA2435405@nvidia.com> (raw)
In-Reply-To: <00c97755c41b06af84f621a1b3e0e8adfe0771cc.1619004798.git.leonro@nvidia.com>
On Wed, Apr 21, 2021 at 02:40:37PM +0300, Leon Romanovsky wrote:
> @@ -4396,6 +4439,14 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
> cm_dev->going_down = 1;
> spin_unlock_irq(&cm.lock);
>
> + list_for_each_entry_safe(cm_id_priv, tmp,
> + &cm_dev->cm_id_priv_list, cm_dev_list) {
> + if (!list_empty(&cm_id_priv->cm_dev_list))
> + list_del(&cm_id_priv->cm_dev_list);
> + cm_id_priv->av.port = NULL;
> + cm_id_priv->alt_av.port = NULL;
> + }
Ugh, this is in the wrong order, it has to be after the work queue
flush..
Hurm, I didn't see an easy way to fix it up, but I did think of a much
better design!
Generally speaking all we need is the memory of the cm_dev and port to
remain active, we don't need to block or fence with cm_remove_one(),
so just stick a memory kref on this thing and keep the memory. The
only things that needs to seralize with cm_remove_one() are on the
workqueue or take a spinlock (eg because they touch mad_agent)
Try this, I didn't finish every detail, applies on top of your series,
but you'll need to reflow it into new commits:
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3feff999a5e003..c26367006a4485 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -205,8 +205,11 @@ struct cm_port {
};
struct cm_device {
+ struct kref kref;
struct list_head list;
+ struct mutex unregistration_lock;
struct ib_device *ib_device;
+ unsigned int num_ports;
u8 ack_delay;
int going_down;
struct list_head cm_id_priv_list;
@@ -262,7 +265,6 @@ struct cm_id_private {
/* todo: use alternate port on send failure */
struct cm_av av;
struct cm_av alt_av;
- rwlock_t av_rwlock; /* Do not acquire inside cm.lock */
void *private_data;
__be64 tid;
@@ -287,10 +289,23 @@ struct cm_id_private {
atomic_t work_count;
struct rdma_ucm_ece ece;
-
- struct list_head cm_dev_list;
};
+static void cm_dev_release(struct kref *kref)
+{
+ struct cm_device *cm_dev = container_of(kref, struct cm_device, kref);
+ unsigned int i;
+
+ for (i = 0; i != cm_dev->num_ports; i++)
+ kfree(cm_dev->port[i]);
+ kfree(cm_dev);
+}
+
+static void cm_device_put(struct cm_device *cm_dev)
+{
+ kref_put(&cm_dev->kref, cm_dev_release);
+}
+
static void cm_work_handler(struct work_struct *work);
static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
@@ -306,12 +321,12 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
struct ib_ah *ah;
int ret;
- read_lock(&cm_id_priv->av_rwlock);
if (!cm_id_priv->av.port) {
ret = -EINVAL;
goto out;
}
+ spin_lock(&cm_id_priv->av.port.cm_dev->unregistration_lock);
mad_agent = cm_id_priv->av.port->mad_agent;
if (!mad_agent) {
ret = -EINVAL;
@@ -330,7 +345,6 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
GFP_ATOMIC,
IB_MGMT_BASE_VERSION);
- read_unlock(&cm_id_priv->av_rwlock);
if (IS_ERR(m)) {
rdma_destroy_ah(ah, 0);
ret = PTR_ERR(m);
@@ -346,7 +360,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
return m;
out:
- read_unlock(&cm_id_priv->av_rwlock);
+ spin_unlock(&cm_id_priv->av.port.cm_dev->unregistration_lock);
return ERR_PTR(ret);
}
@@ -465,20 +479,18 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv,
cm_id_priv->private_data_len = private_data_len;
}
-static void add_cm_id_to_cm_dev_list(struct cm_id_private *cm_id_priv,
- struct cm_device *cm_dev)
+static void cm_set_av_port(struct cm_av *av, struct cm_port *port)
{
- unsigned long flags;
+ struct cm_port *old_port = av->port;
- spin_lock_irqsave(&cm.lock, flags);
- if (cm_dev->going_down)
- goto out;
+ if (old_port == port)
+ return;
- if (!list_empty(&cm_id_priv->cm_dev_list))
- list_del(&cm_id_priv->cm_dev_list);
- list_add_tail(&cm_id_priv->cm_dev_list, &cm_dev->cm_id_priv_list);
-out:
- spin_unlock_irqrestore(&cm.lock, flags);
+ av->port = port;
+ if (old_port)
+ cm_device_put(old_port->cm_dev);
+ if (port)
+ kref_get(&old_port->cm_dev->kref);
}
static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
@@ -505,11 +517,8 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc,
if (ret)
return ret;
- write_lock(&cm_id_priv->av_rwlock);
- av->port = port;
+ cm_set_av_port(av, port);
av->pkey_index = wc->pkey_index;
- add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
- write_unlock(&cm_id_priv->av_rwlock);
rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
return 0;
@@ -521,10 +530,7 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
{
struct cm_av *av = &cm_id_priv->av;
- write_lock(&cm_id_priv->av_rwlock);
- av->port = port;
- add_cm_id_to_cm_dev_list(cm_id_priv, port->cm_dev);
- write_unlock(&cm_id_priv->av_rwlock);
+ cm_set_av_port(av, port);
av->pkey_index = wc->pkey_index;
return ib_init_ah_attr_from_wc(port->cm_dev->ib_device,
port->port_num, wc,
@@ -588,12 +594,9 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
return -EINVAL;
cm_dev = port->cm_dev;
- read_lock(&cm_id_priv->av_rwlock);
if (!is_priv_av &&
(!cm_id_priv->av.port || cm_dev != cm_id_priv->av.port->cm_dev))
ret = -EINVAL;
-
- read_unlock(&cm_id_priv->av_rwlock);
if (ret)
return ret;
@@ -618,13 +621,8 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
if (ret)
return ret;
- write_lock(&cm_id_priv->av_rwlock);
- av->port = port;
+ cm_set_av_port(av, port);
av->timeout = path->packet_life_time + 1;
- if (is_priv_av)
- add_cm_id_to_cm_dev_list(cm_id_priv, cm_dev);
-
- write_unlock(&cm_id_priv->av_rwlock);
rdma_move_ah_attr(&av->ah_attr, &new_ah_attr);
return 0;
@@ -905,10 +903,8 @@ static struct cm_id_private *cm_alloc_id_priv(struct ib_device *device,
spin_lock_init(&cm_id_priv->lock);
init_completion(&cm_id_priv->comp);
INIT_LIST_HEAD(&cm_id_priv->work_list);
- INIT_LIST_HEAD(&cm_id_priv->cm_dev_list);
atomic_set(&cm_id_priv->work_count, -1);
refcount_set(&cm_id_priv->refcount, 1);
- rwlock_init(&cm_id_priv->av_rwlock);
ret = xa_alloc_cyclic(&cm.local_id_table, &id, NULL, xa_limit_32b,
&cm.local_id_next, GFP_KERNEL);
@@ -1027,10 +1023,8 @@ static u8 cm_ack_timeout_req(struct cm_id_private *cm_id_priv,
{
u8 ack_delay = 0;
- read_lock(&cm_id_priv->av_rwlock);
- if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev)
+ if (cm_id_priv->av.port)
ack_delay = cm_id_priv->av.port->cm_dev->ack_delay;
- read_unlock(&cm_id_priv->av_rwlock);
return cm_ack_timeout(ack_delay, packet_life_time);
}
@@ -1228,8 +1222,6 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
cm_id_priv->timewait_info = NULL;
}
- if (!list_empty(&cm_id_priv->cm_dev_list))
- list_del(&cm_id_priv->cm_dev_list);
WARN_ON(cm_id_priv->listen_sharecount);
WARN_ON(!RB_EMPTY_NODE(&cm_id_priv->service_node));
if (!RB_EMPTY_NODE(&cm_id_priv->sidr_id_node))
@@ -1246,6 +1238,8 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
rdma_destroy_ah_attr(&cm_id_priv->av.ah_attr);
rdma_destroy_ah_attr(&cm_id_priv->alt_av.ah_attr);
kfree(cm_id_priv->private_data);
+ cm_set_av_port(&cm_id_priv->av, NULL);
+ cm_set_av_port(&cm_id_priv->alt_av, NULL);
kfree_rcu(cm_id_priv, rcu);
}
@@ -1378,10 +1372,13 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
{
u64 hi_tid = 0, low_tid;
- read_lock(&cm_id_priv->av_rwlock);
- if (cm_id_priv->av.port && cm_id_priv->av.port->mad_agent)
- hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
- read_unlock(&cm_id_priv->av_rwlock);
+ if (cm_id_priv->av.port) {
+ spin_lock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
+ if (cm_id_priv->av.port->mad_agent)
+ hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid)
+ << 32;
+ spin_unlock(&cm_id_priv->av.port->cm_dev->unregistration_lock);
+ }
low_tid = (u64)cm_id_priv->id.local_id;
return cpu_to_be64(hi_tid | low_tid);
@@ -1879,12 +1876,10 @@ static void cm_format_req_event(struct cm_work *work,
param = &work->cm_event.param.req_rcvd;
param->listen_id = listen_id;
param->bth_pkey = cm_get_bth_pkey(work);
- read_lock(&cm_id_priv->av_rwlock);
if (cm_id_priv->av.port)
param->port = cm_id_priv->av.port->port_num;
else
param->port = 0;
- read_unlock(&cm_id_priv->av_rwlock);
param->primary_path = &work->path[0];
cm_opa_to_ib_sgid(work, param->primary_path);
if (cm_req_has_alt_path(req_msg)) {
@@ -2311,13 +2306,11 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg,
IBA_SET(CM_REP_STARTING_PSN, rep_msg, param->starting_psn);
IBA_SET(CM_REP_RESPONDER_RESOURCES, rep_msg,
param->responder_resources);
- read_lock(&cm_id_priv->av_rwlock);
if (cm_id_priv->av.port && cm_id_priv->av.port->cm_dev)
IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg,
cm_id_priv->av.port->cm_dev->ack_delay);
else
IBA_SET(CM_REP_TARGET_ACK_DELAY, rep_msg, 0);
- read_unlock(&cm_id_priv->av_rwlock);
IBA_SET(CM_REP_FAILOVER_ACCEPTED, rep_msg, param->failover_accepted);
IBA_SET(CM_REP_RNR_RETRY_COUNT, rep_msg, param->rnr_retry_count);
IBA_SET(CM_REP_LOCAL_CA_GUID, rep_msg,
@@ -4187,10 +4180,8 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_ATOMIC;
qp_attr->pkey_index = cm_id_priv->av.pkey_index;
- read_lock(&cm_id_priv->av_rwlock);
qp_attr->port_num = cm_id_priv->av.port ?
cm_id_priv->av.port->port_num : 0;
- read_unlock(&cm_id_priv->av_rwlock);
ret = 0;
break;
default:
@@ -4234,10 +4225,8 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
}
if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) {
*qp_attr_mask |= IB_QP_ALT_PATH;
- read_lock(&cm_id_priv->av_rwlock);
qp_attr->alt_port_num = cm_id_priv->alt_av.port ?
cm_id_priv->alt_av.port->port_num : 0;
- read_unlock(&cm_id_priv->av_rwlock);
qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
@@ -4296,10 +4285,8 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
}
} else {
*qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
- read_lock(&cm_id_priv->av_rwlock);
qp_attr->alt_port_num = cm_id_priv->alt_av.port ?
cm_id_priv->alt_av.port->port_num : 0;
- read_unlock(&cm_id_priv->av_rwlock);
qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
@@ -4417,9 +4404,11 @@ static int cm_add_one(struct ib_device *ib_device)
if (!cm_dev)
return -ENOMEM;
+ kref_init(&cm_dev->kref);
cm_dev->ib_device = ib_device;
cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
cm_dev->going_down = 0;
+ cm_dev->num_ports = ib_device->phys_port_cnt;
INIT_LIST_HEAD(&cm_dev->cm_id_priv_list);
set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
@@ -4489,10 +4478,9 @@ static int cm_add_one(struct ib_device *ib_device)
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
ib_unregister_mad_agent(port->mad_agent);
cm_remove_port_fs(port);
- kfree(port);
}
free:
- kfree(cm_dev);
+ cm_device_put(cm_dev);
return ret;
}
@@ -4515,21 +4503,15 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
cm_dev->going_down = 1;
spin_unlock_irq(&cm.lock);
- list_for_each_entry_safe(cm_id_priv, tmp,
- &cm_dev->cm_id_priv_list, cm_dev_list) {
- write_lock(&cm_id_priv->av_rwlock);
- if (!list_empty(&cm_id_priv->cm_dev_list))
- list_del(&cm_id_priv->cm_dev_list);
- cm_id_priv->av.port = NULL;
- cm_id_priv->alt_av.port = NULL;
- write_unlock(&cm_id_priv->av_rwlock);
- }
-
rdma_for_each_port (ib_device, i) {
+ struct ib_mad_agent *mad_agent;
+
if (!rdma_cap_ib_cm(ib_device, i))
continue;
port = cm_dev->port[i-1];
+ mad_agent = port->mad_agent;
+
ib_modify_port(ib_device, port->port_num, 0, &port_modify);
/*
* We flush the queue here after the going_down set, this
@@ -4537,12 +4519,20 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
* after that we can call the unregister_mad_agent
*/
flush_workqueue(cm.wq);
- ib_unregister_mad_agent(port->mad_agent);
+ /*
+ * The above ensures no call paths from the work are running,
+ * the remaining paths all take the unregistration lock
+ */
+ spin_lock(&cm_dev->unregistration_lock);
+ port->mad_agent = NULL;
+ spin_unlock(&cm_dev->unregistration_lock);
+ ib_unregister_mad_agent(mad_agent);
cm_remove_port_fs(port);
- kfree(port);
}
- kfree(cm_dev);
+ /* All touches can only be on call path from the work */
+ cm_dev->ib_device = NULL;
+ cm_device_put(cm_dev);
}
static int __init ib_cm_init(void)
next prev parent reply other threads:[~2021-04-22 19:34 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-04-21 11:40 [PATCH rdma-next v2 0/9] Fix memory corruption in CM Leon Romanovsky
2021-04-21 11:40 ` [PATCH rdma-next v2 1/9] IB/cm: Pair cm_alloc_response_msg() with a cm_free_response_msg() Leon Romanovsky
2021-04-21 11:40 ` [PATCH rdma-next v2 2/9] IB/cm: Split cm_alloc_msg() Leon Romanovsky
2021-04-21 11:40 ` [PATCH rdma-next v2 3/9] IB/cm: Call the correct message free functions in cm_send_handler() Leon Romanovsky
2021-04-21 11:40 ` [PATCH rdma-next v2 4/9] IB/cm: Tidy remaining cm_msg free paths Leon Romanovsky
2021-04-21 11:40 ` [PATCH rdma-next v2 5/9] Revert "IB/cm: Mark stale CM id's whenever the mad agent was unregistered" Leon Romanovsky
2021-04-21 11:40 ` [PATCH rdma-next v2 6/9] IB/cm: Simplify ib_cancel_mad() and ib_modify_mad() calls Leon Romanovsky
2021-04-21 11:40 ` [PATCH rdma-next v2 7/9] IB/cm: Clear all associated AV's ports when remove a cm device Leon Romanovsky
2021-04-22 19:34 ` Jason Gunthorpe [this message]
2021-04-23 13:14 ` Mark Zhang
2021-04-23 14:24 ` Jason Gunthorpe
2021-04-24 2:33 ` Mark Zhang
2021-04-26 13:56 ` Jason Gunthorpe
2021-04-27 1:59 ` Mark Zhang
2021-04-21 11:40 ` [PATCH rdma-next v2 8/9] IB/cm: Add lock protection when access av/alt_av's port of a cm_id Leon Romanovsky
2021-04-22 19:08 ` Jason Gunthorpe
2021-04-25 13:21 ` Leon Romanovsky
2021-04-21 11:40 ` [PATCH rdma-next v2 9/9] IB/cm: Initialize av before aquire the spin lock in cm_lap_handler Leon Romanovsky
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210422193417.GA2435405@nvidia.com \
--to=jgg@nvidia.com \
--cc=dledford@redhat.com \
--cc=leon@kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=markzhang@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox