From: Michael Guralnik <michaelgur@nvidia.com>
To: <jgg@nvidia.com>
Cc: <linux-rdma@vger.kernel.org>, <leonro@nvidia.com>,
<mbloch@nvidia.com>, <cmeiohas@nvidia.com>, <msanalla@nvidia.com>,
Michael Guralnik <michaelgur@nvidia.com>
Subject: [PATCH rdma-next 6/7] RDMA/nldev: Add support for RDMA monitoring
Date: Wed, 21 Aug 2024 08:10:16 +0300 [thread overview]
Message-ID: <20240821051017.7730-7-michaelgur@nvidia.com> (raw)
In-Reply-To: <20240821051017.7730-1-michaelgur@nvidia.com>
From: Chiara Meiohas <cmeiohas@nvidia.com>
Introduce a new netlink command to allow rdma event monitoring.
The rdma events supported now are IB device
registration/unregistration and net device attachment/detachment.
Example output of rdma monitor and the commands which trigger
the events:
$ rdma monitor
$ rmmod mlx5_ib
[UNREGISTER] dev 3
[UNREGISTER] dev 0
$modprobe mlx5_ib
[REGISTER] dev 4
[NETDEV_ATTACH] dev 4 port 1 netdev 4
[REGISTER] dev 5
[NETDEV_ATTACH] dev 5 port 1 netdev 5
$ devlink dev eswitch set pci/0000:08:00.0 mode switchdev
[UNREGISTER] dev 4
[REGISTER] dev 6
[NETDEV_ATTACH] dev 6 port 6 netdev 4
$ echo 4 > /sys/class/net/eth2/device/sriov_numvfs
[NETDEV_ATTACH] dev 6 port 2 netdev 7
[NETDEV_ATTACH] dev 6 port 3 netdev 8
[NETDEV_ATTACH] dev 6 port 4 netdev 9
[NETDEV_ATTACH] dev 6 port 5 netdev 10
[REGISTER] dev 7
[NETDEV_ATTACH] dev 7 port 1 netdev 11
[REGISTER] dev 8
[NETDEV_ATTACH] dev 8 port 1 netdev 12
[REGISTER] dev 9
[NETDEV_ATTACH] dev 9 port 1 netdev 13
[REGISTER] dev 10
[NETDEV_ATTACH] dev 10 port 1 netdev 14
$ echo 0 > /sys/class/net/eth2/device/sriov_numvfs
[UNREGISTER] dev 7
[UNREGISTER] dev 8
[UNREGISTER] dev 9
[UNREGISTER] dev 10
[NETDEV_DETACH] dev 6 port 2
[NETDEV_DETACH] dev 6 port 3
[NETDEV_DETACH] dev 6 port 4
[NETDEV_DETACH] dev 6 port 5
Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/infiniband/core/device.c | 38 ++++++++++
drivers/infiniband/core/netlink.c | 1 +
drivers/infiniband/core/nldev.c | 117 ++++++++++++++++++++++++++++++
include/rdma/rdma_netlink.h | 10 +++
include/uapi/rdma/rdma_netlink.h | 15 ++++
5 files changed, 181 insertions(+)
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b2fc5a13577c..2113eb7c7573 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1351,6 +1351,30 @@ static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}
+static void ib_device_notify_register(struct ib_device *device)
+{
+ struct net_device *netdev;
+ u32 port;
+ int ret;
+
+ ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
+ if (ret)
+ return;
+
+ rdma_for_each_port(device, port) {
+ netdev = ib_device_get_netdev(device, port);
+ if (!netdev)
+ continue;
+
+ ret = rdma_nl_notify_event(device, port,
+ RDMA_NETDEV_ATTACH_EVENT);
+ dev_put(netdev);
+ if (ret)
+ return;
+ }
+ return;
+}
+
/**
* ib_register_device - Register an IB device with IB core
* @device: Device to register
@@ -1449,6 +1473,8 @@ int ib_register_device(struct ib_device *device, const char *name,
dev_set_uevent_suppress(&device->dev, false);
/* Mark for userspace that device is ready */
kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+ ib_device_notify_register(device);
ib_device_put(device);
return 0;
@@ -1491,6 +1517,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
goto out;
disable_device(ib_dev);
+ rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT);
/* Expedite removing unregistered pointers from the hash table */
free_netdevs(ib_dev);
@@ -2159,6 +2186,7 @@ static void add_ndev_hash(struct ib_port_data *pdata)
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
u32 port)
{
+ enum rdma_nl_notify_event_type etype;
struct net_device *old_ndev;
struct ib_port_data *pdata;
unsigned long flags;
@@ -2190,6 +2218,16 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
spin_unlock_irqrestore(&pdata->netdev_lock, flags);
add_ndev_hash(pdata);
+
+ down_read(&devices_rwsem);
+ if (xa_get_mark(&devices, ib_dev->index, DEVICE_REGISTERED) &&
+ xa_load(&devices, ib_dev->index) == ib_dev) {
+ etype = ndev ?
+ RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT;
+ rdma_nl_notify_event(ib_dev, port, etype);
+ }
+ up_read(&devices_rwsem);
+
return 0;
}
EXPORT_SYMBOL(ib_device_set_netdev);
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index ae2db0c70788..def14c54b648 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -311,6 +311,7 @@ int rdma_nl_net_init(struct rdma_dev_net *rnet)
struct net *net = read_pnet(&rnet->net);
struct netlink_kernel_cfg cfg = {
.input = rdma_nl_rcv,
+ .flags = NL_CFG_F_NONROOT_RECV,
};
struct sock *nls;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 4d4a1f90e484..5acbde242b97 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -170,6 +170,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 },
[RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING },
[RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 },
+ [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 },
};
static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -2722,6 +2723,122 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
},
};
+static int fill_mon_netdev_association(struct sk_buff *msg,
+ struct ib_device *device, u32 port,
+ const struct net *net)
+{
+ struct net_device *netdev = ib_device_get_netdev(device, port);
+ int ret = 0;
+
+ if (netdev && !net_eq(dev_net(netdev), net))
+ goto out;
+
+ ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index);
+ if (ret)
+ goto out;
+ ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port);
+ if (ret)
+ goto out;
+ if (netdev)
+ ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX,
+ netdev->ifindex);
+
+out:
+ dev_put(netdev);
+ return ret;
+}
+
+static int fill_mon_register(struct sk_buff *msg, struct ib_device *device,
+ const struct net *net)
+{
+ return nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index);
+}
+
+static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num,
+ enum rdma_nl_notify_event_type type)
+{
+ struct net_device *netdev;
+
+ switch (type) {
+ case RDMA_REGISTER_EVENT:
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor register device event\n");
+ break;
+ case RDMA_UNREGISTER_EVENT:
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor unregister device event\n");
+ break;
+ case RDMA_NETDEV_ATTACH_EVENT:
+ netdev = ib_device_get_netdev(device, port_num);
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n",
+ port_num, netdev->ifindex);
+ dev_put(netdev);
+ break;
+ case RDMA_NETDEV_DETACH_EVENT:
+ dev_warn_ratelimited(&device->dev,
+ "Failed to send RDMA monitor netdev detach event: port %d\n",
+ port_num);
+ default:
+ break;
+ };
+}
+
+int rdma_nl_notify_event(struct ib_device *device, u32 port_num,
+ enum rdma_nl_notify_event_type type)
+{
+ struct sk_buff *skb;
+ struct net *net;
+ int ret = 0;
+ void *nlh;
+
+ net = read_pnet(&device->coredev.rdma_net);
+ if (!net)
+ return -EINVAL;
+
+ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ nlh = nlmsg_put(skb, 0, 0,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR),
+ 0, 0);
+
+ switch (type) {
+ case RDMA_REGISTER_EVENT:
+ case RDMA_UNREGISTER_EVENT:
+ ret = fill_mon_register(skb, device, net);
+ if (ret)
+ goto err_free;
+ break;
+ case RDMA_NETDEV_ATTACH_EVENT:
+ case RDMA_NETDEV_DETACH_EVENT:
+ ret = fill_mon_netdev_association(skb, device,
+ port_num, net);
+ if (ret)
+ goto err_free;
+ break;
+ default:
+ goto err_free;
+ }
+
+ ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type);
+ if (ret)
+ goto err_free;
+
+ nlmsg_end(skb, nlh);
+ ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL);
+ if (ret && ret != -ESRCH) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ goto err_free;
+ }
+ return 0;
+
+err_free:
+ rdma_nl_notify_err_msg(device, port_num, type);
+ nlmsg_free(skb);
+ return ret;
+}
+
void __init nldev_init(void)
{
rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table);
diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h
index c2a79aeee113..a5b47ca4cd66 100644
--- a/include/rdma/rdma_netlink.h
+++ b/include/rdma/rdma_netlink.h
@@ -110,6 +110,16 @@ int rdma_nl_multicast(struct net *net, struct sk_buff *skb,
*/
bool rdma_nl_chk_listeners(unsigned int group);
+/**
+ * Prepare and send an event message
+ * @ib: the IB device which triggered the event
+ * @port_num: the port number which triggered the event - 0 if unused
+ * @type: the event type
+ * Returns 0 on success or a negative error code
+ */
+int rdma_nl_notify_event(struct ib_device *ib, u32 port_num,
+ enum rdma_nl_notify_event_type type);
+
struct rdma_link_ops {
struct list_head list;
const char *type;
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 2f37568f5556..5f9636d26050 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -15,6 +15,7 @@ enum {
enum {
RDMA_NL_GROUP_IWPM = 2,
RDMA_NL_GROUP_LS,
+ RDMA_NL_GROUP_NOTIFY,
RDMA_NL_NUM_GROUPS
};
@@ -305,6 +306,8 @@ enum rdma_nldev_command {
RDMA_NLDEV_CMD_DELDEV,
+ RDMA_NLDEV_CMD_MONITOR,
+
RDMA_NLDEV_NUM_OPS
};
@@ -574,6 +577,8 @@ enum rdma_nldev_attr {
RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE, /* u8 */
+ RDMA_NLDEV_ATTR_EVENT_TYPE, /* u8 */
+
/*
* Always the end
*/
@@ -624,4 +629,14 @@ enum rdma_nl_name_assign_type {
RDMA_NAME_ASSIGN_TYPE_USER = 1, /* Provided by user-space */
};
+/*
+ * Supported rdma monitoring event types.
+ */
+enum rdma_nl_notify_event_type {
+ RDMA_REGISTER_EVENT,
+ RDMA_UNREGISTER_EVENT,
+ RDMA_NETDEV_ATTACH_EVENT,
+ RDMA_NETDEV_DETACH_EVENT,
+};
+
#endif /* _UAPI_RDMA_NETLINK_H */
--
2.17.2
next prev parent reply other threads:[~2024-08-21 5:10 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-08-21 5:10 [PATCH rdma-next 0/7] Support RDMA events monitoring through Michael Guralnik
2024-08-21 5:10 ` [PATCH rdma-next 1/7] RDMA/mlx5: Check RoCE LAG status before getting netdev Michael Guralnik
2024-08-21 5:10 ` [PATCH rdma-next 2/7] RDMA/mlx5: Obtain upper net device only when needed Michael Guralnik
2024-08-21 5:10 ` [PATCH rdma-next 3/7] RDMA/mlx5: Initialize phys_port_cnt earlier in RDMA device creation Michael Guralnik
2024-08-21 5:10 ` [PATCH rdma-next 4/7] RDMA/device: Clear netdev mapping in ib_device_get_netdev on unregistration Michael Guralnik
2024-08-21 5:10 ` [PATCH rdma-next 5/7] RDMA/mlx5: Use IB set_netdev and get_netdev functions Michael Guralnik
2024-08-21 5:10 ` Michael Guralnik [this message]
2024-08-21 19:18 ` [PATCH rdma-next 6/7] RDMA/nldev: Add support for RDMA monitoring kernel test robot
2024-08-21 20:30 ` kernel test robot
2024-08-21 5:10 ` [PATCH rdma-next 7/7] RDMA/nldev: Expose whether RDMA monitoring is supported Michael Guralnik
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240821051017.7730-7-michaelgur@nvidia.com \
--to=michaelgur@nvidia.com \
--cc=cmeiohas@nvidia.com \
--cc=jgg@nvidia.com \
--cc=leonro@nvidia.com \
--cc=linux-rdma@vger.kernel.org \
--cc=mbloch@nvidia.com \
--cc=msanalla@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.