* Re: [for-next V2 06/10] linux/dim: Move implementation to .c files
From: Geert Uytterhoeven @ 2019-07-02 16:15 UTC (permalink / raw)
To: Saeed Mahameed, Tal Gilboa
Cc: David S. Miller, Doug Ledford, Jason Gunthorpe, Leon Romanovsky,
Or Gerlitz, Sagi Grimberg, netdev@vger.kernel.org,
linux-rdma@vger.kernel.org, linux-kernel
In-Reply-To: <20190625205701.17849-7-saeedm@mellanox.com>
Hi Saeed, Tal,
On Tue, 25 Jun 2019, Saeed Mahameed wrote:
> From: Tal Gilboa <talgi@mellanox.com>
>
> Moved all logic from dim.h and net_dim.h to dim.c and net_dim.c.
> This is both more structurally appealing and would allow to only
> expose externally used functions.
>
> Signed-off-by: Tal Gilboa <talgi@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
This is now commit 4f75da3666c0c572 ("linux/dim: Move implementation to
.c files") in net-next.
> --- a/drivers/net/ethernet/broadcom/Kconfig
> +++ b/drivers/net/ethernet/broadcom/Kconfig
> @@ -8,6 +8,7 @@ config NET_VENDOR_BROADCOM
> default y
> depends on (SSB_POSSIBLE && HAS_DMA) || PCI || BCM63XX || \
> SIBYTE_SB1xxx_SOC
> + select DIMLIB
Merely enabling a NET_VENDOR_* symbol should not enable inclusion of
any additional code, cfr. the help text for the NET_VENDOR_BROADCOM
option.
Hence please move the select to the config symbol(s) for the driver(s)
that need it.
> --- a/lib/Kconfig
> +++ b/lib/Kconfig
> @@ -562,6 +562,14 @@ config SIGNATURE
> Digital signature verification. Currently only RSA is supported.
> Implementation is done using GnuPG MPI library
>
> +config DIMLIB
> + bool "DIM library"
> + default y
Please drop this line, as optional library code should never be included
by default.
Thanks!
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
^ permalink raw reply
* [PATCH rdma-next v5 17/17] RDMA/nldev: Allow get default counter statistics through RDMA netlink
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
This patch adds the ability to return the hwstats of per-port default
counters (which can also be queried through sysfs nodes).
Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/nldev.c | 98 ++++++++++++++++++++++++++++++++-
drivers/infiniband/core/sysfs.c | 6 ++
include/rdma/ib_verbs.h | 1 +
3 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 4993f47b0731..a4431ed566b6 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1799,6 +1799,99 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return ret;
}
+static int stat_get_doit_default_counter(struct sk_buff *skb,
+ struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack,
+ struct nlattr *tb[])
+{
+ struct rdma_hw_stats *stats;
+ struct nlattr *table_attr;
+ struct ib_device *device;
+ int ret, num_cnts, i;
+ struct sk_buff *msg;
+ u32 index, port;
+ u64 v;
+
+ if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ if (!device->ops.alloc_hw_stats || !device->ops.get_hw_stats) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_STAT_GET),
+ 0, 0);
+
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
+ stats = device->port_data ? device->port_data[port].hw_stats : NULL;
+ if (stats == NULL) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
+ mutex_lock(&stats->lock);
+
+ num_cnts = device->ops.get_hw_stats(device, stats, port, 0);
+ if (num_cnts < 0) {
+ ret = -EINVAL;
+ goto err_stats;
+ }
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+ if (!table_attr) {
+ ret = -EMSGSIZE;
+ goto err_stats;
+ }
+ for (i = 0; i < num_cnts; i++) {
+ v = stats->value[i] +
+ rdma_counter_get_hwstat_value(device, port, i);
+ if (fill_stat_hwcounter_entry(msg, stats->names[i], v)) {
+ ret = -EMSGSIZE;
+ goto err_table;
+ }
+ }
+ nla_nest_end(msg, table_attr);
+
+ mutex_unlock(&stats->lock);
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_table:
+ nla_nest_cancel(msg, table_attr);
+err_stats:
+ mutex_unlock(&stats->lock);
+err_msg:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack, struct nlattr *tb[])
@@ -1871,9 +1964,12 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
nldev_policy, extack);
- if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+ if (ret)
return -EINVAL;
+ if (!tb[RDMA_NLDEV_ATTR_STAT_RES])
+ return stat_get_doit_default_counter(skb, nlh, extack, tb);
+
switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
case RDMA_NLDEV_ATTR_RES_QP:
ret = stat_get_doit_qp(skb, nlh, extack, tb);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index c59b80e0a740..b477295a96c2 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -1003,6 +1003,8 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port,
goto err;
port->hw_stats_ag = hsag;
port->hw_stats = stats;
+ if (device->port_data)
+ device->port_data[port_num].hw_stats = stats;
} else {
struct kobject *kobj = &device->dev.kobj;
ret = sysfs_create_group(kobj, hsag);
@@ -1293,6 +1295,8 @@ const struct attribute_group ib_dev_attr_group = {
void ib_free_port_attrs(struct ib_core_device *coredev)
{
+ struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+ bool is_full_dev = &device->coredev == coredev;
struct kobject *p, *t;
list_for_each_entry_safe(p, t, &coredev->port_list, entry) {
@@ -1302,6 +1306,8 @@ void ib_free_port_attrs(struct ib_core_device *coredev)
if (port->hw_stats_ag)
free_hsag(&port->kobj, port->hw_stats_ag);
kfree(port->hw_stats);
+ if (device->port_data && is_full_dev)
+ device->port_data[port->port_num].hw_stats = NULL;
if (port->pma_table)
sysfs_remove_group(p, port->pma_table);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0c5151a12ae4..50806bef9f20 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2124,6 +2124,7 @@ struct ib_port_data {
struct net_device __rcu *netdev;
struct hlist_node ndev_hash_link;
struct rdma_port_counter port_counter;
+ struct rdma_hw_stats *hw_stats;
};
/* rdma netdev type - specifies protocol type */
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 16/17] RDMA/nldev: Allow get counter mode through RDMA netlink
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Provide an option to get current counter mode through RDMA netlink.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/counters.c | 13 ++++++
drivers/infiniband/core/nldev.c | 66 +++++++++++++++++++++++++++++-
2 files changed, 78 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 6ac8bf1240de..e924e9f7956d 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -576,6 +576,19 @@ int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
return ret;
}
+int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+ enum rdma_nl_counter_mode *mode,
+ enum rdma_nl_counter_mask *mask)
+{
+ struct rdma_port_counter *port_counter;
+
+ port_counter = &dev->port_data[port].port_counter;
+ *mode = port_counter->mode.mode;
+ *mask = port_counter->mode.mask;
+
+ return 0;
+}
+
void rdma_counter_init(struct ib_device *dev)
{
struct rdma_port_counter *port_counter;
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 3d750eca53d5..4993f47b0731 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1799,6 +1799,70 @@ static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return ret;
}
+static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, struct nlattr *tb[])
+
+{
+ static enum rdma_nl_counter_mode mode;
+ static enum rdma_nl_counter_mask mask;
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index, port;
+ int ret;
+
+ if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID])
+ return nldev_res_get_counter_doit(skb, nlh, extack);
+
+ if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] ||
+ !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_STAT_GET),
+ 0, 0);
+
+ ret = rdma_counter_get_mode(device, port, &mode, &mask);
+ if (ret)
+ goto err_msg;
+
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode))
+ goto err_msg;
+
+ if ((mode == RDMA_COUNTER_MODE_AUTO) &&
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask))
+ goto err_msg;
+
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_msg:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -1812,7 +1876,7 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
case RDMA_NLDEV_ATTR_RES_QP:
- ret = nldev_res_get_counter_doit(skb, nlh, extack);
+ ret = stat_get_doit_qp(skb, nlh, extack, tb);
break;
default:
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 15/17] RDMA/nldev: Allow counter manual mode configration through RDMA netlink
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Provide an option to allow users to manually bind a qp with a counter
through RDMA netlink. Limit it to users with ADMIN capability only.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/nldev.c | 111 +++++++++++++++++++++++++++----
include/rdma/rdma_counter.h | 3 +
include/uapi/rdma/rdma_netlink.h | 2 +
3 files changed, 103 insertions(+), 13 deletions(-)
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index cebc15b23b15..3d750eca53d5 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1649,8 +1649,8 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ u32 index, port, mode, mask = 0, qpn, cntn = 0;
struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
- u32 index, port, mode, mask = 0;
struct ib_device *device;
struct sk_buff *msg;
int ret;
@@ -1688,30 +1688,111 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
0, 0);
mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
- if (mode != RDMA_COUNTER_MODE_AUTO) {
- ret = -EMSGSIZE;
- goto err_msg;
+ if (mode == RDMA_COUNTER_MODE_AUTO) {
+ if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+ mask = nla_get_u32(
+ tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+
+ ret = rdma_counter_set_auto_mode(device, port,
+ mask ? true : false, mask);
+ if (ret)
+ goto err_msg;
+ } else {
+ qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+ if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
+ cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+ ret = rdma_counter_bind_qpn(device, port, qpn, cntn);
+ } else {
+ ret = rdma_counter_bind_qpn_alloc(device, port,
+ qpn, &cntn);
+ }
+ if (ret)
+ goto err_msg;
+
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
+ ret = -EMSGSIZE;
+ goto err_fill;
+ }
}
- if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
- mask = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_fill:
+ rdma_counter_unbind_qpn(device, port, qpn, cntn);
+err_msg:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
+static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ struct ib_device *device;
+ struct sk_buff *msg;
+ u32 index, port, qpn, cntn;
+ int ret;
- ret = rdma_counter_set_auto_mode(device, port,
- mask ? true : false, mask);
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+ !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] ||
+ !tb[RDMA_NLDEV_ATTR_RES_LQPN])
+ return -EINVAL;
+
+ if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_STAT_SET),
+ 0, 0);
+
+ cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);
+ qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
+ ret = rdma_counter_unbind_qpn(device, port, qpn, cntn);
if (ret)
- goto err_msg;
+ goto err_unbind;
- if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode) ||
- nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) {
+ if (fill_nldev_handle(msg, device) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) {
ret = -EMSGSIZE;
- goto err_msg;
+ goto err_fill;
}
nlmsg_end(msg, nlh);
ib_device_put(device);
return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
-err_msg:
+err_fill:
+ rdma_counter_bind_qpn(device, port, qpn, cntn);
+err_unbind:
nlmsg_free(msg);
err:
ib_device_put(device);
@@ -1828,6 +1909,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
.doit = nldev_stat_get_doit,
.dump = nldev_stat_get_dumpit,
},
+ [RDMA_NLDEV_CMD_STAT_DEL] = {
+ .doit = nldev_stat_del_doit,
+ .flags = RDMA_NL_ADMIN_PERM,
+ },
};
void __init nldev_init(void)
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 6603e10eb352..68827700ba95 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -58,5 +58,8 @@ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
u32 qp_num, u32 *counter_id);
int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
u32 qp_num, u32 counter_id);
+int rdma_counter_get_mode(struct ib_device *dev, u8 port,
+ enum rdma_nl_counter_mode *mode,
+ enum rdma_nl_counter_mask *mask);
#endif /* _RDMA_COUNTER_H_ */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index ebd728f9e351..d770bd21e873 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -285,6 +285,8 @@ enum rdma_nldev_command {
RDMA_NLDEV_CMD_STAT_GET, /* can dump */
+ RDMA_NLDEV_CMD_STAT_DEL,
+
RDMA_NLDEV_NUM_OPS
};
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 14/17] RDMA/counter: Allow manual mode configuration support
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
In manual mode a QP is bound to a counter manually. If counter is not
specified then a new one will be allocated.
Manually mode is enabled when user binds a QP, and disabled when the
last manually bound QP is unbound.
When auto-mode is turned off and there are counters left, manual mode
is enabled so that the user is able to access these counters.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/counters.c | 219 ++++++++++++++++++++++++++++-
include/rdma/rdma_counter.h | 7 +
include/uapi/rdma/rdma_netlink.h | 6 +
3 files changed, 229 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 2de4c555eba9..6ac8bf1240de 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -27,7 +27,9 @@ static int __counter_set_mode(struct rdma_counter_mode *curr,
/**
* rdma_counter_set_auto_mode() - Turn on/off per-port auto mode
*
- * When @on is true, the @mask must be set
+ * When @on is true, the @mask must be set; When @on is false, it goes
+ * into manual mode if there's any counter, so that the user is able to
+ * manually access them.
*/
int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
bool on, enum rdma_nl_counter_mask mask)
@@ -45,8 +47,13 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
ret = -EINVAL;
goto out;
}
- ret = __counter_set_mode(&port_counter->mode,
- RDMA_COUNTER_MODE_NONE, 0);
+
+ if (port_counter->num_counters)
+ ret = __counter_set_mode(&port_counter->mode,
+ RDMA_COUNTER_MODE_MANUAL, 0);
+ else
+ ret = __counter_set_mode(&port_counter->mode,
+ RDMA_COUNTER_MODE_NONE, 0);
}
out:
@@ -57,7 +64,9 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
enum rdma_nl_counter_mode mode)
{
+ struct rdma_port_counter *port_counter;
struct rdma_counter *counter;
+ int ret;
if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
return NULL;
@@ -73,12 +82,27 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
if (!counter->stats)
goto err_stats;
+ port_counter = &dev->port_data[port].port_counter;
+ mutex_lock(&port_counter->lock);
+ if (mode == RDMA_COUNTER_MODE_MANUAL) {
+ ret = __counter_set_mode(&port_counter->mode,
+ RDMA_COUNTER_MODE_MANUAL, 0);
+ if (ret)
+ goto err_mode;
+ }
+
+ port_counter->num_counters++;
+ mutex_unlock(&port_counter->lock);
+
counter->mode.mode = mode;
kref_init(&counter->kref);
mutex_init(&counter->lock);
return counter;
+err_mode:
+ mutex_unlock(&port_counter->lock);
+ kfree(counter->stats);
err_stats:
kfree(counter);
return NULL;
@@ -86,6 +110,18 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
static void rdma_counter_free(struct rdma_counter *counter)
{
+ struct rdma_port_counter *port_counter;
+
+ port_counter = &counter->device->port_data[counter->port].port_counter;
+ mutex_lock(&port_counter->lock);
+ port_counter->num_counters--;
+ if (!port_counter->num_counters &&
+ (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL))
+ __counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE,
+ 0);
+
+ mutex_unlock(&port_counter->lock);
+
rdma_restrack_del(&counter->res);
kfree(counter->stats);
kfree(counter);
@@ -363,6 +399,183 @@ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index)
return sum;
}
+static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num)
+{
+ struct rdma_restrack_entry *res = NULL;
+ struct ib_qp *qp = NULL;
+
+ res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num);
+ if (IS_ERR(res))
+ return NULL;
+
+ if (!rdma_is_visible_in_pid_ns(res))
+ goto err;
+
+ qp = container_of(res, struct ib_qp, res);
+ if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ goto err;
+
+ return qp;
+
+err:
+ rdma_restrack_put(&qp->res);
+ return NULL;
+}
+
+static int rdma_counter_bind_qp_manual(struct rdma_counter *counter,
+ struct ib_qp *qp)
+{
+ if ((counter->device != qp->device) || (counter->port != qp->port))
+ return -EINVAL;
+
+ return __rdma_counter_bind_qp(counter, qp);
+}
+
+static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev,
+ u32 counter_id)
+{
+ struct rdma_restrack_entry *res;
+ struct rdma_counter *counter;
+
+ res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id);
+ if (IS_ERR(res))
+ return NULL;
+
+ if (!rdma_is_visible_in_pid_ns(res)) {
+ rdma_restrack_put(res);
+ return NULL;
+ }
+
+ counter = container_of(res, struct rdma_counter, res);
+ kref_get(&counter->kref);
+ rdma_restrack_put(res);
+
+ return counter;
+}
+
+/**
+ * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id
+ */
+int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+ u32 qp_num, u32 counter_id)
+{
+ struct rdma_counter *counter;
+ struct ib_qp *qp;
+ int ret;
+
+ qp = rdma_counter_get_qp(dev, qp_num);
+ if (!qp)
+ return -ENOENT;
+
+ counter = rdma_get_counter_by_id(dev, counter_id);
+ if (!counter) {
+ ret = -ENOENT;
+ goto err;
+ }
+
+ if (counter->res.task != qp->res.task) {
+ ret = -EINVAL;
+ goto err_task;
+ }
+
+ ret = rdma_counter_bind_qp_manual(counter, qp);
+ if (ret)
+ goto err_task;
+
+ rdma_restrack_put(&qp->res);
+ return 0;
+
+err_task:
+ kref_put(&counter->kref, counter_release);
+err:
+ rdma_restrack_put(&qp->res);
+ return ret;
+}
+
+/**
+ * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it
+ * The id of new counter is returned in @counter_id
+ */
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+ u32 qp_num, u32 *counter_id)
+{
+ struct rdma_counter *counter;
+ struct ib_qp *qp;
+ int ret;
+
+ if (!rdma_is_port_valid(dev, port))
+ return -EINVAL;
+
+ qp = rdma_counter_get_qp(dev, qp_num);
+ if (!qp)
+ return -ENOENT;
+
+ if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL);
+ if (!counter) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = rdma_counter_bind_qp_manual(counter, qp);
+ if (ret)
+ goto err_bind;
+
+ if (counter_id)
+ *counter_id = counter->id;
+
+ rdma_counter_res_add(counter, qp);
+
+ rdma_restrack_put(&qp->res);
+ return ret;
+
+err_bind:
+ rdma_counter_free(counter);
+err:
+ rdma_restrack_put(&qp->res);
+ return ret;
+}
+
+/**
+ * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter
+ */
+int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+ u32 qp_num, u32 counter_id)
+{
+ struct rdma_port_counter *port_counter;
+ struct ib_qp *qp;
+ int ret;
+
+ if (!rdma_is_port_valid(dev, port))
+ return -EINVAL;
+
+ qp = rdma_counter_get_qp(dev, qp_num);
+ if (!qp)
+ return -ENOENT;
+
+ if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ port_counter = &dev->port_data[port].port_counter;
+ if (!qp->counter || qp->counter->id != counter_id ||
+ port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = rdma_counter_unbind_qp(qp, false);
+
+out:
+ rdma_restrack_put(&qp->res);
+ return ret;
+}
+
void rdma_counter_init(struct ib_device *dev)
{
struct rdma_port_counter *port_counter;
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index bf2c3578768f..6603e10eb352 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -28,6 +28,7 @@ struct rdma_counter_mode {
struct rdma_port_counter {
struct rdma_counter_mode mode;
struct rdma_hw_stats *hstats;
+ unsigned int num_counters;
struct mutex lock;
};
@@ -51,5 +52,11 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
int rdma_counter_query_stats(struct rdma_counter *counter);
u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index);
+int rdma_counter_bind_qpn(struct ib_device *dev, u8 port,
+ u32 qp_num, u32 counter_id);
+int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port,
+ u32 qp_num, u32 *counter_id);
+int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port,
+ u32 qp_num, u32 counter_id);
#endif /* _RDMA_COUNTER_H_ */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 22c5bc7a82dd..ebd728f9e351 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -536,6 +536,12 @@ enum rdma_nl_counter_mode {
*/
RDMA_COUNTER_MODE_AUTO,
+ /*
+ * Which qp are bound with which counter is explicitly specified
+ * by the user
+ */
+ RDMA_COUNTER_MODE_MANUAL,
+
/*
* Always the end
*/
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 13/17] RDMA/core: Get sum value of all counters when perform a sysfs stat read
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Since a QP can only be bound to one counter, then if it is bound to a
separate counter, for backward compatibility purpose, the statistic
value must be:
* stat of default counter
+ stat of all running allocated counters
+ stat of all deallocated counters (history stats)
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/counters.c | 89 ++++++++++++++++++++++++++++++
drivers/infiniband/core/sysfs.c | 10 +++-
include/rdma/rdma_counter.h | 2 +
3 files changed, 98 insertions(+), 3 deletions(-)
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index ca9adee19159..2de4c555eba9 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -158,6 +158,20 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp)
return ret;
}
+static void counter_history_stat_update(const struct rdma_counter *counter)
+{
+ struct ib_device *dev = counter->device;
+ struct rdma_port_counter *port_counter;
+ int i;
+
+ port_counter = &dev->port_data[counter->port].port_counter;
+ if (!port_counter->hstats)
+ return;
+
+ for (i = 0; i < counter->stats->num_counters; i++)
+ port_counter->hstats->value[i] += counter->stats->value[i];
+}
+
/**
* rdma_get_counter_auto_mode - Find the counter that @qp should be bound
* with in auto mode
@@ -215,6 +229,7 @@ static void counter_release(struct kref *kref)
struct rdma_counter *counter;
counter = container_of(kref, struct rdma_counter, kref);
+ counter_history_stat_update(counter);
counter->device->ops.counter_dealloc(counter);
rdma_counter_free(counter);
}
@@ -299,6 +314,55 @@ int rdma_counter_query_stats(struct rdma_counter *counter)
return ret;
}
+static u64 get_running_counters_hwstat_sum(struct ib_device *dev,
+ u8 port, u32 index)
+{
+ struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
+ struct rdma_counter *counter;
+ unsigned long id = 0;
+ u64 sum = 0;
+
+ rt = &dev->res[RDMA_RESTRACK_COUNTER];
+ xa_lock(&rt->xa);
+ xa_for_each(&rt->xa, id, res) {
+ if (!rdma_restrack_get(res))
+ continue;
+
+ xa_unlock(&rt->xa);
+
+ counter = container_of(res, struct rdma_counter, res);
+ if ((counter->device != dev) || (counter->port != port) ||
+ rdma_counter_query_stats(counter))
+ goto next;
+
+ sum += counter->stats->value[index];
+
+next:
+ xa_lock(&rt->xa);
+ rdma_restrack_put(res);
+ }
+
+ xa_unlock(&rt->xa);
+ return sum;
+}
+
+/**
+ * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a
+ * specific port, including the running ones and history data
+ */
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index)
+{
+ struct rdma_port_counter *port_counter;
+ u64 sum;
+
+ port_counter = &dev->port_data[port].port_counter;
+ sum = get_running_counters_hwstat_sum(dev, port, index);
+ sum += port_counter->hstats->value[index];
+
+ return sum;
+}
+
void rdma_counter_init(struct ib_device *dev)
{
struct rdma_port_counter *port_counter;
@@ -311,9 +375,34 @@ void rdma_counter_init(struct ib_device *dev)
port_counter = &dev->port_data[port].port_counter;
port_counter->mode.mode = RDMA_COUNTER_MODE_NONE;
mutex_init(&port_counter->lock);
+
+ port_counter->hstats = dev->ops.alloc_hw_stats(dev, port);
+ if (!port_counter->hstats)
+ goto fail;
}
+
+ return;
+
+fail:
+ rdma_for_each_port(dev, port) {
+ port_counter = &dev->port_data[port].port_counter;
+ kfree(port_counter->hstats);
+ port_counter->hstats = NULL;
+ }
+
+ return;
}
void rdma_counter_release(struct ib_device *dev)
{
+ struct rdma_port_counter *port_counter;
+ u32 port;
+
+ if (!dev->ops.alloc_hw_stats)
+ return;
+
+ rdma_for_each_port(dev, port) {
+ port_counter = &dev->port_data[port].port_counter;
+ kfree(port_counter->hstats);
+ }
}
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index c78d0c9646ae..c59b80e0a740 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -43,6 +43,7 @@
#include <rdma/ib_mad.h>
#include <rdma/ib_pma.h>
#include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
struct ib_port;
@@ -800,9 +801,12 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats,
return 0;
}
-static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf)
+static ssize_t print_hw_stat(struct ib_device *dev, int port_num,
+ struct rdma_hw_stats *stats, int index, char *buf)
{
- return sprintf(buf, "%llu\n", stats->value[index]);
+ u64 v = rdma_counter_get_hwstat_value(dev, port_num, index);
+
+ return sprintf(buf, "%llu\n", stats->value[index] + v);
}
static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
@@ -828,7 +832,7 @@ static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr,
ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index);
if (ret)
goto unlock;
- ret = print_hw_stat(stats, hsa->index, buf);
+ ret = print_hw_stat(dev, hsa->port_num, stats, hsa->index, buf);
unlock:
mutex_unlock(&stats->lock);
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index f2a5c8efc404..bf2c3578768f 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -27,6 +27,7 @@ struct rdma_counter_mode {
struct rdma_port_counter {
struct rdma_counter_mode mode;
+ struct rdma_hw_stats *hstats;
struct mutex lock;
};
@@ -49,5 +50,6 @@ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port);
int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
int rdma_counter_query_stats(struct rdma_counter *counter);
+u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index);
#endif /* _RDMA_COUNTER_H_ */
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 12/17] IB/mlx5: Add counter_alloc_stats() and counter_update_stats() support
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Add support for ib callback counter_alloc_stats() and
counter_update_stats().
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/hw/mlx5/main.c | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index ffd6f16d3c37..d6751b2cfa1c 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5552,6 +5552,27 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
return num_counters;
}
+static struct rdma_hw_stats *
+mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
+{
+ struct mlx5_ib_dev *dev = to_mdev(counter->device);
+ struct mlx5_ib_port *port = &dev->port[counter->port - 1];
+
+ /* Q counters are in the beginning of all counters */
+ return rdma_alloc_hw_stats_struct(port->cnts.names,
+ port->cnts.num_q_counters,
+ RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
+{
+ struct mlx5_ib_dev *dev = to_mdev(counter->device);
+ struct mlx5_ib_port *port = &dev->port[counter->port - 1];
+
+ return mlx5_ib_query_q_counters(dev->mdev, port,
+ counter->stats, counter->id);
+}
+
static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
struct ib_qp *qp)
{
@@ -6519,6 +6540,8 @@ static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
.counter_bind_qp = mlx5_ib_counter_bind_qp,
.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
.counter_dealloc = mlx5_ib_counter_dealloc,
+ .counter_alloc_stats = mlx5_ib_counter_alloc_stats,
+ .counter_update_stats = mlx5_ib_counter_update_stats,
};
static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 11/17] RDMA/netlink: Implement counter dumpit calback
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
This patch adds the ability to return all available counters
together with their properties and hwstats.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/counters.c | 26 +++-
drivers/infiniband/core/device.c | 2 +
drivers/infiniband/core/nldev.c | 213 +++++++++++++++++++++++++++++
include/rdma/ib_verbs.h | 10 ++
include/rdma/rdma_counter.h | 3 +
include/uapi/rdma/rdma_netlink.h | 10 +-
6 files changed, 262 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 60639452669c..ca9adee19159 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -59,7 +59,7 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
{
struct rdma_counter *counter;
- if (!dev->ops.counter_dealloc)
+ if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats)
return NULL;
counter = kzalloc(sizeof(*counter), GFP_KERNEL);
@@ -69,16 +69,25 @@ static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
counter->device = dev;
counter->port = port;
counter->res.type = RDMA_RESTRACK_COUNTER;
+ counter->stats = dev->ops.counter_alloc_stats(counter);
+ if (!counter->stats)
+ goto err_stats;
+
counter->mode.mode = mode;
kref_init(&counter->kref);
mutex_init(&counter->lock);
return counter;
+
+err_stats:
+ kfree(counter);
+ return NULL;
}
static void rdma_counter_free(struct rdma_counter *counter)
{
rdma_restrack_del(&counter->res);
+ kfree(counter->stats);
kfree(counter);
}
@@ -275,6 +284,21 @@ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
return 0;
}
+int rdma_counter_query_stats(struct rdma_counter *counter)
+{
+ struct ib_device *dev = counter->device;
+ int ret;
+
+ if (!dev->ops.counter_update_stats)
+ return -EINVAL;
+
+ mutex_lock(&counter->lock);
+ ret = dev->ops.counter_update_stats(counter);
+ mutex_unlock(&counter->lock);
+
+ return ret;
+}
+
void rdma_counter_init(struct ib_device *dev)
{
struct rdma_port_counter *port_counter;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index f3181b74c863..bdf61499e6d5 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2471,9 +2471,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, alloc_xrcd);
SET_DEVICE_OP(dev_ops, attach_mcast);
SET_DEVICE_OP(dev_ops, check_mr_status);
+ SET_DEVICE_OP(dev_ops, counter_alloc_stats);
SET_DEVICE_OP(dev_ops, counter_bind_qp);
SET_DEVICE_OP(dev_ops, counter_dealloc);
SET_DEVICE_OP(dev_ops, counter_unbind_qp);
+ SET_DEVICE_OP(dev_ops, counter_update_stats);
SET_DEVICE_OP(dev_ops, create_ah);
SET_DEVICE_OP(dev_ops, create_counters);
SET_DEVICE_OP(dev_ops, create_cq);
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 9a4cf285f447..cebc15b23b15 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -129,6 +129,13 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING },
+ [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 },
[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
[RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 },
[RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 },
@@ -636,6 +643,152 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin,
err: return -EMSGSIZE;
}
+static int fill_stat_counter_mode(struct sk_buff *msg,
+ struct rdma_counter *counter)
+{
+ struct rdma_counter_mode *m = &counter->mode;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode))
+ return -EMSGSIZE;
+
+ if (m->mode == RDMA_COUNTER_MODE_AUTO)
+ if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) &&
+ nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn)
+{
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_stat_counter_qps(struct sk_buff *msg,
+ struct rdma_counter *counter)
+{
+ struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
+ struct nlattr *table_attr;
+ struct ib_qp *qp = NULL;
+ unsigned long id = 0;
+ int ret = 0;
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP);
+
+ rt = &counter->device->res[RDMA_RESTRACK_QP];
+ xa_lock(&rt->xa);
+ xa_for_each(&rt->xa, id, res) {
+ if (!rdma_is_visible_in_pid_ns(res))
+ continue;
+
+ qp = container_of(res, struct ib_qp, res);
+ if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ continue;
+
+ if (!qp->counter || (qp->counter->id != counter->id))
+ continue;
+
+ ret = fill_stat_counter_qp_entry(msg, qp->qp_num);
+ if (ret)
+ goto err;
+ }
+
+ xa_unlock(&rt->xa);
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err:
+ xa_unlock(&rt->xa);
+ nla_nest_cancel(msg, table_attr);
+ return ret;
+}
+
+static int fill_stat_hwcounter_entry(struct sk_buff *msg,
+ const char *name, u64 value)
+{
+ struct nlattr *entry_attr;
+
+ entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY);
+ if (!entry_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME,
+ name))
+ goto err;
+ if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE,
+ value, RDMA_NLDEV_ATTR_PAD))
+ goto err;
+
+ nla_nest_end(msg, entry_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, entry_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_stat_counter_hwcounters(struct sk_buff *msg,
+ struct rdma_counter *counter)
+{
+ struct rdma_hw_stats *st = counter->stats;
+ struct nlattr *table_attr;
+ int i;
+
+ table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+ if (!table_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i < st->num_counters; i++)
+ if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i]))
+ goto err;
+
+ nla_nest_end(msg, table_attr);
+ return 0;
+
+err:
+ nla_nest_cancel(msg, table_attr);
+ return -EMSGSIZE;
+}
+
+static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin,
+ struct rdma_restrack_entry *res,
+ uint32_t port)
+{
+ struct rdma_counter *counter =
+ container_of(res, struct rdma_counter, res);
+
+ if (port && port != counter->port)
+ return 0;
+
+ /* Dump it even query failed */
+ rdma_counter_query_stats(counter);
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) ||
+ fill_res_name_pid(msg, &counter->res) ||
+ fill_stat_counter_mode(msg, counter) ||
+ fill_stat_counter_qps(msg, counter) ||
+ fill_stat_counter_hwcounters(msg, counter))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -1003,6 +1156,13 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
.entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY,
.id = RDMA_NLDEV_ATTR_RES_PDN,
},
+ [RDMA_RESTRACK_COUNTER] = {
+ .fill_res_func = fill_res_counter_entry,
+ .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET,
+ .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER,
+ .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY,
+ .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID,
+ },
};
static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -1239,6 +1399,7 @@ RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID);
RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ);
RES_GET_FUNCS(pd, RDMA_RESTRACK_PD);
RES_GET_FUNCS(mr, RDMA_RESTRACK_MR);
+RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER);
static LIST_HEAD(link_ops);
static DECLARE_RWSEM(link_ops_rwsem);
@@ -1557,6 +1718,54 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return ret;
}
+static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+ return -EINVAL;
+
+ switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+ case RDMA_NLDEV_ATTR_RES_QP:
+ ret = nldev_res_get_counter_doit(skb, nlh, extack);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int nldev_stat_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ int ret;
+
+ ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, NULL);
+ if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES])
+ return -EINVAL;
+
+ switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) {
+ case RDMA_NLDEV_ATTR_RES_QP:
+ ret = nldev_res_get_counter_dumpit(skb, cb);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
[RDMA_NLDEV_CMD_GET] = {
.doit = nldev_get_doit,
@@ -1615,6 +1824,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
.doit = nldev_stat_set_doit,
.flags = RDMA_NL_ADMIN_PERM,
},
+ [RDMA_NLDEV_CMD_STAT_GET] = {
+ .doit = nldev_stat_get_doit,
+ .dump = nldev_stat_get_dumpit,
+ },
};
void __init nldev_init(void)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0205472eb73a..0c5151a12ae4 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2503,6 +2503,16 @@ struct ib_device_ops {
* counter_dealloc -De-allocate the hw counter
*/
int (*counter_dealloc)(struct rdma_counter *counter);
+ /**
+ * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in
+ * the driver initialized data.
+ */
+ struct rdma_hw_stats *(*counter_alloc_stats)(
+ struct rdma_counter *counter);
+ /**
+ * counter_update_stats - Query the stats value of this counter
+ */
+ int (*counter_update_stats)(struct rdma_counter *counter);
DECLARE_RDMA_OBJ_SIZE(ib_ah);
DECLARE_RDMA_OBJ_SIZE(ib_cq);
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 9f93a2403c9c..f2a5c8efc404 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -37,6 +37,7 @@ struct rdma_counter {
struct kref kref;
struct rdma_counter_mode mode;
struct mutex lock;
+ struct rdma_hw_stats *stats;
u8 port;
};
@@ -47,4 +48,6 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port);
int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
+int rdma_counter_query_stats(struct rdma_counter *counter);
+
#endif /* _RDMA_COUNTER_H_ */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 0cb47d23fd86..22c5bc7a82dd 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -283,6 +283,8 @@ enum rdma_nldev_command {
RDMA_NLDEV_CMD_STAT_SET,
+ RDMA_NLDEV_CMD_STAT_GET, /* can dump */
+
RDMA_NLDEV_NUM_OPS
};
@@ -496,7 +498,13 @@ enum rdma_nldev_attr {
RDMA_NLDEV_ATTR_STAT_MODE, /* u32 */
RDMA_NLDEV_ATTR_STAT_RES, /* u32 */
RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, /* u32 */
-
+ RDMA_NLDEV_ATTR_STAT_COUNTER, /* nested table */
+ RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, /* nested table */
+ RDMA_NLDEV_ATTR_STAT_COUNTER_ID, /* u32 */
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTERS, /* nested table */
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY, /* nested table */
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, /* string */
+ RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, /* u64 */
/*
* Information about a chardev.
* CHARDEV_TYPE is the name of the chardev ABI (ie uverbs, umad, etc)
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 10/17] RDMA/nldev: Allow counter auto mode configration through RDMA netlink
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Provide an option to enable/disable per-port counter auto mode through
RDMA netlink. Limit it to users with ADMIN capability only.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/nldev.c | 78 ++++++++++++++++++++++++++++++++
include/uapi/rdma/rdma_netlink.h | 8 ++++
2 files changed, 86 insertions(+)
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index d9ebfb50962b..9a4cf285f447 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -126,6 +126,9 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
[RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 },
[RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 },
+ [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 },
+ [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 },
[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 },
[RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 },
[RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 },
@@ -1482,6 +1485,78 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
return err;
}
+static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+ u32 index, port, mode, mask = 0;
+ struct ib_device *device;
+ struct sk_buff *msg;
+ int ret;
+
+ ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+ nldev_policy, extack);
+ /* Currently only counter for QP is supported */
+ if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] ||
+ !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||
+ !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE])
+ return -EINVAL;
+
+ if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP)
+ return -EINVAL;
+
+ index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+ device = ib_device_get_by_index(sock_net(skb->sk), index);
+ if (!device)
+ return -EINVAL;
+
+ port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]);
+ if (!rdma_is_port_valid(device, port)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq,
+ RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+ RDMA_NLDEV_CMD_STAT_SET),
+ 0, 0);
+
+ mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]);
+ if (mode != RDMA_COUNTER_MODE_AUTO) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
+ if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK])
+ mask = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]);
+
+ ret = rdma_counter_set_auto_mode(device, port,
+ mask ? true : false, mask);
+ if (ret)
+ goto err_msg;
+
+ if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode) ||
+ nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) {
+ ret = -EMSGSIZE;
+ goto err_msg;
+ }
+
+ nlmsg_end(msg, nlh);
+ ib_device_put(device);
+ return rdma_nl_unicast(msg, NETLINK_CB(skb).portid);
+
+err_msg:
+ nlmsg_free(msg);
+err:
+ ib_device_put(device);
+ return ret;
+}
+
static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
[RDMA_NLDEV_CMD_GET] = {
.doit = nldev_get_doit,
@@ -1535,6 +1610,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
},
[RDMA_NLDEV_CMD_SYS_SET] = {
.doit = nldev_set_sys_set_doit,
+ },
+ [RDMA_NLDEV_CMD_STAT_SET] = {
+ .doit = nldev_stat_set_doit,
.flags = RDMA_NL_ADMIN_PERM,
},
};
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index e3cd912e9cef..0cb47d23fd86 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -281,6 +281,8 @@ enum rdma_nldev_command {
RDMA_NLDEV_CMD_GET_CHARDEV,
+ RDMA_NLDEV_CMD_STAT_SET,
+
RDMA_NLDEV_NUM_OPS
};
@@ -488,6 +490,12 @@ enum rdma_nldev_attr {
* File descriptor handle of the net namespace object
*/
RDMA_NLDEV_NET_NS_FD, /* u32 */
+ /*
+ * Counter-specific attributes.
+ */
+ RDMA_NLDEV_ATTR_STAT_MODE, /* u32 */
+ RDMA_NLDEV_ATTR_STAT_RES, /* u32 */
+ RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, /* u32 */
/*
* Information about a chardev.
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 09/17] IB/mlx5: Support statistic q counter configuration
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Add support for ib callbacks counter_bind_qp(), counter_unbind_qp()
and counter_dealloc().
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/hw/mlx5/main.c | 44 +++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 4740cfda5b17..ffd6f16d3c37 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5552,6 +5552,47 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
return num_counters;
}
+static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
+ struct ib_qp *qp)
+{
+ struct mlx5_ib_dev *dev = to_mdev(qp->device);
+ u16 cnt_set_id = 0;
+ int err;
+
+ if (!counter->id) {
+ err = mlx5_cmd_alloc_q_counter(dev->mdev,
+ &cnt_set_id,
+ MLX5_SHARED_RESOURCE_UID);
+ if (err)
+ return err;
+ counter->id = cnt_set_id;
+ }
+
+ err = mlx5_ib_qp_set_counter(qp, counter);
+ if (err)
+ goto fail_set_counter;
+
+ return 0;
+
+fail_set_counter:
+ mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id);
+ counter->id = 0;
+
+ return err;
+}
+
+static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
+{
+ return mlx5_ib_qp_set_counter(qp, NULL);
+}
+
+static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
+{
+ struct mlx5_ib_dev *dev = to_mdev(counter->device);
+
+ return mlx5_core_dealloc_q_counter(dev->mdev, counter->id);
+}
+
static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
enum rdma_netdev_t type,
struct rdma_netdev_alloc_params *params)
@@ -6475,6 +6516,9 @@ static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
.alloc_hw_stats = mlx5_ib_alloc_hw_stats,
.get_hw_stats = mlx5_ib_get_hw_stats,
+ .counter_bind_qp = mlx5_ib_counter_bind_qp,
+ .counter_unbind_qp = mlx5_ib_counter_unbind_qp,
+ .counter_dealloc = mlx5_ib_counter_dealloc,
};
static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 08/17] IB/mlx5: Add counter set id as a parameter for mlx5_ib_query_q_counters()
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Add counter set id as a parameter so that this API can be used for
querying any q counter.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/hw/mlx5/main.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 05d2bfcb3d60..4740cfda5b17 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -5443,7 +5443,8 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
struct mlx5_ib_port *port,
- struct rdma_hw_stats *stats)
+ struct rdma_hw_stats *stats,
+ u16 set_id)
{
int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
void *out;
@@ -5454,9 +5455,7 @@ static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
if (!out)
return -ENOMEM;
- ret = mlx5_core_query_q_counter(mdev,
- port->cnts.set_id, 0,
- out, outlen);
+ ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen);
if (ret)
goto free;
@@ -5516,7 +5515,8 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
port->cnts.num_ext_ppcnt_counters;
/* q_counters are per IB device, query the master mdev */
- ret = mlx5_ib_query_q_counters(dev->mdev, port, stats);
+ ret = mlx5_ib_query_q_counters(dev->mdev, port, stats,
+ port->cnts.set_id);
if (ret)
return ret;
--
2.20.1
^ permalink raw reply related
* [PATCH mlx5-next v5 07/17] IB/mlx5: Support set qp counter
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Support bind a qp with counter. If counter is null then bind the qp to
the default counter. Different QP state has different operation:
- RESET: Set the counter field so that it will take effective
during RST2INIT change;
- RTS: Issue an RTS2RTS change to update the QP counter;
- Other: Set the counter field and mark the counter_pending flag,
when QP is moved to RTS state and this flag is set, then issue
an RTS2RTS modification to update the counter.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +++
drivers/infiniband/hw/mlx5/qp.c | 76 +++++++++++++++++++++++++++-
include/linux/mlx5/qp.h | 1 +
3 files changed, 81 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index bdb83fc85f94..c0f4327bd1a5 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -439,6 +439,10 @@ struct mlx5_ib_qp {
u32 flags_en;
/* storage for qp sub type when core qp type is IB_QPT_DRIVER */
enum ib_qp_type qp_sub_type;
+ /* A flag to indicate if there's a new counter is configured
+ * but not take effective
+ */
+ u32 counter_pending;
};
struct mlx5_ib_cq_buf {
@@ -1456,4 +1460,6 @@ void mlx5_ib_put_xlt_emergency_page(void);
int bfregn_to_uar_index(struct mlx5_ib_dev *dev,
struct mlx5_bfreg_info *bfregi, u32 bfregn,
bool dyn_bfreg);
+
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
#endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 4fbf60fed374..42375cdafd53 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -34,6 +34,7 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
#include <linux/mlx5/fs.h>
#include "mlx5_ib.h"
#include "ib_rep.h"
@@ -3380,6 +3381,35 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
return tx_port_affinity;
}
+static int __mlx5_ib_qp_set_counter(struct ib_qp *qp,
+ struct rdma_counter *counter)
+{
+ struct mlx5_ib_dev *dev = to_mdev(qp->device);
+ struct mlx5_ib_qp *mqp = to_mqp(qp);
+ struct mlx5_qp_context context = {};
+ struct mlx5_ib_port *mibport = NULL;
+ struct mlx5_ib_qp_base *base;
+ u32 set_id;
+
+ if (!MLX5_CAP_GEN(dev->mdev, rts2rts_qp_counters_set_id))
+ return 0;
+
+ if (counter) {
+ set_id = counter->id;
+ } else {
+ mibport = &dev->port[mqp->port - 1];
+ set_id = mibport->cnts.set_id;
+ }
+
+ base = &mqp->trans_qp.base;
+ context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff);
+ context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24);
+ return mlx5_core_qp_modify(dev->mdev,
+ MLX5_CMD_OP_RTS2RTS_QP,
+ MLX5_QP_OPTPAR_COUNTER_SET_ID,
+ &context, &base->mqp);
+}
+
static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
const struct ib_qp_attr *attr, int attr_mask,
enum ib_qp_state cur_state,
@@ -3433,6 +3463,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
struct mlx5_ib_port *mibport = NULL;
enum mlx5_qp_state mlx5_cur, mlx5_new;
enum mlx5_qp_optpar optpar;
+ u32 set_id = 0;
int mlx5_st;
int err;
u16 op;
@@ -3595,8 +3626,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
port_num = 0;
mibport = &dev->port[port_num];
+ if (ibqp->counter)
+ set_id = ibqp->counter->id;
+ else
+ set_id = mibport->cnts.set_id;
context->qp_counter_set_usr_page |=
- cpu_to_be32((u32)(mibport->cnts.set_id) << 24);
+ cpu_to_be32(set_id << 24);
}
if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
@@ -3624,7 +3659,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
raw_qp_param.operation = op;
if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
- raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id;
+ raw_qp_param.rq_q_ctr_id = set_id;
raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID;
}
@@ -3701,6 +3736,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
qp->db.db[MLX5_SND_DBR] = 0;
}
+ if ((new_state == IB_QPS_RTS) && qp->counter_pending) {
+ err = __mlx5_ib_qp_set_counter(ibqp, ibqp->counter);
+ if (!err)
+ qp->counter_pending = 0;
+ }
+
out:
kfree(context);
return err;
@@ -6435,3 +6476,34 @@ void mlx5_ib_drain_rq(struct ib_qp *qp)
handle_drain_completion(cq, &rdrain, dev);
}
+
+/**
+ * Bind a qp to a counter. If @counter is NULL then bind the qp to
+ * the default counter
+ */
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter)
+{
+ struct mlx5_ib_qp *mqp = to_mqp(qp);
+ int err = 0;
+
+ mutex_lock(&mqp->mutex);
+ if (mqp->state == IB_QPS_RESET) {
+ qp->counter = counter;
+ goto out;
+ }
+
+ if (mqp->state == IB_QPS_RTS) {
+ err = __mlx5_ib_qp_set_counter(qp, counter);
+ if (!err)
+ qp->counter = counter;
+
+ goto out;
+ }
+
+ mqp->counter_pending = 1;
+ qp->counter = counter;
+
+out:
+ mutex_unlock(&mqp->mutex);
+ return err;
+}
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 08e43cd9e742..a8270869f0b6 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -71,6 +71,7 @@ enum mlx5_qp_optpar {
MLX5_QP_OPTPAR_CQN_RCV = 1 << 19,
MLX5_QP_OPTPAR_DC_HS = 1 << 20,
MLX5_QP_OPTPAR_DC_KEY = 1 << 21,
+ MLX5_QP_OPTPAR_COUNTER_SET_ID = 1 << 25,
};
enum mlx5_qp_state {
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 06/17] RDMA/counter: Add "auto" configuration mode support
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
In auto mode all QPs belong to one category are bind automatically to
a single counter set. Currently only "qp type" is supported.
In this mode the qp counter is set in RST2INIT modification, and when
a qp is destroyed the counter is unbound.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/counters.c | 221 +++++++++++++++++++++++++++++
drivers/infiniband/core/device.c | 3 +
drivers/infiniband/core/verbs.c | 9 ++
include/rdma/ib_verbs.h | 18 +++
include/rdma/rdma_counter.h | 8 ++
5 files changed, 259 insertions(+)
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
index 6167914fba06..60639452669c 100644
--- a/drivers/infiniband/core/counters.c
+++ b/drivers/infiniband/core/counters.c
@@ -54,6 +54,227 @@ int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
return ret;
}
+static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port,
+ enum rdma_nl_counter_mode mode)
+{
+ struct rdma_counter *counter;
+
+ if (!dev->ops.counter_dealloc)
+ return NULL;
+
+ counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+ if (!counter)
+ return NULL;
+
+ counter->device = dev;
+ counter->port = port;
+ counter->res.type = RDMA_RESTRACK_COUNTER;
+ counter->mode.mode = mode;
+ kref_init(&counter->kref);
+ mutex_init(&counter->lock);
+
+ return counter;
+}
+
+static void rdma_counter_free(struct rdma_counter *counter)
+{
+ rdma_restrack_del(&counter->res);
+ kfree(counter);
+}
+
+static void auto_mode_init_counter(struct rdma_counter *counter,
+ const struct ib_qp *qp,
+ enum rdma_nl_counter_mask new_mask)
+{
+ struct auto_mode_param *param = &counter->mode.param;
+
+ counter->mode.mode = RDMA_COUNTER_MODE_AUTO;
+ counter->mode.mask = new_mask;
+
+ if (new_mask & RDMA_COUNTER_MASK_QP_TYPE)
+ param->qp_type = qp->qp_type;
+}
+
+static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter,
+ enum rdma_nl_counter_mask auto_mask)
+{
+ struct auto_mode_param *param = &counter->mode.param;
+ bool match = true;
+
+ if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res))
+ return false;
+
+ /* Ensure that counter belong to right PID */
+ if (!rdma_is_kernel_res(&counter->res) &&
+ !rdma_is_kernel_res(&qp->res) &&
+ (task_pid_vnr(counter->res.task) != current->pid))
+ return false;
+
+ if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE)
+ match &= (param->qp_type == qp->qp_type);
+
+ return match;
+}
+
+static int __rdma_counter_bind_qp(struct rdma_counter *counter,
+ struct ib_qp *qp)
+{
+ int ret;
+
+ if (qp->counter)
+ return -EINVAL;
+
+ if (!qp->device->ops.counter_bind_qp)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&counter->lock);
+ ret = qp->device->ops.counter_bind_qp(counter, qp);
+ mutex_unlock(&counter->lock);
+
+ return ret;
+}
+
+static int __rdma_counter_unbind_qp(struct ib_qp *qp)
+{
+ struct rdma_counter *counter = qp->counter;
+ int ret;
+
+ if (!qp->device->ops.counter_unbind_qp)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&counter->lock);
+ ret = qp->device->ops.counter_unbind_qp(qp);
+ mutex_unlock(&counter->lock);
+
+ return ret;
+}
+
+/**
+ * rdma_get_counter_auto_mode - Find the counter that @qp should be bound
+ * with in auto mode
+ *
+ * Return: The counter (with ref-count increased) if found
+ */
+static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp,
+ u8 port)
+{
+ struct rdma_port_counter *port_counter;
+ struct rdma_counter *counter = NULL;
+ struct ib_device *dev = qp->device;
+ struct rdma_restrack_entry *res;
+ struct rdma_restrack_root *rt;
+ unsigned long id = 0;
+
+ port_counter = &dev->port_data[port].port_counter;
+ rt = &dev->res[RDMA_RESTRACK_COUNTER];
+ xa_lock(&rt->xa);
+ xa_for_each(&rt->xa, id, res) {
+ if (!rdma_is_visible_in_pid_ns(res))
+ continue;
+
+ counter = container_of(res, struct rdma_counter, res);
+ if ((counter->device != qp->device) || (counter->port != port))
+ goto next;
+
+ if (auto_mode_match(qp, counter, port_counter->mode.mask))
+ break;
+next:
+ counter = NULL;
+ }
+
+ if (counter)
+ kref_get(&counter->kref);
+
+ xa_unlock(&rt->xa);
+ return counter;
+}
+
+static void rdma_counter_res_add(struct rdma_counter *counter,
+ struct ib_qp *qp)
+{
+ if (rdma_is_kernel_res(&qp->res)) {
+ rdma_restrack_set_task(&counter->res, qp->res.kern_name);
+ rdma_restrack_kadd(&counter->res);
+ } else {
+ rdma_restrack_attach_task(&counter->res, qp->res.task);
+ rdma_restrack_uadd(&counter->res);
+ }
+}
+
+static void counter_release(struct kref *kref)
+{
+ struct rdma_counter *counter;
+
+ counter = container_of(kref, struct rdma_counter, kref);
+ counter->device->ops.counter_dealloc(counter);
+ rdma_counter_free(counter);
+}
+
+/**
+ * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on
+ * the auto-mode rule
+ */
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port)
+{
+ struct rdma_port_counter *port_counter;
+ struct ib_device *dev = qp->device;
+ struct rdma_counter *counter;
+ int ret;
+
+ if (!rdma_is_port_valid(dev, port))
+ return -EINVAL;
+
+ port_counter = &dev->port_data[port].port_counter;
+ if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO)
+ return 0;
+
+ counter = rdma_get_counter_auto_mode(qp, port);
+ if (counter) {
+ ret = __rdma_counter_bind_qp(counter, qp);
+ if (ret) {
+ kref_put(&counter->kref, counter_release);
+ return ret;
+ }
+ } else {
+ counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_AUTO);
+ if (!counter)
+ return -ENOMEM;
+
+ auto_mode_init_counter(counter, qp, port_counter->mode.mask);
+
+ ret = __rdma_counter_bind_qp(counter, qp);
+ if (ret) {
+ rdma_counter_free(counter);
+ return ret;
+ }
+
+ rdma_counter_res_add(counter, qp);
+ }
+
+ return 0;
+}
+
+/**
+ * rdma_counter_unbind_qp - Unbind a qp from a counter
+ * @force:
+ * true - Decrease the counter ref-count anyway (e.g., qp destroy)
+ */
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force)
+{
+ struct rdma_counter *counter = qp->counter;
+ int ret;
+
+ if (!counter)
+ return -EINVAL;
+
+ ret = __rdma_counter_unbind_qp(qp);
+ if (ret && !force)
+ return ret;
+
+ kref_put(&counter->kref, counter_release);
+ return 0;
+}
+
void rdma_counter_init(struct ib_device *dev)
{
struct rdma_port_counter *port_counter;
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 6579865e4866..f3181b74c863 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2471,6 +2471,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
SET_DEVICE_OP(dev_ops, alloc_xrcd);
SET_DEVICE_OP(dev_ops, attach_mcast);
SET_DEVICE_OP(dev_ops, check_mr_status);
+ SET_DEVICE_OP(dev_ops, counter_bind_qp);
+ SET_DEVICE_OP(dev_ops, counter_dealloc);
+ SET_DEVICE_OP(dev_ops, counter_unbind_qp);
SET_DEVICE_OP(dev_ops, create_ah);
SET_DEVICE_OP(dev_ops, create_counters);
SET_DEVICE_OP(dev_ops, create_cq);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 4a04e94a72db..92349bf37589 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1690,6 +1690,14 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
}
}
+ /*
+ * Bind this qp to a counter automatically based on the rdma counter
+ * rules. This only set in RST2INIT with port specified
+ */
+ if (!qp->counter && (attr_mask & IB_QP_PORT) &&
+ ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT))
+ rdma_counter_bind_qp_auto(qp, attr->port_num);
+
ret = ib_security_modify_qp(qp, attr, attr_mask, udata);
if (ret)
goto out;
@@ -1885,6 +1893,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
if (!qp->uobject)
rdma_rw_cleanup_mrs(qp);
+ rdma_counter_unbind_qp(qp, true);
rdma_restrack_del(&qp->res);
ret = qp->device->ops.destroy_qp(qp, udata);
if (!ret) {
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 3d19c056fbc0..0205472eb73a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1698,6 +1698,9 @@ struct ib_qp {
* Implementation details of the RDMA core, don't use in drivers:
*/
struct rdma_restrack_entry res;
+
+ /* The counter the qp is bind to */
+ struct rdma_counter *counter;
};
struct ib_dm {
@@ -2485,6 +2488,21 @@ struct ib_device_ops {
u8 pdata_len);
int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog);
int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
+ /**
+ * counter_bind_qp - Bind a QP to a counter.
+ * @counter - The counter to be bound. If counter->id is zero then
+ * the driver needs to allocate a new counter and set counter->id
+ */
+ int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp);
+ /**
+ * counter_unbind_qp - Unbind the qp from the dynamically-allocated
+ * counter and bind it onto the default one
+ */
+ int (*counter_unbind_qp)(struct ib_qp *qp);
+ /**
+ * counter_dealloc -De-allocate the hw counter
+ */
+ int (*counter_dealloc)(struct rdma_counter *counter);
DECLARE_RDMA_OBJ_SIZE(ib_ah);
DECLARE_RDMA_OBJ_SIZE(ib_cq);
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 8dd2619c015d..9f93a2403c9c 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -7,11 +7,14 @@
#define _RDMA_COUNTER_H_
#include <linux/mutex.h>
+#include <linux/pid_namespace.h>
#include <rdma/ib_verbs.h>
#include <rdma/restrack.h>
#include <rdma/rdma_netlink.h>
+struct ib_qp;
+
struct auto_mode_param {
int qp_type;
};
@@ -31,6 +34,9 @@ struct rdma_counter {
struct rdma_restrack_entry res;
struct ib_device *device;
uint32_t id;
+ struct kref kref;
+ struct rdma_counter_mode mode;
+ struct mutex lock;
u8 port;
};
@@ -38,5 +44,7 @@ void rdma_counter_init(struct ib_device *dev);
void rdma_counter_release(struct ib_device *dev);
int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
bool on, enum rdma_nl_counter_mask mask);
+int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port);
+int rdma_counter_unbind_qp(struct ib_qp *qp, bool force);
#endif /* _RDMA_COUNTER_H_ */
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 05/17] RDMA/counter: Add set/clear per-port auto mode support
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Add an API to support set/clear per-port auto mode.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/Makefile | 2 +-
drivers/infiniband/core/counters.c | 74 ++++++++++++++++++++++++++++++
drivers/infiniband/core/device.c | 5 ++
include/rdma/ib_verbs.h | 2 +
include/rdma/rdma_counter.h | 24 ++++++++++
include/uapi/rdma/rdma_netlink.h | 26 +++++++++++
6 files changed, 132 insertions(+), 1 deletion(-)
create mode 100644 drivers/infiniband/core/counters.c
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 42f1b2a4f746..09881bd5f12d 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o \
roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \
multicast.o mad.o smi.o agent.o mad_rmpp.o \
- nldev.o restrack.o
+ nldev.o restrack.o counters.o
ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o
ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c
new file mode 100644
index 000000000000..6167914fba06
--- /dev/null
+++ b/drivers/infiniband/core/counters.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_counter.h>
+
+#include "core_priv.h"
+#include "restrack.h"
+
+#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE)
+
+static int __counter_set_mode(struct rdma_counter_mode *curr,
+ enum rdma_nl_counter_mode new_mode,
+ enum rdma_nl_counter_mask new_mask)
+{
+ if ((new_mode == RDMA_COUNTER_MODE_AUTO) &&
+ ((new_mask & (~ALL_AUTO_MODE_MASKS)) ||
+ (curr->mode != RDMA_COUNTER_MODE_NONE)))
+ return -EINVAL;
+
+ curr->mode = new_mode;
+ curr->mask = new_mask;
+ return 0;
+}
+
+/**
+ * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode
+ *
+ * When @on is true, the @mask must be set
+ */
+int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+ bool on, enum rdma_nl_counter_mask mask)
+{
+ struct rdma_port_counter *port_counter;
+ int ret;
+
+ port_counter = &dev->port_data[port].port_counter;
+ mutex_lock(&port_counter->lock);
+ if (on) {
+ ret = __counter_set_mode(&port_counter->mode,
+ RDMA_COUNTER_MODE_AUTO, mask);
+ } else {
+ if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __counter_set_mode(&port_counter->mode,
+ RDMA_COUNTER_MODE_NONE, 0);
+ }
+
+out:
+ mutex_unlock(&port_counter->lock);
+ return ret;
+}
+
+void rdma_counter_init(struct ib_device *dev)
+{
+ struct rdma_port_counter *port_counter;
+ u32 port;
+
+ if (!dev->ops.alloc_hw_stats || !dev->port_data)
+ return;
+
+ rdma_for_each_port(dev, port) {
+ port_counter = &dev->port_data[port].port_counter;
+ port_counter->mode.mode = RDMA_COUNTER_MODE_NONE;
+ mutex_init(&port_counter->lock);
+ }
+}
+
+void rdma_counter_release(struct ib_device *dev)
+{
+}
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8a6ccb936dfe..6579865e4866 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -46,6 +46,7 @@
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
+#include <rdma/rdma_counter.h>
#include "core_priv.h"
#include "restrack.h"
@@ -492,10 +493,12 @@ static void ib_device_release(struct device *device)
if (dev->port_data) {
ib_cache_release_one(dev);
ib_security_release_port_pkey_list(dev);
+ rdma_counter_release(dev);
kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
pdata[0]),
rcu_head);
}
+
xa_destroy(&dev->compat_devs);
xa_destroy(&dev->client_data);
kfree_rcu(dev, rcu_head);
@@ -1316,6 +1319,8 @@ int ib_register_device(struct ib_device *device, const char *name)
ib_device_register_rdmacg(device);
+ rdma_counter_init(device);
+
/*
* Ensure that ADD uevent is not fired because it
* is too early amd device is not initialized yet.
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 26e9c2594913..3d19c056fbc0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -62,6 +62,7 @@
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <uapi/rdma/ib_user_verbs.h>
+#include <rdma/rdma_counter.h>
#include <rdma/restrack.h>
#include <rdma/signature.h>
#include <uapi/rdma/rdma_user_ioctl.h>
@@ -2119,6 +2120,7 @@ struct ib_port_data {
spinlock_t netdev_lock;
struct net_device __rcu *netdev;
struct hlist_node ndev_hash_link;
+ struct rdma_port_counter port_counter;
};
/* rdma netdev type - specifies protocol type */
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
index 283ac1a0cdb7..8dd2619c015d 100644
--- a/include/rdma/rdma_counter.h
+++ b/include/rdma/rdma_counter.h
@@ -6,8 +6,26 @@
#ifndef _RDMA_COUNTER_H_
#define _RDMA_COUNTER_H_
+#include <linux/mutex.h>
+
#include <rdma/ib_verbs.h>
#include <rdma/restrack.h>
+#include <rdma/rdma_netlink.h>
+
+struct auto_mode_param {
+ int qp_type;
+};
+
+struct rdma_counter_mode {
+ enum rdma_nl_counter_mode mode;
+ enum rdma_nl_counter_mask mask;
+ struct auto_mode_param param;
+};
+
+struct rdma_port_counter {
+ struct rdma_counter_mode mode;
+ struct mutex lock;
+};
struct rdma_counter {
struct rdma_restrack_entry res;
@@ -15,4 +33,10 @@ struct rdma_counter {
uint32_t id;
u8 port;
};
+
+void rdma_counter_init(struct ib_device *dev);
+void rdma_counter_release(struct ib_device *dev);
+int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port,
+ bool on, enum rdma_nl_counter_mask mask);
+
#endif /* _RDMA_COUNTER_H_ */
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 650cee8c4bf1..e3cd912e9cef 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -507,4 +507,30 @@ enum rdma_nldev_attr {
*/
RDMA_NLDEV_ATTR_MAX
};
+
+/*
+ * Supported counter bind modes. All modes are mutual-exclusive.
+ */
+enum rdma_nl_counter_mode {
+ RDMA_COUNTER_MODE_NONE,
+
+ /*
+ * A qp is bound with a counter automatically during initialization
+ * based on the auto mode (e.g., qp type, ...)
+ */
+ RDMA_COUNTER_MODE_AUTO,
+
+ /*
+ * Always the end
+ */
+ RDMA_COUNTER_MODE_MAX,
+};
+
+/*
+ * Supported criteria in counter auto mode.
+ * Currently only "qp type" is supported
+ */
+enum rdma_nl_counter_mask {
+ RDMA_COUNTER_MASK_QP_TYPE = 1,
+};
#endif /* _UAPI_RDMA_NETLINK_H */
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 04/17] RDMA/restrack: Make is_visible_in_pid_ns() as an API
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Remove is_visible_in_pid_ns() from nldev.c and make it as a restrack API,
so that it can be taken advantage by other parts like counter.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/nldev.c | 15 ++-------------
drivers/infiniband/core/restrack.c | 13 +++++++++++++
drivers/infiniband/core/restrack.h | 1 +
3 files changed, 16 insertions(+), 13 deletions(-)
diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 5499f5629dc2..d9ebfb50962b 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1002,17 +1002,6 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = {
},
};
-static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res)
-{
- /*
- * 1. Kern resources should be visible in init name space only
- * 2. Present only resources visible in the current namespace
- */
- if (rdma_is_kernel_res(res))
- return task_active_pid_ns(current) == &init_pid_ns;
- return task_active_pid_ns(current) == task_active_pid_ns(res->task);
-}
-
static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack,
enum rdma_restrack_type res_type)
@@ -1057,7 +1046,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
goto err;
}
- if (!is_visible_in_pid_ns(res)) {
+ if (!rdma_is_visible_in_pid_ns(res)) {
ret = -ENOENT;
goto err_get;
}
@@ -1169,7 +1158,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
* objects.
*/
xa_for_each(&rt->xa, id, res) {
- if (!is_visible_in_pid_ns(res))
+ if (!rdma_is_visible_in_pid_ns(res))
continue;
if (idx < start || !rdma_restrack_get(res))
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 3714634ae296..bddff426ee0f 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -349,3 +349,16 @@ void rdma_restrack_del(struct rdma_restrack_entry *res)
}
}
EXPORT_SYMBOL(rdma_restrack_del);
+
+bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res)
+{
+ /*
+ * 1. Kern resources should be visible in init
+ * namespace only
+ * 2. Present only resources visible in the current
+ * namespace
+ */
+ if (rdma_is_kernel_res(res))
+ return task_active_pid_ns(current) == &init_pid_ns;
+ return task_active_pid_ns(current) == task_active_pid_ns(res->task);
+}
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
index d084e5f89849..7bd177cc0a61 100644
--- a/drivers/infiniband/core/restrack.h
+++ b/drivers/infiniband/core/restrack.h
@@ -27,4 +27,5 @@ int rdma_restrack_init(struct ib_device *dev);
void rdma_restrack_clean(struct ib_device *dev);
void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
struct task_struct *task);
+bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res);
#endif /* _RDMA_CORE_RESTRACK_H_ */
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 03/17] RDMA/restrack: Add an API to attach a task to a resource
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Add rdma_restrack_attach_task() which is able to attach a task
other then "current" to a resource.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/restrack.c | 14 ++++++++++++++
drivers/infiniband/core/restrack.h | 2 ++
2 files changed, 16 insertions(+)
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 95573f292aae..3714634ae296 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -194,6 +194,20 @@ void rdma_restrack_set_task(struct rdma_restrack_entry *res,
}
EXPORT_SYMBOL(rdma_restrack_set_task);
+/**
+ * rdma_restrack_attach_task() - attach the task onto this resource
+ * @res: resource entry
+ * @task: the task to attach, the current task will be used if it is NULL.
+ */
+void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+ struct task_struct *task)
+{
+ if (res->task)
+ put_task_struct(res->task);
+ get_task_struct(task);
+ res->task = task;
+}
+
static void rdma_restrack_add(struct rdma_restrack_entry *res)
{
struct ib_device *dev = res_to_dev(res);
diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h
index 09a1fbdf578e..d084e5f89849 100644
--- a/drivers/infiniband/core/restrack.h
+++ b/drivers/infiniband/core/restrack.h
@@ -25,4 +25,6 @@ struct rdma_restrack_root {
int rdma_restrack_init(struct ib_device *dev);
void rdma_restrack_clean(struct ib_device *dev);
+void rdma_restrack_attach_task(struct rdma_restrack_entry *res,
+ struct task_struct *task);
#endif /* _RDMA_CORE_RESTRACK_H_ */
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 02/17] RDMA/restrack: Introduce statistic counter
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Introduce statistic counter as a new resource. It allows a user
to monitor specific objects (e.g., QPs) by binding to a counter.
In some cases a user counter resource is created with task other then
"current", because its creation is done as part of rdmatool call.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
drivers/infiniband/core/restrack.c | 22 +++++++++++++++++-----
include/rdma/rdma_counter.h | 18 ++++++++++++++++++
include/rdma/restrack.h | 4 ++++
3 files changed, 39 insertions(+), 5 deletions(-)
create mode 100644 include/rdma/rdma_counter.h
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 3b5ff2f7b5f8..95573f292aae 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -6,6 +6,7 @@
#include <rdma/rdma_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/restrack.h>
+#include <rdma/rdma_counter.h>
#include <linux/mutex.h>
#include <linux/sched/task.h>
#include <linux/pid_namespace.h>
@@ -45,6 +46,7 @@ static const char *type2str(enum rdma_restrack_type type)
[RDMA_RESTRACK_CM_ID] = "CM_ID",
[RDMA_RESTRACK_MR] = "MR",
[RDMA_RESTRACK_CTX] = "CTX",
+ [RDMA_RESTRACK_COUNTER] = "COUNTER",
};
return names[type];
@@ -169,6 +171,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res)
return container_of(res, struct ib_mr, res)->device;
case RDMA_RESTRACK_CTX:
return container_of(res, struct ib_ucontext, res)->device;
+ case RDMA_RESTRACK_COUNTER:
+ return container_of(res, struct rdma_counter, res)->device;
default:
WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type);
return NULL;
@@ -203,15 +207,22 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res)
kref_init(&res->kref);
init_completion(&res->comp);
- if (res->type != RDMA_RESTRACK_QP)
- ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
- &rt->next_id, GFP_KERNEL);
- else {
+ if (res->type == RDMA_RESTRACK_QP) {
/* Special case to ensure that LQPN points to right QP */
struct ib_qp *qp = container_of(res, struct ib_qp, res);
ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL);
res->id = ret ? 0 : qp->qp_num;
+ } else if (res->type == RDMA_RESTRACK_COUNTER) {
+ /* Special case to ensure that cntn points to right counter */
+ struct rdma_counter *counter;
+
+ counter = container_of(res, struct rdma_counter, res);
+ ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL);
+ res->id = ret ? 0 : counter->id;
+ } else {
+ ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b,
+ &rt->next_id, GFP_KERNEL);
}
if (!ret)
@@ -237,7 +248,8 @@ EXPORT_SYMBOL(rdma_restrack_kadd);
*/
void rdma_restrack_uadd(struct rdma_restrack_entry *res)
{
- if (res->type != RDMA_RESTRACK_CM_ID)
+ if ((res->type != RDMA_RESTRACK_CM_ID) &&
+ (res->type != RDMA_RESTRACK_COUNTER))
res->task = NULL;
if (!res->task)
diff --git a/include/rdma/rdma_counter.h b/include/rdma/rdma_counter.h
new file mode 100644
index 000000000000..283ac1a0cdb7
--- /dev/null
+++ b/include/rdma/rdma_counter.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+
+#ifndef _RDMA_COUNTER_H_
+#define _RDMA_COUNTER_H_
+
+#include <rdma/ib_verbs.h>
+#include <rdma/restrack.h>
+
+struct rdma_counter {
+ struct rdma_restrack_entry res;
+ struct ib_device *device;
+ uint32_t id;
+ u8 port;
+};
+#endif /* _RDMA_COUNTER_H_ */
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index ecf3c7702a4f..4041a4d96524 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -42,6 +42,10 @@ enum rdma_restrack_type {
* @RDMA_RESTRACK_CTX: Verbs contexts (CTX)
*/
RDMA_RESTRACK_CTX,
+ /**
+ * @RDMA_RESTRACK_COUNTER: Statistic Counter
+ */
+ RDMA_RESTRACK_COUNTER,
/**
* @RDMA_RESTRACK_MAX: Last entry, used for array dclarations
*/
--
2.20.1
^ permalink raw reply related
* [PATCH mlx5-next v5 01/17] net/mlx5: Add rts2rts_qp_counters_set_id field in hca cap
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
In-Reply-To: <20190702100246.17382-1-leon@kernel.org>
From: Mark Zhang <markz@mellanox.com>
Add rts2rts_qp_counters_set_id field in hca cap so that RTS2RTS
qp modification can be used to change the counter of a QP.
Signed-off-by: Mark Zhang <markz@mellanox.com>
Reviewed-by: Majd Dibbiny <majd@mellanox.com>
Acked-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
---
include/linux/mlx5/mlx5_ifc.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index e3c154b573a2..16348528fef6 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1028,7 +1028,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 cc_modify_allowed[0x1];
u8 start_pad[0x1];
u8 cache_line_128byte[0x1];
- u8 reserved_at_165[0xa];
+ u8 reserved_at_165[0x4];
+ u8 rts2rts_qp_counters_set_id[0x1];
+ u8 reserved_at_16a[0x5];
u8 qcam_reg[0x1];
u8 gid_table_size[0x10];
--
2.20.1
^ permalink raw reply related
* [PATCH rdma-next v5 00/17] Statistics counter support
From: Leon Romanovsky @ 2019-07-02 10:02 UTC (permalink / raw)
To: Doug Ledford, Jason Gunthorpe
Cc: Leon Romanovsky, RDMA mailing list, Majd Dibbiny, Mark Zhang,
Saeed Mahameed, linux-netdev
From: Leon Romanovsky <leonro@mellanox.com>
Changelog:
v4 -> v5:
* Patch #6 and #14 - consolidated many counter release functions,
removed mutex lock protection from dealloc_counter() call
and simplified kref_put/kref_get operations.
* Added Saeed's ACK tags.
v3 -> v4:
* Add counter_dealloc() callback function
* Moved to kref implementation
* Fixed lock during spinlock
v2 -> v3:
* We didn't change use of atomics over kref for management of unbind
counter from QP. The reason to it that bind and unbind are non-symmetric
in regards of put and get, so we need to count differently memory
release flows of HW objects (restrack) and SW bind operations.
* Everything else was addressed.
v1 -> v2:
* Rebased to latest rdma-next
v0 -> v1:
* Changed wording of counter comment
* Removed unneeded assignments
* Added extra patch to present global counters
----------------------------------------------------
Hi,
This series from Mark provides dynamic statistics infrastructure.
He uses netlink interface to configure and retrieve those counters.
This infrastructure allows to users monitor various objects by binding
to them counters. As the beginning, we used QP object as target for
those counters, but future patches will include ODP MR information too.
Two binding modes are supported:
- Auto: This allows a user to build automatic set of objects to a counter
according to common criteria. For example in a per-type scheme, where in
one process all QPs with same QP type are bound automatically to a single
counter.
- Manual: This allows a user to manually bind objects on a counter.
Those two modes are mutual-exclusive with separation between processes,
objects created by different processes cannot be bound to a same counter.
For objects which don't support counter binding, we will return
pre-allocated counters.
$ rdma statistic qp set link mlx5_2/1 auto type on
$ rdma statistic qp set link mlx5_2/1 auto off
$ rdma statistic qp bind link mlx5_2/1 lqpn 178
$ rdma statistic qp unbind link mlx5_2/1 cntn 4 lqpn 178
$ rdma statistic show
$ rdma statistic qp mode
Thanks
Mark Zhang (17):
net/mlx5: Add rts2rts_qp_counters_set_id field in hca cap
RDMA/restrack: Introduce statistic counter
RDMA/restrack: Add an API to attach a task to a resource
RDMA/restrack: Make is_visible_in_pid_ns() as an API
RDMA/counter: Add set/clear per-port auto mode support
RDMA/counter: Add "auto" configuration mode support
IB/mlx5: Support set qp counter
IB/mlx5: Add counter set id as a parameter for
mlx5_ib_query_q_counters()
IB/mlx5: Support statistic q counter configuration
RDMA/nldev: Allow counter auto mode configration through RDMA netlink
RDMA/netlink: Implement counter dumpit calback
IB/mlx5: Add counter_alloc_stats() and counter_update_stats() support
RDMA/core: Get sum value of all counters when perform a sysfs stat
read
RDMA/counter: Allow manual mode configuration support
RDMA/nldev: Allow counter manual mode configration through RDMA
netlink
RDMA/nldev: Allow get counter mode through RDMA netlink
RDMA/nldev: Allow get default counter statistics through RDMA netlink
drivers/infiniband/core/Makefile | 2 +-
drivers/infiniband/core/counters.c | 634 +++++++++++++++++++++++++++
drivers/infiniband/core/device.c | 10 +
drivers/infiniband/core/nldev.c | 551 ++++++++++++++++++++++-
drivers/infiniband/core/restrack.c | 49 ++-
drivers/infiniband/core/restrack.h | 3 +
drivers/infiniband/core/sysfs.c | 16 +-
drivers/infiniband/core/verbs.c | 9 +
drivers/infiniband/hw/mlx5/main.c | 77 +++-
drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +
drivers/infiniband/hw/mlx5/qp.c | 76 +++-
include/linux/mlx5/mlx5_ifc.h | 4 +-
include/linux/mlx5/qp.h | 1 +
include/rdma/ib_verbs.h | 31 ++
include/rdma/rdma_counter.h | 65 +++
include/rdma/restrack.h | 4 +
include/uapi/rdma/rdma_netlink.h | 52 ++-
17 files changed, 1559 insertions(+), 31 deletions(-)
create mode 100644 drivers/infiniband/core/counters.c
create mode 100644 include/rdma/rdma_counter.h
^ permalink raw reply
* Re: use exact allocation for dma coherent memory
From: Arend Van Spriel @ 2019-07-02 9:48 UTC (permalink / raw)
To: Christoph Hellwig, Maarten Lankhorst, Maxime Ripard, Sean Paul,
David Airlie, Daniel Vetter, Jani Nikula, Joonas Lahtinen,
Rodrigo Vivi, Ian Abbott, H Hartley Sweeten
Cc: devel, linux-s390, Intel Linux Wireless, linux-rdma, netdev,
intel-gfx, linux-wireless, linux-kernel, dri-devel, linux-mm,
iommu, moderated list:ARM PORT, linux-media
In-Reply-To: <20190701084833.GA22927@lst.de>
On 7/1/2019 10:48 AM, Christoph Hellwig wrote:
> On Fri, Jun 14, 2019 at 03:47:10PM +0200, Christoph Hellwig wrote:
>> Switching to a slightly cleaned up alloc_pages_exact is pretty easy,
>> but it turns out that because we didn't filter valid gfp_t flags
>> on the DMA allocator, a bunch of drivers were passing __GFP_COMP
>> to it, which is rather bogus in too many ways to explain. Arm has
>> been filtering it for a while, but this series instead tries to fix
>> the drivers and warn when __GFP_COMP is passed, which makes it much
>> larger than just adding the functionality.
>
> Dear driver maintainers,
>
> can you look over the patches touching your drivers, please? I'd
> like to get as much as possible of the driver patches into this
> merge window, so that it can you through your maintainer trees.
You made me look ;-) Actually not touching my drivers so I'm off the
hook. However, I was wondering if drivers could know so I decided to
look into the DMA-API.txt documentation which currently states:
"""
The flag parameter (dma_alloc_coherent() only) allows the caller to
specify the ``GFP_`` flags (see kmalloc()) for the allocation (the
implementation may choose to ignore flags that affect the location of
the returned memory, like GFP_DMA).
"""
I do expect you are going to change that description as well now that
you are going to issue a warning on __GFP_COMP. Maybe include that in
patch 15/16 where you introduce that warning.
Regards,
Arend
^ permalink raw reply
* Re: [for-next V2 10/10] RDMA/core: Provide RDMA DIM support for ULPs
From: Leon Romanovsky @ 2019-07-02 6:41 UTC (permalink / raw)
To: Sagi Grimberg
Cc: Idan Burstein, Saeed Mahameed, David S. Miller, Doug Ledford,
Jason Gunthorpe, Or Gerlitz, Tal Gilboa, netdev@vger.kernel.org,
linux-rdma@vger.kernel.org, Yamin Friedman, Max Gurtovoy
In-Reply-To: <9d26c90c-8e0b-656f-341f-a67251549126@grimberg.me>
On Mon, Jul 01, 2019 at 10:36:41PM -0700, Sagi Grimberg wrote:
> Hey Idan,
>
> > " Please don't. This is a bad choice to opt it in by default."
> >
> > I disagree here. I'd prefer Linux to have good out of the box experience (e.g. reach 100G in 4K NVMeOF on Intel servers) with the default parameters. Especially since Yamin have shown it is beneficial / not hurting in terms of performance for variety of use cases. The whole concept of DIM is that it adapts to the workload requirements in terms of bandwidth and latency.
>
> Well, its a Mellanox device driver after all.
>
> But do note that by far, the vast majority of users are not saturating
> 100G of 4K I/O. The absolute vast majority of users are primarily
> sensitive to synchronous QD=1 I/O latency, and when the workload
> is much more dynamic than the synthetic 100%/50%/0% read mix.
>
> As much as I'm a fan (IIRC I was the one giving a first pass at this),
> the dim default opt-in is not only not beneficial, but potentially
> harmful to the majority of users out-of-the-box experience.
>
> Given that this is a fresh code with almost no exposure, and that was
> not tested outside of Yamin running limited performance testing, I think
> it would be a mistake to add it as a default opt-in, that can come as an
> incremental stage.
>
> Obviously, I cannot tell what Mellanox should/shouldn't do in its own
> device driver of course, but I just wanted to emphasize that I think
> this is a mistake.
Hi Sagi,
I'm not sharing your worries about bad out-of-the-box experience for a
number of reasons.
First of all, this code is part of upstream kernel and will take time
till users actually start to use it as is and not as part of some distro
backports or MOFED packages.
Second, Yamin did extensive testing and worked very close with Or G.
and I have very high confident in the results of their team work.
Third (outcome of first), actually the opposite is true, the setting
this option as a default will give us more time to fix/adjust code if
needed, before users will see any potential degradation.
>
> > Moreover, net-dim is enabled by default, I don't see why RDMA is different.
>
> Very different animals.
Yes and no, the logic behind is the same and both solutions have same
constrains of throughput vs. latency.
Thanks
^ permalink raw reply
* Re: [for-next V2 10/10] RDMA/core: Provide RDMA DIM support for ULPs
From: Sagi Grimberg @ 2019-07-02 5:36 UTC (permalink / raw)
To: Idan Burstein, Saeed Mahameed, David S. Miller, Doug Ledford,
Jason Gunthorpe
Cc: Leon Romanovsky, Or Gerlitz, Tal Gilboa, netdev@vger.kernel.org,
linux-rdma@vger.kernel.org, Yamin Friedman, Max Gurtovoy
In-Reply-To: <AM5PR0501MB248327B260F97EF97CD5B80EC5E20@AM5PR0501MB2483.eurprd05.prod.outlook.com>
Hey Idan,
> " Please don't. This is a bad choice to opt it in by default."
>
> I disagree here. I'd prefer Linux to have good out of the box experience (e.g. reach 100G in 4K NVMeOF on Intel servers) with the default parameters. Especially since Yamin have shown it is beneficial / not hurting in terms of performance for variety of use cases. The whole concept of DIM is that it adapts to the workload requirements in terms of bandwidth and latency.
Well, its a Mellanox device driver after all.
But do note that by far, the vast majority of users are not saturating
100G of 4K I/O. The absolute vast majority of users are primarily
sensitive to synchronous QD=1 I/O latency, and when the workload
is much more dynamic than the synthetic 100%/50%/0% read mix.
As much as I'm a fan (IIRC I was the one giving a first pass at this),
the dim default opt-in is not only not beneficial, but potentially
harmful to the majority of users out-of-the-box experience.
Given that this is a fresh code with almost no exposure, and that was
not tested outside of Yamin running limited performance testing, I think
it would be a mistake to add it as a default opt-in, that can come as an
incremental stage.
Obviously, I cannot tell what Mellanox should/shouldn't do in its own
device driver of course, but I just wanted to emphasize that I think
this is a mistake.
> Moreover, net-dim is enabled by default, I don't see why RDMA is different.
Very different animals.
^ permalink raw reply
* Re: [PATCH mlx5-next 00/18] Mellanox, mlx5 E-Switch and low level updates
From: Saeed Mahameed @ 2019-07-01 23:46 UTC (permalink / raw)
To: Saeed Mahameed
Cc: Leon Romanovsky, netdev@vger.kernel.org,
linux-rdma@vger.kernel.org
In-Reply-To: <20190628223516.9368-1-saeedm@mellanox.com>
On Fri, Jun 28, 2019 at 3:35 PM Saeed Mahameed <saeedm@mellanox.com> wrote:
>
> Hi All,
>
> This series includes some low level updates mainly in the E-Switch
> netdev and rdma vport representors areas.
>
> From Parav and Huy:
> 1) Added hardware bits and structures definitions for sub-functions
> 2) Small code cleanup and improvement for PF pci driver.
>
> From Bodong:
> 3) Use the correct name semantics of vport index and vport number
> 4) Cleanup the rep and netdev reference when unloading IB rep.
> 5) Bluefield (ECPF) updates and refactoring for better E-Switch
> management on ECPF embedded CPU NIC:
> 5.1) Consolidate querying eswitch number of VFs
> 5.2) Register event handler at the correct E-Switch init stage
> 5.3) Setup PF's inline mode and vlan pop when the ECPF is the
> E-Swtich manager ( the host PF is basically a VF ).
> 5.4) Handle Vport UC address changes in switchdev mode.
>
> From Shay:
> 6) Add support for MCQI and MCQS hardware registers.
>
> In case of no objections these patches will be applied to mlx5-next and
> will be sent later as pull request to both rdma-next and net-next trees.
Applied to mlx5-next,
Thanks!
^ permalink raw reply
* [PATCH v5 0/3] [v4.9.y] coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping
From: Ajay Kaher @ 2019-07-01 18:32 UTC (permalink / raw)
To: aarcange, jannh, oleg, peterx, rppt, jgg, mhocko
Cc: jglisse, akpm, mike.kravetz, viro, riandrews, arve, yishaih,
dledford, sean.hefty, hal.rosenstock, matanb, leonro, gregkh,
torvalds, linux-fsdevel, linux-mm, devel, linux-rdma,
linux-kernel, stable, akaher, srivatsab, amakhalov
In-Reply-To: <1562005928-1929-1-git-send-email-akaher@vmware.com>
coredump: fix race condition between mmget_not_zero()/get_task_mm()
and core dumping
[PATCH v5 1/3]:
Backporting of commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream.
[PATCH v5 2/3]:
Extension of commit 04f5866e41fb to fix the race condition between
get_task_mm() and core dumping for IB->mlx4 and IB->mlx5 drivers.
[PATCH v5 3/3]
Backporting of commit 59ea6d06cfa9247b586a695c21f94afa7183af74 upstream.
[diff from v4]:
- Corrected Subject line for [PATCH v5 2/3], [PATCH v5 3/3]
^ permalink raw reply
* [PATCH v5 3/3] [v4.9.y] coredump: fix race condition between collapse_huge_page() and core dumping
From: Ajay Kaher @ 2019-07-01 18:32 UTC (permalink / raw)
To: aarcange, jannh, oleg, peterx, rppt, jgg, mhocko
Cc: jglisse, akpm, mike.kravetz, viro, riandrews, arve, yishaih,
dledford, sean.hefty, hal.rosenstock, matanb, leonro, gregkh,
torvalds, linux-fsdevel, linux-mm, devel, linux-rdma,
linux-kernel, stable, akaher, srivatsab, amakhalov, Hugh Dickins,
Mike Rapoport
In-Reply-To: <1562005928-1929-1-git-send-email-akaher@vmware.com>
From: Andrea Arcangeli <aarcange@redhat.com>
coredump: fix race condition between collapse_huge_page() and core dumping
commit 59ea6d06cfa9247b586a695c21f94afa7183af74 upstream.
When fixing the race conditions between the coredump and the mmap_sem
holders outside the context of the process, we focused on
mmget_not_zero()/get_task_mm() callers in 04f5866e41fb70 ("coredump: fix
race condition between mmget_not_zero()/get_task_mm() and core
dumping"), but those aren't the only cases where the mmap_sem can be
taken outside of the context of the process as Michal Hocko noticed
while backporting that commit to older -stable kernels.
If mmgrab() is called in the context of the process, but then the
mm_count reference is transferred outside the context of the process,
that can also be a problem if the mmap_sem has to be taken for writing
through that mm_count reference.
khugepaged registration calls mmgrab() in the context of the process,
but the mmap_sem for writing is taken later in the context of the
khugepaged kernel thread.
collapse_huge_page() after taking the mmap_sem for writing doesn't
modify any vma, so it's not obvious that it could cause a problem to the
coredump, but it happens to modify the pmd in a way that breaks an
invariant that pmd_trans_huge_lock() relies upon. collapse_huge_page()
needs the mmap_sem for writing just to block concurrent page faults that
call pmd_trans_huge_lock().
Specifically the invariant that "!pmd_trans_huge()" cannot become a
"pmd_trans_huge()" doesn't hold while collapse_huge_page() runs.
The coredump will call __get_user_pages() without mmap_sem for reading,
which eventually can invoke a lockless page fault which will need a
functional pmd_trans_huge_lock().
So collapse_huge_page() needs to use mmget_still_valid() to check it's
not running concurrently with the coredump... as long as the coredump
can invoke page faults without holding the mmap_sem for reading.
This has "Fixes: khugepaged" to facilitate backporting, but in my view
it's more a bug in the coredump code that will eventually have to be
rewritten to stop invoking page faults without the mmap_sem for reading.
So the long term plan is still to drop all mmget_still_valid().
Link: http://lkml.kernel.org/r/20190607161558.32104-1-aarcange@redhat.com
Fixes: ba76149f47d8 ("thp: khugepaged")
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
[Ajay: Just adjusted to apply on v4.9]
Signed-off-by: Ajay Kaher <akaher@vmware.com>
---
include/linux/mm.h | 4 ++++
mm/khugepaged.c | 3 +++
2 files changed, 7 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c239984..8852158 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1194,6 +1194,10 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
* followed by taking the mmap_sem for writing before modifying the
* vmas or anything the coredump pretends not to change from under it.
*
+ * It also has to be called when mmgrab() is used in the context of
+ * the process, but then the mm_count refcount is transferred outside
+ * the context of the process to run down_write() on that pinned mm.
+ *
* NOTE: find_extend_vma() called from GUP context is the only place
* that can modify the "mm" (notably the vm_start/end) under mmap_sem
* for reading and outside the context of the process, so it is also
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e0cfc3a..8217ee5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1004,6 +1004,9 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
+ result = SCAN_ANY_PROCESS;
+ if (!mmget_still_valid(mm))
+ goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
--
2.7.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox