* [PATCH 1/2] devlink: Expose port function commands to control roce
2022-11-02 16:39 [PATCH 0/2] devlink: Add port function attribute to enable/disable roce Daniel Jurgens
@ 2022-11-02 16:39 ` Daniel Jurgens
2022-11-05 2:53 ` Jakub Kicinski
2022-11-02 16:39 ` [PATCH 2/2] net/mlx5: E-Switch, Implement devlink port function cmds " Daniel Jurgens
1 sibling, 1 reply; 4+ messages in thread
From: Daniel Jurgens @ 2022-11-02 16:39 UTC (permalink / raw)
To: netdev, davem, kuba; +Cc: parav, saeedm, yishaih, Daniel Jurgens, Jiri Pirko
From: Yishai Hadas <yishaih@nvidia.com>
Expose port function commands to turn on / off roce, this is used to
control the port roce device capabilities.
When roce is disabled for a function of the port, function cannot create
any roce specific resources (e.g GID table).
It also saves system memory utilization. For example disabling roce on a
VF/SF saves 1 Mbytes of system memory per function.
Example of a PCI VF port which supports function configuration:
Set roce of the VF's port function.
$ devlink port show pci/0000:06:00.0/2
pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
function:
hw_addr 00:00:00:00:00:00 roce on
$ devlink port function set pci/0000:06:00.0/2 roce off
$ devlink port show pci/0000:06:00.0/2
pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
function:
hw_addr 00:11:22:33:44:55 roce off
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Daniel Jurgens <danielj@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
.../networking/devlink/devlink-port.rst | 5 +-
include/net/devlink.h | 20 ++++++
include/uapi/linux/devlink.h | 1 +
net/core/devlink.c | 61 +++++++++++++++++++
4 files changed, 86 insertions(+), 1 deletion(-)
diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index 7627b1da01f2..fd191622ab68 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -110,7 +110,7 @@ devlink ports for both the controllers.
Function configuration
======================
-A user can configure the function attribute before enumerating the PCI
+A user can configure one or more function attributes before enumerating the PCI
function. Usually it means, user should configure function attribute
before a bus specific device for the function is created. However, when
SRIOV is enabled, virtual function devices are created on the PCI bus.
@@ -122,6 +122,9 @@ A user may set the hardware address of the function using
'devlink port function set hw_addr' command. For Ethernet port function
this means a MAC address.
+A user may set also the roce capability of the function using
+'devlink port function set roce' command.
+
Subfunction
============
diff --git a/include/net/devlink.h b/include/net/devlink.h
index ba6b8b094943..c26edd5c72fa 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1414,6 +1414,26 @@ struct devlink_ops {
int (*port_function_hw_addr_set)(struct devlink_port *port,
const u8 *hw_addr, int hw_addr_len,
struct netlink_ext_ack *extack);
+ /**
+ * @port_function_roce_get: Port function's roce get function.
+ *
+ * Should be used by device drivers to report the roce state of
+ * a function managed by the devlink port. Driver should return
+ * -EOPNOTSUPP if it doesn't support port function handling for
+ * a particular port.
+ */
+ int (*port_function_roce_get)(struct devlink_port *port, bool *on,
+ struct netlink_ext_ack *extack);
+ /**
+ * @port_function_roce_set: Port function's roce set function.
+ *
+ * Should be used by device drivers to enable/disable the roce state of
+ * a function managed by the devlink port. Driver should return
+ * -EOPNOTSUPP if it doesn't support port function handling for
+ * a particular port.
+ */
+ int (*port_function_roce_set)(struct devlink_port *port, bool on,
+ struct netlink_ext_ack *extack);
/**
* port_new() - Add a new port function of a specified flavor
* @devlink: Devlink instance
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 2f24b53a87a5..16efff56fc0d 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -658,6 +658,7 @@ enum devlink_port_function_attr {
DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */
DEVLINK_PORT_FN_ATTR_STATE, /* u8 */
DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */
+ DEVLINK_PORT_FN_ATTR_ROCE, /* u8 */
__DEVLINK_PORT_FUNCTION_ATTR_MAX,
DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 89baa7c0938b..4f9d81888699 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -199,6 +199,7 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_
[DEVLINK_PORT_FN_ATTR_STATE] =
NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
DEVLINK_PORT_FN_STATE_ACTIVE),
+ [DEVLINK_PORT_FN_ATTR_ROCE] = NLA_POLICY_RANGE(NLA_U8, 0, 1),
};
static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
@@ -691,6 +692,33 @@ devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
return 0;
}
+static int devlink_port_function_roce_fill(const struct devlink_ops *ops,
+ struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ bool on;
+ int err;
+
+ if (!ops->port_function_roce_get)
+ return 0;
+
+ err = ops->port_function_roce_get(port, &on, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ err = nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_ROCE, on);
+ if (err)
+ return err;
+
+ *msg_updated = true;
+ return 0;
+}
+
static int
devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
struct genl_info *info,
@@ -1248,6 +1276,25 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops,
return 0;
}
+static int
+devlink_port_fn_roce_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_ops *ops = port->devlink->ops;
+ bool on;
+
+ on = nla_get_u8(attr);
+
+ if (!ops->port_function_roce_set) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Port doesn't support roce function attribute");
+ return -EOPNOTSUPP;
+ }
+
+ return ops->port_function_roce_set(port, on, extack);
+}
+
static int
devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
struct netlink_ext_ack *extack)
@@ -1266,6 +1313,12 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
&msg_updated);
if (err)
goto out;
+
+ err = devlink_port_function_roce_fill(ops, port, msg, extack,
+ &msg_updated);
+ if (err)
+ goto out;
+
err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated);
out:
if (err || !msg_updated)
@@ -1670,6 +1723,14 @@ static int devlink_port_function_set(struct devlink_port *port,
if (err)
return err;
}
+
+ attr = tb[DEVLINK_PORT_FN_ATTR_ROCE];
+ if (attr) {
+ err = devlink_port_fn_roce_set(port, attr, extack);
+ if (err)
+ return err;
+ }
+
/* Keep this as the last function attribute set, so that when
* multiple port function attributes are set along with state,
* Those can be applied first before activating the state.
--
2.27.0
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH 2/2] net/mlx5: E-Switch, Implement devlink port function cmds to control roce
2022-11-02 16:39 [PATCH 0/2] devlink: Add port function attribute to enable/disable roce Daniel Jurgens
2022-11-02 16:39 ` [PATCH 1/2] devlink: Expose port function commands to control roce Daniel Jurgens
@ 2022-11-02 16:39 ` Daniel Jurgens
1 sibling, 0 replies; 4+ messages in thread
From: Daniel Jurgens @ 2022-11-02 16:39 UTC (permalink / raw)
To: netdev, davem, kuba; +Cc: parav, saeedm, yishaih, Daniel Jurgens
From: Yishai Hadas <yishaih@nvidia.com>
Implement devlink port function commands to enable / disable roce.
This is used to control the roce device capabilities.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Daniel Jurgens <danielj@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
.../device_drivers/ethernet/mellanox/mlx5.rst | 32 ++++++
.../net/ethernet/mellanox/mlx5/core/devlink.c | 2 +
.../net/ethernet/mellanox/mlx5/core/eswitch.c | 34 ++++++
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 8 +-
.../mellanox/mlx5/core/eswitch_offloads.c | 105 ++++++++++++++++++
.../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +
.../net/ethernet/mellanox/mlx5/core/vport.c | 23 ++++
7 files changed, 205 insertions(+), 1 deletion(-)
diff --git a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
index 5edf50d7dbd5..2ffdc103571e 100644
--- a/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
+++ b/Documentation/networking/device_drivers/ethernet/mellanox/mlx5.rst
@@ -388,6 +388,38 @@ device created for the PCI VF/SF.
function:
hw_addr 00:00:00:00:88:88
+RoCE capability setup
+---------------------
+Not all mlx5 PCI VFs/SFs require RoCE capability.
+
+When RoCE capability is disabled, it saves 1 Mbytes worth of system memory per
+PCI VF/SF.
+
+mlx5 driver provides mechanism to setup RoCE capability.
+
+When user disables RoCE capability for a VF/SF, user application cannot send or
+receive any RoCE packets through this VF/SF and RoCE GID table for this PCI
+will be empty.
+
+When RoCE capability is disabled in the device using port function attribute,
+VF/SF driver cannot override it.
+
+- Get RoCE capability of the VF device::
+
+ $ devlink port show pci/0000:06:00.0/2
+ pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+ function:
+ hw_addr 00:00:00:00:00:00 roce on
+
+- Set RoCE capability of the VF device::
+
+ $ devlink port function set pci/0000:06:00.0/2 roce off
+
+ $ devlink port show pci/0000:06:00.0/2
+ pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+ function:
+ hw_addr 00:00:00:00:00:00 roce off
+
SF state setup
--------------
To use the SF, the user must active the SF using the SF function state
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 66c6a7017695..69d103694ac2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -318,6 +318,8 @@ static const struct devlink_ops mlx5_devlink_ops = {
.rate_node_new = mlx5_esw_devlink_rate_node_new,
.rate_node_del = mlx5_esw_devlink_rate_node_del,
.rate_leaf_parent_set = mlx5_esw_devlink_rate_parent_set,
+ .port_function_roce_get = mlx5_devlink_port_function_roce_get,
+ .port_function_roce_set = mlx5_devlink_port_function_roce_set,
#endif
#ifdef CONFIG_MLX5_SF_MANAGER
.port_new = mlx5_devlink_sf_port_new,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index c59107fa9e6d..035240587163 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -772,6 +772,32 @@ static void esw_vport_cleanup_acl(struct mlx5_eswitch *esw,
esw_vport_destroy_offloads_acl_tables(esw, vport);
}
+static int mlx5_esw_vport_roce_cap_get(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
+{
+ int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+ void *query_ctx;
+ void *hca_caps;
+ int err;
+
+ if (!MLX5_CAP_GEN(esw->dev, vhca_resource_manager))
+ return 0;
+
+ query_ctx = kzalloc(query_out_sz, GFP_KERNEL);
+ if (!query_ctx)
+ return -ENOMEM;
+
+ err = mlx5_vport_get_other_func_cap(esw->dev, vport->vport, query_ctx);
+ if (err)
+ goto out_free;
+
+ hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability);
+ vport->info.roce_enabled = MLX5_GET(cmd_hca_cap, hca_caps, roce);
+
+out_free:
+ kfree(query_ctx);
+ return err;
+}
+
static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
{
u16 vport_num = vport->vport;
@@ -785,6 +811,10 @@ static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
if (mlx5_esw_is_manager_vport(esw, vport_num))
return 0;
+ err = mlx5_esw_vport_roce_cap_get(esw, vport);
+ if (err)
+ goto err_roce;
+
mlx5_modify_vport_admin_state(esw->dev,
MLX5_VPORT_STATE_OP_MOD_ESW_VPORT,
vport_num, 1,
@@ -804,6 +834,10 @@ static int esw_vport_setup(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
vport->info.qos, flags);
return 0;
+
+err_roce:
+ esw_vport_cleanup_acl(esw, vport);
+ return err;
}
/* Don't cleanup vport->info, it's needed to restore vport configuration */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index f68dc2d0dbe6..83e125d13b7d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -153,6 +153,7 @@ struct mlx5_vport_info {
u8 qos;
u8 spoofchk: 1;
u8 trusted: 1;
+ u8 roce_enabled: 1;
};
/* Vport context events */
@@ -508,7 +509,12 @@ int mlx5_devlink_port_function_hw_addr_get(struct devlink_port *port,
int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port,
const u8 *hw_addr, int hw_addr_len,
struct netlink_ext_ack *extack);
-
+int mlx5_devlink_port_function_roce_get(struct devlink_port *port,
+ bool *is_enabled,
+ struct netlink_ext_ack *extack);
+int mlx5_devlink_port_function_roce_set(struct devlink_port *port,
+ bool enable,
+ struct netlink_ext_ack *extack);
void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type);
int mlx5_eswitch_add_vlan_action(struct mlx5_eswitch *esw,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 4e50df3139c6..446488687a1c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -4024,3 +4024,108 @@ int mlx5_devlink_port_function_hw_addr_set(struct devlink_port *port,
return mlx5_eswitch_set_vport_mac(esw, vport_num, hw_addr);
}
+
+int mlx5_devlink_port_function_roce_get(struct devlink_port *port,
+ bool *is_enabled,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_eswitch *esw;
+ struct mlx5_vport *vport;
+ int err = -EOPNOTSUPP;
+ u16 vport_num;
+
+ esw = mlx5_devlink_eswitch_get(port->devlink);
+ if (IS_ERR(esw))
+ return PTR_ERR(esw);
+
+ if (!MLX5_CAP_GEN(esw->dev, vhca_resource_manager))
+ return -EOPNOTSUPP;
+
+ vport_num = mlx5_esw_devlink_port_index_to_vport_num(port->index);
+ if (!is_port_function_supported(esw, vport_num))
+ return -EOPNOTSUPP;
+
+ vport = mlx5_eswitch_get_vport(esw, vport_num);
+ if (IS_ERR(vport)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid port");
+ return PTR_ERR(vport);
+ }
+
+ mutex_lock(&esw->state_lock);
+ if (vport->enabled) {
+ *is_enabled = vport->info.roce_enabled;
+ err = 0;
+ }
+ mutex_unlock(&esw->state_lock);
+ return err;
+}
+
+int mlx5_devlink_port_function_roce_set(struct devlink_port *port,
+ bool enable,
+ struct netlink_ext_ack *extack)
+{
+ int query_out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
+ struct mlx5_eswitch *esw;
+ struct mlx5_vport *vport;
+ int err = -EOPNOTSUPP;
+ void *query_ctx;
+ void *hca_caps;
+ u16 vport_num;
+
+ esw = mlx5_devlink_eswitch_get(port->devlink);
+ if (IS_ERR(esw))
+ return PTR_ERR(esw);
+
+ if (!MLX5_CAP_GEN(esw->dev, vhca_resource_manager))
+ return -EOPNOTSUPP;
+
+ vport_num = mlx5_esw_devlink_port_index_to_vport_num(port->index);
+ if (!is_port_function_supported(esw, vport_num))
+ return -EOPNOTSUPP;
+
+ vport = mlx5_eswitch_get_vport(esw, vport_num);
+ if (IS_ERR(vport)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid port");
+ return PTR_ERR(vport);
+ }
+
+ mutex_lock(&esw->state_lock);
+ if (!vport->enabled) {
+ NL_SET_ERR_MSG_MOD(extack, "Eswitch vport is disabled");
+ goto out;
+ }
+
+ if (vport->info.roce_enabled == enable) {
+ err = 0;
+ goto out;
+ }
+
+ query_ctx = kzalloc(query_out_sz, GFP_KERNEL);
+ if (!query_ctx) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = mlx5_vport_get_other_func_cap(esw->dev, vport_num, query_ctx);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed getting HCA caps");
+ goto out_free;
+ }
+
+ hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_ctx, capability);
+ MLX5_SET(cmd_hca_cap, hca_caps, roce, enable);
+
+ err = mlx5_vport_set_other_func_cap(esw->dev, hca_caps, vport_num);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed setting HCA roce cap");
+ goto out_free;
+ }
+
+ vport->info.roce_enabled = enable;
+
+out_free:
+ kfree(query_ctx);
+out:
+ mutex_unlock(&esw->state_lock);
+ return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index a806e3de7b7c..9f717f3fff98 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -309,6 +309,8 @@ enum {
u8 mlx5_get_nic_state(struct mlx5_core_dev *dev);
void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state);
+int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);
+int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 function_id);
static inline bool mlx5_core_is_sf(const struct mlx5_core_dev *dev)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
index d5c317325030..75c876712992 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c
@@ -1171,3 +1171,26 @@ int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, vo
MLX5_SET(query_hca_cap_in, in, other_function, true);
return mlx5_cmd_exec_inout(dev, query_hca_cap, in, out);
}
+
+int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 function_id)
+{
+ int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
+ void *set_hca_cap;
+ void *set_ctx;
+ int ret;
+
+ set_ctx = kzalloc(set_sz, GFP_KERNEL);
+ if (!set_ctx)
+ return -ENOMEM;
+
+ MLX5_SET(set_hca_cap_in, set_ctx, opcode, MLX5_CMD_OP_SET_HCA_CAP);
+ MLX5_SET(set_hca_cap_in, set_ctx, op_mod, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1);
+ set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability);
+ memcpy(set_hca_cap, hca_cap, MLX5_ST_SZ_BYTES(cmd_hca_cap));
+ MLX5_SET(set_hca_cap_in, set_ctx, function_id, function_id);
+ MLX5_SET(set_hca_cap_in, set_ctx, other_function, true);
+ ret = mlx5_cmd_exec_in(dev, set_hca_cap, set_ctx);
+
+ kfree(set_ctx);
+ return ret;
+}
--
2.27.0
^ permalink raw reply related [flat|nested] 4+ messages in thread