public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Donald Hunter <donald.hunter@gmail.com>,
	Simon Horman <horms@kernel.org>, Jiri Pirko <jiri@resnulli.us>,
	Jonathan Corbet <corbet@lwn.net>,
	Shuah Khan <skhan@linuxfoundation.org>,
	Saeed Mahameed <saeedm@nvidia.com>,
	"Leon Romanovsky" <leon@kernel.org>,
	Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
	Shuah Khan <shuah@kernel.org>,
	Chuck Lever <chuck.lever@oracle.com>,
	"Matthieu Baerts (NGI0)" <matttbe@kernel.org>,
	Carolina Jubran <cjubran@nvidia.com>,
	Cosmin Ratiu <cratiu@nvidia.com>,
	Dragos Tatulea <dtatulea@nvidia.com>,
	Jacob Keller <jacob.e.keller@intel.com>,
	Shahar Shitrit <shshitrit@nvidia.com>,
	"Daniel Zahka" <daniel.zahka@gmail.com>,
	Parav Pandit <parav@nvidia.com>,
	"Adithya Jayachandran" <ajayachandra@nvidia.com>,
	Kees Cook <kees@kernel.org>, "Shay Drori" <shayd@nvidia.com>,
	Daniel Jurgens <danielj@nvidia.com>,
	Moshe Shemesh <moshe@nvidia.com>,
	Willem de Bruijn <willemb@google.com>, David Wei <dw@davidwei.uk>,
	Petr Machata <petrm@nvidia.com>,
	Stanislav Fomichev <sdf@fomichev.me>,
	Daniel Borkmann <daniel@iogearbox.net>, Joe Damato <joe@dama.to>,
	Nikolay Aleksandrov <razor@blackwall.org>,
	Vadim Fedorenko <vadim.fedorenko@linux.dev>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	"Antonio Quartulli" <antonio@openvpn.net>,
	Allison Henderson <allison.henderson@oracle.com>,
	Bui Quang Minh <minhquangbui99@gmail.com>,
	Nimrod Oren <noren@nvidia.com>, <netdev@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>, <linux-doc@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, <linux-kselftest@vger.kernel.org>,
	Gal Pressman <gal@nvidia.com>, Jiri Pirko <jiri@nvidia.com>
Subject: [PATCH net-next V9 10/14] net/mlx5: qos: Model the root node in the scheduling hierarchy
Date: Thu, 26 Mar 2026 08:59:45 +0200	[thread overview]
Message-ID: <20260326065949.44058-11-tariqt@nvidia.com> (raw)
In-Reply-To: <20260326065949.44058-1-tariqt@nvidia.com>

From: Cosmin Ratiu <cratiu@nvidia.com>

In commit [1] the concept of the root node in the qos hierarchy was
removed due to a bug with how tx_share worked. The side effect is that
in many places, there are now corner cases related to parent handling.
However, since that change, support for tc_bw was added and now, with
upcoming cross-esw support, the code is about to become even more
complicated, increasing the number of such corner cases.

Bring back the concept of the root node, to which all esw vports and
nodes are connected. This benefits multiple operations which can
assume there's always a valid parent and don't have to do ternary
gymnastics to determine the correct esw to talk to.

As a side effect, there's no longer a need to store the groups in the
qos domain, since normalization can simply iterate over all children of
the root node. Normalization gets simplified as a result.

There should be no functionality changes as a result of this change.

[1] commit 330f0f6713a3 ("net/mlx5: Remove default QoS group and attach
vports directly to root TSAR")
Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/esw/qos.c | 194 ++++++++----------
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |   3 +-
 2 files changed, 84 insertions(+), 113 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 4781a1a42f1a..0be516003bcd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -15,8 +15,6 @@
 struct mlx5_qos_domain {
 	/* Serializes access to all qos changes in the qos domain. */
 	struct mutex lock;
-	/* List of all mlx5_esw_sched_nodes. */
-	struct list_head nodes;
 };
 
 static void esw_qos_lock(struct mlx5_eswitch *esw)
@@ -43,7 +41,6 @@ static struct mlx5_qos_domain *esw_qos_domain_alloc(void)
 		return NULL;
 
 	mutex_init(&qos_domain->lock);
-	INIT_LIST_HEAD(&qos_domain->nodes);
 
 	return qos_domain;
 }
@@ -62,6 +59,7 @@ static void esw_qos_domain_release(struct mlx5_eswitch *esw)
 }
 
 enum sched_node_type {
+	SCHED_NODE_TYPE_ROOT,
 	SCHED_NODE_TYPE_VPORTS_TSAR,
 	SCHED_NODE_TYPE_VPORT,
 	SCHED_NODE_TYPE_TC_ARBITER_TSAR,
@@ -106,18 +104,6 @@ struct mlx5_esw_sched_node {
 	u32 tc_bw[DEVLINK_RATE_TCS_MAX];
 };
 
-static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node)
-{
-	if (!node->parent) {
-		/* Root children are assigned a depth level of 2. */
-		node->level = 2;
-		list_add_tail(&node->entry, &node->esw->qos.domain->nodes);
-	} else {
-		node->level = node->parent->level + 1;
-		list_add_tail(&node->entry, &node->parent->children);
-	}
-}
-
 static int esw_qos_num_tcs(struct mlx5_core_dev *dev)
 {
 	int num_tcs = mlx5_max_tc(dev) + 1;
@@ -125,14 +111,14 @@ static int esw_qos_num_tcs(struct mlx5_core_dev *dev)
 	return num_tcs < DEVLINK_RATE_TCS_MAX ? num_tcs : DEVLINK_RATE_TCS_MAX;
 }
 
-static void
-esw_qos_node_set_parent(struct mlx5_esw_sched_node *node, struct mlx5_esw_sched_node *parent)
+static void esw_qos_node_set_parent(struct mlx5_esw_sched_node *node,
+				    struct mlx5_esw_sched_node *parent)
 {
-	list_del_init(&node->entry);
 	node->parent = parent;
-	if (parent)
-		node->esw = parent->esw;
-	esw_qos_node_attach_to_parent(node);
+	node->esw = parent->esw;
+	node->level = parent->level + 1;
+	list_del(&node->entry);
+	list_add_tail(&node->entry, &parent->children);
 }
 
 static void esw_qos_nodes_set_parent(struct list_head *nodes,
@@ -321,22 +307,19 @@ static int esw_qos_create_rate_limit_element(struct mlx5_esw_sched_node *node,
 	return esw_qos_node_create_sched_element(node, sched_ctx, extack);
 }
 
-static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw,
-					      struct mlx5_esw_sched_node *parent)
+static u32
+esw_qos_calculate_min_rate_divider(struct mlx5_esw_sched_node *parent)
 {
-	struct list_head *nodes = parent ? &parent->children : &esw->qos.domain->nodes;
-	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	u32 fw_max_bw_share = MLX5_CAP_QOS(parent->esw->dev, max_tsar_bw_share);
 	struct mlx5_esw_sched_node *node;
 	u32 max_guarantee = 0;
 
 	/* Find max min_rate across all nodes.
 	 * This will correspond to fw_max_bw_share in the final bw_share calculation.
 	 */
-	list_for_each_entry(node, nodes, entry) {
-		if (node->esw == esw && node->ix != esw->qos.root_tsar_ix &&
-		    node->min_rate > max_guarantee)
+	list_for_each_entry(node, &parent->children, entry)
+		if (node->min_rate > max_guarantee)
 			max_guarantee = node->min_rate;
-	}
 
 	if (max_guarantee)
 		return max_t(u32, max_guarantee / fw_max_bw_share, 1);
@@ -368,18 +351,13 @@ static void esw_qos_update_sched_node_bw_share(struct mlx5_esw_sched_node *node,
 	esw_qos_sched_elem_config(node, node->max_rate, bw_share, extack);
 }
 
-static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
-				       struct mlx5_esw_sched_node *parent,
+static void esw_qos_normalize_min_rate(struct mlx5_esw_sched_node *parent,
 				       struct netlink_ext_ack *extack)
 {
-	struct list_head *nodes = parent ? &parent->children : &esw->qos.domain->nodes;
-	u32 divider = esw_qos_calculate_min_rate_divider(esw, parent);
+	u32 divider = esw_qos_calculate_min_rate_divider(parent);
 	struct mlx5_esw_sched_node *node;
 
-	list_for_each_entry(node, nodes, entry) {
-		if (node->esw != esw || node->ix == esw->qos.root_tsar_ix)
-			continue;
-
+	list_for_each_entry(node, &parent->children, entry) {
 		/* Vports TC TSARs don't have a minimum rate configured,
 		 * so there's no need to update the bw_share on them.
 		 */
@@ -391,7 +369,7 @@ static void esw_qos_normalize_min_rate(struct mlx5_eswitch *esw,
 		if (list_empty(&node->children))
 			continue;
 
-		esw_qos_normalize_min_rate(node->esw, node, extack);
+		esw_qos_normalize_min_rate(node, extack);
 	}
 }
 
@@ -412,14 +390,11 @@ static u32 esw_qos_calculate_tc_bw_divider(u32 *tc_bw)
 static int esw_qos_set_node_min_rate(struct mlx5_esw_sched_node *node,
 				     u32 min_rate, struct netlink_ext_ack *extack)
 {
-	struct mlx5_eswitch *esw = node->esw;
-
 	if (min_rate == node->min_rate)
 		return 0;
 
 	node->min_rate = min_rate;
-	esw_qos_normalize_min_rate(esw, node->parent, extack);
-
+	esw_qos_normalize_min_rate(node->parent, extack);
 	return 0;
 }
 
@@ -472,8 +447,7 @@ esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node,
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
 	attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
 	MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport);
-	MLX5_SET(scheduling_context, sched_ctx, parent_element_id,
-		 parent ? parent->ix : vport_node->esw->qos.root_tsar_ix);
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent->ix);
 	MLX5_SET(scheduling_context, sched_ctx, max_average_bw,
 		 vport_node->max_rate);
 
@@ -513,7 +487,7 @@ esw_qos_vport_tc_create_sched_element(struct mlx5_esw_sched_node *vport_tc_node,
 }
 
 static struct mlx5_esw_sched_node *
-__esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type type,
+__esw_qos_alloc_node(u32 tsar_ix, enum sched_node_type type,
 		     struct mlx5_esw_sched_node *parent)
 {
 	struct mlx5_esw_sched_node *node;
@@ -522,20 +496,12 @@ __esw_qos_alloc_node(struct mlx5_eswitch *esw, u32 tsar_ix, enum sched_node_type
 	if (!node)
 		return NULL;
 
-	node->esw = esw;
 	node->ix = tsar_ix;
 	node->type = type;
-	node->parent = parent;
 	INIT_LIST_HEAD(&node->children);
-	esw_qos_node_attach_to_parent(node);
-	if (!parent) {
-		/* The caller is responsible for inserting the node into the
-		 * parent list if necessary. This function can also be used with
-		 * a NULL parent, which doesn't necessarily indicate that it
-		 * refers to the root scheduling element.
-		 */
-		list_del_init(&node->entry);
-	}
+	INIT_LIST_HEAD(&node->entry);
+	if (parent)
+		esw_qos_node_set_parent(node, parent);
 
 	return node;
 }
@@ -570,7 +536,7 @@ static int esw_qos_create_vports_tc_node(struct mlx5_esw_sched_node *parent,
 					  SCHEDULING_HIERARCHY_E_SWITCH))
 		return -EOPNOTSUPP;
 
-	vports_tc_node = __esw_qos_alloc_node(parent->esw, 0,
+	vports_tc_node = __esw_qos_alloc_node(0,
 					      SCHED_NODE_TYPE_VPORTS_TC_TSAR,
 					      parent);
 	if (!vports_tc_node) {
@@ -665,7 +631,6 @@ static int esw_qos_create_tc_arbiter_sched_elem(
 		struct netlink_ext_ack *extack)
 {
 	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
-	u32 tsar_parent_ix;
 	void *attr;
 
 	if (!mlx5_qos_tsar_type_supported(tc_arbiter_node->esw->dev,
@@ -678,10 +643,8 @@ static int esw_qos_create_tc_arbiter_sched_elem(
 
 	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
 	MLX5_SET(tsar_element, attr, tsar_type, TSAR_ELEMENT_TSAR_TYPE_TC_ARB);
-	tsar_parent_ix = tc_arbiter_node->parent ? tc_arbiter_node->parent->ix :
-			 tc_arbiter_node->esw->qos.root_tsar_ix;
 	MLX5_SET(scheduling_context, tsar_ctx, parent_element_id,
-		 tsar_parent_ix);
+		 tc_arbiter_node->parent->ix);
 	MLX5_SET(scheduling_context, tsar_ctx, element_type,
 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
 	MLX5_SET(scheduling_context, tsar_ctx, max_average_bw,
@@ -694,37 +657,36 @@ static int esw_qos_create_tc_arbiter_sched_elem(
 }
 
 static struct mlx5_esw_sched_node *
-__esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct mlx5_esw_sched_node *parent,
+__esw_qos_create_vports_sched_node(struct mlx5_esw_sched_node *parent,
 				   struct netlink_ext_ack *extack)
 {
 	struct mlx5_esw_sched_node *node;
-	u32 tsar_ix;
 	int err;
+	u32 ix;
 
-	err = esw_qos_create_node_sched_elem(esw->dev, esw->qos.root_tsar_ix, 0,
-					     0, &tsar_ix);
+	err = esw_qos_create_node_sched_elem(parent->esw->dev, parent->ix, 0, 0,
+					     &ix);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack, "E-Switch create TSAR for node failed");
 		return ERR_PTR(err);
 	}
 
-	node = __esw_qos_alloc_node(esw, tsar_ix, SCHED_NODE_TYPE_VPORTS_TSAR, parent);
+	node = __esw_qos_alloc_node(ix, SCHED_NODE_TYPE_VPORTS_TSAR, parent);
 	if (!node) {
 		NL_SET_ERR_MSG_MOD(extack, "E-Switch alloc node failed");
 		err = -ENOMEM;
 		goto err_alloc_node;
 	}
 
-	list_add_tail(&node->entry, &esw->qos.domain->nodes);
-	esw_qos_normalize_min_rate(esw, NULL, extack);
-	trace_mlx5_esw_node_qos_create(esw->dev, node, node->ix);
+	esw_qos_normalize_min_rate(parent, extack);
+	trace_mlx5_esw_node_qos_create(parent->esw->dev, node, node->ix);
 
 	return node;
 
 err_alloc_node:
-	if (mlx5_destroy_scheduling_element_cmd(esw->dev,
+	if (mlx5_destroy_scheduling_element_cmd(parent->esw->dev,
 						SCHEDULING_HIERARCHY_E_SWITCH,
-						tsar_ix))
+						ix))
 		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR for node failed");
 	return ERR_PTR(err);
 }
@@ -746,7 +708,7 @@ esw_qos_create_vports_sched_node(struct mlx5_eswitch *esw, struct netlink_ext_ac
 	if (err)
 		return ERR_PTR(err);
 
-	node = __esw_qos_create_vports_sched_node(esw, NULL, extack);
+	node = __esw_qos_create_vports_sched_node(esw->qos.root, extack);
 	if (IS_ERR(node))
 		esw_qos_put(esw);
 
@@ -762,38 +724,47 @@ static void __esw_qos_destroy_node(struct mlx5_esw_sched_node *node, struct netl
 
 	trace_mlx5_esw_node_qos_destroy(esw->dev, node, node->ix);
 	esw_qos_destroy_node(node, extack);
-	esw_qos_normalize_min_rate(esw, NULL, extack);
+	esw_qos_normalize_min_rate(esw->qos.root, extack);
 }
 
 static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = esw->dev;
+	struct mlx5_esw_sched_node *root;
+	u32 root_ix;
 	int err;
 
 	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
 		return -EOPNOTSUPP;
 
-	err = esw_qos_create_node_sched_elem(esw->dev, 0, 0, 0,
-					     &esw->qos.root_tsar_ix);
+	err = esw_qos_create_node_sched_elem(esw->dev, 0, 0, 0, &root_ix);
 	if (err) {
 		esw_warn(dev, "E-Switch create root TSAR failed (%d)\n", err);
 		return err;
 	}
 
+	root = __esw_qos_alloc_node(root_ix, SCHED_NODE_TYPE_ROOT, NULL);
+	if (!root) {
+		esw_warn(dev, "E-Switch create root node failed\n");
+		err = -ENOMEM;
+		goto out_err;
+	}
+	root->esw = esw;
+	root->level = 1;
+	esw->qos.root = root;
 	refcount_set(&esw->qos.refcnt, 1);
 
 	return 0;
+out_err:
+	mlx5_destroy_scheduling_element_cmd(dev, SCHEDULING_HIERARCHY_E_SWITCH,
+					    root_ix);
+	return err;
 }
 
 static void esw_qos_destroy(struct mlx5_eswitch *esw)
 {
-	int err;
-
-	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
-						  SCHEDULING_HIERARCHY_E_SWITCH,
-						  esw->qos.root_tsar_ix);
-	if (err)
-		esw_warn(esw->dev, "E-Switch destroy root TSAR failed (%d)\n", err);
+	esw_qos_destroy_node(esw->qos.root, NULL);
+	esw->qos.root = NULL;
 }
 
 static int esw_qos_get(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
@@ -866,8 +837,7 @@ esw_qos_create_vport_tc_sched_node(struct mlx5_vport *vport,
 	u8 tc = vports_tc_node->tc;
 	int err;
 
-	vport_tc_node = __esw_qos_alloc_node(vport_node->esw, 0,
-					     SCHED_NODE_TYPE_VPORT_TC,
+	vport_tc_node = __esw_qos_alloc_node(0, SCHED_NODE_TYPE_VPORT_TC,
 					     vports_tc_node);
 	if (!vport_tc_node)
 		return -ENOMEM;
@@ -959,7 +929,7 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type,
 		/* Increase the parent's level by 2 to account for both the
 		 * TC arbiter and the vports TC scheduling element.
 		 */
-		new_level = (parent ? parent->level : 2) + 2;
+		new_level = parent->level + 2;
 		max_level = 1 << MLX5_CAP_QOS(vport_node->esw->dev,
 					      log_esw_max_sched_depth);
 		if (new_level > max_level) {
@@ -997,7 +967,7 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type,
 err_sched_nodes:
 	if (type == SCHED_NODE_TYPE_RATE_LIMITER) {
 		esw_qos_node_destroy_sched_element(vport_node, NULL);
-		esw_qos_node_attach_to_parent(vport_node);
+		esw_qos_node_set_parent(vport_node, vport_node->parent);
 	} else {
 		esw_qos_tc_arbiter_scheduling_teardown(vport_node, NULL);
 	}
@@ -1055,7 +1025,7 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a
 	vport_node->bw_share = 0;
 	memset(vport_node->tc_bw, 0, sizeof(vport_node->tc_bw));
 	list_del_init(&vport_node->entry);
-	esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack);
+	esw_qos_normalize_min_rate(vport_node->parent, extack);
 
 	trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport);
 }
@@ -1068,7 +1038,7 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport,
 	struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node;
 	int err;
 
-	esw_assert_qos_lock_held(vport->dev->priv.eswitch);
+	esw_assert_qos_lock_held(vport_node->esw);
 
 	esw_qos_node_set_parent(vport_node, parent);
 	if (type == SCHED_NODE_TYPE_VPORT)
@@ -1079,7 +1049,7 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport,
 		return err;
 
 	vport_node->type = type;
-	esw_qos_normalize_min_rate(vport_node->esw, parent, extack);
+	esw_qos_normalize_min_rate(parent, extack);
 	trace_mlx5_esw_vport_qos_create(vport->dev, vport, vport_node->max_rate,
 					vport_node->bw_share);
 
@@ -1092,7 +1062,6 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t
 {
 	struct mlx5_eswitch *esw = vport->dev->priv.eswitch;
 	struct mlx5_esw_sched_node *sched_node;
-	struct mlx5_eswitch *parent_esw;
 	int err;
 
 	esw_assert_qos_lock_held(esw);
@@ -1100,14 +1069,13 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t
 	if (err)
 		return err;
 
-	parent_esw = parent ? parent->esw : esw;
-	sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent);
+	if (!parent)
+		parent = esw->qos.root;
+	sched_node = __esw_qos_alloc_node(0, type, parent);
 	if (!sched_node) {
 		esw_qos_put(esw);
 		return -ENOMEM;
 	}
-	if (!parent)
-		list_add_tail(&sched_node->entry, &esw->qos.domain->nodes);
 
 	sched_node->max_rate = max_rate;
 	sched_node->min_rate = min_rate;
@@ -1147,7 +1115,8 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport)
 		goto unlock;
 
 	parent = vport->qos.sched_node->parent;
-	WARN(parent, "Disabling QoS on port before detaching it from node");
+	WARN(parent != esw->qos.root,
+	     "Disabling QoS on port before detaching it from node");
 
 	mlx5_esw_qos_vport_disable_locked(vport);
 unlock:
@@ -1319,11 +1288,9 @@ static int esw_qos_switch_tc_arbiter_node_to_vports(
 	struct mlx5_esw_sched_node *node,
 	struct netlink_ext_ack *extack)
 {
-	u32 parent_tsar_ix = node->parent ?
-			     node->parent->ix : node->esw->qos.root_tsar_ix;
 	int err;
 
-	err = esw_qos_create_node_sched_elem(node->esw->dev, parent_tsar_ix,
+	err = esw_qos_create_node_sched_elem(node->esw->dev, node->parent->ix,
 					     node->max_rate, node->bw_share,
 					     &node->ix);
 	if (err) {
@@ -1378,8 +1345,8 @@ esw_qos_move_node(struct mlx5_esw_sched_node *curr_node)
 {
 	struct mlx5_esw_sched_node *new_node;
 
-	new_node = __esw_qos_alloc_node(curr_node->esw, curr_node->ix,
-					curr_node->type, NULL);
+	new_node = __esw_qos_alloc_node(curr_node->ix, curr_node->type,
+					curr_node->parent);
 	if (!new_node)
 		return ERR_PTR(-ENOMEM);
 
@@ -1888,7 +1855,9 @@ mlx5_esw_qos_vport_update_parent(struct mlx5_vport *vport,
 		err = mlx5_esw_qos_vport_enable(vport, type, parent, 0, 0,
 						extack);
 	} else if (vport->qos.sched_node) {
-		err = esw_qos_vport_update_parent(vport, parent, extack);
+		err = esw_qos_vport_update_parent(vport,
+						  parent ? : esw->qos.root,
+						  extack);
 	}
 	esw_qos_unlock(esw);
 	return err;
@@ -1941,7 +1910,7 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
 {
 	u8 new_level, max_level;
 
-	if (parent && parent->esw != node->esw) {
+	if (parent->esw != node->esw) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Cannot assign node to another E-Switch");
 		return -EOPNOTSUPP;
@@ -1953,13 +1922,13 @@ mlx5_esw_qos_node_validate_set_parent(struct mlx5_esw_sched_node *node,
 		return -EOPNOTSUPP;
 	}
 
-	if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
+	if (parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Cannot attach a node to a parent with TC bandwidth configured");
 		return -EOPNOTSUPP;
 	}
 
-	new_level = parent ? parent->level + 1 : 2;
+	new_level = parent->level + 1;
 	if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
 		/* Increase by one to account for the vports TC scheduling
 		 * element.
@@ -2010,14 +1979,12 @@ static int esw_qos_vports_node_update_parent(struct mlx5_esw_sched_node *node,
 {
 	struct mlx5_esw_sched_node *curr_parent = node->parent;
 	struct mlx5_eswitch *esw = node->esw;
-	u32 parent_ix;
 	int err;
 
-	parent_ix = parent ? parent->ix : node->esw->qos.root_tsar_ix;
 	mlx5_destroy_scheduling_element_cmd(esw->dev,
 					    SCHEDULING_HIERARCHY_E_SWITCH,
 					    node->ix);
-	err = esw_qos_create_node_sched_elem(esw->dev, parent_ix,
+	err = esw_qos_create_node_sched_elem(esw->dev, parent->ix,
 					     node->max_rate, 0, &node->ix);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
@@ -2044,12 +2011,15 @@ static int mlx5_esw_qos_node_update_parent(struct mlx5_esw_sched_node *node,
 	struct mlx5_eswitch *esw = node->esw;
 	int err;
 
+	esw_qos_lock(esw);
+	curr_parent = node->parent;
+	if (!parent)
+		parent = esw->qos.root;
+
 	err = mlx5_esw_qos_node_validate_set_parent(node, parent, extack);
 	if (err)
-		return err;
+		goto out;
 
-	esw_qos_lock(esw);
-	curr_parent = node->parent;
 	if (node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) {
 		err = esw_qos_tc_arbiter_node_update_parent(node, parent,
 							    extack);
@@ -2060,8 +2030,8 @@ static int mlx5_esw_qos_node_update_parent(struct mlx5_esw_sched_node *node,
 	if (err)
 		goto out;
 
-	esw_qos_normalize_min_rate(esw, curr_parent, extack);
-	esw_qos_normalize_min_rate(esw, parent, extack);
+	esw_qos_normalize_min_rate(curr_parent, extack);
+	esw_qos_normalize_min_rate(parent, extack);
 
 out:
 	esw_qos_unlock(esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index a5a02b26b80b..9b3949a64784 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -390,8 +390,9 @@ struct mlx5_eswitch {
 	struct {
 		/* Initially 0, meaning no QoS users and QoS is disabled. */
 		refcount_t refcnt;
-		u32 root_tsar_ix;
 		struct mlx5_qos_domain *domain;
+		/* The root node of the hierarchy. */
+		struct mlx5_esw_sched_node *root;
 	} qos;
 
 	struct mlx5_esw_bridge_offloads *br_offloads;
-- 
2.44.0


  parent reply	other threads:[~2026-03-26  7:02 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-03-26  6:59 [PATCH net-next V9 00/14] devlink and mlx5: Support cross-function rate scheduling Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 01/14] devlink: Update nested instance locking comment Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 02/14] devlink: Add helpers to lock nested-in instances Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 03/14] devlink: Migrate from info->user_ptr to info->ctx Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 04/14] devlink: Decouple rate storage from associated devlink object Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 05/14] devlink: Add parent dev to devlink API Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 06/14] devlink: Allow parent dev for rate-set and rate-new Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 07/14] devlink: Allow rate node parents from other devlinks Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 08/14] net/mlx5: qos: Use mlx5_lag_query_bond_speed to query LAG speed Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 09/14] net/mlx5: qos: Expose a function to clear a vport's parent Tariq Toukan
2026-03-26  6:59 ` Tariq Toukan [this message]
2026-03-26  6:59 ` [PATCH net-next V9 11/14] net/mlx5: qos: Remove qos domains and use shd lock Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 12/14] net/mlx5: qos: Support cross-device tx scheduling Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 13/14] selftests: drv-net: Add test for cross-esw rate scheduling Tariq Toukan
2026-03-26  6:59 ` [PATCH net-next V9 14/14] net/mlx5: Document devlink rates Tariq Toukan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260326065949.44058-11-tariqt@nvidia.com \
    --to=tariqt@nvidia.com \
    --cc=ajayachandra@nvidia.com \
    --cc=allison.henderson@oracle.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=antonio@openvpn.net \
    --cc=chuck.lever@oracle.com \
    --cc=cjubran@nvidia.com \
    --cc=corbet@lwn.net \
    --cc=cratiu@nvidia.com \
    --cc=daniel.zahka@gmail.com \
    --cc=daniel@iogearbox.net \
    --cc=danielj@nvidia.com \
    --cc=davem@davemloft.net \
    --cc=donald.hunter@gmail.com \
    --cc=dtatulea@nvidia.com \
    --cc=dw@davidwei.uk \
    --cc=edumazet@google.com \
    --cc=gal@nvidia.com \
    --cc=horms@kernel.org \
    --cc=jacob.e.keller@intel.com \
    --cc=jiri@nvidia.com \
    --cc=jiri@resnulli.us \
    --cc=joe@dama.to \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=matttbe@kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=minhquangbui99@gmail.com \
    --cc=moshe@nvidia.com \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=noren@nvidia.com \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=petrm@nvidia.com \
    --cc=razor@blackwall.org \
    --cc=saeedm@nvidia.com \
    --cc=sdf@fomichev.me \
    --cc=shayd@nvidia.com \
    --cc=shshitrit@nvidia.com \
    --cc=shuah@kernel.org \
    --cc=skhan@linuxfoundation.org \
    --cc=vadim.fedorenko@linux.dev \
    --cc=willemb@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox