From: Tariq Toukan <tariqt@nvidia.com>
To: "David S. Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Eric Dumazet <edumazet@google.com>,
"Andrew Lunn" <andrew+netdev@lunn.ch>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, <netdev@vger.kernel.org>,
<linux-rdma@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
Moshe Shemesh <moshe@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
Vlad Dogaru <vdogaru@nvidia.com>,
Yevgeny Kliteynik <kliteyn@nvidia.com>,
Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 10/10] net/mlx5: HWS, dump bad completion details
Date: Sun, 11 May 2025 22:38:10 +0300 [thread overview]
Message-ID: <1746992290-568936-11-git-send-email-tariqt@nvidia.com> (raw)
In-Reply-To: <1746992290-568936-1-git-send-email-tariqt@nvidia.com>
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Failing to insert/delete a rule should not happen. If it does happen,
it would be good to know at which stage it happened and what was the
failure. This patch adds printing of bad CQE details.
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Reviewed-by: Vlad Dogaru <vdogaru@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../mellanox/mlx5/core/steering/hws/send.c | 122 +++++++++++++++++-
.../mellanox/mlx5/core/steering/hws/send.h | 1 +
2 files changed, 120 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
index cb6abc4ab7df..c4b22be19a9b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c
@@ -344,18 +344,133 @@ hws_send_engine_update_rule_resize(struct mlx5hws_send_engine *queue,
}
}
+static void hws_send_engine_dump_error_cqe(struct mlx5hws_send_engine *queue,
+ struct mlx5hws_send_ring_priv *priv,
+ struct mlx5_cqe64 *cqe)
+{
+ u8 wqe_opcode = cqe ? be32_to_cpu(cqe->sop_drop_qpn) >> 24 : 0;
+ struct mlx5hws_context *ctx = priv->rule->matcher->tbl->ctx;
+ u32 opcode = cqe ? get_cqe_opcode(cqe) : 0;
+ struct mlx5hws_rule *rule = priv->rule;
+
+ /* If something bad happens and lots of rules are failing, we don't
+ * want to pollute dmesg. Print only the first bad cqe per engine,
+ * the one that started the avalanche.
+ */
+ if (queue->error_cqe_printed)
+ return;
+
+ queue->error_cqe_printed = true;
+
+ if (mlx5hws_rule_move_in_progress(rule))
+ mlx5hws_err(ctx,
+ "--- rule 0x%08llx: error completion moving rule: phase %s, wqes left %d\n",
+ HWS_PTR_TO_ID(rule),
+ rule->resize_info->state ==
+ MLX5HWS_RULE_RESIZE_STATE_WRITING ? "WRITING" :
+ rule->resize_info->state ==
+ MLX5HWS_RULE_RESIZE_STATE_DELETING ? "DELETING" :
+ "UNKNOWN",
+ rule->pending_wqes);
+ else
+ mlx5hws_err(ctx,
+ "--- rule 0x%08llx: error completion %s (%d), wqes left %d\n",
+ HWS_PTR_TO_ID(rule),
+ rule->status ==
+ MLX5HWS_RULE_STATUS_CREATING ? "CREATING" :
+ rule->status ==
+ MLX5HWS_RULE_STATUS_DELETING ? "DELETING" :
+ rule->status ==
+ MLX5HWS_RULE_STATUS_FAILING ? "FAILING" :
+ rule->status ==
+ MLX5HWS_RULE_STATUS_UPDATING ? "UPDATING" : "NA",
+ rule->status,
+ rule->pending_wqes);
+
+ mlx5hws_err(ctx, " rule 0x%08llx: matcher 0x%llx %s\n",
+ HWS_PTR_TO_ID(rule),
+ HWS_PTR_TO_ID(rule->matcher),
+ (rule->matcher->flags & MLX5HWS_MATCHER_FLAGS_ISOLATED) ?
+ "(isolated)" : "");
+
+ if (!cqe) {
+ mlx5hws_err(ctx, " rule 0x%08llx: no CQE\n",
+ HWS_PTR_TO_ID(rule));
+ return;
+ }
+
+ mlx5hws_err(ctx, " rule 0x%08llx: cqe->opcode = %d %s\n",
+ HWS_PTR_TO_ID(rule), opcode,
+ opcode == MLX5_CQE_REQ ? "(MLX5_CQE_REQ)" :
+ opcode == MLX5_CQE_REQ_ERR ? "(MLX5_CQE_REQ_ERR)" : " ");
+
+ if (opcode == MLX5_CQE_REQ_ERR) {
+ struct mlx5_err_cqe *err_cqe = (struct mlx5_err_cqe *)cqe;
+
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |--- hw_error_syndrome = 0x%x\n",
+ HWS_PTR_TO_ID(rule),
+ err_cqe->rsvd1[16]);
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |--- hw_syndrome_type = 0x%x\n",
+ HWS_PTR_TO_ID(rule),
+ err_cqe->rsvd1[17] >> 4);
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |--- vendor_err_synd = 0x%x\n",
+ HWS_PTR_TO_ID(rule),
+ err_cqe->vendor_err_synd);
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |--- syndrome = 0x%x\n",
+ HWS_PTR_TO_ID(rule),
+ err_cqe->syndrome);
+ }
+
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: cqe->byte_cnt = 0x%08x\n",
+ HWS_PTR_TO_ID(rule), be32_to_cpu(cqe->byte_cnt));
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |-- UPDATE STATUS = %s\n",
+ HWS_PTR_TO_ID(rule),
+ (be32_to_cpu(cqe->byte_cnt) & 0x80000000) ?
+ "FAILURE" : "SUCCESS");
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |------- SYNDROME = %s\n",
+ HWS_PTR_TO_ID(rule),
+ ((be32_to_cpu(cqe->byte_cnt) & 0x00000003) == 1) ?
+ "SET_FLOW_FAIL" :
+ ((be32_to_cpu(cqe->byte_cnt) & 0x00000003) == 2) ?
+ "DISABLE_FLOW_FAIL" : "UNKNOWN");
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: cqe->sop_drop_qpn = 0x%08x\n",
+ HWS_PTR_TO_ID(rule), be32_to_cpu(cqe->sop_drop_qpn));
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |-send wqe opcode = 0x%02x %s\n",
+ HWS_PTR_TO_ID(rule), wqe_opcode,
+ wqe_opcode == MLX5HWS_WQE_OPCODE_TBL_ACCESS ?
+ "(MLX5HWS_WQE_OPCODE_TBL_ACCESS)" : "(UNKNOWN)");
+ mlx5hws_err(ctx,
+ " rule 0x%08llx: |------------ qpn = 0x%06x\n",
+ HWS_PTR_TO_ID(rule),
+ be32_to_cpu(cqe->sop_drop_qpn) & 0xffffff);
+}
+
static void hws_send_engine_update_rule(struct mlx5hws_send_engine *queue,
struct mlx5hws_send_ring_priv *priv,
u16 wqe_cnt,
- enum mlx5hws_flow_op_status *status)
+ enum mlx5hws_flow_op_status *status,
+ struct mlx5_cqe64 *cqe)
{
priv->rule->pending_wqes--;
- if (*status == MLX5HWS_FLOW_OP_ERROR) {
+ if (unlikely(*status == MLX5HWS_FLOW_OP_ERROR)) {
if (priv->retry_id) {
+ /* If there is a retry_id, then it's not an error yet,
+ * retry to insert this rule in the collision RTC.
+ */
hws_send_engine_retry_post_send(queue, priv, wqe_cnt);
return;
}
+ hws_send_engine_dump_error_cqe(queue, priv, cqe);
/* Some part of the rule failed */
priv->rule->status = MLX5HWS_RULE_STATUS_FAILING;
*priv->used_id = 0;
@@ -420,7 +535,8 @@ static void hws_send_engine_update(struct mlx5hws_send_engine *queue,
if (priv->user_data) {
if (priv->rule) {
- hws_send_engine_update_rule(queue, priv, wqe_cnt, &status);
+ hws_send_engine_update_rule(queue, priv, wqe_cnt,
+ &status, cqe);
/* Completion is provided on the last rule WQE */
if (priv->rule->pending_wqes)
return;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
index f833092235c1..3fb8e99309b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.h
@@ -140,6 +140,7 @@ struct mlx5hws_send_engine {
u16 used_entries;
u16 num_entries;
bool err;
+ bool error_cqe_printed;
struct mutex lock; /* Protects the send engine */
};
--
2.31.1
next prev parent reply other threads:[~2025-05-11 19:40 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-05-11 19:38 [PATCH net-next 00/10] net/mlx5: HWS, Complex Matchers and rehash mechanism fixes Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 01/10] net/mlx5: HWS, expose function mlx5hws_table_ft_set_next_ft in header Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 02/10] net/mlx5: HWS, add definer function to get field name str Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 03/10] net/mlx5: HWS, expose polling function in header file Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 04/10] net/mlx5: HWS, introduce isolated matchers Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 05/10] net/mlx5: HWS, support complex matchers Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 06/10] net/mlx5: HWS, force rehash when rule insertion failed Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 07/10] net/mlx5: HWS, fix counting of rules in the matcher Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 08/10] net/mlx5: HWS, fix redundant extension of action templates Tariq Toukan
2025-05-11 19:38 ` [PATCH net-next 09/10] net/mlx5: HWS, rework rehash loop Tariq Toukan
2025-05-11 19:38 ` Tariq Toukan [this message]
2025-05-13 22:50 ` [PATCH net-next 00/10] net/mlx5: HWS, Complex Matchers and rehash mechanism fixes patchwork-bot+netdevbpf
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1746992290-568936-11-git-send-email-tariqt@nvidia.com \
--to=tariqt@nvidia.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=gal@nvidia.com \
--cc=kliteyn@nvidia.com \
--cc=kuba@kernel.org \
--cc=leon@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=mbloch@nvidia.com \
--cc=moshe@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=saeedm@nvidia.com \
--cc=vdogaru@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox