DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 12/17] net/cnxk: fix unsigned integer underflow in LSO calculation
From: Rahul Bhansali @ 2026-06-11  7:33 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, Pavan Nikhilesh, Jerin Jacob,
	Rahul Bhansali
  Cc: Aarnav JP, stable
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

Replace branchless mask-based selection with a ternary operator
to resolve Coverity integer underflow warning. The expression
-(!w1.il3type) assigned -1 to a uint64_t variable, which is
well-defined but flagged as an unsigned integer underflow.

Coverity issue: 502004

Fixes: 19f3af2371a7 ("net/cnxk: add Tx burst for CN10K")
Fixes: 39dc567c1955 ("net/cnxk: add Tx burst for CN9K")
Fixes: 006c1daa89b9 ("net/cnxk: support Tx burst scalar for CN20K")
Cc: stable@dpdk.org

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
 drivers/net/cnxk/cn10k_tx.h | 8 ++------
 drivers/net/cnxk/cn20k_tx.h | 8 ++------
 drivers/net/cnxk/cn9k_tx.h  | 8 ++------
 3 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 8c912a1f35..d5cb2c3294 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -1138,10 +1138,8 @@ cn10k_nix_xmit_prepare(struct cn10k_eth_txq *txq,
 	if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_TSO_F &&
 	    (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
 		uint16_t lso_sb;
-		uint64_t mask;
 
-		mask = -(!w1.il3type);
-		lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
+		lso_sb = (w1.il3type ? w1.il4ptr : w1.ol4ptr) + m->l4_len;
 
 		send_hdr_ext->w0.lso_sb = lso_sb;
 		send_hdr_ext->w0.lso = 1;
@@ -1766,13 +1764,11 @@ cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 		      const uint64_t flags, const uint64_t lso_tun_fmt)
 {
 	uint16_t lso_sb;
-	uint64_t mask;
 
 	if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
 		return;
 
-	mask = -(!w1->il3type);
-	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+	lso_sb = (w1->il3type ? w1->il4ptr : w1->ol4ptr) + m->l4_len;
 
 	w0->u |= BIT(14);
 	w0->lso_sb = lso_sb;
diff --git a/drivers/net/cnxk/cn20k_tx.h b/drivers/net/cnxk/cn20k_tx.h
index 8e64d2e352..a1c71f2761 100644
--- a/drivers/net/cnxk/cn20k_tx.h
+++ b/drivers/net/cnxk/cn20k_tx.h
@@ -1117,10 +1117,8 @@ cn20k_nix_xmit_prepare(struct cn20k_eth_txq *txq, struct rte_mbuf *m, struct rte
 	if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_TSO_F &&
 	    (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
 		uint16_t lso_sb;
-		uint64_t mask;
 
-		mask = -(!w1.il3type);
-		lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
+		lso_sb = (w1.il3type ? w1.il4ptr : w1.ol4ptr) + m->l4_len;
 
 		send_hdr_ext->w0.lso_sb = lso_sb;
 		send_hdr_ext->w0.lso = 1;
@@ -1732,13 +1730,11 @@ cn20k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1, union nix
 		      uint64_t ol_flags, const uint64_t flags, const uint64_t lso_tun_fmt)
 {
 	uint16_t lso_sb;
-	uint64_t mask;
 
 	if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
 		return;
 
-	mask = -(!w1->il3type);
-	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+	lso_sb = (w1->il3type ? w1->il4ptr : w1->ol4ptr) + m->l4_len;
 
 	w0->u |= BIT(14);
 	w0->lso_sb = lso_sb;
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 0ec448e36c..2f9b936d56 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -478,10 +478,8 @@ cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_m
 
 	if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
 		uint16_t lso_sb;
-		uint64_t mask;
 
-		mask = -(!w1.il3type);
-		lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
+		lso_sb = (w1.il3type ? w1.il4ptr : w1.ol4ptr) + m->l4_len;
 
 		send_hdr_ext->w0.lso_sb = lso_sb;
 		send_hdr_ext->w0.lso = 1;
@@ -875,13 +873,11 @@ cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 		     uint64_t flags)
 {
 	uint16_t lso_sb;
-	uint64_t mask;
 
 	if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
 		return;
 
-	mask = -(!w1->il3type);
-	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+	lso_sb = (w1->il3type ? w1->il4ptr : w1->ol4ptr) + m->l4_len;
 
 	w0->u |= BIT(14);
 	w0->lso_sb = lso_sb;
-- 
2.34.1


^ permalink raw reply related

* [PATCH 11/17] net/cnxk: enable CPT CQ by default for inline IPsec
From: Rahul Bhansali @ 2026-06-11  7:33 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Aarnav JP
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

CPT Completion Queue is supported on CN20K and provides
hardware-based completion notification, eliminating the
need for software polling. Change the default value of the
cpt_cq_enable devarg from 0 to 1 on platforms that support
CPT CQ, so that it is enabled by default there; the default
stays 0 elsewhere.

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
 doc/guides/nics/cnxk.rst           | 13 +++++++++++++
 drivers/net/cnxk/cnxk_ethdev_sec.c |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/cnxk.rst b/doc/guides/nics/cnxk.rst
index 239ebcd05c..c71029e1dc 100644
--- a/doc/guides/nics/cnxk.rst
+++ b/doc/guides/nics/cnxk.rst
@@ -745,6 +745,19 @@ Runtime Config Options for inline device
    With the above configuration, inline inbound IPsec post-processing
    should be done by the application.
 
+- ``Enable CPT Completion Queue for inline IPsec`` (default ``1`` for CN20K, ``0`` otherwise)
+
+   CPT Completion Queue for inline IPsec event delivery can be enabled or disabled
+   by ``cpt_cq_enable`` devargs parameter.
+   This option is supported on OCTEON CN20K SoC family.
+
+   For example::
+
+      -a 0002:1d:00.0,cpt_cq_enable=1
+
+   With the above configuration, driver would enable CPT completion queue
+   for inline IPsec event delivery instead of using the err-ring poll thread.
+
 Port Representors
 -----------------
 
diff --git a/drivers/net/cnxk/cnxk_ethdev_sec.c b/drivers/net/cnxk/cnxk_ethdev_sec.c
index fa7eacfbe4..61eb55ba43 100644
--- a/drivers/net/cnxk/cnxk_ethdev_sec.c
+++ b/drivers/net/cnxk/cnxk_ethdev_sec.c
@@ -742,7 +742,7 @@ nix_inl_parse_devargs(struct rte_devargs *devargs,
 	uint32_t meta_buf_sz = 0;
 	uint8_t rx_inj_ena = 0;
 	uint8_t selftest = 0;
-	uint8_t cpt_cq_enable = 0;
+	uint8_t cpt_cq_enable = roc_feature_nix_has_cpt_cq_support() ? 1 : 0;
 
 	memset(&cpt_channel, 0, sizeof(cpt_channel));
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH 10/17] common/cnxk: fix event type for soft expiry
From: Rahul Bhansali @ 2026-06-11  7:33 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, Stephen Hemminger
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

Set the event type to its default value before invoking the work
callback for inline soft expiry processing; previously it was passed
uninitialized.

Fixes: 4a6154a7bd27 ("common/cnxk: fix array out-of-bounds")

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 drivers/common/cnxk/roc_nix_inl_dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/common/cnxk/roc_nix_inl_dev.c b/drivers/common/cnxk/roc_nix_inl_dev.c
index 667209b8a0..bfdeef2688 100644
--- a/drivers/common/cnxk/roc_nix_inl_dev.c
+++ b/drivers/common/cnxk/roc_nix_inl_dev.c
@@ -1234,6 +1234,7 @@ inl_outb_soft_exp_poll(struct nix_inl_dev *inl_dev, uint32_t ring_idx)
 
 		if (sa != NULL) {
 			uint64_t tmp[2];
+			tmp[0] = ~0ULL;
 			inl_dev->work_cb(tmp, sa, NIX_INL_SOFT_EXPIRY_THRD, NULL, port_id);
 			__atomic_store_n(ring_base + tail_l + 1, 0ULL, __ATOMIC_RELAXED);
 			__atomic_fetch_add((uint32_t *)ring_base, 1, __ATOMIC_ACQ_REL);
-- 
2.34.1


^ permalink raw reply related

* [PATCH 09/17] common/cnxk: fix cnxk xstats names
From: Rahul Bhansali @ 2026-06-11  7:33 UTC (permalink / raw)
  To: dev, Thomas Monjalon, Nithin Dabilpuram, Kiran Kumar K,
	Sunil Kumar Kori, Satha Rao, Harman Kalra, Rakesh Kudurumalla
  Cc: jerinj, Alok Mishra, stable
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

From: Alok Mishra <almishra@marvell.com>

Prevent out-of-bounds writes when the application provides a smaller
xstats name array. Return the required count when xstats_names is NULL
or when the provided buffer is too small.

Fixes: 825bd1d9d8e6 ("common/cnxk: update extra stats for inline device")
Cc: stable@dpdk.org

Signed-off-by: Alok Mishra <almishra@marvell.com>
---
 .mailmap                            |  1 +
 drivers/common/cnxk/roc_nix_stats.c | 46 ++++++++++++++++-------------
 2 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/.mailmap b/.mailmap
index 0e0d83e1c6..efcb38b6bd 100644
--- a/.mailmap
+++ b/.mailmap
@@ -80,6 +80,7 @@ Alin Rauta <alin.rauta@intel.com>
 Allain Legacy <allain.legacy@windriver.com>
 Allen Hubbe <allen.hubbe@amd.com>
 Alok Makhariya <alok.makhariya@nxp.com>
+Alok Mishra <almishra@marvell.com>
 Alok Prasad <palok@marvell.com>
 Alvaro Karsz <alvaro.karsz@solid-run.com>
 Alvin Zhang <alvinx.zhang@intel.com>
diff --git a/drivers/common/cnxk/roc_nix_stats.c b/drivers/common/cnxk/roc_nix_stats.c
index 6f241c72de..ec2aca8164 100644
--- a/drivers/common/cnxk/roc_nix_stats.c
+++ b/drivers/common/cnxk/roc_nix_stats.c
@@ -503,46 +503,51 @@ roc_nix_xstats_names_get(struct roc_nix *roc_nix,
 	struct idev_cfg *idev = idev_get_cfg();
 	uint64_t i, count = 0;
 
-	PLT_SET_USED(limit);
-
 	for (i = 0; i < CNXK_NIX_NUM_TX_XSTATS; i++) {
-		NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_tx_xstats, i);
+		if (xstats_names && count < limit)
+			NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_tx_xstats, i);
 		count++;
 	}
 
 	for (i = 0; i < CNXK_NIX_NUM_RX_XSTATS; i++) {
-		NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_rx_xstats, i);
+		if (xstats_names && count < limit)
+			NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_rx_xstats, i);
 		count++;
 	}
 
 	if (nix->inb_inl_dev && idev) {
 		if (idev->nix_inl_dev) {
 			for (i = 0; i < CNXK_INL_NIX_NUM_RX_XSTATS; i++) {
-				NIX_XSTATS_NAME_PRINT(xstats_names, count,
-						      inl_nix_rx_xstats, i);
+				if (xstats_names && count < limit)
+					NIX_XSTATS_NAME_PRINT(xstats_names, count,
+							      inl_nix_rx_xstats, i);
 				count++;
 			}
 			for (i = 0; i < CNXK_INL_NIX_RQ_XSTATS; i++) {
-				NIX_XSTATS_NAME_PRINT(xstats_names, count,
-						      inl_nix_rq_xstats, i);
+				if (xstats_names && count < limit)
+					NIX_XSTATS_NAME_PRINT(xstats_names, count,
+							      inl_nix_rq_xstats, i);
 				count++;
 			}
 			for (i = 0; i < PLT_DIM(inl_sw_xstats); i++) {
-				NIX_XSTATS_NAME_PRINT(xstats_names, count, inl_sw_xstats, i);
+				if (xstats_names && count < limit)
+					NIX_XSTATS_NAME_PRINT(xstats_names, count, inl_sw_xstats,
+							      i);
 				count++;
 			}
 		}
 	}
 
 	for (i = 0; i < CNXK_NIX_NUM_QUEUE_XSTATS; i++) {
-		NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_q_xstats, i);
+		if (xstats_names && count < limit)
+			NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_q_xstats, i);
 		count++;
 	}
 
 	if (roc_model_is_cn10k() || roc_model_is_cn20k()) {
 		for (i = 0; i < CNXK_NIX_NUM_CN10K_RX_XSTATS; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_cn10k_rx_xstats, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_cn10k_rx_xstats, i);
 			count++;
 		}
 	}
@@ -552,30 +557,29 @@ roc_nix_xstats_names_get(struct roc_nix *roc_nix,
 
 	if (roc_model_is_cn9k()) {
 		for (i = 0; i < CNXK_NIX_NUM_RX_XSTATS_CGX; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_rx_xstats_cgx, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_rx_xstats_cgx, i);
 			count++;
 		}
 
 		for (i = 0; i < CNXK_NIX_NUM_TX_XSTATS_CGX; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_tx_xstats_cgx, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_tx_xstats_cgx, i);
 			count++;
 		}
 
 	} else {
 		for (i = 0; i < CNXK_NIX_NUM_RX_XSTATS_RPM; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_rx_xstats_rpm, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_rx_xstats_rpm, i);
 			count++;
 		}
 
 		for (i = 0; i < CNXK_NIX_NUM_TX_XSTATS_RPM; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_tx_xstats_rpm, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_tx_xstats_rpm, i);
 			count++;
 		}
 	}
-
 	return count;
 }
-- 
2.34.1


^ permalink raw reply related

* [PATCH 08/17] net/cnxk: update inbound SA pkind for skip size
From: Rahul Bhansali @ 2026-06-11  7:33 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rakesh Kudurumalla
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

From: Rakesh Kudurumalla <rkudurumalla@marvell.com>

Update the inbound SA pkind using roc_npc_skip_size_pkind_get()
during session create and session update for both CN10K and CN20K.
This ensures the CPT second pass uses the correct pkind when
skip size is configured, retaining the default pkind otherwise.

Signed-off-by: Rakesh Kudurumalla <rkudurumalla@marvell.com>
---
 drivers/net/cnxk/cn10k_ethdev_sec.c | 9 +++++++++
 drivers/net/cnxk/cn20k_ethdev_sec.c | 8 ++++++++
 2 files changed, 17 insertions(+)

diff --git a/drivers/net/cnxk/cn10k_ethdev_sec.c b/drivers/net/cnxk/cn10k_ethdev_sec.c
index 855bea1796..2f1fdf34fc 100644
--- a/drivers/net/cnxk/cn10k_ethdev_sec.c
+++ b/drivers/net/cnxk/cn10k_ethdev_sec.c
@@ -853,6 +853,10 @@ cn10k_eth_sec_session_create(void *device,
 			goto err;
 		}
 
+		rc = roc_npc_skip_size_pkind_get(&dev->npc);
+		if (rc >= 0)
+			inb_sa_dptr->w0.s.pkind = rc;
+
 		inb_priv = roc_nix_inl_ot_ipsec_inb_sa_sw_rsvd(inb_sa);
 		/* Back pointer to get eth_sec */
 		inb_priv->eth_sec = eth_sec;
@@ -1151,6 +1155,11 @@ cn10k_eth_sec_session_update(void *device, struct rte_security_session *sess,
 		rc = cnxk_ot_ipsec_inb_sa_fill(inb_sa_dptr, ipsec, crypto, 0);
 		if (rc)
 			goto err;
+
+		rc = roc_npc_skip_size_pkind_get(&dev->npc);
+		if (rc >= 0)
+			inb_sa_dptr->w0.s.pkind = rc;
+
 		/* Use cookie for original data */
 		inb_sa_dptr->w1.s.cookie = inb_sa->w1.s.cookie;
 
diff --git a/drivers/net/cnxk/cn20k_ethdev_sec.c b/drivers/net/cnxk/cn20k_ethdev_sec.c
index 5d0debb81d..31f2518ea3 100644
--- a/drivers/net/cnxk/cn20k_ethdev_sec.c
+++ b/drivers/net/cnxk/cn20k_ethdev_sec.c
@@ -865,6 +865,10 @@ cn20k_eth_sec_session_create(void *device, struct rte_security_session_conf *con
 			goto err;
 		}
 
+		rc = roc_npc_skip_size_pkind_get(&dev->npc);
+		if (rc >= 0)
+			inb_sa_dptr->w0.s.pkind = rc;
+
 		cn20k_eth_sec_inb_sa_misc_fill(inb_sa_dptr, ipsec);
 
 		inb_priv = roc_nix_inl_ow_ipsec_inb_sa_sw_rsvd(inb_sa);
@@ -1137,6 +1141,10 @@ cn20k_eth_sec_session_update(void *device, struct rte_security_session *sess,
 		if (rc)
 			return -EINVAL;
 
+		rc = roc_npc_skip_size_pkind_get(&dev->npc);
+		if (rc >= 0)
+			inb_sa_dptr->w0.s.pkind = rc;
+
 		cn20k_eth_sec_inb_sa_misc_fill(inb_sa_dptr, ipsec);
 
 		/* Use cookie for original data */
-- 
2.34.1


^ permalink raw reply related

* [PATCH 07/17] drivers: add support for devargs skip size
From: Rahul Bhansali @ 2026-06-11  7:33 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

From: Kiran Kumar K <kirankumark@marvell.com>

Add support for the "skip_size" switch header type and the
skip_size_info devargs parameter to the cnxk driver.
This allows users to specify the number of bytes to skip before the
Ethernet (L2) header when parsing incoming packets.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
 doc/guides/nics/cnxk.rst                      | 19 +++++++-
 drivers/common/cnxk/roc_mbox.h                | 12 ++++-
 drivers/common/cnxk/roc_nix.h                 | 14 ++----
 drivers/common/cnxk/roc_nix_ops.c             | 46 +++++++++++++++++--
 drivers/common/cnxk/roc_npc.c                 | 44 +++++++++++++++++-
 drivers/common/cnxk/roc_npc.h                 |  2 +
 drivers/common/cnxk/roc_npc_priv.h            | 11 +++++
 .../common/cnxk/roc_platform_base_symbols.c   |  1 +
 drivers/net/cnxk/cnxk_eswitch.c               |  2 +-
 drivers/net/cnxk/cnxk_ethdev.c                |  7 ++-
 drivers/net/cnxk/cnxk_ethdev_devargs.c        | 29 +++++++++++-
 11 files changed, 163 insertions(+), 24 deletions(-)

diff --git a/doc/guides/nics/cnxk.rst b/doc/guides/nics/cnxk.rst
index b5bd50ceea..239ebcd05c 100644
--- a/doc/guides/nics/cnxk.rst
+++ b/doc/guides/nics/cnxk.rst
@@ -183,8 +183,8 @@ Runtime Config Options
 
    With the above configuration, higig2 will be enabled on that port and the
    traffic on this port should be higig2 traffic only. Supported switch header
-   types are "chlen24b", "chlen90b", "dsa", "exdsa", "higig2", "vlan_exdsa" and
-   "pre_l2".
+   types are "chlen24b", "chlen90b", "dsa", "exdsa", "higig2", "vlan_exdsa",
+   "pre_l2" and "skip_size".
 
 - ``Flow pre_l2 info`` (default ``0x0/0x0/0x0``)
 
@@ -212,6 +212,21 @@ Runtime Config Options
    is 0 (i.e., left shift) then the shift count will be 1, that is, (8 - n),
    where n is the absolute position of leftmost set bit.
 
+- ``Skip size info`` (default ``0x0``)
+
+   When the switch header type is set to "skip_size", the number of bytes to
+   skip before the Ethernet header can be configured using ``skip_size_info``
+   ``devargs`` parameter. The value is in hexadecimal format and the valid
+   range is 0x0 to 0xff. This configures the PKIND so that the NPC parser
+   skips the specified number of bytes.
+
+   For example::
+
+      -a 0002:02:00.0,switch_header="skip_size",skip_size_info=0x2
+
+   With the above configuration, 2 bytes will be skipped before the Ethernet
+   header when parsing the incoming packets.
+
 - ``RSS tag as XOR`` (default ``0``)
 
    The HW gives two options to configure the RSS adder i.e
diff --git a/drivers/common/cnxk/roc_mbox.h b/drivers/common/cnxk/roc_mbox.h
index e31abf2234..1158ff50a7 100644
--- a/drivers/common/cnxk/roc_mbox.h
+++ b/drivers/common/cnxk/roc_mbox.h
@@ -462,8 +462,11 @@ struct ready_msg_rsp {
 };
 
 enum npc_pkind_type {
+	NPC_RX_SKIP_SIZE_PKIND = 46ULL,
+	NPC_RX_CPT_SKIP_SIZE_PKIND = 50ULL,
+	NPC_RX_CPT_HDR_PTP_PKIND = 54ULL,
 	NPC_RX_CUSTOM_PRE_L2_PKIND = 55ULL,
-	NPC_RX_VLAN_EXDSA_PKIND = 56ULL,
+	NPC_RX_VLAN_EXDSA_PKIND,
 	NPC_RX_CHLEN24B_PKIND,
 	NPC_RX_CPT_HDR_PKIND,
 	NPC_RX_CHLEN90B_PKIND,
@@ -474,6 +477,8 @@ enum npc_pkind_type {
 	NPC_TX_DEF_PKIND,
 };
 
+#define NPC_SKIP_SIZE_PKIND_MAX 4
+
 /* Struct to set pkind */
 struct npc_set_pkind {
 	struct mbox_msghdr hdr;
@@ -484,6 +489,7 @@ struct npc_set_pkind {
 #define ROC_PRIV_FLAGS_EXDSA	  BIT_ULL(4)
 #define ROC_PRIV_FLAGS_VLAN_EXDSA BIT_ULL(5)
 #define ROC_PRIV_FLAGS_PRE_L2	  BIT_ULL(6)
+#define ROC_PRIV_FLAGS_SKIP_SIZE  BIT_ULL(7)
 #define ROC_PRIV_FLAGS_CUSTOM	  BIT_ULL(63)
 	uint64_t __io mode;
 #define PKIND_TX BIT_ULL(0)
@@ -499,6 +505,10 @@ struct npc_set_pkind {
 	/* Shift direction to get length of the
 	 * header at var_len_off
 	 */
+	uint8_t __io skip_size;
+	/* Number of bytes to skip before the Ethernet header.
+	 * Valid only when the custom flag is set.
+	 */
 };
 
 /* Structure for requesting resource provisioning.
diff --git a/drivers/common/cnxk/roc_nix.h b/drivers/common/cnxk/roc_nix.h
index 8ba8b3e0b6..49ede85f9a 100644
--- a/drivers/common/cnxk/roc_nix.h
+++ b/drivers/common/cnxk/roc_nix.h
@@ -990,18 +990,14 @@ int __roc_api roc_nix_mac_stats_reset(struct roc_nix *roc_nix);
 int __roc_api roc_nix_mac_fwdata_get(struct roc_nix *roc_nix, struct roc_nix_mac_fwdata *fwdata);
 
 /* Ops */
-int __roc_api roc_nix_switch_hdr_set(struct roc_nix *roc_nix,
-				     uint64_t switch_header_type,
-				     uint8_t pre_l2_size_offset,
-				     uint8_t pre_l2_size_offset_mask,
-				     uint8_t pre_l2_size_shift_dir);
+int __roc_api roc_nix_switch_hdr_set(struct roc_nix *roc_nix, uint64_t switch_header_type,
+				     uint8_t pre_l2_size_offset, uint8_t pre_l2_size_offset_mask,
+				     uint8_t pre_l2_size_shift_dir, uint8_t skip_size);
 int __roc_api roc_nix_lso_fmt_setup(struct roc_nix *roc_nix);
-int __roc_api roc_nix_lso_fmt_get(struct roc_nix *roc_nix,
-				  uint8_t udp_tun[ROC_NIX_LSO_TUN_MAX],
+int __roc_api roc_nix_lso_fmt_get(struct roc_nix *roc_nix, uint8_t udp_tun[ROC_NIX_LSO_TUN_MAX],
 				  uint8_t tun[ROC_NIX_LSO_TUN_MAX]);
 int __roc_api roc_nix_lso_fmt_ipv4_frag_get(struct roc_nix *roc_nix);
-int __roc_api roc_nix_lso_custom_fmt_setup(struct roc_nix *roc_nix,
-					   struct nix_lso_format *fields,
+int __roc_api roc_nix_lso_custom_fmt_setup(struct roc_nix *roc_nix, struct nix_lso_format *fields,
 					   uint16_t nb_fields);
 int __roc_api roc_nix_lso_alt_flags_profile_setup(struct roc_nix *roc_nix,
 						  nix_lso_alt_flg_format_t *fmt);
diff --git a/drivers/common/cnxk/roc_nix_ops.c b/drivers/common/cnxk/roc_nix_ops.c
index 4653bb2049..13a548216b 100644
--- a/drivers/common/cnxk/roc_nix_ops.c
+++ b/drivers/common/cnxk/roc_nix_ops.c
@@ -501,17 +501,49 @@ roc_nix_lso_fmt_get(struct roc_nix *roc_nix,
 	return 0;
 }
 
+static int
+skip_size_pkind_get(uint8_t skip_size, uint8_t *pkind)
+{
+	struct skip_size_pkind_cfg *cfg;
+	const struct plt_memzone *mz;
+	int i;
+
+	mz = plt_memzone_lookup(SKIP_SIZE_PKIND_MEMZONE);
+	if (!mz)
+		return -ENOMEM;
+	cfg = mz->addr;
+
+	for (i = 0; i < cfg->count; i++) {
+		if (cfg->entries[i].skip_size == skip_size) {
+			*pkind = cfg->entries[i].pkind;
+			return 0;
+		}
+	}
+
+	if (cfg->count >= NPC_SKIP_SIZE_PKIND_MAX) {
+		plt_err("skip_size PKIND limit (%d) reached", NPC_SKIP_SIZE_PKIND_MAX);
+		return -ENOSPC;
+	}
+
+	i = cfg->count;
+	cfg->entries[i].skip_size = skip_size;
+	cfg->entries[i].pkind = NPC_RX_SKIP_SIZE_PKIND + i;
+	*pkind = cfg->entries[i].pkind;
+	cfg->count++;
+	return 0;
+}
+
 int
 roc_nix_switch_hdr_set(struct roc_nix *roc_nix, uint64_t switch_header_type,
-		       uint8_t pre_l2_size_offset,
-		       uint8_t pre_l2_size_offset_mask,
-		       uint8_t pre_l2_size_shift_dir)
+		       uint8_t pre_l2_size_offset, uint8_t pre_l2_size_offset_mask,
+		       uint8_t pre_l2_size_shift_dir, uint8_t skip_size)
 {
 	struct nix *nix = roc_nix_to_nix_priv(roc_nix);
 	struct dev *dev = &nix->dev;
 	struct mbox *mbox = mbox_get(dev->mbox);
 	struct npc_set_pkind *req;
 	struct msg_resp *rsp;
+	uint8_t pkind = 0;
 	int rc = -ENOSPC;
 
 	if (switch_header_type == 0)
@@ -524,6 +556,7 @@ roc_nix_switch_hdr_set(struct roc_nix *roc_nix, uint64_t switch_header_type,
 	    switch_header_type != ROC_PRIV_FLAGS_EXDSA &&
 	    switch_header_type != ROC_PRIV_FLAGS_VLAN_EXDSA &&
 	    switch_header_type != ROC_PRIV_FLAGS_PRE_L2 &&
+	    switch_header_type != ROC_PRIV_FLAGS_SKIP_SIZE &&
 	    switch_header_type != ROC_PRIV_FLAGS_CUSTOM) {
 		plt_err("switch header type is not supported");
 		rc = NIX_ERR_PARAM;
@@ -564,6 +597,13 @@ roc_nix_switch_hdr_set(struct roc_nix *roc_nix, uint64_t switch_header_type,
 		req->var_len_off = pre_l2_size_offset;
 		req->var_len_off_mask = pre_l2_size_offset_mask;
 		req->shift_dir = pre_l2_size_shift_dir;
+	} else if (switch_header_type == ROC_PRIV_FLAGS_SKIP_SIZE) {
+		rc = skip_size_pkind_get(skip_size, &pkind);
+		if (rc)
+			goto exit;
+		req->mode = ROC_PRIV_FLAGS_CUSTOM;
+		req->pkind = pkind;
+		req->skip_size = skip_size;
 	}
 
 	req->dir = PKIND_RX;
diff --git a/drivers/common/cnxk/roc_npc.c b/drivers/common/cnxk/roc_npc.c
index a906fe0413..111ad0e8bb 100644
--- a/drivers/common/cnxk/roc_npc.c
+++ b/drivers/common/cnxk/roc_npc.c
@@ -420,6 +420,16 @@ roc_npc_init(struct roc_npc *roc_npc)
 
 	roc_npc->flow_age.age_flow_refcnt = 0;
 
+	/* Create skip-size PKIND memzone if it doesn't exist */
+	if (!plt_memzone_lookup(SKIP_SIZE_PKIND_MEMZONE)) {
+		const struct plt_memzone *mz;
+
+		mz = plt_memzone_reserve_cache_align(SKIP_SIZE_PKIND_MEMZONE,
+						     sizeof(struct skip_size_pkind_cfg));
+		if (mz != NULL)
+			memset(mz->addr, 0, sizeof(struct skip_size_pkind_cfg));
+	}
+
 	return rc;
 
 done:
@@ -457,12 +467,42 @@ roc_npc_fini(struct roc_npc *roc_npc)
 		npc->prio_flow_list = NULL;
 	}
 
+	{
+		const struct plt_memzone *mz;
+
+		mz = plt_memzone_lookup(SKIP_SIZE_PKIND_MEMZONE);
+		if (mz)
+			plt_memzone_free(mz);
+	}
+
 	return 0;
 }
 
 int
-roc_npc_validate_portid_action(struct roc_npc *roc_npc_src,
-			       struct roc_npc *roc_npc_dst)
+roc_npc_skip_size_pkind_get(struct roc_npc *roc_npc)
+{
+	struct skip_size_pkind_cfg *cfg;
+	const struct plt_memzone *mz;
+	int i;
+
+	if (roc_npc->switch_header_type != ROC_PRIV_FLAGS_SKIP_SIZE)
+		return -1;
+
+	mz = plt_memzone_lookup(SKIP_SIZE_PKIND_MEMZONE);
+	if (!mz)
+		return -1;
+	cfg = mz->addr;
+
+	for (i = 0; i < cfg->count; i++) {
+		if (cfg->entries[i].skip_size == roc_npc->skip_size)
+			return cfg->entries[i].pkind + NPC_SKIP_SIZE_PKIND_MAX;
+	}
+
+	return -1;
+}
+
+int
+roc_npc_validate_portid_action(struct roc_npc *roc_npc_src, struct roc_npc *roc_npc_dst)
 {
 	struct roc_nix *roc_nix_src = roc_npc_src->roc_nix;
 	struct nix *nix_src = roc_nix_to_nix_priv(roc_nix_src);
diff --git a/drivers/common/cnxk/roc_npc.h b/drivers/common/cnxk/roc_npc.h
index 130990bda7..a7254f35ca 100644
--- a/drivers/common/cnxk/roc_npc.h
+++ b/drivers/common/cnxk/roc_npc.h
@@ -423,6 +423,7 @@ struct roc_npc {
 					  */
 	uint8_t pre_l2_size_shift_dir;	 /**< Shift direction to calculate size
 					  */
+	uint8_t skip_size;		 /**< Switch header skip size */
 	uint16_t flow_prealloc_size;
 	uint16_t flow_max_priority;
 	uint16_t channel;
@@ -506,4 +507,5 @@ void __roc_api roc_npc_sdp_channel_get(struct roc_npc *roc_npc, uint16_t *chan_b
 				       uint16_t *chan_mask);
 int __roc_api roc_npc_mcam_get_stats(struct roc_npc *roc_npc, struct roc_npc_flow *flow,
 				     uint64_t *count);
+int __roc_api roc_npc_skip_size_pkind_get(struct roc_npc *roc_npc);
 #endif /* _ROC_NPC_H_ */
diff --git a/drivers/common/cnxk/roc_npc_priv.h b/drivers/common/cnxk/roc_npc_priv.h
index f8f4489f06..6a27f0e0fa 100644
--- a/drivers/common/cnxk/roc_npc_priv.h
+++ b/drivers/common/cnxk/roc_npc_priv.h
@@ -554,4 +554,15 @@ int npc_aging_ctrl_thread_create(struct roc_npc *roc_npc, const struct roc_npc_a
 				 struct roc_npc_flow *flow);
 void npc_aging_ctrl_thread_destroy(struct roc_npc *roc_npc);
 int npc_rss_free_grp_get(struct npc *npc, uint32_t *pos);
+
+#define SKIP_SIZE_PKIND_MEMZONE "roc_skip_size_pkind_cfg"
+
+struct skip_size_pkind_cfg {
+	uint8_t count;
+	struct {
+		uint8_t skip_size;
+		uint8_t pkind;
+	} entries[NPC_SKIP_SIZE_PKIND_MAX];
+};
+
 #endif /* _ROC_NPC_PRIV_H_ */
diff --git a/drivers/common/cnxk/roc_platform_base_symbols.c b/drivers/common/cnxk/roc_platform_base_symbols.c
index ed34d4b05b..d1c9f2304d 100644
--- a/drivers/common/cnxk/roc_platform_base_symbols.c
+++ b/drivers/common/cnxk/roc_platform_base_symbols.c
@@ -492,6 +492,7 @@ RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_fini)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_validate_portid_action)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_flow_parse)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_sdp_channel_get)
+RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_skip_size_pkind_get)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_flow_create)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_flow_destroy)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_flow_dump)
diff --git a/drivers/net/cnxk/cnxk_eswitch.c b/drivers/net/cnxk/cnxk_eswitch.c
index e45c7dfd07..7e717a2fbf 100644
--- a/drivers/net/cnxk/cnxk_eswitch.c
+++ b/drivers/net/cnxk/cnxk_eswitch.c
@@ -553,7 +553,7 @@ nix_lf_setup(struct cnxk_eswitch_dev *eswitch_dev)
 		goto free_cqs;
 	}
 
-	rc = roc_nix_switch_hdr_set(nix, 0, 0, 0, 0);
+	rc = roc_nix_switch_hdr_set(nix, 0, 0, 0, 0, 0);
 	if (rc) {
 		plt_err("switch hdr set failed = %s(%d)", roc_error_msg_get(rc), rc);
 		goto free_cqs;
diff --git a/drivers/net/cnxk/cnxk_ethdev.c b/drivers/net/cnxk/cnxk_ethdev.c
index c782dc51a8..a21e170229 100644
--- a/drivers/net/cnxk/cnxk_ethdev.c
+++ b/drivers/net/cnxk/cnxk_ethdev.c
@@ -1639,10 +1639,9 @@ cnxk_nix_configure(struct rte_eth_dev *eth_dev)
 		goto free_nix_lf;
 	}
 
-	rc = roc_nix_switch_hdr_set(nix, dev->npc.switch_header_type,
-				    dev->npc.pre_l2_size_offset,
+	rc = roc_nix_switch_hdr_set(nix, dev->npc.switch_header_type, dev->npc.pre_l2_size_offset,
 				    dev->npc.pre_l2_size_offset_mask,
-				    dev->npc.pre_l2_size_shift_dir);
+				    dev->npc.pre_l2_size_shift_dir, dev->npc.skip_size);
 	if (rc) {
 		plt_err("Failed to enable switch type nix_lf rc=%d", rc);
 		goto free_nix_lf;
@@ -2364,7 +2363,7 @@ cnxk_eth_dev_uninit(struct rte_eth_dev *eth_dev, bool reset)
 		return 0;
 
 	/* Disable switch hdr pkind */
-	roc_nix_switch_hdr_set(&dev->nix, 0, 0, 0, 0);
+	roc_nix_switch_hdr_set(&dev->nix, 0, 0, 0, 0, 0);
 
 	/* Clear the flag since we are closing down */
 	dev->configured = 0;
diff --git a/drivers/net/cnxk/cnxk_ethdev_devargs.c b/drivers/net/cnxk/cnxk_ethdev_devargs.c
index da8fc83f9d..ea18090919 100644
--- a/drivers/net/cnxk/cnxk_ethdev_devargs.c
+++ b/drivers/net/cnxk/cnxk_ethdev_devargs.c
@@ -239,6 +239,25 @@ parse_switch_header_type(const char *key, const char *value, void *extra_args)
 	if (strcmp(value, "pre_l2") == 0)
 		*(uint16_t *)extra_args = ROC_PRIV_FLAGS_PRE_L2;
 
+	if (strcmp(value, "skip_size") == 0)
+		*(uint16_t *)extra_args = ROC_PRIV_FLAGS_SKIP_SIZE;
+
+	return 0;
+}
+
+static int
+parse_skip_size_info(const char *key, const char *value, void *extra_args)
+{
+	RTE_SET_USED(key);
+	uint32_t val;
+
+	errno = 0;
+	val = strtoul(value, NULL, 0);
+	if (errno || val > 255)
+		return -EINVAL;
+
+	*(uint16_t *)extra_args = val;
+
 	return 0;
 }
 
@@ -303,6 +322,7 @@ parse_val_u16(const char *key, const char *value, void *extra_args)
 #define CNXK_FORCE_TAIL_DROP	  "force_tail_drop"
 #define CNXK_DIS_XQE_DROP	  "disable_xqe_drop"
 #define CNXK_RXC_STEP		  "rxc_step"
+#define CNXK_SKIP_SIZE_INFO	  "skip_size_info"
 
 int
 cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
@@ -317,6 +337,7 @@ cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 	uint16_t custom_meta_aura_dis = 0;
 	uint16_t flow_prealloc_size = 1;
 	uint16_t switch_header_type = 0;
+	uint16_t skip_size_info = 0;
 	uint16_t flow_max_priority = 3;
 	uint16_t outb_nb_crypto_qs = 1;
 	uint32_t ipsec_in_min_spi = 0;
@@ -392,6 +413,8 @@ cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 	rte_kvargs_process(kvlist, CNXK_FORCE_TAIL_DROP, &parse_flag, &force_tail_drop);
 	rte_kvargs_process(kvlist, CNXK_DIS_XQE_DROP, &parse_flag, &dis_xqe_drop);
 	rte_kvargs_process(kvlist, CNXK_RXC_STEP, &parse_rxc_step, &rxc_step);
+	rte_kvargs_process(kvlist, CNXK_SKIP_SIZE_INFO, &parse_skip_size_info,
+			   &skip_size_info);
 	rte_kvargs_free(kvlist);
 
 null_devargs:
@@ -424,6 +447,7 @@ cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 		dev->npc.flow_max_priority = flow_max_priority;
 
 	dev->npc.switch_header_type = switch_header_type;
+	dev->npc.skip_size = skip_size_info;
 	dev->npc.sdp_channel = sdp_chan.channel;
 	dev->npc.sdp_channel_mask = sdp_chan.mask;
 	dev->npc.is_sdp_mask_set = sdp_chan.is_sdp_mask_set;
@@ -448,7 +472,7 @@ RTE_PMD_REGISTER_PARAM_STRING(net_cnxk,
 			      CNXK_MAX_SQB_COUNT "=<8-512>"
 			      CNXK_FLOW_PREALLOC_SIZE "=<1-32>"
 			      CNXK_FLOW_MAX_PRIORITY "=<1-32>"
-			      CNXK_SWITCH_HEADER_TYPE "=<higig2|dsa|chlen90b>"
+			      CNXK_SWITCH_HEADER_TYPE "=<higig2|dsa|chlen90b|skip_size>"
 			      CNXK_RSS_TAG_AS_XOR "=1"
 			      CNXK_IPSEC_IN_MAX_SPI "=<1-65535>"
 			      CNXK_OUTB_NB_DESC "=<1-65535>"
@@ -463,4 +487,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_cnxk,
 			      CNXK_CUSTOM_META_AURA_DIS "=1"
 			      CNXK_FORCE_TAIL_DROP "=1"
 			      CNXK_DIS_XQE_DROP "=1"
-			      CNXK_RXC_STEP "=<0-1048575>");
+			      CNXK_RXC_STEP "=<0-1048575>"
+			      CNXK_SKIP_SIZE_INFO "=<0x0-0xff>");
-- 
2.34.1


^ permalink raw reply related

* [PATCH 06/17] net/cnxk: reserve memory for lookup mem at probe
From: Rahul Bhansali @ 2026-06-11  7:33 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

From: Nithin Dabilpuram <ndabilpuram@marvell.com>

Reserve the lookup memory, which is global to all cnxk ethdev
devices, at probe time to avoid a race at a later stage.

Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
---
 drivers/net/cnxk/cnxk_ethdev.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/cnxk/cnxk_ethdev.c b/drivers/net/cnxk/cnxk_ethdev.c
index 06d1c9b362..c782dc51a8 100644
--- a/drivers/net/cnxk/cnxk_ethdev.c
+++ b/drivers/net/cnxk/cnxk_ethdev.c
@@ -2220,6 +2220,12 @@ cnxk_eth_dev_init(struct rte_eth_dev *eth_dev)
 	/* Register callback for inline meta pool create 1:N pool:aura */
 	roc_nix_inl_custom_meta_pool_cb_register(cnxk_nix_inl_custom_meta_pool_cb);
 
+	/* Reserve memory for lookup_memory */
+	if (!cnxk_nix_fastpath_lookup_mem_get()) {
+		plt_err("Failed to reserve lookup memory rc=%d", rc);
+		goto dev_fini;
+	}
+
 	dev->eth_dev = eth_dev;
 	dev->configured = 0;
 	dev->ptype_disable = 0;
-- 
2.34.1


^ permalink raw reply related

* [PATCH 05/17] common/cnxk: configure LSO mask for single segments
From: Rahul Bhansali @ 2026-06-11  7:32 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

Configures LSO flag mask for single packets.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 drivers/common/cnxk/roc_nix_ops.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/common/cnxk/roc_nix_ops.c b/drivers/common/cnxk/roc_nix_ops.c
index 12a12c6e35..4653bb2049 100644
--- a/drivers/common/cnxk/roc_nix_ops.c
+++ b/drivers/common/cnxk/roc_nix_ops.c
@@ -239,6 +239,8 @@ nix_lso_ipv4(struct roc_nix *roc_nix)
 
 	/* First get flags profile to update v4 flags */
 	memset(&alt_flags, 0, sizeof(alt_flags));
+	alt_flags.s.alt_ssf_set = 0;
+	alt_flags.s.alt_ssf_mask = 0xFFFF;
 	alt_flags.s.alt_fsf_set = 0x2000;
 	alt_flags.s.alt_fsf_mask = 0x5FFF;
 	alt_flags.s.alt_msf_set = 0x2000;
-- 
2.34.1


^ permalink raw reply related

* [PATCH 04/17] common/cnxk: update NIX irq handler
From: Rahul Bhansali @ 2026-06-11  7:32 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

Move queue context dump and register print before interrupt
clear in NIX irq handler.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 drivers/common/cnxk/roc_nix_irq.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/common/cnxk/roc_nix_irq.c b/drivers/common/cnxk/roc_nix_irq.c
index 2b731302cd..6874435a4e 100644
--- a/drivers/common/cnxk/roc_nix_irq.c
+++ b/drivers/common/cnxk/roc_nix_irq.c
@@ -168,7 +168,7 @@ nix_lf_q_irq_get_and_clear(struct nix *nix, uint16_t q, uint32_t off,
 	reg = roc_atomic64_add_nosync(wdata, (int64_t *)(nix->base + off));
 
 	if (reg & BIT_ULL(42) /* OP_ERR */) {
-		plt_err("Failed execute irq get off=0x%x", off);
+		plt_err("Failed execute irq get queue=%d off=0x%x", q, off);
 		return 0;
 	}
 	qint = reg & 0xff;
@@ -262,6 +262,10 @@ nix_lf_q_irq(void *param)
 	plt_err("Queue_intr=0x%" PRIx64 " qintx=%d pf=%d, vf=%d", intr, qintx,
 		dev->pf, dev->vf);
 
+	/* Dump registers to std out */
+	roc_nix_lf_reg_dump(nix_priv_to_roc_nix(nix), NULL);
+	roc_nix_queues_ctx_dump(nix_priv_to_roc_nix(nix), NULL);
+
 	/* Handle RQ interrupts */
 	for (q = 0; q < nix->nb_rx_queues; q++) {
 		rq = q % nix->qints;
@@ -323,10 +327,6 @@ nix_lf_q_irq(void *param)
 	/* Clear interrupt */
 	plt_write64(intr, nix->base + NIX_LF_QINTX_INT(qintx));
 
-	/* Dump registers to std out */
-	roc_nix_lf_reg_dump(nix_priv_to_roc_nix(nix), NULL);
-	roc_nix_queues_ctx_dump(nix_priv_to_roc_nix(nix), NULL);
-
 	/* Call reset callback */
 	if (intr_cb && dev->ops->q_err_cb)
 		dev->ops->q_err_cb(nix_priv_to_roc_nix(nix), NULL);
-- 
2.34.1


^ permalink raw reply related

* [PATCH 03/17] common/cnxk: additional NIX SQ ctx fields prints
From: Rahul Bhansali @ 2026-06-11  7:32 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

Additional debug prints for CN20k NIX SQ ctx dump

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 drivers/common/cnxk/roc_nix_debug.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/common/cnxk/roc_nix_debug.c b/drivers/common/cnxk/roc_nix_debug.c
index 11994bf131..07a18d4503 100644
--- a/drivers/common/cnxk/roc_nix_debug.c
+++ b/drivers/common/cnxk/roc_nix_debug.c
@@ -540,6 +540,8 @@ nix_cn10k_lf_sq_dump(__io struct nix_cn10k_sq_ctx_s *ctx, uint32_t *sqb_aura_p,
 static inline void
 nix_lf_sq_dump(__io struct nix_cn20k_sq_ctx_s *ctx, uint32_t *sqb_aura_p, FILE *file)
 {
+	int64_t *sq_cnt_ptr = NULL;
+
 	nix_dump(file, "W0: sqe_way_mask \t\t%d\nW0: cq \t\t\t\t%d",
 		 ctx->sqe_way_mask, ctx->cq);
 	nix_dump(file, "W0: sdp_mcast \t\t\t%d\nW0: substream \t\t\t0x%03x",
@@ -561,6 +563,7 @@ nix_lf_sq_dump(__io struct nix_cn20k_sq_ctx_s *ctx, uint32_t *sqb_aura_p, FILE *
 	nix_dump(file, "W2: smq_rr_count[ub:lb] \t\t%x:%x\n", ctx->smq_rr_count_ub,
 		 ctx->smq_rr_count_lb);
 
+	nix_dump(file, "W3: update_sq_count\t\t%d\n", ctx->update_sq_count);
 	nix_dump(file, "W3: smq_next_sq_vld\t\t%d\nW3: smq_pend\t\t\t%d",
 		 ctx->smq_next_sq_vld, ctx->smq_pend);
 	nix_dump(file, "W3: smenq_next_sqb_vld  \t%d\nW3: head_offset\t\t\t%d",
@@ -588,6 +591,11 @@ nix_lf_sq_dump(__io struct nix_cn20k_sq_ctx_s *ctx, uint32_t *sqb_aura_p, FILE *
 		 ctx->vfi_lso_sizem1);
 	nix_dump(file, "W9: vfi_lso_total\t\t%d", ctx->vfi_lso_total);
 
+	nix_dump(file, "W10: sq_count_iova \t\t0x%" PRIx64 "", (uint64_t)ctx->sq_count_iova);
+	sq_cnt_ptr = (int64_t *)(uintptr_t)(ctx->sq_count_iova << 3);
+	if (sq_cnt_ptr && ctx->update_sq_count)
+		nix_dump(file, "sq_count value \t\t0x%" PRIx64 "",
+			 plt_atomic_load_explicit(sq_cnt_ptr, plt_memory_order_relaxed));
 	nix_dump(file, "W10: scm_lso_rem \t\t0x%" PRIx64 "", (uint64_t)ctx->scm_lso_rem);
 	nix_dump(file, "W11: octs \t\t\t0x%" PRIx64 "", (uint64_t)ctx->octs);
 	nix_dump(file, "W12: pkts \t\t\t0x%" PRIx64 "", (uint64_t)ctx->pkts);
-- 
2.34.1


^ permalink raw reply related

* [PATCH 02/17] common/cnxk: add API of SA valid for cn20k platform
From: Rahul Bhansali @ 2026-06-11  7:32 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611073311.3129711-1-rbhansali@marvell.com>

Add API to get SA valid configuration for cn20k platform.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 drivers/common/cnxk/cnxk_security.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/common/cnxk/cnxk_security.c b/drivers/common/cnxk/cnxk_security.c
index 6b51055100..6f46ad3276 100644
--- a/drivers/common/cnxk/cnxk_security.c
+++ b/drivers/common/cnxk/cnxk_security.c
@@ -606,6 +606,20 @@ cnxk_ot_ipsec_outb_sa_valid(struct roc_ot_ipsec_outb_sa *sa)
 	return !!sa->w2.s.valid;
 }
 
+RTE_EXPORT_INTERNAL_SYMBOL(cnxk_ow_ipsec_inb_sa_valid)
+bool
+cnxk_ow_ipsec_inb_sa_valid(struct roc_ow_ipsec_inb_sa *sa)
+{
+	return !!sa->w2.s.valid;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(cnxk_ow_ipsec_outb_sa_valid)
+bool
+cnxk_ow_ipsec_outb_sa_valid(struct roc_ow_ipsec_outb_sa *sa)
+{
+	return !!sa->w2.s.valid;
+}
+
 RTE_EXPORT_INTERNAL_SYMBOL(cnxk_ipsec_ivlen_get)
 uint8_t
 cnxk_ipsec_ivlen_get(enum rte_crypto_cipher_algorithm c_algo,
-- 
2.34.1


^ permalink raw reply related

* [PATCH 01/17] net/cnxk: update mbuf next field for multi segment
From: Rahul Bhansali @ 2026-06-11  7:32 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali

As per the requirement of rte_mbuf_raw_reset_bulk(), the mbuf's
'next' and 'nb_segs' fields are required to be reset.
This resets these fields for multi-segment mbufs on the cn9k platform.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 drivers/net/cnxk/cn9k_rx.h |  8 --------
 drivers/net/cnxk/cn9k_tx.h | 42 ++++++++++++++++++--------------------
 2 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/drivers/net/cnxk/cn9k_rx.h b/drivers/net/cnxk/cn9k_rx.h
index 79b56fe160..5ccdc5dee1 100644
--- a/drivers/net/cnxk/cn9k_rx.h
+++ b/drivers/net/cnxk/cn9k_rx.h
@@ -410,8 +410,6 @@ cn9k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const uint32_t tag,
 		 * Hence, flag argument is not required.
 		 */
 		nix_cqe_xtract_mseg(rx, mbuf, val, 0);
-	else
-		mbuf->next = NULL;
 }
 
 static inline uint16_t
@@ -826,12 +824,6 @@ cn9k_nix_recv_pkts_vector(void *rx_queue, struct rte_mbuf **rx_pkts,
 			nix_cqe_xtract_mseg((union nix_rx_parse_u *)
 						(cq0 + CQE_SZ(3) + 8), mbuf3,
 					    mbuf_initializer, flags);
-		} else {
-			/* Update that no more segments */
-			mbuf0->next = NULL;
-			mbuf1->next = NULL;
-			mbuf2->next = NULL;
-			mbuf3->next = NULL;
 		}
 
 		/* Store the mbufs to rx_pkts */
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 32665d2050..0ec448e36c 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -665,14 +665,14 @@ cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_m
 #else
 	RTE_SET_USED(cookie);
 #endif
-#ifdef RTE_ENABLE_ASSERT
-	m->next = NULL;
-	m->nb_segs = 1;
-#endif
-	m = m_next;
-	if (!m)
+	if (likely(!m_next))
 		goto done;
 
+	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)) {
+		m->next = NULL;
+		m->nb_segs = 1;
+	}
+	m = m_next;
 	/* Fill mbuf segments */
 	do {
 		m_next = m->next;
@@ -704,12 +704,13 @@ cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_m
 			sg_u = sg->u;
 			slist++;
 		}
-#ifdef RTE_ENABLE_ASSERT
-		m->next = NULL;
-#endif
+		if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
+			m->next = NULL;
 		m = m_next;
 	} while (nb_segs);
 
+	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
+		rte_io_wmb();
 done:
 	sg->u = sg_u;
 	sg->segs = i;
@@ -720,9 +721,6 @@ cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_m
 	segdw += (off >> 1) + 1 + !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
 	send_hdr->w0.sizem1 = segdw - 1;
 
-#ifdef RTE_ENABLE_ASSERT
-	rte_io_wmb();
-#endif
 	return segdw;
 }
 
@@ -950,10 +948,10 @@ cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
 	RTE_SET_USED(cookie);
 #endif
 
-#ifdef RTE_ENABLE_ASSERT
-	m->next = NULL;
-	m->nb_segs = 1;
-#endif
+	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)) {
+		m->next = NULL;
+		m->nb_segs = 1;
+	}
 	m = m_next;
 	/* Fill mbuf segments */
 	do {
@@ -984,9 +982,8 @@ cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
 			sg_u = sg->u;
 			slist++;
 		}
-#ifdef RTE_ENABLE_ASSERT
-		m->next = NULL;
-#endif
+		if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
+			m->next = NULL;
 		m = m_next;
 	} while (nb_segs);
 
@@ -1002,9 +999,6 @@ cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
 		 !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
 	send_hdr->w0.sizem1 = segdw - 1;
 
-#ifdef RTE_ENABLE_ASSERT
-	rte_io_wmb();
-#endif
 	return segdw;
 }
 
@@ -1089,6 +1083,10 @@ cn9k_nix_xmit_pkts_mseg_vector(uint64x2_t *cmd0, uint64x2_t *cmd1,
 		}
 	}
 
+	/* Multi segment mbufs */
+	if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
+		rte_io_wmb();
+
 	for (j = 0; j < NIX_DESCS_PER_LOOP;) {
 		/* Fit consecutive packets in same LMTLINE. */
 		if ((segdw[j] + segdw[j + 1]) <= 8) {
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH] eal: fix core_index for non-EAL registered threads
From: Maxime Peim @ 2026-06-10 13:45 UTC (permalink / raw)
  To: David Marchand; +Cc: dev
In-Reply-To: <CAJFAV8zsDEW6vU69NHmZiqUyB4xz+qDPQsw-cBBeScGeuE_Fiw@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3900 bytes --]

Hi David

I am not sure it works either if the lcores are manually set with a gap:
`--lcores=0,7` (from `eal_parse_lcores`):
- lcore 0 will get core_index = 0
- lcore 7 will get core_index = 1

When calling `rte_thread_register` we will hit lcore=1 as first
not-assigned lcore and set core_index=1 as well.

It seems like a solution should be to have a bitmap of the currently used
core_index stored in the global config.

Please let me know what you think about that.

Maxime Peim

On Mon, Jun 8, 2026 at 6:35 PM David Marchand <david.marchand@redhat.com>
wrote:

> On Mon, 8 Jun 2026 at 18:10, David Marchand <david.marchand@redhat.com>
> wrote:
> >
> > On Wed, 22 Apr 2026 at 09:54, Maxime Peim <maxime.peim@gmail.com> wrote:
> > >
> > > Threads registered via rte_thread_register() are assigned a valid
> > > lcore_id by eal_lcore_non_eal_allocate(), but their core_index in
> > > lcore_config is left at -1. This value was set during
> rte_eal_cpu_init()
> > > for lcores with ROLE_OFF (undetected CPUs) and is never updated when
> the
> > > lcore is later allocated to a non-EAL thread.
> > >
> > > As a result, rte_lcore_index() returns -1 for registered non-EAL
> > > threads. Libraries that use rte_lcore_index() to select per-lcore
> > > caches fall back to a shared global path when it returns -1, causing
> > > severe contention under concurrent access from multiple registered
> > > threads.
> > >
> > > A concrete example is the mlx5 indexed memory pool (mlx5_ipool), which
> > > uses rte_lcore_index() in mlx5_ipool_malloc_cache() to select a
> per-core
> > > cache slot. When core_index is -1, all registered threads are funneled
> > > into a single shared slot protected by a spinlock. In testing with VPP
> > > (which registers worker threads via rte_thread_register()), this caused
> > > async flow rule insertion throughput to drop from ~6.4M rules/sec to
> > > ~1.2M rules/sec with 4 workers -- a 5x regression attributable entirely
> > > to spinlock contention in the ipool allocator.
> > >
> > > Fix by setting core_index to the next sequential index
> (cfg->lcore_count)
> > > in eal_lcore_non_eal_allocate() before incrementing the count. Also
> reset
> > > core_index back to -1 on the error rollback path and in
> > > eal_lcore_non_eal_release() for correctness.
> > >
> > > Fixes: 5c307ba2a5b1 ("eal: register non-EAL threads as lcores")
> > Cc: stable@dpdk.org
> >
> > > Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
> > Acked-by: David Marchand <david.marchand@redhat.com>
> >
>
> Hum, I did not push the change.
> Re-reading this code, we have an issue if some external thread
> unregisters in the middle.
>
> What do you think of the additional hunk:
>
> $ git diff
> diff --git a/lib/eal/common/eal_common_lcore.c
> b/lib/eal/common/eal_common_lcore.c
> index ae085d73e4..6f53f20d90 100644
> --- a/lib/eal/common/eal_common_lcore.c
> +++ b/lib/eal/common/eal_common_lcore.c
> @@ -372,13 +372,16 @@ eal_lcore_non_eal_allocate(void)
>         struct rte_config *cfg = rte_eal_get_configuration();
>         struct lcore_callback *callback;
>         struct lcore_callback *prev;
> +       unsigned int index = 0;
>         unsigned int lcore_id;
>
>         rte_rwlock_write_lock(&lcore_lock);
>         for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
> -               if (cfg->lcore_role[lcore_id] != ROLE_OFF)
> +               if (cfg->lcore_role[lcore_id] != ROLE_OFF) {
> +                       index++;
>                         continue;
> -               lcore_config[lcore_id].core_index = cfg->lcore_count;
> +               }
> +               lcore_config[lcore_id].core_index = index;
>                 cfg->lcore_role[lcore_id] = ROLE_NON_EAL;
>                 cfg->lcore_count++;
>                 break;
>
>
> --
> David Marchand
>
>

[-- Attachment #2: Type: text/html, Size: 5124 bytes --]

^ permalink raw reply

* [PATCH v3] net/mlx5: fix counter TAILQ race between free and query callback
From: Linhu Li @ 2026-06-10  6:34 UTC (permalink / raw)
  To: dev; +Cc: stable, dsosnowski, Linhu Li
In-Reply-To: <20260604101112.72177-1-lilinhu618@gmail.com>

flow_dv_counter_free() inserts counters into
pool->counters[pool->query_gen] under pool->csl. Meanwhile,
mlx5_flow_async_pool_query_handle() moves counters from
pool->counters[query_gen ^ 1] to the global free list via
TAILQ_CONCAT while holding only cmng->csl, not pool->csl.

The comment in flow_dv_counter_free() claims the lock is not needed
because the query callback and the release function operate on different lists.
That holds only if the free path always observes the up-to-date query_gen. It
can be violated:

1. A counter free thread (non-PMD, e.g. OVS offload thread) reads
   pool->query_gen == 0 and is about to insert into counters[0].
2. The free thread is preempted by the OS scheduler; it is a regular
   pthread, not pinned to a core.
3. The eal-intr-thread alarm fires: query_gen++ (now 1) and the async
   query is sent.
4. Hardware completes the query and the callback runs TAILQ_CONCAT on
   counters[0] (= query_gen ^ 1).
5. The free thread resumes and runs TAILQ_INSERT_TAIL on counters[0]
   concurrently with step 4 on another core.

Because the two paths take different locks, TAILQ_INSERT_TAIL and
TAILQ_CONCAT run concurrently on the same list with no synchronization and
corrupt it: the pool-local list ends up with a NULL head but a dangling
tqh_last, and the global free list tail no longer points to the real tail. The just-
freed counter and every counter inserted afterwards become unreachable
and are leaked.

Non-PMD threads can be preempted for hundreds of microseconds under
CPU pressure, which is well within the async query round-trip time, so the
window is reachable in practice.

Fix it by taking pool->csl in the query completion callback before operating on
pool->counters[query_gen], serializing the CONCAT with any concurrent
INSERT. The lock is taken once per pool per query completion in the eal-intr-
thread context, not on the datapath, so the cost is negligible. Lock order is
pool->csl then cmng->csl, matching all other sites.

Also handle the error path: previously the counters accumulated in
pool->counters[query_gen] were abandoned when a query failed. Move
them back to the global free list to avoid a leak on persistent query failures.

Fixes: ac79183dc6f7 ("net/mlx5: optimize free counter lookup")
Cc: stable@dpdk.org

Signed-off-by: Linhu Li <lilinhu618@gmail.com>
Acked-by: Dariusz Sosnowski <dsosnowski@nvidia.com>
---
 doc/guides/rel_notes/release_26_07.rst | 21 +++++++++++++++++
 drivers/net/mlx5/mlx5_flow.c           | 31 ++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index b8a3e2ced9..30a9564884 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -153,6 +153,27 @@ ABI Changes
 * No ABI change that would break compatibility with 25.11.

+Fixed Issues
+------------
+
+.. This section should contain fixed issues in this release. Sample format:
+
+   * **Add a title in the past tense with a full stop.**
+
+     Add a short 1-2 sentence description of the fix in the past tense.
+
+   This section is a comment. Do not overwrite or remove it.
+   Also, make sure to start the actual text at the margin.
+   =======================================================
+
+* **net/mlx5: Fixed counter TAILQ race between free and query callback.**
+
+  Fixed a race condition where concurrent counter free operations and async
+  query completions could corrupt the counter free list, causing counter leaks.
+  The issue occurred when non-PMD threads were preempted between reading
+  ``query_gen`` and inserting into the counter list.
+
+
 Known Issues
 ------------

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 915ea29a5a..2f785d58ec 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -9893,6 +9893,13 @@ void
 mlx5_flow_async_pool_query_handle(struct mlx5_dev_ctx_shared *sh,
 				  uint64_t async_id, int status)
 {
+	/*
+	 * Handle async counter pool query completion.
+	 * query_gen is flipped each round: freed counters go into [query_gen],
+	 * while this callback moves [query_gen ^ 1] to the global free list.
+	 * pool->csl must be held when operating on pool->counters[] to serialize
+	 * with concurrent free-path insertions.
+	 */
 	struct mlx5_flow_counter_pool *pool =
 		(struct mlx5_flow_counter_pool *)(uintptr_t)async_id;
 	struct mlx5_counter_stats_raw *raw_to_free;
@@ -9904,6 +9911,21 @@ mlx5_flow_async_pool_query_handle(struct mlx5_dev_ctx_shared *sh,

 	if (unlikely(status)) {
 		raw_to_free = pool->raw_hw;
+		/*
+		 * The query failed, so the freed counters accumulated
+		 * in the old-gen list would otherwise be stranded.
+		 * Move them back to the global free list. This is safe
+		 * for both transient and persistent failures: the
+		 * counters are still valid and can be reused.
+		 */
+		if (!TAILQ_EMPTY(&pool->counters[query_gen])) {
+			rte_spinlock_lock(&pool->csl);
+			rte_spinlock_lock(&cmng->csl[cnt_type]);
+			TAILQ_CONCAT(&cmng->counters[cnt_type],
+				     &pool->counters[query_gen], next);
+			rte_spinlock_unlock(&cmng->csl[cnt_type]);
+			rte_spinlock_unlock(&pool->csl);
+		}
 	} else {
 		raw_to_free = pool->raw;
 		if (pool->is_aged)
@@ -9913,11 +9935,20 @@ mlx5_flow_async_pool_query_handle(struct mlx5_dev_ctx_shared *sh,
 		rte_spinlock_unlock(&pool->sl);
 		/* Be sure the new raw counters data is updated in memory. */
 		rte_io_wmb();
+		/*
+		 * A counter free thread may have read a stale query_gen
+		 * before the generation was flipped and could still be
+		 * inserting into this same old-gen list. Hold pool->csl to
+		 * serialize TAILQ_CONCAT with that TAILQ_INSERT_TAIL and
+		 * avoid corrupting the list.
+		 */
 		if (!TAILQ_EMPTY(&pool->counters[query_gen])) {
+			rte_spinlock_lock(&pool->csl);
 			rte_spinlock_lock(&cmng->csl[cnt_type]);
 			TAILQ_CONCAT(&cmng->counters[cnt_type],
 				     &pool->counters[query_gen], next);
 			rte_spinlock_unlock(&cmng->csl[cnt_type]);
+			rte_spinlock_unlock(&pool->csl);
 		}
 	}
 	LIST_INSERT_HEAD(&sh->sws_cmng.free_stat_raws, raw_to_free, next);
-- 
2.39.3 (Apple Git-146)

^ permalink raw reply related

* 回复：回复：[PATCH] gpu/metax: add new driver for Metax GPU
From: 许玲燕 @ 2026-06-11  7:10 UTC (permalink / raw)
  To: Thomas Monjalon; +Cc: dev, eagostini, 王冬冬
In-Reply-To: <J2c1Ke4gRXqOUlr2bqZFPg@monjalon.net>

[-- Attachment #1: Type: text/plain, Size: 5751 bytes --]

Hi,
Regarding your question about whether the lib and module are upstreamed already, I would like to clarify their current status:
Both libmcruntime.so and the corresponding gdrapi libraries are proprietary user-space libraries provided by Metax. They are not upstreamed to the DPDK mainline repository.
However, please rest assured that the current patch interacts with them via standard dlopen (dynamic loading) at runtime. We do not link directly against their source code or require them as hard build-time dependencies. Therefore, this approach will not introduce any additional compilation dependencies or licensing issues to the DPDK main tree.
------------------------------------------------------------------
发件人：Thomas Monjalon <thomas@monjalon.net>
发送时间：2026年6月9日(周二) 18:44
收件人："许玲燕"<lingyan.xu@metax-tech.com>
抄　送：dev<dev@dpdk.org>; eagostini<eagostini@nvidia.com>
主　题：Re: 回复：[PATCH] gpu/metax: add new driver for Metax GPU
Thank you for the detailed answer and your understanding.
One more question: are the lib and module upstreamed already?
09/06/2026 12:22, 许玲燕:
> Hi,
> Thank you for the detailed feedback and for reviewing the proposal for the Metax GPU driver.
> Based on the questions raised and the analysis of the code implementation, here are the clarifications and my action plan:
> 1. Regarding GPU Access Method
> The driver interfaces with the Metax GPU hardware through a combination of the vendor-provided MC Runtime (Metax Compute Runtime) library and GDRCopy (GPU Direct RDMA) technology.
> 
> * User-space Library: As seen in the maca.c code, the driver dynamically loads (dlopen) the libmcruntime.so library. It uses mc_runtime_api.h to manage GPU contexts, memory allocation, and device attributes.
> 
> * Kernel Module: The driver relies on the underlying Metax kernel driver (for PCI probing and basic device access) and the gdrapi (GDRCopy) kernel module to facilitate zero-copy data transfer between CPU and GPU memory.
> 
> * Dependency: The build log confirms the detection of headers like mc_runtime_api.h and gdrapi.h, which are essential for this integration.
> 2. Clarification on "Rendering" Functionality
> I apologize for the confusion caused by the term "Rendering" in the initial description. Upon reviewing the code and your feedback, I realize this was an inaccurate choice of words.
> 
> * Correction: The intended functionality is purely "Compute/Data Processing" and "Memory Management".
> 
> * Explanation: The driver's core logic (as shown in the patch) focuses on memory registration, allocation, and CPU/GPU data synchronization (via maca_mem_cpu_map and gdrcopy_pin), which are essential for network data processing acceleration rather than graphical rendering. I will correct this terminology in the documentation to avoid further confusion.
> 3. Action Plan: Following the Contribution Guide
> I have reviewed the "Adding a New Driver" guide you linked.
> 
> * Patch Splitting: I understand that the current monolithic patch is not suitable. I will rework the submission and split it into a logical patch series:
> 
>   * Patch 1: Add the basic infrastructure (Meson files, maintainers, configuration).
> 
>   * Patch 2: Implement core device functionality (PCI probing, initialization, context management).
> 
>   * Patch 3: Add memory management and data path features (allocation, registration, and CPU mapping).
> Thank you again for your guidance. I will resubmit the revised patch series shortly.
> Best regards,
> Lingyan Xu
> ------------------------------------------------------------------
> 发件人：Thomas Monjalon <thomas@monjalon.net>
> 发送时间：2026年6月2日(周二) 18:01
> 收件人："许玲燕"<lingyan.xu@metax-tech.com>
> 抄 送：dev<dev@dpdk.org>; eagostini<eagostini@nvidia.com>
> 主 题：Re: [PATCH] gpu/metax: add new driver for Metax GPU
> Hello,
> 01/06/2026 07:47, 许玲燕:
> > I am writing to propose a new driver for the Metax GPU,
> How do you access the GPU?
> Are you using a specific library or kernel module?
> > which I believe will significantly enhance our support
> > and performance for this hardware.
> > The patch attached includes the initial implementation of the driver,
> > with key features such as:
> > 
> > * Basic initialization and configuration 
> > * Memory management and allocation 
> > * Core functionality for rendering and compute tasks 
> I am familiar with connecting compute tasks of a GPU
> with DPDK networking, but I'm surprised by the rendering functionality.
> Do you mean graphical rendering of data coming from the network?
> > Please review the code and let me know if you have any feedback or suggestions.
> > I am more than happy to make any necessary adjustments and improvements.
> Thank you for working on this.
> I recommend following this guide to introduce a new driver:
> https://doc.dpdk.org/guides/contributing/new_driver.html
> 
> 
> 超大附件列表 dpdk-build-test-log.txt [48KB]
> 进入下载页面 https://qiye.aliyun.com/alimail/openLinks/downloadMimeMetaDiskBigAttach?id=netdiskid%3Av001%3Afile%3ADzzzzzzNqZx%3BJYiJwCficINAoHh55iyjKdydQzW5hDE%2FGjddF2Xp4ghl2ujmlGlWdfhgNCLOb5s3BZAHvDXTdZhtzGA3q8HJ%2Fv%2FPGnrPJfO1Xc%2BWnHr%2FKRwIkHzWFe5Iwm1IZrurr9hW <https://qiye.aliyun.com/alimail/openLinks/downloadMimeMetaDiskBigAttach?id=netdiskid%3Av001%3Afile%3ADzzzzzzNqZx%3BJYiJwCficINAoHh55iyjKdydQzW5hDE%2FGjddF2Xp4ghl2ujmlGlWdfhgNCLOb5s3BZAHvDXTdZhtzGA3q8HJ%2Fv%2FPGnrPJfO1Xc%2BWnHr%2FKRwIkHzWFe5Iwm1IZrurr9hW > 
> 

[-- Attachment #2: Type: text/html, Size: 9618 bytes --]

^ permalink raw reply

* [PATCH] test/security: increase wait time for reassembly test
From: Rahul Bhansali @ 2026-06-11  6:18 UTC (permalink / raw)
  To: dev, Akhil Goyal, Anoob Joseph; +Cc: Rahul Bhansali

In the multi-segment inline IPsec reassembly burst test, each packet
is split into 4 fragments and each fragment is itself multi-segmented
(~11k bytes). Out of 33 such bursts, a few reassemblies sometimes
fail.

A delay of 1 ms after the burst Tx is not sufficient in this case,
so increase it to 10 ms to avoid random reassembly failures in
functional tests.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 app/test/test_security_inline_proto.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/app/test/test_security_inline_proto.c b/app/test/test_security_inline_proto.c
index b0cce5ebd9..55d81041df 100644
--- a/app/test/test_security_inline_proto.c
+++ b/app/test/test_security_inline_proto.c
@@ -1107,6 +1107,7 @@ test_ipsec_with_reassembly(struct reassembly_vector *vector,
 	void *ctx;
 	unsigned int i, nb_rx = 0, j;
 	uint32_t ol_flags;
+	uint32_t delay_ms;
 	bool outer_ipv4;
 	int ret = 0;
 
@@ -1214,7 +1215,9 @@ test_ipsec_with_reassembly(struct reassembly_vector *vector,
 		goto out;
 	}
 
-	rte_delay_ms(1);
+	/* Multi-segment fragments requires more delay for burst Tx and reassembly in Rx path. */
+	delay_ms = sg_mode ? 10 : 1;
+	rte_delay_ms(delay_ms);
 
 	/* Retry few times before giving up */
 	nb_rx = 0;
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v3 01/10] eal: add interface to check if lcore is EAL managed
From: lihuisong (C) @ 2026-06-11  6:16 UTC (permalink / raw)
  To: Thomas Monjalon
  Cc: anatoly.burakov, sivaprasad.tummala, dev, stephen, fengchengwen,
	yangxingui, zhanjie9, lihuisong
In-Reply-To: <vMlZoeDqTD6UrHS2wuWBKw@monjalon.net>

Hi Thomas,

Thanks for your review.


On 6/11/2026 7:28 AM, Thomas Monjalon wrote:
> 22/05/2026 06:11, Huisong Li:
>> Add a new helper function rte_lcore_is_eal_managed() to determine
>> if a logical core is managed by EAL.
>>
>> This interface returns true if the lcore role is either ROLE_RTE
>> (standard worker/main cores) or ROLE_SERVICE (service cores).
> [...]
>> +RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_lcore_is_eal_managed, 26.07)
>> +int rte_lcore_is_eal_managed(unsigned int lcore_id)
>> +{
>> +	struct rte_config *cfg = rte_eal_get_configuration();
>> +
>> +	if (lcore_id >= RTE_MAX_LCORE)
>> +		return 0;
>> +	return cfg->lcore_role[lcore_id] == ROLE_RTE ||
>> +		cfg->lcore_role[lcore_id] == ROLE_SERVICE;
>> +}
> I'm not sure about adding this function in the API.
> We already have rte_eal_lcore_role()
> and I feel having this explicit ROLE_RTE || ROLE_SERVICE
> in the code where needed may be less confusing.

Ack.

>
> Note: we should prefix these constants with RTE_LCORE_

Yeah, it's good.

This will break the API, so we can do this in 26.11.

>
>

^ permalink raw reply

* RE: [PATCH v5] ethdev: support inline calculating masked item value
From: Bing Zhao @ 2026-06-11  4:56 UTC (permalink / raw)
  To: Slava Ovsiienko, dev@dpdk.org, Raslan Darawsheh,
	stephen@networkplumber.org
  Cc: Ori Kam, Dariusz Sosnowski, Suanming Mou, Matan Azrad,
	NBU-Contact-Thomas Monjalon (EXTERNAL)
In-Reply-To: <BN9PR12MB5338F6E48AC491C180292DB8D01B2@BN9PR12MB5338.namprd12.prod.outlook.com>

Oh, I see: I squashed the changes into the following mlx5 driver fix instead of the proper patch, so the code is still the old version. My fault.

> -----Original Message-----
> From: Bing Zhao
> Sent: Thursday, June 11, 2026 12:55 PM
> To: 'Bing Zhao' <bingz@nvidia.com>; Slava Ovsiienko
> <viacheslavo@nvidia.com>; dev@dpdk.org; Raslan Darawsheh
> <rasland@nvidia.com>; stephen@networkplumber.org
> Cc: Ori Kam <orika@nvidia.com>; Dariusz Sosnowski <dsosnowski@nvidia.com>;
> Suanming Mou <suanmingm@nvidia.com>; Matan Azrad <matan@nvidia.com>; NBU-
> Contact-Thomas Monjalon (EXTERNAL) <thomas@monjalon.net>
> Subject: RE: [PATCH v5] ethdev: support inline calculating masked item
> value
> 
> Hi,
> 
> In my local code,
> 
> diff --git a/lib/ethdev/rte_flow.c b/lib/ethdev/rte_flow.c index
> 7cf9f6f6f3..7a2721af00 100644
> --- a/lib/ethdev/rte_flow.c
> +++ b/lib/ethdev/rte_flow.c
> @@ -181,9 +181,18 @@ static const struct rte_flow_desc_data
> rte_flow_desc_item[] = {  static inline size_t
> rte_flow_conv_item_mask_size(const struct rte_flow_item *item)  {
> -       if ((int)item->type >= 0)
> +       if ((int)item->type < 0)
> +               return sizeof(void *);
> +       switch (item->type) {
> +       case RTE_FLOW_ITEM_TYPE_RAW:
> +               return offsetof(struct rte_flow_item_raw, pattern);
> +       case RTE_FLOW_ITEM_TYPE_GENEVE_OPT:
> +               return offsetof(struct rte_flow_item_geneve_opt, data);
> +       default:
> +               if (rte_flow_desc_item[item->type].desc_fn != NULL)
> +                       return 0;
>                 return rte_flow_desc_item[item->type].size;
> -       return sizeof(void *);
> +       }
>  }
> 
> // This is the code before my latest change.
> > +static inline size_t
> > +rte_flow_conv_item_mask_size(const struct rte_flow_item *item) {
> > +       if ((int)item->type >= 0)
> > +               return rte_flow_desc_item[item->type].size;
> > +       return sizeof(void *);
> > +}
> > +
> 
> 
> I didn't understand why the patch I sent is still using the old code.
> 
> > -----Original Message-----
> > From: Bing Zhao <bingz@nvidia.com>
> > Sent: Wednesday, June 10, 2026 1:27 PM
> > To: Slava Ovsiienko <viacheslavo@nvidia.com>; dev@dpdk.org; Raslan
> > Darawsheh <rasland@nvidia.com>; stephen@networkplumber.org
> > Cc: Ori Kam <orika@nvidia.com>; Dariusz Sosnowski
> > <dsosnowski@nvidia.com>; Suanming Mou <suanmingm@nvidia.com>; Matan
> > Azrad <matan@nvidia.com>; NBU- Contact-Thomas Monjalon (EXTERNAL)
> > <thomas@monjalon.net>
> > Subject: [PATCH v5] ethdev: support inline calculating masked item
> > value
> >
> > External email: Use caution opening links or attachments
> >
> >
> > In the asynchronous API definition and some drivers, the rte_flow_item
> > spec value may not be calculated by the driver due to the reason of
> > speed of light rule insertion rate and sometimes the input parameters
> > will be copied and changed internally.
> >
> > After copying, the spec and last will be protected by the keyword
> > const and cannot be changed in the code itself. And also the driver
> > needs some extra memory to do the calculation and extra conditions to
> > understand the length of each item spec. This is not efficient.
> >
> > To solve the issue and support usage of the following fix, a new OP
> > was introduced to calculate the spec and last values after applying
> > the mask inline.
> >
> > Signed-off-by: Bing Zhao <bingz@nvidia.com>
> > Acked-by: Dariusz Sosnowski <dsosnowski@nvidia.com>
> > ---
> > v3:
> >   - add test code
> >   - fix the issue found by AI
> > v4: rebase on top of main
> > v5: handle some items separately and add test for them
> > ---
> >  app/test/test_ethdev_api.c             | 76 ++++++++++++++++++++++++++
> >  doc/guides/rel_notes/release_26_07.rst |  6 ++
> >  lib/ethdev/rte_flow.c                  | 46 ++++++++++++++--
> >  lib/ethdev/rte_flow.h                  | 13 +++++
> >  4 files changed, 135 insertions(+), 6 deletions(-)
> >
> > diff --git a/app/test/test_ethdev_api.c b/app/test/test_ethdev_api.c
> > index 76afd0345c..5cae1cdc1d 100644
> > --- a/app/test/test_ethdev_api.c
> > +++ b/app/test/test_ethdev_api.c
> > @@ -4,6 +4,7 @@
> >
> >  #include <rte_log.h>
> >  #include <rte_ethdev.h>
> > +#include <rte_flow.h>
> >
> >  #include <rte_test.h>
> >  #include "test.h"
> > @@ -15,6 +16,80 @@
> >  #define NUM_MBUF 1024
> >  #define MBUF_CACHE_SIZE 256
> >
> > +static int32_t
> > +ethdev_api_flow_conv_pattern_masked(void)
> > +{
> > +       const struct rte_flow_item_eth spec = {
> > +               .hdr.dst_addr.addr_bytes = { 0x01, 0x02, 0x03, 0x04,
> > +0x05,
> > 0x06 },
> > +               .hdr.src_addr.addr_bytes = { 0x0a, 0x0b, 0x0c, 0x0d,
> > + 0x0e,
> > 0x0f },
> > +               .hdr.ether_type = RTE_BE16(0x1234),
> > +       };
> > +       const struct rte_flow_item_eth last = {
> > +               .hdr.dst_addr.addr_bytes = { 0x11, 0x12, 0x13, 0x14,
> > + 0x15,
> > 0x16 },
> > +               .hdr.src_addr.addr_bytes = { 0x1a, 0x1b, 0x1c, 0x1d,
> > + 0x1e,
> > 0x1f },
> > +               .hdr.ether_type = RTE_BE16(0x5678),
> > +       };
> > +       const struct rte_flow_item_eth mask = {
> > +               .hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0x00, 0x00,
> > + 0xff,
> > 0xff },
> > +               .hdr.src_addr.addr_bytes = { 0xff, 0x00, 0xff, 0x00,
> > + 0xff,
> > 0x00 },
> > +               .hdr.ether_type = RTE_BE16(0xffff),
> > +       };
> > +       const struct rte_flow_item pattern[] = {
> > +               {
> > +                       .type = RTE_FLOW_ITEM_TYPE_ETH,
> > +                       .spec = &spec,
> > +                       .last = &last,
> > +                       .mask = &mask,
> > +               },
> > +               { .type = RTE_FLOW_ITEM_TYPE_END },
> > +       };
> > +       union {
> > +               struct rte_flow_item item;
> > +               struct rte_flow_item_eth eth;
> > +               double align;
> > +               uint8_t raw[256];
> > +       } dst;
> > +       const struct rte_flow_item *item;
> > +       const struct rte_flow_item_eth *conv_spec;
> > +       const struct rte_flow_item_eth *conv_last;
> > +       int ret;
> > +
> > +       ret = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN_MASKED, NULL, 0,
> > pattern, NULL);
> > +       TEST_ASSERT(ret > 0, "Masked pattern conversion size query
> > failed");
> > +       TEST_ASSERT((size_t)ret <= sizeof(dst.raw),
> > +                   "Masked pattern conversion needs too much
> > + storage");
> > +
> > +       memset(&dst, 0, sizeof(dst));
> > +       ret = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN_MASKED, dst.raw,
> > +                           sizeof(dst.raw), pattern, NULL);
> > +       TEST_ASSERT(ret > 0, "Masked pattern conversion failed");
> > +
> > +       item = (const struct rte_flow_item *)dst.raw;
> > +       conv_spec = item[0].spec;
> > +       conv_last = item[0].last;
> > +       TEST_ASSERT_NOT_NULL(conv_spec, "Converted spec must be set");
> > +       TEST_ASSERT_NOT_NULL(conv_last, "Converted last must be set");
> > +
> > +       TEST_ASSERT_EQUAL(conv_spec->hdr.dst_addr.addr_bytes[0], 0x01,
> > +                         "Masked spec dst byte 0 mismatch");
> > +       TEST_ASSERT_EQUAL(conv_spec->hdr.dst_addr.addr_bytes[2], 0x00,
> > +                         "Masked spec dst byte 2 mismatch");
> > +       TEST_ASSERT_EQUAL(conv_spec->hdr.src_addr.addr_bytes[1], 0x00,
> > +                         "Masked spec src byte 1 mismatch");
> > +       TEST_ASSERT_EQUAL(conv_spec->hdr.ether_type, RTE_BE16(0x1234),
> > +                         "Masked spec ether type mismatch");
> > +       TEST_ASSERT_EQUAL(conv_last->hdr.dst_addr.addr_bytes[0], 0x11,
> > +                         "Masked last dst byte 0 mismatch");
> > +       TEST_ASSERT_EQUAL(conv_last->hdr.dst_addr.addr_bytes[2], 0x00,
> > +                         "Masked last dst byte 2 mismatch");
> > +       TEST_ASSERT_EQUAL(conv_last->hdr.src_addr.addr_bytes[1], 0x00,
> > +                         "Masked last src byte 1 mismatch");
> > +       TEST_ASSERT_EQUAL(conv_last->hdr.ether_type, RTE_BE16(0x5678),
> > +                         "Masked last ether type mismatch");
> > +
> > +       return TEST_SUCCESS;
> > +}
> > +
> >  static int32_t
> >  ethdev_api_queue_status(void)
> >  {
> > @@ -167,6 +242,7 @@ static struct unit_test_suite ethdev_api_testsuite =
> {
> >         .setup = NULL,
> >         .teardown = NULL,
> >         .unit_test_cases = {
> > +               TEST_CASE(ethdev_api_flow_conv_pattern_masked),
> >                 TEST_CASE(ethdev_api_queue_status),
> >                 /* TODO: Add deferred_start queue status test */
> >                 TEST_CASES_END() /**< NULL terminate unit test array
> > */ diff --git a/doc/guides/rel_notes/release_26_07.rst
> > b/doc/guides/rel_notes/release_26_07.rst
> > index b5285af5fe..4f5d21d576 100644
> > --- a/doc/guides/rel_notes/release_26_07.rst
> > +++ b/doc/guides/rel_notes/release_26_07.rst
> > @@ -190,6 +190,12 @@ API Changes
> >    - ``rte_pmd_mlx5_enable_steering``
> >    - ``rte_pmd_mlx5_disable_steering``
> >
> > +* ethdev: Added masked pattern conversion.
> > +
> > +  Added ``RTE_FLOW_CONV_OP_PATTERN_MASKED`` to ``rte_flow_conv()``
> > + to copy an entire pattern while applying each item's mask to its
> > + ``spec`` and ``last`` fields.
> > +
> >
> >  ABI Changes
> >  -----------
> > diff --git a/lib/ethdev/rte_flow.c b/lib/ethdev/rte_flow.c index
> > ec0fe08355..c7a94a1194 100644
> > --- a/lib/ethdev/rte_flow.c
> > +++ b/lib/ethdev/rte_flow.c
> > @@ -178,6 +178,14 @@ static const struct rte_flow_desc_data
> > rte_flow_desc_item[] = {
> >         MK_FLOW_ITEM(COMPARE, sizeof(struct rte_flow_item_compare)),
> > };
> >
> > +static inline size_t
> > +rte_flow_conv_item_mask_size(const struct rte_flow_item *item) {
> > +       if ((int)item->type >= 0)
> > +               return rte_flow_desc_item[item->type].size;
> > +       return sizeof(void *);
> > +}
> > +
> >  /** Generate flow_action[] entry. */
> >  #define MK_FLOW_ACTION(t, s) \
> >         [RTE_FLOW_ACTION_TYPE_ ## t] = { \ @@ -835,6 +843,8 @@
> > rte_flow_conv_action_conf(void *buf, const size_t size,
> >   *   RTE_FLOW_ITEM_TYPE_END is encountered.
> >   * @param[out] error
> >   *   Perform verbose error reporting if not NULL.
> > + * @param[in] with_mask
> > + *   If true, @p src mask will be applied to spec and last.
> >   *
> >   * @return
> >   *   A positive value representing the number of bytes needed to store
> > @@ -847,12 +857,13 @@ rte_flow_conv_pattern(struct rte_flow_item *dst,
> >                       const size_t size,
> >                       const struct rte_flow_item *src,
> >                       unsigned int num,
> > +                     bool with_mask,
> >                       struct rte_flow_error *error)  {
> >         uintptr_t data = (uintptr_t)dst;
> >         size_t off;
> >         size_t ret;
> > -       unsigned int i;
> > +       unsigned int i, j;
> >
> >         for (i = 0, off = 0; !num || i != num; ++i, ++src, ++dst) {
> >                 /**
> > @@ -876,15 +887,27 @@ rte_flow_conv_pattern(struct rte_flow_item *dst,
> >         src -= num;
> >         dst -= num;
> >         do {
> > +               uint8_t *c_spec = NULL, *c_last = NULL;
> > +               const uint8_t *mask = src->mask;
> > +               size_t item_mask_size = mask ?
> > + rte_flow_conv_item_mask_size(src) : 0;
> > +
> >                 if (src->spec) {
> >                         off = RTE_ALIGN_CEIL(off, sizeof(double));
> >                         ret = rte_flow_conv_item_spec
> >                                 ((void *)(data + off),
> >                                  size > off ? size - off : 0, src,
> >                                  RTE_FLOW_CONV_ITEM_SPEC);
> > -                       if (size && size >= off + ret)
> > +                       if (size && size >= off + ret) {
> >                                 dst->spec = (void *)(data + off);
> > +                               c_spec = (uint8_t *)(data + off);
> > +                       }
> >                         off += ret;
> > +                       if (with_mask && c_spec && mask) {
> > +                               size_t mask_size = RTE_MIN(ret,
> > + item_mask_size);
> > +
> > +                               for (j = 0; j < mask_size; j++)
> > +                                       c_spec[j] &= mask[j];
> > +                       }
> >
> >                 }
> >                 if (src->last) {
> > @@ -893,9 +916,17 @@ rte_flow_conv_pattern(struct rte_flow_item *dst,
> >                                 ((void *)(data + off),
> >                                  size > off ? size - off : 0, src,
> >                                  RTE_FLOW_CONV_ITEM_LAST);
> > -                       if (size && size >= off + ret)
> > +                       if (size && size >= off + ret) {
> >                                 dst->last = (void *)(data + off);
> > +                               c_last = (uint8_t *)(data + off);
> > +                       }
> >                         off += ret;
> > +                       if (with_mask && c_last && mask) {
> > +                               size_t mask_size = RTE_MIN(ret,
> > + item_mask_size);
> > +
> > +                               for (j = 0; j < mask_size; j++)
> > +                                       c_last[j] &= mask[j];
> > +                       }
> >                 }
> >                 if (src->mask) {
> >                         off = RTE_ALIGN_CEIL(off, sizeof(double)); @@
> > -
> > 1042,7 +1073,7 @@ rte_flow_conv_rule(struct rte_flow_conv_rule *dst,
> >                 off = RTE_ALIGN_CEIL(off, sizeof(double));
> >                 ret = rte_flow_conv_pattern((void *)((uintptr_t)dst +
> > off),
> >                                             size > off ? size - off : 0,
> > -                                           src->pattern_ro, 0, error);
> > +                                           src->pattern_ro, 0, false,
> > + error);
> >                 if (ret < 0)
> >                         return ret;
> >                 if (size && size >= off + (size_t)ret) @@ -1143,7
> > +1174,7 @@ rte_flow_conv(enum rte_flow_conv_op op,
> >                 ret = sizeof(*attr);
> >                 break;
> >         case RTE_FLOW_CONV_OP_ITEM:
> > -               ret = rte_flow_conv_pattern(dst, size, src, 1, error);
> > +               ret = rte_flow_conv_pattern(dst, size, src, 1, false,
> > + error);
> >                 break;
> >         case RTE_FLOW_CONV_OP_ITEM_MASK:
> >                 item = src;
> > @@ -1158,7 +1189,7 @@ rte_flow_conv(enum rte_flow_conv_op op,
> >                 ret = rte_flow_conv_actions(dst, size, src, 1, error);
> >                 break;
> >         case RTE_FLOW_CONV_OP_PATTERN:
> > -               ret = rte_flow_conv_pattern(dst, size, src, 0, error);
> > +               ret = rte_flow_conv_pattern(dst, size, src, 0, false,
> > + error);
> >                 break;
> >         case RTE_FLOW_CONV_OP_ACTIONS:
> >                 ret = rte_flow_conv_actions(dst, size, src, 0, error);
> > @@
> > -1178,6 +1209,9 @@ rte_flow_conv(enum rte_flow_conv_op op,
> >         case RTE_FLOW_CONV_OP_ACTION_NAME_PTR:
> >                 ret = rte_flow_conv_name(1, 1, dst, size, src, error);
> >                 break;
> > +       case RTE_FLOW_CONV_OP_PATTERN_MASKED:
> > +               ret = rte_flow_conv_pattern(dst, size, src, 0, true,
> > error);
> > +               break;
> >         default:
> >                 ret = rte_flow_error_set
> >                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> > NULL, diff --git a/lib/ethdev/rte_flow.h b/lib/ethdev/rte_flow.h index
> > b495409406..959a2f903b 100644
> > --- a/lib/ethdev/rte_flow.h
> > +++ b/lib/ethdev/rte_flow.h
> > @@ -4556,6 +4556,19 @@ enum rte_flow_conv_op {
> >          *   @code const char ** @endcode
> >          */
> >         RTE_FLOW_CONV_OP_ACTION_NAME_PTR,
> > +
> > +       /**
> > +        * Convert an entire pattern.
> > +        *
> > +        * Duplicates all pattern items at once, applying @p mask to
> > + @p
> > spec
> > +        * and @p last.
> > +        *
> > +        * - @p src type:
> > +        *   @code const struct rte_flow_item * @endcode
> > +        * - @p dst type:
> > +        *   @code struct rte_flow_item * @endcode
> > +        */
> > +       RTE_FLOW_CONV_OP_PATTERN_MASKED,
> >  };
> >
> >  /**
> > --
> > 2.34.1


^ permalink raw reply

* RE: [PATCH v5] ethdev: support inline calculating masked item value
From: Bing Zhao @ 2026-06-11  4:55 UTC (permalink / raw)
  To: Bing Zhao, Slava Ovsiienko, dev@dpdk.org, Raslan Darawsheh,
	stephen@networkplumber.org
  Cc: Ori Kam, Dariusz Sosnowski, Suanming Mou, Matan Azrad,
	NBU-Contact-Thomas Monjalon (EXTERNAL)
In-Reply-To: <20260610052729.5637-1-bingz@nvidia.com>

Hi,

In my local code,

diff --git a/lib/ethdev/rte_flow.c b/lib/ethdev/rte_flow.c
index 7cf9f6f6f3..7a2721af00 100644
--- a/lib/ethdev/rte_flow.c
+++ b/lib/ethdev/rte_flow.c
@@ -181,9 +181,18 @@ static const struct rte_flow_desc_data rte_flow_desc_item[] = {
 static inline size_t
 rte_flow_conv_item_mask_size(const struct rte_flow_item *item)
 {
-       if ((int)item->type >= 0)
+       if ((int)item->type < 0)
+               return sizeof(void *);
+       switch (item->type) {
+       case RTE_FLOW_ITEM_TYPE_RAW:
+               return offsetof(struct rte_flow_item_raw, pattern);
+       case RTE_FLOW_ITEM_TYPE_GENEVE_OPT:
+               return offsetof(struct rte_flow_item_geneve_opt, data);
+       default:
+               if (rte_flow_desc_item[item->type].desc_fn != NULL)
+                       return 0;
                return rte_flow_desc_item[item->type].size;
-       return sizeof(void *);
+       }
 }

// This is the code before my latest change.
> +static inline size_t
> +rte_flow_conv_item_mask_size(const struct rte_flow_item *item) {
> +       if ((int)item->type >= 0)
> +               return rte_flow_desc_item[item->type].size;
> +       return sizeof(void *);
> +}
> +


I didn't understand why the patch I sent is still using the old code.

> -----Original Message-----
> From: Bing Zhao <bingz@nvidia.com>
> Sent: Wednesday, June 10, 2026 1:27 PM
> To: Slava Ovsiienko <viacheslavo@nvidia.com>; dev@dpdk.org; Raslan
> Darawsheh <rasland@nvidia.com>; stephen@networkplumber.org
> Cc: Ori Kam <orika@nvidia.com>; Dariusz Sosnowski <dsosnowski@nvidia.com>;
> Suanming Mou <suanmingm@nvidia.com>; Matan Azrad <matan@nvidia.com>; NBU-
> Contact-Thomas Monjalon (EXTERNAL) <thomas@monjalon.net>
> Subject: [PATCH v5] ethdev: support inline calculating masked item value
> 
> External email: Use caution opening links or attachments
> 
> 
> In the asynchronous API definition and some drivers, the rte_flow_item
> spec value may not be calculated by the driver due to the reason of speed
> of light rule insertion rate and sometimes the input parameters will be
> copied and changed internally.
> 
> After copying, the spec and last will be protected by the keyword const
> and cannot be changed in the code itself. And also the driver needs some
> extra memory to do the calculation and extra conditions to understand the
> length of each item spec. This is not efficient.
> 
> To solve the issue and support usage of the following fix, a new OP was
> introduced to calculate the spec and last values after applying the mask
> inline.
> 
> Signed-off-by: Bing Zhao <bingz@nvidia.com>
> Acked-by: Dariusz Sosnowski <dsosnowski@nvidia.com>
> ---
> v3:
>   - add test code
>   - fix the issue found by AI
> v4: rebase on top of main
> v5: handle some items separately and add test for them
> ---
>  app/test/test_ethdev_api.c             | 76 ++++++++++++++++++++++++++
>  doc/guides/rel_notes/release_26_07.rst |  6 ++
>  lib/ethdev/rte_flow.c                  | 46 ++++++++++++++--
>  lib/ethdev/rte_flow.h                  | 13 +++++
>  4 files changed, 135 insertions(+), 6 deletions(-)
> 
> diff --git a/app/test/test_ethdev_api.c b/app/test/test_ethdev_api.c index
> 76afd0345c..5cae1cdc1d 100644
> --- a/app/test/test_ethdev_api.c
> +++ b/app/test/test_ethdev_api.c
> @@ -4,6 +4,7 @@
> 
>  #include <rte_log.h>
>  #include <rte_ethdev.h>
> +#include <rte_flow.h>
> 
>  #include <rte_test.h>
>  #include "test.h"
> @@ -15,6 +16,80 @@
>  #define NUM_MBUF 1024
>  #define MBUF_CACHE_SIZE 256
> 
> +static int32_t
> +ethdev_api_flow_conv_pattern_masked(void)
> +{
> +       const struct rte_flow_item_eth spec = {
> +               .hdr.dst_addr.addr_bytes = { 0x01, 0x02, 0x03, 0x04, 0x05,
> 0x06 },
> +               .hdr.src_addr.addr_bytes = { 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
> 0x0f },
> +               .hdr.ether_type = RTE_BE16(0x1234),
> +       };
> +       const struct rte_flow_item_eth last = {
> +               .hdr.dst_addr.addr_bytes = { 0x11, 0x12, 0x13, 0x14, 0x15,
> 0x16 },
> +               .hdr.src_addr.addr_bytes = { 0x1a, 0x1b, 0x1c, 0x1d, 0x1e,
> 0x1f },
> +               .hdr.ether_type = RTE_BE16(0x5678),
> +       };
> +       const struct rte_flow_item_eth mask = {
> +               .hdr.dst_addr.addr_bytes = { 0xff, 0xff, 0x00, 0x00, 0xff,
> 0xff },
> +               .hdr.src_addr.addr_bytes = { 0xff, 0x00, 0xff, 0x00, 0xff,
> 0x00 },
> +               .hdr.ether_type = RTE_BE16(0xffff),
> +       };
> +       const struct rte_flow_item pattern[] = {
> +               {
> +                       .type = RTE_FLOW_ITEM_TYPE_ETH,
> +                       .spec = &spec,
> +                       .last = &last,
> +                       .mask = &mask,
> +               },
> +               { .type = RTE_FLOW_ITEM_TYPE_END },
> +       };
> +       union {
> +               struct rte_flow_item item;
> +               struct rte_flow_item_eth eth;
> +               double align;
> +               uint8_t raw[256];
> +       } dst;
> +       const struct rte_flow_item *item;
> +       const struct rte_flow_item_eth *conv_spec;
> +       const struct rte_flow_item_eth *conv_last;
> +       int ret;
> +
> +       ret = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN_MASKED, NULL, 0,
> pattern, NULL);
> +       TEST_ASSERT(ret > 0, "Masked pattern conversion size query
> failed");
> +       TEST_ASSERT((size_t)ret <= sizeof(dst.raw),
> +                   "Masked pattern conversion needs too much storage");
> +
> +       memset(&dst, 0, sizeof(dst));
> +       ret = rte_flow_conv(RTE_FLOW_CONV_OP_PATTERN_MASKED, dst.raw,
> +                           sizeof(dst.raw), pattern, NULL);
> +       TEST_ASSERT(ret > 0, "Masked pattern conversion failed");
> +
> +       item = (const struct rte_flow_item *)dst.raw;
> +       conv_spec = item[0].spec;
> +       conv_last = item[0].last;
> +       TEST_ASSERT_NOT_NULL(conv_spec, "Converted spec must be set");
> +       TEST_ASSERT_NOT_NULL(conv_last, "Converted last must be set");
> +
> +       TEST_ASSERT_EQUAL(conv_spec->hdr.dst_addr.addr_bytes[0], 0x01,
> +                         "Masked spec dst byte 0 mismatch");
> +       TEST_ASSERT_EQUAL(conv_spec->hdr.dst_addr.addr_bytes[2], 0x00,
> +                         "Masked spec dst byte 2 mismatch");
> +       TEST_ASSERT_EQUAL(conv_spec->hdr.src_addr.addr_bytes[1], 0x00,
> +                         "Masked spec src byte 1 mismatch");
> +       TEST_ASSERT_EQUAL(conv_spec->hdr.ether_type, RTE_BE16(0x1234),
> +                         "Masked spec ether type mismatch");
> +       TEST_ASSERT_EQUAL(conv_last->hdr.dst_addr.addr_bytes[0], 0x11,
> +                         "Masked last dst byte 0 mismatch");
> +       TEST_ASSERT_EQUAL(conv_last->hdr.dst_addr.addr_bytes[2], 0x00,
> +                         "Masked last dst byte 2 mismatch");
> +       TEST_ASSERT_EQUAL(conv_last->hdr.src_addr.addr_bytes[1], 0x00,
> +                         "Masked last src byte 1 mismatch");
> +       TEST_ASSERT_EQUAL(conv_last->hdr.ether_type, RTE_BE16(0x5678),
> +                         "Masked last ether type mismatch");
> +
> +       return TEST_SUCCESS;
> +}
> +
>  static int32_t
>  ethdev_api_queue_status(void)
>  {
> @@ -167,6 +242,7 @@ static struct unit_test_suite ethdev_api_testsuite = {
>         .setup = NULL,
>         .teardown = NULL,
>         .unit_test_cases = {
> +               TEST_CASE(ethdev_api_flow_conv_pattern_masked),
>                 TEST_CASE(ethdev_api_queue_status),
>                 /* TODO: Add deferred_start queue status test */
>                 TEST_CASES_END() /**< NULL terminate unit test array */
> diff --git a/doc/guides/rel_notes/release_26_07.rst
> b/doc/guides/rel_notes/release_26_07.rst
> index b5285af5fe..4f5d21d576 100644
> --- a/doc/guides/rel_notes/release_26_07.rst
> +++ b/doc/guides/rel_notes/release_26_07.rst
> @@ -190,6 +190,12 @@ API Changes
>    - ``rte_pmd_mlx5_enable_steering``
>    - ``rte_pmd_mlx5_disable_steering``
> 
> +* ethdev: Added masked pattern conversion.
> +
> +  Added ``RTE_FLOW_CONV_OP_PATTERN_MASKED`` to ``rte_flow_conv()``  to
> + copy an entire pattern while applying each item's mask to its
> + ``spec`` and ``last`` fields.
> +
> 
>  ABI Changes
>  -----------
> diff --git a/lib/ethdev/rte_flow.c b/lib/ethdev/rte_flow.c index
> ec0fe08355..c7a94a1194 100644
> --- a/lib/ethdev/rte_flow.c
> +++ b/lib/ethdev/rte_flow.c
> @@ -178,6 +178,14 @@ static const struct rte_flow_desc_data
> rte_flow_desc_item[] = {
>         MK_FLOW_ITEM(COMPARE, sizeof(struct rte_flow_item_compare)),  };
> 
> +static inline size_t
> +rte_flow_conv_item_mask_size(const struct rte_flow_item *item) {
> +       if ((int)item->type >= 0)
> +               return rte_flow_desc_item[item->type].size;
> +       return sizeof(void *);
> +}
> +
>  /** Generate flow_action[] entry. */
>  #define MK_FLOW_ACTION(t, s) \
>         [RTE_FLOW_ACTION_TYPE_ ## t] = { \ @@ -835,6 +843,8 @@
> rte_flow_conv_action_conf(void *buf, const size_t size,
>   *   RTE_FLOW_ITEM_TYPE_END is encountered.
>   * @param[out] error
>   *   Perform verbose error reporting if not NULL.
> + * @param[in] with_mask
> + *   If true, @p src mask will be applied to spec and last.
>   *
>   * @return
>   *   A positive value representing the number of bytes needed to store
> @@ -847,12 +857,13 @@ rte_flow_conv_pattern(struct rte_flow_item *dst,
>                       const size_t size,
>                       const struct rte_flow_item *src,
>                       unsigned int num,
> +                     bool with_mask,
>                       struct rte_flow_error *error)  {
>         uintptr_t data = (uintptr_t)dst;
>         size_t off;
>         size_t ret;
> -       unsigned int i;
> +       unsigned int i, j;
> 
>         for (i = 0, off = 0; !num || i != num; ++i, ++src, ++dst) {
>                 /**
> @@ -876,15 +887,27 @@ rte_flow_conv_pattern(struct rte_flow_item *dst,
>         src -= num;
>         dst -= num;
>         do {
> +               uint8_t *c_spec = NULL, *c_last = NULL;
> +               const uint8_t *mask = src->mask;
> +               size_t item_mask_size = mask ?
> + rte_flow_conv_item_mask_size(src) : 0;
> +
>                 if (src->spec) {
>                         off = RTE_ALIGN_CEIL(off, sizeof(double));
>                         ret = rte_flow_conv_item_spec
>                                 ((void *)(data + off),
>                                  size > off ? size - off : 0, src,
>                                  RTE_FLOW_CONV_ITEM_SPEC);
> -                       if (size && size >= off + ret)
> +                       if (size && size >= off + ret) {
>                                 dst->spec = (void *)(data + off);
> +                               c_spec = (uint8_t *)(data + off);
> +                       }
>                         off += ret;
> +                       if (with_mask && c_spec && mask) {
> +                               size_t mask_size = RTE_MIN(ret,
> + item_mask_size);
> +
> +                               for (j = 0; j < mask_size; j++)
> +                                       c_spec[j] &= mask[j];
> +                       }
> 
>                 }
>                 if (src->last) {
> @@ -893,9 +916,17 @@ rte_flow_conv_pattern(struct rte_flow_item *dst,
>                                 ((void *)(data + off),
>                                  size > off ? size - off : 0, src,
>                                  RTE_FLOW_CONV_ITEM_LAST);
> -                       if (size && size >= off + ret)
> +                       if (size && size >= off + ret) {
>                                 dst->last = (void *)(data + off);
> +                               c_last = (uint8_t *)(data + off);
> +                       }
>                         off += ret;
> +                       if (with_mask && c_last && mask) {
> +                               size_t mask_size = RTE_MIN(ret,
> + item_mask_size);
> +
> +                               for (j = 0; j < mask_size; j++)
> +                                       c_last[j] &= mask[j];
> +                       }
>                 }
>                 if (src->mask) {
>                         off = RTE_ALIGN_CEIL(off, sizeof(double)); @@ -
> 1042,7 +1073,7 @@ rte_flow_conv_rule(struct rte_flow_conv_rule *dst,
>                 off = RTE_ALIGN_CEIL(off, sizeof(double));
>                 ret = rte_flow_conv_pattern((void *)((uintptr_t)dst +
> off),
>                                             size > off ? size - off : 0,
> -                                           src->pattern_ro, 0, error);
> +                                           src->pattern_ro, 0, false,
> + error);
>                 if (ret < 0)
>                         return ret;
>                 if (size && size >= off + (size_t)ret) @@ -1143,7 +1174,7
> @@ rte_flow_conv(enum rte_flow_conv_op op,
>                 ret = sizeof(*attr);
>                 break;
>         case RTE_FLOW_CONV_OP_ITEM:
> -               ret = rte_flow_conv_pattern(dst, size, src, 1, error);
> +               ret = rte_flow_conv_pattern(dst, size, src, 1, false,
> + error);
>                 break;
>         case RTE_FLOW_CONV_OP_ITEM_MASK:
>                 item = src;
> @@ -1158,7 +1189,7 @@ rte_flow_conv(enum rte_flow_conv_op op,
>                 ret = rte_flow_conv_actions(dst, size, src, 1, error);
>                 break;
>         case RTE_FLOW_CONV_OP_PATTERN:
> -               ret = rte_flow_conv_pattern(dst, size, src, 0, error);
> +               ret = rte_flow_conv_pattern(dst, size, src, 0, false,
> + error);
>                 break;
>         case RTE_FLOW_CONV_OP_ACTIONS:
>                 ret = rte_flow_conv_actions(dst, size, src, 0, error);
> @@ -1178,6 +1209,9 @@ rte_flow_conv(enum rte_flow_conv_op op,
>         case RTE_FLOW_CONV_OP_ACTION_NAME_PTR:
>                 ret = rte_flow_conv_name(1, 1, dst, size, src, error);
>                 break;
> +       case RTE_FLOW_CONV_OP_PATTERN_MASKED:
> +               ret = rte_flow_conv_pattern(dst, size, src, 0, true, error);
> +               break;
>         default:
>                 ret = rte_flow_error_set
>                 (error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> diff --git a/lib/ethdev/rte_flow.h b/lib/ethdev/rte_flow.h
> index b495409406..959a2f903b 100644
> --- a/lib/ethdev/rte_flow.h
> +++ b/lib/ethdev/rte_flow.h
> @@ -4556,6 +4556,19 @@ enum rte_flow_conv_op {
>          *   @code const char ** @endcode
>          */
>         RTE_FLOW_CONV_OP_ACTION_NAME_PTR,
> +
> +       /**
> +        * Convert an entire pattern.
> +        *
> +        * Duplicates all pattern items at once, applying @p mask to @p spec
> +        * and @p last.
> +        *
> +        * - @p src type:
> +        *   @code const struct rte_flow_item * @endcode
> +        * - @p dst type:
> +        *   @code struct rte_flow_item * @endcode
> +        */
> +       RTE_FLOW_CONV_OP_PATTERN_MASKED,
>  };
> 
>  /**
> --
> 2.34.1


^ permalink raw reply related

* Re: [PATCH v3 01/10] eal: add interface to check if lcore is EAL managed
From: Thomas Monjalon @ 2026-06-10 23:28 UTC (permalink / raw)
  To: Huisong Li
  Cc: anatoly.burakov, sivaprasad.tummala, dev, stephen, fengchengwen,
	yangxingui, zhanjie9, lihuisong
In-Reply-To: <20260522041110.2023062-2-lihuisong@huawei.com>

22/05/2026 06:11, Huisong Li:
> Add a new helper function rte_lcore_is_eal_managed() to determine
> if a logical core is managed by EAL.
> 
> This interface returns true if the lcore role is either ROLE_RTE
> (standard worker/main cores) or ROLE_SERVICE (service cores).
[...]
> +RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_lcore_is_eal_managed, 26.07)
> +int rte_lcore_is_eal_managed(unsigned int lcore_id)
> +{
> +	struct rte_config *cfg = rte_eal_get_configuration();
> +
> +	if (lcore_id >= RTE_MAX_LCORE)
> +		return 0;
> +	return cfg->lcore_role[lcore_id] == ROLE_RTE ||
> +		cfg->lcore_role[lcore_id] == ROLE_SERVICE;
> +}

I'm not sure about adding this function in the API.
We already have rte_eal_lcore_role()
and I feel having this explicit ROLE_RTE || ROLE_SERVICE
in the code where needed may be less confusing.

Note: we should prefix these constants with RTE_LCORE_



^ permalink raw reply

* Re: [PATCH] power/intel_uncore: reduce log level for dependency missing
From: Thomas Monjalon @ 2026-06-10 23:21 UTC (permalink / raw)
  To: anatoly.burakov, sivaprasad.tummala
  Cc: dev, stephen, fengchengwen, yangxingui, zhanjie9, Huisong Li
In-Reply-To: <20260512013047.375535-1-lihuisong@huawei.com>

12/05/2026 03:30, Huisong Li:
> When running dpdk-l3fwd with '-u' on a non-x86 platform, the user
> gets a noisy log message like the following:
> "POWER: Uncore frequency management not supported/enabled on this
> kernel. Please enable CONFIG_INTEL_UNCORE_FREQ_CONTROL if on Intel
> x86 with linux kernel >= 5.6".
> 
> The root cause is that the intel_uncore driver's .init() is called
> on every platform when automatic detection mode is used, and it
> prints the above message on non-x86 platforms.
> 
> The existing uncore core cannot solve this problem without breaking
> the ABI to add a new callback, so reduce the log level to avoid this
> misleading message.

Any comment please?
What would be the right solution?



^ permalink raw reply

* Re: [PATCH 0/3] power: some cleancode for cpufreq library
From: Thomas Monjalon @ 2026-06-10 23:14 UTC (permalink / raw)
  To: Huisong Li
  Cc: anatoly.burakov, sivaprasad.tummala, dev, stephen, yangxingui,
	zhanjie9, fengchengwen
In-Reply-To: <5a7c0fff-515c-425d-bd25-538cccabfb09@huawei.com>

11/05/2026 03:10, fengchengwen:
> Series-acked-by: Chengwen Feng <fengchengwen@huawei.com>
> 
> On 5/9/2026 4:45 PM, Huisong Li wrote:
> > Move some common definition to common header file.
> > 
> > Huisong Li (3):
> >   power: move power state structure to power cpufreq header
> >   power: unify decimal format macro for strtoul
> >   power: use common decimal macro definition

Applied, thanks.



^ permalink raw reply

* Re: [PATCH] power: fix duplicated typedef for setting uncore freq
From: Thomas Monjalon @ 2026-06-10 22:46 UTC (permalink / raw)
  To: Huisong Li
  Cc: anatoly.burakov, sivaprasad.tummala, stephen, dev, fengchengwen,
	yangxingui, zhanjie9
In-Reply-To: <20260507112754.3418377-1-lihuisong@huawei.com>

07/05/2026 13:27, Huisong Li:
> Remove a duplicated rte_power_set_uncore_freq_t definition.
> Also, this op is intended to set any available uncore frequency,
> not only the minimum or maximum one.
> 
> Fixes: ebe99d351a3f ("power: refactor uncore power management")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Huisong Li <lihuisong@huawei.com>

Applied, thanks.




^ permalink raw reply

* Re: [PATCH] power: fix off-by-one in uncore env bounds check
From: Thomas Monjalon @ 2026-06-10 22:36 UTC (permalink / raw)
  To: Denis Sergeev; +Cc: dev, stable, anatoly.burakov, sivaprasad.tummala, sdl.dpdk
In-Reply-To: <20260603042205.116191-1-denserg.edu@gmail.com>

03/06/2026 06:21, Denis Sergeev:
> The condition in rte_power_set_uncore_env() uses '<=' instead of '<'
> when comparing the env argument against the size of uncore_env_str[].
> Since RTE_DIM(uncore_env_str) equals 4 and valid indices are 0..3,
> a caller passing env=4 bypasses the guard and causes an out-of-bounds
> read of uncore_env_str[4] at two sites within the same block.
> 
> Fix by replacing '<=' with '<', consistent with the correct pattern
> already used in rte_power_uncore_init() in the same file.
> 
> Found by Linux Verification Center (linuxtesting.org) with SVACE.
> 
> Fixes: ac1edcb6621a ("power: refactor uncore power management API")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Denis Sergeev <denserg.edu@gmail.com>

Applied, thanks.



^ permalink raw reply

* Re: [PATCH] power/amd_pstate: fix frequency matching for continuous scaling
From: Thomas Monjalon @ 2026-06-10 22:25 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, stable, Anatoly Burakov, Sivaprasad Tummala
In-Reply-To: <20260328193419.106100-1-stephen@networkplumber.org>

28/03/2026 20:34, Stephen Hemminger:
> The power_init_for_setting_freq() function fails on systems using the
> amd-pstate-epp driver because the current CPU frequency read from
> scaling_setspeed does not exactly match any of the synthesized
> frequency buckets. Unlike acpi_cpufreq which provides a discrete list
> of frequencies, amd-pstate operates with continuously variable
> frequencies, so an exact match will rarely succeed.
> 
> For example, on a Ryzen 9 7945HX the sysfs file reports 2797172
> which rounds to 2797000, but this value does not appear in the
> generated frequency table.
> 
> Replace the exact match lookup with a nearest-frequency search.
> 
[...]
> -	freq = strtoul(buf, NULL, POWER_CONVERT_TO_DECIMAL);
> +	errno = 0;
> +	freq = strtoul(buf, &endptr, POWER_CONVERT_TO_DECIMAL);
> +	if (errno != 0 || endptr == buf || freq == 0) {
> +		POWER_LOG(ERR, "Failed to parse frequency '%s' for lcore %u",
> +				buf, pi->lcore_id);
> +		goto err;
> +	}
>  
>  	/* convert the frequency to nearest 1000 value
>  	 * Ex: if freq=1396789 then freq_conv=1397000
>  	 * Ex: if freq=800030 then freq_conv=800000
>  	 */
> -	unsigned int freq_conv = 0;
> -	freq_conv = (freq + FREQ_ROUNDING_DELTA)
> -				/ ROUND_FREQ_TO_N_1000;
> +	freq_conv = (freq + FREQ_ROUNDING_DELTA) / ROUND_FREQ_TO_N_1000;
>  	freq_conv = freq_conv * ROUND_FREQ_TO_N_1000;
>  
> -	for (i = 0; i < pi->nb_freqs; i++) {
> -		if (freq_conv == pi->freqs[i]) {
> -			pi->curr_idx = i;
> -			pi->f = f;
> -			return 0;
> +	/* Find the nearest frequency in the table.
> +	 * With amd-pstate the CPU runs at continuously variable
> +	 * frequencies so the current frequency will not exactly
> +	 * match one of the synthesized frequency buckets.
> +	 */
> +	best_idx = 0;
> +	best_diff = abs_diff(freq_conv, pi->freqs[0]);
> +
> +	for (i = 1; i < pi->nb_freqs; i++) {
> +		diff = abs_diff(freq_conv, pi->freqs[i]);
> +		if (diff < best_diff) {
> +			best_diff = diff;
> +			best_idx = i;
>  		}
>  	}

GPT found this problem:

power_init_for_setting_freq() now assigns pi->curr_idx = best_idx
after finding the nearest synthesized frequency bucket.
However, set_freq_internal() skips the sysfs write
whenever idx == pi->curr_idx.

This means that if the current scaling_setspeed value is merely close
to a bucket but not equal to it, a later request to set that bucket
will return success without actually writing the requested frequency.
This can happen during init too: power_amd_pstate_cpufreq_init()
calls freq_max() after initialization, but if the current frequency
is nearest to the max bucket, freq_max() will be skipped even when
the actual sysfs value is not the synthesized max.
The nearest-bucket match should not be treated as an exact programmed
frequency, or the next explicit set to that bucket should be forced.



^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox