DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2 02/22] common/cnxk: add API of SA valid for cn20k platform
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

Add APIs to check whether an inbound or outbound IPsec SA is marked
valid on the cn20k platform.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/cnxk_security.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/common/cnxk/cnxk_security.c b/drivers/common/cnxk/cnxk_security.c
index 6b51055100..6f46ad3276 100644
--- a/drivers/common/cnxk/cnxk_security.c
+++ b/drivers/common/cnxk/cnxk_security.c
@@ -606,6 +606,20 @@ cnxk_ot_ipsec_outb_sa_valid(struct roc_ot_ipsec_outb_sa *sa)
 	return !!sa->w2.s.valid;
 }

+RTE_EXPORT_INTERNAL_SYMBOL(cnxk_ow_ipsec_inb_sa_valid)
+bool
+cnxk_ow_ipsec_inb_sa_valid(struct roc_ow_ipsec_inb_sa *sa)
+{
+	return !!sa->w2.s.valid;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(cnxk_ow_ipsec_outb_sa_valid)
+bool
+cnxk_ow_ipsec_outb_sa_valid(struct roc_ow_ipsec_outb_sa *sa)
+{
+	return !!sa->w2.s.valid;
+}
+
 RTE_EXPORT_INTERNAL_SYMBOL(cnxk_ipsec_ivlen_get)
 uint8_t
 cnxk_ipsec_ivlen_get(enum rte_crypto_cipher_algorithm c_algo,
--
2.34.1


^ permalink raw reply related

* [PATCH v2 03/22] common/cnxk: additional NIX SQ ctx fields prints
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

Add debug prints for the update_sq_count and sq_count_iova fields to
the CN20K NIX SQ context dump.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: fix Ubuntu clang stdatomic compile failure.

 drivers/common/cnxk/roc_nix_debug.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/common/cnxk/roc_nix_debug.c b/drivers/common/cnxk/roc_nix_debug.c
index 11994bf131..d4b2b86916 100644
--- a/drivers/common/cnxk/roc_nix_debug.c
+++ b/drivers/common/cnxk/roc_nix_debug.c
@@ -540,6 +540,8 @@ nix_cn10k_lf_sq_dump(__io struct nix_cn10k_sq_ctx_s *ctx, uint32_t *sqb_aura_p,
 static inline void
 nix_lf_sq_dump(__io struct nix_cn20k_sq_ctx_s *ctx, uint32_t *sqb_aura_p, FILE *file)
 {
+	int64_t *sq_cnt_ptr = NULL;
+
 	nix_dump(file, "W0: sqe_way_mask \t\t%d\nW0: cq \t\t\t\t%d",
 		 ctx->sqe_way_mask, ctx->cq);
 	nix_dump(file, "W0: sdp_mcast \t\t\t%d\nW0: substream \t\t\t0x%03x",
@@ -561,6 +563,7 @@ nix_lf_sq_dump(__io struct nix_cn20k_sq_ctx_s *ctx, uint32_t *sqb_aura_p, FILE *
 	nix_dump(file, "W2: smq_rr_count[ub:lb] \t\t%x:%x\n", ctx->smq_rr_count_ub,
 		 ctx->smq_rr_count_lb);

+	nix_dump(file, "W3: update_sq_count\t\t%d\n", ctx->update_sq_count);
 	nix_dump(file, "W3: smq_next_sq_vld\t\t%d\nW3: smq_pend\t\t\t%d",
 		 ctx->smq_next_sq_vld, ctx->smq_pend);
 	nix_dump(file, "W3: smenq_next_sqb_vld  \t%d\nW3: head_offset\t\t\t%d",
@@ -588,6 +591,12 @@ nix_lf_sq_dump(__io struct nix_cn20k_sq_ctx_s *ctx, uint32_t *sqb_aura_p, FILE *
 		 ctx->vfi_lso_sizem1);
 	nix_dump(file, "W9: vfi_lso_total\t\t%d", ctx->vfi_lso_total);

+	nix_dump(file, "W10: sq_count_iova \t\t0x%" PRIx64 "", (uint64_t)ctx->sq_count_iova);
+	sq_cnt_ptr = (int64_t *)(uintptr_t)(ctx->sq_count_iova << 3);
+	if (sq_cnt_ptr && ctx->update_sq_count)
+		nix_dump(file, "sq_count value \t\t0x%" PRIx64 "",
+			 plt_atomic_load_explicit((uint64_t __plt_atomic *)sq_cnt_ptr,
+						  plt_memory_order_relaxed));
 	nix_dump(file, "W10: scm_lso_rem \t\t0x%" PRIx64 "", (uint64_t)ctx->scm_lso_rem);
 	nix_dump(file, "W11: octs \t\t\t0x%" PRIx64 "", (uint64_t)ctx->octs);
 	nix_dump(file, "W12: pkts \t\t\t0x%" PRIx64 "", (uint64_t)ctx->pkts);
--
2.34.1


^ permalink raw reply related

* [PATCH v2 04/22] common/cnxk: update NIX irq handler
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

Move queue context dump and register print before interrupt
clear in NIX irq handler.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/roc_nix_irq.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/common/cnxk/roc_nix_irq.c b/drivers/common/cnxk/roc_nix_irq.c
index 2b731302cd..6874435a4e 100644
--- a/drivers/common/cnxk/roc_nix_irq.c
+++ b/drivers/common/cnxk/roc_nix_irq.c
@@ -168,7 +168,7 @@ nix_lf_q_irq_get_and_clear(struct nix *nix, uint16_t q, uint32_t off,
 	reg = roc_atomic64_add_nosync(wdata, (int64_t *)(nix->base + off));

 	if (reg & BIT_ULL(42) /* OP_ERR */) {
-		plt_err("Failed execute irq get off=0x%x", off);
+		plt_err("Failed execute irq get queue=%d off=0x%x", q, off);
 		return 0;
 	}
 	qint = reg & 0xff;
@@ -262,6 +262,10 @@ nix_lf_q_irq(void *param)
 	plt_err("Queue_intr=0x%" PRIx64 " qintx=%d pf=%d, vf=%d", intr, qintx,
 		dev->pf, dev->vf);

+	/* Dump registers to std out */
+	roc_nix_lf_reg_dump(nix_priv_to_roc_nix(nix), NULL);
+	roc_nix_queues_ctx_dump(nix_priv_to_roc_nix(nix), NULL);
+
 	/* Handle RQ interrupts */
 	for (q = 0; q < nix->nb_rx_queues; q++) {
 		rq = q % nix->qints;
@@ -323,10 +327,6 @@ nix_lf_q_irq(void *param)
 	/* Clear interrupt */
 	plt_write64(intr, nix->base + NIX_LF_QINTX_INT(qintx));

-	/* Dump registers to std out */
-	roc_nix_lf_reg_dump(nix_priv_to_roc_nix(nix), NULL);
-	roc_nix_queues_ctx_dump(nix_priv_to_roc_nix(nix), NULL);
-
 	/* Call reset callback */
 	if (intr_cb && dev->ops->q_err_cb)
 		dev->ops->q_err_cb(nix_priv_to_roc_nix(nix), NULL);
--
2.34.1


^ permalink raw reply related

* [PATCH v2 05/22] common/cnxk: configure LSO mask for single segments
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

Configure the LSO flags mask for single-segment packets.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/roc_nix_ops.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/common/cnxk/roc_nix_ops.c b/drivers/common/cnxk/roc_nix_ops.c
index 12a12c6e35..4653bb2049 100644
--- a/drivers/common/cnxk/roc_nix_ops.c
+++ b/drivers/common/cnxk/roc_nix_ops.c
@@ -239,6 +239,8 @@ nix_lso_ipv4(struct roc_nix *roc_nix)

 	/* First get flags profile to update v4 flags */
 	memset(&alt_flags, 0, sizeof(alt_flags));
+	alt_flags.s.alt_ssf_set = 0;
+	alt_flags.s.alt_ssf_mask = 0xFFFF;
 	alt_flags.s.alt_fsf_set = 0x2000;
 	alt_flags.s.alt_fsf_mask = 0x5FFF;
 	alt_flags.s.alt_msf_set = 0x2000;
--
2.34.1


^ permalink raw reply related

* [PATCH v2 06/22] net/cnxk: reserve memory for lookup mem at probe
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Nithin Dabilpuram <ndabilpuram@marvell.com>

Reserve the lookup memory, which is global to all cnxk ethdev
devices, at probe time to avoid a race at a later stage.

Signed-off-by: Nithin Dabilpuram <ndabilpuram@marvell.com>
---
Changes in v2: No change.

 drivers/net/cnxk/cnxk_ethdev.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/cnxk/cnxk_ethdev.c b/drivers/net/cnxk/cnxk_ethdev.c
index 06d1c9b362..c782dc51a8 100644
--- a/drivers/net/cnxk/cnxk_ethdev.c
+++ b/drivers/net/cnxk/cnxk_ethdev.c
@@ -2220,6 +2220,12 @@ cnxk_eth_dev_init(struct rte_eth_dev *eth_dev)
 	/* Register callback for inline meta pool create 1:N pool:aura */
 	roc_nix_inl_custom_meta_pool_cb_register(cnxk_nix_inl_custom_meta_pool_cb);

+	/* Reserve memory for lookup_memory */
+	if (!cnxk_nix_fastpath_lookup_mem_get()) {
+		plt_err("Failed to reserve lookup memory rc=%d", rc);
+		goto dev_fini;
+	}
+
 	dev->eth_dev = eth_dev;
 	dev->configured = 0;
 	dev->ptype_disable = 0;
--
2.34.1


^ permalink raw reply related

* [PATCH v2 07/22] drivers: add support for devargs skip size
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Kiran Kumar K <kirankumark@marvell.com>

Add support for the skip_size devargs to the cnxk driver.
This allows users to specify the number of bytes to skip before L2
during packet parsing.

Signed-off-by: Kiran Kumar K <kirankumark@marvell.com>
---
Changes in v2: No change.

 doc/guides/nics/cnxk.rst                      | 19 +++++++-
 drivers/common/cnxk/roc_mbox.h                | 12 ++++-
 drivers/common/cnxk/roc_nix.h                 | 14 ++----
 drivers/common/cnxk/roc_nix_ops.c             | 46 +++++++++++++++++--
 drivers/common/cnxk/roc_npc.c                 | 44 +++++++++++++++++-
 drivers/common/cnxk/roc_npc.h                 |  2 +
 drivers/common/cnxk/roc_npc_priv.h            | 11 +++++
 .../common/cnxk/roc_platform_base_symbols.c   |  1 +
 drivers/net/cnxk/cnxk_eswitch.c               |  2 +-
 drivers/net/cnxk/cnxk_ethdev.c                |  7 ++-
 drivers/net/cnxk/cnxk_ethdev_devargs.c        | 29 +++++++++++-
 11 files changed, 163 insertions(+), 24 deletions(-)

diff --git a/doc/guides/nics/cnxk.rst b/doc/guides/nics/cnxk.rst
index b5bd50ceea..239ebcd05c 100644
--- a/doc/guides/nics/cnxk.rst
+++ b/doc/guides/nics/cnxk.rst
@@ -183,8 +183,8 @@ Runtime Config Options

    With the above configuration, higig2 will be enabled on that port and the
    traffic on this port should be higig2 traffic only. Supported switch header
-   types are "chlen24b", "chlen90b", "dsa", "exdsa", "higig2", "vlan_exdsa" and
-   "pre_l2".
+   types are "chlen24b", "chlen90b", "dsa", "exdsa", "higig2", "vlan_exdsa",
+   "pre_l2" and "skip_size".

 - ``Flow pre_l2 info`` (default ``0x0/0x0/0x0``)

@@ -212,6 +212,21 @@ Runtime Config Options
    is 0 (i.e., left shift) then the shift count will be 1, that is, (8 - n),
    where n is the absolute position of leftmost set bit.

+- ``Skip size info`` (default ``0x0``)
+
+   When the switch header type is set to "skip_size", the number of bytes to
+   skip before the Ethernet header can be configured using ``skip_size_info``
+   ``devargs`` parameter. The value is in hexadecimal format and the valid
+   range is 0x0 to 0xff. This configures the PKIND so that the NPC parser
+   skips the specified number of bytes.
+
+   For example::
+
+      -a 0002:02:00.0,switch_header="skip_size",skip_size_info=0x2
+
+   With the above configuration, 2 bytes will be skipped before the Ethernet
+   header when parsing the incoming packets.
+
 - ``RSS tag as XOR`` (default ``0``)

    The HW gives two options to configure the RSS adder i.e
diff --git a/drivers/common/cnxk/roc_mbox.h b/drivers/common/cnxk/roc_mbox.h
index e31abf2234..1158ff50a7 100644
--- a/drivers/common/cnxk/roc_mbox.h
+++ b/drivers/common/cnxk/roc_mbox.h
@@ -462,8 +462,11 @@ struct ready_msg_rsp {
 };

 enum npc_pkind_type {
+	NPC_RX_SKIP_SIZE_PKIND = 46ULL,
+	NPC_RX_CPT_SKIP_SIZE_PKIND = 50ULL,
+	NPC_RX_CPT_HDR_PTP_PKIND = 54ULL,
 	NPC_RX_CUSTOM_PRE_L2_PKIND = 55ULL,
-	NPC_RX_VLAN_EXDSA_PKIND = 56ULL,
+	NPC_RX_VLAN_EXDSA_PKIND,
 	NPC_RX_CHLEN24B_PKIND,
 	NPC_RX_CPT_HDR_PKIND,
 	NPC_RX_CHLEN90B_PKIND,
@@ -474,6 +477,8 @@ enum npc_pkind_type {
 	NPC_TX_DEF_PKIND,
 };

+#define NPC_SKIP_SIZE_PKIND_MAX 4
+
 /* Struct to set pkind */
 struct npc_set_pkind {
 	struct mbox_msghdr hdr;
@@ -484,6 +489,7 @@ struct npc_set_pkind {
 #define ROC_PRIV_FLAGS_EXDSA	  BIT_ULL(4)
 #define ROC_PRIV_FLAGS_VLAN_EXDSA BIT_ULL(5)
 #define ROC_PRIV_FLAGS_PRE_L2	  BIT_ULL(6)
+#define ROC_PRIV_FLAGS_SKIP_SIZE  BIT_ULL(7)
 #define ROC_PRIV_FLAGS_CUSTOM	  BIT_ULL(63)
 	uint64_t __io mode;
 #define PKIND_TX BIT_ULL(0)
@@ -499,6 +505,10 @@ struct npc_set_pkind {
 	/* Shift direction to get length of the
 	 * header at var_len_off
 	 */
+	uint8_t __io skip_size;
+	/* Number of bytes to skip before the Ethernet header.
+	 * Valid only in case custom flag.
+	 */
 };

 /* Structure for requesting resource provisioning.
diff --git a/drivers/common/cnxk/roc_nix.h b/drivers/common/cnxk/roc_nix.h
index 8ba8b3e0b6..49ede85f9a 100644
--- a/drivers/common/cnxk/roc_nix.h
+++ b/drivers/common/cnxk/roc_nix.h
@@ -990,18 +990,14 @@ int __roc_api roc_nix_mac_stats_reset(struct roc_nix *roc_nix);
 int __roc_api roc_nix_mac_fwdata_get(struct roc_nix *roc_nix, struct roc_nix_mac_fwdata *fwdata);

 /* Ops */
-int __roc_api roc_nix_switch_hdr_set(struct roc_nix *roc_nix,
-				     uint64_t switch_header_type,
-				     uint8_t pre_l2_size_offset,
-				     uint8_t pre_l2_size_offset_mask,
-				     uint8_t pre_l2_size_shift_dir);
+int __roc_api roc_nix_switch_hdr_set(struct roc_nix *roc_nix, uint64_t switch_header_type,
+				     uint8_t pre_l2_size_offset, uint8_t pre_l2_size_offset_mask,
+				     uint8_t pre_l2_size_shift_dir, uint8_t skip_size);
 int __roc_api roc_nix_lso_fmt_setup(struct roc_nix *roc_nix);
-int __roc_api roc_nix_lso_fmt_get(struct roc_nix *roc_nix,
-				  uint8_t udp_tun[ROC_NIX_LSO_TUN_MAX],
+int __roc_api roc_nix_lso_fmt_get(struct roc_nix *roc_nix, uint8_t udp_tun[ROC_NIX_LSO_TUN_MAX],
 				  uint8_t tun[ROC_NIX_LSO_TUN_MAX]);
 int __roc_api roc_nix_lso_fmt_ipv4_frag_get(struct roc_nix *roc_nix);
-int __roc_api roc_nix_lso_custom_fmt_setup(struct roc_nix *roc_nix,
-					   struct nix_lso_format *fields,
+int __roc_api roc_nix_lso_custom_fmt_setup(struct roc_nix *roc_nix, struct nix_lso_format *fields,
 					   uint16_t nb_fields);
 int __roc_api roc_nix_lso_alt_flags_profile_setup(struct roc_nix *roc_nix,
 						  nix_lso_alt_flg_format_t *fmt);
diff --git a/drivers/common/cnxk/roc_nix_ops.c b/drivers/common/cnxk/roc_nix_ops.c
index 4653bb2049..13a548216b 100644
--- a/drivers/common/cnxk/roc_nix_ops.c
+++ b/drivers/common/cnxk/roc_nix_ops.c
@@ -501,17 +501,49 @@ roc_nix_lso_fmt_get(struct roc_nix *roc_nix,
 	return 0;
 }

+static int
+skip_size_pkind_get(uint8_t skip_size, uint8_t *pkind)
+{
+	struct skip_size_pkind_cfg *cfg;
+	const struct plt_memzone *mz;
+	int i;
+
+	mz = plt_memzone_lookup(SKIP_SIZE_PKIND_MEMZONE);
+	if (!mz)
+		return -ENOMEM;
+	cfg = mz->addr;
+
+	for (i = 0; i < cfg->count; i++) {
+		if (cfg->entries[i].skip_size == skip_size) {
+			*pkind = cfg->entries[i].pkind;
+			return 0;
+		}
+	}
+
+	if (cfg->count >= NPC_SKIP_SIZE_PKIND_MAX) {
+		plt_err("skip_size PKIND limit (%d) reached", NPC_SKIP_SIZE_PKIND_MAX);
+		return -ENOSPC;
+	}
+
+	i = cfg->count;
+	cfg->entries[i].skip_size = skip_size;
+	cfg->entries[i].pkind = NPC_RX_SKIP_SIZE_PKIND + i;
+	*pkind = cfg->entries[i].pkind;
+	cfg->count++;
+	return 0;
+}
+
 int
 roc_nix_switch_hdr_set(struct roc_nix *roc_nix, uint64_t switch_header_type,
-		       uint8_t pre_l2_size_offset,
-		       uint8_t pre_l2_size_offset_mask,
-		       uint8_t pre_l2_size_shift_dir)
+		       uint8_t pre_l2_size_offset, uint8_t pre_l2_size_offset_mask,
+		       uint8_t pre_l2_size_shift_dir, uint8_t skip_size)
 {
 	struct nix *nix = roc_nix_to_nix_priv(roc_nix);
 	struct dev *dev = &nix->dev;
 	struct mbox *mbox = mbox_get(dev->mbox);
 	struct npc_set_pkind *req;
 	struct msg_resp *rsp;
+	uint8_t pkind = 0;
 	int rc = -ENOSPC;

 	if (switch_header_type == 0)
@@ -524,6 +556,7 @@ roc_nix_switch_hdr_set(struct roc_nix *roc_nix, uint64_t switch_header_type,
 	    switch_header_type != ROC_PRIV_FLAGS_EXDSA &&
 	    switch_header_type != ROC_PRIV_FLAGS_VLAN_EXDSA &&
 	    switch_header_type != ROC_PRIV_FLAGS_PRE_L2 &&
+	    switch_header_type != ROC_PRIV_FLAGS_SKIP_SIZE &&
 	    switch_header_type != ROC_PRIV_FLAGS_CUSTOM) {
 		plt_err("switch header type is not supported");
 		rc = NIX_ERR_PARAM;
@@ -564,6 +597,13 @@ roc_nix_switch_hdr_set(struct roc_nix *roc_nix, uint64_t switch_header_type,
 		req->var_len_off = pre_l2_size_offset;
 		req->var_len_off_mask = pre_l2_size_offset_mask;
 		req->shift_dir = pre_l2_size_shift_dir;
+	} else if (switch_header_type == ROC_PRIV_FLAGS_SKIP_SIZE) {
+		rc = skip_size_pkind_get(skip_size, &pkind);
+		if (rc)
+			goto exit;
+		req->mode = ROC_PRIV_FLAGS_CUSTOM;
+		req->pkind = pkind;
+		req->skip_size = skip_size;
 	}

 	req->dir = PKIND_RX;
diff --git a/drivers/common/cnxk/roc_npc.c b/drivers/common/cnxk/roc_npc.c
index a906fe0413..111ad0e8bb 100644
--- a/drivers/common/cnxk/roc_npc.c
+++ b/drivers/common/cnxk/roc_npc.c
@@ -420,6 +420,16 @@ roc_npc_init(struct roc_npc *roc_npc)

 	roc_npc->flow_age.age_flow_refcnt = 0;

+	/* Create skip-size PKIND memzone if it doesn't exist */
+	if (!plt_memzone_lookup(SKIP_SIZE_PKIND_MEMZONE)) {
+		const struct plt_memzone *mz;
+
+		mz = plt_memzone_reserve_cache_align(SKIP_SIZE_PKIND_MEMZONE,
+						     sizeof(struct skip_size_pkind_cfg));
+		if (mz != NULL)
+			memset(mz->addr, 0, sizeof(struct skip_size_pkind_cfg));
+	}
+
 	return rc;

 done:
@@ -457,12 +467,42 @@ roc_npc_fini(struct roc_npc *roc_npc)
 		npc->prio_flow_list = NULL;
 	}

+	{
+		const struct plt_memzone *mz;
+
+		mz = plt_memzone_lookup(SKIP_SIZE_PKIND_MEMZONE);
+		if (mz)
+			plt_memzone_free(mz);
+	}
+
 	return 0;
 }

 int
-roc_npc_validate_portid_action(struct roc_npc *roc_npc_src,
-			       struct roc_npc *roc_npc_dst)
+roc_npc_skip_size_pkind_get(struct roc_npc *roc_npc)
+{
+	struct skip_size_pkind_cfg *cfg;
+	const struct plt_memzone *mz;
+	int i;
+
+	if (roc_npc->switch_header_type != ROC_PRIV_FLAGS_SKIP_SIZE)
+		return -1;
+
+	mz = plt_memzone_lookup(SKIP_SIZE_PKIND_MEMZONE);
+	if (!mz)
+		return -1;
+	cfg = mz->addr;
+
+	for (i = 0; i < cfg->count; i++) {
+		if (cfg->entries[i].skip_size == roc_npc->skip_size)
+			return cfg->entries[i].pkind + NPC_SKIP_SIZE_PKIND_MAX;
+	}
+
+	return -1;
+}
+
+int
+roc_npc_validate_portid_action(struct roc_npc *roc_npc_src, struct roc_npc *roc_npc_dst)
 {
 	struct roc_nix *roc_nix_src = roc_npc_src->roc_nix;
 	struct nix *nix_src = roc_nix_to_nix_priv(roc_nix_src);
diff --git a/drivers/common/cnxk/roc_npc.h b/drivers/common/cnxk/roc_npc.h
index 130990bda7..a7254f35ca 100644
--- a/drivers/common/cnxk/roc_npc.h
+++ b/drivers/common/cnxk/roc_npc.h
@@ -423,6 +423,7 @@ struct roc_npc {
 					  */
 	uint8_t pre_l2_size_shift_dir;	 /**< Shift direction to calculate size
 					  */
+	uint8_t skip_size;		 /**< Switch header skip size */
 	uint16_t flow_prealloc_size;
 	uint16_t flow_max_priority;
 	uint16_t channel;
@@ -506,4 +507,5 @@ void __roc_api roc_npc_sdp_channel_get(struct roc_npc *roc_npc, uint16_t *chan_b
 				       uint16_t *chan_mask);
 int __roc_api roc_npc_mcam_get_stats(struct roc_npc *roc_npc, struct roc_npc_flow *flow,
 				     uint64_t *count);
+int __roc_api roc_npc_skip_size_pkind_get(struct roc_npc *roc_npc);
 #endif /* _ROC_NPC_H_ */
diff --git a/drivers/common/cnxk/roc_npc_priv.h b/drivers/common/cnxk/roc_npc_priv.h
index f8f4489f06..6a27f0e0fa 100644
--- a/drivers/common/cnxk/roc_npc_priv.h
+++ b/drivers/common/cnxk/roc_npc_priv.h
@@ -554,4 +554,15 @@ int npc_aging_ctrl_thread_create(struct roc_npc *roc_npc, const struct roc_npc_a
 				 struct roc_npc_flow *flow);
 void npc_aging_ctrl_thread_destroy(struct roc_npc *roc_npc);
 int npc_rss_free_grp_get(struct npc *npc, uint32_t *pos);
+
+#define SKIP_SIZE_PKIND_MEMZONE "roc_skip_size_pkind_cfg"
+
+struct skip_size_pkind_cfg {
+	uint8_t count;
+	struct {
+		uint8_t skip_size;
+		uint8_t pkind;
+	} entries[NPC_SKIP_SIZE_PKIND_MAX];
+};
+
 #endif /* _ROC_NPC_PRIV_H_ */
diff --git a/drivers/common/cnxk/roc_platform_base_symbols.c b/drivers/common/cnxk/roc_platform_base_symbols.c
index ed34d4b05b..d1c9f2304d 100644
--- a/drivers/common/cnxk/roc_platform_base_symbols.c
+++ b/drivers/common/cnxk/roc_platform_base_symbols.c
@@ -492,6 +492,7 @@ RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_fini)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_validate_portid_action)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_flow_parse)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_sdp_channel_get)
+RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_skip_size_pkind_get)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_flow_create)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_flow_destroy)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_npc_flow_dump)
diff --git a/drivers/net/cnxk/cnxk_eswitch.c b/drivers/net/cnxk/cnxk_eswitch.c
index e45c7dfd07..7e717a2fbf 100644
--- a/drivers/net/cnxk/cnxk_eswitch.c
+++ b/drivers/net/cnxk/cnxk_eswitch.c
@@ -553,7 +553,7 @@ nix_lf_setup(struct cnxk_eswitch_dev *eswitch_dev)
 		goto free_cqs;
 	}

-	rc = roc_nix_switch_hdr_set(nix, 0, 0, 0, 0);
+	rc = roc_nix_switch_hdr_set(nix, 0, 0, 0, 0, 0);
 	if (rc) {
 		plt_err("switch hdr set failed = %s(%d)", roc_error_msg_get(rc), rc);
 		goto free_cqs;
diff --git a/drivers/net/cnxk/cnxk_ethdev.c b/drivers/net/cnxk/cnxk_ethdev.c
index c782dc51a8..a21e170229 100644
--- a/drivers/net/cnxk/cnxk_ethdev.c
+++ b/drivers/net/cnxk/cnxk_ethdev.c
@@ -1639,10 +1639,9 @@ cnxk_nix_configure(struct rte_eth_dev *eth_dev)
 		goto free_nix_lf;
 	}

-	rc = roc_nix_switch_hdr_set(nix, dev->npc.switch_header_type,
-				    dev->npc.pre_l2_size_offset,
+	rc = roc_nix_switch_hdr_set(nix, dev->npc.switch_header_type, dev->npc.pre_l2_size_offset,
 				    dev->npc.pre_l2_size_offset_mask,
-				    dev->npc.pre_l2_size_shift_dir);
+				    dev->npc.pre_l2_size_shift_dir, dev->npc.skip_size);
 	if (rc) {
 		plt_err("Failed to enable switch type nix_lf rc=%d", rc);
 		goto free_nix_lf;
@@ -2364,7 +2363,7 @@ cnxk_eth_dev_uninit(struct rte_eth_dev *eth_dev, bool reset)
 		return 0;

 	/* Disable switch hdr pkind */
-	roc_nix_switch_hdr_set(&dev->nix, 0, 0, 0, 0);
+	roc_nix_switch_hdr_set(&dev->nix, 0, 0, 0, 0, 0);

 	/* Clear the flag since we are closing down */
 	dev->configured = 0;
diff --git a/drivers/net/cnxk/cnxk_ethdev_devargs.c b/drivers/net/cnxk/cnxk_ethdev_devargs.c
index da8fc83f9d..ea18090919 100644
--- a/drivers/net/cnxk/cnxk_ethdev_devargs.c
+++ b/drivers/net/cnxk/cnxk_ethdev_devargs.c
@@ -239,6 +239,25 @@ parse_switch_header_type(const char *key, const char *value, void *extra_args)
 	if (strcmp(value, "pre_l2") == 0)
 		*(uint16_t *)extra_args = ROC_PRIV_FLAGS_PRE_L2;

+	if (strcmp(value, "skip_size") == 0)
+		*(uint16_t *)extra_args = ROC_PRIV_FLAGS_SKIP_SIZE;
+
+	return 0;
+}
+
+static int
+parse_skip_size_info(const char *key, const char *value, void *extra_args)
+{
+	RTE_SET_USED(key);
+	uint32_t val;
+
+	errno = 0;
+	val = strtoul(value, NULL, 0);
+	if (errno || val > 255)
+		return -EINVAL;
+
+	*(uint16_t *)extra_args = val;
+
 	return 0;
 }

@@ -303,6 +322,7 @@ parse_val_u16(const char *key, const char *value, void *extra_args)
 #define CNXK_FORCE_TAIL_DROP	  "force_tail_drop"
 #define CNXK_DIS_XQE_DROP	  "disable_xqe_drop"
 #define CNXK_RXC_STEP		  "rxc_step"
+#define CNXK_SKIP_SIZE_INFO	  "skip_size_info"

 int
 cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
@@ -317,6 +337,7 @@ cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 	uint16_t custom_meta_aura_dis = 0;
 	uint16_t flow_prealloc_size = 1;
 	uint16_t switch_header_type = 0;
+	uint16_t skip_size_info = 0;
 	uint16_t flow_max_priority = 3;
 	uint16_t outb_nb_crypto_qs = 1;
 	uint32_t ipsec_in_min_spi = 0;
@@ -392,6 +413,8 @@ cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 	rte_kvargs_process(kvlist, CNXK_FORCE_TAIL_DROP, &parse_flag, &force_tail_drop);
 	rte_kvargs_process(kvlist, CNXK_DIS_XQE_DROP, &parse_flag, &dis_xqe_drop);
 	rte_kvargs_process(kvlist, CNXK_RXC_STEP, &parse_rxc_step, &rxc_step);
+	rte_kvargs_process(kvlist, CNXK_SKIP_SIZE_INFO, &parse_skip_size_info,
+			   &skip_size_info);
 	rte_kvargs_free(kvlist);

 null_devargs:
@@ -424,6 +447,7 @@ cnxk_ethdev_parse_devargs(struct rte_devargs *devargs, struct cnxk_eth_dev *dev)
 		dev->npc.flow_max_priority = flow_max_priority;

 	dev->npc.switch_header_type = switch_header_type;
+	dev->npc.skip_size = skip_size_info;
 	dev->npc.sdp_channel = sdp_chan.channel;
 	dev->npc.sdp_channel_mask = sdp_chan.mask;
 	dev->npc.is_sdp_mask_set = sdp_chan.is_sdp_mask_set;
@@ -448,7 +472,7 @@ RTE_PMD_REGISTER_PARAM_STRING(net_cnxk,
 			      CNXK_MAX_SQB_COUNT "=<8-512>"
 			      CNXK_FLOW_PREALLOC_SIZE "=<1-32>"
 			      CNXK_FLOW_MAX_PRIORITY "=<1-32>"
-			      CNXK_SWITCH_HEADER_TYPE "=<higig2|dsa|chlen90b>"
+			      CNXK_SWITCH_HEADER_TYPE "=<higig2|dsa|chlen90b|skip_size>"
 			      CNXK_RSS_TAG_AS_XOR "=1"
 			      CNXK_IPSEC_IN_MAX_SPI "=<1-65535>"
 			      CNXK_OUTB_NB_DESC "=<1-65535>"
@@ -463,4 +487,5 @@ RTE_PMD_REGISTER_PARAM_STRING(net_cnxk,
 			      CNXK_CUSTOM_META_AURA_DIS "=1"
 			      CNXK_FORCE_TAIL_DROP "=1"
 			      CNXK_DIS_XQE_DROP "=1"
-			      CNXK_RXC_STEP "=<0-1048575>");
+			      CNXK_RXC_STEP "=<0-1048575>"
+			      CNXK_SKIP_SIZE_INFO "=<0x0-0xff>");
--
2.34.1


^ permalink raw reply related

* [PATCH v2 08/22] net/cnxk: update inbound SA pkind for skip size
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rakesh Kudurumalla
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Rakesh Kudurumalla <rkudurumalla@marvell.com>

Update the inbound SA pkind using roc_npc_skip_size_pkind_get()
during session create and session update for both CN10K and CN20K.
This ensures the CPT second pass uses the correct pkind when
skip size is configured, retaining the default pkind otherwise.

Signed-off-by: Rakesh Kudurumalla <rkudurumalla@marvell.com>
---
Changes in v2: No change.

 drivers/net/cnxk/cn10k_ethdev_sec.c | 9 +++++++++
 drivers/net/cnxk/cn20k_ethdev_sec.c | 8 ++++++++
 2 files changed, 17 insertions(+)

diff --git a/drivers/net/cnxk/cn10k_ethdev_sec.c b/drivers/net/cnxk/cn10k_ethdev_sec.c
index 855bea1796..2f1fdf34fc 100644
--- a/drivers/net/cnxk/cn10k_ethdev_sec.c
+++ b/drivers/net/cnxk/cn10k_ethdev_sec.c
@@ -853,6 +853,10 @@ cn10k_eth_sec_session_create(void *device,
 			goto err;
 		}

+		rc = roc_npc_skip_size_pkind_get(&dev->npc);
+		if (rc >= 0)
+			inb_sa_dptr->w0.s.pkind = rc;
+
 		inb_priv = roc_nix_inl_ot_ipsec_inb_sa_sw_rsvd(inb_sa);
 		/* Back pointer to get eth_sec */
 		inb_priv->eth_sec = eth_sec;
@@ -1151,6 +1155,11 @@ cn10k_eth_sec_session_update(void *device, struct rte_security_session *sess,
 		rc = cnxk_ot_ipsec_inb_sa_fill(inb_sa_dptr, ipsec, crypto, 0);
 		if (rc)
 			goto err;
+
+		rc = roc_npc_skip_size_pkind_get(&dev->npc);
+		if (rc >= 0)
+			inb_sa_dptr->w0.s.pkind = rc;
+
 		/* Use cookie for original data */
 		inb_sa_dptr->w1.s.cookie = inb_sa->w1.s.cookie;

diff --git a/drivers/net/cnxk/cn20k_ethdev_sec.c b/drivers/net/cnxk/cn20k_ethdev_sec.c
index 5d0debb81d..31f2518ea3 100644
--- a/drivers/net/cnxk/cn20k_ethdev_sec.c
+++ b/drivers/net/cnxk/cn20k_ethdev_sec.c
@@ -865,6 +865,10 @@ cn20k_eth_sec_session_create(void *device, struct rte_security_session_conf *con
 			goto err;
 		}

+		rc = roc_npc_skip_size_pkind_get(&dev->npc);
+		if (rc >= 0)
+			inb_sa_dptr->w0.s.pkind = rc;
+
 		cn20k_eth_sec_inb_sa_misc_fill(inb_sa_dptr, ipsec);

 		inb_priv = roc_nix_inl_ow_ipsec_inb_sa_sw_rsvd(inb_sa);
@@ -1137,6 +1141,10 @@ cn20k_eth_sec_session_update(void *device, struct rte_security_session *sess,
 		if (rc)
 			return -EINVAL;

+		rc = roc_npc_skip_size_pkind_get(&dev->npc);
+		if (rc >= 0)
+			inb_sa_dptr->w0.s.pkind = rc;
+
 		cn20k_eth_sec_inb_sa_misc_fill(inb_sa_dptr, ipsec);

 		/* Use cookie for original data */
--
2.34.1


^ permalink raw reply related

* [PATCH v2 09/22] common/cnxk: fix cnxk xstats names
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Thomas Monjalon, Nithin Dabilpuram, Kiran Kumar K,
	Sunil Kumar Kori, Satha Rao, Harman Kalra, Rakesh Kudurumalla
  Cc: jerinj, Alok Mishra, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Alok Mishra <almishra@marvell.com>

Prevent out-of-bounds writes when the application provides a smaller
xstats name array. Return the required count when xstats_names is NULL
or when the provided buffer is too small.

Fixes: 825bd1d9d8e6 ("common/cnxk: update extra stats for inline device")
Cc: stable@dpdk.org

Signed-off-by: Alok Mishra <almishra@marvell.com>
---
Changes in v2: No change.

 .mailmap                            |  1 +
 drivers/common/cnxk/roc_nix_stats.c | 46 ++++++++++++++++-------------
 2 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/.mailmap b/.mailmap
index 0e0d83e1c6..efcb38b6bd 100644
--- a/.mailmap
+++ b/.mailmap
@@ -80,6 +80,7 @@ Alin Rauta <alin.rauta@intel.com>
 Allain Legacy <allain.legacy@windriver.com>
 Allen Hubbe <allen.hubbe@amd.com>
 Alok Makhariya <alok.makhariya@nxp.com>
+Alok Mishra <almishra@marvell.com>
 Alok Prasad <palok@marvell.com>
 Alvaro Karsz <alvaro.karsz@solid-run.com>
 Alvin Zhang <alvinx.zhang@intel.com>
diff --git a/drivers/common/cnxk/roc_nix_stats.c b/drivers/common/cnxk/roc_nix_stats.c
index 6f241c72de..ec2aca8164 100644
--- a/drivers/common/cnxk/roc_nix_stats.c
+++ b/drivers/common/cnxk/roc_nix_stats.c
@@ -503,46 +503,51 @@ roc_nix_xstats_names_get(struct roc_nix *roc_nix,
 	struct idev_cfg *idev = idev_get_cfg();
 	uint64_t i, count = 0;

-	PLT_SET_USED(limit);
-
 	for (i = 0; i < CNXK_NIX_NUM_TX_XSTATS; i++) {
-		NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_tx_xstats, i);
+		if (xstats_names && count < limit)
+			NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_tx_xstats, i);
 		count++;
 	}

 	for (i = 0; i < CNXK_NIX_NUM_RX_XSTATS; i++) {
-		NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_rx_xstats, i);
+		if (xstats_names && count < limit)
+			NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_rx_xstats, i);
 		count++;
 	}

 	if (nix->inb_inl_dev && idev) {
 		if (idev->nix_inl_dev) {
 			for (i = 0; i < CNXK_INL_NIX_NUM_RX_XSTATS; i++) {
-				NIX_XSTATS_NAME_PRINT(xstats_names, count,
-						      inl_nix_rx_xstats, i);
+				if (xstats_names && count < limit)
+					NIX_XSTATS_NAME_PRINT(xstats_names, count,
+							      inl_nix_rx_xstats, i);
 				count++;
 			}
 			for (i = 0; i < CNXK_INL_NIX_RQ_XSTATS; i++) {
-				NIX_XSTATS_NAME_PRINT(xstats_names, count,
-						      inl_nix_rq_xstats, i);
+				if (xstats_names && count < limit)
+					NIX_XSTATS_NAME_PRINT(xstats_names, count,
+							      inl_nix_rq_xstats, i);
 				count++;
 			}
 			for (i = 0; i < PLT_DIM(inl_sw_xstats); i++) {
-				NIX_XSTATS_NAME_PRINT(xstats_names, count, inl_sw_xstats, i);
+				if (xstats_names && count < limit)
+					NIX_XSTATS_NAME_PRINT(xstats_names, count, inl_sw_xstats,
+							      i);
 				count++;
 			}
 		}
 	}

 	for (i = 0; i < CNXK_NIX_NUM_QUEUE_XSTATS; i++) {
-		NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_q_xstats, i);
+		if (xstats_names && count < limit)
+			NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_q_xstats, i);
 		count++;
 	}

 	if (roc_model_is_cn10k() || roc_model_is_cn20k()) {
 		for (i = 0; i < CNXK_NIX_NUM_CN10K_RX_XSTATS; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_cn10k_rx_xstats, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_cn10k_rx_xstats, i);
 			count++;
 		}
 	}
@@ -552,30 +557,29 @@ roc_nix_xstats_names_get(struct roc_nix *roc_nix,

 	if (roc_model_is_cn9k()) {
 		for (i = 0; i < CNXK_NIX_NUM_RX_XSTATS_CGX; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_rx_xstats_cgx, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_rx_xstats_cgx, i);
 			count++;
 		}

 		for (i = 0; i < CNXK_NIX_NUM_TX_XSTATS_CGX; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_tx_xstats_cgx, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_tx_xstats_cgx, i);
 			count++;
 		}

 	} else {
 		for (i = 0; i < CNXK_NIX_NUM_RX_XSTATS_RPM; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_rx_xstats_rpm, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_rx_xstats_rpm, i);
 			count++;
 		}

 		for (i = 0; i < CNXK_NIX_NUM_TX_XSTATS_RPM; i++) {
-			NIX_XSTATS_NAME_PRINT(xstats_names, count,
-					      nix_tx_xstats_rpm, i);
+			if (xstats_names && count < limit)
+				NIX_XSTATS_NAME_PRINT(xstats_names, count, nix_tx_xstats_rpm, i);
 			count++;
 		}
 	}
-
 	return count;
 }
--
2.34.1


^ permalink raw reply related

* [PATCH v2 10/22] common/cnxk: fix event type for soft expiry
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, Stephen Hemminger
  Cc: jerinj, Rahul Bhansali
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

Set the event type to default for inline soft expiry processing. The
event word passed to the work callback was previously left uninitialized.

Fixes: 4a6154a7bd27 ("common/cnxk: fix array out-of-bounds")

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/roc_nix_inl_dev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/common/cnxk/roc_nix_inl_dev.c b/drivers/common/cnxk/roc_nix_inl_dev.c
index 667209b8a0..bfdeef2688 100644
--- a/drivers/common/cnxk/roc_nix_inl_dev.c
+++ b/drivers/common/cnxk/roc_nix_inl_dev.c
@@ -1234,6 +1234,7 @@ inl_outb_soft_exp_poll(struct nix_inl_dev *inl_dev, uint32_t ring_idx)

 		if (sa != NULL) {
 			uint64_t tmp[2];
+			tmp[0] = ~0ULL;
 			inl_dev->work_cb(tmp, sa, NIX_INL_SOFT_EXPIRY_THRD, NULL, port_id);
 			__atomic_store_n(ring_base + tail_l + 1, 0ULL, __ATOMIC_RELAXED);
 			__atomic_fetch_add((uint32_t *)ring_base, 1, __ATOMIC_ACQ_REL);
--
2.34.1


^ permalink raw reply related

* [PATCH v2 11/22] net/cnxk: enable CPT CQ by default for inline IPsec
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Aarnav JP
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

CPT Completion Queue is supported on CN20K and provides
hardware-based completion notification, eliminating the
need for software polling. Change the default value of
cpt_cq_enable devarg from 0 to 1 so that CPT CQ is
enabled by default.

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 doc/guides/nics/cnxk.rst           | 13 +++++++++++++
 drivers/net/cnxk/cnxk_ethdev_sec.c |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/doc/guides/nics/cnxk.rst b/doc/guides/nics/cnxk.rst
index 239ebcd05c..c71029e1dc 100644
--- a/doc/guides/nics/cnxk.rst
+++ b/doc/guides/nics/cnxk.rst
@@ -745,6 +745,19 @@ Runtime Config Options for inline device
    With the above configuration, inline inbound IPsec post-processing
    should be done by the application.

+- ``Enable CPT Completion Queue for inline IPsec`` (default ``1`` for CN20K, ``0`` otherwise)
+
+   CPT Completion Queue for inline IPsec event delivery can be enabled or disabled
+   by ``cpt_cq_enable`` devargs parameter.
+   This option is supported on OCTEON CN20K SoC family.
+
+   For example::
+
+      -a 0002:1d:00.0,cpt_cq_enable=1
+
+   With the above configuration, driver would enable CPT completion queue
+   for inline IPsec event delivery instead of using the err-ring poll thread.
+
 Port Representors
 -----------------

diff --git a/drivers/net/cnxk/cnxk_ethdev_sec.c b/drivers/net/cnxk/cnxk_ethdev_sec.c
index fa7eacfbe4..61eb55ba43 100644
--- a/drivers/net/cnxk/cnxk_ethdev_sec.c
+++ b/drivers/net/cnxk/cnxk_ethdev_sec.c
@@ -742,7 +742,7 @@ nix_inl_parse_devargs(struct rte_devargs *devargs,
 	uint32_t meta_buf_sz = 0;
 	uint8_t rx_inj_ena = 0;
 	uint8_t selftest = 0;
-	uint8_t cpt_cq_enable = 0;
+	uint8_t cpt_cq_enable = roc_feature_nix_has_cpt_cq_support() ? 1 : 0;

 	memset(&cpt_channel, 0, sizeof(cpt_channel));

--
2.34.1


^ permalink raw reply related

* [PATCH v2 12/22] net/cnxk: fix unsigned integer underflow in LSO calculation
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, Pavan Nikhilesh, Jerin Jacob,
	Rahul Bhansali
  Cc: Aarnav JP, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

Replace branchless mask-based selection with a ternary operator
to resolve a Coverity integer underflow warning. The expression
-(!w1.il3type) assigned -1 to a uint64_t variable, which is
well-defined but flagged as an unsigned integer underflow.

Coverity issue: 502004

Fixes: 19f3af2371a7 ("net/cnxk: add Tx burst for CN10K")
Fixes: 39dc567c1955 ("net/cnxk: add Tx burst for CN9K")
Fixes: 006c1daa89b9 ("net/cnxk: support Tx burst scalar for CN20K")
Cc: stable@dpdk.org

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 drivers/net/cnxk/cn10k_tx.h | 8 ++------
 drivers/net/cnxk/cn20k_tx.h | 8 ++------
 drivers/net/cnxk/cn9k_tx.h  | 8 ++------
 3 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 8c912a1f35..d5cb2c3294 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -1138,10 +1138,8 @@ cn10k_nix_xmit_prepare(struct cn10k_eth_txq *txq,
 	if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_TSO_F &&
 	    (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
 		uint16_t lso_sb;
-		uint64_t mask;

-		mask = -(!w1.il3type);
-		lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
+		lso_sb = (w1.il3type ? w1.il4ptr : w1.ol4ptr) + m->l4_len;

 		send_hdr_ext->w0.lso_sb = lso_sb;
 		send_hdr_ext->w0.lso = 1;
@@ -1766,13 +1764,11 @@ cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 		      const uint64_t flags, const uint64_t lso_tun_fmt)
 {
 	uint16_t lso_sb;
-	uint64_t mask;

 	if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
 		return;

-	mask = -(!w1->il3type);
-	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+	lso_sb = (w1->il3type ? w1->il4ptr : w1->ol4ptr) + m->l4_len;

 	w0->u |= BIT(14);
 	w0->lso_sb = lso_sb;
diff --git a/drivers/net/cnxk/cn20k_tx.h b/drivers/net/cnxk/cn20k_tx.h
index 8e64d2e352..a1c71f2761 100644
--- a/drivers/net/cnxk/cn20k_tx.h
+++ b/drivers/net/cnxk/cn20k_tx.h
@@ -1117,10 +1117,8 @@ cn20k_nix_xmit_prepare(struct cn20k_eth_txq *txq, struct rte_mbuf *m, struct rte
 	if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_TSO_F &&
 	    (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
 		uint16_t lso_sb;
-		uint64_t mask;

-		mask = -(!w1.il3type);
-		lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
+		lso_sb = (w1.il3type ? w1.il4ptr : w1.ol4ptr) + m->l4_len;

 		send_hdr_ext->w0.lso_sb = lso_sb;
 		send_hdr_ext->w0.lso = 1;
@@ -1732,13 +1730,11 @@ cn20k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1, union nix
 		      uint64_t ol_flags, const uint64_t flags, const uint64_t lso_tun_fmt)
 {
 	uint16_t lso_sb;
-	uint64_t mask;

 	if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
 		return;

-	mask = -(!w1->il3type);
-	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+	lso_sb = (w1->il3type ? w1->il4ptr : w1->ol4ptr) + m->l4_len;

 	w0->u |= BIT(14);
 	w0->lso_sb = lso_sb;
diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 0ec448e36c..2f9b936d56 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -478,10 +478,8 @@ cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_m

 	if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
 		uint16_t lso_sb;
-		uint64_t mask;

-		mask = -(!w1.il3type);
-		lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
+		lso_sb = (w1.il3type ? w1.il4ptr : w1.ol4ptr) + m->l4_len;

 		send_hdr_ext->w0.lso_sb = lso_sb;
 		send_hdr_ext->w0.lso = 1;
@@ -875,13 +873,11 @@ cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
 		     uint64_t flags)
 {
 	uint16_t lso_sb;
-	uint64_t mask;

 	if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
 		return;

-	mask = -(!w1->il3type);
-	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
+	lso_sb = (w1->il3type ? w1->il4ptr : w1->ol4ptr) + m->l4_len;

 	w0->u |= BIT(14);
 	w0->lso_sb = lso_sb;
--
2.34.1


^ permalink raw reply related

* [PATCH v2 13/22] net/cnxk: derive ethdev from SA for inbound CPT CQ events
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Aarnav JP
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

With inbound CPT CQ events, port_id is unavailable since the inline
device has no roc_nix. Resolve eth_dev for inbound CQ events in the
cn20k PMD callback via the SA private data chain instead of relying
on port_id.

Add an eth_dev back pointer to cnxk_eth_sec_sess, populated at
session creation, to complete the SA-to-ethdev lookup path
(inb_priv->eth_sec->eth_dev).

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 drivers/net/cnxk/cn20k_ethdev_sec.c | 22 ++++++++++++++++++++--
 drivers/net/cnxk/cnxk_ethdev.h      |  3 +++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/net/cnxk/cn20k_ethdev_sec.c b/drivers/net/cnxk/cn20k_ethdev_sec.c
index 31f2518ea3..a5be85901f 100644
--- a/drivers/net/cnxk/cn20k_ethdev_sec.c
+++ b/drivers/net/cnxk/cn20k_ethdev_sec.c
@@ -541,7 +541,6 @@ cn20k_eth_sec_sso_work_cb(uint64_t *gw, void *args, enum nix_inl_event_type type
 	uintptr_t nixtx;
 	uint8_t port;

-	RTE_SET_USED(args);
 	plt_nix_dbg("Received %s event", get_inl_event_type(type));

 	switch ((gw[0] >> 28) & 0xF) {
@@ -561,8 +560,26 @@ cn20k_eth_sec_sso_work_cb(uint64_t *gw, void *args, enum nix_inl_event_type type
 		/* Fall through */
 	default:
 		if (type) {
-			eth_dev = &rte_eth_devices[port_id];
 			struct cpt_cq_s *cqs = (struct cpt_cq_s *)cq_s;
+
+			if (type == NIX_INL_INB_CPT_CQ) {
+				struct cn20k_inb_priv_data *inb_priv;
+
+				inb_priv = roc_nix_inl_ow_ipsec_inb_sa_sw_rsvd(args);
+				if (inb_priv->eth_sec && inb_priv->eth_sec->eth_dev) {
+					eth_dev = inb_priv->eth_sec->eth_dev;
+				} else {
+					plt_err("Inbound CPT CQ event: no eth_dev in SA priv");
+					return;
+				}
+			} else {
+				if (port_id >= RTE_MAX_ETHPORTS) {
+					plt_err("CPT CQ event: invalid port_id %u", port_id);
+					return;
+				}
+				eth_dev = &rte_eth_devices[port_id];
+			}
+
 			if (type < NIX_INL_SSO) {
 				cn20k_eth_sec_post_event(eth_dev, args, type,
 							 (uint16_t)cqs->w0.s.uc_compcode,
@@ -804,6 +821,7 @@ cn20k_eth_sec_session_create(void *device, struct rte_security_session_conf *con
 	inl_dev = !!dev->inb.inl_dev;

 	memset(eth_sec, 0, sizeof(struct cnxk_eth_sec_sess));
+	eth_sec->eth_dev = eth_dev;
 	sess_priv.u64 = 0;

 	lock = inbound ? &dev->inb.lock : &dev->outb.lock;
diff --git a/drivers/net/cnxk/cnxk_ethdev.h b/drivers/net/cnxk/cnxk_ethdev.h
index ea6a2be30e..6686fdba31 100644
--- a/drivers/net/cnxk/cnxk_ethdev.h
+++ b/drivers/net/cnxk/cnxk_ethdev.h
@@ -225,6 +225,9 @@ struct cnxk_eth_sec_sess {

 	/* Out-Of-Place processing */
 	bool inb_oop;
+
+	/* Back pointer to eth_dev for port_id derivation in CQ callbacks */
+	struct rte_eth_dev *eth_dev;
 };

 TAILQ_HEAD(cnxk_eth_sec_sess_list, cnxk_eth_sec_sess);
--
2.34.1


^ permalink raw reply related

* [PATCH v2 14/22] net/cnxk: fix bitwise operand size mismatch in link mode
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Aarnav JP, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

Cast enum roc_nix_link_mode values to uint64_t before the bitwise
OR with the uint64_t advertise variable to ensure consistent
operand sizes.

Fixes: 292fcbb3d290 ("net/cnxk: support link mode configuration")
Cc: stable@dpdk.org

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 drivers/net/cnxk/cnxk_link.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/cnxk/cnxk_link.c b/drivers/net/cnxk/cnxk_link.c
index dde2c3a313..f6682b4697 100644
--- a/drivers/net/cnxk/cnxk_link.c
+++ b/drivers/net/cnxk/cnxk_link.c
@@ -61,13 +61,13 @@ nix_link_advertising_get(struct cnxk_eth_dev *dev, struct roc_nix_link_info *lin
 		} else {
 			for (bit = 0; bit < ROC_NIX_LINK_SPEED_MAX; bit++) {
 				if (link_info->speed_bitmask & BIT_ULL(bit))
-					advertise |= rte_to_ethtool_mode[bit];
+					advertise |= (uint64_t)rte_to_ethtool_mode[bit];
 			}
 			goto exit;
 		}
 	}

-	advertise |= mac_to_ethtool_mode[linfo.lmac_type_id][link_info->full_duplex];
+	advertise |= (uint64_t)mac_to_ethtool_mode[linfo.lmac_type_id][link_info->full_duplex];
 exit:
 	return advertise;
 }
--
2.34.1


^ permalink raw reply related

* [PATCH v2 15/22] common/cnxk: add cipher key length check in key set
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, Akhil Goyal, Ankur Dwivedi
  Cc: jerinj, Aarnav JP, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

Add upper-bound checks before memcpy into encr_key[32]
in roc_se_ciph_key_set() to prevent buffer overflow into
adjacent encr_iv[16]. Covers all write paths including
AES-DOCSISBPI and DES-DOCSISBPI branches that bypass
the generic copy via goto.

Fixes: 5e076b609f2a ("common/cnxk: add SE set key for crypto")
Cc: stable@dpdk.org

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/roc_se.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/common/cnxk/roc_se.c b/drivers/common/cnxk/roc_se.c
index d841a926a4..1cec536169 100644
--- a/drivers/common/cnxk/roc_se.c
+++ b/drivers/common/cnxk/roc_se.c
@@ -545,12 +545,22 @@ roc_se_ciph_key_set(struct roc_se_ctx *se_ctx, roc_se_cipher_type type, const ui
 		 * less than 128. Pass it as regular AES-CBC cipher to CPT, but keep type in
 		 * se_ctx as AES_DOCSISBPI to skip block size checks in instruction preparation.
 		 */
+		if (key_len > sizeof(fctx->enc.encr_key)) {
+			plt_err("Cipher key length %u exceeds max %zu", key_len,
+				sizeof(fctx->enc.encr_key));
+			return -1;
+		}
 		cpt_ciph_aes_key_type_set(fctx, key_len);
 		fctx->enc.enc_cipher = ROC_SE_AES_CBC;
 		memcpy(fctx->enc.encr_key, key, key_len);
 		goto success;
 	case ROC_SE_DES_DOCSISBPI:
 		/* See case ROC_SE_DES3_CBC: for explanation */
+		if (key_len * 3 > sizeof(fctx->enc.encr_key)) {
+			plt_err("DES-DOCSISBPI key length %u exceeds max %zu", key_len,
+				sizeof(fctx->enc.encr_key) / 3);
+			return -1;
+		}
 		for (i = 0; i < 3; i++)
 			memcpy(fctx->enc.encr_key + key_len * i, key, key_len);
 		/*
@@ -628,6 +638,11 @@ roc_se_ciph_key_set(struct roc_se_ctx *se_ctx, roc_se_cipher_type type, const ui
 	if (se_ctx->hash_type != ROC_SE_GMAC_TYPE)
 		fctx->enc.enc_cipher = type;

+	if (key_len > sizeof(fctx->enc.encr_key)) {
+		plt_err("Cipher key length %u exceeds max %zu", key_len,
+			sizeof(fctx->enc.encr_key));
+		return -1;
+	}
 	memcpy(fctx->enc.encr_key, key, key_len);

 success:
--
2.34.1


^ permalink raw reply related

* [PATCH v2 16/22] common/cnxk: fix Klocwork static analysis issues
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, Satheesh Paul, Jerin Jacob,
	Rakesh Kudurumalla
  Cc: Aarnav JP, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

Fix NULL pointer dereferences (roc_dev.c, roc_npa.c, roc_nix_inl.c),
resource leaks in error paths (roc_dev.c, roc_dpi.c, roc_ree.c,
roc_nix.c, roc_emdev.c), uninitialized variables (roc_npa_debug.c,
roc_emdev.c), array out-of-bounds access (roc_npc_utils.c, roc_emdev.c),
bitwise operand size mismatches (roc_mbox.h, roc_emdev_irq.c), and
format string type mismatches (roc_cpt_debug.c).

Fixes: 5d8ff275433a ("common/cnxk: fix race condition between up and down mailbox")
Fixes: 9a92937cf0c8 ("common/cnxk: fix possible out-of-bounds access")
Fixes: 7557e3f5b9fa ("common/cnxk: replace direct API usage in REE")
Fixes: 3fdf3e53f3c4 ("common/cnxk: enable CPT CQ for inline IPsec inbound")
Fixes: c758279fee32 ("common/cnxk: support debug dump to file")
Cc: stable@dpdk.org

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/roc_cpt_debug.c | 29 ++++++++++++++---------------
 drivers/common/cnxk/roc_dev.c       | 15 +++++++++++----
 drivers/common/cnxk/roc_mbox.h      |  4 ++--
 drivers/common/cnxk/roc_nix_inl.c   |  3 +--
 drivers/common/cnxk/roc_npa.c       |  3 +++
 drivers/common/cnxk/roc_npa_debug.c |  8 +++++++-
 drivers/common/cnxk/roc_npc_utils.c | 10 +++++++---
 drivers/common/cnxk/roc_ree.c       | 17 ++++++++++++-----
 8 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/drivers/common/cnxk/roc_cpt_debug.c b/drivers/common/cnxk/roc_cpt_debug.c
index 3b3e678c20..3c1c052e50 100644
--- a/drivers/common/cnxk/roc_cpt_debug.c
+++ b/drivers/common/cnxk/roc_cpt_debug.c
@@ -33,7 +33,7 @@ cpt_cnxk_parse_hdr_dump(FILE *file, const struct cpt_parse_hdr_s *cpth)
 		 cpth->w0.num_frags, cpth->w0.pkt_out);

 	/* W1 */
-	cpt_dump(file, "W1: wqe_ptr \t0x%016lx\t", cpth->wqe_ptr);
+	cpt_dump(file, "W1: wqe_ptr \t0x%016" PRIx64 "\t", cpth->wqe_ptr);

 	/* W2 */
 	cpt_dump(file, "W2: pkt_inline \t0x%x\t\torig_pkt_aura \t0x%x", cpth->w2.pkt_inline,
@@ -135,29 +135,28 @@ cpt_cn10k_parse_hdr_dump(FILE *file, const struct cpt_cn10k_parse_hdr_s *cpth)
 	cpt_dump(file, "W0: cookie \t0x%x\t\tmatch_id \t0x%04x \t",
 		  cpth->w0.cookie, cpth->w0.match_id);
 	cpt_dump(file, "W0: err_sum \t%u \t", cpth->w0.err_sum);
-	cpt_dump(file, "W0: reas_sts \t0x%x\t\tet_owr \t%u\t\tpkt_fmt \t%u \t",
-		  cpth->w0.reas_sts, cpth->w0.et_owr, cpth->w0.pkt_fmt);
-	cpt_dump(file, "W0: pad_len \t%u\t\tnum_frags \t%u\t\tpkt_out \t%u \t",
-		  cpth->w0.pad_len, cpth->w0.num_frags, cpth->w0.pkt_out);
+	cpt_dump(file, "W0: reas_sts \t0x%x\t\tet_owr \t%u\t\tpkt_fmt \t%u \t", cpth->w0.reas_sts,
+		 cpth->w0.et_owr, cpth->w0.pkt_fmt);
+	cpt_dump(file, "W0: pad_len \t%u\t\tnum_frags \t%u\t\tpkt_out \t%u \t", cpth->w0.pad_len,
+		 cpth->w0.num_frags, cpth->w0.pkt_out);

 	/* W1 */
-	cpt_dump(file, "W1: wqe_ptr \t0x%016lx\t",
-			plt_be_to_cpu_64(cpth->wqe_ptr));
+	cpt_dump(file, "W1: wqe_ptr \t0x%016" PRIx64 "\t",
+		 (uint64_t)plt_be_to_cpu_64(cpth->wqe_ptr));

 	/* W2 */
-	cpt_dump(file, "W2: frag_age \t0x%x\t\torig_pf_func \t0x%04x",
-		  cpth->w2.frag_age, cpth->w2.orig_pf_func);
-	cpt_dump(file, "W2: il3_off \t0x%x\t\tfi_pad \t0x%x \t",
-		  cpth->w2.il3_off, cpth->w2.fi_pad);
+	cpt_dump(file, "W2: frag_age \t0x%x\t\torig_pf_func \t0x%04x", cpth->w2.frag_age,
+		 cpth->w2.orig_pf_func);
+	cpt_dump(file, "W2: il3_off \t0x%x\t\tfi_pad \t0x%x \t", cpth->w2.il3_off, cpth->w2.fi_pad);
 	cpt_dump(file, "W2: fi_offset \t0x%x \t", cpth->w2.fi_offset);

 	/* W3 */
-	cpt_dump(file, "W3: hw_ccode \t0x%x\t\tuc_ccode \t0x%x\t\tspi \t0x%08x",
-		  cpth->w3.hw_ccode, cpth->w3.uc_ccode, cpth->w3.spi);
+	cpt_dump(file, "W3: hw_ccode \t0x%x\t\tuc_ccode \t0x%x\t\tspi \t0x%08x", cpth->w3.hw_ccode,
+		 cpth->w3.uc_ccode, cpth->w3.spi);

 	/* W4 */
-	cpt_dump(file, "W4: esn \t%" PRIx64 " \t OR frag1_wqe_ptr \t0x%" PRIx64,
-		  cpth->esn, plt_be_to_cpu_64(cpth->frag1_wqe_ptr));
+	cpt_dump(file, "W4: esn \t%" PRIx64 " \t OR frag1_wqe_ptr \t0x%" PRIx64, cpth->esn,
+		 (uint64_t)plt_be_to_cpu_64(cpth->frag1_wqe_ptr));

 	/* offset of 0 implies 256B, otherwise it implies offset*8B */
 	offset = cpth->w2.fi_offset;
diff --git a/drivers/common/cnxk/roc_dev.c b/drivers/common/cnxk/roc_dev.c
index 32409f2ef3..61aa4b3075 100644
--- a/drivers/common/cnxk/roc_dev.c
+++ b/drivers/common/cnxk/roc_dev.c
@@ -1796,14 +1796,17 @@ dev_init(struct dev *dev, struct plt_pci_device *pci_dev)

 	rc = npa_lf_init(dev, pci_dev);
 	if (rc)
-		goto stop_msg_thrd;
+		goto vf_flr_unregister;

 	/* Setup LMT line base */
 	rc = dev_lmt_setup(dev);
 	if (rc)
-		goto stop_msg_thrd;
+		goto vf_flr_unregister;

 	return rc;
+vf_flr_unregister:
+	if (!is_vf)
+		dev_vf_flr_unregister_irqs(pci_dev, dev);
 stop_msg_thrd:
 	/* Exiting the mbox sync thread */
 	if (dev->sync.start_thread) {
@@ -1812,10 +1815,14 @@ dev_init(struct dev *dev, struct plt_pci_device *pci_dev)
 		plt_thread_join(dev->sync.pfvf_msg_thread, NULL);
 	}
 thread_fail:
-	pthread_mutex_destroy(&dev->sync.mutex);
-	pthread_cond_destroy(&dev->sync.pfvf_msg_cond);
+	if (pci_dev->max_vfs > 0) {
+		pthread_mutex_destroy(&dev->sync.mutex);
+		pthread_cond_destroy(&dev->sync.pfvf_msg_cond);
+	}
 iounmap:
 	dev_vf_mbase_put(pci_dev, vf_mbase);
+	mbox_fini(&dev->mbox_vfpf);
+	mbox_fini(&dev->mbox_vfpf_up);
 mbox_unregister:
 	dev_mbox_unregister_irq(pci_dev, dev);
 	if (dev->ops)
diff --git a/drivers/common/cnxk/roc_mbox.h b/drivers/common/cnxk/roc_mbox.h
index 1158ff50a7..52ecde6563 100644
--- a/drivers/common/cnxk/roc_mbox.h
+++ b/drivers/common/cnxk/roc_mbox.h
@@ -47,8 +47,8 @@ struct mbox_msghdr {
 #define RVU_VF_VFPF_MBOX0 (0x0000)
 #define RVU_VF_VFPF_MBOX1 (0x0008)

-#define MBOX_DOWN_MSG 1
-#define MBOX_UP_MSG   2
+#define MBOX_DOWN_MSG 1ULL
+#define MBOX_UP_MSG   2ULL

 /* Mailbox message types */
 #define MBOX_MSG_MASK	 0xFFFF
diff --git a/drivers/common/cnxk/roc_nix_inl.c b/drivers/common/cnxk/roc_nix_inl.c
index b515d52534..db101e71a5 100644
--- a/drivers/common/cnxk/roc_nix_inl.c
+++ b/drivers/common/cnxk/roc_nix_inl.c
@@ -638,9 +638,8 @@ nix_inl_reass_inb_sa_tbl_setup(struct roc_nix *roc_nix)
 		res_addr_offset = (uint64_t)(inl_dev->res_addr_offset & 0xFF) << 48;
 		if (res_addr_offset)
 			res_addr_offset |= (1UL << 56);
+		cpt_cq_ena = (uint64_t)inl_dev->cpt_cq_ena << 63;
 	}
-
-	cpt_cq_ena = (uint64_t)inl_dev->cpt_cq_ena << 63;
 	lf_cfg->enable = 1;
 	lf_cfg->profile_id = profile_id;
 	lf_cfg->rx_inline_sa_base = (uintptr_t)nix->inb_sa_base[profile_id] | cpt_cq_ena;
diff --git a/drivers/common/cnxk/roc_npa.c b/drivers/common/cnxk/roc_npa.c
index 88e328105a..4a3e96a97a 100644
--- a/drivers/common/cnxk/roc_npa.c
+++ b/drivers/common/cnxk/roc_npa.c
@@ -1113,6 +1113,9 @@ roc_npa_pool_destroy(uint64_t aura_handle)
 	struct npa_lf *lf = idev_npa_obj_get();
 	int rc = 0, aura_id;

+	if (lf == NULL)
+		return NPA_ERR_DEVICE_NOT_BOUNDED;
+
 	plt_npa_dbg("lf=%p aura_handle=0x%" PRIx64, lf, aura_handle);
 	aura_id = roc_npa_aura_handle_to_aura(aura_handle);

diff --git a/drivers/common/cnxk/roc_npa_debug.c b/drivers/common/cnxk/roc_npa_debug.c
index e64696730f..f978be9642 100644
--- a/drivers/common/cnxk/roc_npa_debug.c
+++ b/drivers/common/cnxk/roc_npa_debug.c
@@ -283,6 +283,9 @@ roc_npa_ctx_dump(void)
 		if (lf->aura_attr[q].halo) {
 			aq->ctype = NPA_AQ_CTYPE_HALO;
 			rc = mbox_process_msg(mbox, (void *)&rsp_cn20k);
+		} else if (roc_model_is_cn20k()) {
+			aq->ctype = NPA_AQ_CTYPE_AURA;
+			rc = mbox_process_msg(mbox, (void *)&rsp_cn20k);
 		} else {
 			aq->ctype = NPA_AQ_CTYPE_AURA;
 			rc = mbox_process_msg(mbox, (void *)&rsp);
@@ -323,7 +326,10 @@ roc_npa_ctx_dump(void)
 		aq->ctype = NPA_AQ_CTYPE_POOL;
 		aq->op = NPA_AQ_INSTOP_READ;

-		rc = mbox_process_msg(mbox, (void *)&rsp);
+		if (roc_model_is_cn20k())
+			rc = mbox_process_msg(mbox, (void *)&rsp_cn20k);
+		else
+			rc = mbox_process_msg(mbox, (void *)&rsp);
 		if (rc) {
 			plt_err("Failed to get pool(%d) context", q);
 			goto exit;
diff --git a/drivers/common/cnxk/roc_npc_utils.c b/drivers/common/cnxk/roc_npc_utils.c
index 3c05e46e1b..8e83b8662d 100644
--- a/drivers/common/cnxk/roc_npc_utils.c
+++ b/drivers/common/cnxk/roc_npc_utils.c
@@ -486,7 +486,7 @@ npc_process_ipv6_field_hash_o20k(const struct roc_npc_flow_item_ipv6 *ipv6_spec,
 	uint8_t hash_field[ROC_IPV6_ADDR_LEN];
 	struct npc_xtract_info *xinfo;
 	uint32_t hash = 0, mask;
-	int intf, i, rc = 0;
+	int intf, i, hash_idx = 0, rc = 0;

 	memset(hash_field, 0, sizeof(hash_field));

@@ -505,14 +505,18 @@ npc_process_ipv6_field_hash_o20k(const struct roc_npc_flow_item_ipv6 *ipv6_spec,
 		if (rc == 0)
 			continue;

-		rc = npc_ipv6_field_hash_get(pst->npc, (const uint32_t *)hash_field, intf, i,
-					     &hash);
+		if (hash_idx >= NPC_MAX_HASH)
+			break;
+
+		rc = npc_ipv6_field_hash_get(pst->npc, (const uint32_t *)hash_field, intf,
+					     hash_idx, &hash);
 		if (rc)
 			return rc;

 		mask = GENMASK(31, 0);
 		memcpy(pst->mcam_mask + xinfo->key_off, (uint8_t *)&mask, 4);
 		memcpy(pst->mcam_data + xinfo->key_off, (uint8_t *)&hash, 4);
+		hash_idx++;
 	}

 	return 0;
diff --git a/drivers/common/cnxk/roc_ree.c b/drivers/common/cnxk/roc_ree.c
index b6392658c3..923d9251ad 100644
--- a/drivers/common/cnxk/roc_ree.c
+++ b/drivers/common/cnxk/roc_ree.c
@@ -592,14 +592,15 @@ roc_ree_dev_init(struct roc_ree_vf *vf)
 	vf->block_address = ree_get_blkaddr(dev);
 	if (!vf->block_address) {
 		plt_err("Could not determine block PF number");
-		goto fail;
+		rc = -ENODEV;
+		goto dev_fini;
 	}

 	/* Get number of queues available on the device */
 	rc = roc_ree_available_queues_get(vf, &nb_queues);
 	if (rc) {
 		plt_err("Could not determine the number of queues available");
-		goto fail;
+		goto dev_fini;
 	}

 	/* Don't exceed the limits set per VF */
@@ -607,7 +608,8 @@ roc_ree_dev_init(struct roc_ree_vf *vf)

 	if (nb_queues == 0) {
 		plt_err("No free queues available on the device");
-		goto fail;
+		rc = -ENOSPC;
+		goto dev_fini;
 	}

 	vf->max_queues = nb_queues;
@@ -618,18 +620,23 @@ roc_ree_dev_init(struct roc_ree_vf *vf)
 	rc = roc_ree_max_matches_get(vf, &max_matches);
 	if (rc) {
 		plt_err("Could not determine the maximum matches supported");
-		goto fail;
+		goto dev_fini;
 	}
 	/* Don't exceed the limits set per VF */
 	max_matches = RTE_MIN(max_matches, REE_MAX_MATCHES_PER_VF);
 	if (max_matches == 0) {
 		plt_err("Could not determine the maximum matches supported");
-		goto fail;
+		rc = -EIO;
+		goto dev_fini;
 	}

 	vf->max_matches = max_matches;

 	plt_ree_dbg("Max matches supported by device: %d", vf->max_matches);
+
+	return 0;
+dev_fini:
+	dev_fini(dev, pci_dev);
 fail:
 	return rc;
 }
--
2.34.1


^ permalink raw reply related

* [PATCH v2 17/22] common/cnxk: add auth key len check in inbound SA
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, Archana Muniganti, Vidya Sagar Velumuri,
	Akhil Goyal
  Cc: jerinj, Aarnav JP, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

Add auth key length validation before memcpy in
cnxk_on_ipsec_inb_sa_create() to prevent caller-provided
keys from overflowing fixed-size in-struct buffers and
corrupting adjacent fields.

Fixes: 532963b80707 ("crypto/cnxk: move IPsec SA creation to common")
Cc: stable@dpdk.org

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/cnxk_security.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/common/cnxk/cnxk_security.c b/drivers/common/cnxk/cnxk_security.c
index 6f46ad3276..228ff2781d 100644
--- a/drivers/common/cnxk/cnxk_security.c
+++ b/drivers/common/cnxk/cnxk_security.c
@@ -1199,22 +1199,33 @@ cnxk_on_ipsec_inb_sa_create(struct rte_security_ipsec_xform *ipsec,
 			break;
 		case RTE_CRYPTO_AUTH_MD5_HMAC:
 		case RTE_CRYPTO_AUTH_SHA1_HMAC:
-			memcpy(in_sa->sha1_or_gcm.hmac_key, auth_key,
-			       auth_key_len);
-			ctx_len = offsetof(struct roc_ie_on_inb_sa,
-					   sha1_or_gcm.selector);
+			if (auth_key_len > (int)sizeof(in_sa->sha1_or_gcm.hmac_key)) {
+				plt_err("Auth key len %d exceeds max %zu for algo %u", auth_key_len,
+					sizeof(in_sa->sha1_or_gcm.hmac_key), auth_xform->auth.algo);
+				return -EINVAL;
+			}
+			memcpy(in_sa->sha1_or_gcm.hmac_key, auth_key, auth_key_len);
+			ctx_len = offsetof(struct roc_ie_on_inb_sa, sha1_or_gcm.selector);
 			break;
 		case RTE_CRYPTO_AUTH_SHA256_HMAC:
 		case RTE_CRYPTO_AUTH_SHA384_HMAC:
 		case RTE_CRYPTO_AUTH_SHA512_HMAC:
+			if (auth_key_len > (int)sizeof(in_sa->sha2.hmac_key)) {
+				plt_err("Auth key len %d exceeds max %zu for algo %u", auth_key_len,
+					sizeof(in_sa->sha2.hmac_key), auth_xform->auth.algo);
+				return -EINVAL;
+			}
 			memcpy(in_sa->sha2.hmac_key, auth_key, auth_key_len);
-			ctx_len = offsetof(struct roc_ie_on_inb_sa,
-					   sha2.selector);
+			ctx_len = offsetof(struct roc_ie_on_inb_sa, sha2.selector);
 			break;
 		case RTE_CRYPTO_AUTH_AES_XCBC_MAC:
+			if (auth_key_len > (int)sizeof(in_sa->aes_xcbc.key)) {
+				plt_err("Auth key len %d exceeds max %zu for algo %u", auth_key_len,
+					sizeof(in_sa->aes_xcbc.key), auth_xform->auth.algo);
+				return -EINVAL;
+			}
 			memcpy(in_sa->aes_xcbc.key, auth_key, auth_key_len);
-			ctx_len = offsetof(struct roc_ie_on_inb_sa,
-					   aes_xcbc.selector);
+			ctx_len = offsetof(struct roc_ie_on_inb_sa, aes_xcbc.selector);
 			break;
 		default:
 			plt_err("Unsupported auth algorithm %u", auth_xform->auth.algo);
--
2.34.1


^ permalink raw reply related

* [PATCH v2 18/22] common/cnxk: add FEC configuration support
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rakesh Kudurumalla
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Rakesh Kudurumalla <rkudurumalla@marvell.com>

Add ROC APIs for Forward Error Correction (FEC) configuration:
- roc_nix_mac_fec_set: Set FEC mode on the link
- roc_nix_mac_fec_supported_get: Query supported FEC modes
  from firmware

These APIs use CGX mailbox messages to configure and query
FEC parameters on PF interfaces.

Signed-off-by: Rakesh Kudurumalla <rkudurumalla@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/roc_nix.h                 |  2 +
 drivers/common/cnxk/roc_nix_mac.c             | 52 ++++++++++++++++++-
 .../common/cnxk/roc_platform_base_symbols.c   |  2 +
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/common/cnxk/roc_nix.h b/drivers/common/cnxk/roc_nix.h
index 49ede85f9a..802519f5e8 100644
--- a/drivers/common/cnxk/roc_nix.h
+++ b/drivers/common/cnxk/roc_nix.h
@@ -975,6 +975,8 @@ int __roc_api roc_nix_mac_link_info_set(struct roc_nix *roc_nix,
 					struct roc_nix_link_info *link_info);
 int __roc_api roc_nix_mac_link_info_get(struct roc_nix *roc_nix,
 					struct roc_nix_link_info *link_info);
+int __roc_api roc_nix_mac_fec_set(struct roc_nix *roc_nix, int fec);
+int __roc_api roc_nix_mac_fec_supported_get(struct roc_nix *roc_nix, uint64_t *supported_fec);
 int __roc_api roc_nix_mac_mtu_set(struct roc_nix *roc_nix, uint16_t mtu);
 int __roc_api roc_nix_mac_max_rx_len_set(struct roc_nix *roc_nix,
 					 uint16_t maxlen);
diff --git a/drivers/common/cnxk/roc_nix_mac.c b/drivers/common/cnxk/roc_nix_mac.c
index 376ff48522..9440cad33d 100644
--- a/drivers/common/cnxk/roc_nix_mac.c
+++ b/drivers/common/cnxk/roc_nix_mac.c
@@ -257,6 +257,57 @@ roc_nix_mac_link_state_set(struct roc_nix *roc_nix, uint8_t up)
 	return rc;
 }

+int
+roc_nix_mac_fec_set(struct roc_nix *roc_nix, int fec)
+{
+	struct nix *nix = roc_nix_to_nix_priv(roc_nix);
+	struct dev *dev = &nix->dev;
+	struct mbox *mbox = mbox_get(dev->mbox);
+	struct fec_mode *req;
+	int rc = -ENOSPC;
+
+	if (roc_nix_is_vf_or_sdp(roc_nix)) {
+		rc = NIX_ERR_OP_NOTSUP;
+		goto exit;
+	}
+
+	req = mbox_alloc_msg_cgx_set_fec_param(mbox);
+	if (req == NULL)
+		goto exit;
+	req->fec = fec;
+
+	rc = mbox_process(mbox);
+exit:
+	mbox_put(mbox);
+	return rc;
+}
+
+int
+roc_nix_mac_fec_supported_get(struct roc_nix *roc_nix, uint64_t *supported_fec)
+{
+	struct nix *nix = roc_nix_to_nix_priv(roc_nix);
+	struct dev *dev = &nix->dev;
+	struct mbox *mbox = mbox_get(dev->mbox);
+	struct cgx_fw_data *rsp = NULL;
+	int rc;
+
+	if (roc_nix_is_vf_or_sdp(roc_nix)) {
+		rc = NIX_ERR_OP_NOTSUP;
+		goto exit;
+	}
+
+	mbox_alloc_msg_cgx_get_aux_link_info(mbox);
+	rc = mbox_process_msg(mbox, (void *)&rsp);
+	if (rc)
+		goto exit;
+
+	*supported_fec = rsp->fwdata.supported_fec;
+	rc = 0;
+exit:
+	mbox_put(mbox);
+	return rc;
+}
+
 int
 roc_nix_mac_link_info_set(struct roc_nix *roc_nix,
 			  struct roc_nix_link_info *link_info)
@@ -283,7 +334,6 @@ roc_nix_mac_link_info_set(struct roc_nix *roc_nix,
 exit:
 	mbox_put(mbox);
 	return rc;
-
 }

 int
diff --git a/drivers/common/cnxk/roc_platform_base_symbols.c b/drivers/common/cnxk/roc_platform_base_symbols.c
index d1c9f2304d..ffae154788 100644
--- a/drivers/common/cnxk/roc_platform_base_symbols.c
+++ b/drivers/common/cnxk/roc_platform_base_symbols.c
@@ -316,6 +316,8 @@ RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_link_state_set)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_link_info_set)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_mtu_set)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_max_rx_len_set)
+RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_fec_set)
+RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_fec_supported_get)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_stats_reset)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_fwdata_get)
 RTE_EXPORT_INTERNAL_SYMBOL(roc_nix_mac_link_cb_register)
--
2.34.1


^ permalink raw reply related

* [PATCH v2 19/22] net/cnxk: add FEC get set and capability ops
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, Rakesh Kudurumalla
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Rakesh Kudurumalla <rkudurumalla@marvell.com>

Add ethdev FEC operations for cnxk NIX driver:
- fec_get_capability: Report supported FEC modes per speed.
  If firmware provides supported FEC info, return actual
  capabilities for current link speed. Otherwise, fall back
  to a default capability table for common speeds.
- fec_get: Query current FEC mode from link info
- fec_set: Configure FEC mode on the link. AUTO mode
  defaults to Reed-Solomon FEC.

Signed-off-by: Rakesh Kudurumalla <rkudurumalla@marvell.com>
---
Changes in v2: No change.

 drivers/net/cnxk/cnxk_ethdev.c     |  3 +
 drivers/net/cnxk/cnxk_ethdev.h     |  5 ++
 drivers/net/cnxk/cnxk_ethdev_ops.c | 94 ++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)

diff --git a/drivers/net/cnxk/cnxk_ethdev.c b/drivers/net/cnxk/cnxk_ethdev.c
index a21e170229..f3f5035947 100644
--- a/drivers/net/cnxk/cnxk_ethdev.c
+++ b/drivers/net/cnxk/cnxk_ethdev.c
@@ -2137,6 +2137,9 @@ struct eth_dev_ops cnxk_eth_dev_ops = {
 	.cman_config_set = cnxk_nix_cman_config_set,
 	.cman_config_get = cnxk_nix_cman_config_get,
 	.eth_tx_descriptor_dump = cnxk_nix_tx_descriptor_dump,
+	.fec_get_capability = cnxk_nix_fec_get_capability,
+	.fec_get = cnxk_nix_fec_get,
+	.fec_set = cnxk_nix_fec_set,
 };

 void
diff --git a/drivers/net/cnxk/cnxk_ethdev.h b/drivers/net/cnxk/cnxk_ethdev.h
index 6686fdba31..9429a81ee8 100644
--- a/drivers/net/cnxk/cnxk_ethdev.h
+++ b/drivers/net/cnxk/cnxk_ethdev.h
@@ -667,6 +667,11 @@ int cnxk_nix_tm_mark_ip_dscp(struct rte_eth_dev *eth_dev, int mark_green,
 int cnxk_nix_tx_descriptor_dump(const struct rte_eth_dev *eth_dev, uint16_t qid, uint16_t offset,
 				uint16_t num, FILE *file);

+/* FEC */
+int cnxk_nix_fec_get_capability(struct rte_eth_dev *eth_dev,
+				struct rte_eth_fec_capa *speed_fec_capa, unsigned int num);
+int cnxk_nix_fec_get(struct rte_eth_dev *eth_dev, uint32_t *fec_capa);
+int cnxk_nix_fec_set(struct rte_eth_dev *eth_dev, uint32_t fec_capa);
 /* MTR */
 int cnxk_nix_mtr_ops_get(struct rte_eth_dev *dev, void *ops);

diff --git a/drivers/net/cnxk/cnxk_ethdev_ops.c b/drivers/net/cnxk/cnxk_ethdev_ops.c
index 49e77e49a6..a45721d414 100644
--- a/drivers/net/cnxk/cnxk_ethdev_ops.c
+++ b/drivers/net/cnxk/cnxk_ethdev_ops.c
@@ -1414,3 +1414,97 @@ cnxk_nix_tx_descriptor_dump(const struct rte_eth_dev *eth_dev, uint16_t qid, uin

 	return roc_nix_sq_desc_dump(nix, qid, offset, num, file);
 }
+
+static uint32_t
+cnxk_roc_fec_to_ethdev_capa(int roc_fec)
+{
+	switch (roc_fec) {
+	case ROC_FEC_BASER:
+		return RTE_ETH_FEC_MODE_CAPA_MASK(BASER);
+	case ROC_FEC_RS:
+		return RTE_ETH_FEC_MODE_CAPA_MASK(RS);
+	default:
+		return RTE_ETH_FEC_MODE_CAPA_MASK(NOFEC);
+	}
+}
+
+static int
+cnxk_ethdev_fec_to_roc(uint32_t fec_capa)
+{
+	if (fec_capa & RTE_ETH_FEC_MODE_CAPA_MASK(RS))
+		return ROC_FEC_RS;
+	if (fec_capa & RTE_ETH_FEC_MODE_CAPA_MASK(BASER))
+		return ROC_FEC_BASER;
+	return ROC_FEC_NONE;
+}
+
+static uint32_t
+cnxk_fec_capa_from_supported(uint64_t supported_fec)
+{
+	uint32_t capa = RTE_ETH_FEC_MODE_CAPA_MASK(NOFEC) | RTE_ETH_FEC_MODE_CAPA_MASK(AUTO);
+
+	if (supported_fec & (1ULL << ROC_FEC_BASER))
+		capa |= RTE_ETH_FEC_MODE_CAPA_MASK(BASER);
+	if (supported_fec & (1ULL << ROC_FEC_RS))
+		capa |= RTE_ETH_FEC_MODE_CAPA_MASK(RS);
+
+	return capa;
+}
+
+int
+cnxk_nix_fec_get_capability(struct rte_eth_dev *eth_dev, struct rte_eth_fec_capa *speed_fec_capa,
+			    unsigned int num)
+{
+	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
+	struct roc_nix *nix = &dev->nix;
+	struct roc_nix_link_info link_info;
+	uint64_t supported_fec = 0;
+	int rc;
+
+	rc = roc_nix_mac_fec_supported_get(nix, &supported_fec);
+	if (rc == 0 && supported_fec != 0) {
+		rc = roc_nix_mac_link_info_get(nix, &link_info);
+		if (rc)
+			return rc;
+
+		if (speed_fec_capa == NULL || num == 0)
+			return 1;
+
+		speed_fec_capa[0].speed = link_info.speed;
+		speed_fec_capa[0].capa = cnxk_fec_capa_from_supported(supported_fec);
+		return 1;
+	}
+
+	return rc;
+}
+
+int
+cnxk_nix_fec_get(struct rte_eth_dev *eth_dev, uint32_t *fec_capa)
+{
+	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
+	struct roc_nix *nix = &dev->nix;
+	struct roc_nix_link_info link_info;
+	int rc;
+
+	rc = roc_nix_mac_link_info_get(nix, &link_info);
+	if (rc)
+		return rc;
+
+	*fec_capa = cnxk_roc_fec_to_ethdev_capa(link_info.fec);
+	return 0;
+}
+
+int
+cnxk_nix_fec_set(struct rte_eth_dev *eth_dev, uint32_t fec_capa)
+{
+	struct cnxk_eth_dev *dev = cnxk_eth_pmd_priv(eth_dev);
+	struct roc_nix *nix = &dev->nix;
+	int roc_fec;
+
+	if (fec_capa & RTE_ETH_FEC_MODE_CAPA_MASK(AUTO))
+		roc_fec = ROC_FEC_RS;
+	else
+		roc_fec = cnxk_ethdev_fec_to_roc(fec_capa);
+
+	return roc_nix_mac_fec_set(nix, roc_fec);
+}
--
2.34.1


^ permalink raw reply related

* [PATCH v2 20/22] event/cnxk: fix Klocwork static analysis issues
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Pavan Nikhilesh, Shijith Thotton, Rakesh Kudurumalla,
	Rahul Bhansali
  Cc: jerinj, Aarnav JP, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

Cast uint16_t operands to uint64_t before bitwise OR with
uint64_t rx_offloads to fix operand size mismatches. Add NULL
check for bracket parser end pointer to prevent undefined
behavior from pointer comparison with NULL.

Fixes: 697883bcb0a8 ("event/cnxk: fix Rx timestamp handling")
Fixes: fe7ed2ebbf37 ("event/cnxk: set Rx offload flags")
Fixes: 38c2e3240ba8 ("event/cnxk: add option to control SSO HWGRP QoS")
Fixes: 8a3d58c189fd ("event/cnxk: add option to control timer adapters")
Cc: stable@dpdk.org

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 drivers/event/cnxk/cn10k_eventdev.c      | 2 +-
 drivers/event/cnxk/cnxk_eventdev.c       | 2 +-
 drivers/event/cnxk/cnxk_eventdev_adptr.c | 4 ++--
 drivers/event/cnxk/cnxk_tim_evdev.c      | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
index 2e4b8aab92..62fea93b0b 100644
--- a/drivers/event/cnxk/cn10k_eventdev.c
+++ b/drivers/event/cnxk/cn10k_eventdev.c
@@ -660,7 +660,7 @@ cn10k_sso_tstamp_hdl_update(uint16_t port_id, uint16_t flags, bool ptp_en)
 	struct rte_eventdev *event_dev = cnxk_eth_dev->evdev_priv;
 	struct cnxk_sso_evdev *evdev = cnxk_sso_pmd_priv(event_dev);

-	evdev->rx_offloads |= flags;
+	evdev->rx_offloads |= (uint64_t)flags;
 	if (ptp_en)
 		evdev->tstamp[port_id] = &cnxk_eth_dev->tstamp;
 	else
diff --git a/drivers/event/cnxk/cnxk_eventdev.c b/drivers/event/cnxk/cnxk_eventdev.c
index be6a487b59..4aa16f9026 100644
--- a/drivers/event/cnxk/cnxk_eventdev.c
+++ b/drivers/event/cnxk/cnxk_eventdev.c
@@ -566,7 +566,7 @@ parse_list(const char *value, void *opaque, param_parse_t fn)
 		else if (*s == ']')
 			end = s;

-		if (start && start < end) {
+		if (start && end && start < end) {
 			*end = 0;
 			fn(start + 1, opaque);
 			s = end;
diff --git a/drivers/event/cnxk/cnxk_eventdev_adptr.c b/drivers/event/cnxk/cnxk_eventdev_adptr.c
index 8536dee5bf..5678e5d264 100644
--- a/drivers/event/cnxk/cnxk_eventdev_adptr.c
+++ b/drivers/event/cnxk/cnxk_eventdev_adptr.c
@@ -285,7 +285,7 @@ cnxk_sso_rx_adapter_queues_add(const struct rte_eventdev *event_dev,
 	/* Propagate force bp devarg */
 	cnxk_eth_dev->nix.force_rx_aura_bp = dev->force_ena_bp;
 	cnxk_sso_tstamp_cfg(eth_dev->data->port_id, eth_dev, dev);
-	dev->rx_offloads |= cnxk_eth_dev->rx_offload_flags;
+	dev->rx_offloads |= (uint64_t)cnxk_eth_dev->rx_offload_flags;
 	return 0;

 fail:
@@ -330,7 +330,7 @@ cnxk_sso_rx_adapter_start(const struct rte_eventdev *event_dev,
 {
 	struct cnxk_eth_dev *cnxk_eth_dev = eth_dev->data->dev_private;
 	struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
-	dev->rx_offloads |= cnxk_eth_dev->rx_offload_flags;
+	dev->rx_offloads |= (uint64_t)cnxk_eth_dev->rx_offload_flags;
 	return 0;
 }

diff --git a/drivers/event/cnxk/cnxk_tim_evdev.c b/drivers/event/cnxk/cnxk_tim_evdev.c
index 994d1d1090..8cdb8a72dd 100644
--- a/drivers/event/cnxk/cnxk_tim_evdev.c
+++ b/drivers/event/cnxk/cnxk_tim_evdev.c
@@ -508,7 +508,7 @@ cnxk_tim_parse_ring_ctl_list(const char *value, void *opaque)
 		else
 			continue;

-		if (start && start < end) {
+		if (start && end && start < end) {
 			*end = 0;
 			cnxk_tim_parse_ring_param(start + 1, opaque);
 			start = end;
--
2.34.1


^ permalink raw reply related

* [PATCH v2 21/22] crypto/cnxk: enforce DES/3DES cipher key length
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Ankur Dwivedi, Anoob Joseph, Tejasree Kondoj, Akhil Goyal,
	Archana Muniganti
  Cc: jerinj, Aarnav JP, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Aarnav JP <ajp@marvell.com>

Enforce exact key length match for DES/3DES algorithms
in fill_sess_cipher(), since these have fixed key sizes
(8 or 24 bytes). The existing check only enforced a lower
bound, allowing oversized keys to pass through.

Fixes: eb43e39851b8 ("crypto/cnxk: add cipher operation in session")
Cc: stable@dpdk.org

Signed-off-by: Aarnav JP <ajp@marvell.com>
---
Changes in v2: No change.

 drivers/crypto/cnxk/cnxk_se.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/cnxk/cnxk_se.h b/drivers/crypto/cnxk/cnxk_se.h
index 8dbf3e73c7..e2d7e10ec9 100644
--- a/drivers/crypto/cnxk/cnxk_se.h
+++ b/drivers/crypto/cnxk/cnxk_se.h
@@ -2297,9 +2297,14 @@ fill_sess_cipher(struct rte_crypto_sym_xform *xform, struct cnxk_se_sess *sess)
 		return -1;
 	}

-	if (c_form->key.length < cipher_key_len) {
-		plt_dp_err("Invalid cipher params keylen %u",
-			   c_form->key.length);
+	if (enc_type == ROC_SE_DES3_CBC || enc_type == ROC_SE_DES3_ECB ||
+	    enc_type == ROC_SE_DES_DOCSISBPI) {
+		if (c_form->key.length != cipher_key_len) {
+			plt_dp_err("Invalid cipher params keylen %u", c_form->key.length);
+			return -1;
+		}
+	} else if (c_form->key.length < cipher_key_len) {
+		plt_dp_err("Invalid cipher params keylen %u", c_form->key.length);
 		return -1;
 	}

--
2.34.1


^ permalink raw reply related

* [PATCH v2 22/22] common/cnxk: fix TM link config selection in debug dump
From: Rahul Bhansali @ 2026-06-11 14:20 UTC (permalink / raw)
  To: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra
  Cc: jerinj, stable
In-Reply-To: <20260611142029.3351415-1-rbhansali@marvell.com>

From: Satha Rao <skoteshwar@marvell.com>

Only emit the TM link configuration register when the configured TM
link level matches the hardware level being dumped, and use nix->tx_link
for the register and label so the dump reflects the active link.

Fixes: fcdef46b6698 ("common/cnxk: support NIX TM debug and misc utils")
Cc: stable@dpdk.org

Signed-off-by: Satha Rao <skoteshwar@marvell.com>
---
Changes in v2: No change.

 drivers/common/cnxk/roc_nix_debug.c | 31 +++++++++++++++++++----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/common/cnxk/roc_nix_debug.c b/drivers/common/cnxk/roc_nix_debug.c
index d4b2b86916..9c3bc8abe3 100644
--- a/drivers/common/cnxk/roc_nix_debug.c
+++ b/drivers/common/cnxk/roc_nix_debug.c
@@ -1150,7 +1150,7 @@ roc_nix_sq_dump(struct roc_nix_sq *sq, FILE *file)
 };

 static uint8_t
-nix_tm_reg_dump_prep(uint16_t hw_lvl, uint16_t schq, uint16_t link,
+nix_tm_reg_dump_prep(struct nix *nix, uint16_t hw_lvl, uint16_t schq,
 		     uint64_t *reg, char regstr[][NIX_REG_NAME_SZ])
 {
 	FILE *file = NULL;
@@ -1228,9 +1228,14 @@ nix_tm_reg_dump_prep(uint16_t hw_lvl, uint16_t schq, uint16_t link,
 		snprintf(regstr[k++], NIX_REG_NAME_SZ,
 			 "NIX_AF_TL3[%u]_TOPOLOGY", schq);

-		reg[k] = NIX_AF_TL3_TL2X_LINKX_CFG(schq, link);
-		snprintf(regstr[k++], NIX_REG_NAME_SZ,
-			 "NIX_AF_TL3_TL2[%u]_LINK[%u]_CFG", schq, link);
+		/* Link configuration */
+		if (!nix->sdp_link &&
+		    nix->tm_link_cfg_lvl == NIX_TXSCH_LVL_TL3) {
+			reg[k] = NIX_AF_TL3_TL2X_LINKX_CFG(schq, nix->tx_link);
+			snprintf(regstr[k++], NIX_REG_NAME_SZ,
+				 "NIX_AF_TL3_TL2[%u]_LINK[%u]_CFG", schq,
+				 nix->tx_link);
+		}

 		reg[k] = NIX_AF_TL3X_SCHEDULE(schq);
 		snprintf(regstr[k++], NIX_REG_NAME_SZ,
@@ -1261,9 +1266,14 @@ nix_tm_reg_dump_prep(uint16_t hw_lvl, uint16_t schq, uint16_t link,
 		snprintf(regstr[k++], NIX_REG_NAME_SZ,
 			 "NIX_AF_TL2[%u]_TOPOLOGY", schq);

-		reg[k] = NIX_AF_TL3_TL2X_LINKX_CFG(schq, link);
-		snprintf(regstr[k++], NIX_REG_NAME_SZ,
-			 "NIX_AF_TL3_TL2[%u]_LINK[%u]_CFG", schq, link);
+		/* Link configuration */
+		if (!nix->sdp_link &&
+		    nix->tm_link_cfg_lvl == NIX_TXSCH_LVL_TL2) {
+			reg[k] = NIX_AF_TL3_TL2X_LINKX_CFG(schq, nix->tx_link);
+			snprintf(regstr[k++], NIX_REG_NAME_SZ,
+				 "NIX_AF_TL3_TL2[%u]_LINK[%u]_CFG", schq,
+				 nix->tx_link);
+		}

 		reg[k] = NIX_AF_TL2X_SCHEDULE(schq);
 		snprintf(regstr[k++], NIX_REG_NAME_SZ,
@@ -1370,8 +1380,7 @@ nix_tm_dump_lvl(struct nix *nix, struct nix_tm_node_list *list, uint8_t hw_lvl)
 			root = node;

 		/* Dump registers only when HWRES is present */
-		k = nix_tm_reg_dump_prep(node->hw_lvl, schq, nix->tx_link, reg,
-					 regstr);
+		k = nix_tm_reg_dump_prep(nix, node->hw_lvl, schq, reg, regstr);
 		if (!k)
 			continue;

@@ -1396,8 +1405,8 @@ nix_tm_dump_lvl(struct nix *nix, struct nix_tm_node_list *list, uint8_t hw_lvl)

 	/* Dump TL1 node data when root level is TL2 */
 	if (root && root->hw_lvl == NIX_TXSCH_LVL_TL2) {
-		k = nix_tm_reg_dump_prep(NIX_TXSCH_LVL_TL1, root->parent_hw_id,
-					 nix->tx_link, reg, regstr);
+		k = nix_tm_reg_dump_prep(nix, NIX_TXSCH_LVL_TL1,
+					 root->parent_hw_id, reg, regstr);
 		if (!k)
 			return;

--
2.34.1


^ permalink raw reply related

* DTS code coverage question
From: Lincoln Lavoie @ 2026-06-11 14:29 UTC (permalink / raw)
  To: dev

Hello All,

We have a patch going into the 26.07 release that will enable generating
code coverage reports when DTS is run. The community lab can then
generate coverage reports, like we do for unit testing
(https://lab.dpdk.org/results/dashboard/code-coverage).

The question is: with DTS, tests can run on combinations of NICs, etc.,
so how should those be factored into the reporting? We can do one of
the following:

Option 1: Collect coverage reports per NIC / PMD
Option 2: Aggregate (combine) reports from multiple NICs / PMDs into a
single report
Option 3: Only run it on one NIC / PMD, assuming specific PMDs don't
change the coverage much.

Is there a specific direction the community would prefer?

Cheers,
Lincoln
-- 
Lincoln Lavoie
Principal Engineer, Broadband Technologies
21 Madbury Rd., Ste. 100, Durham, NH 03824
lylavoie@iol.unh.edu
https://www.iol.unh.edu
+1-603-674-2755 (m)

^ permalink raw reply

* [PATCH v8 00/18] Support VFIO cdev API in DPDK
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
  To: dev
In-Reply-To: <cover.1763141462.git.anatoly.burakov@intel.com>

This patchset introduces a major refactor of the VFIO subsystem in DPDK to
support the character device (cdev) interface introduced in the Linux kernel,
as well as to make the API more streamlined and useful. The goal is to simplify
device management, improve compatibility, and clarify API responsibilities.

The following sections outline the key issues addressed by this patchset and the
corresponding changes introduced.

1. Only group mode is supported
===============================

Since kernel version 4.14.327 (LTS), VFIO supports the new character device
(cdev)-based way of working with VFIO devices (otherwise known as IOMMUFD). This
is a device-centric mode and does away with all the complexity regarding groups
and IOMMU types, delegating it all to the kernel, and exposes a much simpler
interface to userspace.

The old group interface is still around, and will need to be kept in DPDK both
for compatibility reasons and to support special cases (FSLMC bus, NBL
driver, no-IOMMU mode etc.).

To enable this, VFIO is heavily refactored, so that the code can support both
modes while relying on (mostly) common infrastructure.

Note that the existing `rte_vfio_device_setup/release` model is fundamentally
incompatible with cdev mode, because for custom container cases, the expected
flow is that the user binds the IOMMU group (and thus, implicitly, the device
itself) to a specific container using `rte_vfio_container_group_bind`, whereas
this step is not needed for cdev as the device fd is assigned to the container
straight away.

Therefore, what we do instead is introduce a new API for container device
assignment which, semantically, will assign a device to specified container, so
that when it is mapped using `rte_pci_map_device`, the appropriate container is
selected. Under the hood though, we essentially transition to getting device fd
straight away at assign stage, so that by the time the PCI bus attempts to map
the device, it is already mapped and we just return an fd. There is no
"unassign" API because `release_device` already performs that function.

Additionally, a new `rte_vfio_get_mode` API is added for those cases that need
some introspection into VFIO's internals, with three new modes: group
(old-style), no-iommu (old-style but without IOMMU), and cdev (the new mode).
Although no-IOMMU is technically a variant of group mode, the distinction is
largely irrelevant to the user, as all usages of noiommu checks in our codebase
are for deciding whether to use IOVA or PA, not anything to do with managing
groups. The current plan of the kernel community is to *not* introduce a
no-IOMMU cdev implementation, and IOMMUFD's own group API compatibility layer
also does not implement no-IOMMU mode, which is why the no-IOMMU mode will be
kept for compatibility with these use cases.

There were other users of VFIO which relied on the group API, but only for
convenience purposes; no actual VFIO functionality depended on those APIs.
Therefore, group APIs are removed and, where appropriate, replaced with the new
APIs.

List of removed APIs:

* `rte_vfio_get_group_fd`
* `rte_vfio_clear_group`
* `rte_vfio_container_group_bind` (replaced by container assign API)
* `rte_vfio_container_group_unbind`
* `rte_vfio_noiommu_is_enabled` (replaced by new mode API)

2. The API responsibilities aren't clear and bleed into each other
==================================================================

Some APIs do multiple things at once. In particular:

* `rte_vfio_get_device_info` will set up the device
* `rte_vfio_setup_device` will get device info

These APIs have been adjusted to do one thing only.

v8:
- Rebase
- Fixed build errors due to variable shadowing
- Removed duplicate fd check as the kernel does not provide a way to distinguish
  between device fds

v7:
- Rebase
- Added removal of deprecation notices
- Fixed implicit numeric comparison in patch 12

v6:
- Fixed missing header include in vfio cdev file

v5:
- Added back missing uapi patch

v4:
- Fixed issues with documenting rte_vfio_mode enum
- Separated deprecation notices into a separate patchset

v3:
- Make API removal cleaner
- Fix `get_group_num` usages to align with new API
- Fix issues with function exports
- Fix issues with `setup_device` returning old-style values in some cases

v2:
- Make the entire API internal
- More aggressive API pruning, complete removal of group API
- Fixed a bug in group mode where device could not be used
- Better documentation and deprecation notice patches
- Moved doc patches to beginning of patchset

Anatoly Burakov (18):
  uapi: update to v6.17 and add iommufd.h
  vfio: make all functions internal
  vfio: split get device info from setup
  vfio: add container device assignment API
  net/nbl: do not use VFIO group bind API
  net/ntnic: use container device assignment API
  vdpa/ifc: use container device assignment API
  vdpa/nfp: use container device assignment API
  vdpa/sfc: use container device assignment API
  vhost: remove group-related API from drivers
  vfio: remove group-based API
  vfio: cleanup and refactor
  bus/pci: use the new VFIO mode API
  bus/fslmc: use the new VFIO mode API
  net/hinic3: use the new VFIO mode API
  net/ntnic: use the new VFIO mode API
  vfio: remove no-IOMMU check API
  vfio: introduce cdev mode

 config/arm/meson.build                    |    1 +
 config/meson.build                        |    1 +
 doc/guides/prog_guide/vhost_lib.rst       |    4 -
 doc/guides/rel_notes/deprecation.rst      |   10 -
 drivers/bus/cdx/cdx_vfio.c                |   25 +-
 drivers/bus/fslmc/fslmc_bus.c             |   10 +-
 drivers/bus/fslmc/fslmc_vfio.c            |    6 +-
 drivers/bus/pci/linux/pci.c               |    2 +-
 drivers/bus/pci/linux/pci_vfio.c          |   33 +-
 drivers/bus/platform/platform.c           |    9 +-
 drivers/crypto/bcmfs/bcmfs_vfio.c         |   14 +-
 drivers/net/hinic3/base/hinic3_hwdev.c    |    3 +-
 drivers/net/nbl/nbl_common/nbl_userdev.c  |   20 +-
 drivers/net/nbl/nbl_include/nbl_include.h |    1 +
 drivers/net/ntnic/ntnic_ethdev.c          |    2 +-
 drivers/net/ntnic/ntnic_vfio.c            |   30 +-
 drivers/vdpa/ifc/ifcvf_vdpa.c             |   34 +-
 drivers/vdpa/mlx5/mlx5_vdpa.c             |    1 -
 drivers/vdpa/nfp/nfp_vdpa.c               |   37 +-
 drivers/vdpa/sfc/sfc_vdpa.c               |   39 +-
 drivers/vdpa/sfc/sfc_vdpa.h               |    2 -
 kernel/linux/uapi/linux/iommufd.h         | 1292 +++++++++++
 kernel/linux/uapi/linux/vduse.h           |    2 +-
 kernel/linux/uapi/linux/vfio.h            |   12 +-
 kernel/linux/uapi/version                 |    2 +-
 lib/eal/freebsd/eal.c                     |   98 +-
 lib/eal/include/rte_vfio.h                |  387 ++--
 lib/eal/linux/eal_vfio.c                  | 2437 ++++++++-------------
 lib/eal/linux/eal_vfio.h                  |  167 +-
 lib/eal/linux/eal_vfio_cdev.c             |  390 ++++
 lib/eal/linux/eal_vfio_group.c            |  984 +++++++++
 lib/eal/linux/eal_vfio_mp_sync.c          |   80 +-
 lib/eal/linux/meson.build                 |    2 +
 lib/eal/windows/eal.c                     |    4 +-
 lib/vhost/vdpa_driver.h                   |    3 -
 35 files changed, 4248 insertions(+), 1896 deletions(-)
 create mode 100644 kernel/linux/uapi/linux/iommufd.h
 create mode 100644 lib/eal/linux/eal_vfio_cdev.c
 create mode 100644 lib/eal/linux/eal_vfio_group.c

-- 
2.47.3

^ permalink raw reply

* [PATCH v8 01/18] uapi: update to v6.17 and add iommufd.h
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
  To: dev, Maxime Coquelin
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>

In order to support VF tokens for cdev-based VFIO mode, kernel v6.17 is
required. Update internal headers to version v6.17 and include iommufd.h.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
---
 kernel/linux/uapi/linux/iommufd.h | 1292 +++++++++++++++++++++++++++++
 kernel/linux/uapi/linux/vduse.h   |    2 +-
 kernel/linux/uapi/linux/vfio.h    |   12 +-
 kernel/linux/uapi/version         |    2 +-
 4 files changed, 1305 insertions(+), 3 deletions(-)
 create mode 100644 kernel/linux/uapi/linux/iommufd.h

diff --git a/kernel/linux/uapi/linux/iommufd.h b/kernel/linux/uapi/linux/iommufd.h
new file mode 100644
index 0000000000..2105a03955
--- /dev/null
+++ b/kernel/linux/uapi/linux/iommufd.h
@@ -0,0 +1,1292 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ */
+#ifndef _IOMMUFD_H
+#define _IOMMUFD_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define IOMMUFD_TYPE (';')
+
+/**
+ * DOC: General ioctl format
+ *
+ * The ioctl interface follows a general format to allow for extensibility. Each
+ * ioctl is passed in a structure pointer as the argument providing the size of
+ * the structure in the first u32. The kernel checks that any structure space
+ * beyond what it understands is 0. This allows userspace to use the backward
+ * compatible portion while consistently using the newer, larger, structures.
+ *
+ * ioctls use a standard meaning for common errnos:
+ *
+ *  - ENOTTY: The IOCTL number itself is not supported at all
+ *  - E2BIG: The IOCTL number is supported, but the provided structure has
+ *    non-zero in a part the kernel does not understand.
+ *  - EOPNOTSUPP: The IOCTL number is supported, and the structure is
+ *    understood, however a known field has a value the kernel does not
+ *    understand or support.
+ *  - EINVAL: Everything about the IOCTL was understood, but a field is not
+ *    correct.
+ *  - ENOENT: An ID or IOVA provided does not exist.
+ *  - ENOMEM: Out of memory.
+ *  - EOVERFLOW: Mathematics overflowed.
+ *
+ * As well as additional errnos, within specific ioctls.
+ */
+enum {
+	IOMMUFD_CMD_BASE = 0x80,
+	IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
+	IOMMUFD_CMD_IOAS_ALLOC = 0x81,
+	IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82,
+	IOMMUFD_CMD_IOAS_COPY = 0x83,
+	IOMMUFD_CMD_IOAS_IOVA_RANGES = 0x84,
+	IOMMUFD_CMD_IOAS_MAP = 0x85,
+	IOMMUFD_CMD_IOAS_UNMAP = 0x86,
+	IOMMUFD_CMD_OPTION = 0x87,
+	IOMMUFD_CMD_VFIO_IOAS = 0x88,
+	IOMMUFD_CMD_HWPT_ALLOC = 0x89,
+	IOMMUFD_CMD_GET_HW_INFO = 0x8a,
+	IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING = 0x8b,
+	IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c,
+	IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d,
+	IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e,
+	IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
+	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
+	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
+	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
+	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
+	IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94,
+};
+
+/**
+ * struct iommu_destroy - ioctl(IOMMU_DESTROY)
+ * @size: sizeof(struct iommu_destroy)
+ * @id: iommufd object ID to destroy. Can be any destroyable object type.
+ *
+ * Destroy any object held within iommufd.
+ */
+struct iommu_destroy {
+	__u32 size;
+	__u32 id;
+};
+#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
+
+/**
+ * struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC)
+ * @size: sizeof(struct iommu_ioas_alloc)
+ * @flags: Must be 0
+ * @out_ioas_id: Output IOAS ID for the allocated object
+ *
+ * Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA)
+ * to memory mapping.
+ */
+struct iommu_ioas_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 out_ioas_id;
+};
+#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC)
+
+/**
+ * struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE)
+ * @start: First IOVA
+ * @last: Inclusive last IOVA
+ *
+ * An interval in IOVA space.
+ */
+struct iommu_iova_range {
+	__aligned_u64 start;
+	__aligned_u64 last;
+};
+
+/**
+ * struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
+ * @size: sizeof(struct iommu_ioas_iova_ranges)
+ * @ioas_id: IOAS ID to read ranges from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to the output array of struct iommu_iova_range
+ * @out_iova_alignment: Minimum alignment required for mapping IOVA
+ *
+ * Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
+ * is not allowed. num_iovas will be set to the total number of iovas and
+ * the allowed_iovas[] will be filled in as space permits.
+ *
+ * The allowed ranges are dependent on the HW path the DMA operation takes, and
+ * can change during the lifetime of the IOAS. A fresh empty IOAS will have a
+ * full range, and each attached device will narrow the ranges based on that
+ * device's HW restrictions. Detaching a device can widen the ranges. Userspace
+ * should query ranges after every attach/detach to know what IOVAs are valid
+ * for mapping.
+ *
+ * On input num_iovas is the length of the allowed_iovas array. On output it is
+ * the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
+ * num_iovas to the required value if num_iovas is too small. In this case the
+ * caller should allocate a larger output array and re-issue the ioctl.
+ *
+ * out_iova_alignment returns the minimum IOVA alignment that can be given
+ * to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy::
+ *
+ *   starting_iova % out_iova_alignment == 0
+ *   (starting_iova + length) % out_iova_alignment == 0
+ *
+ * out_iova_alignment can be 1 indicating any IOVA is allowed. It cannot
+ * be higher than the system PAGE_SIZE.
+ */
+struct iommu_ioas_iova_ranges {
+	__u32 size;
+	__u32 ioas_id;
+	__u32 num_iovas;
+	__u32 __reserved;
+	__aligned_u64 allowed_iovas;
+	__aligned_u64 out_iova_alignment;
+};
+#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES)
+
+/**
+ * struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
+ * @size: sizeof(struct iommu_ioas_allow_iovas)
+ * @ioas_id: IOAS ID to allow IOVAs from
+ * @num_iovas: Input/Output total number of ranges in the IOAS
+ * @__reserved: Must be 0
+ * @allowed_iovas: Pointer to array of struct iommu_iova_range
+ *
+ * Ensure a range of IOVAs are always available for allocation. If this call
+ * succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges
+ * that are narrower than the ranges provided here. This call will fail if
+ * IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
+ *
+ * When an IOAS is first created the IOVA_RANGES will be maximally sized, and as
+ * devices are attached the IOVA will narrow based on the device restrictions.
+ * When an allowed range is specified any narrowing will be refused, ie device
+ * attachment can fail if the device requires limiting within the allowed range.
+ *
+ * Automatic IOVA allocation is also impacted by this call. MAP will only
+ * allocate within the allowed IOVAs if they are present.
+ *
+ * This call replaces the entire allowed list with the given list.
+ */
+struct iommu_ioas_allow_iovas {
+	__u32 size;
+	__u32 ioas_id;
+	__u32 num_iovas;
+	__u32 __reserved;
+	__aligned_u64 allowed_iovas;
+};
+#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS)
+
+/**
+ * enum iommufd_ioas_map_flags - Flags for map and copy
+ * @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate
+ *                             IOVA to place the mapping at
+ * @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping
+ * @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping
+ */
+enum iommufd_ioas_map_flags {
+	IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0,
+	IOMMU_IOAS_MAP_WRITEABLE = 1 << 1,
+	IOMMU_IOAS_MAP_READABLE = 1 << 2,
+};
+
+/**
+ * struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
+ * @size: sizeof(struct iommu_ioas_map)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @ioas_id: IOAS ID to change the mapping of
+ * @__reserved: Must be 0
+ * @user_va: Userspace pointer to start mapping from
+ * @length: Number of bytes to map
+ * @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
+ *        then this must be provided as input.
+ *
+ * Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
+ * mapping will be established at iova, otherwise a suitable location based on
+ * the reserved and allowed lists will be automatically selected and returned in
+ * iova.
+ *
+ * If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently
+ * be unused, existing IOVA cannot be replaced.
+ */
+struct iommu_ioas_map {
+	__u32 size;
+	__u32 flags;
+	__u32 ioas_id;
+	__u32 __reserved;
+	__aligned_u64 user_va;
+	__aligned_u64 length;
+	__aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
+
+/**
+ * struct iommu_ioas_map_file - ioctl(IOMMU_IOAS_MAP_FILE)
+ * @size: sizeof(struct iommu_ioas_map_file)
+ * @flags: same as for iommu_ioas_map
+ * @ioas_id: same as for iommu_ioas_map
+ * @fd: the memfd to map
+ * @start: byte offset from start of file to map from
+ * @length: same as for iommu_ioas_map
+ * @iova: same as for iommu_ioas_map
+ *
+ * Set an IOVA mapping from a memfd file.  All other arguments and semantics
+ * match those of IOMMU_IOAS_MAP.
+ */
+struct iommu_ioas_map_file {
+	__u32 size;
+	__u32 flags;
+	__u32 ioas_id;
+	__s32 fd;
+	__aligned_u64 start;
+	__aligned_u64 length;
+	__aligned_u64 iova;
+};
+#define IOMMU_IOAS_MAP_FILE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP_FILE)
+
+/**
+ * struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
+ * @size: sizeof(struct iommu_ioas_copy)
+ * @flags: Combination of enum iommufd_ioas_map_flags
+ * @dst_ioas_id: IOAS ID to change the mapping of
+ * @src_ioas_id: IOAS ID to copy from
+ * @length: Number of bytes to copy and map
+ * @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
+ *            set then this must be provided as input.
+ * @src_iova: IOVA to start the copy
+ *
+ * Copy an already existing mapping from src_ioas_id and establish it in
+ * dst_ioas_id. The src iova/length must exactly match a range used with
+ * IOMMU_IOAS_MAP.
+ *
+ * This may be used to efficiently clone a subset of an IOAS to another, or as a
+ * kind of 'cache' to speed up mapping. Copy has an efficiency advantage over
+ * establishing equivalent new mappings, as internal resources are shared, and
+ * the kernel will pin the user memory only once.
+ */
+struct iommu_ioas_copy {
+	__u32 size;
+	__u32 flags;
+	__u32 dst_ioas_id;
+	__u32 src_ioas_id;
+	__aligned_u64 length;
+	__aligned_u64 dst_iova;
+	__aligned_u64 src_iova;
+};
+#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY)
+
+/**
+ * struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP)
+ * @size: sizeof(struct iommu_ioas_unmap)
+ * @ioas_id: IOAS ID to change the mapping of
+ * @iova: IOVA to start the unmapping at
+ * @length: Number of bytes to unmap, and return back the bytes unmapped
+ *
+ * Unmap an IOVA range. The iova/length must be a superset of a previously
+ * mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. Splitting or
+ * truncating ranges is not allowed. The values 0 to U64_MAX will unmap
+ * everything.
+ */
+struct iommu_ioas_unmap {
+	__u32 size;
+	__u32 ioas_id;
+	__aligned_u64 iova;
+	__aligned_u64 length;
+};
+#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP)
+
+/**
+ * enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
+ *                       ioctl(IOMMU_OPTION_HUGE_PAGES)
+ * @IOMMU_OPTION_RLIMIT_MODE:
+ *    Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
+ *    to invoke this. Value 0 (default) is user based accounting, 1 uses process
+ *    based accounting. Global option, object_id must be 0
+ * @IOMMU_OPTION_HUGE_PAGES:
+ *    Value 1 (default) allows contiguous pages to be combined when generating
+ *    iommu mappings. Value 0 disables combining, everything is mapped to
+ *    PAGE_SIZE. This can be useful for benchmarking.  This is a per-IOAS
+ *    option, the object_id must be the IOAS ID.
+ */
+enum iommufd_option {
+	IOMMU_OPTION_RLIMIT_MODE = 0,
+	IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+/**
+ * enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and
+ *                           ioctl(IOMMU_OPTION_OP_GET)
+ * @IOMMU_OPTION_OP_SET: Set the option's value
+ * @IOMMU_OPTION_OP_GET: Get the option's value
+ */
+enum iommufd_option_ops {
+	IOMMU_OPTION_OP_SET = 0,
+	IOMMU_OPTION_OP_GET = 1,
+};
+
+/**
+ * struct iommu_option - iommu option multiplexer
+ * @size: sizeof(struct iommu_option)
+ * @option_id: One of enum iommufd_option
+ * @op: One of enum iommufd_option_ops
+ * @__reserved: Must be 0
+ * @object_id: ID of the object if required
+ * @val64: Option value to set or value returned on get
+ *
+ * Change a simple option value. This multiplexor allows controlling options
+ * on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET
+ * will return the current value.
+ */
+struct iommu_option {
+	__u32 size;
+	__u32 option_id;
+	__u16 op;
+	__u16 __reserved;
+	__u32 object_id;
+	__aligned_u64 val64;
+};
+#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
+
+/**
+ * enum iommufd_vfio_ioas_op - IOMMU_VFIO_IOAS_* ioctls
+ * @IOMMU_VFIO_IOAS_GET: Get the current compatibility IOAS
+ * @IOMMU_VFIO_IOAS_SET: Change the current compatibility IOAS
+ * @IOMMU_VFIO_IOAS_CLEAR: Disable VFIO compatibility
+ */
+enum iommufd_vfio_ioas_op {
+	IOMMU_VFIO_IOAS_GET = 0,
+	IOMMU_VFIO_IOAS_SET = 1,
+	IOMMU_VFIO_IOAS_CLEAR = 2,
+};
+
+/**
+ * struct iommu_vfio_ioas - ioctl(IOMMU_VFIO_IOAS)
+ * @size: sizeof(struct iommu_vfio_ioas)
+ * @ioas_id: For IOMMU_VFIO_IOAS_SET the input IOAS ID to set
+ *           For IOMMU_VFIO_IOAS_GET will output the IOAS ID
+ * @op: One of enum iommufd_vfio_ioas_op
+ * @__reserved: Must be 0
+ *
+ * The VFIO compatibility support uses a single ioas because VFIO APIs do not
+ * support the ID field. Set or Get the IOAS that VFIO compatibility will use.
+ * When VFIO_GROUP_SET_CONTAINER is used on an iommufd it will get the
+ * compatibility ioas, either by taking what is already set, or auto creating
+ * one. From then on VFIO will continue to use that ioas and is not affected by
+ * this ioctl. SET or CLEAR does not destroy any auto-created IOAS.
+ */
+struct iommu_vfio_ioas {
+	__u32 size;
+	__u32 ioas_id;
+	__u16 op;
+	__u16 __reserved;
+};
+#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
+
+/**
+ * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation
+ * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as
+ *                                the parent HWPT in a nesting configuration.
+ * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is
+ *                                   enforced on device attachment
+ * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is
+ *                             valid.
+ * @IOMMU_HWPT_ALLOC_PASID: Requests a domain that can be used with PASID. The
+ *                          domain can be attached to any PASID on the device.
+ *                          Any domain attached to the non-PASID part of the
+ *                          device must also be flagged, otherwise attaching a
+ *                          PASID will be blocked.
+ *                          For the user that wants to attach PASID, ioas is
+ *                          not recommended for both the non-PASID part
+ *                          and PASID part of the device.
+ *                          If IOMMU does not support PASID it will return
+ *                          error (-EOPNOTSUPP).
+ */
+enum iommufd_hwpt_alloc_flags {
+	IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0,
+	IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1,
+	IOMMU_HWPT_FAULT_ID_VALID = 1 << 2,
+	IOMMU_HWPT_ALLOC_PASID = 1 << 3,
+};
+
+/**
+ * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table
+ *                                entry attributes
+ * @IOMMU_VTD_S1_SRE: Supervisor request
+ * @IOMMU_VTD_S1_EAFE: Extended access enable
+ * @IOMMU_VTD_S1_WPE: Write protect enable
+ */
+enum iommu_hwpt_vtd_s1_flags {
+	IOMMU_VTD_S1_SRE = 1 << 0,
+	IOMMU_VTD_S1_EAFE = 1 << 1,
+	IOMMU_VTD_S1_WPE = 1 << 2,
+};
+
+/**
+ * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table
+ *                            info (IOMMU_HWPT_DATA_VTD_S1)
+ * @flags: Combination of enum iommu_hwpt_vtd_s1_flags
+ * @pgtbl_addr: The base address of the stage-1 page table.
+ * @addr_width: The address width of the stage-1 page table
+ * @__reserved: Must be 0
+ */
+struct iommu_hwpt_vtd_s1 {
+	__aligned_u64 flags;
+	__aligned_u64 pgtbl_addr;
+	__u32 addr_width;
+	__u32 __reserved;
+};
+
+/**
+ * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 nested STE
+ *                                (IOMMU_HWPT_DATA_ARM_SMMUV3)
+ *
+ * @ste: The first two double words of the user space Stream Table Entry for
+ *       the translation. Must be little-endian.
+ *       Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec)
+ *       - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax
+ *       - word-1: EATS, S1DSS, S1CIR, S1COR, S1CSH, S1STALLD
+ *
+ * -EIO will be returned if @ste is not legal or contains any non-allowed field.
+ * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass
+ * nested domain will translate the same as the nesting parent. The S1 will
+ * install a Context Descriptor Table pointing at userspace memory translated
+ * by the nesting parent.
+ */
+struct iommu_hwpt_arm_smmuv3 {
+	__aligned_le64 ste[2];
+};
+
+/**
+ * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
+ * @IOMMU_HWPT_DATA_NONE: no data
+ * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
+ * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
+ */
+enum iommu_hwpt_data_type {
+	IOMMU_HWPT_DATA_NONE = 0,
+	IOMMU_HWPT_DATA_VTD_S1 = 1,
+	IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
+};
+
+/**
+ * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC)
+ * @size: sizeof(struct iommu_hwpt_alloc)
+ * @flags: Combination of enum iommufd_hwpt_alloc_flags
+ * @dev_id: The device to allocate this HWPT for
+ * @pt_id: The IOAS or HWPT or vIOMMU to connect this HWPT to
+ * @out_hwpt_id: The ID of the new HWPT
+ * @__reserved: Must be 0
+ * @data_type: One of enum iommu_hwpt_data_type
+ * @data_len: Length of the type specific data
+ * @data_uptr: User pointer to the type specific data
+ * @fault_id: The ID of IOMMUFD_FAULT object. Valid only if
+ *            IOMMU_HWPT_FAULT_ID_VALID is set in the @flags field.
+ * @__reserved2: Padding to 64-bit alignment. Must be 0.
+ *
+ * Explicitly allocate a hardware page table object. This is the same object
+ * type that is returned by iommufd_device_attach() and represents the
+ * underlying iommu driver's iommu_domain kernel object.
+ *
+ * A kernel-managed HWPT will be created with the mappings from the given
+ * IOAS via the @pt_id. The @data_type for this allocation must be set to
+ * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a
+ * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags.
+ *
+ * A user-managed nested HWPT will be created from a given vIOMMU (wrapping a
+ * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be
+ * allocated previously via the same ioctl from a given IOAS (@pt_id). In this
+ * case, the @data_type must be set to a pre-defined type corresponding to an
+ * I/O page table type supported by the underlying IOMMU hardware. The device
+ * via @dev_id and the vIOMMU via @pt_id must be associated to the same IOMMU
+ * instance.
+ *
+ * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and
+ * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr
+ * must be given.
+ */
+struct iommu_hwpt_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 dev_id;
+	__u32 pt_id;
+	__u32 out_hwpt_id;
+	__u32 __reserved;
+	__u32 data_type;
+	__u32 data_len;
+	__aligned_u64 data_uptr;
+	__u32 fault_id;
+	__u32 __reserved2;
+};
+#define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC)
+
+/**
+ * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info
+ * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings
+ *                                         on a nested_parent domain.
+ *                                         https://www.intel.com/content/www/us/en/content-details/772415/content-details.html
+ */
+enum iommu_hw_info_vtd_flags {
+	IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0,
+};
+
+/**
+ * struct iommu_hw_info_vtd - Intel VT-d hardware information
+ *
+ * @flags: Combination of enum iommu_hw_info_vtd_flags
+ * @__reserved: Must be 0
+ *
+ * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec
+ *           section 11.4.2 Capability Register.
+ * @ecap_reg: Value of Intel VT-d extended capability register defined in VT-d
+ *            spec section 11.4.3 Extended Capability Register.
+ *
+ * User needs to understand the Intel VT-d specification to decode the
+ * register value.
+ */
+struct iommu_hw_info_vtd {
+	__u32 flags;
+	__u32 __reserved;
+	__aligned_u64 cap_reg;
+	__aligned_u64 ecap_reg;
+};
+
+/**
+ * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information
+ *                                   (IOMMU_HW_INFO_TYPE_ARM_SMMUV3)
+ *
+ * @flags: Must be set to 0
+ * @__reserved: Must be 0
+ * @idr: Implemented features for ARM SMMU Non-secure programming interface
+ * @iidr: Information about the implementation and implementer of ARM SMMU,
+ *        and architecture version supported
+ * @aidr: ARM SMMU architecture version
+ *
+ * For the details of @idr, @iidr and @aidr, please refer to the chapters
+ * from 6.3.1 to 6.3.6 in the SMMUv3 Spec.
+ *
+ * This reports the raw HW capability, and not all bits are meaningful to be
+ * read by userspace. Only the following fields should be used:
+ *
+ * idr[0]: ST_LEVEL, TERM_MODEL, STALL_MODEL, TTENDIAN, CD2L, ASID16, TTF
+ * idr[1]: SIDSIZE, SSIDSIZE
+ * idr[3]: BBML, RIL
+ * idr[5]: VAX, GRAN64K, GRAN16K, GRAN4K
+ *
+ * - S1P should be assumed to be true if a NESTED HWPT can be created
+ * - VFIO/iommufd only support platforms with COHACC, it should be assumed to be
+ *   true.
+ * - ATS is a per-device property. If the VMM describes any devices as ATS
+ *   capable in ACPI/DT it should set the corresponding idr.
+ *
+ * This list may expand in future (eg E0PD, AIE, PBHA, D128, DS etc). It is
+ * important that VMMs do not read bits outside the list to allow for
+ * compatibility with future kernels. Several features in the SMMUv3
+ * architecture are not currently supported by the kernel for nesting: HTTU,
+ * BTM, MPAM and others.
+ */
+struct iommu_hw_info_arm_smmuv3 {
+	__u32 flags;
+	__u32 __reserved;
+	__u32 idr[6];
+	__u32 iidr;
+	__u32 aidr;
+};
+
+/**
+ * struct iommu_hw_info_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Hardware
+ *         Information (IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV)
+ *
+ * @flags: Must be 0
+ * @version: Version number for the CMDQ-V HW for PARAM bits[03:00]
+ * @log2vcmdqs: Log2 of the total number of VCMDQs for PARAM bits[07:04]
+ * @log2vsids: Log2 of the total number of SID replacements for PARAM bits[15:12]
+ * @__reserved: Must be 0
+ *
+ * VMM can use these fields directly in its emulated global PARAM register. Note
+ * that only one Virtual Interface (VINTF) should be exposed to a VM, i.e. PARAM
+ * bits[11:08] should be set to 0 for log2 of the total number of VINTFs.
+ */
+struct iommu_hw_info_tegra241_cmdqv {
+	__u32 flags;
+	__u8 version;
+	__u8 log2vcmdqs;
+	__u8 log2vsids;
+	__u8 __reserved;
+};
+
+/**
+ * enum iommu_hw_info_type - IOMMU Hardware Info Types
+ * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware
+ *                           info
+ * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request for a default type
+ * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
+ * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
+ * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
+ *                                     SMMUv3) info type
+ */
+enum iommu_hw_info_type {
+	IOMMU_HW_INFO_TYPE_NONE = 0,
+	IOMMU_HW_INFO_TYPE_DEFAULT = 0,
+	IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
+	IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
+	IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3,
+};
+
+/**
+ * enum iommufd_hw_capabilities
+ * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking
+ *                               If available, it means the following APIs
+ *                               are supported:
+ *
+ *                                   IOMMU_HWPT_GET_DIRTY_BITMAP
+ *                                   IOMMU_HWPT_SET_DIRTY_TRACKING
+ *
+ * @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it
+ *                               when the struct
+ *                               iommu_hw_info::out_max_pasid_log2 is zero.
+ * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it
+ *                               when the struct
+ *                               iommu_hw_info::out_max_pasid_log2 is zero.
+ */
+enum iommufd_hw_capabilities {
+	IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0,
+	IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1,
+	IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2,
+};
+
+/**
+ * enum iommufd_hw_info_flags - Flags for iommu_hw_info
+ * @IOMMU_HW_INFO_FLAG_INPUT_TYPE: If set, @in_data_type carries an input type
+ *                                 for user space to request for a specific info
+ */
+enum iommufd_hw_info_flags {
+	IOMMU_HW_INFO_FLAG_INPUT_TYPE = 1 << 0,
+};
+
+/**
+ * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO)
+ * @size: sizeof(struct iommu_hw_info)
+ * @flags: Combination of enum iommufd_hw_info_flags
+ * @dev_id: The device bound to the iommufd
+ * @data_len: Input the length of a user buffer in bytes. Output the length of
+ *            data that kernel supports
+ * @data_uptr: User pointer to a user-space buffer used by the kernel to fill
+ *             the iommu type specific hardware information data
+ * @in_data_type: This shares the same field with @out_data_type, making it be
+ *                a bidirectional field. When IOMMU_HW_INFO_FLAG_INPUT_TYPE is
+ *                set, an input type carried via this @in_data_type field will
+ *                be valid, requesting for the info data to the given type. If
+ *                IOMMU_HW_INFO_FLAG_INPUT_TYPE is unset, any input value will
+ *                be seen as IOMMU_HW_INFO_TYPE_DEFAULT
+ * @out_data_type: Output the iommu hardware info type as defined in the enum
+ *                 iommu_hw_info_type.
+ * @out_capabilities: Output the generic iommu capability info type as defined
+ *                    in the enum iommufd_hw_capabilities.
+ * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support.
+ *                      PCI devices turn to out_capabilities to check if the
+ *                      specific capabilities are supported or not.
+ * @__reserved: Must be 0
+ *
+ * Query an iommu type specific hardware information data from an iommu behind
+ * a given device that has been bound to iommufd. This hardware info data will
+ * be used to sync capabilities between the virtual iommu and the physical
+ * iommu, e.g. a nested translation setup needs to check the hardware info, so
+ * a guest stage-1 page table can be compatible with the physical iommu.
+ *
+ * To capture an iommu type specific hardware information data, @data_uptr and
+ * its length @data_len must be provided. Trailing bytes will be zeroed if the
+ * user buffer is larger than the data that kernel has. Otherwise, kernel only
+ * fills the buffer using the given length in @data_len. If the ioctl succeeds,
+ * @data_len will be updated to the length that kernel actually supports,
+ * @out_data_type will be filled to decode the data filled in the buffer
+ * pointed by @data_uptr. Input @data_len == zero is allowed.
+ */
+struct iommu_hw_info {
+	__u32 size;
+	__u32 flags;
+	__u32 dev_id;
+	__u32 data_len;
+	__aligned_u64 data_uptr;
+	union {
+		__u32 in_data_type;
+		__u32 out_data_type;
+	};
+	__u8 out_max_pasid_log2;
+	__u8 __reserved[3];
+	__aligned_u64 out_capabilities;
+};
+#define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO)
+
+/**
+ * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty
+ *                                              tracking
+ * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking
+ */
+enum iommufd_hwpt_set_dirty_tracking_flags {
+	IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1,
+};
+
+/**
+ * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING)
+ * @size: sizeof(struct iommu_hwpt_set_dirty_tracking)
+ * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags
+ * @hwpt_id: HW pagetable ID that represents the IOMMU domain
+ * @__reserved: Must be 0
+ *
+ * Toggle dirty tracking on an HW pagetable.
+ */
+struct iommu_hwpt_set_dirty_tracking {
+	__u32 size;
+	__u32 flags;
+	__u32 hwpt_id;
+	__u32 __reserved;
+};
+#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \
+					  IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING)
+
+/**
+ * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits
+ * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing
+ *                                        any dirty bits metadata. This flag
+ *                                        can be passed in the expectation
+ *                                        where the next operation is an unmap
+ *                                        of the same IOVA range.
+ *
+ */
+enum iommufd_hwpt_get_dirty_bitmap_flags {
+	IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1,
+};
+
+/**
+ * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP)
+ * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap)
+ * @hwpt_id: HW pagetable ID that represents the IOMMU domain
+ * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags
+ * @__reserved: Must be 0
+ * @iova: base IOVA of the bitmap first bit
+ * @length: IOVA range size
+ * @page_size: page size granularity of each bit in the bitmap
+ * @data: bitmap where to set the dirty bits. The bitmap bits each
+ *        represent a page_size which you deviate from an arbitrary iova.
+ *
+ * Checking a given IOVA is dirty:
+ *
+ *  data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64))
+ *
+ * Walk the IOMMU pagetables for a given IOVA range to return a bitmap
+ * with the dirty IOVAs. In doing so it will also by default clear any
+ * dirty bit metadata set in the IOPTE.
+ */
+struct iommu_hwpt_get_dirty_bitmap {
+	__u32 size;
+	__u32 hwpt_id;
+	__u32 flags;
+	__u32 __reserved;
+	__aligned_u64 iova;
+	__aligned_u64 length;
+	__aligned_u64 page_size;
+	__aligned_u64 data;
+};
+#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \
+					IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP)
+
+/**
+ * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation
+ *                                        Data Type
+ * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1
+ * @IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3
+ */
+enum iommu_hwpt_invalidate_data_type {
+	IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0,
+	IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 = 1,
+};
+
+/**
+ * enum iommu_hwpt_vtd_s1_invalidate_flags - Flags for Intel VT-d
+ *                                           stage-1 cache invalidation
+ * @IOMMU_VTD_INV_FLAGS_LEAF: Indicates whether the invalidation applies
+ *                            to all-levels page structure cache or just
+ *                            the leaf PTE cache.
+ */
+enum iommu_hwpt_vtd_s1_invalidate_flags {
+	IOMMU_VTD_INV_FLAGS_LEAF = 1 << 0,
+};
+
+/**
+ * struct iommu_hwpt_vtd_s1_invalidate - Intel VT-d cache invalidation
+ *                                       (IOMMU_HWPT_INVALIDATE_DATA_VTD_S1)
+ * @addr: The start address of the range to be invalidated. It needs to
+ *        be 4KB aligned.
+ * @npages: Number of contiguous 4K pages to be invalidated.
+ * @flags: Combination of enum iommu_hwpt_vtd_s1_invalidate_flags
+ * @__reserved: Must be 0
+ *
+ * The Intel VT-d specific invalidation data for user-managed stage-1 cache
+ * invalidation in nested translation. Userspace uses this structure to
+ * tell the impacted cache scope after modifying the stage-1 page table.
+ *
+ * All the caches related to the page table can be invalidated by setting
+ * @addr to 0 and @npages to U64_MAX.
+ *
+ * The device TLB will be invalidated automatically if ATS is enabled.
+ */
+struct iommu_hwpt_vtd_s1_invalidate {
+	__aligned_u64 addr;
+	__aligned_u64 npages;
+	__u32 flags;
+	__u32 __reserved;
+};
+
+/**
+ * struct iommu_viommu_arm_smmuv3_invalidate - ARM SMMUv3 cache invalidation
+ *         (IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3)
+ * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ.
+ *       Must be little-endian.
+ *
+ * Supported command list only when passing in a vIOMMU via @hwpt_id:
+ *     CMDQ_OP_TLBI_NSNH_ALL
+ *     CMDQ_OP_TLBI_NH_VA
+ *     CMDQ_OP_TLBI_NH_VAA
+ *     CMDQ_OP_TLBI_NH_ALL
+ *     CMDQ_OP_TLBI_NH_ASID
+ *     CMDQ_OP_ATC_INV
+ *     CMDQ_OP_CFGI_CD
+ *     CMDQ_OP_CFGI_CD_ALL
+ *
+ * -EIO will be returned if the command is not supported.
+ */
+struct iommu_viommu_arm_smmuv3_invalidate {
+	__aligned_le64 cmd[2];
+};
+
+/**
+ * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
+ * @size: sizeof(struct iommu_hwpt_invalidate)
+ * @hwpt_id: ID of a nested HWPT or a vIOMMU, for cache invalidation
+ * @data_uptr: User pointer to an array of driver-specific cache invalidation
+ *             data.
+ * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data
+ *             type of all the entries in the invalidation request array. It
+ *             should be a type supported by the hwpt pointed by @hwpt_id.
+ * @entry_len: Length (in bytes) of a request entry in the request array
+ * @entry_num: Input the number of cache invalidation requests in the array.
+ *             Output the number of requests successfully handled by kernel.
+ * @__reserved: Must be 0.
+ *
+ * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications
+ * on a user-managed page table should be followed by this operation, if a HWPT
+ * is passed in via @hwpt_id. Other caches, such as device cache or descriptor
+ * cache can be flushed if a vIOMMU is passed in via the @hwpt_id field.
+ *
+ * Each ioctl can support one or more cache invalidation requests in the array
+ * that has a total size of @entry_len * @entry_num.
+ *
+ * An empty invalidation request array by setting @entry_num==0 is allowed, and
+ * @entry_len and @data_uptr would be ignored in this case. This can be used to
+ * check if the given @data_type is supported or not by kernel.
+ */
+struct iommu_hwpt_invalidate {
+	__u32 size;
+	__u32 hwpt_id;
+	__aligned_u64 data_uptr;
+	__u32 data_type;
+	__u32 entry_len;
+	__u32 entry_num;
+	__u32 __reserved;
+};
+#define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE)
+
+/**
+ * enum iommu_hwpt_pgfault_flags - flags for struct iommu_hwpt_pgfault
+ * @IOMMU_PGFAULT_FLAGS_PASID_VALID: The pasid field of the fault data is
+ *                                   valid.
+ * @IOMMU_PGFAULT_FLAGS_LAST_PAGE: It's the last fault of a fault group.
+ */
+enum iommu_hwpt_pgfault_flags {
+	IOMMU_PGFAULT_FLAGS_PASID_VALID		= (1 << 0),
+	IOMMU_PGFAULT_FLAGS_LAST_PAGE		= (1 << 1),
+};
+
+/**
+ * enum iommu_hwpt_pgfault_perm - perm bits for struct iommu_hwpt_pgfault
+ * @IOMMU_PGFAULT_PERM_READ: request for read permission
+ * @IOMMU_PGFAULT_PERM_WRITE: request for write permission
+ * @IOMMU_PGFAULT_PERM_EXEC: (PCIE 10.4.1) request with a PASID that has the
+ *                           Execute Requested bit set in PASID TLP Prefix.
+ * @IOMMU_PGFAULT_PERM_PRIV: (PCIE 10.4.1) request with a PASID that has the
+ *                           Privileged Mode Requested bit set in PASID TLP
+ *                           Prefix.
+ */
+enum iommu_hwpt_pgfault_perm {
+	IOMMU_PGFAULT_PERM_READ			= (1 << 0),
+	IOMMU_PGFAULT_PERM_WRITE		= (1 << 1),
+	IOMMU_PGFAULT_PERM_EXEC			= (1 << 2),
+	IOMMU_PGFAULT_PERM_PRIV			= (1 << 3),
+};
+
+/**
+ * struct iommu_hwpt_pgfault - iommu page fault data
+ * @flags: Combination of enum iommu_hwpt_pgfault_flags
+ * @dev_id: id of the originated device
+ * @pasid: Process Address Space ID
+ * @grpid: Page Request Group Index
+ * @perm: Combination of enum iommu_hwpt_pgfault_perm
+ * @__reserved: Must be 0.
+ * @addr: Fault address
+ * @length: a hint of how much data the requestor is expecting to fetch. For
+ *          example, if the PRI initiator knows it is going to do a 10MB
+ *          transfer, it could fill in 10MB and the OS could pre-fault in
+ *          10MB of IOVA. It defaults to 0 if there's no such hint.
+ * @cookie: kernel-managed cookie identifying a group of fault messages. The
+ *          cookie number encoded in the last page fault of the group should
+ *          be echoed back in the response message.
+ */
+struct iommu_hwpt_pgfault {
+	__u32 flags;
+	__u32 dev_id;
+	__u32 pasid;
+	__u32 grpid;
+	__u32 perm;
+	__u32 __reserved;
+	__aligned_u64 addr;
+	__u32 length;
+	__u32 cookie;
+};
+
+/**
+ * enum iommufd_page_response_code - Return status of fault handlers
+ * @IOMMUFD_PAGE_RESP_SUCCESS: Fault has been handled and the page tables
+ *                             populated, retry the access. This is the
+ *                             "Success" defined in PCI 10.4.2.1.
+ * @IOMMUFD_PAGE_RESP_INVALID: Could not handle this fault, don't retry the
+ *                             access. This is the "Invalid Request" in PCI
+ *                             10.4.2.1.
+ */
+enum iommufd_page_response_code {
+	IOMMUFD_PAGE_RESP_SUCCESS = 0,
+	IOMMUFD_PAGE_RESP_INVALID = 1,
+};
+
+/**
+ * struct iommu_hwpt_page_response - IOMMU page fault response
+ * @cookie: The kernel-managed cookie reported in the fault message.
+ * @code: One of response code in enum iommufd_page_response_code.
+ */
+struct iommu_hwpt_page_response {
+	__u32 cookie;
+	__u32 code;
+};
+
+/**
+ * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC)
+ * @size: sizeof(struct iommu_fault_alloc)
+ * @flags: Must be 0
+ * @out_fault_id: The ID of the new FAULT
+ * @out_fault_fd: The fd of the new FAULT
+ *
+ * Explicitly allocate a fault handling object.
+ */
+struct iommu_fault_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 out_fault_id;
+	__u32 out_fault_fd;
+};
+#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC)
+
+/**
+ * enum iommu_viommu_type - Virtual IOMMU Type
+ * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use
+ * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type
+ * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
+ *                                    SMMUv3) enabled ARM SMMUv3 type
+ */
+enum iommu_viommu_type {
+	IOMMU_VIOMMU_TYPE_DEFAULT = 0,
+	IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1,
+	IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2,
+};
+
+/**
+ * struct iommu_viommu_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Virtual Interface
+ *                                      (IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV)
+ * @out_vintf_mmap_offset: mmap offset argument for VINTF's page0
+ * @out_vintf_mmap_length: mmap length argument for VINTF's page0
+ *
+ * Both @out_vintf_mmap_offset and @out_vintf_mmap_length are reported by kernel
+ * for user space to mmap the VINTF page0 from the host physical address space
+ * to the guest physical address space so that a guest kernel can directly R/W
+ * access to the VINTF page0 in order to control its virtual command queues.
+ */
+struct iommu_viommu_tegra241_cmdqv {
+	__aligned_u64 out_vintf_mmap_offset;
+	__aligned_u64 out_vintf_mmap_length;
+};
+
+/**
+ * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC)
+ * @size: sizeof(struct iommu_viommu_alloc)
+ * @flags: Must be 0
+ * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type
+ * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU
+ * @hwpt_id: ID of a nesting parent HWPT to associate to
+ * @out_viommu_id: Output virtual IOMMU ID for the allocated object
+ * @data_len: Length of the type specific data
+ * @__reserved: Must be 0
+ * @data_uptr: User pointer to a driver-specific virtual IOMMU data
+ *
+ * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's
+ * virtualization support that is a security-isolated slice of the real IOMMU HW
+ * that is unique to a specific VM. Operations global to the IOMMU are connected
+ * to the vIOMMU, such as:
+ * - Security namespace for guest owned ID, e.g. guest-controlled cache tags
+ * - Non-device-affiliated event reporting, e.g. invalidation queue errors
+ * - Access to a sharable nesting parent pagetable across physical IOMMUs
+ * - Virtualization of various platforms IDs, e.g. RIDs and others
+ * - Delivery of paravirtualized invalidation
+ * - Direct assigned invalidation queues
+ * - Direct assigned interrupts
+ */
+struct iommu_viommu_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 type;
+	__u32 dev_id;
+	__u32 hwpt_id;
+	__u32 out_viommu_id;
+	__u32 data_len;
+	__u32 __reserved;
+	__aligned_u64 data_uptr;
+};
+#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC)
+
+/**
+ * struct iommu_vdevice_alloc - ioctl(IOMMU_VDEVICE_ALLOC)
+ * @size: sizeof(struct iommu_vdevice_alloc)
+ * @viommu_id: vIOMMU ID to associate with the virtual device
+ * @dev_id: The physical device to allocate a virtual instance on the vIOMMU
+ * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTROY
+ * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID
+ *           of AMD IOMMU, and vRID of Intel VT-d
+ *
+ * Allocate a virtual device instance (for a physical device) against a vIOMMU.
+ * This instance holds the device's information (related to its vIOMMU) in a VM.
+ * User should use IOMMU_DESTROY to destroy the virtual device before
+ * destroying the physical device (by closing vfio_cdev fd). Otherwise the
+ * virtual device would be forcibly destroyed on physical device destruction,
+ * its vdevice_id would be permanently leaked (unremovable & unreusable) until
+ * iommu fd closed.
+ */
+struct iommu_vdevice_alloc {
+	__u32 size;
+	__u32 viommu_id;
+	__u32 dev_id;
+	__u32 out_vdevice_id;
+	__aligned_u64 virt_id;
+};
+#define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
+
+/**
+ * struct iommu_ioas_change_process - ioctl(IOMMU_IOAS_CHANGE_PROCESS)
+ * @size: sizeof(struct iommu_ioas_change_process)
+ * @__reserved: Must be 0
+ *
+ * This transfers pinned memory counts for every memory map in every IOAS
+ * in the context to the current process.  This only supports maps created
+ * with IOMMU_IOAS_MAP_FILE, and returns EINVAL if other maps are present.
+ * If the ioctl returns a failure status, then nothing is changed.
+ *
+ * This API is useful for transferring operation of a device from one process
+ * to another, such as during userland live update.
+ */
+struct iommu_ioas_change_process {
+	__u32 size;
+	__u32 __reserved;
+};
+
+#define IOMMU_IOAS_CHANGE_PROCESS \
+	_IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
+
+/**
+ * enum iommu_veventq_flag - flag for struct iommufd_vevent_header
+ * @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs
+ */
+enum iommu_veventq_flag {
+	IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0),
+};
+
+/**
+ * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
+ * @flags: Combination of enum iommu_veventq_flag
+ * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of
+ *            [0, INT_MAX] where the following index of INT_MAX is 0
+ *
+ * Each iommufd_vevent_header reports a sequence index of the following vEVENT:
+ *
+ * +----------------------+-------+----------------------+-------+---+-------+
+ * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN |
+ * +----------------------+-------+----------------------+-------+---+-------+
+ *
+ * This sequence index is expected to be monotonic to the sequence index of the
+ * previous vEVENT. If two adjacent sequence indexes have a delta larger than 1,
+ * it means that (delta - 1) vEVENTs have been lost, e.g. two lost vEVENTs:
+ *
+ * +-----+----------------------+-------+----------------------+-------+-----+
+ * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... |
+ * +-----+----------------------+-------+----------------------+-------+-----+
+ *
+ * If a vEVENT is lost at the tail of the vEVENTQ and there is no following
+ * vEVENT providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS
+ * header would be added to the tail, and no data would follow this header:
+ *
+ * +--+----------------------+-------+-----------------------------------------+
+ * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} |
+ * +--+----------------------+-------+-----------------------------------------+
+ */
+struct iommufd_vevent_header {
+	__u32 flags;
+	__u32 sequence;
+};
+
+/**
+ * enum iommu_veventq_type - Virtual Event Queue Type
+ * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use
+ * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue
+ * @IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension IRQ
+ */
+enum iommu_veventq_type {
+	IOMMU_VEVENTQ_TYPE_DEFAULT = 0,
+	IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1,
+	IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV = 2,
+};
+
+/**
+ * struct iommu_vevent_arm_smmuv3 - ARM SMMUv3 Virtual Event
+ *                                  (IOMMU_VEVENTQ_TYPE_ARM_SMMUV3)
+ * @evt: 256-bit ARM SMMUv3 Event record, little-endian.
+ *       Reported event records: (Refer to "7.3 Event records" in SMMUv3 HW Spec)
+ *       - 0x04 C_BAD_STE
+ *       - 0x06 F_STREAM_DISABLED
+ *       - 0x08 C_BAD_SUBSTREAMID
+ *       - 0x0a C_BAD_CD
+ *       - 0x10 F_TRANSLATION
+ *       - 0x11 F_ADDR_SIZE
+ *       - 0x12 F_ACCESS
+ *       - 0x13 F_PERMISSION
+ *
+ * StreamID field reports a virtual device ID. To receive a virtual event for a
+ * device, a vDEVICE must be allocated via IOMMU_VDEVICE_ALLOC.
+ */
+struct iommu_vevent_arm_smmuv3 {
+	__aligned_le64 evt[4];
+};
+
+/**
+ * struct iommu_vevent_tegra241_cmdqv - Tegra241 CMDQV IRQ
+ *                                      (IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV)
+ * @lvcmdq_err_map: 128-bit logical vcmdq error map, little-endian.
+ *                  (Refer to register LVCMDQ_ERR_MAPs per VINTF)
+ *
+ * The 128-bit register value from HW exclusively reflects the error bits for a
+ * Virtual Interface represented by a vIOMMU object. Read and report directly.
+ */
+struct iommu_vevent_tegra241_cmdqv {
+	__aligned_le64 lvcmdq_err_map[2];
+};
+
+/**
+ * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC)
+ * @size: sizeof(struct iommu_veventq_alloc)
+ * @flags: Must be 0
+ * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with
+ * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type
+ * @veventq_depth: Maximum number of events in the vEVENTQ
+ * @out_veventq_id: The ID of the new vEVENTQ
+ * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the
+ *                  successfully returned fd after using it
+ * @__reserved: Must be 0
+ *
+ * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU
+ * can have multiple FDs for different types, but is confined to one per @type.
+ * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ,
+ * if there are vEVENTs available. A vEVENTQ will lose events due to overflow,
+ * if the number of the vEVENTs hits @veventq_depth.
+ *
+ * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by
+ * a type-specific data structure, in a normal case:
+ *
+ * +-+---------+-------+---------+-------+-----+---------+-------+-+
+ * | | header0 | data0 | header1 | data1 | ... | headerN | dataN | |
+ * +-+---------+-------+---------+-------+-----+---------+-------+-+
+ *
+ * unless a trailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to
+ * struct iommufd_vevent_header).
+ */
+struct iommu_veventq_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 type;
+	__u32 veventq_depth;
+	__u32 out_veventq_id;
+	__u32 out_veventq_fd;
+	__u32 __reserved;
+};
+#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
+
+/**
+ * enum iommu_hw_queue_type - HW Queue Type
+ * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use
+ * @IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
+ *                                      SMMUv3) Virtual Command Queue (VCMDQ)
+ */
+enum iommu_hw_queue_type {
+	IOMMU_HW_QUEUE_TYPE_DEFAULT = 0,
+	/*
+	 * TEGRA241_CMDQV requirements (otherwise, allocation will fail)
+	 * - alloc starts from the lowest @index=0 in ascending order
+	 * - destroy starts from the last allocated @index in descending order
+	 * - @base_addr must be aligned to @length in bytes and mapped in IOAS
+	 * - @length must be a power of 2, with a minimum 32 bytes and a maximum
+	 *   2 ^ idr[1].CMDQS * 16 bytes (use GET_HW_INFO call to read idr[1]
+	 *   from struct iommu_hw_info_arm_smmuv3)
+	 * - suggest to back the queue memory with contiguous physical pages or
+	 *   a single huge page with alignment of the queue size, and limit the
+	 *   emulated vSMMU's IDR1.CMDQS to log2(huge page size / 16 bytes)
+	 */
+	IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV = 1,
+};
+
+/**
+ * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC)
+ * @size: sizeof(struct iommu_hw_queue_alloc)
+ * @flags: Must be 0
+ * @viommu_id: Virtual IOMMU ID to associate the HW queue with
+ * @type: One of enum iommu_hw_queue_type
+ * @index: The logical index to the HW queue per virtual IOMMU for a multi-queue
+ *         model
+ * @out_hw_queue_id: The ID of the new HW queue
+ * @nesting_parent_iova: Base address of the queue memory in the guest physical
+ *                       address space
+ * @length: Length of the queue memory
+ *
+ * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue, which
+ * allows HW to access a guest queue memory described using @nesting_parent_iova
+ * and @length.
+ *
+ * A vIOMMU can allocate multiple queues, but it must use a different @index per
+ * type to separate each allocation, e.g::
+ *
+ *     Type1 HW queue0, Type1 HW queue1, Type2 HW queue0, ...
+ */
+struct iommu_hw_queue_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 type;
+	__u32 index;
+	__u32 out_hw_queue_id;
+	__aligned_u64 nesting_parent_iova;
+	__aligned_u64 length;
+};
+#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC)
+#endif
diff --git a/kernel/linux/uapi/linux/vduse.h b/kernel/linux/uapi/linux/vduse.h
index f46269af34..da6ac89af1 100644
--- a/kernel/linux/uapi/linux/vduse.h
+++ b/kernel/linux/uapi/linux/vduse.h
@@ -237,7 +237,7 @@ struct vduse_iova_umem {
  * struct vduse_iova_info - information of one IOVA region
  * @start: start of the IOVA region
  * @last: last of the IOVA region
- * @capability: capability of the IOVA regsion
+ * @capability: capability of the IOVA region
  * @reserved: for future use, needs to be initialized to zero
  *
  * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of
diff --git a/kernel/linux/uapi/linux/vfio.h b/kernel/linux/uapi/linux/vfio.h
index 79bf8c0cc5..4d96d1fc12 100644
--- a/kernel/linux/uapi/linux/vfio.h
+++ b/kernel/linux/uapi/linux/vfio.h
@@ -905,10 +905,12 @@ struct vfio_device_feature {
  * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18,
  *				   struct vfio_device_bind_iommufd)
  * @argsz:	 User filled size of this data.
- * @flags:	 Must be 0.
+ * @flags:	 Must be 0 or a combination of VFIO_DEVICE_BIND_* flags
  * @iommufd:	 iommufd to bind.
  * @out_devid:	 The device id generated by this bind. devid is a handle for
  *		 this device/iommufd bond and can be used in IOMMUFD commands.
+ * @token_uuid_ptr: Valid if VFIO_DEVICE_BIND_FLAG_TOKEN. Points to a 16 byte
+ *                  UUID in the same format as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN.
  *
  * Bind a vfio_device to the specified iommufd.
  *
@@ -917,13 +919,21 @@ struct vfio_device_feature {
  *
  * Unbind is automatically conducted when device fd is closed.
  *
+ * A token is sometimes required to open the device. Unless this is known to be
+ * needed, VFIO_DEVICE_BIND_FLAG_TOKEN should not be set and token_uuid_ptr is
+ * ignored. The only case today is a PF/VF relationship where the VF bind must
+ * be provided the same token as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN provided to
+ * the PF.
+ *
  * Return: 0 on success, -errno on failure.
  */
 struct vfio_device_bind_iommufd {
 	__u32		argsz;
 	__u32		flags;
+#define VFIO_DEVICE_BIND_FLAG_TOKEN (1 << 0)
 	__s32		iommufd;
 	__u32		out_devid;
+	__aligned_u64	token_uuid_ptr;
 };
 
 #define VFIO_DEVICE_BIND_IOMMUFD	_IO(VFIO_TYPE, VFIO_BASE + 18)
diff --git a/kernel/linux/uapi/version b/kernel/linux/uapi/version
index 966a998301..d9e789dade 100644
--- a/kernel/linux/uapi/version
+++ b/kernel/linux/uapi/version
@@ -1 +1 @@
-v6.16
+v6.17
-- 
2.47.3


^ permalink raw reply related

* [PATCH v8 02/18] vfio: make all functions internal
From: Anatoly Burakov @ 2026-06-11 15:08 UTC (permalink / raw)
  To: dev, Bruce Richardson, Dmitry Kozlyuk
In-Reply-To: <cover.1781190151.git.anatoly.burakov@intel.com>

The VFIO API is an externally exported API because the original intent was
to offer DMA mapping facilities to applications. However, practical usage
of this API seems to be centered around drivers, so keeping this API
exported to applications only creates needless API/ABI stability surface
that has no added value.

Make the entire VFIO API internal-only and visible only to drivers.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 doc/guides/rel_notes/deprecation.rst |  6 ----
 lib/eal/freebsd/eal.c                | 30 +++++++++---------
 lib/eal/include/rte_vfio.h           | 47 ++++++++++++++++++++++++----
 lib/eal/linux/eal_vfio.c             | 32 +++++++++----------
 lib/eal/windows/eal.c                |  4 +--
 5 files changed, 74 insertions(+), 45 deletions(-)

diff --git a/doc/guides/rel_notes/deprecation.rst b/doc/guides/rel_notes/deprecation.rst
index c7f8230278..f2901064f5 100644
--- a/doc/guides/rel_notes/deprecation.rst
+++ b/doc/guides/rel_notes/deprecation.rst
@@ -30,12 +30,6 @@ Deprecation Notices
   Use the ``-S <service-corelist>`` parameter instead
   to specify the cores to be used for background services in DPDK.
 
-* eal: The entire VFIO API (``rte_vfio_*``) will be made internal only,
-  and will only be available to EAL and drivers.
-  Group-based API (``rte_vfio_*_group_*``) will be removed
-  and replaced with unified container device assignment API.
-  This change will be made in 26.11 release.
-
 * vdpa: The vDPA driver API will no longer offer ``get_vfio_group_fd``
   as part of its internal API. All drivers will be adjusted
   to use the new unified VFIO container device assignment API.
diff --git a/lib/eal/freebsd/eal.c b/lib/eal/freebsd/eal.c
index 8b1ba5b99b..0fe54a9dd7 100644
--- a/lib/eal/freebsd/eal.c
+++ b/lib/eal/freebsd/eal.c
@@ -819,7 +819,7 @@ rte_eal_vfio_get_vf_token(__rte_unused rte_uuid_t vf_token)
 {
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_setup_device)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_setup_device)
 int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
 		      __rte_unused const char *dev_addr,
 		      __rte_unused int *vfio_dev_fd,
@@ -829,7 +829,7 @@ int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_release_device)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_release_device)
 int rte_vfio_release_device(__rte_unused const char *sysfs_base,
 			__rte_unused const char *dev_addr,
 			__rte_unused int fd)
@@ -838,33 +838,33 @@ int rte_vfio_release_device(__rte_unused const char *sysfs_base,
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_enable)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_enable)
 int rte_vfio_enable(__rte_unused const char *modname)
 {
 	rte_errno = ENOTSUP;
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_is_enabled)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_is_enabled)
 int rte_vfio_is_enabled(__rte_unused const char *modname)
 {
 	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_noiommu_is_enabled)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_noiommu_is_enabled)
 int rte_vfio_noiommu_is_enabled(void)
 {
 	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_clear_group)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_clear_group)
 int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
 {
 	rte_errno = ENOTSUP;
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_get_group_num)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_num)
 int
 rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
 		       __rte_unused const char *dev_addr,
@@ -874,7 +874,7 @@ rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_get_container_fd)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_container_fd)
 int
 rte_vfio_get_container_fd(void)
 {
@@ -882,7 +882,7 @@ rte_vfio_get_container_fd(void)
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_get_group_fd)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_fd)
 int
 rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
 {
@@ -890,7 +890,7 @@ rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_create)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_create)
 int
 rte_vfio_container_create(void)
 {
@@ -898,7 +898,7 @@ rte_vfio_container_create(void)
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_destroy)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_destroy)
 int
 rte_vfio_container_destroy(__rte_unused int container_fd)
 {
@@ -906,7 +906,7 @@ rte_vfio_container_destroy(__rte_unused int container_fd)
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_group_bind)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_bind)
 int
 rte_vfio_container_group_bind(__rte_unused int container_fd,
 		__rte_unused int iommu_group_num)
@@ -915,7 +915,7 @@ rte_vfio_container_group_bind(__rte_unused int container_fd,
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_group_unbind)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_unbind)
 int
 rte_vfio_container_group_unbind(__rte_unused int container_fd,
 		__rte_unused int iommu_group_num)
@@ -924,7 +924,7 @@ rte_vfio_container_group_unbind(__rte_unused int container_fd,
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_map)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_map)
 int
 rte_vfio_container_dma_map(__rte_unused int container_fd,
 			__rte_unused uint64_t vaddr,
@@ -935,7 +935,7 @@ rte_vfio_container_dma_map(__rte_unused int container_fd,
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_unmap)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_unmap)
 int
 rte_vfio_container_dma_unmap(__rte_unused int container_fd,
 			__rte_unused uint64_t vaddr,
diff --git a/lib/eal/include/rte_vfio.h b/lib/eal/include/rte_vfio.h
index d1e8bce56b..0ddeb08f94 100644
--- a/lib/eal/include/rte_vfio.h
+++ b/lib/eal/include/rte_vfio.h
@@ -7,7 +7,11 @@
 
 /**
  * @file
- * RTE VFIO. This library provides various VFIO related utility functions.
+ * @internal
+ *
+ * RTE VFIO internal API.
+ *
+ * This library provides VFIO related utility functions for use by drivers.
  */
 
 #include <stdbool.h>
@@ -36,6 +40,7 @@ struct vfio_device_info;
 #define RTE_VFIO_DEFAULT_CONTAINER_FD (-1)
 
 /**
+ * @internal
  * Setup vfio_cfg for the device identified by its address.
  * It discovers the configured I/O MMU groups or sets a new one for the device.
  * If a new groups is assigned, the DMA mapping is performed.
@@ -60,10 +65,12 @@ struct vfio_device_info;
  *   <0 on failure.
  *   >1 if the device cannot be managed this way.
  */
+__rte_internal
 int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info);
 
 /**
+ * @internal
  * Release a device mapped to a VFIO-managed I/O MMU group.
  *
  * This function is only relevant to linux and will return
@@ -82,9 +89,11 @@ int rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
  *   0 on success.
  *   <0 on failure.
  */
+__rte_internal
 int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
 
 /**
+ * @internal
  * Enable a VFIO-related kmod.
  *
  * This function is only relevant to linux and will return
@@ -97,9 +106,11 @@ int rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd
  *   0 on success.
  *   <0 on failure.
  */
+__rte_internal
 int rte_vfio_enable(const char *modname);
 
 /**
+ * @internal
  * Check whether a VFIO-related kmod is enabled.
  *
  * This function is only relevant to Linux.
@@ -111,9 +122,11 @@ int rte_vfio_enable(const char *modname);
  *   1 if true.
  *   0 otherwise.
  */
+__rte_internal
 int rte_vfio_is_enabled(const char *modname);
 
 /**
+ * @internal
  * Whether VFIO NOIOMMU mode is enabled.
  *
  * This function is only relevant to Linux.
@@ -123,10 +136,12 @@ int rte_vfio_is_enabled(const char *modname);
  *   0 if false.
  *   <0 for errors.
  */
+__rte_internal
 int rte_vfio_noiommu_is_enabled(void);
 
 /**
- * Remove group fd from internal VFIO group fd array/
+ * @internal
+ * Remove group fd from internal VFIO group fd array.
  *
  * This function is only relevant to linux and will return
  * an error on BSD.
@@ -138,11 +153,13 @@ int rte_vfio_noiommu_is_enabled(void);
  *   0 on success.
  *   <0 on failure.
  */
+__rte_internal
 int
 rte_vfio_clear_group(int vfio_group_fd);
 
 /**
- * Parse IOMMU group number for a device
+ * @internal
+ * Parse IOMMU group number for a device.
  *
  * This function is only relevant to linux and will return
  * an error on BSD.
@@ -161,12 +178,14 @@ rte_vfio_clear_group(int vfio_group_fd);
  *   0 for non-existent group or VFIO
  *  <0 for errors
  */
+__rte_internal
 int
 rte_vfio_get_group_num(const char *sysfs_base,
 		      const char *dev_addr, int *iommu_group_num);
 
 /**
- * Get device information
+ * @internal
+ * Get device information.
  *
  * This function is only relevant to Linux and will return an error on BSD.
  *
@@ -186,12 +205,13 @@ rte_vfio_get_group_num(const char *sysfs_base,
  *   0 on success.
  *  <0 on failure.
  */
-__rte_experimental
+__rte_internal
 int
 rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info);
 
 /**
+ * @internal
  * Get the default VFIO container fd
  *
  * This function is only relevant to linux and will return
@@ -201,11 +221,13 @@ rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
  *  > 0 default container fd
  *  < 0 if VFIO is not enabled or not supported
  */
+__rte_internal
 int
 rte_vfio_get_container_fd(void);
 
 /**
- * Open VFIO group fd or get an existing one
+ * @internal
+ * Open VFIO group fd or get an existing one.
  *
  * This function is only relevant to linux and will return
  * an error on BSD.
@@ -217,10 +239,12 @@ rte_vfio_get_container_fd(void);
  *  > 0 group fd
  *  < 0 for errors
  */
+__rte_internal
 int
 rte_vfio_get_group_fd(int iommu_group_num);
 
 /**
+ * @internal
  * Create a new container for device binding.
  *
  * @note Any newly allocated DPDK memory will not be mapped into these
@@ -235,10 +259,12 @@ rte_vfio_get_group_fd(int iommu_group_num);
  *   the container fd if successful
  *   <0 if failed
  */
+__rte_internal
 int
 rte_vfio_container_create(void);
 
 /**
+ * @internal
  * Destroy the container, unbind all vfio groups within it.
  *
  * @param container_fd
@@ -248,10 +274,12 @@ rte_vfio_container_create(void);
  *    0 if successful
  *   <0 if failed
  */
+__rte_internal
 int
 rte_vfio_container_destroy(int container_fd);
 
 /**
+ * @internal
  * Bind a IOMMU group to a container.
  *
  * @param container_fd
@@ -264,10 +292,12 @@ rte_vfio_container_destroy(int container_fd);
  *   group fd if successful
  *   <0 if failed
  */
+__rte_internal
 int
 rte_vfio_container_group_bind(int container_fd, int iommu_group_num);
 
 /**
+ * @internal
  * Unbind a IOMMU group from a container.
  *
  * @param container_fd
@@ -280,10 +310,12 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num);
  *    0 if successful
  *   <0 if failed
  */
+__rte_internal
 int
 rte_vfio_container_group_unbind(int container_fd, int iommu_group_num);
 
 /**
+ * @internal
  * Perform DMA mapping for devices in a container.
  *
  * @param container_fd
@@ -303,11 +335,13 @@ rte_vfio_container_group_unbind(int container_fd, int iommu_group_num);
  *    0 if successful
  *   <0 if failed
  */
+__rte_internal
 int
 rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
 		uint64_t iova, uint64_t len);
 
 /**
+ * @internal
  * Perform DMA unmapping for devices in a container.
  *
  * @param container_fd
@@ -327,6 +361,7 @@ rte_vfio_container_dma_map(int container_fd, uint64_t vaddr,
  *    0 if successful
  *   <0 if failed
  */
+__rte_internal
 int
 rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr,
 		uint64_t iova, uint64_t len);
diff --git a/lib/eal/linux/eal_vfio.c b/lib/eal/linux/eal_vfio.c
index f1050ffa60..33fa04feaf 100644
--- a/lib/eal/linux/eal_vfio.c
+++ b/lib/eal/linux/eal_vfio.c
@@ -532,7 +532,7 @@ get_vfio_cfg_by_container_fd(int container_fd)
 	return NULL;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_get_group_fd)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_fd)
 int
 rte_vfio_get_group_fd(int iommu_group_num)
 {
@@ -731,7 +731,7 @@ vfio_sync_default_container(void)
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_clear_group)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_clear_group)
 int
 rte_vfio_clear_group(int vfio_group_fd)
 {
@@ -755,7 +755,7 @@ rte_vfio_clear_group(int vfio_group_fd)
 	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_setup_device)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_setup_device)
 int
 rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info)
@@ -1009,7 +1009,7 @@ rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_release_device)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_release_device)
 int
 rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
 		    int vfio_dev_fd)
@@ -1098,7 +1098,7 @@ rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
 	return ret;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_enable)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_enable)
 int
 rte_vfio_enable(const char *modname)
 {
@@ -1175,7 +1175,7 @@ rte_vfio_enable(const char *modname)
 	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_is_enabled)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_is_enabled)
 int
 rte_vfio_is_enabled(const char *modname)
 {
@@ -1215,7 +1215,7 @@ vfio_set_iommu_type(int vfio_container_fd)
 	return NULL;
 }
 
-RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_vfio_get_device_info, 24.03)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_device_info)
 int
 rte_vfio_get_device_info(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info)
@@ -1349,7 +1349,7 @@ vfio_open_container_fd(bool mp_request)
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_get_container_fd)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_container_fd)
 int
 rte_vfio_get_container_fd(void)
 {
@@ -1363,7 +1363,7 @@ rte_vfio_get_container_fd(void)
 	return default_vfio_cfg->vfio_container_fd;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_get_group_num)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_get_group_num)
 int
 rte_vfio_get_group_num(const char *sysfs_base,
 		const char *dev_addr, int *iommu_group_num)
@@ -2034,7 +2034,7 @@ container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
 	return ret;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_noiommu_is_enabled)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_noiommu_is_enabled)
 int
 rte_vfio_noiommu_is_enabled(void)
 {
@@ -2067,7 +2067,7 @@ rte_vfio_noiommu_is_enabled(void)
 	return c == 'Y';
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_create)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_create)
 int
 rte_vfio_container_create(void)
 {
@@ -2094,7 +2094,7 @@ rte_vfio_container_create(void)
 	return vfio_cfgs[i].vfio_container_fd;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_destroy)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_destroy)
 int
 rte_vfio_container_destroy(int container_fd)
 {
@@ -2120,7 +2120,7 @@ rte_vfio_container_destroy(int container_fd)
 	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_group_bind)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_bind)
 int
 rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
 {
@@ -2135,7 +2135,7 @@ rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
 	return vfio_get_group_fd(vfio_cfg, iommu_group_num);
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_group_unbind)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_group_unbind)
 int
 rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
 {
@@ -2176,7 +2176,7 @@ rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
 	return 0;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_map)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_map)
 int
 rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
 		uint64_t len)
@@ -2197,7 +2197,7 @@ rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
 	return container_dma_map(vfio_cfg, vaddr, iova, len);
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_unmap)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_unmap)
 int
 rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
 		uint64_t len)
diff --git a/lib/eal/windows/eal.c b/lib/eal/windows/eal.c
index 6dacae7235..de7a89a829 100644
--- a/lib/eal/windows/eal.c
+++ b/lib/eal/windows/eal.c
@@ -453,7 +453,7 @@ eal_asprintf(char **buffer, const char *format, ...)
 	return ret;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_map)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_map)
 int
 rte_vfio_container_dma_map(__rte_unused int container_fd,
 			__rte_unused uint64_t vaddr,
@@ -464,7 +464,7 @@ rte_vfio_container_dma_map(__rte_unused int container_fd,
 	return -1;
 }
 
-RTE_EXPORT_SYMBOL(rte_vfio_container_dma_unmap)
+RTE_EXPORT_INTERNAL_SYMBOL(rte_vfio_container_dma_unmap)
 int
 rte_vfio_container_dma_unmap(__rte_unused int container_fd,
 			__rte_unused uint64_t vaddr,
-- 
2.47.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox