DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/3] net/iavf: downgrade opcode 0 ARQ log to debug
From: Ciara Loftus @ 2026-06-08 14:55 UTC (permalink / raw)
  To: dev; +Cc: Talluri Chaitanyababu
In-Reply-To: <20260608145518.1705524-1-ciara.loftus@intel.com>

From: Talluri Chaitanyababu <chaitanyababux.talluri@intel.com>

After admin queue reinitialisation, completions from uninitialised
ARQ ring descriptor memory may arrive before any real PF response.
These carry opcode 0 (`VIRTCHNL_OP_UNKNOWN`) and trigger a WARNING
log on every poll iteration, flooding the log during reset recovery.

Treat opcode 0 as a distinct case and log it at DEBUG level, while
retaining WARNING for genuine opcode mismatches.

Signed-off-by: Talluri Chaitanyababu <chaitanyababux.talluri@intel.com>
---
 drivers/net/intel/iavf/iavf_vchnl.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf_vchnl.c b/drivers/net/intel/iavf/iavf_vchnl.c
index 94ccfb5d6e..cd90d35023 100644
--- a/drivers/net/intel/iavf/iavf_vchnl.c
+++ b/drivers/net/intel/iavf/iavf_vchnl.c
@@ -299,8 +299,15 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 		/* async reply msg on command issued by vf previously */
 		result = IAVF_MSG_CMD;
 		if (opcode != vf->pend_cmd) {
-			PMD_DRV_LOG(WARNING, "command mismatch, expect %u, get %u",
-					vf->pend_cmd, opcode);
+			if (opcode == VIRTCHNL_OP_UNKNOWN)
+				PMD_DRV_LOG(DEBUG,
+					    "Spurious msg with opcode 0, pending cmd %u",
+					    vf->pend_cmd);
+			else
+				PMD_DRV_LOG(WARNING,
+					    "command mismatch, expect %u, get %u",
+					    vf->pend_cmd, opcode);
+
 			result = IAVF_MSG_ERR;
 		}
 	}
-- 
2.43.0


^ permalink raw reply related

* [PATCH 2/3] net/iavf: wait for PF reset start before reinitializing
From: Ciara Loftus @ 2026-06-08 14:55 UTC (permalink / raw)
  To: dev; +Cc: Ciara Loftus, stable, Talluri Chaitanyababu
In-Reply-To: <20260608145518.1705524-1-ciara.loftus@intel.com>

Commit 1428895ad417 ("net/iavf: fix disabling of promiscuous modes on
close") added a synchronous VIRTCHNL round-trip on the close path
before the reset request is sent. This delays the reset just long
enough that `IAVF_VFGEN_RSTAT` still reads `VIRTCHNL_VFR_VFACTIVE`
when the re-init path polls it for reset completion. The driver
interprets this as the reset being complete, when in fact it has not
yet started, and proceeds to issue VIRTCHNL commands before the PF
has disabled the VF mailbox.

Fix by polling `IAVF_VF_ARQLEN1.ARQENABLE` immediately after the reset
request and before shutting down the admin queue, when the close is
triggered by a reset. The PF clears this bit as its first reset action,
providing an unambiguous signal that the reset is in progress.

Fixes: 1428895ad417 ("net/iavf: fix disabling of promiscuous modes on close")
Cc: stable@dpdk.org

Reported-by: Talluri Chaitanyababu <chaitanyababux.talluri@intel.com>
Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
---
 drivers/net/intel/iavf/iavf.h        |  1 +
 drivers/net/intel/iavf/iavf_ethdev.c | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 2615b6f034..4444602a30 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -291,6 +291,7 @@ struct iavf_info {
 	struct rte_eth_dev *eth_dev;
 
 	bool in_reset_recovery;
+	bool reset_pending;
 
 	uint32_t ptp_caps;
 	rte_spinlock_t phc_time_aq_lock;
diff --git a/drivers/net/intel/iavf/iavf_ethdev.c b/drivers/net/intel/iavf/iavf_ethdev.c
index a8031e23a5..a38132e80e 100644
--- a/drivers/net/intel/iavf/iavf_ethdev.c
+++ b/drivers/net/intel/iavf/iavf_ethdev.c
@@ -106,6 +106,7 @@ static int iavf_dev_start(struct rte_eth_dev *dev);
 static int iavf_dev_stop(struct rte_eth_dev *dev);
 static int iavf_dev_close(struct rte_eth_dev *dev);
 static int iavf_dev_reset(struct rte_eth_dev *dev);
+static bool iavf_is_reset_detected(struct iavf_adapter *adapter);
 static int iavf_dev_info_get(struct rte_eth_dev *dev,
 			     struct rte_eth_dev_info *dev_info);
 static const uint32_t *iavf_dev_supported_ptypes_get(struct rte_eth_dev *dev,
@@ -3196,6 +3197,14 @@ iavf_dev_close(struct rte_eth_dev *dev)
 	iavf_flow_uninit(adapter);
 
 	iavf_vf_reset(hw);
+	/*
+	 * If a reset is pending, wait for the PF to disable the VF's admin
+	 * receive queue (its first reset action) before we shut it down
+	 * ourselves.  This ensures iavf_check_vf_reset_done() does not see
+	 * a stale VFACTIVE value on the re-init path.
+	 */
+	if (vf->reset_pending)
+		iavf_is_reset_detected(adapter);
 	vf->aq_intr_enabled = false;
 	iavf_shutdown_adminq(hw);
 	if (vf->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_WB_ON_ITR) {
@@ -3273,6 +3282,7 @@ iavf_dev_reset(struct rte_eth_dev *dev)
 	struct iavf_adapter *adapter =
 		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
 	struct iavf_hw *hw = IAVF_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+	struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(dev->data->dev_private);
 	/*
 	 * Check whether the VF reset has been done and inform application,
 	 * to avoid calling the virtual channel command, which may cause
@@ -3285,8 +3295,10 @@ iavf_dev_reset(struct rte_eth_dev *dev)
 	}
 	iavf_set_no_poll(adapter, false);
 
+	vf->reset_pending = true;
 	PMD_DRV_LOG(DEBUG, "Start dev_reset ...");
 	ret = iavf_dev_uninit(dev);
+	vf->reset_pending = false;
 	if (ret)
 		return ret;
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH 3/3] net/iavf: fix event handler refcount leak on HW reset
From: Ciara Loftus @ 2026-06-08 14:55 UTC (permalink / raw)
  To: dev; +Cc: Ciara Loftus, stable
In-Reply-To: <20260608145518.1705524-1-ciara.loftus@intel.com>

Currently, when handling a hardware reset, the uninit path skips
releasing the event handler reference while in_reset_recovery is set,
to prevent premature teardown of the event handler thread. However, the
subsequent re-init call unconditionally increments the reference count,
inflating ndev on every reset cycle. On the final device removal, the
count never reaches zero and the event handler thread is never joined.

Fix it by also skipping the event handler reference acquisition during
reset recovery, matching the symmetric skip in the uninit path so the
count stays stable across each reset cycle.

Fixes: 3e6a5d2d310a ("net/iavf: add devargs to enable VF auto-reset")
Cc: stable@dpdk.org

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
---
 drivers/net/intel/iavf/iavf_ethdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/intel/iavf/iavf_ethdev.c b/drivers/net/intel/iavf/iavf_ethdev.c
index a38132e80e..ec1ad02826 100644
--- a/drivers/net/intel/iavf/iavf_ethdev.c
+++ b/drivers/net/intel/iavf/iavf_ethdev.c
@@ -3031,7 +3031,7 @@ iavf_dev_init(struct rte_eth_dev *eth_dev)
 	adapter->tpid = RTE_ETHER_TYPE_VLAN; /* VLAN TPID set to 0x8100 by default */
 	rte_spinlock_init(&adapter->phc_sync_lock);

-	if (iavf_dev_event_handler_init())
+	if (!vf->in_reset_recovery && iavf_dev_event_handler_init())
 		goto init_vf_err;

 	if (iavf_init_vf(eth_dev) != 0) {
-- 
2.43.0

^ permalink raw reply related

* Re: [PATCH v2] common/cnxk: allow typecasting to CN20K NPA structures
From: Jerin Jacob @ 2026-06-08 15:13 UTC (permalink / raw)
  To: Nawal Kishor
  Cc: dev, Nithin Dabilpuram, Kiran Kumar K, Sunil Kumar Kori,
	Satha Rao, Harman Kalra, jerinj, asekhar
In-Reply-To: <20260325062114.2595888-1-nkishor@marvell.com>

On Wed, Mar 25, 2026 at 12:10 PM Nawal Kishor <nkishor@marvell.com> wrote:
>
> Add __attribute__((may_alias)) to the CN20K-specific NPA structures
> (npa_cn20k_aura_s, npa_cn20k_pool_s, and npa_cn20k_halo_s) to allow
> safe type punning when casting between these structures and their
> base types (npa_aura_s and npa_pool_s).
>
> This attribute tells the compiler that these structures may alias
> with other types, which is necessary when casting pointers between
> compatible hardware register structures that share the same memory
> layout. Without this attribute, such casts violate strict aliasing
> rules and can lead to incorrect compiler optimizations.
>
> Signed-off-by: Nawal Kishor <nkishor@marvell.com>

Applied to dpdk-next-net-mrvl/for-main. Thanks

^ permalink raw reply

* Re: [PATCH 1/1] ml/cnxk: avoid overwriting layer name during load
From: Jerin Jacob @ 2026-06-08 15:14 UTC (permalink / raw)
  To: Srikanth Yalavarthi; +Cc: dev
In-Reply-To: <20260331085445.1105590-1-syalavarthi@marvell.com>

On Tue, Mar 31, 2026 at 7:10 PM Srikanth Yalavarthi
<syalavarthi@marvell.com> wrote:
>
> Layer name is initialized during metadata fetch and
> parsing stage. Avoid overwriting the layer name during
> layer load.
>
> Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>


Please change the title to "ml/cnxk: fix ..."

and add a Fixes: tag.

> ---
>  drivers/ml/cnxk/cn10k_ml_ops.c | 3 ---
>  1 file changed, 3 deletions(-)
>
> diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
> index 628ff963c3c..77947120f25 100644
> --- a/drivers/ml/cnxk/cn10k_ml_ops.c
> +++ b/drivers/ml/cnxk/cn10k_ml_ops.c
> @@ -671,9 +671,6 @@ cn10k_ml_layer_load(void *device, uint16_t model_id, const char *layer_name, uin
>         rte_memcpy(&layer->glow.metadata, buffer, sizeof(struct cn10k_ml_model_metadata));
>         cn10k_ml_model_metadata_update(&layer->glow.metadata);
>
> -       /* Set layer name */
> -       rte_memcpy(layer->name, layer->glow.metadata.model.name, MRVL_ML_MODEL_NAME_LEN);
> -
>         /* Enable support for batch_size of 256 */
>         if (layer->glow.metadata.model.batch_size == 0)
>                 layer->batch_size = 256;
> --
> 2.34.1
>

^ permalink raw reply

* Re: [PATCH 1/1] ml/cnxk: enable data caching for all MRVL layers
From: Jerin Jacob @ 2026-06-08 15:15 UTC (permalink / raw)
  To: Srikanth Yalavarthi; +Cc: dev, Anup Prabhu
In-Reply-To: <20260331085350.1105103-1-syalavarthi@marvell.com>

On Tue, Mar 31, 2026 at 2:30 PM Srikanth Yalavarthi
<syalavarthi@marvell.com> wrote:
>
> From: Anup Prabhu <aprabhu@marvell.com>
>
> Enabled data caching for all MRVL layers in TVM models.
>
> Signed-off-by: Anup Prabhu <aprabhu@marvell.com>


Applied to dpdk-next-net-mrvl/for-main. Thanks

> ---
>  drivers/ml/cnxk/cn10k_ml_ops.c | 9 ++-------
>  1 file changed, 2 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
> index b30af7c7a44..628ff963c3c 100644
> --- a/drivers/ml/cnxk/cn10k_ml_ops.c
> +++ b/drivers/ml/cnxk/cn10k_ml_ops.c
> @@ -997,13 +997,8 @@ cn10k_ml_layer_start(void *device, uint16_t model_id, const char *layer_name)
>         if (ret < 0) {
>                 cn10k_ml_layer_stop(device, model_id, layer_name);
>         } else {
> -               if (cn10k_mldev->cache_model_data) {
> -                       if ((model->type == ML_CNXK_MODEL_TYPE_GLOW &&
> -                            model->subtype == ML_CNXK_MODEL_SUBTYPE_GLOW_MRVL) ||
> -                           (model->type == ML_CNXK_MODEL_TYPE_TVM &&
> -                            model->subtype == ML_CNXK_MODEL_SUBTYPE_TVM_MRVL))
> -                               ret = cn10k_ml_cache_model_data(cnxk_mldev, layer);
> -               }
> +               if (cn10k_mldev->cache_model_data)
> +                       ret = cn10k_ml_cache_model_data(cnxk_mldev, layer);
>         }
>
>         return ret;
> --
> 2.47.0
>

^ permalink raw reply

* Re: [PATCH 1/1] ml/cnxk: support for 64-bit int type in metadata
From: Jerin Jacob @ 2026-06-08 15:16 UTC (permalink / raw)
  To: Srikanth Yalavarthi; +Cc: dev
In-Reply-To: <20260331085529.1105898-1-syalavarthi@marvell.com>

On Tue, Mar 31, 2026 at 6:41 PM Srikanth Yalavarthi
<syalavarthi@marvell.com> wrote:
>
> Added support for 64-bit integer data type in model metadata.
>
> Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>

Applied to dpdk-next-net-mrvl/for-main. Thanks

^ permalink raw reply

* Re: [PATCH v2] net/ark: fix unsafe env variable in extension loading
From: Stephen Hemminger @ 2026-06-08 15:24 UTC (permalink / raw)
  To: Denis Sergeev
  Cc: dev, shepard.siegel, ed.czeck, john.miller, stable, sdl.dpdk
In-Reply-To: <20260603053313.119342-1-denserg.edu@gmail.com>

On Wed,  3 Jun 2026 08:32:45 +0300
Denis Sergeev <denserg.edu@gmail.com> wrote:

> The ARK_EXT_PATH environment variable is passed to dlopen without
> verifying process privileges. In a setuid/setgid scenario, this
> could allow loading an arbitrary shared library with elevated
> privileges.
> 
> Add a check that effective user/group IDs match real IDs before
> trusting the environment variable, consistent with the same
> protection already present in the mlx5 driver.
> 
> Found by Linux Verification Center (linuxtesting.org) with SVACE.
> 
> Fixes: 727b3fe292bc ("net/ark: integrate PMD")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Denis Sergeev <denserg.edu@gmail.com>

Thanks for the report, but it makes no sense.
DPDK already loads shared libraries via the -d command line arg without
checking. And running a DPDK application as setuid would be completely
unsafe. The startup is not hardened in any way.

NAK

That said, it would be good if DPDK had some security documentation
about what the trust boundary is and what capabilities are needed.

^ permalink raw reply

* Re: [EXTERNAL] [PATCH] net/octeontx: fix buffer overflow in device name formatting
From: Jerin Jacob @ 2026-06-08 15:29 UTC (permalink / raw)
  To: Sergei Iashin, Harman Kalra, Santosh Shukla
  Cc: dev@dpdk.org, stable@dpdk.org, jerin.jacob@caviumnetworks.com
In-Reply-To: <20260407075732.1175609-1-yashin.sergey@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2807 bytes --]


Applied to dpdk-next-net-mrvl/for-main. Thanks


________________________________
From: Sergei Iashin <yashin.sergey@gmail.com>
Sent: Tuesday, April 7, 2026 1:27 PM
To: Harman Kalra <hkalra@marvell.com>; Jerin Jacob <jerinj@marvell.com>; Santosh Shukla <santosh.shukla@caviumnetworks.com>
Cc: dev@dpdk.org <dev@dpdk.org>; stable@dpdk.org <stable@dpdk.org>; jerin.jacob@caviumnetworks.com <jerin.jacob@caviumnetworks.com>; Sergei Iashin <yashin.sergey@gmail.com>
Subject: [EXTERNAL] [PATCH] net/octeontx: fix buffer overflow in device name formatting

Replace sprintf with snprintf when formatting into the fixed-size octtx_name buffer in octeontx_create and octeontx_remove. The device name can be up to 63 bytes (RTE_DEV_NAME_MAX_LEN) while the buffer is only 32 bytes (OCTEONTX_MAX_NAME_LEN),
ZjQcmQRYFpfptBannerStart
Prioritize security for external emails:
Confirm sender and content safety before clicking links or opening attachments
Report Suspicious<https://us-phishalarm-ewt.proofpoint.com/EWT/v1/CRVmXkqW!tm3Z1f8UYnVa9O-8WX26DsK-0LaBO_9qwE4pEx2cpcKfFql8RWpbr-t0B-4n0FjU7XSDAvlitsV3KK8Ua-2nw37gJz6mivFAuDI$>

ZjQcmQRYFpfptBannerEnd

Replace sprintf with snprintf when formatting into the fixed-size
octtx_name buffer in octeontx_create and octeontx_remove. The device
name can be up to 63 bytes (RTE_DEV_NAME_MAX_LEN) while the buffer
is only 32 bytes (OCTEONTX_MAX_NAME_LEN), which may cause a stack
buffer overflow with a long user-provided --vdev name.

Fixes: f18b146c498d ("net/octeontx: create ethdev ports")
Cc: stable@dpdk.org

Signed-off-by: Sergei Iashin <yashin.sergey@gmail.com>
---
 drivers/net/octeontx/octeontx_ethdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/octeontx/octeontx_ethdev.c b/drivers/net/octeontx/octeontx_ethdev.c
index 21e3e56901..e4dca30d9d 100644
--- a/drivers/net/octeontx/octeontx_ethdev.c
+++ b/drivers/net/octeontx/octeontx_ethdev.c
@@ -1555,7 +1555,7 @@ octeontx_create(struct rte_vdev_device *dev, int port, uint8_t evdev,

        PMD_INIT_FUNC_TRACE();

-       sprintf(octtx_name, "%s_%d", name, port);
+       snprintf(octtx_name, sizeof(octtx_name), "%s_%d", name, port);
        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
                eth_dev = rte_eth_dev_attach_secondary(octtx_name);
                if (eth_dev == NULL)
@@ -1711,7 +1711,7 @@ octeontx_remove(struct rte_vdev_device *dev)
                return -EINVAL;

        for (i = 0; i < OCTEONTX_VDEV_DEFAULT_MAX_NR_PORT; i++) {
-               sprintf(octtx_name, "eth_octeontx_%d", i);
+               snprintf(octtx_name, sizeof(octtx_name), "eth_octeontx_%d", i);

                eth_dev = rte_eth_dev_allocated(octtx_name);
                if (eth_dev == NULL)
--
2.39.5



[-- Attachment #2: Type: text/html, Size: 6645 bytes --]

^ permalink raw reply related

* Re: [PATCH] net/nfp: fix null dereference in flower ctrl NFD3 Tx
From: Stephen Hemminger @ 2026-06-08 15:30 UTC (permalink / raw)
  To: Denis Sergeev; +Cc: dev, chaoyong.he, stable, sdl.dpdk
In-Reply-To: <20260603055211.120315-1-denserg.edu@gmail.com>

On Wed,  3 Jun 2026 08:51:56 +0300
Denis Sergeev <denserg.edu@gmail.com> wrote:

> In nfp_flower_ctrl_vnic_nfd3_xmit(), when txq is NULL, goto xmit_end
> leads to unconditional dereference of txq->qcp_q in nfp_qcp_ptr_add().
> The same goto from the "no free descriptors" path incorrectly increments
> the hardware write pointer despite no descriptor being written.
> 
> Replace both gotos with early return, removing the unused xmit_end label.
> 
> Found by Linux Verification Center (linuxtesting.org) with SVACE.
> 
> Fixes: a36634e87e16 ("net/nfp: add flower ctrl VNIC Rx/Tx")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Denis Sergeev <denserg.edu@gmail.com>

Applied to next-net

^ permalink raw reply

* Re: [EXTERNAL] [PATCH] net/octeontx/base: fix out-of-bounds read in DQ range lookup
From: Jerin Jacob @ 2026-06-08 15:37 UTC (permalink / raw)
  To: Sergei Iashin, Harman Kalra, Santosh Shukla
  Cc: dev@dpdk.org, stable@dpdk.org, jerin.jacob@caviumnetworks.com
In-Reply-To: <20260407113001.1217481-1-yashin.sergey@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2732 bytes --]

Applied to dpdk-next-net-mrvl/for-main. Thanks

________________________________
From: Sergei Iashin <yashin.sergey@gmail.com>
Sent: Tuesday, April 7, 2026 5:00 PM
To: Harman Kalra <hkalra@marvell.com>; Santosh Shukla <santosh.shukla@caviumnetworks.com>; Jerin Jacob <jerinj@marvell.com>
Cc: dev@dpdk.org <dev@dpdk.org>; stable@dpdk.org <stable@dpdk.org>; jerin.jacob@caviumnetworks.com <jerin.jacob@caviumnetworks.com>; Sergei Iashin <yashin.sergey@gmail.com>
Subject: [EXTERNAL] [PATCH] net/octeontx/base: fix out-of-bounds read in DQ range lookup

In octeontx_pko_dq_range_lookup(), the inner while loop evaluates the array access ctl->dq_map[dq].chanid before the bounds check dq < RTE_DIM(ctl->dq_map). When dq is incremented to 256 inside the loop, the next iteration reads one
ZjQcmQRYFpfptBannerStart
Prioritize security for external emails:
Confirm sender and content safety before clicking links or opening attachments
<https://us-phishalarm-ewt.proofpoint.com/EWT/v1/CRVmXkqW!tm3Z1f8UYnVa9O-cmb1abtPB-IORJwK3Jr3VXVds937zvL1Te5uABuIyTLhBPe1u0lFyd2PYF2MzgfBRj9IabE7Hc6ItR791qHo$>
Report Suspicious

ZjQcmQRYFpfptBannerEnd

In octeontx_pko_dq_range_lookup(), the inner while loop evaluates the
array access ctl->dq_map[dq].chanid before the bounds check
dq < RTE_DIM(ctl->dq_map). When dq is incremented to 256 inside the
loop, the next iteration reads one element past the end of the
256-element dq_map array before the bounds condition can short-circuit.

Swap the two conjuncts so the bounds check is evaluated first, matching
the pattern already used in the outer loop.

Fixes: cad78ca23818 ("net/octeontx/base: add base PKO operations")
Cc: jerin.jacob@caviumnetworks.com
Cc: stable@dpdk.org

Signed-off-by: Sergei Iashin <yashin.sergey@gmail.com>
---
 drivers/net/octeontx/base/octeontx_pkovf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/octeontx/base/octeontx_pkovf.c b/drivers/net/octeontx/base/octeontx_pkovf.c
index 7aec84a813..5326fe24b9 100644
--- a/drivers/net/octeontx/base/octeontx_pkovf.c
+++ b/drivers/net/octeontx/base/octeontx_pkovf.c
@@ -196,8 +196,8 @@ octeontx_pko_dq_range_lookup(struct octeontx_pko_vf_ctl_s *ctl, uint64_t chanid,
        while (dq < RTE_DIM(ctl->dq_map)) {
                dq_base = dq;
                dq_cnt = 0;
-               while (ctl->dq_map[dq].chanid == ~chanid &&
-                       dq < RTE_DIM(ctl->dq_map)) {
+               while (dq < RTE_DIM(ctl->dq_map) &&
+                       ctl->dq_map[dq].chanid == ~chanid) {
                        dq_cnt++;
                        if (dq_cnt == dq_num)
                                return dq_base;
--
2.39.5

[-- Attachment #2: Type: text/html, Size: 7065 bytes --]

^ permalink raw reply related

* Re: [PATCH v4] ethdev: support inline calculating masked item value
From: Stephen Hemminger @ 2026-06-08 15:45 UTC (permalink / raw)
  To: Bing Zhao
  Cc: viacheslavo, dev, rasland, orika, dsosnowski, suanmingm, matan,
	thomas
In-Reply-To: <20260603092805.9837-1-bingz@nvidia.com>

On Wed, 3 Jun 2026 12:28:05 +0300
Bing Zhao <bingz@nvidia.com> wrote:

> In the asynchronous API definition and some drivers, the
> rte_flow_item spec value may not be calculated by the driver due to the
> reason of speed of light rule insertion rate and sometimes the input
> parameters will be copied and changed internally.
> 
> After copying, the spec and last will be protected by the keyword
> const and cannot be changed in the code itself. And also the driver
> needs some extra memory to do the calculation and extra conditions
> to understand the length of each item spec. This is not efficient.
> 
> To solve the issue and support usage of the following fix, a new OP
> was introduced to calculate the spec and last values after applying
> the mask inline.
> 
> Signed-off-by: Bing Zhao <bingz@nvidia.com>
> ---

More detailed AI review found some things that still need addressing.

On Wed,  3 Jun 2026 12:28:05 +0300, Bing Zhao wrote:
> Subject: [PATCH v4] ethdev: support inline calculating masked item value

Error: byte-wise masking corrupts embedded pointers in deep-copy item
types (RAW, FLEX, GENEVE_OPT).

In rte_flow_conv_pattern(), the new mask application runs over the fixed
item struct:

	size_t item_mask_size = mask ? rte_flow_conv_item_mask_size(src) : 0;
	...
	size_t mask_size = RTE_MIN(ret, item_mask_size);

	for (j = 0; j < mask_size; j++)
		c_spec[j] &= mask[j];

item_mask_size is rte_flow_desc_item[type].size, the size of the fixed
item struct. For RTE_FLOW_ITEM_TYPE_RAW, FLEX, and GENEVE_OPT, that fixed
struct ends in an embedded pointer that rte_flow_conv_item_spec() has just
populated to point at the deep-copied trailing data (rte_flow_item_raw.pattern,
rte_flow_item_flex.pattern, rte_flow_item_geneve_opt.data). Because the masked
range covers the whole fixed struct, the loop ANDs the bytes of that pointer
with the mask's corresponding bytes (typically a NULL mask pointer), zeroing
or garbling it.

The converted item's pattern/data pointer is clobbered while the copied
payload it should reference is left unreachable. A consumer that follows
conv->pattern then dereferences NULL or a corrupt address. Plain value items
(eth, ipv4, ...) are unaffected; only the deep-copy item types break, which
is exactly what the test does not exercise.

Suggested fix: do not blind-mask the entire fixed struct for items that carry
an embedded pointer / desc_fn deep copy. Either skip masking when
rte_flow_desc_item[type].desc_fn != NULL, or mask only the leading plain-data
region and leave the pointer field (and trailing copied bytes) intact.

Warning: the new test validates only an ETH pattern, so the RAW/FLEX/GENEVE_OPT
path above is untested. A RAW item case would have surfaced the pointer
corruption.

Info: the Doxygen block for RTE_FLOW_CONV_OP_PATTERN_MASKED uses @p mask,
@p spec, @p last, but those are item fields, not parameters of the op; the
neighboring enum entries only document the @p src / @p dst types.

^ permalink raw reply

* Re: [PATCH] eal: fix core_index for non-EAL registered threads
From: David Marchand @ 2026-06-08 15:49 UTC (permalink / raw)
  To: Maxime Peim, Dariusz Sosnowski, Slava Ovsiienko
  Cc: dev, Matan Azrad, Thomas Monjalon
In-Reply-To: <20260422075414.2528455-1-maxime.peim@gmail.com>

Hello,

On Wed, 22 Apr 2026 at 09:54, Maxime Peim <maxime.peim@gmail.com> wrote:
>
> Threads registered via rte_thread_register() are assigned a valid
> lcore_id by eal_lcore_non_eal_allocate(), but their core_index in
> lcore_config is left at -1. This value was set during rte_eal_cpu_init()
> for lcores with ROLE_OFF (undetected CPUs) and is never updated when the
> lcore is later allocated to a non-EAL thread.
>
> As a result, rte_lcore_index() returns -1 for registered non-EAL
> threads. Libraries that use rte_lcore_index() to select per-lcore
> caches fall back to a shared global path when it returns -1, causing
> severe contention under concurrent access from multiple registered
> threads.
>
> A concrete example is the mlx5 indexed memory pool (mlx5_ipool), which
> uses rte_lcore_index() in mlx5_ipool_malloc_cache() to select a per-core
> cache slot. When core_index is -1, all registered threads are funneled
> into a single shared slot protected by a spinlock. In testing with VPP
> (which registers worker threads via rte_thread_register()), this caused
> async flow rule insertion throughput to drop from ~6.4M rules/sec to
> ~1.2M rules/sec with 4 workers -- a 5x regression attributable entirely
> to spinlock contention in the ipool allocator.
>
> Fix by setting core_index to the next sequential index (cfg->lcore_count)
> in eal_lcore_non_eal_allocate() before incrementing the count. Also reset
> core_index back to -1 on the error rollback path and in
> eal_lcore_non_eal_release() for correctness.
>
> Fixes: 5c307ba2a5b1 ("eal: register non-EAL threads as lcores")
> Signed-off-by: Maxime Peim <maxime.peim@gmail.com>

Thanks for the fix Maxime, it looks correct though I am a bit
skeptical about usage of this API with dynamic thread allocation.

In the net/mlx5 context, for example, I expect no memory saving from
using the lcore "index": mlx5 is allocating an array with
RTE_MAX_LCORE+1 entries.
Using rte_lcore_id() would probably be good enough.
Dariusz, Slava, any opinion?


-- 
David Marchand


^ permalink raw reply

* Re: [PATCH v2 2/3] event/cnxk: add pause to spinloops
From: Jerin Jacob @ 2026-06-08 15:49 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Pavan Nikhilesh, Shijith Thotton
In-Reply-To: <20260413164652.33291-3-stephen@networkplumber.org>

On Mon, Apr 13, 2026 at 10:36 PM Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> On SMT systems when a spinloop is done without a pause
> it may cause excessive latency. This problem was found
> by the fix_empty_spinloops coccinelle script.
>
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>

rte_pause() translates to the YIELD instruction. Since cnxk is an
integrated SoC with single-threaded cores, it won't help with
anything; it only adds one more instruction's worth of latency.
In general, the 3/3 devtool patch is good. Please send it as a separate
version so that the 3/3 patch can be merged through the main tree.

---------------
The YIELD instruction write up from ARMv8 manual.

YIELD is a hint instruction in the ARMv8-A architecture. It tells the
CPU that the current hardware thread is doing nothing useful right now
(typically spinning in a busy-wait loop), so the processor may
reallocate shared execution resources to other hardware threads.

On SMT (multithreaded) cores, this can give sibling hardware threads
more resources, improving overall throughput.
On most current single-threaded ARM cores, YIELD executes as a NOP —
it has no microarchitectural effect, but it's architecturally valid
and harmless. It does not put the core to sleep (unlike WFE/WFI).
It's a pure hint: it never changes program correctness, only
potentially performance/fairness.
-------------

^ permalink raw reply

* Re: [PATCH] dma/cnxk: fix crash on secondary process cleanup
From: Jerin Jacob @ 2026-06-08 16:09 UTC (permalink / raw)
  To: pbhagavatula
  Cc: jerinj, Vamsi Attunuru, Anatoly Burakov, Radha Mohan Chintakuntla,
	dev, stable
In-Reply-To: <20260605081620.97056-1-pbhagavatula@marvell.com>

On Fri, Jun 5, 2026 at 2:11 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> cnxk_dmadev_probe() ran in secondary processes too, overwriting the
> shared rdpi->pci_dev with a process-local pointer and marking the
> device ready. With buses now cleaned up on shutdown, the primary's
> roc_dpi_dev_fini() dereferences that stale pointer and crashes.
>
> Skip HW init in secondary processes: attach to the shared device data
> and return, leaving rdpi and the device state untouched.
>
> Fixes: 53f6d7328bf4 ("dma/cnxk: create and initialize device on PCI probing")
> Cc: stable@dpdk.org
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Applied to dpdk-next-net-mrvl/for-main. Thanks

^ permalink raw reply

* Re: [PATCH] eal: fix core_index for non-EAL registered threads
From: David Marchand @ 2026-06-08 16:10 UTC (permalink / raw)
  To: Maxime Peim; +Cc: dev
In-Reply-To: <20260422075414.2528455-1-maxime.peim@gmail.com>

On Wed, 22 Apr 2026 at 09:54, Maxime Peim <maxime.peim@gmail.com> wrote:
>
> Threads registered via rte_thread_register() are assigned a valid
> lcore_id by eal_lcore_non_eal_allocate(), but their core_index in
> lcore_config is left at -1. This value was set during rte_eal_cpu_init()
> for lcores with ROLE_OFF (undetected CPUs) and is never updated when the
> lcore is later allocated to a non-EAL thread.
>
> As a result, rte_lcore_index() returns -1 for registered non-EAL
> threads. Libraries that use rte_lcore_index() to select per-lcore
> caches fall back to a shared global path when it returns -1, causing
> severe contention under concurrent access from multiple registered
> threads.
>
> A concrete example is the mlx5 indexed memory pool (mlx5_ipool), which
> uses rte_lcore_index() in mlx5_ipool_malloc_cache() to select a per-core
> cache slot. When core_index is -1, all registered threads are funneled
> into a single shared slot protected by a spinlock. In testing with VPP
> (which registers worker threads via rte_thread_register()), this caused
> async flow rule insertion throughput to drop from ~6.4M rules/sec to
> ~1.2M rules/sec with 4 workers -- a 5x regression attributable entirely
> to spinlock contention in the ipool allocator.
>
> Fix by setting core_index to the next sequential index (cfg->lcore_count)
> in eal_lcore_non_eal_allocate() before incrementing the count. Also reset
> core_index back to -1 on the error rollback path and in
> eal_lcore_non_eal_release() for correctness.
>
> Fixes: 5c307ba2a5b1 ("eal: register non-EAL threads as lcores")
Cc: stable@dpdk.org

> Signed-off-by: Maxime Peim <maxime.peim@gmail.com>
Acked-by: David Marchand <david.marchand@redhat.com>

Applied, thanks.


-- 
David Marchand


^ permalink raw reply

* [PATCH 0/7] intel network and pcapng updates
From: Dawid Wesierski @ 2026-06-08 16:40 UTC (permalink / raw)
  To: dev
  Cc: thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Wesierski, Dawid

From: "Wesierski, Dawid" <dawid.wesierski@intel.com>

These patches provide various updates for Intel iavf/ice drivers and pcapng.
The changes include:
- Hardware limit ring descriptor increases for iavf.
- Runtime queue rate limit configuration for iavf.
- Scheduler burst size reductions for ice base.
- Global PTP timestamping for ice.
- Disabling runtime queue setup for iavf.
- User-supplied timestamp support in pcapng.
- Header split mbuf callback support for ice.

Marek Kasiewicz (7):
  net/iavf: increase max ring descriptors to hardware limit
  net/iavf: allow runtime queue rate limit configuration
  net/ice/base: reduce default scheduler burst size
  net/ice: timestamp all received packets when PTP is enabled
  net/iavf: disable runtime queue setup capability
  pcapng: add user-supplied timestamp support
  net/ice: add header split mbuf callback support

 drivers/net/intel/common/rx.h         |  2 +
 drivers/net/intel/iavf/iavf_ethdev.c  |  3 --
 drivers/net/intel/iavf/iavf_rxtx.h    |  2 +-
 drivers/net/intel/iavf/iavf_tm.c      | 11 ++--
 drivers/net/intel/ice/base/ice_type.h |  2 +-
 drivers/net/intel/ice/ice_ethdev.c    |  1 +
 drivers/net/intel/ice/ice_rxtx.c      | 72 ++++++++++++++++++++++++---
 drivers/net/intel/ice/ice_rxtx.h      |  2 +
 lib/ethdev/ethdev_driver.h            | 10 ++++
 lib/ethdev/rte_ethdev.c               | 17 +++++++
 lib/ethdev/rte_ethdev.h               | 46 +++++++++++++++++
 lib/pcapng/rte_pcapng.c               | 19 ++-----
 lib/pcapng/rte_pcapng.h               | 41 ++++++++++++++-
 13 files changed, 196 insertions(+), 32 deletions(-)

-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.


^ permalink raw reply

* [PATCH 1/7] net/iavf: increase max ring descriptors to hardware limit
From: Dawid Wesierski @ 2026-06-08 16:40 UTC (permalink / raw)
  To: dev
  Cc: thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Marek Kasiewicz,
	Dawid Wesierski
In-Reply-To: <20260608164059.65420-1-dawid.wesierski@intel.com>

From: Marek Kasiewicz <marek.kasiewicz@intel.com>

The Intel E810 hardware supports up to 8160 (8K - 32) descriptors per
TX/RX ring, but IAVF_MAX_RING_DESC caps it at 4096. Applications that
need deep descriptor rings for hardware rate-limited pacing (e.g.,
ST2110 video with thousands of packets per frame) cannot queue enough
packets before the pacing epoch begins.

Increase IAVF_MAX_RING_DESC to the hardware maximum of 8160 to allow
full utilization of the ring depth on E810 VFs.

Signed-off-by: Marek Kasiewicz <marek.kasiewicz@intel.com>
Signed-off-by: Dawid Wesierski <dawid.wesierski@intel.com>
---
 drivers/net/intel/iavf/iavf_rxtx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/intel/iavf/iavf_rxtx.h b/drivers/net/intel/iavf/iavf_rxtx.h
index 8449236d4d..22ea415f44 100644
--- a/drivers/net/intel/iavf/iavf_rxtx.h
+++ b/drivers/net/intel/iavf/iavf_rxtx.h
@@ -16,7 +16,7 @@
 /* In QLEN must be whole number of 32 descriptors. */
 #define IAVF_ALIGN_RING_DESC      32
 #define IAVF_MIN_RING_DESC        64
-#define IAVF_MAX_RING_DESC        4096
+#define IAVF_MAX_RING_DESC        (8192 - 32)
 #define IAVF_DMA_MEM_ALIGN        4096
 /* Base address of the HW descriptor ring should be 128B aligned. */
 #define IAVF_RING_BASE_ALIGN      128
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.

^ permalink raw reply related

* [PATCH 2/7] net/iavf: allow runtime queue rate limit configuration
From: Dawid Wesierski @ 2026-06-08 16:40 UTC (permalink / raw)
  To: dev
  Cc: thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Marek Kasiewicz,
	Dawid Wesierski
In-Reply-To: <20260608164059.65420-1-dawid.wesierski@intel.com>

From: Marek Kasiewicz <marek.kasiewicz@intel.com>

Allow per-queue bandwidth rate limiting to be configured without
stopping the port when only a single TC node and single QoS element
are involved. This enables dynamic session management where individual
queue pacing rates can be changed while other queues continue
transmitting.

Also fix the queue ID assignment in the bandwidth configuration to
use the actual TM node ID rather than a sequential counter index, and
mark the TM hierarchy as committed only when the port is stopped, so
that a hierarchy committed at runtime can still be reconfigured.

Signed-off-by: Marek Kasiewicz <marek.kasiewicz@intel.com>
Signed-off-by: Dawid Wesierski <dawid.wesierski@intel.com>
---
 drivers/net/intel/iavf/iavf_tm.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf_tm.c b/drivers/net/intel/iavf/iavf_tm.c
index 1cf7bfb106..43d7a44337 100644
--- a/drivers/net/intel/iavf/iavf_tm.c
+++ b/drivers/net/intel/iavf/iavf_tm.c
@@ -804,8 +804,10 @@ static int iavf_hierarchy_commit(struct rte_eth_dev *dev,
 	int index = 0, node_committed = 0;
 	int i, ret_val = IAVF_SUCCESS;

-	/* check if port is stopped */
-	if (adapter->stopped != 1) {
+	/* check if port is stopped, except for setting queue bandwidth */
+	if (vf->tm_conf.nb_tc_node != 1 &&
+	    vf->qos_cap->num_elem != 1 &&
+	    adapter->stopped != 1) {
 		PMD_DRV_LOG(ERR, "Please stop port first");
 		ret_val = IAVF_ERR_NOT_READY;
 		goto err;
@@ -856,7 +858,7 @@ static int iavf_hierarchy_commit(struct rte_eth_dev *dev,
 		q_tc_mapping->tc[tm_node->tc].req.queue_count++;

 		if (tm_node->shaper_profile) {
-			q_bw->cfg[node_committed].queue_id = node_committed;
+			q_bw->cfg[node_committed].queue_id = tm_node->id;
 			q_bw->cfg[node_committed].shaper.peak =
 			tm_node->shaper_profile->profile.peak.rate /
 			1000 * IAVF_BITS_PER_BYTE;
@@ -900,7 +902,8 @@ static int iavf_hierarchy_commit(struct rte_eth_dev *dev,
 		goto fail_clear;

 	vf->qtc_map = qtc_map;
-	vf->tm_conf.committed = true;
+	if (adapter->stopped == 1)
+		vf->tm_conf.committed = true;
 	return ret_val;

 fail_clear:
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.

^ permalink raw reply related

* [PATCH 3/7] net/ice/base: reduce default scheduler burst size
From: Dawid Wesierski @ 2026-06-08 16:40 UTC (permalink / raw)
  To: dev
  Cc: thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Marek Kasiewicz,
	Dawid Wesierski
In-Reply-To: <20260608164059.65420-1-dawid.wesierski@intel.com>

From: Marek Kasiewicz <marek.kasiewicz@intel.com>

Reduce ICE_SCHED_DFLT_BURST_SIZE from 15 KB to 2 KB to improve
TX rate limiter granularity. The E810 TX scheduler uses a token
bucket algorithm where the burst size controls the maximum bytes
sent in a single burst before the rate limiter throttles.

A 15 KB burst allows micro-bursts of ~10 max-size frames, which
violates tight inter-packet spacing requirements in time-sensitive
networking applications such as SMPTE ST 2110-21 narrow-sender
compliance. Reducing to 2 KB forces near-constant-rate output
matching the configured shaper profile.

Signed-off-by: Marek Kasiewicz <marek.kasiewicz@intel.com>
Signed-off-by: Dawid Wesierski <dawid.wesierski@intel.com>
---
 drivers/net/intel/ice/base/ice_type.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/intel/ice/base/ice_type.h b/drivers/net/intel/ice/base/ice_type.h
index 6d8c187689..39569ff3e3 100644
--- a/drivers/net/intel/ice/base/ice_type.h
+++ b/drivers/net/intel/ice/base/ice_type.h
@@ -1100,7 +1100,7 @@ enum ice_rl_type {
 #define ICE_SCHED_NO_SHARED_RL_PROF_ID	0xFFFF
 #define ICE_SCHED_DFLT_BW_WT		4
 #define ICE_SCHED_INVAL_PROF_ID		0xFFFF
-#define ICE_SCHED_DFLT_BURST_SIZE	(15 * 1024)	/* in bytes (15k) */
+#define ICE_SCHED_DFLT_BURST_SIZE	(2 * 1024)	/* in bytes (2k) */

 /* Access Macros for Tx Sched RL Profile data */
 #define ICE_TXSCHED_GET_RL_PROF_ID(p) LE16_TO_CPU((p)->info.profile_id)
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.

^ permalink raw reply related

* [PATCH 4/7] net/ice: timestamp all received packets when PTP is enabled
From: Dawid Wesierski @ 2026-06-08 16:40 UTC (permalink / raw)
  To: dev
  Cc: thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Marek Kasiewicz,
	Dawid Wesierski
In-Reply-To: <20260608164059.65420-1-dawid.wesierski@intel.com>

From: Marek Kasiewicz <marek.kasiewicz@intel.com>

When PTP is enabled on the ICE PMD, hardware RX timestamps are only
applied to packets classified as IEEE 1588 (Ethertype 0x88F7). This
prevents applications from obtaining hardware timestamps on regular
UDP/IP traffic.

Remove the TIMESYNC packet type filter so that all received packets
get hardware timestamps when PTP is enabled. This is required for
time-sensitive networking applications that need per-packet arrival
timing on media traffic, such as ST 2110-21 receiver compliance
monitoring.

The change affects all three RX paths: scan, scattered, and single
packet receive functions.

Signed-off-by: Marek Kasiewicz <marek.kasiewicz@intel.com>
Signed-off-by: Dawid Wesierski <dawid.wesierski@intel.com>
---
 drivers/net/intel/ice/ice_rxtx.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/intel/ice/ice_rxtx.c b/drivers/net/intel/ice/ice_rxtx.c
index c4b5454c53..8d709125f7 100644
--- a/drivers/net/intel/ice/ice_rxtx.c
+++ b/drivers/net/intel/ice/ice_rxtx.c
@@ -2023,8 +2023,7 @@ ice_rx_scan_hw_ring(struct ci_rx_queue *rxq)
 				pkt_flags |= rxq->ts_flag;
 			}

-			if (ad->ptp_ena && ((mb->packet_type &
-			    RTE_PTYPE_L2_MASK) == RTE_PTYPE_L2_ETHER_TIMESYNC)) {
+			if (ad->ptp_ena) {
 				rxq->time_high =
 				   rte_le_to_cpu_32(rxdp[j].wb.flex_ts.ts_high);
 				mb->timesync = rxq->queue_id;
@@ -2390,8 +2389,7 @@ ice_recv_scattered_pkts(void *rx_queue,
 			pkt_flags |= rxq->ts_flag;
 		}

-		if (ad->ptp_ena && ((first_seg->packet_type & RTE_PTYPE_L2_MASK)
-		    == RTE_PTYPE_L2_ETHER_TIMESYNC)) {
+		if (ad->ptp_ena) {
 			rxq->time_high =
 			   rte_le_to_cpu_32(rxd.wb.flex_ts.ts_high);
 			first_seg->timesync = rxq->queue_id;
@@ -2881,8 +2879,7 @@ ice_recv_pkts(void *rx_queue,
 			pkt_flags |= rxq->ts_flag;
 		}

-		if (ad->ptp_ena && ((rxm->packet_type & RTE_PTYPE_L2_MASK) ==
-		    RTE_PTYPE_L2_ETHER_TIMESYNC)) {
+		if (ad->ptp_ena) {
 			rxq->time_high =
 			   rte_le_to_cpu_32(rxd.wb.flex_ts.ts_high);
 			rxm->timesync = rxq->queue_id;
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.

^ permalink raw reply related

* [PATCH 5/7] net/iavf: disable runtime queue setup capability
From: Dawid Wesierski @ 2026-06-08 16:40 UTC (permalink / raw)
  To: dev
  Cc: thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Marek Kasiewicz,
	Dawid Wesierski
In-Reply-To: <20260608164059.65420-1-dawid.wesierski@intel.com>

From: Marek Kasiewicz <marek.kasiewicz@intel.com>

Remove the advertisement of RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP
and RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP capabilities from the
iavf VF driver.

Runtime queue setup on E810 VFs causes queue state corruption when
queues are dynamically reconfigured while the hardware rate limiter
is actively pacing TX queues. Queue configuration messages to the PF
via virtchnl can race with ongoing TX operations, leading to undefined
behavior.

By not advertising these capabilities, all queues are configured at
port start and remain stable throughout the port lifecycle.

Signed-off-by: Marek Kasiewicz <marek.kasiewicz@intel.com>
Signed-off-by: Dawid Wesierski <dawid.wesierski@intel.com>
---
 drivers/net/intel/iavf/iavf_ethdev.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf_ethdev.c b/drivers/net/intel/iavf/iavf_ethdev.c
index a8031e23a5..4f6325ef78 100644
--- a/drivers/net/intel/iavf/iavf_ethdev.c
+++ b/drivers/net/intel/iavf/iavf_ethdev.c
@@ -1159,9 +1159,6 @@ iavf_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->reta_size = vf->vf_res->rss_lut_size;
 	dev_info->flow_type_rss_offloads = IAVF_RSS_OFFLOAD_ALL;
 	dev_info->max_mac_addrs = IAVF_NUM_MACADDR_MAX;
-	dev_info->dev_capa =
-		RTE_ETH_DEV_CAPA_RUNTIME_RX_QUEUE_SETUP |
-		RTE_ETH_DEV_CAPA_RUNTIME_TX_QUEUE_SETUP;
 	dev_info->rx_offload_capa =
 		RTE_ETH_RX_OFFLOAD_VLAN_STRIP |
 		RTE_ETH_RX_OFFLOAD_QINQ_STRIP |
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.

^ permalink raw reply related

* [PATCH 6/7] pcapng: add user-supplied timestamp support
From: Dawid Wesierski @ 2026-06-08 16:40 UTC (permalink / raw)
  To: dev
  Cc: thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Marek Kasiewicz,
	Dawid Wesierski
In-Reply-To: <20260608164059.65420-1-dawid.wesierski@intel.com>

From: Marek Kasiewicz <marek.kasiewicz@intel.com>

Add rte_pcapng_copy_ts() which accepts an optional timestamp parameter
in nanoseconds. When the timestamp is non-zero, it is used directly
instead of reading the TSC. This allows applications to provide
hardware PTP timestamps from the NIC, enabling accurate packet capture
with PTP-domain timing rather than host-local TSC values.

The existing rte_pcapng_copy() function is preserved as a static inline
wrapper that passes zero for backward compatibility.

The TSC-to-epoch conversion in the write path is removed since callers
providing hardware timestamps have already performed the conversion.


Signed-off-by: Marek Kasiewicz <marek.kasiewicz@intel.com>
Signed-off-by: Dawid Wesierski <dawid.wesierski@intel.com>
---
 lib/pcapng/rte_pcapng.c | 19 ++++---------------
 lib/pcapng/rte_pcapng.h | 41 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
index b5d1026891..96b3aafeb6 100644
--- a/lib/pcapng/rte_pcapng.c
+++ b/lib/pcapng/rte_pcapng.c
@@ -546,14 +546,14 @@ pcapng_vlan_insert(struct rte_mbuf *m, uint16_t ether_type, uint16_t tci)
  */
 
 /* Make a copy of original mbuf with pcapng header and options */
-RTE_EXPORT_SYMBOL(rte_pcapng_copy)
+RTE_EXPORT_SYMBOL(rte_pcapng_copy_ts)
 struct rte_mbuf *
-rte_pcapng_copy(uint16_t port_id, uint32_t queue,
+rte_pcapng_copy_ts(uint16_t port_id, uint32_t queue,
 		const struct rte_mbuf *md,
 		struct rte_mempool *mp,
 		uint32_t length,
 		enum rte_pcapng_direction direction,
-		const char *comment)
+		const char *comment, uint64_t ts)
 {
 	struct pcapng_enhance_packet_block *epb;
 	uint32_t orig_len, pkt_len, padding, flags;
@@ -691,7 +691,7 @@ rte_pcapng_copy(uint16_t port_id, uint32_t queue,
 	mc->port = port_id;
 
 	/* Put timestamp in cycles here - adjust in packet write */
-	timestamp = rte_get_tsc_cycles();
+	timestamp = ts ? ts : rte_get_tsc_cycles();
 	epb->timestamp_hi = timestamp >> 32;
 	epb->timestamp_lo = (uint32_t)timestamp;
 	epb->capture_length = pkt_len;
@@ -720,7 +720,6 @@ rte_pcapng_write_packets(rte_pcapng_t *self,
 	for (i = 0; i < nb_pkts; i++) {
 		struct rte_mbuf *m = pkts[i];
 		struct pcapng_enhance_packet_block *epb;
-		uint64_t cycles, timestamp;
 
 		/* sanity check that is really a pcapng mbuf */
 		epb = rte_pktmbuf_mtod(m, struct pcapng_enhance_packet_block *);
@@ -737,16 +736,6 @@ rte_pcapng_write_packets(rte_pcapng_t *self,
 			return -1;
 		}
 
-		/*
-		 * When data is captured by pcapng_copy the current TSC is stored.
-		 * Adjust the value recorded in file to PCAP epoch units.
-		 */
-		cycles = (uint64_t)epb->timestamp_hi << 32;
-		cycles += epb->timestamp_lo;
-		timestamp = tsc_to_ns_epoch(&self->clock, cycles);
-		epb->timestamp_hi = timestamp >> 32;
-		epb->timestamp_lo = (uint32_t)timestamp;
-
 		/*
 		 * Handle case of highly fragmented and large burst size
 		 * Note: this assumes that max segments per mbuf < IOV_MAX
diff --git a/lib/pcapng/rte_pcapng.h b/lib/pcapng/rte_pcapng.h
index d8d328f710..3d735e4ebe 100644
--- a/lib/pcapng/rte_pcapng.h
+++ b/lib/pcapng/rte_pcapng.h
@@ -109,7 +109,7 @@ enum rte_pcapng_direction {
 };
 
 /**
- * Format an mbuf for writing to file.
+ * Format an mbuf, with an optional timestamp, for writing to file.
  *
  * @param port_id
  *   The Ethernet port on which packet was received
@@ -129,16 +129,53 @@ enum rte_pcapng_direction {
  * @param comment
  *   Optional per packet comment.
  *   Truncated to UINT16_MAX characters.
+ * @param ts
+ *   Optional timestamp in nanoseconds. If zero, the current TSC is used.
  *
  * @return
  *   - The pointer to the new mbuf formatted for pcapng_write
  *   - NULL on error such as invalid port or out of memory.
  */
 struct rte_mbuf *
+rte_pcapng_copy_ts(uint16_t port_id, uint32_t queue,
+		const struct rte_mbuf *m, struct rte_mempool *mp,
+		uint32_t length,
+		enum rte_pcapng_direction direction, const char *comment, uint64_t ts);
+
+/**
+ * Format an mbuf for writing to file.
+ *
+ * @param port_id
+ *   The Ethernet port on which packet was received
+ *   or is going to be transmitted.
+ * @param queue
+ *   The queue on the Ethernet port where packet was received
+ *   or is going to be transmitted.
+ * @param mp
+ *   The mempool from which the "clone" mbufs are allocated.
+ * @param m
+ *   The mbuf to copy
+ * @param length
+ *   The upper limit on bytes to copy.  Passing UINT32_MAX
+ *   means all data (after offset).
+ * @param direction
+ *   The direction of the packet: receive, transmit or unknown.
+ * @param comment
+ *   Packet comment.
+ *
+ * @return
+ *   - The pointer to the new mbuf formatted for pcapng_write
+ *   - NULL if allocation fails.
+ */
+static inline struct rte_mbuf *
 rte_pcapng_copy(uint16_t port_id, uint32_t queue,
 		const struct rte_mbuf *m, struct rte_mempool *mp,
 		uint32_t length,
-		enum rte_pcapng_direction direction, const char *comment);
+		enum rte_pcapng_direction direction, const char *comment)
+{
+	return rte_pcapng_copy_ts(port_id, queue, m, mp, length, direction,
+				  comment, 0);
+}
 
 
 /**
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.


^ permalink raw reply related

* [PATCH 7/7] net/ice: add header split mbuf callback support
From: Dawid Wesierski @ 2026-06-08 16:40 UTC (permalink / raw)
  To: dev
  Cc: thomas, david.marchand, vladimir.medvedkin, bruce.richardson,
	anatoly.burakov, reshma.pattan, stephen, Marek Kasiewicz,
	Dawid Wesierski
In-Reply-To: <20260608164059.65420-1-dawid.wesierski@intel.com>

From: Marek Kasiewicz <marek.kasiewicz@intel.com>

Add an ethdev API rte_eth_hdrs_set_mbuf_callback() that allows
applications to register a callback providing custom payload buffers
for header split RX mode. When registered, the ICE PMD calls this
callback at mbuf allocation points and, on success, points the payload
mbuf's buf_addr and buf_iova at the user-provided buffer instead of
the mbuf's own mempool data buffer.

This enables zero-copy RX for header split: the NIC DMAs the payload
directly into application-managed buffers (e.g., mapped frame buffers
with known IOVA), bypassing an extra memcpy from the mempool mbuf.

The callback is invoked at three allocation points in the ICE driver:
initial queue setup, bulk buffer allocation, and single-packet
receive path.


Signed-off-by: Marek Kasiewicz <marek.kasiewicz@intel.com>
Signed-off-by: Dawid Wesierski <dawid.wesierski@intel.com>
---
 drivers/net/intel/common/rx.h      |  2 +
 drivers/net/intel/ice/ice_ethdev.c |  1 +
 drivers/net/intel/ice/ice_rxtx.c   | 63 ++++++++++++++++++++++++++++++
 drivers/net/intel/ice/ice_rxtx.h   |  2 +
 lib/ethdev/ethdev_driver.h         | 10 +++++
 lib/ethdev/rte_ethdev.c            | 17 ++++++++
 lib/ethdev/rte_ethdev.h            | 46 ++++++++++++++++++++++
 7 files changed, 141 insertions(+)

diff --git a/drivers/net/intel/common/rx.h b/drivers/net/intel/common/rx.h
index e0bf520ebd..8abb2a3ce9 100644
--- a/drivers/net/intel/common/rx.h
+++ b/drivers/net/intel/common/rx.h
@@ -113,6 +113,8 @@ struct ci_rx_queue {
 			uint32_t hw_time_low; /* low 32 bits of timestamp */
 			int ts_offset; /* dynamic mbuf timestamp field offset */
 			uint64_t ts_flag; /* dynamic mbuf timestamp flag */
+			rte_eth_hdrs_mbuf_callback_fn hdrs_mbuf_cb; /* hdr split mbuf cb */
+			void *hdrs_mbuf_cb_priv; /* hdr split mbuf cb priv */
 		};
 		struct { /* iavf specific values */
 			const struct iavf_rxq_ops *ops; /**< queue ops */
diff --git a/drivers/net/intel/ice/ice_ethdev.c b/drivers/net/intel/ice/ice_ethdev.c
index b7cea3bfc1..fb15438dbc 100644
--- a/drivers/net/intel/ice/ice_ethdev.c
+++ b/drivers/net/intel/ice/ice_ethdev.c
@@ -282,6 +282,7 @@ static const struct eth_dev_ops ice_eth_dev_ops = {
 	.dev_set_link_down            = ice_dev_set_link_down,
 	.dev_led_on                   = ice_dev_led_on,
 	.dev_led_off                  = ice_dev_led_off,
+	.hdrs_mbuf_set_cb             = ice_hdrs_mbuf_set_cb,
 	.rx_queue_start               = ice_rx_queue_start,
 	.rx_queue_stop                = ice_rx_queue_stop,
 	.tx_queue_start               = ice_tx_queue_start,
diff --git a/drivers/net/intel/ice/ice_rxtx.c b/drivers/net/intel/ice/ice_rxtx.c
index 8d709125f7..867f595291 100644
--- a/drivers/net/intel/ice/ice_rxtx.c
+++ b/drivers/net/intel/ice/ice_rxtx.c
@@ -487,6 +487,17 @@ ice_alloc_rx_queue_mbufs(struct ci_rx_queue *rxq)
 				return -ENOMEM;
 			}
 
+			if (rxq->hdrs_mbuf_cb) {
+				struct rte_eth_hdrs_mbuf hdrs_mbuf = {0};
+				int ret = rxq->hdrs_mbuf_cb(rxq->hdrs_mbuf_cb_priv,
+					&hdrs_mbuf);
+
+				if (ret >= 0) {
+					mbuf_pay->buf_addr = hdrs_mbuf.buf_addr;
+					mbuf_pay->buf_iova = hdrs_mbuf.buf_iova;
+				}
+			}
+
 			mbuf_pay->next = NULL;
 			mbuf_pay->data_off = RTE_PKTMBUF_HEADROOM;
 			mbuf_pay->nb_segs = 1;
@@ -2126,6 +2137,16 @@ ice_rx_alloc_bufs(struct ci_rx_queue *rxq)
 			rxdp[i].read.pkt_addr = dma_addr;
 		} else {
 			mb->next = rxq->sw_split_buf[i].mbuf;
+			if (rxq->hdrs_mbuf_cb && mb->next) {
+				struct rte_eth_hdrs_mbuf hdrs_mbuf = {0};
+				int ret = rxq->hdrs_mbuf_cb(rxq->hdrs_mbuf_cb_priv,
+					&hdrs_mbuf);
+
+				if (ret >= 0) {
+					mb->next->buf_addr = hdrs_mbuf.buf_addr;
+					mb->next->buf_iova = hdrs_mbuf.buf_iova;
+				}
+			}
 			pay_addr = rte_cpu_to_le_64(rte_mbuf_data_iova_default(mb->next));
 			rxdp[i].read.hdr_addr = dma_addr;
 			rxdp[i].read.pkt_addr = pay_addr;
@@ -2810,6 +2831,17 @@ ice_recv_pkts(void *rx_queue,
 				break;
 			}
 
+			if (rxq->hdrs_mbuf_cb) {
+				struct rte_eth_hdrs_mbuf hdrs_mbuf = {0};
+				int ret = rxq->hdrs_mbuf_cb(rxq->hdrs_mbuf_cb_priv,
+					&hdrs_mbuf);
+
+				if (ret >= 0) {
+					nmb_pay->buf_addr = hdrs_mbuf.buf_addr;
+					nmb_pay->buf_iova = hdrs_mbuf.buf_iova;
+				}
+			}
+
 			nmb->next = nmb_pay;
 			nmb_pay->next = NULL;
 
@@ -4533,3 +4565,34 @@ ice_fdir_programming(struct ice_pf *pf, struct ice_fltr_desc *fdir_desc)
 
 
 }
+
+int
+ice_hdrs_mbuf_set_cb(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+	void *priv, rte_eth_hdrs_mbuf_callback_fn cb)
+{
+	struct ci_rx_queue *rxq;
+
+	if (rx_queue_id >= dev->data->nb_rx_queues) {
+		PMD_DRV_LOG(ERR, "RX queue %u out of range", rx_queue_id);
+		return -EINVAL;
+	}
+
+	rxq = dev->data->rx_queues[rx_queue_id];
+	if (rxq == NULL) {
+		PMD_DRV_LOG(ERR, "RX queue %u not available or setup", rx_queue_id);
+		return -EINVAL;
+	}
+
+	if (rxq->hdrs_mbuf_cb) {
+		PMD_DRV_LOG(ERR, "RX queue %u has hdrs mbuf cb already",
+			rx_queue_id);
+		return -EEXIST;
+	}
+
+	rxq->hdrs_mbuf_cb_priv = priv;
+	rxq->hdrs_mbuf_cb = cb;
+	PMD_DRV_LOG(NOTICE, "RX queue %u register hdrs mbuf cb at %p",
+		rx_queue_id, cb);
+
+	return 0;
+}
diff --git a/drivers/net/intel/ice/ice_rxtx.h b/drivers/net/intel/ice/ice_rxtx.h
index 999b6b30d6..7ed114ee94 100644
--- a/drivers/net/intel/ice/ice_rxtx.h
+++ b/drivers/net/intel/ice/ice_rxtx.h
@@ -303,6 +303,8 @@ uint16_t ice_xmit_pkts_vec_avx512_offload(void *tx_queue,
 int ice_fdir_programming(struct ice_pf *pf, struct ice_fltr_desc *fdir_desc);
 int ice_tx_done_cleanup(void *txq, uint32_t free_cnt);
 int ice_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc);
+int ice_hdrs_mbuf_set_cb(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+		void *priv, rte_eth_hdrs_mbuf_callback_fn cb);
 enum rte_vect_max_simd ice_get_max_simd_bitwidth(void);
 
 #define FDIR_PARSING_ENABLE_PER_QUEUE(ad, on) do { \
diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h
index 0f336f9567..b48681268c 100644
--- a/lib/ethdev/ethdev_driver.h
+++ b/lib/ethdev/ethdev_driver.h
@@ -1292,6 +1292,13 @@ typedef int (*eth_cman_config_set_t)(struct rte_eth_dev *dev,
 typedef int (*eth_cman_config_get_t)(struct rte_eth_dev *dev,
 				struct rte_eth_cman_config *config);
 
+/** @internal
+ * Set header split payload mbuf callback for a receive queue.
+ */
+typedef int (*eth_hdrs_mbuf_set_cb_t)(struct rte_eth_dev *dev,
+	uint16_t rx_queue_id, void *priv,
+	rte_eth_hdrs_mbuf_callback_fn cb);
+
 /**
  * @internal
  * Dump Rx descriptor info to a file.
@@ -1652,6 +1659,9 @@ struct eth_dev_ops {
 	/** Dump Tx descriptor info */
 	eth_tx_descriptor_dump_t eth_tx_descriptor_dump;
 
+	/** Set header split mbuf callback */
+	eth_hdrs_mbuf_set_cb_t hdrs_mbuf_set_cb;
+
 	/** Get congestion management information */
 	eth_cman_info_get_t cman_info_get;
 	/** Initialize congestion management structure with default values */
diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index 9efeaf77cb..d5820ccd22 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -7316,6 +7316,23 @@ rte_eth_ip_reassembly_conf_set(uint16_t port_id,
 	return ret;
 }
 
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_eth_hdrs_set_mbuf_callback, 26.07)
+int
+rte_eth_hdrs_set_mbuf_callback(uint16_t port_id, uint16_t rx_queue_id,
+	void *priv, rte_eth_hdrs_mbuf_callback_fn cb)
+{
+	struct rte_eth_dev *dev;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
+	dev = &rte_eth_devices[port_id];
+
+	if (dev->dev_ops->hdrs_mbuf_set_cb == NULL)
+		return -ENOTSUP;
+
+	return eth_err(port_id,
+		dev->dev_ops->hdrs_mbuf_set_cb(dev, rx_queue_id, priv, cb));
+}
+
 RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_eth_dev_priv_dump, 22.03)
 int
 rte_eth_dev_priv_dump(uint16_t port_id, FILE *file)
diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
index ee400b386f..dbf2c23a35 100644
--- a/lib/ethdev/rte_ethdev.h
+++ b/lib/ethdev/rte_ethdev.h
@@ -6985,6 +6985,52 @@ rte_eth_tx_buffer(uint16_t port_id, uint16_t queue_id,
 	return rte_eth_tx_buffer_flush(port_id, queue_id, buffer);
 }
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Buffer descriptor for header split payload mbuf callback.
+ */
+struct rte_eth_hdrs_mbuf {
+	void *buf_addr;       /**< Virtual address of payload buffer. */
+	rte_iova_t buf_iova;  /**< IOVA of payload buffer. */
+};
+
+/**
+ * Callback function type for providing custom payload mbufs
+ * in header split mode.
+ *
+ * @param priv
+ *   User-provided private context.
+ * @param mbuf
+ *   Pointer to buffer descriptor to be filled by the callback.
+ * @return
+ *   0 on success, negative errno on failure.
+ */
+typedef int (*rte_eth_hdrs_mbuf_callback_fn)(void *priv,
+	struct rte_eth_hdrs_mbuf *mbuf);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Register a callback to provide custom payload mbufs for header split RX.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param rx_queue_id
+ *   The index of the receive queue.
+ * @param priv
+ *   User-provided private context passed to the callback.
+ * @param cb
+ *   Callback function that provides payload buffer descriptors.
+ * @return
+ *   0 on success, negative errno on failure.
+ */
+__rte_experimental
+int rte_eth_hdrs_set_mbuf_callback(uint16_t port_id, uint16_t rx_queue_id,
+		void *priv, rte_eth_hdrs_mbuf_callback_fn cb);
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.


^ permalink raw reply related

* Re: [PATCH v16 4/5] vhost: add mem region add/remove handlers
From: Maxime Coquelin @ 2026-06-08 16:16 UTC (permalink / raw)
  To: pravin.bathija; +Cc: dev, stephen, fengchengwen, thomas
In-Reply-To: <20260606025211.1082615-5-pravin.bathija@dell.com>

On Sat, Jun 6, 2026 at 4:52 AM <pravin.bathija@dell.com> wrote:
>
> From: Pravin M Bathija <pravin.bathija@dell.com>
>
> Add support for VHOST_USER_ADD_MEM_REG, VHOST_USER_REM_MEM_REG and
> VHOST_USER_GET_MAX_MEM_SLOTS. Refactor memory initialization into
> common helper and add supporting functions for dynamic memory management.
>
> Signed-off-by: Pravin M Bathija <pravin.bathija@dell.com>
> ---
>  lib/vhost/vhost_user.c | 266 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 266 insertions(+)
>
> diff --git a/lib/vhost/vhost_user.c b/lib/vhost/vhost_user.c
> index 94fca8b589..020c993b29 100644
> --- a/lib/vhost/vhost_user.c
> +++ b/lib/vhost/vhost_user.c
> @@ -71,6 +71,9 @@ VHOST_MESSAGE_HANDLER(VHOST_USER_SET_FEATURES, vhost_user_set_features, false, t
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_OWNER, vhost_user_set_owner, false, true) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_RESET_OWNER, vhost_user_reset_owner, false, false) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_MEM_TABLE, vhost_user_set_mem_table, true, true) \
> +VHOST_MESSAGE_HANDLER(VHOST_USER_GET_MAX_MEM_SLOTS, vhost_user_get_max_mem_slots, false, false) \
> +VHOST_MESSAGE_HANDLER(VHOST_USER_ADD_MEM_REG, vhost_user_add_mem_reg, true, true) \
> +VHOST_MESSAGE_HANDLER(VHOST_USER_REM_MEM_REG, vhost_user_rem_mem_reg, false, true) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_BASE, vhost_user_set_log_base, true, true) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_FD, vhost_user_set_log_fd, true, true) \
>  VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_NUM, vhost_user_set_vring_num, false, true) \
> @@ -1167,6 +1170,24 @@ add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
>         return 0;
>  }
>
> +static void
> +remove_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg)
> +{
> +       uint64_t reg_start = reg->host_user_addr;
> +       uint64_t reg_end = reg_start + reg->size;
> +       uint32_t i, j = 0;
> +
> +       for (i = 0; i < dev->nr_guest_pages; i++) {
> +               if (dev->guest_pages[i].host_user_addr >= reg_start &&
> +                   dev->guest_pages[i].host_user_addr < reg_end)
> +                       continue;
> +               if (j != i)
> +                       dev->guest_pages[j] = dev->guest_pages[i];
> +               j++;
> +       }
> +       dev->nr_guest_pages = j;
> +}
> +
>  #ifdef RTE_LIBRTE_VHOST_DEBUG
>  /* TODO: enable it only in debug mode? */
>  static void
> @@ -1591,6 +1612,251 @@ vhost_user_set_mem_table(struct virtio_net **pdev,
>         return RTE_VHOST_MSG_RESULT_ERR;
>  }
>
> +
> +static int
> +vhost_user_get_max_mem_slots(struct virtio_net **pdev __rte_unused,
> +                       struct vhu_msg_context *ctx,
> +                       int main_fd __rte_unused)
> +{
> +       uint32_t max_mem_slots = VHOST_MEMORY_MAX_NREGIONS;
> +
> +       ctx->msg.payload.u64 = max_mem_slots;
> +       ctx->msg.size = sizeof(ctx->msg.payload.u64);
> +       ctx->fd_num = 0;
> +
> +       return RTE_VHOST_MSG_RESULT_REPLY;
> +}
> +
> +/*
> + * Invalidate and re-translate all vring addresses after the memory table
> + * has been modified (add/remove region).
> + *
> + * translate_ring_addresses() may call numa_realloc(), which can reallocate
> + * the device structure.  The updated pointer is written back through *pdev
> + * so callers must refresh their local "dev" afterwards: dev = *pdev.
> + */
> +static void
> +vhost_user_invalidate_vrings(struct virtio_net **pdev)
> +{
> +       struct virtio_net *dev = *pdev;
> +       uint32_t i;
> +
> +       for (i = 0; i < dev->nr_vring; i++) {
> +               struct vhost_virtqueue *vq = dev->virtqueue[i];
> +
> +               if (!vq)
> +                       continue;
> +
> +               if (vq->desc || vq->avail || vq->used) {
> +                       vq_assert_lock(dev, vq);
> +
> +                       vring_invalidate(dev, vq);
> +
> +                       translate_ring_addresses(&dev, &vq);
> +               }
> +       }
> +
> +       *pdev = dev;
> +}
> +
> +/*
> + * Macro wrapper that performs the compile-time lock assertion with the
> + * correct message ID at the call site, then calls the implementation.
> + */
> +#define dev_invalidate_vrings(pdev, id) do { \
> +       static_assert(id ## _LOCK_ALL_QPS, \
> +               #id " handler is not declared as locking all queue pairs"); \
> +       vhost_user_invalidate_vrings(pdev); \
> +} while (0)
> +
> +static int
> +vhost_user_add_mem_reg(struct virtio_net **pdev,
> +                       struct vhu_msg_context *ctx,
> +                       int main_fd __rte_unused)
> +{
> +       struct VhostUserMemoryRegion *region = &ctx->msg.payload.memreg.region;
> +       struct virtio_net *dev = *pdev;
> +       uint32_t i;
> +
> +       /* convert first region add to normal memory table set */
> +       if (dev->mem == NULL) {
> +               if (vhost_user_initialize_memory(pdev) < 0)
> +                       goto close_msg_fds;
> +       }
> +
> +       /* make sure new region will fit */
> +       if (dev->mem->nregions >= VHOST_MEMORY_MAX_NREGIONS) {
> +               VHOST_CONFIG_LOG(dev->ifname, ERR, "too many memory regions already (%u)",
> +                                                                       dev->mem->nregions);
> +               goto close_msg_fds;
> +       }
> +
> +       /* make sure supplied memory fd present */
> +       if (ctx->fd_num != 1) {
> +               VHOST_CONFIG_LOG(dev->ifname, ERR, "fd count makes no sense (%u)", ctx->fd_num);
> +               goto close_msg_fds;
> +       }
> +
> +       /* Make sure no overlap in guest virtual address space */
> +       for (i = 0; i < dev->mem->nregions; i++) {
> +               struct rte_vhost_mem_region *cur = &dev->mem->regions[i];
> +               uint64_t cur_start = cur->guest_user_addr;
> +               uint64_t cur_end = cur_start + cur->size - 1;
> +               uint64_t new_start = region->userspace_addr;
> +               uint64_t new_end = new_start + region->memory_size - 1;
> +
> +               if (new_end >= cur_start && new_start <= cur_end) {
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "requested memory region overlaps with another region");
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "\tRequested region address:0x%" PRIx64,
> +                               region->userspace_addr);
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "\tRequested region size:0x%" PRIx64,
> +                               region->memory_size);
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "\tOverlapping region address:0x%" PRIx64,
> +                               cur->guest_user_addr);
> +                       VHOST_CONFIG_LOG(dev->ifname, ERR,
> +                               "\tOverlapping region size:0x%" PRIx64,
> +                               cur->size);
> +                       goto close_msg_fds;
> +               }
> +       }
> +
> +       /* New region goes at the end of the contiguous array */
> +       struct rte_vhost_mem_region *reg = &dev->mem->regions[dev->mem->nregions];
> +
> +       reg->guest_phys_addr = region->guest_phys_addr;
> +       reg->guest_user_addr = region->userspace_addr;
> +       reg->size            = region->memory_size;
> +       reg->fd              = ctx->fds[0];
> +       ctx->fds[0]          = -1;
> +
> +       if (vhost_user_mmap_region(dev, reg, region->mmap_offset) < 0) {
> +               VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap region");
> +               if (reg->mmap_addr) {
> +                       /* mmap succeeded but a later step (e.g. add_guest_pages)
> +                        * failed; undo the mapping and any guest-page entries.
> +                        */
> +                       remove_guest_pages(dev, reg);
> +                       free_mem_region(reg);
> +               } else {
> +                       close(reg->fd);
> +                       reg->fd = -1;
> +               }
> +               goto close_msg_fds;
> +       }
> +
> +       dev->mem->nregions++;
> +
> +       if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
> +               if (async_dma_map_region(dev, reg, true) < 0)
> +                       goto free_new_region_no_dma;
> +       }
> +
> +       if (dev->postcopy_listening) {
> +               /*
> +                * Cannot use vhost_user_postcopy_register() here because it
> +                * reads ctx->msg.payload.memory (SET_MEM_TABLE layout), but
> +                * ADD_MEM_REG uses the memreg payload.  Register the
> +                * single new region directly instead.
> +                */
> +               if (vhost_user_postcopy_region_register(dev, reg) < 0)
> +                       goto free_new_region;
> +       }
> +
> +       dev_invalidate_vrings(pdev, VHOST_USER_ADD_MEM_REG);
> +       dev = *pdev;
> +       dump_guest_pages(dev);
> +
> +       /*
> +        * In postcopy mode the front-end expects the back-end to reply with
> +        * the base of the mapped region (see VHOST_USER_SET_MEM_TABLE, which
> +        * applies here accordingly).  No reply is expected otherwise.
> +        *
> +        * translate_ring_addresses() above may have reallocated dev->mem via
> +        * numa_realloc(), so re-derive the region pointer from the refreshed
> +        * dev rather than using the now-stale reg.  The new region is the last
> +        * entry in the contiguous array.
> +        */
> +       if (dev->postcopy_listening) {
> +               reg = &dev->mem->regions[dev->mem->nregions - 1];
> +               ctx->msg.payload.memreg.region.userspace_addr = reg->host_user_addr;
> +               ctx->msg.size = sizeof(ctx->msg.payload.memreg);
> +               ctx->fd_num = 0;
> +               return RTE_VHOST_MSG_RESULT_REPLY;
> +       }

Thanks Stephen, good catch by the AI.
I did some digging into the QEMU code, which the AI later confirmed:
if the series was tested, the test passed "by accident" when postcopy
was not enabled, because QEMU would read the padding field of the
payload and treat its value as RTE_VHOST_MSG_RESULT_OK, since it is
zero-initialized...


With this fix:
Reviewed-by: Maxime Coquelin <maxime.coquelin@redhat.com>

> +
> +       return RTE_VHOST_MSG_RESULT_OK;
> +
> +free_new_region:
> +       if (dev->async_copy && rte_vfio_is_enabled("vfio"))
> +               async_dma_map_region(dev, reg, false);
> +free_new_region_no_dma:
> +       remove_guest_pages(dev, reg);
> +       free_mem_region(reg);
> +       dev->mem->nregions--;
> +close_msg_fds:
> +       close_msg_fds(ctx);
> +       return RTE_VHOST_MSG_RESULT_ERR;
> +}
> +
> +static int
> +vhost_user_rem_mem_reg(struct virtio_net **pdev,
> +                       struct vhu_msg_context *ctx,
> +                       int main_fd __rte_unused)
> +{
> +       struct VhostUserMemoryRegion *region = &ctx->msg.payload.memreg.region;
> +       struct virtio_net *dev = *pdev;
> +       uint32_t i;
> +
> +       if (dev->mem == NULL || dev->mem->nregions == 0) {
> +               VHOST_CONFIG_LOG(dev->ifname, ERR, "no memory regions to remove");
> +               return RTE_VHOST_MSG_RESULT_ERR;
> +       }
> +
> +       for (i = 0; i < dev->mem->nregions; i++) {
> +               struct rte_vhost_mem_region *current_region = &dev->mem->regions[i];
> +
> +               /*
> +                * According to the vhost-user specification:
> +                * The memory region to be removed is identified by its GPA,
> +                * user address and size. The mmap offset is ignored.
> +                */
> +               if (region->userspace_addr == current_region->guest_user_addr
> +                       && region->guest_phys_addr == current_region->guest_phys_addr
> +                       && region->memory_size == current_region->size) {
> +                       if (dev->async_copy && rte_vfio_is_enabled("vfio"))
> +                               async_dma_map_region(dev, current_region, false);
> +                       if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> +                               vhost_user_iotlb_cache_remove(dev,
> +                                       current_region->guest_phys_addr,
> +                                       current_region->size);
> +                       remove_guest_pages(dev, current_region);
> +                       free_mem_region(current_region);
> +
> +                       /* Compact the regions array to keep it contiguous */
> +                       if (i < dev->mem->nregions - 1) {
> +                               memmove(&dev->mem->regions[i],
> +                                       &dev->mem->regions[i + 1],
> +                                       (dev->mem->nregions - 1 - i) *
> +                                       sizeof(struct rte_vhost_mem_region));
> +                               memset(&dev->mem->regions[dev->mem->nregions - 1],
> +                                       0, sizeof(struct rte_vhost_mem_region));
> +                       }
> +
> +                       dev->mem->nregions--;
> +                       dev_invalidate_vrings(pdev, VHOST_USER_REM_MEM_REG);
> +                       dev = *pdev;
> +                       return RTE_VHOST_MSG_RESULT_OK;
> +               }
> +       }
> +
> +       VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to find region");
> +       return RTE_VHOST_MSG_RESULT_ERR;
> +}
> +
>  static bool
>  vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  {
> --
> 2.43.0
>


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox