DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] net/af_packet: fix parsing of numeric device args
From: Stephen Hemminger @ 2026-06-03 18:13 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Denis Sergeev, stable, John W. Linville
In-Reply-To: <20260603170812.212262-1-denserg.edu@gmail.com>

This driver has several numeric arguments but it was using
atoi() which allows garbage and negative values.
Convert to a helper using strtoul() with upper bound.

First found by Linux Verification Center (linuxtesting.org) with SVACE.

Reported-by: Denis Sergeev <denserg.edu@gmail.com>
Fixes: 364e08f2bbc0 ("af_packet: add PMD for AF_PACKET-based virtual devices")
Cc: stable@dpdk.org

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/af_packet/rte_eth_af_packet.c | 58 +++++++++++++++++++----
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/drivers/net/af_packet/rte_eth_af_packet.c b/drivers/net/af_packet/rte_eth_af_packet.c
index 8303ff5ca9..b0ff22ea55 100644
--- a/drivers/net/af_packet/rte_eth_af_packet.c
+++ b/drivers/net/af_packet/rte_eth_af_packet.c
@@ -15,7 +15,9 @@
 #include <rte_kvargs.h>
 #include <bus_vdev_driver.h>
 
+#include <ctype.h>
 #include <errno.h>
+#include <limits.h>
 #include <linux/if_ether.h>
 #include <linux/if_packet.h>
 #include <arpa/inet.h>
@@ -1138,6 +1140,42 @@ rte_pmd_init_internals(struct rte_vdev_device *dev,
 	return -1;
 }
 
+/* Parse an unsigned integer device argument. */
+static int
+parse_uint(const char *key, const char *value,
+	   unsigned int *out, unsigned long limit)
+{
+	unsigned long val;
+	char *endptr;
+
+	if (value == NULL) {
+		PMD_LOG(ERR, "no value for argument \"%s\"", key);
+		return -1;
+	}
+
+	/* Skip leading whitespace so a leading sign can be detected. */
+	while (isspace((unsigned char)*value))
+		value++;
+
+	/* strtoul() silently accepts and negates a leading '-'. */
+	if (*value == '\0' || *value == '-') {
+		PMD_LOG(ERR, "invalid value \"%s\" for argument \"%s\"",
+			value, key);
+		return -1;
+	}
+
+	errno = 0;
+	val = strtoul(value, &endptr, 10);
+	if (errno != 0 || *endptr != '\0' || val > limit) {
+		PMD_LOG(ERR, "invalid value \"%s\" for argument \"%s\"",
+			value, key);
+		return -1;
+	}
+
+	*out = (unsigned int)val;
+	return 0;
+}
+
 static int
 rte_eth_from_packet(struct rte_vdev_device *dev,
 		    int const *sockfd,
@@ -1168,7 +1206,9 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 	for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
 		pair = &kvlist->pairs[k_idx];
 		if (strstr(pair->key, ETH_AF_PACKET_NUM_Q_ARG) != NULL) {
-			qpairs = atoi(pair->value);
+			if (parse_uint(pair->key, pair->value,
+				       &qpairs, RTE_MAX_QUEUES_PER_PORT) < 0)
+				return -1;
 			if (qpairs < 1) {
 				PMD_LOG(ERR,
 					"%s: invalid qpairs value",
@@ -1178,7 +1218,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 			continue;
 		}
 		if (strstr(pair->key, ETH_AF_PACKET_BLOCKSIZE_ARG) != NULL) {
-			blocksize = atoi(pair->value);
+			if (parse_uint(pair->key, pair->value, &blocksize, UINT_MAX) < 0)
+				return -1;
 			if (!blocksize) {
 				PMD_LOG(ERR,
 					"%s: invalid blocksize value",
@@ -1188,7 +1229,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 			continue;
 		}
 		if (strstr(pair->key, ETH_AF_PACKET_FRAMESIZE_ARG) != NULL) {
-			framesize = atoi(pair->value);
+			if (parse_uint(pair->key, pair->value, &framesize, UINT_MAX) < 0)
+				return -1;
 			if (!framesize) {
 				PMD_LOG(ERR,
 					"%s: invalid framesize value",
@@ -1198,7 +1240,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 			continue;
 		}
 		if (strstr(pair->key, ETH_AF_PACKET_FRAMECOUNT_ARG) != NULL) {
-			framecount = atoi(pair->value);
+			if (parse_uint(pair->key, pair->value, &framecount, UINT_MAX) < 0)
+				return -1;
 			if (!framecount) {
 				PMD_LOG(ERR,
 					"%s: invalid framecount value",
@@ -1208,13 +1251,8 @@ rte_eth_from_packet(struct rte_vdev_device *dev,
 			continue;
 		}
 		if (strstr(pair->key, ETH_AF_PACKET_QDISC_BYPASS_ARG) != NULL) {
-			qdisc_bypass = atoi(pair->value);
-			if (qdisc_bypass > 1) {
-				PMD_LOG(ERR,
-					"%s: invalid bypass value",
-					name);
+			if (parse_uint(pair->key, pair->value, &qdisc_bypass, 1) < 0)
 				return -1;
-			}
 			continue;
 		}
 		if (strstr(pair->key, ETH_AF_PACKET_FANOUT_MODE_ARG) != NULL) {
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v8 20/20] net/sxe2: update sxe2 feature matrix docs
From: Stephen Hemminger @ 2026-06-03 18:19 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260603062945.1253672-21-liujie5@linkdatatechnology.com>

On Wed,  3 Jun 2026 14:29:45 +0800
liujie5@linkdatatechnology.com wrote:

> From: Jie Liu <liujie5@linkdatatechnology.com>
> 
> Update the sxe2.ini feature sheet to accurately reflect the recently
> implemented hardware capabilities in the sxe2 PMD.
> 
> Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>
> ---
The flow support documentation does not match code.

$ ./devtools/check-doc-vs-code.sh 
rte_flow doc out of sync for sxe2
	item ah
	item any
	item ecpri
	item esp
	item icmp
	item mpls
	item pfcp
	item raw

^ permalink raw reply

* Re: [PATCH v8 11/20] drivers: add support for VF representors
From: Stephen Hemminger @ 2026-06-03 18:22 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260603062945.1253672-12-liujie5@linkdatatechnology.com>

On Wed,  3 Jun 2026 14:29:36 +0800
liujie5@linkdatatechnology.com wrote:

> From: Jie Liu <liujie5@linkdatatechnology.com>
> 
> Add support for VF representors in sxe2 PMD. This allows the host
> application (e.g., OVS-DPDK) to control and monitor virtual functions
> through a dedicated ethdev on the PF (Physical Function) side.
> 
> Key changes include:
> - Added representor enumeration and identification logic.
> - Implemented representor-specific dev_ops (link update, stats, etc.).
> - Configured back-channel communication between PF and VF for control
>   messages.
> - Supported the "-a <DBDF>,representor=[0-N]" EAL parameter to
>   instantiate representor ports.
> 
> Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>
> ---
### [PATCH] drivers: add support for VF representors

WARNING: [TYPO_SPELLING] 'macth' may be misspelled - perhaps 'match'?
#  drivers/net/sxe2/sxe2_flow_parse_pattern.c:1786:
+  					"Invalid pattern spec miss macth mask for rule.");

WARNING: [TYPO_SPELLING] 'macth' may be misspelled - perhaps 'match'?
#  drivers/net/sxe2/sxe2_flow_parse_pattern.c:1787:
+  			PMD_LOG_ERR(DRV, "Invalid pattern spec miss macth mask for rule.");

WARNING: [TYPO_SPELLING] 'macth' may be misspelled - perhaps 'match'?
#  drivers/net/sxe2/sxe2_switchdev.c:197:
+  		PMD_DEV_LOG_INFO(adapter, DRV, "current switchdev mode miss macth");

total: 0 errors, 3 warnings, 0 checks, 6571 lines checked

^ permalink raw reply

* Re: [PATCH v8 07/20] net/sxe2: support IPsec inline protocol offload
From: Stephen Hemminger @ 2026-06-03 18:17 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260603062945.1253672-8-liujie5@linkdatatechnology.com>

On Wed,  3 Jun 2026 14:29:32 +0800
liujie5@linkdatatechnology.com wrote:

> From: Jie Liu <liujie5@linkdatatechnology.com>
> 
> This patch adds support for IPsec inline protocol offload for both
> inbound and outbound traffic.
> 
> - Implement rte_security_ops: session_create, session_destroy.
> - Add hardware SA table management.
> - Update Rx/Tx data path to handle security offload flags.
> 
> The hardware offloads the ESP encapsulation/decapsulation and
> cryptographic processing.
> 
> Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>

Git complains about blank line:
Applying: net/sxe2: support IPsec inline protocol offload
/home/shemminger/DPDK/main/.git/worktrees/sxe2/rebase-apply/patch:236: new blank line at EOF.
+
warning: 1 line adds whitespace errors.

^ permalink raw reply

* Re: [PATCH v8 04/20] net/sxe2: support L2 filtering and MAC config
From: Stephen Hemminger @ 2026-06-03 18:21 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260603062945.1253672-5-liujie5@linkdatatechnology.com>

On Wed,  3 Jun 2026 14:29:29 +0800
liujie5@linkdatatechnology.com wrote:

> From: Jie Liu <liujie5@linkdatatechnology.com>
> 
> - Support primary/secondary MAC address setup.
> - Enable L2 broadcast/multicast filter bits.
> - Add multicast address update logic.
> 
> Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>
> ---

As part of "inclusive naming" DPDK does not allow use of variables name master or slave.
### [PATCH] net/sxe2: support L2 filtering and MAC config

WARNING:TYPO_SPELLING: 'master' may be misspelled - perhaps 'primary'?
#264: FILE: drivers/net/sxe2/sxe2_drv_cmd.h:238:
+	uint8_t master;
 	        ^^^^^^


^ permalink raw reply

* Re: [PATCH v8 19/20] drivers: add testpmd commands for private features
From: Stephen Hemminger @ 2026-06-03 18:23 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260603062945.1253672-20-liujie5@linkdatatechnology.com>

On Wed,  3 Jun 2026 14:29:44 +0800
liujie5@linkdatatechnology.com wrote:

> From: Jie Liu <liujie5@linkdatatechnology.com>
> 
> Introduce private testpmd commands and implementation files to enable
> debugging and testing of sxe2-specific hardware features (such as
> packet scheduling reset, UDP tunnel configuration, and IPsec ingress/
> egress offloads) directly within the testpmd application.
> 
> The parameters are parsed using the standard 'rte_kvargs' library during
> the PCI/vdev probing phase. Documentation for these parameters is also
> updated.
> 
> During memory hotplug events, the SXE2 driver needs to track memory
> segment layout changes to maintain internal DMA mappings. However,
> existing memseg walk functions (rte_memseg_walk) acquire memory locks
> and cannot be called from within memory event callbacks, leading to
> potential deadlocks.
> 
> This commit introduces sxe2_memseg_walk_cb() as a helper that walks
> memory segments using the thread-unsafe variant
> rte_memseg_walk_thread_unsafe(), which is safe to call from
> memory-related callbacks [citation:1][citation:3][citation:5].
> 
> The implementation follows the standard rte_memseg_walk_t prototype,
> processing each memseg to update driver-specific data structures.
> 
> Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>
> ---

WARNING: [TYPO_SPELLING] 'thw' may be misspelled - perhaps 'the'?
#  drivers/net/sxe2/sxe2_testpmd_lib.c:90:
+  			cmdline_printf(cl, "\thw flow id: %d\n", hw_flow->flow_id);

total: 0 errors, 1 warnings, 0 checks, 2371 lines checked

18/20 valid patches

^ permalink raw reply

* [PATCH v2] net/bnxt: fix RSS hash mode configuration for VFs
From: Mohammad Shuab Siddique @ 2026-06-03 18:36 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Mohammad Shuab Siddique
In-Reply-To: <20260603175137.1990204-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>

Fixed VFs attempting global RSS configuration which is not
permitted by firmware. VFs (including trusted VFs) must use
per-VNIC RSS configuration with actual vnic_id and rss_ctx_idx
values.

Fixes: 8b9adaf0da6b ("net/bnxt: support RSS on ESP and AH headers")
Cc: stable@dpdk.org
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt_hwrm.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c
index 0c82935de9..afc948ac29 100644
--- a/drivers/net/bnxt/bnxt_hwrm.c
+++ b/drivers/net/bnxt/bnxt_hwrm.c
@@ -2970,8 +2970,22 @@ bnxt_hwrm_vnic_rss_cfg_hash_mode_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic
 		req.hash_mode_flags = BNXT_HASH_MODE_INNERMOST;
 	else
 		req.hash_mode_flags = vnic->hash_mode;
-	req.vnic_id = rte_cpu_to_le_16(BNXT_DFLT_VNIC_ID_INVALID);
-	req.rss_ctx_idx = rte_cpu_to_le_16(BNXT_RSS_CTX_IDX_INVALID);
+
+	/* VFs must use actual vnic_id for per-VNIC configuration.
+	 * PFs can use INVALID vnic_id for global configuration.
+	 * This is because VFs don't have permission to configure
+	 * global hash mode, even if they're trusted.
+	 */
+	if (BNXT_VF(bp)) {
+		req.vnic_id = rte_cpu_to_le_16(vnic->fw_vnic_id);
+		req.rss_ctx_idx = rte_cpu_to_le_16(vnic->fw_grp_ids[0]);
+		PMD_DRV_LOG_LINE(DEBUG, "VF using per-VNIC RSS config (vnic_id=%u)",
+				 vnic->fw_vnic_id);
+	} else {
+		req.vnic_id = rte_cpu_to_le_16(BNXT_DFLT_VNIC_ID_INVALID);
+		req.rss_ctx_idx = rte_cpu_to_le_16(BNXT_RSS_CTX_IDX_INVALID);
+		PMD_DRV_LOG_LINE(DEBUG, "PF using global RSS config");
+	}
 
 	PMD_DRV_LOG_LINE(DEBUG, "RSS CFG: Hash level %d", req.hash_mode_flags);
 	rc = bnxt_hwrm_send_message(bp, &req, sizeof(req),
-- 
2.47.3


^ permalink raw reply related

* [PATCH v7 00/10] selective Rx
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger
In-Reply-To: <20260202160903.254621-1-getelson@nvidia.com>

This is a new feature in ethdev with tests and mlx5 implementation.
Selective Rx allows to receive partial data,
saving some hardware bandwidth.

Note 1: mlx5 support patch is not correctly indented
to make review easier. An indent patch follows to be squashed.

Note 2: DTS patch is an attempt to test the feature on day 1,
it is not mandatory if it is blocking the merge.

v2: rework after Gregory
v3: fix bugs found with AI by Stephen
v4: fix packet type in DTS test
v5: fix mlx5 Rx to handle discarding first segment
v6: fix reindent patch
v7: fix mlx5 CQE error handling + outdated mcqe + redundant assignment


Gregory Etelson (4):
  ethdev: introduce selective Rx
  app/testpmd: support selective Rx
  common/mlx5: add null MR functions
  net/mlx5: support selective Rx

Thomas Monjalon (6):
  app/testpmd: print Rx split capabilities
  net/mlx5: fix Rx split segment counter type
  net/mlx5: reindent previous changes
  common/mlx5: remove callbacks for MR registration
  dts: fix topology capability comparison
  dts: add selective Rx tests

 app/test-pmd/config.c                         |  17 ++
 app/test-pmd/testpmd.c                        |  71 ++++-
 devtools/libabigail.abignore                  |   7 +
 doc/guides/nics/features.rst                  |  14 +
 doc/guides/nics/features/default.ini          |   1 +
 doc/guides/nics/features/mlx5.ini             |   1 +
 doc/guides/nics/mlx5.rst                      |  86 ++++--
 doc/guides/rel_notes/release_26_07.rst        |  11 +
 doc/guides/testpmd_app_ug/run_app.rst         |  20 ++
 drivers/common/mlx5/linux/mlx5_common_verbs.c |  53 ++--
 drivers/common/mlx5/mlx5_common.c             |   6 +-
 drivers/common/mlx5/mlx5_common_mr.c          |  37 ++-
 drivers/common/mlx5/mlx5_common_mr.h          |  29 +-
 drivers/common/mlx5/windows/mlx5_common_os.c  |  31 ++-
 drivers/compress/mlx5/mlx5_compress.c         |   4 +-
 drivers/crypto/mlx5/mlx5_crypto.h             |   2 -
 drivers/crypto/mlx5/mlx5_crypto_gcm.c         |   6 +-
 drivers/net/mlx5/mlx5.c                       |   7 +
 drivers/net/mlx5/mlx5.h                       |   4 +-
 drivers/net/mlx5/mlx5_ethdev.c                |  25 ++
 drivers/net/mlx5/mlx5_flow_aso.c              |  21 +-
 drivers/net/mlx5/mlx5_flow_hw.c               |  11 +-
 drivers/net/mlx5/mlx5_flow_quota.c            |   6 +-
 drivers/net/mlx5/mlx5_hws_cnt.c               |  19 +-
 drivers/net/mlx5/mlx5_rx.c                    | 187 ++++++++-----
 drivers/net/mlx5/mlx5_rx.h                    |   5 +-
 drivers/net/mlx5/mlx5_rxq.c                   |  75 +++--
 drivers/net/mlx5/mlx5_trigger.c               |  64 ++++-
 dts/api/capabilities.py                       |   2 +
 dts/api/testpmd/__init__.py                   |  17 ++
 dts/api/testpmd/types.py                      |   6 +
 dts/framework/testbed_model/capability.py     |  10 +-
 dts/tests/TestSuite_rx_split.py               | 262 ++++++++++++++++++
 lib/ethdev/rte_ethdev.c                       |  24 +-
 lib/ethdev/rte_ethdev.h                       |  14 +-
 35 files changed, 881 insertions(+), 274 deletions(-)
 create mode 100644 dts/tests/TestSuite_rx_split.py

-- 
2.54.0


^ permalink raw reply

* [PATCH v7 01/10] app/testpmd: print Rx split capabilities
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Aman Singh
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

The capabilities from rte_eth_rxseg_capa are added
to the command "show port info".

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 app/test-pmd/config.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index c950793aaf..55d1c6d696 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -790,6 +790,12 @@ rss_offload_types_display(uint64_t offload_types, uint16_t char_num_per_line)
 	printf("\n");
 }
 
+static void
+print_bool_capa(const char *label, int value)
+{
+	printf("%s: %s\n", label, value ? "supported" : "not supported");
+}
+
 void
 port_infos_display(portid_t port_id)
 {
@@ -911,6 +917,16 @@ port_infos_display(portid_t port_id)
 		dev_info.max_rx_pktlen);
 	printf("Maximum configurable size of LRO aggregated packet: %u\n",
 		dev_info.max_lro_pkt_size);
+
+	printf("Rx split:\n");
+	printf("\tMax segments: %hu\n", dev_info.rx_seg_capa.max_nseg);
+	if (dev_info.rx_seg_capa.max_nseg > 0) {
+		print_bool_capa("\tMulti-pool", dev_info.rx_seg_capa.multi_pools);
+		print_bool_capa("\tBuffer offset", dev_info.rx_seg_capa.offset_allowed);
+		printf("\tOffset alignment: %u\n",
+				RTE_BIT32(dev_info.rx_seg_capa.offset_align_log2));
+	}
+
 	if (dev_info.max_vfs)
 		printf("Maximum number of VFs: %u\n", dev_info.max_vfs);
 	if (dev_info.max_vmdq_pools)
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 02/10] ethdev: introduce selective Rx
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Gregory Etelson, Andrew Rybchenko, Aman Singh
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

From: Gregory Etelson <getelson@nvidia.com>

Receiving an entire packet is not always needed.
The Rx performance can be improved by receiving only partial data
and safely discard the rest of the packet data,
because it reduces the PCI bandwidth and the memory consumption.

Selective Rx allows an application to receive
only pre-configured packet segments and discard the rest.
For example:
- Deliver the first N bytes only.
- Deliver the last N bytes only.
- Deliver N1 bytes from offset Off1 and N2 bytes from offset Off2.

Selective Rx is implemented on top of the Rx buffer split API:
- rte_eth_rxseg_split uses the null mempool for segments
that should be discarded.
- the PMD does not create mbuf segments if no data read.

For example: Deliver Ethernet header only

Rx queue segments configuration:
struct rte_eth_rxseg_split split[2] = {
    {
        .mp = <some mempool>,
        .length = sizeof(struct rte_ether_hdr)
    },
    {
        .mp = NULL, /* discard data */
        .length = 0 /* default to buffer size */
    }
};

Received mbuf:
    pkt_len = sizeof(struct rte_ether_hdr);
    data_len = sizeof(struct rte_ether_hdr);
    next = NULL; /* The next segment did not deliver data */

After selective Rx, the mbuf packet length reflects only the data
that was actually received,
and can be less than the original wire packet length.

A PMD activates the selective Rx capability by setting
the rte_eth_rxseg_capa.selective_rx bit.

This new capability bit is inserted in a bitmap hole
of the struct rte_eth_rxseg_capa,
but it needs to be ignored in the ABI check as libabigail sees a change.

Signed-off-by: Gregory Etelson <getelson@nvidia.com>
Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
Reviewed-by: Andrew Rybchenko <andrew.rybchenko@oktetlabs.ru>
---
 app/test-pmd/config.c                  |  1 +
 devtools/libabigail.abignore           |  7 +++++++
 doc/guides/nics/features.rst           | 14 ++++++++++++++
 doc/guides/nics/features/default.ini   |  1 +
 doc/guides/rel_notes/release_26_07.rst |  7 +++++++
 lib/ethdev/rte_ethdev.c                | 24 ++++++++++++++++--------
 lib/ethdev/rte_ethdev.h                | 14 +++++++++++++-
 7 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 55d1c6d696..9d457ca88e 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -925,6 +925,7 @@ port_infos_display(portid_t port_id)
 		print_bool_capa("\tBuffer offset", dev_info.rx_seg_capa.offset_allowed);
 		printf("\tOffset alignment: %u\n",
 				RTE_BIT32(dev_info.rx_seg_capa.offset_align_log2));
+		print_bool_capa("\tSelective Rx", dev_info.rx_seg_capa.selective_rx);
 	}
 
 	if (dev_info.max_vfs)
diff --git a/devtools/libabigail.abignore b/devtools/libabigail.abignore
index 21b8cd6113..2a0efd718e 100644
--- a/devtools/libabigail.abignore
+++ b/devtools/libabigail.abignore
@@ -33,3 +33,10 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; Temporary exceptions till next major ABI version ;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Ignore new bit selective_rx in rte_eth_rxseg_capa bitmap hole
+[suppress_type]
+        name = rte_eth_rxseg_capa
+        type_kind = struct
+        has_size_change = no
+        has_data_member_inserted_at = 6
diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst
index a075c057ec..26357036ca 100644
--- a/doc/guides/nics/features.rst
+++ b/doc/guides/nics/features.rst
@@ -199,6 +199,20 @@ Scatters the packets being received on specified boundaries to segmented mbufs.
 * **[related] API**: ``rte_eth_rx_queue_setup()``, ``rte_eth_buffer_split_get_supported_hdr_ptypes()``.
 
 
+.. _nic_features_selective_rx:
+
+Selective Rx
+------------
+
+Discards some segments of buffer split on Rx.
+
+* **[uses]     rte_eth_rxconf,rte_eth_rxmode**: ``offloads:RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT``.
+* **[uses]     rte_eth_rxconf**: ``rx_seg.mp = NULL`` to discard segments.
+* **[provides] rte_eth_dev_info**: ``rx_offload_capa:RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT``.
+* **[provides] rte_eth_dev_info**: ``rx_seg_capa.selective_rx``.
+* **[related]  API**: ``rte_eth_rx_queue_setup()``.
+
+
 .. _nic_features_lro:
 
 LRO
diff --git a/doc/guides/nics/features/default.ini b/doc/guides/nics/features/default.ini
index e50514d750..8303a530c1 100644
--- a/doc/guides/nics/features/default.ini
+++ b/doc/guides/nics/features/default.ini
@@ -25,6 +25,7 @@ Burst mode info      =
 Power mgmt address monitor =
 MTU update           =
 Buffer split on Rx   =
+Selective Rx         =
 Scattered Rx         =
 LRO                  =
 TSO                  =
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index b8a3e2ced9..90a7948c1d 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -81,6 +81,13 @@ New Features
 
   Added no-IOMMU mode for devices without or not enabling IOMMU/SVA.
 
+* **Added selective Rx in ethdev API.**
+
+  Some parts of packets may be discarded in Rx
+  by configuring a split of packets received in a queue,
+  and assigning no mempool to some configuration segments.
+  This is a driver capability advertised in the ``selective_rx`` bit.
+
 * **Added LinkData sxe2 ethernet driver.**
 
   Added network driver for the LinkData network adapters.
diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index d0273e3f7b..67d609820c 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -2129,7 +2129,7 @@ rte_eth_rx_queue_check_split(uint16_t port_id,
 			const struct rte_eth_dev_info *dev_info)
 {
 	const struct rte_eth_rxseg_capa *seg_capa = &dev_info->rx_seg_capa;
-	struct rte_mempool *mp_first;
+	struct rte_mempool *mp_first = NULL;
 	uint32_t offset_mask;
 	uint16_t seg_idx;
 	int ret = 0;
@@ -2148,7 +2148,6 @@ rte_eth_rx_queue_check_split(uint16_t port_id,
 	 * Check the sizes and offsets against buffer sizes
 	 * for each segment specified in extended configuration.
 	 */
-	mp_first = rx_seg[0].mp;
 	offset_mask = RTE_BIT32(seg_capa->offset_align_log2) - 1;
 
 	ptypes = NULL;
@@ -2160,13 +2159,17 @@ rte_eth_rx_queue_check_split(uint16_t port_id,
 		uint32_t offset = rx_seg[seg_idx].offset;
 		uint32_t proto_hdr = rx_seg[seg_idx].proto_hdr;
 
-		if (mpl == NULL) {
-			RTE_ETHDEV_LOG_LINE(ERR, "null mempool pointer");
-			ret = -EINVAL;
-			goto out;
+		if (mpl == NULL) { /* discarded segment */
+			if (seg_capa->selective_rx == 0) { /* not supported */
+				RTE_ETHDEV_LOG_LINE(ERR, "null mempool pointer");
+				ret = -EINVAL;
+				goto out;
+			}
+			continue; /* next checks are not relevant if no mempool */
 		}
-		if (seg_idx != 0 && mp_first != mpl &&
-		    seg_capa->multi_pools == 0) {
+		if (mp_first == NULL)
+			mp_first = mpl;
+		if (mp_first != mpl && seg_capa->multi_pools == 0) {
 			RTE_ETHDEV_LOG_LINE(ERR, "Receiving to multiple pools is not supported");
 			ret = -ENOTSUP;
 			goto out;
@@ -2233,6 +2236,11 @@ rte_eth_rx_queue_check_split(uint16_t port_id,
 		if (ret != 0)
 			goto out;
 	}
+	if (mp_first == NULL) {
+		RTE_ETHDEV_LOG_LINE(ERR, "At least one Rx segment must have a mempool");
+		ret = -EINVAL;
+		goto out;
+	}
 out:
 	free(ptypes);
 	return ret;
diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
index dedbc05554..d7e1560828 100644
--- a/lib/ethdev/rte_ethdev.h
+++ b/lib/ethdev/rte_ethdev.h
@@ -1073,6 +1073,7 @@ struct rte_eth_txmode {
  * - The first network buffer will be allocated from the memory pool,
  *   specified in the first array element, the second buffer, from the
  *   pool in the second element, and so on.
+ *   If the pool is NULL, the segment will be discarded, i.e. not received.
  *
  * - The proto_hdrs in the elements define the split position of
  *   received packets.
@@ -1121,7 +1122,15 @@ struct rte_eth_txmode {
  *   The rest will be put into the last valid pool.
  */
 struct rte_eth_rxseg_split {
-	struct rte_mempool *mp; /**< Memory pool to allocate segment from. */
+	/**
+	 * Memory pool to allocate segment from.
+	 *
+	 * NULL means discarded segment.
+	 * Length of discarded segment is not reflected in mbuf packet length
+	 * and not accounted in ibytes statistics.
+	 * @see rte_eth_rxseg_capa::selective_rx
+	 */
+	struct rte_mempool *mp;
 	uint16_t length; /**< Segment data length, configures split point. */
 	uint16_t offset; /**< Data offset from beginning of mbuf data buffer. */
 	/**
@@ -1752,12 +1761,15 @@ struct rte_eth_switch_info {
  * @b EXPERIMENTAL: this structure may change without prior notice.
  *
  * Ethernet device Rx buffer segmentation capabilities.
+ *
+ * @see rte_eth_rxseg_split
  */
 struct rte_eth_rxseg_capa {
 	__extension__
 	uint32_t multi_pools:1; /**< Supports receiving to multiple pools.*/
 	uint32_t offset_allowed:1; /**< Supports buffer offsets. */
 	uint32_t offset_align_log2:4; /**< Required offset alignment. */
+	uint32_t selective_rx:1; /**< Supports discarding segment. */
 	uint16_t max_nseg; /**< Maximum amount of segments to split. */
 	uint16_t reserved; /**< Reserved field. */
 };
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 03/10] app/testpmd: support selective Rx
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Gregory Etelson, Aman Singh
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

From: Gregory Etelson <getelson@nvidia.com>

Add support for selective Rx using existing rxoffs and rxpkts
command line parameters.

When both rxoffs and rxpkts are specified
on PMDs supporting selective Rx, testpmd automatically:
1. Inserts segments with NULL mempool
   for gaps between configured segments to discard unwanted data.
2. Adds a trailing segment with NULL mempool
   to cover any remaining data up to the max packet length.

Example usage to receive only Ethernet header and a segment at offset 128:

  --rxoffs=0,128 --rxpkts=14,64

This creates segments:
- [0-13]: 14 bytes with mempool (received)
- [14-127]: 114 bytes with NULL mempool (discarded)
- [128-191]: 64 bytes with mempool (received)
- [192-max]: remaining bytes with NULL mempool (discarded)

Note: RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT is required for this feature
and is checked at ethdev API level.
This check is removed from testpmd to allow negative testing of the API.

Signed-off-by: Gregory Etelson <getelson@nvidia.com>
---
 app/test-pmd/testpmd.c                | 71 ++++++++++++++++++++++-----
 doc/guides/testpmd_app_ug/run_app.rst | 20 ++++++++
 2 files changed, 80 insertions(+), 11 deletions(-)

diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index a9b35f530a..d14341d3ff 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -2731,6 +2731,16 @@ port_is_started(portid_t port_id)
 	return 1;
 }
 
+static struct rte_eth_rxseg_split *
+next_rx_seg(union rte_eth_rxseg *segs, uint16_t *idx)
+{
+	if (*idx >= MAX_SEGS_BUFFER_SPLIT) {
+		fprintf(stderr, "Too many segments (max %u)\n", MAX_SEGS_BUFFER_SPLIT);
+		return NULL;
+	}
+	return &segs[(*idx)++].split;
+}
+
 /* Configure the Rx with optional split. */
 int
 rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
@@ -2744,31 +2754,70 @@ rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
 	uint32_t prev_hdrs = 0;
 	int ret;
 
-	if ((rx_pkt_nb_segs > 1) &&
-	    (rx_conf->offloads & RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT)) {
+	if (rx_pkt_nb_segs > 1 || rx_pkt_nb_offs > 0) {
+		struct rte_eth_dev_info dev_info;
+		uint16_t seg_idx = 0;
+		uint16_t next_offset = 0;
+
+		ret = rte_eth_dev_info_get(port_id, &dev_info);
+		if (ret != 0)
+			return ret;
+
 		/* multi-segment configuration */
 		for (i = 0; i < rx_pkt_nb_segs; i++) {
-			struct rte_eth_rxseg_split *rx_seg = &rx_useg[i].split;
-			/*
-			 * Use last valid pool for the segments with number
-			 * exceeding the pool index.
-			 */
+			struct rte_eth_rxseg_split *rx_seg;
+			uint16_t seg_offset;
+
+			if (i < rx_pkt_nb_offs)
+				seg_offset = rx_pkt_seg_offsets[i];
+			else
+				seg_offset = rx_pkt_nb_offs > 0 ? next_offset : 0;
+
+			/* Insert selective Rx discard segment if there's a gap */
+			if (seg_offset > next_offset) {
+				rx_seg = next_rx_seg(rx_useg, &seg_idx);
+				if (rx_seg == NULL)
+					return -EINVAL;
+				rx_seg->offset = next_offset;
+				rx_seg->length = seg_offset - next_offset;
+				rx_seg->mp = NULL;
+				next_offset = seg_offset;
+			}
+
+			rx_seg = next_rx_seg(rx_useg, &seg_idx);
+			if (rx_seg == NULL)
+				return -EINVAL;
 			mp_n = (i >= mbuf_data_size_n) ? mbuf_data_size_n - 1 : i;
 			mpx = mbuf_pool_find(socket_id, mp_n);
-			/* Handle zero as mbuf data buffer size. */
-			rx_seg->offset = i < rx_pkt_nb_offs ?
-					   rx_pkt_seg_offsets[i] : 0;
+			rx_seg->offset = seg_offset;
 			rx_seg->mp = mpx ? mpx : mp;
 			if (rx_pkt_hdr_protos[i] != 0 && rx_pkt_seg_lengths[i] == 0) {
 				rx_seg->proto_hdr = rx_pkt_hdr_protos[i] & ~prev_hdrs;
 				prev_hdrs |= rx_seg->proto_hdr;
+			} else if (rx_pkt_nb_offs > 0 && rx_pkt_seg_lengths[i] == 0) {
+				/* Insert fake discard segment if explicitly requested */
+				rx_seg->mp = NULL;
+				rx_seg->length = 0;
 			} else {
 				rx_seg->length = rx_pkt_seg_lengths[i] ?
 						rx_pkt_seg_lengths[i] :
 						mbuf_data_size[mp_n];
 			}
+
+			next_offset = seg_offset + rx_seg->length;
 		}
-		rx_conf->rx_nseg = rx_pkt_nb_segs;
+
+		/* Add trailing selective Rx discard segment up to max packet length */
+		if (rx_pkt_nb_offs > 0 && next_offset < dev_info.max_rx_pktlen) {
+			struct rte_eth_rxseg_split *rx_seg = next_rx_seg(rx_useg, &seg_idx);
+			if (rx_seg == NULL)
+				return -EINVAL;
+			rx_seg->offset = next_offset;
+			rx_seg->length = dev_info.max_rx_pktlen - next_offset;
+			rx_seg->mp = NULL;
+		}
+
+		rx_conf->rx_nseg = seg_idx;
 		rx_conf->rx_seg = rx_useg;
 		rx_conf->rx_mempools = NULL;
 		rx_conf->rx_nmempool = 0;
diff --git a/doc/guides/testpmd_app_ug/run_app.rst b/doc/guides/testpmd_app_ug/run_app.rst
index 1a4a4b6c12..b59991ed89 100644
--- a/doc/guides/testpmd_app_ug/run_app.rst
+++ b/doc/guides/testpmd_app_ug/run_app.rst
@@ -364,6 +364,11 @@ The command line options are:
     feature is engaged. Affects only the queues configured
     with split offloads (currently BUFFER_SPLIT is supported only).
 
+    When used with ``--rxpkts`` on PMDs supporting selective Rx,
+    enables receiving only specific packet segments and discarding the rest.
+    Gaps between configured segments and any trailing data up to the max packet length
+    are automatically filled with NULL mempool segments (data is discarded).
+
 *   ``--rxpkts=X[,Y]``
 
     Set the length of segments to scatter packets on receiving if split
@@ -373,6 +378,21 @@ The command line options are:
     command line parameter and the mbufs to receive will be allocated
     sequentially from these extra memory pools.
 
+    Note: ``--rxoffs`` is required to enable selective Rx in testpmd.
+    To receive only the first N bytes, use ``--rxoffs=0 --rxpkts=N``.
+
+    To receive only the Ethernet header (14 bytes at offset 0) and
+    a 64-byte segment starting at offset 128, while discarding the rest::
+
+       --rxoffs=0,128 --rxpkts=14,64
+
+    This configuration will:
+
+    * Receive 14 bytes at offset 0 (Ethernet header)
+    * Discard bytes 14-127 (inserted NULL mempool segment)
+    * Receive 64 bytes at offset 128
+    * Discard remaining bytes (inserted NULL mempool segment)
+
 *   ``--txpkts=X[,Y]``
 
     Set TX segment sizes or total packet length. Valid for ``tx-only``
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 04/10] common/mlx5: add null MR functions
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Gregory Etelson, Dariusz Sosnowski,
	Viacheslav Ovsiienko, Bing Zhao, Ori Kam, Suanming Mou,
	Matan Azrad
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

From: Gregory Etelson <getelson@nvidia.com>

Add functions to allocate and free a null Memory Region (MR)
using ibverbs on Linux.

There is no implementation for DevX on Windows.

Signed-off-by: Gregory Etelson <getelson@nvidia.com>
Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 drivers/common/mlx5/linux/mlx5_common_verbs.c | 35 +++++++++++++++++++
 drivers/common/mlx5/mlx5_common_mr.h          |  9 +++++
 drivers/common/mlx5/windows/mlx5_common_os.c  | 16 +++++++++
 3 files changed, 60 insertions(+)

diff --git a/drivers/common/mlx5/linux/mlx5_common_verbs.c b/drivers/common/mlx5/linux/mlx5_common_verbs.c
index 2322d9d033..6d44e1f566 100644
--- a/drivers/common/mlx5/linux/mlx5_common_verbs.c
+++ b/drivers/common/mlx5/linux/mlx5_common_verbs.c
@@ -161,3 +161,38 @@ mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb, mlx5_dereg_mr_t *dereg_mr_cb)
 	*reg_mr_cb = mlx5_common_verbs_reg_mr;
 	*dereg_mr_cb = mlx5_common_verbs_dereg_mr;
 }
+
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_alloc_null_mr)
+struct mlx5_pmd_mr *
+mlx5_os_alloc_null_mr(struct rte_device *dev, void *pd)
+{
+	struct ibv_mr *ibv_mr;
+	struct mlx5_pmd_mr *null_mr;
+
+	null_mr = mlx5_malloc(MLX5_MEM_ZERO, sizeof(*null_mr), 0, dev->numa_node);
+	if (!null_mr)
+		return NULL;
+	ibv_mr = mlx5_glue->alloc_null_mr(pd);
+	if (!ibv_mr) {
+		mlx5_free(null_mr);
+		return NULL;
+	}
+	*null_mr = (struct mlx5_pmd_mr) {
+		.lkey = rte_cpu_to_be_32(ibv_mr->lkey),
+		.addr = ibv_mr->addr,
+		.len = ibv_mr->length,
+		.obj = (void *)ibv_mr,
+	};
+	return null_mr;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_free_null_mr)
+void
+mlx5_os_free_null_mr(struct mlx5_pmd_mr *null_mr)
+{
+	if (!null_mr)
+		return;
+	if (null_mr->obj)
+		claim_zero(mlx5_glue->dereg_mr(null_mr->obj));
+	mlx5_free(null_mr);
+}
diff --git a/drivers/common/mlx5/mlx5_common_mr.h b/drivers/common/mlx5/mlx5_common_mr.h
index cf7c685e9b..00f3d832c3 100644
--- a/drivers/common/mlx5/mlx5_common_mr.h
+++ b/drivers/common/mlx5/mlx5_common_mr.h
@@ -21,6 +21,8 @@
 #include "mlx5_common_mp.h"
 #include "mlx5_common_defs.h"
 
+struct rte_device;
+
 /* mlx5 PMD MR struct. */
 struct mlx5_pmd_mr {
 	uint32_t	     lkey;
@@ -258,6 +260,13 @@ __rte_internal
 void
 mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb, mlx5_dereg_mr_t *dereg_mr_cb);
 
+__rte_internal
+struct mlx5_pmd_mr *
+mlx5_os_alloc_null_mr(struct rte_device *dev, void *pd);
+__rte_internal
+void
+mlx5_os_free_null_mr(struct mlx5_pmd_mr *null_mr);
+
 __rte_internal
 int
 mlx5_mr_mempool_register(struct mlx5_common_device *cdev,
diff --git a/drivers/common/mlx5/windows/mlx5_common_os.c b/drivers/common/mlx5/windows/mlx5_common_os.c
index 16fcc5f9fc..692517a9bf 100644
--- a/drivers/common/mlx5/windows/mlx5_common_os.c
+++ b/drivers/common/mlx5/windows/mlx5_common_os.c
@@ -454,6 +454,22 @@ mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb, mlx5_dereg_mr_t *dereg_mr_cb)
 	*dereg_mr_cb = mlx5_os_dereg_mr;
 }
 
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_alloc_null_mr)
+struct mlx5_pmd_mr *
+mlx5_os_alloc_null_mr(struct rte_device *dev, void *pd)
+{
+	RTE_SET_USED(dev);
+	RTE_SET_USED(pd);
+	return NULL;
+}
+
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_free_null_mr)
+void
+mlx5_os_free_null_mr(struct mlx5_pmd_mr *null_mr)
+{
+	RTE_SET_USED(null_mr);
+}
+
 /*
  * In Windows, no need to wrap the MR, no known issue for it in kernel.
  * Use the regular function to create direct MR.
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 05/10] net/mlx5: fix Rx split segment counter type
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, stable, Dariusz Sosnowski,
	Viacheslav Ovsiienko, Bing Zhao, Ori Kam, Suanming Mou,
	Matan Azrad
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

In the API, rx_nseg and max_nseg are uint16_t.
In mlx5, MLX5_MAX_RXQ_NSEG is 32.
So there is no reason to have rxseg_n as uint32_t.
Reduce the fields to uint16_t and move them to avoid struct holes.

Fixes: 9f209b59c8b0 ("net/mlx5: support Rx buffer split description")
Fixes: 572c9d4bda08 ("net/mlx5: fix shared Rx queue segment configuration match")
Cc: stable@dpdk.org

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 drivers/net/mlx5/mlx5_rx.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rx.h b/drivers/net/mlx5/mlx5_rx.h
index dffab3955b..01b563d981 100644
--- a/drivers/net/mlx5/mlx5_rx.h
+++ b/drivers/net/mlx5/mlx5_rx.h
@@ -164,9 +164,9 @@ struct __rte_cache_aligned mlx5_rxq_data {
 	uint64_t flow_meta_mask;
 	int32_t flow_meta_offset;
 	uint32_t flow_meta_port_mask;
-	uint32_t rxseg_n; /* Number of split segment descriptions. */
 	struct mlx5_eth_rxseg rxseg[MLX5_MAX_RXQ_NSEG];
 	/* Buffer split segment descriptions - sizes, offsets, pools. */
+	uint16_t rxseg_n; /* Number of split segment descriptions. */
 	uint16_t rq_win_cnt; /* Number of packets in the sliding window data. */
 	uint16_t rq_win_idx_mask; /* Sliding window index wrapping mask. */
 	uint16_t rq_win_idx; /* Index of the first element in sliding window. */
@@ -191,9 +191,9 @@ struct mlx5_rxq_ctrl {
 	unsigned int irq:1; /* Whether IRQ is enabled. */
 	uint32_t flow_tunnels_n[MLX5_FLOW_TUNNEL]; /* Tunnels counters. */
 	uint32_t wqn; /* WQ number. */
-	uint32_t rxseg_n; /* Number of split segment descriptions. */
 	struct rte_eth_rxseg_split rxseg[MLX5_MAX_RXQ_NSEG];
 	/* Saved original buffer split segment configuration. */
+	uint16_t rxseg_n; /* Number of split segment descriptions. */
 	uint16_t dump_file_n; /* Number of dump files. */
 };
 
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 06/10] net/mlx5: support selective Rx
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Gregory Etelson, Dariusz Sosnowski,
	Viacheslav Ovsiienko, Bing Zhao, Ori Kam, Suanming Mou,
	Matan Azrad
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

From: Gregory Etelson <getelson@nvidia.com>

Selective Rx may save some PCI bandwidth.
Implement selective Rx in the (quite slow) scalar SPRQ Rx path
mlx5_rx_burst() where the performance impact
of the added condition branches is acceptable.
Other Rx functions do not support this feature.
When using selective Rx, mlx5_rx_burst will be selected.

A null Memory Region (MR) is always allocated
at shared device context initialization.
The selective Rx capability is not advertised
if this special MR allocation fails.

For each Rx segment configured with a NULL mempool,
a "null mbuf" is created.
It is a fake mbuf allocated outside any mempool,
used as a placeholder in the Rx ring.
The null MR lkey is used in the WQE for these segments
so the NIC writes received data to a discard buffer.
The mbuf data room size is resolved from the first segment having a pool.
For null segments, the buffer length is from the last seen pool,
so that the WQE stride size remains consistent.

In mlx5_rx_burst, discarded segments are not chained
into the packet mbuf list, NB_SEGS is decremented accordingly,
and no replacement buffer is allocated.
A separate data_seg_len accumulator tracks the total length
of delivered segments only.
The packet length is adjusted to reflect only the data
actually delivered to the application.

Signed-off-by: Gregory Etelson <getelson@nvidia.com>
Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 doc/guides/nics/features/mlx5.ini      |  1 +
 doc/guides/nics/mlx5.rst               | 86 +++++++++++++++++++-------
 doc/guides/rel_notes/release_26_07.rst |  4 ++
 drivers/net/mlx5/mlx5.c                |  7 +++
 drivers/net/mlx5/mlx5.h                |  1 +
 drivers/net/mlx5/mlx5_ethdev.c         | 25 ++++++++
 drivers/net/mlx5/mlx5_rx.c             | 49 ++++++++++++---
 drivers/net/mlx5/mlx5_rx.h             |  1 +
 drivers/net/mlx5/mlx5_rxq.c            | 45 ++++++++++++--
 drivers/net/mlx5/mlx5_trigger.c        | 52 +++++++++++++---
 10 files changed, 230 insertions(+), 41 deletions(-)

diff --git a/doc/guides/nics/features/mlx5.ini b/doc/guides/nics/features/mlx5.ini
index 3b3eda28b8..ae8c83057b 100644
--- a/doc/guides/nics/features/mlx5.ini
+++ b/doc/guides/nics/features/mlx5.ini
@@ -16,6 +16,7 @@ Burst mode info      = Y
 Power mgmt address monitor = Y
 MTU update           = Y
 Buffer split on Rx   = Y
+Selective Rx         = Y
 Scattered Rx         = Y
 LRO                  = Y
 TSO                  = Y
diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index 00bfb31370..afbf040e66 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -84,6 +84,9 @@ The Rx / Tx data path use different techniques to offer the best performance.
   with :ref:`multi-packet Rx queues (MPRQ) <mlx5_mprq_params>`.
   This feature is disabled by default.
 
+- Some PCI bandwidth is saved by receiving partial packets
+  with :ref:`selective Rx <mlx5_selective_rx>`.
+
 More details about Rx implementations and their configurations are provided
 in the chapter about :ref:`mlx5_rx_functions`.
 
@@ -879,6 +882,8 @@ MLX5 supports various methods to report statistics:
 Basic port statistics can be queried using ``rte_eth_stats_get()``.
 The received and sent statistics are through SW only
 and counts the number of packets received or sent successfully by the PMD.
+In the case of :ref:`selective Rx <mlx5_selective_rx>`,
+the ``ibytes`` counter matches segments delivered, not the skipped ones.
 The ``imissed`` counter is the amount of packets that could not be delivered
 to SW because a queue was full.
 Packets not received due to congestion in the bus or on the NIC
@@ -992,25 +997,26 @@ These configurations may also have an impact on the behavior:
 
 .. table:: Rx burst functions
 
-   +-------------------+------------------------+---------+-----------------+------+-------+---------+
-   || Function Name    || Parameters to Enable  || Scatter|| Error Recovery || CQE || Large|| Shared |
-   |                   |                        |         |                 || comp|| MTU  |  RxQ    |
-   +===================+========================+=========+=================+======+=======+=========+
-   | rx_burst          | rx_vec_en=0            |   Yes   | Yes             |  Yes |  Yes  | No      |
-   +-------------------+------------------------+---------+-----------------+------+-------+---------+
-   | rx_burst_vec      | rx_vec_en=1 (default)  |   No    | if CQE comp off |  Yes |  No   | No      |
-   +-------------------+------------------------+---------+-----------------+------+-------+---------+
-   | rx_burst_mprq     || mprq_en=1             |   No    | Yes             |  Yes |  Yes  | No      |
-   |                   || RxQs >= rxqs_min_mprq |         |                 |      |       |         |
-   +-------------------+------------------------+---------+-----------------+------+-------+---------+
-   | rx_burst_mprq_vec || rx_vec_en=1 (default) |   No    | if CQE comp off |  Yes |  Yes  | No      |
-   |                   || mprq_en=1             |         |                 |      |       |         |
-   |                   || RxQs >= rxqs_min_mprq |         |                 |      |       |         |
-   +-------------------+------------------------+---------+-----------------+------+-------+---------+
-   | rx_burst          | at least one Rx queue  |   Yes   | Yes             |  Yes |  Yes  | Yes     |
-   |  (out of order)   | on the device          |         |                 |      |       |         |
-   |                   | is shared              |         |                 |      |       |         |
-   +-------------------+------------------------+---------+-----------------+------+-------+---------+
+   +----------+-----------------------+---------+--------+----------+------+-------+--------+
+   || Function|| Parameters to Enable || Scatter|| Selec-|| Error   || CQE || Large|| Shared|
+   || Name    |                       |         || tive  || Recovery|| comp|| MTU  || RxQ   |
+   +==========+=======================+=========+========+==========+======+=======+========+
+   | rx_burst | rx_vec_en=0           |   Yes   |   Yes  | Yes      |  Yes |  Yes  |   No   |
+   +----------+-----------------------+---------+--------+----------+------+-------+--------+
+   | _vec     | rx_vec_en=1 (default) |   No    |   No   || if CQE  |  Yes |  No   |   No   |
+   |          |                       |         |        || comp off|      |       |        |
+   +----------+-----------------------+---------+--------+----------+------+-------+--------+
+   | _mprq    || mprq_en=1            |   No    |   No   | Yes      |  Yes |  Yes  |   No   |
+   |          || RxQs >= rxqs_min_mprq|         |        |          |      |       |        |
+   +----------+-----------------------+---------+--------+----------+------+-------+--------+
+   | _mprq_vec|| rx_vec_en=1 (default)|   No    |   No   || if CQE  |  Yes |  Yes  |   No   |
+   |          || mprq_en=1            |         |        || comp off|      |       |        |
+   |          || RxQs >= rxqs_min_mprq|         |        |          |      |       |        |
+   +----------+-----------------------+---------+--------+----------+------+-------+--------+
+   || _out_of || at least one Rx queue|   Yes   |   No   | Yes      |  Yes |  Yes  |   Yes  |
+   || _order  || on the device        |         |        |          |      |       |        |
+   |          || is shared            |         |        |          |      |       |        |
+   +----------+-----------------------+---------+--------+----------+------+-------+--------+
 
 
 Rx/Tx Tuning
@@ -1105,13 +1111,14 @@ Rx interrupt                                X
 :ref:`Rx threshold <mlx5_rx_threshold>`     X        X
 :ref:`Rx drop delay <mlx5_drop>`            X        X
 :ref:`Rx timestamp <mlx5_rx_timstp>`        X        X
+:ref:`buffer split <mlx5_buf_split>`        X        X
+:ref:`selective Rx <mlx5_selective_rx>`     X
+:ref:`multi-segment <mlx5_multiseg>`        X        X
 :ref:`Tx scheduling <mlx5_tx_sched>`        X
 :ref:`Tx rate limit <mlx5_rate_limit>`      X
 :ref:`Tx inline <mlx5_tx_inline>`           X        X
 :ref:`Tx fast free <mlx5_tx_fast_free>`     X        X
 :ref:`Tx affinity <mlx5_aggregated>`        X
-:ref:`buffer split <mlx5_buf_split>`        X        X
-:ref:`multi-segment <mlx5_multiseg>`        X        X
 promiscuous                                 X        X
 multicast promiscuous                       X        X
 multiple MAC addresses                      X
@@ -2248,13 +2255,50 @@ OFED       5.1-2
 DPDK       20.11
 =========  ==========
 
+Runtime configuration
+^^^^^^^^^^^^^^^^^^^^^
+
+The offload flag ``RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT`` is required.
+
+When calling ``rte_eth_rx_queue_setup()``,
+the input ``rte_eth_rxconf::rx_seg`` defines the configuration of the segments,
+mainly offset and length.
+
 Limitations
 ^^^^^^^^^^^
 
+#. Splitting per protocol header is not supported.
+
 #. Buffer split offload is supported with regular Rx burst routine only,
    no MPRQ feature or vectorized code can be engaged.
 
 
+.. _mlx5_selective_rx:
+
+Selective Rx
+~~~~~~~~~~~~
+
+Some PCI bandwidth can be saved
+by :ref:`skipping some parts of Rx data <nic_features_selective_rx>`.
+It is enabled when using :ref:`buffer split <mlx5_buf_split>`
+and configuring no mempool in some segments to discard.
+
+Runtime configuration
+^^^^^^^^^^^^^^^^^^^^^
+
+The offload flag ``RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT`` is required.
+
+When calling ``rte_eth_rx_queue_setup()``,
+the segment to discard (``rte_eth_rxconf::rx_seg::split``)
+is marked by the absence of mempool (``mp = NULL``).
+
+Limitations
+^^^^^^^^^^^
+
+#. Selective Rx is supported with regular Rx burst routine only,
+   no MPRQ feature or vectorized code can be engaged.
+
+
 .. _mlx5_multiseg:
 
 Multi-Segment Scatter/Gather
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index 90a7948c1d..3532792229 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -97,6 +97,10 @@ New Features
   * Added support for transmitting LLDP packets based on mbuf packet type.
   * Implemented AVX2 context descriptor transmit paths.
 
+* **Updated NVIDIA mlx5 ethernet driver.**
+
+  * Added support for selective Rx in scalar SPRQ Rx path.
+
 * **Updated PCAP ethernet driver.**
 
   * Added support for VLAN insertion and stripping.
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index f190654756..61c26d1206 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1975,6 +1975,9 @@ mlx5_alloc_shared_dev_ctx(const struct mlx5_dev_spawn_data *spawn,
 	/* Init counter pool list header and lock. */
 	LIST_INIT(&sh->hws_cpool_list);
 	rte_spinlock_init(&sh->cpool_lock);
+	sh->null_mr = mlx5_os_alloc_null_mr(sh->cdev->dev, sh->cdev->pd);
+	if (!sh->null_mr)
+		DRV_LOG(DEBUG, "Fail to initialize NULL MR, selective Rx is disabled.");
 exit:
 	pthread_mutex_unlock(&mlx5_dev_ctx_list_mutex);
 	return sh;
@@ -2139,6 +2142,10 @@ mlx5_free_shared_dev_ctx(struct mlx5_dev_ctx_shared *sh)
 	MLX5_ASSERT(sh->geneve_tlv_option_resource == NULL);
 	pthread_mutex_destroy(&sh->txpp.mutex);
 	mlx5_lwm_unset(sh);
+	if (sh->null_mr) {
+		mlx5_os_free_null_mr(sh->null_mr);
+		sh->null_mr = NULL;
+	}
 	mlx5_physical_device_destroy(sh->phdev);
 	mlx5_free(sh);
 	return;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 92a00cfaa8..bd6ef35b53 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -1674,6 +1674,7 @@ struct mlx5_dev_ctx_shared {
 	rte_spinlock_t cpool_lock;
 	LIST_HEAD(hws_cpool_list, mlx5_hws_cnt_pool) hws_cpool_list; /* Count pool list. */
 	struct mlx5_dev_registers registers;
+	struct mlx5_pmd_mr *null_mr;
 	struct mlx5_dev_shared_port port[]; /* per device port data array. */
 };
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index a29cdeeb50..7b7536fa1e 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -381,6 +381,7 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 	info->rx_seg_capa.multi_pools = !priv->config.mprq.enabled;
 	info->rx_seg_capa.offset_allowed = !priv->config.mprq.enabled;
 	info->rx_seg_capa.offset_align_log2 = 0;
+	info->rx_seg_capa.selective_rx = !!priv->sh->null_mr;
 	info->rx_offload_capa = (mlx5_get_rx_port_offloads() |
 				 info->rx_queue_offload_capa);
 	info->tx_offload_capa = mlx5_get_tx_port_offloads(dev);
@@ -708,6 +709,25 @@ mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
 	return -rte_errno;
 }
 
+static bool
+mlx5_selective_rx_enabled(struct rte_eth_dev *dev)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	for (uint32_t q = 0; q < priv->rxqs_n; ++q) {
+		struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_rxq_ctrl_get(dev, q);
+
+		if (rxq_ctrl == NULL || rxq_ctrl->is_hairpin)
+			continue;
+		for (uint16_t s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
+			if (rxq_ctrl->rxq.rxseg[s].mp == NULL)
+				return true;
+		}
+	}
+
+	return false;
+}
+
 /**
  * Configure the RX function to use.
  *
@@ -723,6 +743,11 @@ mlx5_select_rx_function(struct rte_eth_dev *dev)
 	eth_rx_burst_t rx_pkt_burst = mlx5_rx_burst;
 
 	MLX5_ASSERT(dev != NULL);
+	if (mlx5_selective_rx_enabled(dev)) {
+		DRV_LOG(DEBUG, "port %u forced to scalar SPRQ Rx (selective Rx configured)",
+			dev->data->port_id);
+		return rx_pkt_burst;
+	}
 	if (mlx5_shared_rq_enabled(dev)) {
 		rx_pkt_burst = mlx5_rx_burst_out_of_order;
 		DRV_LOG(DEBUG, "port %u forced to use SPRQ"
diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
index 185bfd4fff..1eef8ab95e 100644
--- a/drivers/net/mlx5/mlx5_rx.c
+++ b/drivers/net/mlx5/mlx5_rx.c
@@ -486,7 +486,7 @@ mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
 					rxq->wqes)[i];
 			addr = rte_pktmbuf_mtod(buf, uintptr_t);
 			byte_count = DATA_LEN(buf);
-			lkey = mlx5_rx_mb2mr(rxq, buf);
+			lkey = buf->pool ? mlx5_rx_mb2mr(rxq, buf) : rxq->sh->null_mr->lkey;
 		}
 		/* scat->addr must be able to store a pointer. */
 		MLX5_ASSERT(sizeof(scat->addr) >= sizeof(uintptr_t));
@@ -1044,11 +1044,14 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	const unsigned int sges_n = rxq->sges_n;
 	struct rte_mbuf *pkt = NULL;
 	struct rte_mbuf *seg = NULL;
+	struct rte_mbuf *tail = NULL;
 	volatile struct mlx5_cqe *cqe =
 		&(*rxq->cqes)[rxq->cq_ci & cqe_mask];
+	volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 	unsigned int i = 0;
 	unsigned int rq_ci = rxq->rq_ci << sges_n;
 	int len = 0; /* keep its value across iterations. */
+	uint32_t data_seg_len = 0;
 
 	while (pkts_n) {
 		uint16_t skip_cnt;
@@ -1056,14 +1059,18 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		volatile struct mlx5_wqe_data_seg *wqe =
 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
 		struct rte_mbuf *rep = (*rxq->elts)[idx];
-		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
 
-		if (pkt)
-			NEXT(seg) = rep;
+		if (pkt) {
+			if (rep->pool)
+				NEXT(tail) = rep;
+			else
+				--NB_SEGS(pkt);
+		}
 		seg = rep;
 		rte_prefetch0(seg);
 		rte_prefetch0(cqe);
 		rte_prefetch0(wqe);
+		if (seg->pool) {
 		/* Allocate the buf from the same pool. */
 		rep = rte_mbuf_raw_alloc(seg->pool);
 		if (unlikely(rep == NULL)) {
@@ -1088,12 +1095,16 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			rq_ci <<= sges_n;
 			break;
 		}
+		}
 		if (!pkt) {
+		if (len == 0) { /* no CQE polled yet */
+			mcqe = NULL;
 			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
 			len = mlx5_rx_poll_len(rxq, cqe, cqe_n, cqe_mask,
 					       &mcqe, &skip_cnt, false, NULL);
 			if (unlikely(len & MLX5_ERROR_CQE_MASK)) {
 				/* We drop packets with non-critical errors */
+				if (seg->pool)
 				rte_mbuf_raw_free(rep);
 				if (len == MLX5_CRITICAL_ERROR_CQE_RET) {
 					rq_ci = rxq->rq_ci << sges_n;
@@ -1104,20 +1115,24 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				rq_ci += skip_cnt;
 				rq_ci <<= sges_n;
 				MLX5_ASSERT(!pkt);
+				len = 0;
 				continue;
 			}
 			if (len == 0) {
+				if (seg->pool)
 				rte_mbuf_raw_free(rep);
 				break;
 			}
-			pkt = seg;
 			MLX5_ASSERT(len >= (int)(rxq->crc_present << 2));
+			if (rxq->crc_present)
+				len -= RTE_ETHER_CRC_LEN;
+		}
+		if (seg->pool) { /* first real segment */
+			pkt = seg;
 			pkt->ol_flags &= RTE_MBUF_F_EXTERNAL;
 			if (rxq->cqe_comp_layout && mcqe)
 				cqe = &rxq->title_cqe;
 			rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
-			if (rxq->crc_present)
-				len -= RTE_ETHER_CRC_LEN;
 			PKT_LEN(pkt) = len;
 			if (cqe->lro_num_seg > 1) {
 				mlx5_lro_update_hdr
@@ -1127,6 +1142,9 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				pkt->tso_segsz = len / cqe->lro_num_seg;
 			}
 		}
+		}
+		if (seg->pool) { /* real segment: replenish WQE */
+		tail = seg;
 		DATA_LEN(rep) = DATA_LEN(seg);
 		PKT_LEN(rep) = PKT_LEN(seg);
 		SET_DATA_OFF(rep, DATA_OFF(seg));
@@ -1141,20 +1159,37 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		/* If there's only one MR, no need to replace LKey in WQE. */
 		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
 			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
+		}
 		if (len > DATA_LEN(seg)) {
+			if (seg->pool)
+				data_seg_len += DATA_LEN(seg);
 			len -= DATA_LEN(seg);
+			if (pkt)
 			++NB_SEGS(pkt);
 			++rq_ci;
 			continue;
 		}
+		if (seg->pool) { /* last segment */
 		DATA_LEN(seg) = len;
+		data_seg_len += len;
+		}
+		if (unlikely(!pkt)) { /* no real segment found, skip packet */
+			len = 0;
+			rq_ci >>= sges_n;
+			++rq_ci;
+			rq_ci <<= sges_n;
+			continue;
+		}
+		PKT_LEN(pkt) = RTE_MIN(PKT_LEN(pkt), data_seg_len);
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		/* Increment bytes counter. */
 		rxq->stats.ibytes += PKT_LEN(pkt);
 #endif
+		data_seg_len = 0;
 		/* Return packet. */
 		*(pkts++) = pkt;
 		pkt = NULL;
+		len = 0;
 		--pkts_n;
 		++i;
 		/* Align consumer index to the next stride. */
diff --git a/drivers/net/mlx5/mlx5_rx.h b/drivers/net/mlx5/mlx5_rx.h
index 01b563d981..cd48ee37ef 100644
--- a/drivers/net/mlx5/mlx5_rx.h
+++ b/drivers/net/mlx5/mlx5_rx.h
@@ -96,6 +96,7 @@ struct mlx5_eth_rxseg {
 	uint16_t length; /**< Segment data length, configures split point. */
 	uint16_t offset; /**< Data offset from beginning of mbuf data buffer. */
 	uint32_t reserved; /**< Reserved field. */
+	struct rte_mbuf *null_mbuf; /**< For selective Rx. */
 };
 
 /* RX queue descriptor. */
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 48d982a8c2..3fae189fa4 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -151,6 +151,7 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 		struct mlx5_eth_rxseg *seg = &rxq_ctrl->rxq.rxseg[i % sges_n];
 		struct rte_mbuf *buf;
 
+		if (seg->mp) {
 		buf = rte_pktmbuf_alloc(seg->mp);
 		if (buf == NULL) {
 			if (rxq_ctrl->share_group == 0)
@@ -167,6 +168,9 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 		/* Only vectored Rx routines rely on headroom size. */
 		MLX5_ASSERT(!has_vec_support ||
 			    DATA_OFF(buf) >= RTE_PKTMBUF_HEADROOM);
+		} else {
+			buf = seg->null_mbuf;
+		}
 		/* Buffer is supposed to be empty. */
 		MLX5_ASSERT(rte_pktmbuf_data_len(buf) == 0);
 		MLX5_ASSERT(rte_pktmbuf_pkt_len(buf) == 0);
@@ -324,10 +328,14 @@ rxq_free_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 		rxq->rq_pi = elts_ci;
 	}
 	for (i = 0; i != q_n; ++i) {
-		if ((*rxq->elts)[i] != NULL)
+		if ((*rxq->elts)[i] != NULL && (*rxq->elts)[i]->pool != NULL)
 			rte_pktmbuf_free_seg((*rxq->elts)[i]);
 		(*rxq->elts)[i] = NULL;
 	}
+	for (i = 0; i < rxq->rxseg_n; i++) {
+		mlx5_free(rxq->rxseg[i].null_mbuf);
+		rxq->rxseg[i].null_mbuf = NULL;
+	}
 }
 
 /**
@@ -1815,7 +1823,9 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	int ret;
 	struct mlx5_priv *priv = dev->data->dev_private;
 	struct mlx5_rxq_ctrl *tmpl;
-	unsigned int mb_len = rte_pktmbuf_data_room_size(rx_seg[0].mp);
+	struct rte_mempool *first_mp = NULL;
+	struct rte_mempool *last_mp = NULL;
+	unsigned int mb_len;
 	struct mlx5_port_config *config = &priv->config;
 	uint64_t offloads = conf->offloads |
 			   dev->data->dev_conf.rxmode.offloads;
@@ -1827,7 +1837,7 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	unsigned int non_scatter_min_mbuf_size = max_rx_pktlen +
 							RTE_PKTMBUF_HEADROOM;
 	unsigned int max_lro_size = 0;
-	unsigned int first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
+	unsigned int first_mb_free_size;
 	uint32_t mprq_log_actual_stride_num = 0;
 	uint32_t mprq_log_actual_stride_size = 0;
 	bool rx_seg_en = n_seg != 1 || rx_seg[0].offset || rx_seg[0].length;
@@ -1845,6 +1855,21 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	const struct rte_eth_rxseg_split *qs_seg = rx_seg;
 	unsigned int tail_len;
 
+	/* Find first segment with a mempool. */
+	for (uint16_t seg = 0; seg < n_seg; seg++) {
+		if (rx_seg[seg].mp != NULL) {
+			first_mp = rx_seg[seg].mp;
+			break;
+		}
+	}
+	if (first_mp == NULL) {
+		DRV_LOG(ERR, "port %u Rx queue %u has no mempool", dev->data->port_id, idx);
+		rte_errno = EINVAL;
+		return NULL;
+	}
+	mb_len = rte_pktmbuf_data_room_size(first_mp);
+	first_mb_free_size = mb_len - RTE_PKTMBUF_HEADROOM;
+
 	if (mprq_en) {
 		/* Trim the number of descs needed. */
 		desc >>= mprq_log_actual_stride_num;
@@ -1884,13 +1909,20 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	do {
 		struct mlx5_eth_rxseg *hw_seg =
 					&tmpl->rxq.rxseg[tmpl->rxq.rxseg_n];
-		uint32_t buf_len, offset, seg_len;
+		uint32_t buf_len = 0, offset, seg_len;
 
 		/*
 		 * For the buffers beyond descriptions offset is zero,
 		 * the first buffer contains head room.
 		 */
-		buf_len = rte_pktmbuf_data_room_size(qs_seg->mp);
+		if (qs_seg->mp != NULL) {
+			last_mp = qs_seg->mp;
+			buf_len = rte_pktmbuf_data_room_size(qs_seg->mp);
+		} else if (last_mp != NULL) {
+			buf_len = rte_pktmbuf_data_room_size(last_mp);
+		} else {
+			buf_len = mb_len;
+		}
 		offset = (tmpl->rxq.rxseg_n >= n_seg ? 0 : qs_seg->offset) +
 			 (tmpl->rxq.rxseg_n ? 0 : RTE_PKTMBUF_HEADROOM);
 		/*
@@ -2077,7 +2109,8 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	/* Save port ID. */
 	tmpl->rxq.port_id = dev->data->port_id;
 	tmpl->sh = priv->sh;
-	tmpl->rxq.mp = rx_seg[0].mp;
+	tmpl->rxq.sh = priv->sh;
+	tmpl->rxq.mp = first_mp;
 	tmpl->rxq.elts_n = log2above(desc);
 	tmpl->rxq.rq_repl_thresh = MLX5_VPMD_RXQ_RPLNSH_THRESH(desc_n);
 	tmpl->rxq.elts = (struct rte_mbuf *(*)[])(tmpl + 1);
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index a070aaecfd..5b04d9a234 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -116,6 +116,27 @@ mlx5_txq_start(struct rte_eth_dev *dev)
 	return -rte_errno;
 }
 
+static struct rte_mbuf *
+mlx5_alloc_null_mbuf(uint32_t data_len)
+{
+	size_t alloc_size = sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM +
+		rte_align32pow2(data_len);
+	struct rte_mbuf *m;
+
+	m = mlx5_malloc(MLX5_MEM_ZERO, alloc_size, 0, SOCKET_ID_ANY);
+	if (m == NULL)
+		return NULL;
+	m->buf_addr = RTE_PTR_ADD(m, sizeof(*m));
+	m->buf_len = alloc_size - sizeof(*m);
+	rte_mbuf_iova_set(m, rte_mem_virt2iova(m->buf_addr));
+	m->data_off = RTE_PKTMBUF_HEADROOM;
+	m->refcnt = 1;
+	m->nb_segs = 1;
+	m->port = RTE_MBUF_PORT_INVALID;
+	m->pool = NULL;
+	return m;
+}
+
 /**
  * Register Rx queue mempools and fill the Rx queue cache.
  * This function tolerates repeated mempool registration.
@@ -130,7 +151,8 @@ static int
 mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
 	struct rte_mempool *mp;
-	uint32_t s;
+	struct mlx5_eth_rxseg *seg;
+	uint16_t s;
 	int ret = 0;
 
 	mlx5_mr_flush_local_cache(&rxq_ctrl->rxq.mr_ctrl);
@@ -139,21 +161,37 @@ mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
 		return mlx5_mr_mempool_populate_cache(&rxq_ctrl->rxq.mr_ctrl,
 						      rxq_ctrl->rxq.mprq_mp);
 	for (s = 0; s < rxq_ctrl->rxq.rxseg_n; s++) {
-		bool is_extmem;
-
-		mp = rxq_ctrl->rxq.rxseg[s].mp;
-		is_extmem = (rte_pktmbuf_priv_flags(mp) &
+		seg = &rxq_ctrl->rxq.rxseg[s];
+		mp = seg->mp;
+		if (mp) { /* Regular segment */
+		bool is_extmem = (rte_pktmbuf_priv_flags(mp) &
 			     RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) != 0;
 		ret = mlx5_mr_mempool_register(rxq_ctrl->sh->cdev, mp,
 					       is_extmem);
 		if (ret < 0 && rte_errno != EEXIST)
-			return ret;
+			goto error;
 		ret = mlx5_mr_mempool_populate_cache(&rxq_ctrl->rxq.mr_ctrl,
 						     mp);
 		if (ret < 0)
-			return ret;
+			goto error;
+		} else { /* NULL segment used in selective Rx */
+			seg->null_mbuf = mlx5_alloc_null_mbuf(seg->length);
+			if (seg->null_mbuf == NULL) {
+				rte_errno = ENOMEM;
+				ret = -rte_errno;
+				goto error;
+			}
+		}
 	}
 	return 0;
+
+error:
+	while (s-- > 0) {
+		seg = &rxq_ctrl->rxq.rxseg[s];
+		mlx5_free(seg->null_mbuf);
+		seg->null_mbuf = NULL;
+	}
+	return ret;
 }
 
 /**
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 07/10] net/mlx5: reindent previous changes
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Dariusz Sosnowski, Viacheslav Ovsiienko,
	Bing Zhao, Ori Kam, Suanming Mou, Matan Azrad
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

Fix indent which was left untouched to help reviews.
This must be squashed before merging.

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 drivers/net/mlx5/mlx5_rx.c      | 164 ++++++++++++++++----------------
 drivers/net/mlx5/mlx5_rxq.c     |  32 +++----
 drivers/net/mlx5/mlx5_trigger.c |  18 ++--
 3 files changed, 106 insertions(+), 108 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rx.c b/drivers/net/mlx5/mlx5_rx.c
index 1eef8ab95e..9812bc7929 100644
--- a/drivers/net/mlx5/mlx5_rx.c
+++ b/drivers/net/mlx5/mlx5_rx.c
@@ -1071,107 +1071,107 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		rte_prefetch0(cqe);
 		rte_prefetch0(wqe);
 		if (seg->pool) {
-		/* Allocate the buf from the same pool. */
-		rep = rte_mbuf_raw_alloc(seg->pool);
-		if (unlikely(rep == NULL)) {
-			++rxq->stats.rx_nombuf;
-			if (!pkt) {
-				/*
-				 * no buffers before we even started,
-				 * bail out silently.
-				 */
-				break;
-			}
-			while (pkt != seg) {
-				MLX5_ASSERT(pkt != (*rxq->elts)[idx]);
-				rep = NEXT(pkt);
-				NEXT(pkt) = NULL;
-				NB_SEGS(pkt) = 1;
-				rte_mbuf_raw_free(pkt);
-				pkt = rep;
-			}
-			rq_ci >>= sges_n;
-			++rq_ci;
-			rq_ci <<= sges_n;
-			break;
-		}
-		}
-		if (!pkt) {
-		if (len == 0) { /* no CQE polled yet */
-			mcqe = NULL;
-			cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
-			len = mlx5_rx_poll_len(rxq, cqe, cqe_n, cqe_mask,
-					       &mcqe, &skip_cnt, false, NULL);
-			if (unlikely(len & MLX5_ERROR_CQE_MASK)) {
-				/* We drop packets with non-critical errors */
-				if (seg->pool)
-				rte_mbuf_raw_free(rep);
-				if (len == MLX5_CRITICAL_ERROR_CQE_RET) {
-					rq_ci = rxq->rq_ci << sges_n;
+			/* Allocate the buf from the same pool. */
+			rep = rte_mbuf_raw_alloc(seg->pool);
+			if (unlikely(rep == NULL)) {
+				++rxq->stats.rx_nombuf;
+				if (!pkt) {
+					/*
+					 * no buffers before we even started,
+					 * bail out silently.
+					 */
 					break;
 				}
-				/* Skip specified amount of error CQEs packets */
+				while (pkt != seg) {
+					MLX5_ASSERT(pkt != (*rxq->elts)[idx]);
+					rep = NEXT(pkt);
+					NEXT(pkt) = NULL;
+					NB_SEGS(pkt) = 1;
+					rte_mbuf_raw_free(pkt);
+					pkt = rep;
+				}
 				rq_ci >>= sges_n;
-				rq_ci += skip_cnt;
+				++rq_ci;
 				rq_ci <<= sges_n;
-				MLX5_ASSERT(!pkt);
-				len = 0;
-				continue;
-			}
-			if (len == 0) {
-				if (seg->pool)
-				rte_mbuf_raw_free(rep);
 				break;
 			}
-			MLX5_ASSERT(len >= (int)(rxq->crc_present << 2));
-			if (rxq->crc_present)
-				len -= RTE_ETHER_CRC_LEN;
 		}
-		if (seg->pool) { /* first real segment */
-			pkt = seg;
-			pkt->ol_flags &= RTE_MBUF_F_EXTERNAL;
-			if (rxq->cqe_comp_layout && mcqe)
-				cqe = &rxq->title_cqe;
-			rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
-			PKT_LEN(pkt) = len;
-			if (cqe->lro_num_seg > 1) {
-				mlx5_lro_update_hdr
-					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
-					 mcqe, rxq, len);
-				pkt->ol_flags |= RTE_MBUF_F_RX_LRO;
-				pkt->tso_segsz = len / cqe->lro_num_seg;
+		if (!pkt) { /* new packet */
+			if (len == 0) { /* no CQE polled yet */
+				mcqe = NULL;
+				cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask];
+				len = mlx5_rx_poll_len(rxq, cqe, cqe_n, cqe_mask,
+							   &mcqe, &skip_cnt, false, NULL);
+				if (unlikely(len & MLX5_ERROR_CQE_MASK)) {
+					/* We drop packets with non-critical errors */
+					if (seg->pool)
+						rte_mbuf_raw_free(rep);
+					if (len == MLX5_CRITICAL_ERROR_CQE_RET) {
+						rq_ci = rxq->rq_ci << sges_n;
+						break;
+					}
+					/* Skip specified amount of error CQEs packets */
+					rq_ci >>= sges_n;
+					rq_ci += skip_cnt;
+					rq_ci <<= sges_n;
+					MLX5_ASSERT(!pkt);
+					len = 0;
+					continue;
+				}
+				if (len == 0) {
+					if (seg->pool)
+						rte_mbuf_raw_free(rep);
+					break;
+				}
+				MLX5_ASSERT(len >= (int)(rxq->crc_present << 2));
+				if (rxq->crc_present)
+					len -= RTE_ETHER_CRC_LEN;
+			}
+			if (seg->pool) { /* first real segment */
+				pkt = seg;
+				pkt->ol_flags &= RTE_MBUF_F_EXTERNAL;
+				if (rxq->cqe_comp_layout && mcqe)
+					cqe = &rxq->title_cqe;
+				rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
+				PKT_LEN(pkt) = len;
+				if (cqe->lro_num_seg > 1) {
+					mlx5_lro_update_hdr
+						(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
+						 mcqe, rxq, len);
+					pkt->ol_flags |= RTE_MBUF_F_RX_LRO;
+					pkt->tso_segsz = len / cqe->lro_num_seg;
+				}
 			}
-		}
 		}
 		if (seg->pool) { /* real segment: replenish WQE */
-		tail = seg;
-		DATA_LEN(rep) = DATA_LEN(seg);
-		PKT_LEN(rep) = PKT_LEN(seg);
-		SET_DATA_OFF(rep, DATA_OFF(seg));
-		PORT(rep) = PORT(seg);
-		(*rxq->elts)[idx] = rep;
-		/*
-		 * Fill NIC descriptor with the new buffer. The lkey and size
-		 * of the buffers are already known, only the buffer address
-		 * changes.
-		 */
-		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
-		/* If there's only one MR, no need to replace LKey in WQE. */
-		if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
-			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
+			tail = seg;
+			DATA_LEN(rep) = DATA_LEN(seg);
+			PKT_LEN(rep) = PKT_LEN(seg);
+			SET_DATA_OFF(rep, DATA_OFF(seg));
+			PORT(rep) = PORT(seg);
+			(*rxq->elts)[idx] = rep;
+			/*
+			 * Fill NIC descriptor with the new buffer. The lkey and size
+			 * of the buffers are already known, only the buffer address
+			 * changes.
+			 */
+			wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+			/* If there's only one MR, no need to replace LKey in WQE. */
+			if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+				wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
 		}
-		if (len > DATA_LEN(seg)) {
+		if (len > DATA_LEN(seg)) { /* more data: move to next segment */
 			if (seg->pool)
 				data_seg_len += DATA_LEN(seg);
 			len -= DATA_LEN(seg);
 			if (pkt)
-			++NB_SEGS(pkt);
+				++NB_SEGS(pkt);
 			++rq_ci;
 			continue;
 		}
 		if (seg->pool) { /* last segment */
-		DATA_LEN(seg) = len;
-		data_seg_len += len;
+			DATA_LEN(seg) = len;
+			data_seg_len += len;
 		}
 		if (unlikely(!pkt)) { /* no real segment found, skip packet */
 			len = 0;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 3fae189fa4..6ca29f7543 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -152,22 +152,22 @@ rxq_alloc_elts_sprq(struct mlx5_rxq_ctrl *rxq_ctrl)
 		struct rte_mbuf *buf;
 
 		if (seg->mp) {
-		buf = rte_pktmbuf_alloc(seg->mp);
-		if (buf == NULL) {
-			if (rxq_ctrl->share_group == 0)
-				DRV_LOG(ERR, "port %u queue %u empty mbuf pool",
-					RXQ_PORT_ID(rxq_ctrl),
-					rxq_ctrl->rxq.idx);
-			else
-				DRV_LOG(ERR, "share group %u queue %u empty mbuf pool",
-					rxq_ctrl->share_group,
-					rxq_ctrl->share_qid);
-			rte_errno = ENOMEM;
-			goto error;
-		}
-		/* Only vectored Rx routines rely on headroom size. */
-		MLX5_ASSERT(!has_vec_support ||
-			    DATA_OFF(buf) >= RTE_PKTMBUF_HEADROOM);
+			buf = rte_pktmbuf_alloc(seg->mp);
+			if (buf == NULL) {
+				if (rxq_ctrl->share_group == 0)
+					DRV_LOG(ERR, "port %u queue %u empty mbuf pool",
+						RXQ_PORT_ID(rxq_ctrl),
+						rxq_ctrl->rxq.idx);
+				else
+					DRV_LOG(ERR, "share group %u queue %u empty mbuf pool",
+						rxq_ctrl->share_group,
+						rxq_ctrl->share_qid);
+				rte_errno = ENOMEM;
+				goto error;
+			}
+			/* Only vectored Rx routines rely on headroom size. */
+			MLX5_ASSERT(!has_vec_support ||
+				    DATA_OFF(buf) >= RTE_PKTMBUF_HEADROOM);
 		} else {
 			buf = seg->null_mbuf;
 		}
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 5b04d9a234..ac966c51b4 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -164,16 +164,14 @@ mlx5_rxq_mempool_register(struct mlx5_rxq_ctrl *rxq_ctrl)
 		seg = &rxq_ctrl->rxq.rxseg[s];
 		mp = seg->mp;
 		if (mp) { /* Regular segment */
-		bool is_extmem = (rte_pktmbuf_priv_flags(mp) &
-			     RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) != 0;
-		ret = mlx5_mr_mempool_register(rxq_ctrl->sh->cdev, mp,
-					       is_extmem);
-		if (ret < 0 && rte_errno != EEXIST)
-			goto error;
-		ret = mlx5_mr_mempool_populate_cache(&rxq_ctrl->rxq.mr_ctrl,
-						     mp);
-		if (ret < 0)
-			goto error;
+			bool is_extmem = (rte_pktmbuf_priv_flags(mp) &
+					RTE_PKTMBUF_POOL_F_PINNED_EXT_BUF) != 0;
+			ret = mlx5_mr_mempool_register(rxq_ctrl->sh->cdev, mp, is_extmem);
+			if (ret < 0 && rte_errno != EEXIST)
+				goto error;
+			ret = mlx5_mr_mempool_populate_cache(&rxq_ctrl->rxq.mr_ctrl, mp);
+			if (ret < 0)
+				goto error;
 		} else { /* NULL segment used in selective Rx */
 			seg->null_mbuf = mlx5_alloc_null_mbuf(seg->length);
 			if (seg->null_mbuf == NULL) {
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 08/10] common/mlx5: remove callbacks for MR registration
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, Dariusz Sosnowski, Viacheslav Ovsiienko,
	Bing Zhao, Ori Kam, Suanming Mou, Matan Azrad, Fan Zhang,
	Ashish Gupta
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

The functions register/unregister for a Memory Region (MR)
were not called directly.
There are only 2 implementations for Linux and Windows,
no need of handling this difference with function pointers.
The callback pointers are replaced with direct calls
and link time decision based on the Operating System.

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 drivers/common/mlx5/linux/mlx5_common_verbs.c | 26 +++----------
 drivers/common/mlx5/mlx5_common.c             |  6 +--
 drivers/common/mlx5/mlx5_common_mr.c          | 37 ++++++++-----------
 drivers/common/mlx5/mlx5_common_mr.h          | 26 +++----------
 drivers/common/mlx5/windows/mlx5_common_os.c  | 23 ++----------
 drivers/compress/mlx5/mlx5_compress.c         |  4 +-
 drivers/crypto/mlx5/mlx5_crypto.h             |  2 -
 drivers/crypto/mlx5/mlx5_crypto_gcm.c         |  6 +--
 drivers/net/mlx5/mlx5.h                       |  3 +-
 drivers/net/mlx5/mlx5_flow_aso.c              | 21 +++++------
 drivers/net/mlx5/mlx5_flow_hw.c               | 11 ++----
 drivers/net/mlx5/mlx5_flow_quota.c            |  6 +--
 drivers/net/mlx5/mlx5_hws_cnt.c               | 19 ++++------
 13 files changed, 61 insertions(+), 129 deletions(-)

diff --git a/drivers/common/mlx5/linux/mlx5_common_verbs.c b/drivers/common/mlx5/linux/mlx5_common_verbs.c
index 6d44e1f566..5e23c5844d 100644
--- a/drivers/common/mlx5/linux/mlx5_common_verbs.c
+++ b/drivers/common/mlx5/linux/mlx5_common_verbs.c
@@ -106,10 +106,10 @@ mlx5_set_context_attr(struct rte_device *dev, struct ibv_context *ctx)
  * @return
  *   0 on successful registration, -1 otherwise
  */
-RTE_EXPORT_INTERNAL_SYMBOL(mlx5_common_verbs_reg_mr)
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_reg_mr)
 int
-mlx5_common_verbs_reg_mr(void *pd, void *addr, size_t length,
-			 struct mlx5_pmd_mr *pmd_mr)
+mlx5_os_reg_mr(void *pd, void *addr, size_t length,
+		struct mlx5_pmd_mr *pmd_mr)
 {
 	struct ibv_mr *ibv_mr;
 
@@ -136,9 +136,9 @@ mlx5_common_verbs_reg_mr(void *pd, void *addr, size_t length,
  *   pmd_mr struct set with lkey, address, length and pointer to mr object
  *
  */
-RTE_EXPORT_INTERNAL_SYMBOL(mlx5_common_verbs_dereg_mr)
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_dereg_mr)
 void
-mlx5_common_verbs_dereg_mr(struct mlx5_pmd_mr *pmd_mr)
+mlx5_os_dereg_mr(struct mlx5_pmd_mr *pmd_mr)
 {
 	if (pmd_mr && pmd_mr->obj != NULL) {
 		claim_zero(mlx5_glue->dereg_mr(pmd_mr->obj));
@@ -146,22 +146,6 @@ mlx5_common_verbs_dereg_mr(struct mlx5_pmd_mr *pmd_mr)
 	}
 }
 
-/**
- * Set the reg_mr and dereg_mr callbacks.
- *
- * @param[out] reg_mr_cb
- *   Pointer to reg_mr func
- * @param[out] dereg_mr_cb
- *   Pointer to dereg_mr func
- */
-RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_set_reg_mr_cb)
-void
-mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb, mlx5_dereg_mr_t *dereg_mr_cb)
-{
-	*reg_mr_cb = mlx5_common_verbs_reg_mr;
-	*dereg_mr_cb = mlx5_common_verbs_dereg_mr;
-}
-
 RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_alloc_null_mr)
 struct mlx5_pmd_mr *
 mlx5_os_alloc_null_mr(struct rte_device *dev, void *pd)
diff --git a/drivers/common/mlx5/mlx5_common.c b/drivers/common/mlx5/mlx5_common.c
index f71dbe4637..87de6d0ff0 100644
--- a/drivers/common/mlx5/mlx5_common.c
+++ b/drivers/common/mlx5/mlx5_common.c
@@ -1135,7 +1135,7 @@ mlx5_common_dev_dma_map(struct rte_device *rte_dev, void *addr,
 		return -1;
 	}
 	mr = mlx5_create_mr_ext(dev->pd, (uintptr_t)addr, len,
-				SOCKET_ID_ANY, dev->mr_scache.reg_mr_cb);
+				SOCKET_ID_ANY);
 	if (!mr) {
 		DRV_LOG(WARNING, "Device %s unable to DMA map", rte_dev->name);
 		rte_errno = EINVAL;
@@ -1165,7 +1165,7 @@ mlx5_common_dev_dma_map(struct rte_device *rte_dev, void *addr,
 		ret = mlx5_mr_expand_cache(&dev->mr_scache, size,
 					   rte_dev->numa_node);
 		if (ret < 0) {
-			mlx5_mr_free(mr, dev->mr_scache.dereg_mr_cb);
+			mlx5_mr_free(mr);
 			rte_errno = ret;
 			return -1;
 		}
@@ -1221,7 +1221,7 @@ mlx5_common_dev_dma_unmap(struct rte_device *rte_dev, void *addr,
 	}
 	LIST_REMOVE(mr, mr);
 	DRV_LOG(DEBUG, "MR(%p) is removed from list.", (void *)mr);
-	mlx5_mr_free(mr, dev->mr_scache.dereg_mr_cb);
+	mlx5_mr_free(mr);
 	mlx5_mr_rebuild_cache(&dev->mr_scache);
 	/*
 	 * No explicit wmb is needed after updating dev_gen due to
diff --git a/drivers/common/mlx5/mlx5_common_mr.c b/drivers/common/mlx5/mlx5_common_mr.c
index 64ffc7f4ea..aa2d5e88a4 100644
--- a/drivers/common/mlx5/mlx5_common_mr.c
+++ b/drivers/common/mlx5/mlx5_common_mr.c
@@ -492,12 +492,12 @@ mlx5_mr_lookup_cache(struct mlx5_mr_share_cache *share_cache,
  *   Pointer to MR to free.
  */
 void
-mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb)
+mlx5_mr_free(struct mlx5_mr *mr)
 {
 	if (mr == NULL)
 		return;
 	DRV_LOG(DEBUG, "freeing MR(%p):", (void *)mr);
-	dereg_mr_cb(&mr->pmd_mr);
+	mlx5_os_dereg_mr(&mr->pmd_mr);
 	rte_bitmap_free(mr->ms_bmp);
 	mlx5_free(mr);
 }
@@ -545,7 +545,7 @@ mlx5_mr_garbage_collect(struct mlx5_mr_share_cache *share_cache)
 		struct mlx5_mr *mr = mr_next;
 
 		mr_next = LIST_NEXT(mr, mr);
-		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
+		mlx5_mr_free(mr);
 	}
 }
 
@@ -821,7 +821,7 @@ mlx5_mr_create_primary(void *pd,
 		data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
 		data.end = data.start + msl->page_sz;
 		rte_mcfg_mem_read_unlock();
-		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
+		mlx5_mr_free(mr);
 		goto alloc_resources;
 	}
 	MLX5_ASSERT(data.msl == data_re.msl);
@@ -845,7 +845,7 @@ mlx5_mr_create_primary(void *pd,
 		 * Must be unlocked before calling rte_free() because
 		 * mlx5_mr_mem_event_free_cb() can be called inside.
 		 */
-		mlx5_mr_free(mr, share_cache->dereg_mr_cb);
+		mlx5_mr_free(mr);
 		return entry->lkey;
 	}
 	/*
@@ -912,7 +912,7 @@ mlx5_mr_create_primary(void *pd,
 	 * mlx5_alloc_buf_extern() which eventually calls rte_malloc_socket()
 	 * through mlx5_alloc_verbs_buf().
 	 */
-	share_cache->reg_mr_cb(pd, (void *)data.start, len, &mr->pmd_mr);
+	mlx5_os_reg_mr(pd, (void *)data.start, len, &mr->pmd_mr);
 	if (mr->pmd_mr.obj == NULL) {
 		DRV_LOG(DEBUG, "Fail to create an MR for address (%p)",
 		      (void *)addr);
@@ -948,7 +948,7 @@ mlx5_mr_create_primary(void *pd,
 	 * calling rte_free() because mlx5_mr_mem_event_free_cb() can be called
 	 * inside.
 	 */
-	mlx5_mr_free(mr, share_cache->dereg_mr_cb);
+	mlx5_mr_free(mr);
 	return UINT32_MAX;
 }
 
@@ -1139,9 +1139,6 @@ mlx5_mr_release_cache(struct mlx5_mr_share_cache *share_cache)
 int
 mlx5_mr_create_cache(struct mlx5_mr_share_cache *share_cache, int socket)
 {
-	/* Set the reg_mr and dereg_mr callback functions */
-	mlx5_os_set_reg_mr_cb(&share_cache->reg_mr_cb,
-			      &share_cache->dereg_mr_cb);
 	rte_rwlock_init(&share_cache->rwlock);
 	rte_rwlock_init(&share_cache->mprwlock);
 	/* Initialize B-tree and allocate memory for global MR cache table. */
@@ -1189,8 +1186,7 @@ mlx5_mr_flush_local_cache(struct mlx5_mr_ctrl *mr_ctrl)
  *   Pointer to MR structure on success, NULL otherwise.
  */
 struct mlx5_mr *
-mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
-		   mlx5_reg_mr_t reg_mr_cb)
+mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id)
 {
 	struct mlx5_mr *mr = NULL;
 
@@ -1199,7 +1195,7 @@ mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
 			 RTE_CACHE_LINE_SIZE, socket_id);
 	if (mr == NULL)
 		return NULL;
-	reg_mr_cb(pd, (void *)addr, len, &mr->pmd_mr);
+	mlx5_os_reg_mr(pd, (void *)addr, len, &mr->pmd_mr);
 	if (mr->pmd_mr.obj == NULL) {
 		DRV_LOG(WARNING,
 			"Fail to create MR for address (%p)",
@@ -1624,14 +1620,13 @@ mlx5_mempool_reg_create(struct rte_mempool *mp, unsigned int mrs_n,
  *   Whether @p mpr owns its MRs exclusively, i.e. they are not shared.
  */
 static void
-mlx5_mempool_reg_destroy(struct mlx5_mr_share_cache *share_cache,
-			 struct mlx5_mempool_reg *mpr, bool standalone)
+mlx5_mempool_reg_destroy(struct mlx5_mempool_reg *mpr, bool standalone)
 {
 	if (standalone) {
 		unsigned int i;
 
 		for (i = 0; i < mpr->mrs_n; i++)
-			share_cache->dereg_mr_cb(&mpr->mrs[i].pmd_mr);
+			mlx5_os_dereg_mr(&mpr->mrs[i].pmd_mr);
 		mlx5_free(mpr->mrs);
 	}
 	mlx5_free(mpr);
@@ -1748,7 +1743,7 @@ mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache,
 		const struct mlx5_range *range = &ranges[i];
 		size_t len = range->end - range->start;
 
-		if (share_cache->reg_mr_cb(pd, (void *)range->start, len,
+		if (mlx5_os_reg_mr(pd, (void *)range->start, len,
 		    &mr->pmd_mr) < 0) {
 			DRV_LOG(ERR,
 				"Failed to create an MR in PD %p for address range "
@@ -1763,7 +1758,7 @@ mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache,
 			mp->name);
 	}
 	if (i != ranges_n) {
-		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
+		mlx5_mempool_reg_destroy(new_mpr, true);
 		rte_errno = EINVAL;
 		goto exit;
 	}
@@ -1785,13 +1780,13 @@ mlx5_mr_mempool_register_primary(struct mlx5_mr_share_cache *share_cache,
 	if (mpr != NULL) {
 		DRV_LOG(DEBUG, "Mempool %s is already registered for PD %p",
 			mp->name, pd);
-		mlx5_mempool_reg_destroy(share_cache, new_mpr, true);
+		mlx5_mempool_reg_destroy(new_mpr, true);
 		rte_errno = EEXIST;
 		goto exit;
 	} else if (old_mpr != NULL) {
 		DRV_LOG(DEBUG, "Mempool %s registration for PD %p updated for external memory",
 			mp->name, pd);
-		mlx5_mempool_reg_destroy(share_cache, old_mpr, standalone);
+		mlx5_mempool_reg_destroy(old_mpr, standalone);
 	}
 exit:
 	free(ranges);
@@ -1860,7 +1855,7 @@ mlx5_mr_mempool_unregister_primary(struct mlx5_mr_share_cache *share_cache,
 		rte_errno = ENOENT;
 		return -1;
 	}
-	mlx5_mempool_reg_destroy(share_cache, mpr, standalone);
+	mlx5_mempool_reg_destroy(mpr, standalone);
 	return 0;
 }
 
diff --git a/drivers/common/mlx5/mlx5_common_mr.h b/drivers/common/mlx5/mlx5_common_mr.h
index 00f3d832c3..5fb931a1b5 100644
--- a/drivers/common/mlx5/mlx5_common_mr.h
+++ b/drivers/common/mlx5/mlx5_common_mr.h
@@ -32,13 +32,6 @@ struct mlx5_pmd_mr {
 	struct mlx5_devx_obj *mkey; /* devx mkey object. */
 };
 
-/**
- * mr operations typedef
- */
-typedef int (*mlx5_reg_mr_t)(void *pd, void *addr, size_t length,
-			     struct mlx5_pmd_mr *pmd_mr);
-typedef void (*mlx5_dereg_mr_t)(struct mlx5_pmd_mr *pmd_mr);
-
 /* Memory Region object. */
 struct mlx5_mr {
 	LIST_ENTRY(mlx5_mr) mr; /**< Pointer to the prev/next entry. */
@@ -88,8 +81,6 @@ struct __rte_packed_begin mlx5_mr_share_cache {
 	struct mlx5_mr_list mr_list; /* Registered MR list. */
 	struct mlx5_mr_list mr_free_list; /* Freed MR list. */
 	struct mlx5_mempool_reg_list mempool_reg_list; /* Mempool database. */
-	mlx5_reg_mr_t reg_mr_cb; /* Callback to reg_mr func */
-	mlx5_dereg_mr_t dereg_mr_cb; /* Callback to dereg_mr func */
 } __rte_packed_end;
 
 /* Multi-Packet RQ buffer header. */
@@ -233,9 +224,8 @@ struct mlx5_mr *
 mlx5_mr_lookup_list(struct mlx5_mr_share_cache *share_cache,
 		    struct mr_cache_entry *entry, uintptr_t addr);
 struct mlx5_mr *
-mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id,
-		   mlx5_reg_mr_t reg_mr_cb);
-void mlx5_mr_free(struct mlx5_mr *mr, mlx5_dereg_mr_t dereg_mr_cb);
+mlx5_create_mr_ext(void *pd, uintptr_t addr, size_t len, int socket_id);
+void mlx5_mr_free(struct mlx5_mr *mr);
 __rte_internal
 uint32_t
 mlx5_mr_create(struct mlx5_common_device *cdev,
@@ -246,19 +236,13 @@ __rte_internal
 uint32_t
 mlx5_mr_addr2mr_bh(struct mlx5_mr_ctrl *mr_ctrl, uintptr_t addr);
 
-/* mlx5_common_verbs.c */
-
 __rte_internal
 int
-mlx5_common_verbs_reg_mr(void *pd, void *addr, size_t length,
-			 struct mlx5_pmd_mr *pmd_mr);
+mlx5_os_reg_mr(void *pd, void *addr, size_t length,
+		struct mlx5_pmd_mr *pmd_mr);
 __rte_internal
 void
-mlx5_common_verbs_dereg_mr(struct mlx5_pmd_mr *pmd_mr);
-
-__rte_internal
-void
-mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb, mlx5_dereg_mr_t *dereg_mr_cb);
+mlx5_os_dereg_mr(struct mlx5_pmd_mr *pmd_mr);
 
 __rte_internal
 struct mlx5_pmd_mr *
diff --git a/drivers/common/mlx5/windows/mlx5_common_os.c b/drivers/common/mlx5/windows/mlx5_common_os.c
index 692517a9bf..bf1b654da3 100644
--- a/drivers/common/mlx5/windows/mlx5_common_os.c
+++ b/drivers/common/mlx5/windows/mlx5_common_os.c
@@ -377,7 +377,8 @@ mlx5_os_umem_dereg(void *pumem)
  * @return
  *   0 on successful registration, -1 otherwise
  */
-static int
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_reg_mr)
+int
 mlx5_os_reg_mr(void *pd,
 	       void *addr, size_t length, struct mlx5_pmd_mr *pmd_mr)
 {
@@ -425,7 +426,8 @@ mlx5_os_reg_mr(void *pd,
  * @param[in] pmd_mr
  *  Pointer to PMD mr object
  */
-static void
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_dereg_mr)
+void
 mlx5_os_dereg_mr(struct mlx5_pmd_mr *pmd_mr)
 {
 	if (!pmd_mr)
@@ -437,23 +439,6 @@ mlx5_os_dereg_mr(struct mlx5_pmd_mr *pmd_mr)
 	memset(pmd_mr, 0, sizeof(*pmd_mr));
 }
 
-/**
- * Set the reg_mr and dereg_mr callbacks.
- *
- * @param[out] reg_mr_cb
- *   Pointer to reg_mr func
- * @param[out] dereg_mr_cb
- *   Pointer to dereg_mr func
- *
- */
-RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_set_reg_mr_cb)
-void
-mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb, mlx5_dereg_mr_t *dereg_mr_cb)
-{
-	*reg_mr_cb = mlx5_os_reg_mr;
-	*dereg_mr_cb = mlx5_os_dereg_mr;
-}
-
 RTE_EXPORT_INTERNAL_SYMBOL(mlx5_os_alloc_null_mr)
 struct mlx5_pmd_mr *
 mlx5_os_alloc_null_mr(struct rte_device *dev, void *pd)
diff --git a/drivers/compress/mlx5/mlx5_compress.c b/drivers/compress/mlx5/mlx5_compress.c
index e5325c6150..1361dab630 100644
--- a/drivers/compress/mlx5/mlx5_compress.c
+++ b/drivers/compress/mlx5/mlx5_compress.c
@@ -117,7 +117,7 @@ mlx5_compress_qp_release(struct rte_compressdev *dev, uint16_t qp_id)
 	if (qp->opaque_mr.obj != NULL) {
 		void *opaq = qp->opaque_mr.addr;
 
-		mlx5_common_verbs_dereg_mr(&qp->opaque_mr);
+		mlx5_os_dereg_mr(&qp->opaque_mr);
 		rte_free(opaq);
 	}
 	mlx5_mr_btree_free(&qp->mr_ctrl.cache_bh);
@@ -199,7 +199,7 @@ mlx5_compress_qp_setup(struct rte_compressdev *dev, uint16_t qp_id,
 	qp->priv = priv;
 	qp->ops = (struct rte_comp_op **)RTE_ALIGN((uintptr_t)(qp + 1),
 						   RTE_CACHE_LINE_SIZE);
-	if (mlx5_common_verbs_reg_mr(priv->cdev->pd, opaq_buf, qp->entries_n *
+	if (mlx5_os_reg_mr(priv->cdev->pd, opaq_buf, qp->entries_n *
 					sizeof(union mlx5_gga_compress_opaque),
 							 &qp->opaque_mr) != 0) {
 		rte_free(opaq_buf);
diff --git a/drivers/crypto/mlx5/mlx5_crypto.h b/drivers/crypto/mlx5/mlx5_crypto.h
index f9f127e9e6..93a2bb2c78 100644
--- a/drivers/crypto/mlx5/mlx5_crypto.h
+++ b/drivers/crypto/mlx5/mlx5_crypto.h
@@ -40,8 +40,6 @@ struct mlx5_crypto_priv {
 	TAILQ_ENTRY(mlx5_crypto_priv) next;
 	struct mlx5_common_device *cdev; /* Backend mlx5 device. */
 	struct rte_cryptodev *crypto_dev;
-	mlx5_reg_mr_t reg_mr_cb; /* Callback to reg_mr func */
-	mlx5_dereg_mr_t dereg_mr_cb; /* Callback to dereg_mr func */
 	struct mlx5_uar uar; /* User Access Region. */
 	uint32_t max_segs_num; /* Maximum supported data segs. */
 	uint32_t max_klm_num; /* Maximum supported klm. */
diff --git a/drivers/crypto/mlx5/mlx5_crypto_gcm.c b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
index 89f32c7722..1a2600655a 100644
--- a/drivers/crypto/mlx5/mlx5_crypto_gcm.c
+++ b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
@@ -219,7 +219,6 @@ mlx5_crypto_gcm_mkey_klm_update(struct mlx5_crypto_priv *priv,
 static int
 mlx5_crypto_gcm_qp_release(struct rte_cryptodev *dev, uint16_t qp_id)
 {
-	struct mlx5_crypto_priv *priv = dev->data->dev_private;
 	struct mlx5_crypto_qp *qp = dev->data->queue_pairs[qp_id];
 
 	if (qp->umr_qp_obj.qp != NULL)
@@ -231,7 +230,7 @@ mlx5_crypto_gcm_qp_release(struct rte_cryptodev *dev, uint16_t qp_id)
 	if (qp->mr.obj != NULL) {
 		void *opaq = qp->mr.addr;
 
-		priv->dereg_mr_cb(&qp->mr);
+		mlx5_os_dereg_mr(&qp->mr);
 		rte_free(opaq);
 	}
 	mlx5_crypto_indirect_mkeys_release(qp, qp->entries_n);
@@ -363,7 +362,7 @@ mlx5_crypto_gcm_qp_setup(struct rte_cryptodev *dev, uint16_t qp_id,
 		rte_errno = ENOMEM;
 		goto err;
 	}
-	if (priv->reg_mr_cb(priv->cdev->pd, mr_buf, mr_size, &qp->mr) != 0) {
+	if (mlx5_os_reg_mr(priv->cdev->pd, mr_buf, mr_size, &qp->mr) != 0) {
 		rte_free(mr_buf);
 		DRV_LOG(ERR, "Failed to register opaque MR.");
 		rte_errno = ENOMEM;
@@ -1186,7 +1185,6 @@ mlx5_crypto_gcm_init(struct mlx5_crypto_priv *priv)
 
 	/* Override AES-GCM specified ops. */
 	dev_ops->sym_session_configure = mlx5_crypto_sym_gcm_session_configure;
-	mlx5_os_set_reg_mr_cb(&priv->reg_mr_cb, &priv->dereg_mr_cb);
 	dev_ops->queue_pair_setup = mlx5_crypto_gcm_qp_setup;
 	dev_ops->queue_pair_release = mlx5_crypto_gcm_qp_release;
 	if (mlx5_crypto_is_ipsec_opt(priv)) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index bd6ef35b53..a4d5392e8f 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -2706,8 +2706,7 @@ int mlx5_aso_cnt_query(struct mlx5_dev_ctx_shared *sh,
 int mlx5_aso_ct_queue_init(struct mlx5_dev_ctx_shared *sh,
 			   struct mlx5_aso_ct_pools_mng *ct_mng,
 			   uint32_t nb_queues);
-int mlx5_aso_ct_queue_uninit(struct mlx5_dev_ctx_shared *sh,
-			     struct mlx5_aso_ct_pools_mng *ct_mng);
+int mlx5_aso_ct_queue_uninit(struct mlx5_aso_ct_pools_mng *ct_mng);
 int
 mlx5_aso_sq_create(struct mlx5_common_device *cdev, struct mlx5_aso_sq *sq,
 		   void *uar, uint16_t log_desc_n);
diff --git a/drivers/net/mlx5/mlx5_flow_aso.c b/drivers/net/mlx5/mlx5_flow_aso.c
index 5e2a81ef9c..cd84ab1966 100644
--- a/drivers/net/mlx5/mlx5_flow_aso.c
+++ b/drivers/net/mlx5/mlx5_flow_aso.c
@@ -19,17 +19,15 @@
 /**
  * Free MR resources.
  *
- * @param[in] cdev
- *   Pointer to the mlx5 common device.
  * @param[in] mr
  *   MR to free.
  */
 static void
-mlx5_aso_dereg_mr(struct mlx5_common_device *cdev, struct mlx5_pmd_mr *mr)
+mlx5_aso_dereg_mr(struct mlx5_pmd_mr *mr)
 {
 	void *addr = mr->addr;
 
-	cdev->mr_scache.dereg_mr_cb(mr);
+	mlx5_os_dereg_mr(mr);
 	mlx5_free(addr);
 	memset(mr, 0, sizeof(*mr));
 }
@@ -59,7 +57,7 @@ mlx5_aso_reg_mr(struct mlx5_common_device *cdev, size_t length,
 		DRV_LOG(ERR, "Failed to create ASO bits mem for MR.");
 		return -1;
 	}
-	ret = cdev->mr_scache.reg_mr_cb(cdev->pd, mr->addr, length, mr);
+	ret = mlx5_os_reg_mr(cdev->pd, mr->addr, length, mr);
 	if (ret) {
 		DRV_LOG(ERR, "Failed to create direct Mkey.");
 		mlx5_free(mr->addr);
@@ -362,7 +360,7 @@ mlx5_aso_queue_init(struct mlx5_dev_ctx_shared *sh,
 		if (mlx5_aso_sq_create(cdev, &sh->aso_age_mng->aso_sq,
 				       sh->tx_uar.obj,
 				       MLX5_ASO_QUEUE_LOG_DESC)) {
-			mlx5_aso_dereg_mr(cdev, &sh->aso_age_mng->aso_sq.mr);
+			mlx5_aso_dereg_mr(&sh->aso_age_mng->aso_sq.mr);
 			return -1;
 		}
 		mlx5_aso_age_init_sq(&sh->aso_age_mng->aso_sq);
@@ -399,14 +397,14 @@ mlx5_aso_queue_uninit(struct mlx5_dev_ctx_shared *sh,
 
 	switch (aso_opc_mod) {
 	case ASO_OPC_MOD_FLOW_HIT:
-		mlx5_aso_dereg_mr(sh->cdev, &sh->aso_age_mng->aso_sq.mr);
+		mlx5_aso_dereg_mr(&sh->aso_age_mng->aso_sq.mr);
 		sq = &sh->aso_age_mng->aso_sq;
 		break;
 	case ASO_OPC_MOD_POLICER:
 		mlx5_aso_mtr_queue_uninit(sh, NULL, &sh->mtrmng->pools_mng);
 		break;
 	case ASO_OPC_MOD_CONNECTION_TRACKING:
-		mlx5_aso_ct_queue_uninit(sh, sh->ct_mng);
+		mlx5_aso_ct_queue_uninit(sh->ct_mng);
 		break;
 	default:
 		DRV_LOG(ERR, "Unknown ASO operation mode");
@@ -1147,15 +1145,14 @@ __mlx5_aso_ct_get_pool(struct mlx5_dev_ctx_shared *sh,
 }
 
 int
-mlx5_aso_ct_queue_uninit(struct mlx5_dev_ctx_shared *sh,
-			 struct mlx5_aso_ct_pools_mng *ct_mng)
+mlx5_aso_ct_queue_uninit(struct mlx5_aso_ct_pools_mng *ct_mng)
 {
 	uint32_t i;
 
 	/* 64B per object for query. */
 	for (i = 0; i < ct_mng->nb_sq; i++) {
 		if (ct_mng->aso_sqs[i].mr.addr)
-			mlx5_aso_dereg_mr(sh->cdev, &ct_mng->aso_sqs[i].mr);
+			mlx5_aso_dereg_mr(&ct_mng->aso_sqs[i].mr);
 		mlx5_aso_destroy_sq(&ct_mng->aso_sqs[i]);
 	}
 	return 0;
@@ -1197,7 +1194,7 @@ mlx5_aso_ct_queue_init(struct mlx5_dev_ctx_shared *sh,
 error:
 	do {
 		if (ct_mng->aso_sqs[i].mr.addr)
-			mlx5_aso_dereg_mr(sh->cdev, &ct_mng->aso_sqs[i].mr);
+			mlx5_aso_dereg_mr(&ct_mng->aso_sqs[i].mr);
 		mlx5_aso_destroy_sq(&ct_mng->aso_sqs[i]);
 	} while (i--);
 	ct_mng->nb_sq = 0;
diff --git a/drivers/net/mlx5/mlx5_flow_hw.c b/drivers/net/mlx5/mlx5_flow_hw.c
index b6bb9f12a6..7cc601d681 100644
--- a/drivers/net/mlx5/mlx5_flow_hw.c
+++ b/drivers/net/mlx5/mlx5_flow_hw.c
@@ -11086,12 +11086,9 @@ flow_hw_create_nic_ctrl_tables(struct rte_eth_dev *dev, struct rte_flow_error *e
 }
 
 static void
-flow_hw_ct_mng_destroy(struct rte_eth_dev *dev,
-		       struct mlx5_aso_ct_pools_mng *ct_mng)
+flow_hw_ct_mng_destroy(struct mlx5_aso_ct_pools_mng *ct_mng)
 {
-	struct mlx5_priv *priv = dev->data->dev_private;
-
-	mlx5_aso_ct_queue_uninit(priv->sh, ct_mng);
+	mlx5_aso_ct_queue_uninit(ct_mng);
 	mlx5_free(ct_mng);
 }
 
@@ -11230,7 +11227,7 @@ mlx5_flow_ct_init(struct rte_eth_dev *dev,
 		priv->hws_ctpool = NULL;
 	}
 	if (priv->ct_mng) {
-		flow_hw_ct_mng_destroy(dev, priv->ct_mng);
+		flow_hw_ct_mng_destroy(priv->ct_mng);
 		priv->ct_mng = NULL;
 	}
 	return ret;
@@ -11804,7 +11801,7 @@ __mlx5_flow_hw_resource_release(struct rte_eth_dev *dev, bool ctx_close)
 		priv->hws_ctpool = NULL;
 	}
 	if (priv->ct_mng) {
-		flow_hw_ct_mng_destroy(dev, priv->ct_mng);
+		flow_hw_ct_mng_destroy(priv->ct_mng);
 		priv->ct_mng = NULL;
 	}
 	mlx5_flow_quota_destroy(dev);
diff --git a/drivers/net/mlx5/mlx5_flow_quota.c b/drivers/net/mlx5/mlx5_flow_quota.c
index d94167d0b0..b661bd376e 100644
--- a/drivers/net/mlx5/mlx5_flow_quota.c
+++ b/drivers/net/mlx5/mlx5_flow_quota.c
@@ -412,12 +412,11 @@ mlx5_quota_alloc_sq(struct mlx5_priv *priv)
 static void
 mlx5_quota_destroy_read_buf(struct mlx5_priv *priv)
 {
-	struct mlx5_dev_ctx_shared *sh = priv->sh;
 	struct mlx5_quota_ctx *qctx = &priv->quota_ctx;
 
 	if (qctx->mr.lkey) {
 		void *addr = qctx->mr.addr;
-		sh->cdev->mr_scache.dereg_mr_cb(&qctx->mr);
+		mlx5_os_dereg_mr(&qctx->mr);
 		mlx5_free(addr);
 	}
 	if (qctx->read_buf)
@@ -446,8 +445,7 @@ mlx5_quota_alloc_read_buf(struct mlx5_priv *priv)
 		DRV_LOG(DEBUG, "QUOTA: failed to allocate MTR ASO READ buffer [1]");
 		return -ENOMEM;
 	}
-	ret = sh->cdev->mr_scache.reg_mr_cb(sh->cdev->pd, buf,
-					    rd_buf_size, &qctx->mr);
+	ret = mlx5_os_reg_mr(sh->cdev->pd, buf, rd_buf_size, &qctx->mr);
 	if (ret) {
 		DRV_LOG(DEBUG, "QUOTA: failed to register MTR ASO READ MR");
 		return -errno;
diff --git a/drivers/net/mlx5/mlx5_hws_cnt.c b/drivers/net/mlx5/mlx5_hws_cnt.c
index 1b6acb7a3b..d0c4ead71b 100644
--- a/drivers/net/mlx5/mlx5_hws_cnt.c
+++ b/drivers/net/mlx5/mlx5_hws_cnt.c
@@ -259,12 +259,11 @@ mlx5_hws_aging_check(struct mlx5_priv *priv, struct mlx5_hws_cnt_pool *cpool)
 }
 
 static void
-mlx5_hws_cnt_raw_data_free(struct mlx5_dev_ctx_shared *sh,
-			   struct mlx5_hws_cnt_raw_data_mng *mng)
+mlx5_hws_cnt_raw_data_free(struct mlx5_hws_cnt_raw_data_mng *mng)
 {
 	if (mng == NULL)
 		return;
-	sh->cdev->mr_scache.dereg_mr_cb(&mng->mr);
+	mlx5_os_dereg_mr(&mng->mr);
 	mlx5_free(mng->raw);
 	mlx5_free(mng);
 }
@@ -296,8 +295,7 @@ mlx5_hws_cnt_raw_data_alloc(struct mlx5_dev_ctx_shared *sh, uint32_t n,
 				   NULL, "failed to allocate raw counters memory");
 		goto error;
 	}
-	ret = sh->cdev->mr_scache.reg_mr_cb(sh->cdev->pd, mng->raw, sz,
-					    &mng->mr);
+	ret = mlx5_os_reg_mr(sh->cdev->pd, mng->raw, sz, &mng->mr);
 	if (ret) {
 		rte_flow_error_set(error, errno,
 				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
@@ -306,7 +304,7 @@ mlx5_hws_cnt_raw_data_alloc(struct mlx5_dev_ctx_shared *sh, uint32_t n,
 	}
 	return mng;
 error:
-	mlx5_hws_cnt_raw_data_free(sh, mng);
+	mlx5_hws_cnt_raw_data_free(mng);
 	return NULL;
 }
 
@@ -639,8 +637,7 @@ mlx5_hws_cnt_pool_dcs_alloc(struct mlx5_dev_ctx_shared *sh,
 }
 
 static void
-mlx5_hws_cnt_pool_dcs_free(struct mlx5_dev_ctx_shared *sh,
-			   struct mlx5_hws_cnt_pool *cpool)
+mlx5_hws_cnt_pool_dcs_free(struct mlx5_hws_cnt_pool *cpool)
 {
 	uint32_t idx;
 
@@ -649,7 +646,7 @@ mlx5_hws_cnt_pool_dcs_free(struct mlx5_dev_ctx_shared *sh,
 	for (idx = 0; idx < MLX5_HWS_CNT_DCS_NUM; idx++)
 		mlx5_devx_cmd_destroy(cpool->dcs_mng.dcs[idx].obj);
 	if (cpool->raw_mng) {
-		mlx5_hws_cnt_raw_data_free(sh, cpool->raw_mng);
+		mlx5_hws_cnt_raw_data_free(cpool->raw_mng);
 		cpool->raw_mng = NULL;
 	}
 }
@@ -842,8 +839,8 @@ mlx5_hws_cnt_pool_destroy(struct mlx5_dev_ctx_shared *sh,
 	}
 	mlx5_hws_cnt_pool_action_destroy(cpool);
 	if (cpool->cfg.host_cpool == NULL) {
-		mlx5_hws_cnt_pool_dcs_free(sh, cpool);
-		mlx5_hws_cnt_raw_data_free(sh, cpool->raw_mng);
+		mlx5_hws_cnt_pool_dcs_free(cpool);
+		mlx5_hws_cnt_raw_data_free(cpool->raw_mng);
 	}
 	mlx5_free((void *)cpool->cfg.name);
 	mlx5_hws_cnt_pool_deinit(cpool);
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 09/10] dts: fix topology capability comparison
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, stable, Luca Vizzarro, Patrick Robb,
	Jeremy Spewock, Dean Marx, Juraj Linkeš
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

TopologyCapability.__gt__() was delegating to __lt__(),
which caused infinite recursion when "other" is not a TopologyCapability:
other.__lt__(self) returns NotImplemented,
Python retries with self.__gt__(other),
and the cycle repeats.

dts/framework/testbed_model/capability.py", line 579, in __gt__
        return other < self
               ^^^^^^^^^^^^
    RecursionError: maximum recursion depth exceeded

Similarly, __le__() was delegating to "not __gt__()",
which returns True for non-comparable types instead of False.

Fix both by checking is_comparable_with() first
and comparing topology_type directly, consistent with __lt__().

Fixes: 039256daa8bf ("dts: add topology capability")
Cc: stable@dpdk.org

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 dts/framework/testbed_model/capability.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dts/framework/testbed_model/capability.py b/dts/framework/testbed_model/capability.py
index 960370fc72..96e1cd449f 100644
--- a/dts/framework/testbed_model/capability.py
+++ b/dts/framework/testbed_model/capability.py
@@ -574,7 +574,9 @@ def __gt__(self, other: Any) -> bool:
         Returns:
             :data:`True` if the instance's topology type is more complex than the compared object's.
         """
-        return other < self
+        if not self.is_comparable_with(other):
+            return False
+        return self.topology_type > other.topology_type
 
     def __le__(self, other: Any) -> bool:
         """Compare the :attr:`~TopologyCapability.topology_type`s.
@@ -586,7 +588,9 @@ def __le__(self, other: Any) -> bool:
             :data:`True` if the instance's topology type is less complex or equal than
             the compared object's.
         """
-        return not self > other
+        if not self.is_comparable_with(other):
+            return False
+        return self.topology_type <= other.topology_type
 
     def __hash__(self):
         """Each instance is identified by :attr:`topology_type`."""
-- 
2.54.0


^ permalink raw reply related

* [PATCH v7 10/10] dts: add selective Rx tests
From: Thomas Monjalon @ 2026-06-03 18:23 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Luca Vizzarro, Patrick Robb
In-Reply-To: <20260603184503.652030-1-thomas@monjalon.net>

Add TestSuite_rx_split with 7 test cases:
- 3 positive: headers only, payload only, two non-contiguous segments
- 4 negative: missing offload flag, out-of-range, overlap, all-discard

Add selective Rx capability detection via testpmd "show port info".

The test suite could be completed later for the basic buffer split
configuration based on offsets or protocols.

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 dts/api/capabilities.py                   |   2 +
 dts/api/testpmd/__init__.py               |  17 ++
 dts/api/testpmd/types.py                  |   6 +
 dts/framework/testbed_model/capability.py |   2 +
 dts/tests/TestSuite_rx_split.py           | 262 ++++++++++++++++++++++
 5 files changed, 289 insertions(+)
 create mode 100644 dts/tests/TestSuite_rx_split.py

diff --git a/dts/api/capabilities.py b/dts/api/capabilities.py
index 09bc538523..b0c1d81d36 100644
--- a/dts/api/capabilities.py
+++ b/dts/api/capabilities.py
@@ -136,6 +136,8 @@ class NicCapability(IntEnum):
     #: Device supports all VLAN capabilities.
     PORT_RX_OFFLOAD_VLAN = auto()
     QUEUE_RX_OFFLOAD_VLAN = auto()
+    #: Device supports selective Rx.
+    SELECTIVE_RX = auto()
     #: Device supports Rx queue setup after device started.
     RUNTIME_RX_QUEUE_SETUP = auto()
     #: Device supports Tx queue setup after device started.
diff --git a/dts/api/testpmd/__init__.py b/dts/api/testpmd/__init__.py
index e9187440bb..6973a64573 100644
--- a/dts/api/testpmd/__init__.py
+++ b/dts/api/testpmd/__init__.py
@@ -1409,6 +1409,23 @@ def get_capabilities_show_port_info(
             self.ports[0].device_capabilities,
         )
 
+    def get_capabilities_selective_rx(
+        self,
+        supported_capabilities: MutableSet["NicCapability"],
+        unsupported_capabilities: MutableSet["NicCapability"],
+    ) -> None:
+        """Get selective Rx capability from show port info.
+
+        Args:
+            supported_capabilities: Supported capabilities will be added to this set.
+            unsupported_capabilities: Unsupported capabilities will be added to this set.
+        """
+        port_info = self.show_port_info(self.ports[0].id)
+        if port_info.selective_rx:
+            supported_capabilities.add(NicCapability.SELECTIVE_RX)
+        else:
+            unsupported_capabilities.add(NicCapability.SELECTIVE_RX)
+
     def get_capabilities_mcast_filtering(
         self,
         supported_capabilities: MutableSet["NicCapability"],
diff --git a/dts/api/testpmd/types.py b/dts/api/testpmd/types.py
index 0d322aece2..6f1eaf47cc 100644
--- a/dts/api/testpmd/types.py
+++ b/dts/api/testpmd/types.py
@@ -614,6 +614,12 @@ def _validate(info: str) -> str | None:
         metadata=VLANOffloadFlag.make_parser(),
     )
 
+    #: Selective Rx support
+    selective_rx: bool = field(
+        default=False,
+        metadata=TextParser.find(r"Selective Rx: supported"),
+    )
+
     #: Maximum size of RX buffer
     max_rx_bufsize: int | None = field(
         default=None, metadata=TextParser.find_int(r"Maximum size of RX buffer: (\d+)")
diff --git a/dts/framework/testbed_model/capability.py b/dts/framework/testbed_model/capability.py
index 96e1cd449f..b10799ea4b 100644
--- a/dts/framework/testbed_model/capability.py
+++ b/dts/framework/testbed_model/capability.py
@@ -324,6 +324,8 @@ def mapping(cap: NicCapability) -> TestPmdNicCapability:
                     | NicCapability.FLOW_SHARED_OBJECT_KEEP
                 ):
                     return (TestPmd.get_capabilities_show_port_info, None)
+                case NicCapability.SELECTIVE_RX:
+                    return (TestPmd.get_capabilities_selective_rx, None)
                 case NicCapability.MCAST_FILTERING:
                     return (TestPmd.get_capabilities_mcast_filtering, None)
                 case NicCapability.FLOW_CTRL:
diff --git a/dts/tests/TestSuite_rx_split.py b/dts/tests/TestSuite_rx_split.py
new file mode 100644
index 0000000000..e12fe1a828
--- /dev/null
+++ b/dts/tests/TestSuite_rx_split.py
@@ -0,0 +1,262 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2026 NVIDIA Corporation & Affiliates
+
+"""Rx split test suite.
+
+Test configuring a packet split on Rx,
+and discarding some segments (selective Rx) at NIC level.
+"""
+
+from typing import Any
+
+from scapy.layers.inet import IP
+from scapy.layers.l2 import Ether
+from scapy.packet import Packet, Raw
+
+from api.capabilities import (
+    NicCapability,
+    requires_nic_capability,
+)
+from api.packet import send_packet_and_capture
+from api.test import fail, verify
+from api.testpmd import TestPmd
+from api.testpmd.config import SimpleForwardingModes
+from api.testpmd.types import RxOffloadCapability, TxOffloadCapability
+from framework.exception import InteractiveCommandExecutionError
+from framework.test_suite import TestSuite, func_test
+
+PAYLOAD = bytes(range(256))
+ETHER_HDR_LEN = len(Ether())
+IP_HDR_LEN = len(IP())
+ETHER_IP_HDR_LEN = ETHER_HDR_LEN + IP_HDR_LEN
+
+
+@requires_nic_capability(NicCapability.PORT_RX_OFFLOAD_BUFFER_SPLIT)
+@requires_nic_capability(NicCapability.SELECTIVE_RX)
+class TestRxSplit(TestSuite):
+    """Rx split test suite.
+
+    Configure testpmd with various Rx segment offset/length combinations
+    and verify that only the requested portions of the packet are received
+    and forwarded.
+    """
+
+    def _create_testpmd(self, **kwargs: Any) -> TestPmd:
+        """Create a TestPmd instance with defaults overridden by kwargs."""
+        defaults: dict[str, Any] = {
+            "forward_mode": SimpleForwardingModes.mac,
+            "rx_offloads": RxOffloadCapability.BUFFER_SPLIT,
+            "enable_scatter": True,
+        }
+        return TestPmd(**{**defaults, **kwargs})
+
+    def _build_packet(self) -> Packet:
+        """Build a test packet with an incrementing byte pattern payload."""
+        return Ether() / IP() / Raw(load=PAYLOAD)
+
+    def _send_and_verify(
+        self,
+        testpmd: TestPmd,
+        packet: Packet,
+        expected_bytes: bytes,
+    ) -> None:
+        """Clear stats, send a packet, and verify received content and stats.
+
+        Args:
+            testpmd: The running testpmd instance.
+            packet: The packet to send.
+            expected_bytes: Expected raw bytes of the received packet.
+        """
+        expected_len = len(expected_bytes)
+        testpmd.clear_port_stats_all(verify=False)
+
+        received = send_packet_and_capture(packet)
+        verify(
+            len(received) > 0,
+            "Did not receive any packets.",
+        )
+
+        recv_bytes = bytes(received[0])
+        verify(
+            len(recv_bytes) == expected_len,
+            f"Expected packet length {expected_len}, got {len(recv_bytes)}.",
+        )
+        verify(
+            recv_bytes == expected_bytes,
+            "Received packet content does not match expected bytes.",
+        )
+
+        all_stats, _ = testpmd.show_port_stats_all()
+        total_rx_packets = sum(s.rx_packets for s in all_stats)
+        total_rx_bytes = sum(s.rx_bytes for s in all_stats)
+        verify(
+            total_rx_packets == 1,
+            f"Expected 1 Rx packet, got {total_rx_packets}.",
+        )
+        verify(
+            total_rx_bytes == expected_len,
+            f"Expected {expected_len} Rx bytes, got {total_rx_bytes}.",
+        )
+
+    @func_test
+    def selective_rx_headers(self) -> None:
+        """Keep only the Ethernet + IP headers, discard the payload.
+
+        Steps:
+            Start testpmd with rxoffs/rxpkts and buffer split enabled.
+            Send an Ether/IP/payload packet.
+
+        Verify:
+            Received packet has Ether + IP headers only.
+            Port stats show expected rx_packets and rx_bytes.
+        """
+        with self._create_testpmd(
+            rx_segments_offsets=[0],
+            rx_segments_length=[ETHER_IP_HDR_LEN],
+        ) as testpmd:
+            testpmd.start()
+            packet = self._build_packet()
+            expected = bytes(packet)[:ETHER_IP_HDR_LEN]
+            self._send_and_verify(testpmd, packet, expected)
+
+    @func_test
+    def selective_rx_payload_only(self) -> None:
+        """Skip the Ethernet + IP headers, keep only the payload.
+
+        Steps:
+            Start testpmd with rxoffs/rxpkts and buffer split enabled.
+            Send an Ether/IP/payload packet.
+
+        Verify:
+            Received packet is matching the original payload.
+            Port stats show expected rx_packets and rx_bytes.
+        """
+        with self._create_testpmd(
+            rx_segments_offsets=[ETHER_IP_HDR_LEN],
+            rx_segments_length=[len(PAYLOAD)],
+        ) as testpmd:
+            testpmd.start()
+            self._send_and_verify(testpmd, self._build_packet(), PAYLOAD)
+
+    @func_test
+    def selective_rx_two_segments(self) -> None:
+        """Keep the IP header and the middle of the payload, skip the rest.
+
+        Steps:
+            Start testpmd with rxoffs/rxpkts, buffer split
+            and multi-segment Tx enabled.
+            Send an Ether/IP/payload packet.
+
+        Verify:
+            Received packet is matching the IP header and middle of payload.
+            Port stats show expected rx_packets and rx_bytes.
+        """
+        payload_offset = 100
+        payload_length = 100
+        with self._create_testpmd(
+            tx_offloads=TxOffloadCapability.MULTI_SEGS,
+            rx_segments_offsets=[ETHER_HDR_LEN, ETHER_IP_HDR_LEN + payload_offset],
+            rx_segments_length=[IP_HDR_LEN, payload_length],
+        ) as testpmd:
+            testpmd.start()
+            packet = self._build_packet()
+            raw = bytes(packet)
+            payload_start = ETHER_IP_HDR_LEN + payload_offset
+            expected = (
+                raw[ETHER_HDR_LEN:ETHER_IP_HDR_LEN]
+                + raw[payload_start : payload_start + payload_length]
+            )
+            self._send_and_verify(testpmd, packet, expected)
+
+    @func_test
+    def selective_rx_no_offload(self) -> None:
+        """Configure selective Rx with buffer split disabled.
+
+        Steps:
+            Start testpmd with rxoffs/rxpkts, buffer split
+            and device start disabled.
+            Attempt to start ports.
+
+        Verify:
+            Queue configuration fails.
+        """
+        with self._create_testpmd(
+            rx_offloads=None,
+            rx_segments_offsets=[0],
+            rx_segments_length=[ETHER_IP_HDR_LEN],
+            disable_device_start=True,
+        ) as testpmd:
+            try:
+                testpmd.start_all_ports()
+                fail("Expected configuration to fail with buffer split disabled.")
+            except InteractiveCommandExecutionError:
+                pass
+
+    @func_test
+    def selective_rx_offset_out_of_range(self) -> None:
+        """Configure selective Rx with an offset beyond max_rx_pktlen.
+
+        Steps:
+            Start testpmd with rxoffs too big, buffer split enabled,
+            and device start disabled.
+            Attempt to start ports.
+
+        Verify:
+            Queue configuration fails.
+        """
+        with self._create_testpmd(
+            rx_segments_offsets=[20000],
+            rx_segments_length=[100],
+            disable_device_start=True,
+        ) as testpmd:
+            try:
+                testpmd.start_all_ports()
+                fail("Expected configuration to fail with out-of-range offset.")
+            except InteractiveCommandExecutionError:
+                pass
+
+    @func_test
+    def selective_rx_overlap(self) -> None:
+        """Configure selective Rx with overlapping segments.
+
+        Steps:
+            Start testpmd with overlapping rxoffs/rxpkts, buffer split enabled,
+            and device start disabled.
+            Attempt to start ports.
+
+        Verify:
+            Queue configuration fails.
+        """
+        with self._create_testpmd(
+            rx_segments_offsets=[0, 10],
+            rx_segments_length=[64, 64],
+            disable_device_start=True,
+        ) as testpmd:
+            try:
+                testpmd.start_all_ports()
+                fail("Expected configuration to fail with overlapping segments.")
+            except InteractiveCommandExecutionError:
+                pass
+
+    @func_test
+    def selective_rx_all_discard(self) -> None:
+        """Configure selective Rx with only discard segment.
+
+        Steps:
+            Start testpmd with rxoffs/rxpkts=0 (null segment), buffer split enabled,
+            and device start disabled.
+            Attempt to start ports.
+
+        Verify:
+            Queue configuration fails.
+        """
+        with self._create_testpmd(
+            rx_segments_offsets=[0],
+            rx_segments_length=[0],
+            disable_device_start=True,
+        ) as testpmd:
+            try:
+                testpmd.start_all_ports()
+                fail("Expected configuration to fail with only discard segment.")
+            except InteractiveCommandExecutionError:
+                pass
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v2 0/2] test: cleanup assertion macros
From: Thomas Monjalon @ 2026-06-03 19:25 UTC (permalink / raw)
  To: Stephen Hemminger, Weijun Pan; +Cc: dev
In-Reply-To: <20260501163533.2689152-1-stephen@networkplumber.org>

> Stephen Hemminger (1):
>   test: use inline helpers in buffer comparison macros
> 
> Weijun Pan (1):
>   test: parenthesize assertion macro parameters

Applied, thanks.




^ permalink raw reply

* Re: [PATCH 0/4] remove useless null checks before free
From: Thomas Monjalon @ 2026-06-03 19:51 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev
In-Reply-To: <20260401164108.201404-1-stephen@networkplumber.org>

01/04/2026 18:40, Stephen Hemminger:
> Some small cleanups found by coccinelle null free script
> 
> Stephen Hemminger (4):
>   net/ice: remove unnecessary null check
>   net/bnxt: remove unnecessary null check
>   net/zxdh: remove unnecessary null check
>   app/test_compress: remove unnecessary null check

zxdh was merged in next-net

The rest is applied, thanks.




^ permalink raw reply

* [PATCH v3 4/4] net/bnxt: fix RSS hash mode configuration for VFs
From: Mohammad Shuab Siddique @ 2026-06-03 20:02 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Mohammad Shuab Siddique
In-Reply-To: <20260603175137.1990204-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>

Fixed VFs attempting global RSS configuration which is not
permitted by firmware. VFs (including trusted VFs) must use
per-VNIC RSS configuration with actual vnic_id and rss_ctx_idx
values.

Fixes: 8b9adaf0da6b ("net/bnxt: support RSS on ESP and AH headers")
Cc: stable@dpdk.org
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt_hwrm.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c
index 0c82935de9..afc948ac29 100644
--- a/drivers/net/bnxt/bnxt_hwrm.c
+++ b/drivers/net/bnxt/bnxt_hwrm.c
@@ -2970,8 +2970,22 @@ bnxt_hwrm_vnic_rss_cfg_hash_mode_p5(struct bnxt *bp, struct bnxt_vnic_info *vnic
 		req.hash_mode_flags = BNXT_HASH_MODE_INNERMOST;
 	else
 		req.hash_mode_flags = vnic->hash_mode;
-	req.vnic_id = rte_cpu_to_le_16(BNXT_DFLT_VNIC_ID_INVALID);
-	req.rss_ctx_idx = rte_cpu_to_le_16(BNXT_RSS_CTX_IDX_INVALID);
+
+	/* VFs must use actual vnic_id for per-VNIC configuration.
+	 * PFs can use INVALID vnic_id for global configuration.
+	 * This is because VFs don't have permission to configure
+	 * global hash mode, even if they're trusted.
+	 */
+	if (BNXT_VF(bp)) {
+		req.vnic_id = rte_cpu_to_le_16(vnic->fw_vnic_id);
+		req.rss_ctx_idx = rte_cpu_to_le_16(vnic->fw_grp_ids[0]);
+		PMD_DRV_LOG_LINE(DEBUG, "VF using per-VNIC RSS config (vnic_id=%u)",
+				 vnic->fw_vnic_id);
+	} else {
+		req.vnic_id = rte_cpu_to_le_16(BNXT_DFLT_VNIC_ID_INVALID);
+		req.rss_ctx_idx = rte_cpu_to_le_16(BNXT_RSS_CTX_IDX_INVALID);
+		PMD_DRV_LOG_LINE(DEBUG, "PF using global RSS config");
+	}
 
 	PMD_DRV_LOG_LINE(DEBUG, "RSS CFG: Hash level %d", req.hash_mode_flags);
 	rc = bnxt_hwrm_send_message(bp, &req, sizeof(req),
-- 
2.47.3


^ permalink raw reply related

* [PATCH 0/4] net/bnxt: ULP stats timer and PTP timestamping improvements
From: Mohammad Shuab Siddique @ 2026-06-03 20:37 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Mohammad Shuab Siddique

From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>

This series improves PTP timestamping support and the ULP stats timer:

 - Move the ULP stats timer thread to a dedicated CPU to avoid interfering
   with DPDK worker lcores
 - Replace register-based PTP initialization on Wh+ with HWRM interface calls
 - Add HWRM-based Tx/Rx timestamp retrieval for BCM57414 (Wh+)
 - Add PTP support for BCM5760X (Thor2) including TX timestamp completions

The PTP patches carry Fixes: tags and Cc: stable@dpdk.org.

Note: this series depends on series "net/bnxt: miscellaneous bug fixes".

Kalesh AP (3):
  net/bnxt: Support for PTP on Wh+ using HWRM interface
  net/bnxt: Support for Tx/Rx timestamp using HWRM on Wh+
  net/bnxt: Add PTP support for Thor2

Mohammad Shuab Siddique (1):
  net/bnxt: move ulp stats timer thread to dedicated cpu

 drivers/net/bnxt/bnxt.h              |   3 +
 drivers/net/bnxt/bnxt_ethdev.c       | 172 +--------------------------
 drivers/net/bnxt/bnxt_hwrm.c         |  37 ++----
 drivers/net/bnxt/bnxt_rxr.c          |  23 +++-
 drivers/net/bnxt/bnxt_txr.c          |  16 +++
 drivers/net/bnxt/tf_ulp/ulp_sc_mgr.c |  73 ++++++++++--
 6 files changed, 116 insertions(+), 208 deletions(-)

-- 
2.47.3


^ permalink raw reply

* [PATCH 1/4] net/bnxt: move ulp stats timer thread to dedicated cpu
From: Mohammad Shuab Siddique @ 2026-06-03 20:37 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Mohammad Shuab Siddique
In-Reply-To: <20260603203738.2084478-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>

The ulp stats timer thread is currently running in a lcore cpu
that is assigned for the dpdk worker thread and this causes the
performance to drop since ulp stats thread hogs the cpu to
collect the stats. The fix is to move the stats timer thread to
a dedicated cpu so it does not affect the performance of dpdk
worker threads.

Signed-off-by: Kishore Padmanabha <kishore.padmanabha@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/tf_ulp/ulp_sc_mgr.c | 73 ++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 9 deletions(-)

diff --git a/drivers/net/bnxt/tf_ulp/ulp_sc_mgr.c b/drivers/net/bnxt/tf_ulp/ulp_sc_mgr.c
index 23e1b59ca4..fd33e0d36a 100644
--- a/drivers/net/bnxt/tf_ulp/ulp_sc_mgr.c
+++ b/drivers/net/bnxt/tf_ulp/ulp_sc_mgr.c
@@ -347,6 +347,59 @@ bool ulp_sc_mgr_thread_isstarted(struct bnxt_ulp_context *ctxt)
 	return false;
 }
 
+/* Find a free CPU not used by DPDK lcores for the stats cache thread */
+static int ulp_sc_find_free_cpu(void)
+{
+	unsigned int lcore_id;
+	cpu_set_t dpdk_cpus, proc_cpus;
+	int cpu;
+	int rc;
+
+	CPU_ZERO(&dpdk_cpus);
+	CPU_ZERO(&proc_cpus);
+
+	/* Get process's CPU affinity mask */
+	rc = sched_getaffinity(0, sizeof(cpu_set_t), &proc_cpus);
+	if (rc)
+		return -1;
+
+	/* Collect all CPUs used by DPDK lcores */
+	RTE_LCORE_FOREACH(lcore_id) {
+		rte_cpuset_t cpuset = rte_lcore_cpuset(lcore_id);
+
+		for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+			if (CPU_ISSET(cpu, &cpuset))
+				CPU_SET(cpu, &dpdk_cpus);
+		}
+	}
+
+	/* Find first CPU in process affinity that's not used by DPDK */
+	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+		if (CPU_ISSET(cpu, &proc_cpus) && !CPU_ISSET(cpu, &dpdk_cpus))
+			return cpu;
+	}
+
+	return -1;
+}
+
+/* Get the first CPU number from the first DPDK lcore */
+static int ulp_sc_get_first_dpdk_cpu(void)
+{
+	unsigned int lcore_id;
+	int cpu;
+
+	RTE_LCORE_FOREACH(lcore_id) {
+		rte_cpuset_t cpuset = rte_lcore_cpuset(lcore_id);
+
+		for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+			if (CPU_ISSET(cpu, &cpuset))
+				return cpu;
+		}
+	}
+
+	return -1;
+}
+
 /*
  * Setup the Flow counter timer thread that will fetch/accumulate raw counter
  * data from the chip's internal flow counters
@@ -360,7 +413,7 @@ ulp_sc_mgr_thread_start(struct bnxt_ulp_context *ctxt)
 	struct bnxt_ulp_sc_info *ulp_sc_info;
 	rte_thread_attr_t attr;
 	rte_cpuset_t mask;
-	size_t i;
+	int target_cpu;
 	int rc;
 
 	ulp_sc_info = bnxt_ulp_cntxt_ptr2_sc_info_get(ctxt);
@@ -368,16 +421,18 @@ ulp_sc_mgr_thread_start(struct bnxt_ulp_context *ctxt)
 	if (ulp_sc_info && !(ulp_sc_info->flags & ULP_FLAG_SC_THREAD)) {
 		rte_thread_attr_init(&attr);
 
-		rte_thread_get_affinity(&mask);
-
-		for (i = 1; i < CPU_SETSIZE; i++) {
-			if (CPU_ISSET(i, &mask)) {
-				CPU_ZERO(&mask);
-				CPU_SET(i + 2, &mask);
-				break;
-			}
+		/* Try to find a free CPU not used by DPDK lcores */
+		target_cpu = ulp_sc_find_free_cpu();
+		/* If no free CPU found, use first CPU from first DPDK lcore */
+		if (target_cpu == -1) {
+			target_cpu = ulp_sc_get_first_dpdk_cpu();
+			if (target_cpu == -1)
+				return -ENOENT;
 		}
 
+		CPU_ZERO(&mask);
+		CPU_SET(target_cpu, &mask);
+
 		rc = rte_thread_attr_set_affinity(&attr, &mask);
 		if (rc)
 			return rc;
-- 
2.47.3


^ permalink raw reply related

* [PATCH 2/4] net/bnxt: Support for PTP on Wh+ using HWRM interface
From: Mohammad Shuab Siddique @ 2026-06-03 20:37 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Kalesh AP, Mohammad Shuab Siddique
In-Reply-To: <20260603203738.2084478-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>

Get rid of old code relying on mapped registers and use HWRM
interface wherever applicable during PTP initialization for Wh+.

Fixes: b11cceb83a34 ("net/bnxt: support timesync")
Cc: stable@dpdk.org
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt_ethdev.c | 63 ++--------------------------------
 drivers/net/bnxt/bnxt_hwrm.c   | 31 ++---------------
 2 files changed, 4 insertions(+), 90 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 7d70d0f3ec..14d8fa705d 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -3754,55 +3754,6 @@ bnxt_dev_supported_ptypes_get_op(struct rte_eth_dev *dev,
 	return ptypes;
 }
 
-static int bnxt_map_regs(struct bnxt *bp, uint32_t *reg_arr, int count,
-			 int reg_win)
-{
-	uint32_t reg_base = *reg_arr & 0xfffff000;
-	uint32_t win_off;
-	int i;
-
-	for (i = 0; i < count; i++) {
-		if ((reg_arr[i] & 0xfffff000) != reg_base)
-			return -ERANGE;
-	}
-	win_off = BNXT_GRCPF_REG_WINDOW_BASE_OUT + (reg_win - 1) * 4;
-	rte_write32(reg_base, (uint8_t *)bp->bar0 + win_off);
-	return 0;
-}
-
-static int bnxt_map_ptp_regs(struct bnxt *bp)
-{
-	struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
-	uint32_t *reg_arr;
-	int rc, i;
-
-	reg_arr = ptp->rx_regs;
-	rc = bnxt_map_regs(bp, reg_arr, BNXT_PTP_RX_REGS, 5);
-	if (rc)
-		return rc;
-
-	reg_arr = ptp->tx_regs;
-	rc = bnxt_map_regs(bp, reg_arr, BNXT_PTP_TX_REGS, 6);
-	if (rc)
-		return rc;
-
-	for (i = 0; i < BNXT_PTP_RX_REGS; i++)
-		ptp->rx_mapped_regs[i] = 0x5000 + (ptp->rx_regs[i] & 0xfff);
-
-	for (i = 0; i < BNXT_PTP_TX_REGS; i++)
-		ptp->tx_mapped_regs[i] = 0x6000 + (ptp->tx_regs[i] & 0xfff);
-
-	return 0;
-}
-
-static void bnxt_unmap_ptp_regs(struct bnxt *bp)
-{
-	rte_write32(0, (uint8_t *)bp->bar0 +
-			 BNXT_GRCPF_REG_WINDOW_BASE_OUT + 16);
-	rte_write32(0, (uint8_t *)bp->bar0 +
-			 BNXT_GRCPF_REG_WINDOW_BASE_OUT + 20);
-}
-
 static uint64_t bnxt_cc_read(struct bnxt *bp)
 {
 	uint64_t ns;
@@ -3976,13 +3927,7 @@ bnxt_timesync_enable(struct rte_eth_dev *dev)
 	ptp->tx_tstamp_tc.cc_shift = shift;
 	ptp->tx_tstamp_tc.nsec_mask = (1ULL << shift) - 1;
 
-	/* TODO Revisit for Thor 2 */
-	if (!BNXT_CHIP_P5(bp))
-		bnxt_map_ptp_regs(bp);
-	else
-		rc = bnxt_ptp_start(bp);
-
-	return rc;
+	return bnxt_ptp_start(bp);
 }
 
 static int
@@ -4001,12 +3946,8 @@ bnxt_timesync_disable(struct rte_eth_dev *dev)
 
 	bnxt_hwrm_ptp_cfg(bp);
 
-	/* TODO Revisit for Thor 2 */
 	bp->ptp_all_rx_tstamp = 0;
-	if (!BNXT_CHIP_P5(bp))
-		bnxt_unmap_ptp_regs(bp);
-	else
-		bnxt_ptp_stop(bp);
+	bnxt_ptp_stop(bp);
 
 	return 0;
 }
diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c
index afc948ac29..68e75e43bd 100644
--- a/drivers/net/bnxt/bnxt_hwrm.c
+++ b/drivers/net/bnxt/bnxt_hwrm.c
@@ -1027,14 +1027,8 @@ static int bnxt_hwrm_ptp_qcfg(struct bnxt *bp)
 
 	HWRM_CHECK_RESULT();
 
-	/* TODO Revisit for Thor 2 */
-	if (BNXT_CHIP_P5(bp)) {
-		if (!(resp->flags & HWRM_PORT_MAC_PTP_QCFG_OUTPUT_FLAGS_HWRM_ACCESS))
-			return 0;
-	} else {
-		if (!(resp->flags & HWRM_PORT_MAC_PTP_QCFG_OUTPUT_FLAGS_DIRECT_ACCESS))
-			return 0;
-	}
+	if (!(resp->flags & HWRM_PORT_MAC_PTP_QCFG_OUTPUT_FLAGS_HWRM_ACCESS))
+		return 0;
 
 	if (resp->flags & HWRM_PORT_MAC_PTP_QCFG_OUTPUT_FLAGS_ONE_STEP_TX_TS)
 		bp->flags |= BNXT_FLAG_FW_CAP_ONE_STEP_TX_TS;
@@ -1043,27 +1037,6 @@ static int bnxt_hwrm_ptp_qcfg(struct bnxt *bp)
 	if (!ptp)
 		return -ENOMEM;
 
-	if (!BNXT_CHIP_P5(bp)) {
-		ptp->rx_regs[BNXT_PTP_RX_TS_L] =
-			rte_le_to_cpu_32(resp->rx_ts_reg_off_lower);
-		ptp->rx_regs[BNXT_PTP_RX_TS_H] =
-			rte_le_to_cpu_32(resp->rx_ts_reg_off_upper);
-		ptp->rx_regs[BNXT_PTP_RX_SEQ] =
-			rte_le_to_cpu_32(resp->rx_ts_reg_off_seq_id);
-		ptp->rx_regs[BNXT_PTP_RX_FIFO] =
-			rte_le_to_cpu_32(resp->rx_ts_reg_off_fifo);
-		ptp->rx_regs[BNXT_PTP_RX_FIFO_ADV] =
-			rte_le_to_cpu_32(resp->rx_ts_reg_off_fifo_adv);
-		ptp->tx_regs[BNXT_PTP_TX_TS_L] =
-			rte_le_to_cpu_32(resp->tx_ts_reg_off_lower);
-		ptp->tx_regs[BNXT_PTP_TX_TS_H] =
-			rte_le_to_cpu_32(resp->tx_ts_reg_off_upper);
-		ptp->tx_regs[BNXT_PTP_TX_SEQ] =
-			rte_le_to_cpu_32(resp->tx_ts_reg_off_seq_id);
-		ptp->tx_regs[BNXT_PTP_TX_FIFO] =
-			rte_le_to_cpu_32(resp->tx_ts_reg_off_fifo);
-	}
-
 	ptp->bp = bp;
 	bp->ptp_cfg = ptp;
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH 3/4] net/bnxt: Support for Tx/Rx timestamp using HWRM on Wh+
From: Mohammad Shuab Siddique @ 2026-06-03 20:37 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Kalesh AP, Mohammad Shuab Siddique
In-Reply-To: <20260603203738.2084478-1-Mohammad-Shuab.Siddique@broadcom.com>

From: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>

The Rx/Tx timestamp will now be made available using the HWRM cmd
PORT_TS_QUERY on BCM57414 chips.
Get rid of the earlier code that was fetching this using register
access.

Fixes: b11cceb83a34 ("net/bnxt: support timesync")
Cc: stable@dpdk.org
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt_ethdev.c | 109 +--------------------------------
 drivers/net/bnxt/bnxt_rxr.c    |  17 ++++-
 2 files changed, 19 insertions(+), 107 deletions(-)

diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 14d8fa705d..64b12ada9f 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -3754,99 +3754,6 @@ bnxt_dev_supported_ptypes_get_op(struct rte_eth_dev *dev,
 	return ptypes;
 }
 
-static uint64_t bnxt_cc_read(struct bnxt *bp)
-{
-	uint64_t ns;
-
-	ns = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-			      BNXT_GRCPF_REG_SYNC_TIME));
-	ns |= (uint64_t)(rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-					  BNXT_GRCPF_REG_SYNC_TIME + 4))) << 32;
-	return ns;
-}
-
-static int bnxt_get_tx_ts(struct bnxt *bp, uint64_t *ts)
-{
-	struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
-	uint32_t fifo;
-
-	fifo = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				ptp->tx_mapped_regs[BNXT_PTP_TX_FIFO]));
-	if (fifo & BNXT_PTP_TX_FIFO_EMPTY)
-		return -EAGAIN;
-
-	fifo = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				ptp->tx_mapped_regs[BNXT_PTP_TX_FIFO]));
-	*ts = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				ptp->tx_mapped_regs[BNXT_PTP_TX_TS_L]));
-	*ts |= (uint64_t)rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				ptp->tx_mapped_regs[BNXT_PTP_TX_TS_H])) << 32;
-	rte_read32((uint8_t *)bp->bar0 + ptp->tx_mapped_regs[BNXT_PTP_TX_SEQ]);
-
-	return 0;
-}
-
-static int bnxt_clr_rx_ts(struct bnxt *bp, uint64_t *last_ts)
-{
-	struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
-	struct bnxt_pf_info *pf = bp->pf;
-	uint16_t port_id;
-	int i = 0;
-	uint32_t fifo;
-
-	if (!ptp || (bp->flags & BNXT_FLAG_CHIP_P5))
-		return -EINVAL;
-
-	port_id = pf->port_id;
-	fifo = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				ptp->rx_mapped_regs[BNXT_PTP_RX_FIFO]));
-	while ((fifo & BNXT_PTP_RX_FIFO_PENDING) && (i < BNXT_PTP_RX_PND_CNT)) {
-		rte_write32(1 << port_id, (uint8_t *)bp->bar0 +
-			    ptp->rx_mapped_regs[BNXT_PTP_RX_FIFO_ADV]);
-		fifo = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-					ptp->rx_mapped_regs[BNXT_PTP_RX_FIFO]));
-		*last_ts = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-					ptp->rx_mapped_regs[BNXT_PTP_RX_TS_L]));
-		*last_ts |= (uint64_t)rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-					ptp->rx_mapped_regs[BNXT_PTP_RX_TS_H])) << 32;
-		i++;
-	}
-
-	if (i >= BNXT_PTP_RX_PND_CNT)
-		return -EBUSY;
-
-	return 0;
-}
-
-static int bnxt_get_rx_ts(struct bnxt *bp, uint64_t *ts)
-{
-	struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
-	struct bnxt_pf_info *pf = bp->pf;
-	uint16_t port_id;
-	uint32_t fifo;
-
-	fifo = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				ptp->rx_mapped_regs[BNXT_PTP_RX_FIFO]));
-	if (!(fifo & BNXT_PTP_RX_FIFO_PENDING))
-		return -EAGAIN;
-
-	port_id = pf->port_id;
-	rte_write32(1 << port_id, (uint8_t *)bp->bar0 +
-	       ptp->rx_mapped_regs[BNXT_PTP_RX_FIFO_ADV]);
-
-	fifo = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				   ptp->rx_mapped_regs[BNXT_PTP_RX_FIFO]));
-	if (fifo & BNXT_PTP_RX_FIFO_PENDING)
-		return bnxt_clr_rx_ts(bp, ts);
-
-	*ts = rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				ptp->rx_mapped_regs[BNXT_PTP_RX_TS_L]));
-	*ts |= (uint64_t)rte_le_to_cpu_32(rte_read32((uint8_t *)bp->bar0 +
-				ptp->rx_mapped_regs[BNXT_PTP_RX_TS_H])) << 32;
-
-	return 0;
-}
-
 static int
 bnxt_timesync_write_time(struct rte_eth_dev *dev, const struct timespec *ts)
 {
@@ -3877,12 +3784,9 @@ bnxt_timesync_read_time(struct rte_eth_dev *dev, struct timespec *ts)
 	if (!ptp)
 		return -ENOTSUP;
 
-	/* TODO Revisit for Thor 2 */
 	if (BNXT_CHIP_P5(bp))
 		rc = bnxt_hwrm_port_ts_query(bp, BNXT_PTP_FLAGS_CURRENT_TIME,
 					     &systime_cycles);
-	else
-		systime_cycles = bnxt_cc_read(bp);
 
 	ns = rte_timecounter_update(&ptp->tc, systime_cycles);
 	*ts = rte_ns_to_timespec(ns);
@@ -3965,11 +3869,7 @@ bnxt_timesync_read_rx_timestamp(struct rte_eth_dev *dev,
 	if (!ptp)
 		return -ENOTSUP;
 
-	/* TODO Revisit for Thor 2 */
-	if (BNXT_CHIP_P5(bp))
-		rx_tstamp_cycles = ptp->rx_timestamp;
-	else
-		bnxt_get_rx_ts(bp, &rx_tstamp_cycles);
+	rx_tstamp_cycles = ptp->rx_timestamp;
 
 	ns = rte_timecounter_update(&ptp->rx_tstamp_tc, rx_tstamp_cycles);
 	*timestamp = rte_ns_to_timespec(ns);
@@ -3990,11 +3890,8 @@ bnxt_timesync_read_tx_timestamp(struct rte_eth_dev *dev,
 		return -ENOTSUP;
 
 	/* TODO Revisit for Thor 2 */
-	if (BNXT_CHIP_P5(bp))
-		rc = bnxt_hwrm_port_ts_query(bp, BNXT_PTP_FLAGS_PATH_TX,
-					     &tx_tstamp_cycles);
-	else
-		rc = bnxt_get_tx_ts(bp, &tx_tstamp_cycles);
+	rc = bnxt_hwrm_port_ts_query(bp, BNXT_PTP_FLAGS_PATH_TX,
+				     &tx_tstamp_cycles);
 
 	ns = rte_timecounter_update(&ptp->tx_tstamp_tc, tx_tstamp_cycles);
 	*timestamp = rte_ns_to_timespec(ns);
diff --git a/drivers/net/bnxt/bnxt_rxr.c b/drivers/net/bnxt/bnxt_rxr.c
index 73fee40401..fb7c70fd45 100644
--- a/drivers/net/bnxt/bnxt_rxr.c
+++ b/drivers/net/bnxt/bnxt_rxr.c
@@ -727,6 +727,18 @@ bnxt_set_ol_flags(struct bnxt_rx_ring_info *rxr, struct rx_pkt_cmpl *rxcmp,
 	mbuf->ol_flags |= ol_flags;
 }
 
+static void bnxt_get_rx_ts(struct bnxt *bp)
+{
+	struct bnxt_ptp_cfg *ptp = bp->ptp_cfg;
+
+	if (!ptp)
+		return;
+
+	rte_spinlock_lock(&ptp->ptp_lock);
+	ptp->rx_timestamp = ptp->old_time;
+	rte_spinlock_unlock(&ptp->ptp_lock);
+}
+
 static void
 bnxt_get_rx_ts_p5(struct bnxt *bp, uint32_t rx_ts_cmpl)
 {
@@ -1198,7 +1210,10 @@ static int bnxt_rx_pkt(struct rte_mbuf **rx_pkt,
 		      bp->ptp_cfg) {
 		mbuf->ol_flags |= RTE_MBUF_F_RX_IEEE1588_PTP |
 				  RTE_MBUF_F_RX_IEEE1588_TMST;
-		bnxt_get_rx_ts_p5(rxq->bp, rxcmp1->reorder);
+		if (BNXT_CHIP_P5(bp))
+			bnxt_get_rx_ts_p5(rxq->bp, rxcmp1->reorder);
+		else
+			bnxt_get_rx_ts(rxq->bp);
 #ifndef RTE_IOVA_IN_MBUF
 		bnxt_timestamp_dynfield_set(mbuf,
 					    bp->ptp_cfg->mb_rx_timestamp_offset,
-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox