DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5 4/6] net/gve: add periodic NIC clock synchronization
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Introduce a mechanism to periodically fetch the NIC hardware timestamp
using the GVE_ADMINQ_REPORT_NIC_TIMESTAMP AdminQ command. The
synchronization runs every 250ms via a dedicated background control thread
(gve_nic_ts_thread). The thread polls using an incremental sleep loop
checking a stop flag to ensure fast termination during driver teardown.
After 7 consecutive clock read failures, the timestamp is marked as stale,
indicating to the RX path that reconstructed timestamps may be unreliable.

Atomics exist because of the potential for async callers (introduced
here) and async callers (introduced later in the RX datapath) accessing
the cached state.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v5:
    - Reword commit message body to accurately describe the dedicated control
      thread polling mechanism instead of obsolete rte_alarm.

v4:
    - Replace EAL alarm thread with a dedicated control thread to
      prevent blocking shared EAL interrupt thread for up to 2s
      on GVE AdminQ timeouts.
    - Implement incremental sleep loop to support fast termination.

v2:
    - Removed redundant void* casts.
    - Handled alarm reschedule failures by marking timestamp stale.
    - Added transient error logging on memzone allocation failure.
---
 drivers/net/gve/gve_ethdev.c | 122 +++++++++++++++++++++++++++++++++++
 drivers/net/gve/gve_ethdev.h |  12 ++++
 2 files changed, 134 insertions(+)

diff --git a/drivers/net/gve/gve_ethdev.c b/drivers/net/gve/gve_ethdev.c
index 476b2c311f..fa91575b3e 100644
--- a/drivers/net/gve/gve_ethdev.c
+++ b/drivers/net/gve/gve_ethdev.c
@@ -452,6 +452,94 @@ gve_dev_start(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static void
+gve_read_nic_clock(void *arg)
+{
+	struct gve_priv *priv = arg;
+	uint32_t fails;
+	uint64_t ts;
+	int err;
+
+	if (!priv || !priv->nic_ts_report_mz)
+		return;
+
+	memset(priv->nic_ts_report, 0, sizeof(struct gve_nic_ts_report));
+
+	err = gve_adminq_report_nic_timestamp(priv, priv->nic_ts_report_mz->iova);
+	if (err == 0) {
+		ts = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+		rte_atomic_store_explicit(&priv->last_read_nic_timestamp, ts,
+					  rte_memory_order_relaxed);
+		PMD_DRV_LOG(DEBUG, "Fetched NIC Timestamp: %" PRIu64, ts);
+		rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0,
+					  rte_memory_order_relaxed);
+		rte_atomic_store_explicit(&priv->nic_ts_stale, 0,
+					  rte_memory_order_release);
+	} else {
+		PMD_DRV_LOG(ERR, "Failed to read NIC clock, AQ err: %d", err);
+		fails = rte_atomic_fetch_add_explicit(&priv->nic_ts_read_fails, 1,
+						      rte_memory_order_relaxed) + 1;
+		if (fails >= GVE_NIC_CLOCK_READ_MAX_FAILS) {
+			if (!rte_atomic_load_explicit(&priv->nic_ts_stale,
+						      rte_memory_order_relaxed))
+				PMD_DRV_LOG(ERR,
+					"NIC timestamping marked as stale after %u consecutive failures",
+					GVE_NIC_CLOCK_READ_MAX_FAILS);
+			rte_atomic_store_explicit(&priv->nic_ts_stale, 1,
+						  rte_memory_order_release);
+		}
+	}
+}
+
+static uint32_t
+gve_nic_ts_thread(void *arg)
+{
+	struct gve_priv *priv = arg;
+	unsigned int sleep_cnt;
+
+	while (!rte_atomic_load_explicit(&priv->nic_ts_thread_should_stop,
+					 rte_memory_order_relaxed)) {
+		gve_read_nic_clock(priv);
+		for (sleep_cnt = 0; sleep_cnt < GVE_NIC_CLOCK_READ_PERIOD_MS / 10; sleep_cnt++) {
+			if (rte_atomic_load_explicit(&priv->nic_ts_thread_should_stop,
+						     rte_memory_order_relaxed))
+				break;
+			rte_delay_us_sleep(10000);
+		}
+	}
+	return 0;
+}
+
+static int
+gve_alloc_nic_ts_report(struct gve_priv *priv)
+{
+	char z_name[RTE_MEMZONE_NAMESIZE];
+
+	snprintf(z_name, sizeof(z_name), "gve_%s_nic_ts_report",
+		 priv->pci_dev->device.name);
+	priv->nic_ts_report_mz = rte_memzone_reserve_aligned(z_name,
+			sizeof(struct gve_nic_ts_report), rte_socket_id(),
+			RTE_MEMZONE_IOVA_CONTIG, PAGE_SIZE);
+
+	if (!priv->nic_ts_report_mz) {
+		PMD_DRV_LOG(ERR, "Failed to allocate memzone for NIC TS report");
+		return -ENOMEM;
+	}
+	priv->nic_ts_report = priv->nic_ts_report_mz->addr;
+	rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0, rte_memory_order_relaxed);
+	return 0;
+}
+
+static void
+gve_free_nic_ts_report(struct gve_priv *priv)
+{
+	if (priv->nic_ts_report_mz) {
+		rte_memzone_free(priv->nic_ts_report_mz);
+		priv->nic_ts_report_mz = NULL;
+		priv->nic_ts_report = NULL;
+	}
+}
+
 static int
 gve_dev_stop(struct rte_eth_dev *dev)
 {
@@ -586,6 +674,13 @@ gve_teardown_device_resources(struct gve_priv *priv)
 				err);
 	}
 
+	if (priv->nic_ts_report_mz) {
+		rte_atomic_store_explicit(&priv->nic_ts_thread_should_stop, 1,
+					  rte_memory_order_relaxed);
+		rte_thread_join(priv->nic_ts_thread_id, NULL);
+		gve_free_nic_ts_report(priv);
+	}
+
 	gve_free_ptype_lut_dqo(priv);
 	gve_free_counter_array(priv);
 	gve_free_irq_db(priv);
@@ -1252,6 +1347,32 @@ pci_dev_msix_vec_count(struct rte_pci_device *pdev)
 	return 0;
 }
 
+static void
+gve_setup_nic_timestamp(struct gve_priv *priv)
+{
+	int err;
+
+	if (!priv->nic_timestamp_supported)
+		return;
+
+	rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0, rte_memory_order_relaxed);
+	rte_atomic_store_explicit(&priv->nic_ts_stale, 1, rte_memory_order_relaxed);
+	err = gve_alloc_nic_ts_report(priv);
+	if (err == 0) {
+		gve_read_nic_clock(priv);
+		rte_atomic_store_explicit(&priv->nic_ts_thread_should_stop, 0,
+					  rte_memory_order_relaxed);
+		err = rte_thread_create_internal_control(&priv->nic_ts_thread_id, "gve-ts",
+							 gve_nic_ts_thread, priv);
+		if (err != 0) {
+			PMD_DRV_LOG(ERR, "Failed to create NIC clock sync thread, err=%d", err);
+			gve_free_nic_ts_report(priv);
+		}
+	} else {
+		PMD_DRV_LOG(ERR, "Failed to allocate memory for NIC timestamping subsystem. Please reset device to retry.");
+	}
+}
+
 static int
 gve_setup_device_resources(struct gve_priv *priv)
 {
@@ -1307,6 +1428,7 @@ gve_setup_device_resources(struct gve_priv *priv)
 			goto free_ptype_lut;
 		}
 	}
+	gve_setup_nic_timestamp(priv);
 
 	gve_set_device_resources_ok(priv);
 
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index b67f82c263..4dcbaa9971 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -12,6 +12,8 @@
 #include <rte_pci.h>
 #include <pthread.h>
 #include <rte_bitmap.h>
+#include <rte_memzone.h>
+#include <rte_thread.h>
 
 #include "base/gve.h"
 
@@ -39,6 +41,9 @@
 #define GVE_RSS_HASH_KEY_SIZE 40
 #define GVE_RSS_INDIR_SIZE 128
 
+#define GVE_NIC_CLOCK_READ_PERIOD_MS 250
+#define GVE_NIC_CLOCK_READ_MAX_FAILS 7
+
 #define GVE_TX_CKSUM_OFFLOAD_MASK (		\
 		RTE_MBUF_F_TX_L4_MASK  |	\
 		RTE_MBUF_F_TX_TCP_SEG)
@@ -359,6 +364,13 @@ struct gve_priv {
 
 	/* HW Timestamping Fields */
 	bool nic_timestamp_supported;
+	const struct rte_memzone *nic_ts_report_mz;
+	struct gve_nic_ts_report *nic_ts_report;
+	RTE_ATOMIC(uint64_t) last_read_nic_timestamp;
+	RTE_ATOMIC(uint32_t) nic_ts_read_fails;
+	RTE_ATOMIC(uint8_t) nic_ts_stale;
+	rte_thread_t nic_ts_thread_id;
+	RTE_ATOMIC(uint8_t) nic_ts_thread_should_stop;
 };
 
 static inline bool
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 3/6] net/gve: add AdminQ command for NIC timestamps
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Introduce the necessary definitions and functions for the
GVE_ADMINQ_REPORT_NIC_TIMESTAMP AdminQ command.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v2:
    - Added adminq timestamp counter reset to gve_adminq_alloc.
---
 drivers/net/gve/base/gve_adminq.c | 20 ++++++++++++++++++++
 drivers/net/gve/base/gve_adminq.h | 20 ++++++++++++++++++++
 drivers/net/gve/gve_ethdev.h      |  1 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 1ced1e442e..2b25c7f390 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -263,6 +263,8 @@ int gve_adminq_alloc(struct gve_priv *priv)
 	priv->adminq_get_ptype_map_cnt = 0;
 	priv->adminq_cfg_flow_rule_cnt = 0;
 
+	priv->adminq_report_nic_timestamp_cnt = 0;
+
 	pthread_mutexattr_init(&mutexattr);
 	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
 	pthread_mutex_init(&priv->adminq_lock, &mutexattr);
@@ -522,6 +524,10 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
 	case GVE_ADMINQ_CONFIGURE_FLOW_RULE:
 		priv->adminq_cfg_flow_rule_cnt++;
 		break;
+	case GVE_ADMINQ_REPORT_NIC_TIMESTAMP:
+		priv->adminq_report_nic_timestamp_cnt++;
+		break;
+
 	default:
 		PMD_DRV_LOG(ERR, "unknown AQ command opcode %d", opcode);
 	}
@@ -636,6 +642,20 @@ int gve_adminq_reset_flow_rules(struct gve_priv *priv)
 	return gve_adminq_configure_flow_rule(priv, &flow_rule_cmd);
 }
 
+int gve_adminq_report_nic_timestamp(struct gve_priv *priv, dma_addr_t nic_ts_report_addr)
+{
+	union gve_adminq_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_NIC_TIMESTAMP);
+	cmd.report_nic_timestamp = (struct gve_adminq_report_nic_timestamp) {
+		.nic_ts_report_len = cpu_to_be64(sizeof(struct gve_nic_ts_report)),
+		.nic_timestamp_addr = cpu_to_be64(nic_ts_report_addr),
+	};
+
+	return gve_adminq_execute_cmd(priv, &cmd);
+}
+
 /* The device specifies that the management vector can either be the first irq
  * or the last irq. ntfy_blk_msix_base_idx indicates the first irq assigned to
  * the ntfy blks. It if is 0 then the management vector is last, if it is 1 then
diff --git a/drivers/net/gve/base/gve_adminq.h b/drivers/net/gve/base/gve_adminq.h
index eaee5649f2..954be39fbf 100644
--- a/drivers/net/gve/base/gve_adminq.h
+++ b/drivers/net/gve/base/gve_adminq.h
@@ -26,6 +26,7 @@ enum gve_adminq_opcodes {
 	GVE_ADMINQ_REPORT_LINK_SPEED		= 0xD,
 	GVE_ADMINQ_GET_PTYPE_MAP		= 0xE,
 	GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY	= 0xF,
+	GVE_ADMINQ_REPORT_NIC_TIMESTAMP		= 0x11,
 	/* For commands that are larger than 56 bytes */
 	GVE_ADMINQ_EXTENDED_COMMAND		= 0xFF,
 };
@@ -373,6 +374,23 @@ struct gve_stats_report {
 
 GVE_CHECK_STRUCT_LEN(8, gve_stats_report);
 
+struct gve_adminq_report_nic_timestamp {
+	__be64 nic_ts_report_len;
+	__be64 nic_timestamp_addr;
+};
+
+GVE_CHECK_STRUCT_LEN(16, gve_adminq_report_nic_timestamp);
+
+struct gve_nic_ts_report {
+	__be64 nic_timestamp; /* NIC clock in nanoseconds */
+	__be64 pre_cycles; /* System cycle counter before NIC clock read */
+	__be64 post_cycles; /* System cycle counter after NIC clock read */
+	__be64 reserved3;
+	__be64 reserved4;
+};
+
+GVE_CHECK_STRUCT_LEN(40, gve_nic_ts_report);
+
 /* Numbers of gve tx/rx stats in stats report. */
 #define GVE_TX_STATS_REPORT_NUM        6
 #define GVE_RX_STATS_REPORT_NUM        2
@@ -490,6 +508,7 @@ union gve_adminq_command {
 			struct gve_adminq_verify_driver_compatibility
 				verify_driver_compatibility;
 			struct gve_adminq_extended_command extended_command;
+			struct gve_adminq_report_nic_timestamp report_nic_timestamp;
 		};
 	};
 	u8 reserved[64];
@@ -537,5 +556,6 @@ int gve_adminq_add_flow_rule(struct gve_priv *priv,
 			     struct gve_flow_rule_params *rule, u32 loc);
 int gve_adminq_del_flow_rule(struct gve_priv *priv, u32 loc);
 int gve_adminq_reset_flow_rules(struct gve_priv *priv);
+int gve_adminq_report_nic_timestamp(struct gve_priv *priv, dma_addr_t nic_ts_report_addr);
 
 #endif /* _GVE_ADMINQ_H */
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index b9b4688367..b67f82c263 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -328,6 +328,7 @@ struct gve_priv {
 	uint32_t adminq_get_ptype_map_cnt;
 	uint32_t adminq_verify_driver_compatibility_cnt;
 	uint32_t adminq_cfg_flow_rule_cnt;
+	uint32_t adminq_report_nic_timestamp_cnt;
 	volatile uint32_t state_flags;
 
 	/* Gvnic device link speed from hypervisor. */
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 2/6] net/gve: add device option support for HW timestamps
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Introduce the necessary definitions and functions for the device
option flag (GVE_DEV_OPT_ID_NIC_TIMESTAMP) to detect hardware
timestamping support in the gvnic device.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
 drivers/net/gve/base/gve_adminq.c | 41 ++++++++++++++++++++++++++-----
 drivers/net/gve/base/gve_adminq.h |  9 +++++++
 drivers/net/gve/gve_ethdev.h      |  3 +++
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 743ab8e7ae..1ced1e442e 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -38,7 +38,8 @@ void gve_parse_device_option(struct gve_priv *priv,
 			     struct gve_device_option_dqo_rda **dev_op_dqo_rda,
 			     struct gve_device_option_flow_steering **dev_op_flow_steering,
 			     struct gve_device_option_modify_ring **dev_op_modify_ring,
-			     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
+			     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames,
+			     struct gve_device_option_nic_timestamp **dev_op_nic_timestamp)
 {
 	u32 req_feat_mask = be32_to_cpu(option->required_features_mask);
 	u16 option_length = be16_to_cpu(option->option_length);
@@ -168,6 +169,24 @@ void gve_parse_device_option(struct gve_priv *priv,
 		}
 		*dev_op_jumbo_frames = RTE_PTR_ADD(option, sizeof(*option));
 		break;
+	case GVE_DEV_OPT_ID_NIC_TIMESTAMP:
+		if (option_length < sizeof(**dev_op_nic_timestamp) ||
+		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP) {
+			PMD_DRV_LOG(WARNING, GVE_DEVICE_OPTION_ERROR_FMT,
+				    "Nic Timestamp",
+				    (int)sizeof(**dev_op_nic_timestamp),
+				    GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP,
+				    option_length, req_feat_mask);
+			break;
+		}
+
+		if (option_length > sizeof(**dev_op_nic_timestamp)) {
+			PMD_DRV_LOG(WARNING,
+				    GVE_DEVICE_OPTION_TOO_BIG_FMT,
+				    "Nic Timestamp");
+		}
+		*dev_op_nic_timestamp = RTE_PTR_ADD(option, sizeof(*option));
+		break;
 	default:
 		/* If we don't recognize the option just continue
 		 * without doing anything.
@@ -186,7 +205,8 @@ gve_process_device_options(struct gve_priv *priv,
 			   struct gve_device_option_dqo_rda **dev_op_dqo_rda,
 			   struct gve_device_option_flow_steering **dev_op_flow_steering,
 			   struct gve_device_option_modify_ring **dev_op_modify_ring,
-			   struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
+			   struct gve_device_option_jumbo_frames **dev_op_jumbo_frames,
+			   struct gve_device_option_nic_timestamp **dev_op_nic_timestamp)
 {
 	const int num_options = be16_to_cpu(descriptor->num_device_options);
 	struct gve_device_option *dev_opt;
@@ -207,7 +227,8 @@ gve_process_device_options(struct gve_priv *priv,
 		gve_parse_device_option(priv, dev_opt,
 					dev_op_gqi_rda, dev_op_gqi_qpl,
 					dev_op_dqo_rda, dev_op_flow_steering,
-					dev_op_modify_ring, dev_op_jumbo_frames);
+					dev_op_modify_ring, dev_op_jumbo_frames,
+					dev_op_nic_timestamp);
 		dev_opt = next_opt;
 	}
 
@@ -920,7 +941,8 @@ static void gve_enable_supported_features(struct gve_priv *priv,
 	u32 supported_features_mask,
 	const struct gve_device_option_flow_steering *dev_op_flow_steering,
 	const struct gve_device_option_modify_ring *dev_op_modify_ring,
-	const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames)
+	const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames,
+	const struct gve_device_option_nic_timestamp *dev_op_nic_timestamp)
 {
 	if (dev_op_flow_steering &&
 	    (supported_features_mask & GVE_SUP_FLOW_STEERING_MASK) &&
@@ -947,6 +969,11 @@ static void gve_enable_supported_features(struct gve_priv *priv,
 		PMD_DRV_LOG(INFO, "JUMBO FRAMES device option enabled.");
 		priv->max_mtu = be16_to_cpu(dev_op_jumbo_frames->max_mtu);
 	}
+	if (dev_op_nic_timestamp &&
+	    (supported_features_mask & GVE_SUP_NIC_TIMESTAMP_MASK)) {
+		PMD_DRV_LOG(INFO, "NIC TIMESTAMP device option enabled.");
+		priv->nic_timestamp_supported = true;
+	}
 }
 
 int gve_adminq_describe_device(struct gve_priv *priv)
@@ -954,6 +981,7 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 	struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL;
 	struct gve_device_option_modify_ring *dev_op_modify_ring = NULL;
 	struct gve_device_option_flow_steering *dev_op_flow_steering = NULL;
+	struct gve_device_option_nic_timestamp *dev_op_nic_timestamp = NULL;
 	struct gve_device_option_gqi_rda *dev_op_gqi_rda = NULL;
 	struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL;
 	struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL;
@@ -983,7 +1011,8 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 					 &dev_op_gqi_qpl, &dev_op_dqo_rda,
 					 &dev_op_flow_steering,
 					 &dev_op_modify_ring,
-					 &dev_op_jumbo_frames);
+					 &dev_op_jumbo_frames,
+					 &dev_op_nic_timestamp);
 	if (err)
 		goto free_device_descriptor;
 
@@ -1038,7 +1067,7 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 
 	gve_enable_supported_features(priv, supported_features_mask,
 				      dev_op_flow_steering, dev_op_modify_ring,
-				      dev_op_jumbo_frames);
+				      dev_op_jumbo_frames, dev_op_nic_timestamp);
 
 free_device_descriptor:
 	gve_free_dma_mem(&descriptor_dma_mem);
diff --git a/drivers/net/gve/base/gve_adminq.h b/drivers/net/gve/base/gve_adminq.h
index d8e5e6a352..eaee5649f2 100644
--- a/drivers/net/gve/base/gve_adminq.h
+++ b/drivers/net/gve/base/gve_adminq.h
@@ -153,6 +153,12 @@ struct gve_device_option_jumbo_frames {
 
 GVE_CHECK_STRUCT_LEN(8, gve_device_option_jumbo_frames);
 
+struct gve_device_option_nic_timestamp {
+	__be32 supported_features_mask;
+};
+
+GVE_CHECK_STRUCT_LEN(4, gve_device_option_nic_timestamp);
+
 /* Terminology:
  *
  * RDA - Raw DMA Addressing - Buffers associated with SKBs are directly DMA
@@ -169,6 +175,7 @@ enum gve_dev_opt_id {
 	GVE_DEV_OPT_ID_MODIFY_RING = 0x6,
 	GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8,
 	GVE_DEV_OPT_ID_FLOW_STEERING = 0xb,
+	GVE_DEV_OPT_ID_NIC_TIMESTAMP = 0xd,
 };
 
 enum gve_dev_opt_req_feat_mask {
@@ -179,12 +186,14 @@ enum gve_dev_opt_req_feat_mask {
 	GVE_DEV_OPT_REQ_FEAT_MASK_FLOW_STEERING = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0,
+	GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP = 0x0,
 };
 
 enum gve_sup_feature_mask {
 	GVE_SUP_MODIFY_RING_MASK = 1 << 0,
 	GVE_SUP_JUMBO_FRAMES_MASK = 1 << 2,
 	GVE_SUP_FLOW_STEERING_MASK = 1 << 5,
+	GVE_SUP_NIC_TIMESTAMP_MASK = 1 << 8,
 };
 
 #define GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING 0x0
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 524e48e723..b9b4688367 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -355,6 +355,9 @@ struct gve_priv {
 	void *avail_flow_rule_bmp_mem; /* Backing memory for the bitmap */
 	pthread_mutex_t flow_rule_lock; /* Lock for bitmap and tailq access */
 	TAILQ_HEAD(, gve_flow) active_flows;
+
+	/* HW Timestamping Fields */
+	bool nic_timestamp_supported;
 };
 
 static inline bool
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 1/6] net/gve: add thread safety to admin queue
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Introduce a pthread_mutex to protect the admin queue operations.
Locking was added around gve_adminq_execute_cmd and the batch
queue creation/destruction functions.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v2:
    - Dropped ROBUST mutex attribute.
---
 .mailmap                          |  1 +
 drivers/net/gve/base/gve_adminq.c | 67 +++++++++++++++++++++++++------
 drivers/net/gve/gve_ethdev.h      |  1 +
 3 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/.mailmap b/.mailmap
index e052b85213..1b10cfca35 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1016,6 +1016,7 @@ Mario Carrillo <mario.alfredo.c.arevalo@intel.com>
 Mário Kuka <kuka@cesnet.cz>
 Mariusz Drost <mariuszx.drost@intel.com>
 Mark Asselstine <mark.asselstine@windriver.com>
+Mark Blasko <blasko@google.com>
 Mark Bloch <mbloch@nvidia.com> <markb@mellanox.com>
 Mark Gillott <mgillott@vyatta.att-mail.com>
 Mark Kavanagh <mark.b.kavanagh@intel.com>
diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 9c5316fb00..743ab8e7ae 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -216,6 +216,7 @@ gve_process_device_options(struct gve_priv *priv,
 
 int gve_adminq_alloc(struct gve_priv *priv)
 {
+	pthread_mutexattr_t mutexattr;
 	uint8_t pci_rev_id;
 
 	priv->adminq = gve_alloc_dma_mem(&priv->adminq_dma_mem, PAGE_SIZE);
@@ -241,6 +242,11 @@ int gve_adminq_alloc(struct gve_priv *priv)
 	priv->adminq_get_ptype_map_cnt = 0;
 	priv->adminq_cfg_flow_rule_cnt = 0;
 
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&priv->adminq_lock, &mutexattr);
+	pthread_mutexattr_destroy(&mutexattr);
+
 	/* Setup Admin queue with the device */
 	rte_pci_read_config(priv->pci_dev, &pci_rev_id, sizeof(pci_rev_id),
 			    RTE_PCI_REVISION_ID);
@@ -304,6 +310,7 @@ void gve_adminq_free(struct gve_priv *priv)
 		return;
 	gve_adminq_release(priv);
 	gve_free_dma_mem(&priv->adminq_dma_mem);
+	pthread_mutex_destroy(&priv->adminq_lock);
 	gve_clear_admin_queue_ok(priv);
 }
 
@@ -418,7 +425,10 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
 	    (tail & priv->adminq_mask)) {
 		int err;
 
-		/* Flush existing commands to make room. */
+		/* Flush existing commands to make room.
+		 * Note: This kicks the doorbell for all staged commands.
+		 * Any failure here means we failed after attempting to kick.
+		 */
 		err = gve_adminq_kick_and_wait(priv);
 		if (err)
 			return err;
@@ -509,17 +519,24 @@ static int gve_adminq_execute_cmd(struct gve_priv *priv,
 	u32 tail, head;
 	int err;
 
+	pthread_mutex_lock(&priv->adminq_lock);
 	tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
 	head = priv->adminq_prod_cnt;
-	if (tail != head)
+	if (tail != head) {
 		/* This is not a valid path */
-		return -EINVAL;
+		err = -EINVAL;
+		goto unlock_and_return;
+	}
 
 	err = gve_adminq_issue_cmd(priv, cmd_orig);
 	if (err)
-		return err;
+		goto unlock_and_return;
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_execute_extended_cmd(struct gve_priv *priv, u32 opcode,
@@ -693,13 +710,19 @@ int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_create_tx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
@@ -747,13 +770,19 @@ int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_create_rx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
@@ -779,13 +808,19 @@ int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_destroy_tx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index)
@@ -811,13 +846,19 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_destroy_rx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_set_desc_cnt(struct gve_priv *priv,
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 0577f03974..524e48e723 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -339,6 +339,7 @@ struct gve_priv {
 	struct gve_tx_queue **txqs;
 	struct gve_rx_queue **rxqs;
 
+	pthread_mutex_t adminq_lock; /* Protects AdminQ command execution */
 	uint32_t stats_report_len;
 	const struct rte_memzone *stats_report_mem;
 	uint16_t stats_start_idx; /* start index of array of stats written by NIC */
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 0/6] net/gve: add hardware timestamping support
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260613042300.3760470-1-blasko@google.com>

This patch series introduces support for GVE hardware timestamping
on DQO queues. To support concurrent access, a mutex lock is introduced
to protect admin queue operations. A mechanism is then added to
periodically synchronize the NIC clock via a dedicated control thread,
and support is introduced for the read_clock ethdev operation.
Finally, the RX datapath is updated to reconstruct full 64-bit
timestamps from the 32-bit values in DQO descriptors.

---
v5:
- Patch 4: Reword commit message to describe control thread implementation.

v4:
- Patch 4: Offload periodic NIC clock reads to a dedicated control thread.
- Patch 5: Correct mutex initialization and teardown ordering.
- Patch 6: Update GVE documentation to clarify queue format requirements.

v3:
- Patch 5:
  - Add mutex lock to protect shared NIC timestamp memzone access.
  - Fix missing read_clock assignment to DQO queue ops table
    (accidental omission in v2).

v2:
- Patch 1: Dropped ROBUST mutex attribute.
- Patch 3: Added adminq timestamp counter reset to gve_adminq_alloc.
- Patch 4:
  - Removed redundant void* casts.
  - Handled alarm reschedule failures by marking timestamp stale.
  - Added transient error logging on memzone allocation failure.
- Patch 5: Scoped read_clock ethdev operation strictly to DQO queues.
- Patch 6:
  - Scoped timestamp offload capability advertisement strictly to
    DQO queues.
  - Predicated capability advertisement directly on memzone
    allocation.
  - Initialized mbuf_timestamp_offset to -1.
  - Added blank line separating release notes.
---

Mark Blasko (6):
  net/gve: add thread safety to admin queue
  net/gve: add device option support for HW timestamps
  net/gve: add AdminQ command for NIC timestamps
  net/gve: add periodic NIC clock synchronization
  net/gve: support read clock ethdev op
  net/gve: reconstruct HW timestamps from DQO

 .mailmap                               |   1 +
 doc/guides/nics/features/gve.ini       |   1 +
 doc/guides/nics/gve.rst                |  20 +++
 doc/guides/rel_notes/release_26_07.rst |   4 +
 drivers/net/gve/base/gve_adminq.c      | 128 +++++++++++++---
 drivers/net/gve/base/gve_adminq.h      |  29 ++++
 drivers/net/gve/base/gve_desc_dqo.h    |   8 +-
 drivers/net/gve/gve_ethdev.c           | 194 +++++++++++++++++++++++--
 drivers/net/gve/gve_ethdev.h           |  43 ++++++
 drivers/net/gve/gve_rx_dqo.c           |  26 ++++
 10 files changed, 424 insertions(+), 30 deletions(-)

-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply

* [PATCH] net/bnxt: avoid link flap on flow control set
From: Mohammad Shuab Siddique @ 2026-06-16 23:23 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Chenna Arnoori

From: Chenna Arnoori <chenna.arnoori@broadcom.com>

When OVS-DPDK reconciles port state, it calls flow_ctrl_get followed
by flow_ctrl_set if the returned configuration differs from its
database. The driver was reporting autoneg=1 whenever the firmware
had auto_pause set (including the AUTONEG_PAUSE bit 0x4), even
though no pause autoneg was actually requested. This mismatch
caused OVS to repeatedly call flow_ctrl_set, which triggered a
full link reconfig with PHY reset, flapping the link every time
any interface change occurred on the system.

Two problems were fixed. First, flow_ctrl_set was clearing all
autoneg bits instead of only the flow-control autoneg bit,
which also wiped the speed autoneg state. Second, the pause
set path was abandoning its own HWRM request and calling the
full link config function, which built a separate request
without the pause fields that set_pause_common would have
computed.

Port the kernel bnxt_en driver's approach: add a
set_link_common helper that merges link and speed fields into
an existing HWRM request, and rework set_pause to build a
single combined request with both pause and link fields when
a full reconfig is needed. When only pause changes without an
auto-to-force transition, a pause-only PHY config is sent
without a full link reprogram.

Fixes: 8aaf473bbed6 ("net/bnxt: add flow control operations")
Cc: stable@dpdk.org
Signed-off-by: Chenna Arnoori <chenna.arnoori@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt.h        |   4 +
 drivers/net/bnxt/bnxt_ethdev.c |  12 ++-
 drivers/net/bnxt/bnxt_hwrm.c   | 148 +++++++++++++++++++++++++++++++++
 drivers/net/bnxt/bnxt_hwrm.h   |   1 +
 4 files changed, 163 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnxt/bnxt.h b/drivers/net/bnxt/bnxt.h
index 7515f0564f..066a6fd478 100644
--- a/drivers/net/bnxt/bnxt.h
+++ b/drivers/net/bnxt/bnxt.h
@@ -333,6 +333,10 @@ struct bnxt_link_info {
 	uint8_t                 active_lanes;
 	uint8_t			option_flags;
 	uint16_t                pmd_speed_lanes;
+	uint8_t			autoneg;
+#define BNXT_AUTONEG_SPEED		1
+#define BNXT_AUTONEG_FLOW_CTRL		2
+	bool			force_link_chng;
 };
 
 #define BNXT_COS_QUEUE_COUNT	8
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 1b8cf3a52a..23a6b193b7 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -2565,7 +2565,7 @@ static int bnxt_flow_ctrl_get_op(struct rte_eth_dev *dev,
 		return rc;
 
 	memset(fc_conf, 0, sizeof(*fc_conf));
-	if (bp->link_info->auto_pause)
+	if (bp->link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
 		fc_conf->autoneg = 1;
 	switch (bp->link_info->pause) {
 	case 0:
@@ -2601,6 +2601,14 @@ static int bnxt_flow_ctrl_set_op(struct rte_eth_dev *dev,
 		return -ENOTSUP;
 	}
 
+	if (fc_conf->autoneg) {
+		bp->link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
+	} else {
+		if (bp->link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
+			bp->link_info->force_link_chng = true;
+		bp->link_info->autoneg &= ~BNXT_AUTONEG_FLOW_CTRL;
+	}
+
 	switch (fc_conf->mode) {
 	case RTE_ETH_FC_NONE:
 		bp->link_info->auto_pause = 0;
@@ -2642,7 +2650,7 @@ static int bnxt_flow_ctrl_set_op(struct rte_eth_dev *dev,
 		}
 		break;
 	}
-	return bnxt_set_hwrm_link_config(bp, true);
+	return bnxt_hwrm_set_pause(bp);
 }
 
 /* Add UDP tunneling port */
diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c
index 0c82935de9..e333b3d182 100644
--- a/drivers/net/bnxt/bnxt_hwrm.c
+++ b/drivers/net/bnxt/bnxt_hwrm.c
@@ -1876,6 +1876,21 @@ static int bnxt_hwrm_port_phy_qcfg(struct bnxt *bp,
 	link_info->auto_pause = resp->auto_pause;
 	link_info->force_pause = resp->force_pause;
 	link_info->auto_mode = resp->auto_mode;
+
+	if (link_info->auto_mode != HWRM_PORT_PHY_QCFG_OUTPUT_AUTO_MODE_NONE) {
+		link_info->autoneg = BNXT_AUTONEG_SPEED;
+		if (bp->hwrm_spec_code >= 0x10201) {
+			if (link_info->auto_pause &
+			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_AUTONEG_PAUSE)
+				link_info->autoneg |=
+					BNXT_AUTONEG_FLOW_CTRL;
+		} else {
+			link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
+		}
+	} else {
+		link_info->autoneg = 0;
+	}
+
 	link_info->phy_type = resp->phy_type;
 	link_info->media_type = resp->media_type;
 
@@ -4048,6 +4063,139 @@ static int bnxt_hwrm_port_phy_cfg_v2(struct bnxt *bp, struct bnxt_link_info *con
 	return rc;
 }
 
+static void
+bnxt_hwrm_set_pause_common(struct bnxt *bp,
+			   struct hwrm_port_phy_cfg_input *req)
+{
+	struct bnxt_link_info *link_info = bp->link_info;
+
+	if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) {
+		if (bp->hwrm_spec_code >= 0x10201)
+			req->auto_pause =
+			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_AUTONEG_PAUSE;
+		if (link_info->auto_pause &
+		    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_RX)
+			req->auto_pause |=
+			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_RX;
+		if (link_info->auto_pause &
+		    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_TX)
+			req->auto_pause |=
+			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_TX;
+		req->enables |=
+			rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_PAUSE);
+	} else {
+		if (link_info->force_pause &
+		    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_RX)
+			req->force_pause |=
+			    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_RX;
+		if (link_info->force_pause &
+		    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_TX)
+			req->force_pause |=
+			    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_TX;
+		req->enables |=
+			rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_FORCE_PAUSE);
+		if (bp->hwrm_spec_code >= 0x10201) {
+			req->auto_pause = req->force_pause;
+			req->enables |=
+				rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_PAUSE);
+		}
+	}
+}
+
+static void
+bnxt_hwrm_set_link_common(struct bnxt *bp, struct hwrm_port_phy_cfg_input *req)
+{
+	struct rte_eth_conf *dev_conf = &bp->eth_dev->data->dev_conf;
+	uint16_t autoneg, speed;
+
+	autoneg = bnxt_check_eth_link_autoneg(dev_conf->link_speeds);
+
+	if (BNXT_CHIP_P5(bp) &&
+	    dev_conf->link_speeds & RTE_ETH_LINK_SPEED_40G)
+		autoneg = 0;
+
+	if (autoneg == 1 && BNXT_CHIP_P5(bp) &&
+	    bp->link_info->auto_mode == 0 &&
+	    bp->link_info->force_pam4_link_speed ==
+	    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAM4_LINK_SPEED_200GB)
+		autoneg = 0;
+
+	speed = bnxt_parse_eth_link_speed(bp, dev_conf->link_speeds,
+					  bp->link_info);
+	req->flags |=
+		rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_RESET_PHY);
+
+	if (autoneg == 1 &&
+	    (bp->link_info->support_auto_speeds ||
+	     bp->link_info->support_pam4_auto_speeds)) {
+		uint16_t spd_mask;
+		uint32_t en;
+
+		req->flags |=
+			rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_RESTART_AUTONEG);
+		spd_mask = bnxt_parse_eth_link_speed_mask(bp,
+							  dev_conf->link_speeds);
+		req->auto_link_speed_mask = rte_cpu_to_le_16(spd_mask);
+		req->auto_link_pam4_speed_mask =
+			rte_cpu_to_le_16(bp->link_info->auto_pam4_link_speed_mask);
+		en = HWRM_PORT_PHY_CFG_IN_EN_AUTO_LINK_SPEED_MASK |
+		     HWRM_PORT_PHY_CFG_IN_EN_AUTO_PAM4_LINK_SPD_MASK |
+		     HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_MODE;
+		req->enables |= rte_cpu_to_le_32(en);
+		req->auto_mode =
+			HWRM_PORT_PHY_CFG_INPUT_AUTO_MODE_SPEED_MASK;
+	} else {
+		uint32_t en;
+
+		req->flags |=
+			rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_FORCE);
+		if (speed) {
+			if (bp->link_info->link_signal_mode) {
+				req->force_pam4_link_speed =
+					rte_cpu_to_le_16(speed);
+				en = HWRM_PORT_PHY_CFG_IN_EN_FORCE_PAM4_LINK_SPEED;
+				req->enables |= rte_cpu_to_le_32(en);
+			} else {
+				req->force_link_speed =
+					rte_cpu_to_le_16(speed);
+			}
+		}
+	}
+	req->auto_duplex =
+		bnxt_parse_eth_link_duplex(dev_conf->link_speeds);
+	req->enables |=
+		rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_DUPLEX);
+}
+
+int bnxt_hwrm_set_pause(struct bnxt *bp)
+{
+	struct hwrm_port_phy_cfg_input req = {0};
+	struct hwrm_port_phy_cfg_output *resp = bp->hwrm_cmd_resp_addr;
+	struct bnxt_link_info *link_info = bp->link_info;
+	int rc;
+
+	HWRM_PREP(&req, HWRM_PORT_PHY_CFG, BNXT_USE_CHIMP_MB);
+
+	bnxt_hwrm_set_pause_common(bp, &req);
+
+	if ((link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) ||
+	    link_info->force_link_chng)
+		bnxt_hwrm_set_link_common(bp, &req);
+
+	rc = bnxt_hwrm_send_message(bp, &req, sizeof(req), BNXT_USE_CHIMP_MB);
+
+	HWRM_CHECK_RESULT();
+
+	if (!rc && !(link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)) {
+		link_info->pause = link_info->force_pause;
+		link_info->auto_pause = 0;
+	}
+	link_info->force_link_chng = false;
+
+	HWRM_UNLOCK();
+	return rc;
+}
+
 static int bnxt_set_hwrm_link_config_v2(struct bnxt *bp, bool link_up)
 {
 	struct rte_eth_conf *dev_conf = &bp->eth_dev->data->dev_conf;
diff --git a/drivers/net/bnxt/bnxt_hwrm.h b/drivers/net/bnxt/bnxt_hwrm.h
index fc56223ab4..3034803023 100644
--- a/drivers/net/bnxt/bnxt_hwrm.h
+++ b/drivers/net/bnxt/bnxt_hwrm.h
@@ -261,6 +261,7 @@ void bnxt_free_hwrm_rx_ring(struct bnxt *bp, int queue_index);
 int bnxt_alloc_hwrm_resources(struct bnxt *bp);
 int bnxt_get_hwrm_link_config(struct bnxt *bp, struct rte_eth_link *link);
 int bnxt_set_hwrm_link_config(struct bnxt *bp, bool link_up);
+int bnxt_hwrm_set_pause(struct bnxt *bp);
 int bnxt_hwrm_func_qcfg(struct bnxt *bp, uint16_t *mtu);
 int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp);
 int bnxt_hwrm_func_reserve_vf_resc(struct bnxt *bp, bool test);
-- 
2.47.3


^ permalink raw reply related

* DPDK next-net will closed for summer holidays in July
From: Stephen Hemminger @ 2026-06-16 22:50 UTC (permalink / raw)
  To: dev

I will not be doing any active review, merging, or contributing
to DPDK from July 4th - July 21st. If your patch is in patchwork
will get back to it after holiday (on pause); just don't panic if
no review or merge happens.


^ permalink raw reply

* Re: [PATCH v2] tools: AI review handle empty Error sections
From: Stephen Hemminger @ 2026-06-16 22:46 UTC (permalink / raw)
  To: Matthew Gee; +Cc: dev, aconole, lylavoie
In-Reply-To: <20260612190807.1020863-1-mgee@iol.unh.edu>

On Fri, 12 Jun 2026 15:08:07 -0400
Matthew Gee <mgee@iol.unh.edu> wrote:

> Previous review-patch.py would detect and report and error or warning
> based off of the occurrence of the headers of the error and warning
> sections. This led to consistent false positives as often AI reviewers
> will create the header but put "none" or similar filler text within
> the following body.
> 
> This patch updates the code in order to check if the AI review has a
> body with error or warnings to fix and not just filler text. This is
> done by querying multiple lines at once and adjusting the regex to
> filter out headers followed only by filler text or formatting in the
> review.
> 
> These changes were tested against 10+ AI review outputs with several
> variations in formatting and filler text in order to catch a good
> variety of cases to make sure code reviews with actual errors or
> warnings are caught and not missed.
> 
> Signed-off-by: Matthew Gee <mgee@iol.unh.edu>
> ---

AI saw some stuff which I missed.

Matthew,

The windowing (tee/islice/chain) is aligned correctly, but the new
matching has a few problems.

The bigger issue is that this only recognizes markdown '#' headers now.
The old scan matched an optional '#' prefix, '**' bold, and '<h1-3>' too.
rgx_should_match requires "#+\s", so plain-text "Errors:" / "**Errors**"
and HTML "<h2>Errors</h2>" no longer match. Default --format is text and
html is supported, so reviews in those formats classify clean (exit 0)
even when they report errors, which silently breaks the 2/3 exit codes
compare-patch-reviews.sh relies on. The '#'/'**'/'<h>' alternatives need
to stay.

The 3-line concatenation also reintroduces the false positive you're
trying to kill. curr+next+next_next are joined with no separator, so

	## Errors
	None
	## Warnings

collapses to "## errorsnone## warnings"; the 'none...$' filler anchor
fails because the next header follows, and has_errors gets set. Any
output that doesn't blank-line-separate sections hits this.

And in the other direction, content more than two lines below a header
is missed: two blank lines between "## Errors" and the body leave
stripped == "## errors", the '$' branch matches, and a real finding is
suppressed. The heuristic ends up sensitive to exact line spacing both
ways.

Minor: iter[str] / iter[str | None] aren't valid generics (use the
already-imported Iterator); the vars are also re-annotated with
conflicting types and annotate the iterators rather than the loop
targets. mypy will reject these. Run black too - the new for/zip line
and the stripped assignment are over width.

Suggest keeping header matching in all three formats, then scanning past
blank lines to the first non-empty body line and suppressing only if
it's filler (none/n/a/empty). That decouples detection from spacing
instead of fixing it to a 3-line shape.

^ permalink raw reply

* Re: [PATCH v2] devtools: add Vertex AI to review scripts
From: Stephen Hemminger @ 2026-06-16 22:44 UTC (permalink / raw)
  To: David Marchand; +Cc: dev, thomas, Aaron Conole
In-Reply-To: <20260602064406.1230601-1-david.marchand@redhat.com>

On Tue,  2 Jun 2026 08:44:06 +0200
David Marchand <david.marchand@redhat.com> wrote:

> OpenAI, xAI) can now use Vertex AI with Application Default Credentials.
> 
> This requires a python dependency google-auth but it is left as
> optional.
> 
> Key features:
> - Auto-detection of authentication method based on environment
> - Manual override via --auth flag (auto, direct, vertex)
> - Automatic model name translation for Vertex format
> - Support for both global and regional Vertex endpoints
> - Proper error handling for Vertex API responses
> 
> Provider-specific implementations:
> - Anthropic: Uses /publishers/anthropic/models/{model}:rawPredict
>   with model name format claude-sonnet-4-5@20250929
> - Google: Uses /publishers/google/models/{model}:generateContent
> - OpenAI/xAI: Use /endpoints/openapi/chat/completions
>   with publisher prefix (e.g., openai/gpt-oss-120b-maas)
> 
> Authentication detection logic:
> - Vertex: Requires google-auth library and ADC configured
> - Direct: Falls back to API key from environment variables
> 
> Available models on Vertex AI:
> - Anthropic: All Claude models
> - Google: All Gemini models
> - OpenAI: gpt-oss-120b-maas, gpt-oss-20b-maas (open-weight only)
> - xAI: grok-4.20-*, grok-4.1-fast-* variants
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---

I have no direct knowledge of Vertex pro/con.
AI review with better model had some insights.

On the whole the refactor is sound: moving "model" insertion into
_build_request_meta and threading an auth string instead of api_key is
consistent across _common.py, review-patch.py and review-doc.py, and the
build_*_request signature changes match their only callers. Two issues and
a couple of minor notes.

Warning: project-id resolution is split and the early error is unreachable-
proof in the wrong direction.

  get_vertex_credentials() does:

      credentials, project = google_auth_default()
      ...
      if not project:
          error("Could not detect GCP project. Set GOOGLE_CLOUD_PROJECT ...")
      return credentials.token, project

  but the caller then applies its own fallback:

      access_token, project_id = get_vertex_credentials()
      project_id = os.environ.get("GOOGLE_CLOUD_PROJECT") \
                   or os.environ.get("GCP_PROJECT") or project_id

  If ADC resolves no project, get_vertex_credentials() calls error() and
  exits before the GCP_PROJECT fallback can run. google.auth.default()
  honors GOOGLE_CLOUD_PROJECT itself, but not GCP_PROJECT, so the
  GCP_PROJECT branch here is dead in exactly the case it exists for, and
  the error message tells the user to set a variable that wouldn't have
  helped. Resolve the project in one place: drop the "if not project"
  error from get_vertex_credentials() and check after the env fallback,
  or move the env lookups into get_vertex_credentials().

Warning: several added lines exceed the 100-column limit.

      _common.py: error("Could not detect GCP project...")        137 cols
      _common.py: error("Vertex AI support requires...")          108 cols
      _common.py: project_id = os.environ.get(...) or ...         102 cols
      _common.py: vertex_base = f"https://aiplatform..."          104 cols
      _common.py: vertex_base = f"https://{location}-..."         115 cols

  pycodestyle/flake8 will flag all five; the project_id line is also
  black-splittable so "black --check" fails on it. Wrap the f-strings and
  split the message strings.

Info: detect_auth_method() catches bare Exception around
google_auth_default(). Acceptable for best-effort autodetection, but
narrowing it (or a "# noqa"/pylint disable with a reason) documents intent.

Info: the HTTPError path now does error_data[0].get('error', ...) after
unwrapping a list. If a Vertex error body is a list of non-dicts this
raises AttributeError outside the JSONDecodeError guard. Low risk, but a
type check would harden it.

Info: --auth is wired into review-patch.py and review-doc.py but
compare-patch-reviews.sh has no passthrough, so Vertex can't be selected
when comparing providers. May be intentional for this patch.

^ permalink raw reply

* Re: [RFC] devtools: add tool calling support to review-patch.py
From: Stephen Hemminger @ 2026-06-16 22:42 UTC (permalink / raw)
  To: Aaron Conole; +Cc: dev, David Marchand
In-Reply-To: <20260609182652.1053422-1-aconole@redhat.com>

On Tue,  9 Jun 2026 14:26:47 -0400
Aaron Conole <aconole@redhat.com> wrote:

> Add an iterative tool-use loop to review-patch.py for the Anthropic
> and OpenAI providers. The reviewer can now look up additional context
> from the DPDK source tree when the patch alone is insufficient,
> rather than having to guess at surrounding code, API contracts, or
> function signatures.
> 
> Tool calling is enabled by default with a limit of 10 rounds. Pass
> '--tool-rounds 0' to disable it and restore the previous single-shot
> behavior.  The round limit prevents runaway cost on large patches
> that when reached will force the model to deliver a final judgement.
> 
> Initial tool set:
>   - grep           Searches for regex across the file system with
>                    optional path restrictions and case-insensitive
>                    matches.
>   - file_read      Line range read of a specific path.
> 
> Both tools are limited to the repository root to prevent path
> traversal.  Path outputs are relative to the repo root.
> 
> The system prompt is extended when tool calling is active to
> encourage the model to use tools only when genuinely needed,
> keeping unnecessary round trips and token costs under control
> and to a minimum.
> 
> Internally, _common.py gains send_request_raw() (returning the
> raw response dict) so the tool-calling loops can inspect
> stop_reason / finish_reason before extracting text.
> 
> Signed-off-by: Aaron Conole <aconole@redhat.com>
> ---

Well AI saw the security bypass here...

Error
=====

devtools/ai/review-patch.py: path containment check is bypassable
(_tool_grep and _tool_file_read)

	repo_resolved = Path(repo_root).resolve()
	search_path = (repo_resolved / rel_path).resolve()
	if not str(search_path).startswith(str(repo_resolved)):
		return "Error: path is outside the repository"

str.startswith() on the resolved path string is a prefix match, not a
path-component match. If repo_root resolves to /home/me/dpdk, then
/home/me/dpdk-private/secrets passes the check, since the string starts
with "/home/me/dpdk". The commit message states both tools are "limited
to the repository root to prevent path traversal" -- that property does
not hold for sibling directories sharing the name prefix. Same bug in
both tool helpers.

	# is_relative_to (3.9+), raises nothing, no string games
	if not search_path.is_relative_to(repo_resolved):
		return "Error: path is outside the repository"

(Absolute rel_path and ../ escapes are already caught by resolve() +
this fix; only the prefix case is currently open.)


Warning
=======

devtools/ai/_common.py: unrelated reformatting mixed into a feature
patch. The docstring re-wrapping in print_token_summary(),
_extract_text(), _print_verbose_usage(), and the classify_review()
regex reflow in review-patch.py are cosmetic churn unrelated to tool
calling. Drop them or split into a separate cleanup; they inflate the
diff and obscure the actual change.

devtools/ai/review-patch.py: grep --include list excludes the files the
tool prompt points the model at. _tool_grep restricts to
*.[ch]/*.py/*.rst/*.ini, but TOOL_PROMPT_EXTENSION explicitly directs
the model to check ".ci/" scripts, meson.build, and MAINTAINERS. Those
are extensionless or shell, so grep silently returns "No matches found"
rather than surfacing the omission. Either widen the include set or
note the restriction in the tool description so the model doesn't draw
false conclusions from empty results.


Info
====

xai is OpenAI wire-compatible and already shares build_openai_request()
elsewhere, but tool calling is gated to provider == "openai" in
call_api() and the verbose banner. xai (and any future
OpenAI-compatible provider) falls back to single-shot with no notice.
Reusing call_api_with_tools_openai() for xai looks free.

send_request_raw() bypasses _extract_text(), which is where the
"error" in result check lived. The tool loops read stop_reason /
choices directly, so a provider error returned with HTTP 200 yields an
empty string and a misleading "clean" review instead of a hard failure.
Worth a guard on api_result.get("error") at the top of each loop body.

Default-on at 10 rounds is a behavior and cost change for every existing
caller, and grants repo-wide read to the model by default. Reasonable
for an RFC to propose, but worth calling out explicitly for the list --
some CI users may want opt-in rather than opt-out.

^ permalink raw reply

* Re: [PATCH 0/6] net/dpaa2: RSS fixes and improvements
From: Stephen Hemminger @ 2026-06-16 22:07 UTC (permalink / raw)
  To: Maxime Leroy; +Cc: dev
In-Reply-To: <20260616104717.723087-1-maxime@leroys.fr>

On Tue, 16 Jun 2026 12:47:09 +0200
Maxime Leroy <maxime@leroys.fr> wrote:

> A set of RSS fixes and improvements for the dpaa2 PMD.
> 
> Patches 1 and 2 fix the RSS hash key: the L4 destination port was never
> added (both extracts used the source port), and SCTP now uses the same L4
> port extraction as TCP/UDP. Both are tagged for stable.
> 
> Patches 3 and 4 are small cleanups in the key builder (a dead num_extracts
> increment, and the unset PPPoE guard flag).
> 
> Patch 5 honours RTE_ETH_RSS_LEVEL_INNERMOST so tunnelled traffic hashes on
> the inner IP header. Patch 6 implements reta_query / reta_update as an
> emulation over the HW distribution-size mechanism, since dpaa2 has no
> software-visible indirection table.
> 
> Tested on LX2160A (lx2160acex7).

Applied to next-net

^ permalink raw reply

* Re: [PATCH v6 00/21] Wangxun Fixes
From: Stephen Hemminger @ 2026-06-16 21:42 UTC (permalink / raw)
  To: Zaiyu Wang; +Cc: dev
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>

On Tue, 16 Jun 2026 20:20:08 +0800
Zaiyu Wang <zaiyuwang@trustnetic.com> wrote:

> This series fixes several issues found on Wangxun Emerald, Sapphire and
> Amber-lite NICs, with a focus on link-related problems.
> ---
> v6:
> - Fixed more issues identified by AI review
> ---
> v5:
> - Fixed issues identified by AI review
> ---
> v4:
> - Fixed issues identified by devtools scripts
> ---
> v3:
> - Addressed Stephen's comments
> ---
> v2:
> - Fixed compilation error and code style issues
> ---
> 
> Zaiyu Wang (21):
>   net/txgbe: remove duplicate xstats counters
>   net/ngbe: remove duplicate xstats counters
>   net/ngbe: add missing CDR config for YT PHY
>   net/ngbe: fix VF promiscuous and allmulticast
>   net/txgbe: fix inaccuracy in Tx rate limiting
>   net/txgbe: fix link status check condition
>   net/txgbe: fix Tx desc free logic
>   net/txgbe: fix link flow control registers for Amber-Lite
>   net/txgbe: fix link flow control config for Sapphire
>   net/txgbe: fix a mass of unknown interrupts
>   net/txgbe: fix traffic class priority configuration
>   net/txgbe: fix link stability for 25G NIC
>   net/txgbe: fix link stability for 40G NIC
>   net/txgbe: fix link stability for Amber-Lite backplane mode
>   net/txgbe: fix FEC mode configuration on 25G NIC
>   net/txgbe: fix SFP module identification
>   net/txgbe: fix get module info operation
>   net/txgbe: fix get EEPROM operation
>   net/txgbe: fix to reset Tx write-back pointer
>   net/txgbe: fix to enable Tx desc check
>   net/txgbe: fix temperature track for AML NIC
> 
>  drivers/net/ngbe/base/ngbe_phy_yt.c       |    3 +
>  drivers/net/ngbe/ngbe_ethdev.c            |    5 -
>  drivers/net/ngbe/ngbe_ethdev_vf.c         |   11 +-
>  drivers/net/txgbe/base/meson.build        |    2 +
>  drivers/net/txgbe/base/txgbe.h            |    2 +
>  drivers/net/txgbe/base/txgbe_aml.c        |  185 +-
>  drivers/net/txgbe/base/txgbe_aml.h        |    6 +-
>  drivers/net/txgbe/base/txgbe_aml40.c      |  114 +-
>  drivers/net/txgbe/base/txgbe_aml40.h      |    6 +-
>  drivers/net/txgbe/base/txgbe_dcb_hw.c     |    2 +-
>  drivers/net/txgbe/base/txgbe_e56.c        | 3773 +++++++++++++++++++++
>  drivers/net/txgbe/base/txgbe_e56.h        | 1753 ++++++++++
>  drivers/net/txgbe/base/txgbe_e56_bp.c     | 2597 ++++++++++++++
>  drivers/net/txgbe/base/txgbe_e56_bp.h     |  282 ++
>  drivers/net/txgbe/base/txgbe_hw.c         |   54 +-
>  drivers/net/txgbe/base/txgbe_hw.h         |    4 +-
>  drivers/net/txgbe/base/txgbe_osdep.h      |    4 +
>  drivers/net/txgbe/base/txgbe_phy.c        |  362 +-
>  drivers/net/txgbe/base/txgbe_phy.h        |   46 +-
>  drivers/net/txgbe/base/txgbe_regs.h       |   13 +-
>  drivers/net/txgbe/base/txgbe_type.h       |   43 +-
>  drivers/net/txgbe/txgbe_ethdev.c          |  472 ++-
>  drivers/net/txgbe/txgbe_ethdev.h          |    7 +-
>  drivers/net/txgbe/txgbe_rxtx.c            |  109 +-
>  drivers/net/txgbe/txgbe_rxtx.h            |   36 +
>  drivers/net/txgbe/txgbe_rxtx_vec_common.h |   17 +-
>  26 files changed, 9464 insertions(+), 444 deletions(-)
>  create mode 100644 drivers/net/txgbe/base/txgbe_e56.c
>  create mode 100644 drivers/net/txgbe/base/txgbe_e56.h
>  create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.c
>  create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.h
> 

AI review was mostly clean, do you want to change this one thing?

Review of [PATCH v6 00/21] net/{txgbe,ngbe} fixes from Zaiyu Wang

All findings from the v5 review are addressed:

  - 07/21: Tx desc free now loads the 32-bit headwb_mem atomically and
    casts to uint16_t at the use site, removing the type pun.
  - 14/21: kr_read_poll() switches from usleep() to usec_delay() to
    match the rest of the txgbe base layer.
  - 16/21: stray tab+space indent in txgbe_read_i2c_sff8636() is gone.
  - 17/21: stray tab+space indent in txgbe_get_module_info() is gone.
    TXGBE_SFF_DDM_IMPLEMENTED is now indented as a bit mask under the
    8472_SWAP register definition, which reads correctly.
  - 18/21: page is reset to 0 inside each iteration of the for loop so
    it no longer accumulates across iterations.  The flat-memory check
    is now driven by an explicit read of module byte 2 via
    read_i2c_sff8636() into a local is_flat_mem flag, instead of
    reading data[0x2] out of the caller's output buffer.  databyte is
    cleared to 0 at the top of each loop body so the skipped-read
    branch can no longer leak the previous iteration's value.
  - 20/21: RTE_LIBRTE_SECURITY is corrected to RTE_LIB_SECURITY so the
    !txq->using_ipsec guard is actually preprocessed in.

One small remaining comment on 18/21, not blocking.

Patch 18/21 (net/txgbe: fix get EEPROM operation)

Info: is_flat_mem has inverted semantics relative to its name, which
is going to confuse future maintainers.  SFF-8636 byte 2 bit 2
("Flat_mem") is 1 for flat-memory modules and 0 for paged-memory
modules (this is the same convention ethtool uses).  The new code
treats the bit the opposite way:

  +	bool is_flat_mem = true;
  ...
  +		if (rdata & 0x4)
  +			is_flat_mem = false;
  ...
  +			if (page == 0 || is_flat_mem) {
  +				status = hw->phy.read_i2c_sff8636(hw, page, ...);

So when the module reports flat (bit set), is_flat_mem is set to
false; when paged (bit clear), is_flat_mem stays true.  The condition
'page == 0 || is_flat_mem' reads only page 0 in the flat case and any
page in the paged case, which is the right behavior, but the variable
ends up meaning "paged memory is OK to read" rather than "module is
flat".  Suggest either:

	bool is_flat_mem = false;
	if (rdata & 0x4)
		is_flat_mem = true;
	...
	if (page == 0 || !is_flat_mem) {
		...
	}

or renaming to allow_paged_reads (or similar) and keeping the existing
default and condition.



^ permalink raw reply

* Re: [PATCH v10 1/1] net/mana: add device reset support
From: Stephen Hemminger @ 2026-06-16 21:22 UTC (permalink / raw)
  To: Wei Hu; +Cc: dev, longli, weh
In-Reply-To: <20260616123158.43583-2-weh@linux.microsoft.com>

On Tue, 16 Jun 2026 05:31:58 -0700
Wei Hu <weh@linux.microsoft.com> wrote:

> teardown immediately (dev_stop, secondary IPC, dev_close, MR cache
> free) before waiting for the hardware recovery timer to fire. This
> avoids blocking the EAL interrupt thread on multi-second IPC
> timeouts and ibverbs calls. After the recovery delay, the thread
> unregisters the interrupt handler, re-probes the PCI device,
> reinitializes MR caches, and restarts queues. Each function owns
> its own lock scope with no lock hand-off between threads.
> 
> Each queue has an atomic burst_state variable where bit 0 is the
> in-burst flag and bit 1 is a blocked flag. The data path uses a
> single compare-and-swap (0 to 1) to enter a burst, which fails
> immediately if the blocked bit is set. The reset path sets the
> blocked bit via atomic fetch-or and polls bit 0 to wait for
> in-flight bursts to drain. This single-variable design avoids the
> need for sequential consistency ordering.
> 
> A per-device mutex serializes the reset path with ethdev
> operations. The mutex uses PTHREAD_PROCESS_SHARED for multi-process
> support and is held across blocking IB verbs calls. A trylock
> helper encapsulates the lock acquisition and device state check
> for all ethdev operation wrappers. Operations that cannot wait
> (configure, queue setup) return -EBUSY during reset, while
> dev_stop and dev_close join the reset thread before acquiring
> the lock to ensure proper sequencing.
> 
> The reset thread keeps reset_thread_active true throughout its
> lifetime. mana_join_reset_thread uses rte_thread_equal to detect
> the self-join case (when a recovery callback calls dev_stop or
> dev_close from the reset thread itself) and calls
> rte_thread_detach instead of join, so thread resources are freed
> on exit. External callers join normally.
> 
> The condvar wait in the reset thread uses a predicate loop that
> checks dev_state under reset_cond_mutex, so a PCI remove signal
> that arrives before the thread enters the wait is not lost. The
> PCI remove callback sets dev_state to RESET_FAILED under the
> same mutex before signaling. A lock/unlock barrier on
> reset_ops_lock in the PCI remove path ensures teardown has
> completed before emitting the INTR_RMV event.
> 
> Multi-process support is included: secondary processes unmap and
> remap doorbell pages via IPC during the reset enter and exit
> phases. The secondary RESET_EXIT handler closes the received fd
> unconditionally after processing, even when the doorbell page is
> already mapped. Data path functions in both primary and secondary
> processes check the device state atomically and return early when
> the device is not active.
> 
> The driver emits RTE_ETH_EVENT_ERR_RECOVERING before entering the
> reset path so that upper layers (e.g. netvsc) can switch their
> data path before queues are stopped. The event is emitted outside
> the reset lock to avoid deadlock if the callback calls dev_stop or
> dev_close. On completion, the driver emits RECOVERY_SUCCESS or
> RECOVERY_FAILED after releasing the lock. If a recovery callback
> triggers dev_stop or dev_close, the self-join detection in
> mana_join_reset_thread detaches the thread to avoid deadlock. If
> the enter phase fails internally, RECOVERY_FAILED is sent
> immediately so the application receives a terminal event. A PCI
> device removal event callback distinguishes hot-remove from
> service reset.
> 
> Documentation for the device reset feature is added in the MANA
> NIC guide and the 26.07 release notes.
> 
> Signed-off-by: Wei Hu <weh@microsoft.com>
> ---
Applied to next-net

^ permalink raw reply

* [PATCH 6/6] app/test: add test for IP reassembly
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger
In-Reply-To: <20260616210656.464062-1-stephen@networkplumber.org>

There was no functional test for IPv4/IPv6 reassembly,
only a performance test. Add a new test  modeled on the Linux kernel
selftest tools/testing/selftests/net/ip_defrag.c. This is new
code so no license conflict.

Test covers:
size and fragment-size sweep across in-order, reverse,
odd/even and block delivery orders with byte-exact payload validation;
minimum 8-byte fragments; the fragment-count limit;
timeout of incomplete datagrams;
and the duplicate, overlap, extension-header and
oversized-fragment cases fixed earlier in this series.

The reassembled packet is checked with rte_mbuf_check() to catch
malformed segment chains in addition to wrong content.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/test/meson.build       |   1 +
 app/test/test_reassembly.c | 644 +++++++++++++++++++++++++++++++++++++
 2 files changed, 645 insertions(+)
 create mode 100644 app/test/test_reassembly.c

diff --git a/app/test/meson.build b/app/test/meson.build
index 61024125a7..b8c2208d0b 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -160,6 +160,7 @@ source_file_deps = {
     'test_rawdev.c': ['rawdev', 'bus_vdev', 'raw_skeleton'],
     'test_rcu_qsbr.c': ['rcu', 'hash'],
     'test_rcu_qsbr_perf.c': ['rcu', 'hash'],
+    'test_reassembly.c': ['net', 'ip_frag'],
     'test_reassembly_perf.c': ['net', 'ip_frag'],
     'test_reciprocal_division.c': [],
     'test_reciprocal_division_perf.c': [],
diff --git a/app/test/test_reassembly.c b/app/test/test_reassembly.c
new file mode 100644
index 0000000000..9cada5f3b4
--- /dev/null
+++ b/app/test/test_reassembly.c
@@ -0,0 +1,644 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026
+ *
+ * Functional unit tests for the IP reassembly path of librte_ip_frag.
+ *
+ * Coverage mirrors the Linux selftest tools/testing/selftests/net/ip_defrag.c
+ * adapted to the library API and to DPDK-specific constraints:
+ *
+ *   - size / fragment-size sweep, bounded by RTE_LIBRTE_IP_FRAG_MAX_FRAG
+ *   - in-order, reverse, odd-then-even, and block-reordered delivery
+ *   - byte-exact validation of the reassembled payload (not just length)
+ *   - minimum (8-byte) fragments
+ *   - fragment-count boundary: exactly MAX reassembles, MAX + 1 fails
+ *   - incomplete datagram reaped on timeout
+ *   - zero-length fragment rejected
+ *   - duplicate fragment tolerated in a reordered set
+ *   - overlapping fragments (leading/trailing/contained) discarded
+ *   - IPv6 fragment with extension headers in the unfragmentable part dropped
+ *   - fragment whose end exceeds the maximum datagram size dropped
+ *
+ * The last four groups depend on the corresponding reassembly fixes
+ * (duplicate tolerance, overlap discard, extension-header drop, oversize
+ * drop); they pass once those are applied and fail on unpatched code. The
+ * remaining cases pass regardless.
+ *
+ * Fragments use l2_len == 0; the library reads the L3 header at offset 0.
+ */
+
+#include "test.h"
+
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ip.h>
+#include <rte_ip_frag.h>
+#include <rte_log.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#define NB_MBUF		1024
+#define MBUF_CACHE	0		/* exact accounting for leak checks */
+#define MBUF_DATA	2048
+#define V4_L3_LEN	((uint16_t)sizeof(struct rte_ipv4_hdr))
+#define V6_L3_LEN	((uint16_t)(sizeof(struct rte_ipv6_hdr) + \
+				    RTE_IPV6_FRAG_HDR_SIZE))
+#define TEST_ID		0x4242
+
+#ifndef RTE_LIBRTE_IP_FRAG_MAX_FRAG
+#define RTE_LIBRTE_IP_FRAG_MAX_FRAG 8
+#endif
+#define MAX_FRAG	RTE_LIBRTE_IP_FRAG_MAX_FRAG
+
+#define MAX_PAYLOAD	(MAX_FRAG * 256)	/* keeps a fragment in one mbuf */
+
+enum family { V4, V6 };
+enum order  { IN_ORDER, REVERSE, ODD_EVEN, BLOCK };
+
+struct frag_desc {
+	uint16_t ofs;	/* byte offset into the payload */
+	uint16_t plen;	/* payload bytes after L3 */
+	uint8_t  mf;
+};
+
+static struct rte_mempool *pkt_pool;
+
+/* position-dependent payload pattern, non-periodic at 256 so a misordered
+ * reassembly is detected even when lengths line up.
+ */
+static inline uint8_t
+pat(uint32_t k)
+{
+	return (uint8_t)(k * 31u + 7u);
+}
+
+/* ------------------------------- harness -------------------------------- */
+
+static int
+testsuite_setup(void)
+{
+	/* the table create/destroy per case is chatty at INFO level */
+	rte_log_set_level_pattern("lib.ip_frag", RTE_LOG_NOTICE);
+
+	pkt_pool = rte_pktmbuf_pool_create("REASM_POOL", NB_MBUF, MBUF_CACHE,
+					   0, MBUF_DATA, SOCKET_ID_ANY);
+	return pkt_pool == NULL ? TEST_FAILED : TEST_SUCCESS;
+}
+
+static void
+testsuite_teardown(void)
+{
+	rte_mempool_free(pkt_pool);
+	pkt_pool = NULL;
+}
+
+/* Every case must start and end with a full pool, so a leak in one case is
+ * pinpointed here rather than silently masking the next one.
+ */
+static int
+ut_setup(void)
+{
+	if (rte_mempool_avail_count(pkt_pool) != NB_MBUF) {
+		printf("pool not full at case start: %u/%u\n",
+		       rte_mempool_avail_count(pkt_pool), NB_MBUF);
+		return TEST_FAILED;
+	}
+	return TEST_SUCCESS;
+}
+
+static struct rte_ip_frag_tbl *
+tbl_new(uint64_t max_cycles)
+{
+	return rte_ip_frag_table_create(16, MAX_FRAG, 16, max_cycles,
+					rte_socket_id());
+}
+
+/* Build one fragment with a position-dependent payload. */
+static struct rte_mbuf *
+build_frag(enum family fam, uint16_t ofs, uint16_t plen, uint8_t mf)
+{
+	struct rte_mbuf *m = rte_pktmbuf_alloc(pkt_pool);
+	uint16_t l3 = (fam == V4) ? V4_L3_LEN : V6_L3_LEN;
+	char *p;
+	uint16_t i;
+
+	if (m == NULL)
+		return NULL;
+	m->data_off = 0;
+
+	if (fam == V4) {
+		struct rte_ipv4_hdr *ip = rte_pktmbuf_mtod(m,
+						struct rte_ipv4_hdr *);
+		uint16_t fo = ofs / RTE_IPV4_HDR_OFFSET_UNITS;
+
+		memset(ip, 0, V4_L3_LEN);
+		if (mf)
+			fo |= RTE_IPV4_HDR_MF_FLAG;
+		ip->version_ihl = 0x45;
+		ip->total_length = rte_cpu_to_be_16(V4_L3_LEN + plen);
+		ip->packet_id = rte_cpu_to_be_16(TEST_ID);
+		ip->fragment_offset = rte_cpu_to_be_16(fo);
+		ip->time_to_live = 64;
+		ip->next_proto_id = IPPROTO_UDP;
+		ip->src_addr = rte_cpu_to_be_32(0x0a000001);
+		ip->dst_addr = rte_cpu_to_be_32(0x0a000002);
+	} else {
+		struct rte_ipv6_hdr *ip = rte_pktmbuf_mtod(m,
+						struct rte_ipv6_hdr *);
+		struct rte_ipv6_fragment_ext *fh =
+			rte_pktmbuf_mtod_offset(m,
+				struct rte_ipv6_fragment_ext *,
+				sizeof(struct rte_ipv6_hdr));
+
+		memset(ip, 0, V6_L3_LEN);
+		ip->vtc_flow = rte_cpu_to_be_32(6u << 28);
+		ip->payload_len = rte_cpu_to_be_16(RTE_IPV6_FRAG_HDR_SIZE + plen);
+		ip->proto = IPPROTO_FRAGMENT;
+		ip->hop_limits = 64;
+		ip->src_addr.a[15] = 1;
+		ip->dst_addr.a[15] = 2;
+		fh->next_header = IPPROTO_UDP;
+		fh->reserved = 0;
+		fh->frag_data = rte_cpu_to_be_16(
+				RTE_IPV6_SET_FRAG_DATA(ofs, mf ? 1 : 0));
+		fh->id = rte_cpu_to_be_32(TEST_ID);
+	}
+
+	p = rte_pktmbuf_mtod_offset(m, char *, l3);
+	for (i = 0; i < plen; i++)
+		p[i] = (char)pat(ofs + i);
+
+	m->data_len = m->pkt_len = l3 + plen;
+	m->l2_len = 0;
+	m->l3_len = l3;
+	return m;
+}
+
+static struct rte_mbuf *
+feed(enum family fam, struct rte_ip_frag_tbl *tbl,
+	struct rte_ip_frag_death_row *dr, const struct frag_desc *d, uint64_t tms)
+{
+	struct rte_mbuf *m = build_frag(fam, d->ofs, d->plen, d->mf);
+
+	if (m == NULL)
+		return NULL;
+	if (fam == V4) {
+		struct rte_ipv4_hdr *ip = rte_pktmbuf_mtod(m, struct rte_ipv4_hdr *);
+		return rte_ipv4_frag_reassemble_packet(tbl, dr, m, tms, ip);
+	} else {
+		struct rte_ipv6_hdr *ip = rte_pktmbuf_mtod(m, struct rte_ipv6_hdr *);
+		struct rte_ipv6_fragment_ext *fh =
+			rte_pktmbuf_mtod_offset(m, struct rte_ipv6_fragment_ext *,
+						sizeof(struct rte_ipv6_hdr));
+		return rte_ipv6_frag_reassemble_packet(tbl, dr, m, tms, ip, fh);
+	}
+}
+
+/* Split a datagram of total_plen into fragments of frag_size (multiple of 8).
+ * Returns the fragment count, or -1 if it would exceed MAX_FRAG.
+ */
+static int
+make_datagram(uint16_t total_plen, uint16_t frag_size, struct frag_desc *out)
+{
+	int n = 0;
+	uint16_t ofs = 0;
+
+	while (ofs < total_plen) {
+		uint16_t rem = total_plen - ofs;
+		uint16_t len = rem <= frag_size ? rem : frag_size;
+
+		if (n >= MAX_FRAG)
+			return -1;
+		out[n].ofs = ofs;
+		out[n].plen = len;
+		out[n].mf = (ofs + len < total_plen);
+		ofs += len;
+		n++;
+	}
+	return n;
+}
+
+/* Produce a delivery order (array of indices into descs). */
+static void
+make_order(enum order ord, int n, int *idx)
+{
+	int i, k = 0;
+
+	switch (ord) {
+	case IN_ORDER:
+		for (i = 0; i < n; i++)
+			idx[i] = i;
+		break;
+	case REVERSE:
+		for (i = 0; i < n; i++)
+			idx[i] = n - 1 - i;
+		break;
+	case ODD_EVEN:
+		for (i = 1; i < n; i += 2)
+			idx[k++] = i;
+		for (i = 0; i < n; i += 2)
+			idx[k++] = i;
+		break;
+	case BLOCK: {
+		int t = n / 3 ? n / 3 : 1;
+
+		for (i = 2 * t; i < n; i++)
+			idx[k++] = i;
+		for (i = t; i < 2 * t && i < n; i++)
+			idx[k++] = i;
+		for (i = 0; i < t && i < n; i++)
+			idx[k++] = i;
+		break;
+	}
+	}
+}
+
+/* Feed descs in the given order; return reassembled mbuf or NULL. */
+static struct rte_mbuf *
+run_ordered(enum family fam, const struct frag_desc *descs, int n,
+	    const int *idx)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	struct rte_mbuf *out = NULL;
+	uint64_t tms = rte_rdtsc();
+	int i;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(rte_get_tsc_hz());
+	if (tbl == NULL)
+		return NULL;
+	for (i = 0; i < n; i++) {
+		struct rte_mbuf *r = feed(fam, tbl, &dr, &descs[idx[i]], tms);
+
+		if (r != NULL)
+			out = r;
+	}
+	rte_ip_frag_free_death_row(&dr, 0);
+	rte_ip_frag_table_destroy(tbl);
+	return out;
+}
+
+/* Validate length and byte-exact payload, then free. Returns 0 on success.
+ * Note: reassembly strips the IPv6 fragment header, so the reassembled v6
+ * header is sizeof(rte_ipv6_hdr), not the V6_L3_LEN the fragments were built
+ * with. v4 has no fragment header to remove.
+ */
+static int
+validate(struct rte_mbuf *m, enum family fam, uint16_t total_plen)
+{
+	uint16_t l3 = (fam == V4) ? V4_L3_LEN :
+				    (uint16_t)sizeof(struct rte_ipv6_hdr);
+	uint8_t buf[MAX_PAYLOAD];
+	const uint8_t *p;
+	const char *reason;
+	uint16_t k;
+	int rc = 0;
+
+	if (m == NULL)
+		return -1;
+	if (rte_mbuf_check(m, 1, &reason) != 0) {
+		printf("  bad mbuf fam=%d total=%u: %s\n", fam, total_plen,
+		       reason);
+		rte_pktmbuf_free(m);
+		return -1;
+	}
+	if (m->pkt_len != (uint32_t)(l3 + total_plen)) {
+		rte_pktmbuf_free(m);
+		return -1;
+	}
+	p = rte_pktmbuf_read(m, l3, total_plen, buf);
+	if (p == NULL) {
+		rte_pktmbuf_free(m);
+		return -1;
+	}
+	for (k = 0; k < total_plen; k++) {
+		if (p[k] != pat(k)) {
+			rc = -1;
+			break;
+		}
+	}
+	rte_pktmbuf_free(m);
+	return rc;
+}
+
+/* --------------------------- baseline / sweep --------------------------- */
+
+static int
+sweep_one(enum family fam, uint16_t total_plen, uint16_t frag_size)
+{
+	struct frag_desc descs[MAX_FRAG];
+	int idx[MAX_FRAG];
+	const enum order orders[] = { IN_ORDER, REVERSE, ODD_EVEN, BLOCK };
+	int n = make_datagram(total_plen, frag_size, descs);
+	unsigned int o;
+
+	if (n < 2)		/* skip single-fragment / oversized for sweep */
+		return 0;
+
+	for (o = 0; o < RTE_DIM(orders); o++) {
+		make_order(orders[o], n, idx);
+		if (validate(run_ordered(fam, descs, n, idx), fam,
+			     total_plen) != 0) {
+			printf("  sweep fail: fam=%d total=%u fs=%u order=%u n=%d\n",
+			       fam, total_plen, frag_size, orders[o], n);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int
+sweep(enum family fam)
+{
+	const uint16_t fsizes[] = { 8, 16, 64, 256 };
+	unsigned int f;
+
+	for (f = 0; f < RTE_DIM(fsizes); f++) {
+		uint16_t fs = fsizes[f];
+		uint16_t total;
+
+		/* cover 2..MAX_FRAG fragments, last fragment partial */
+		for (total = fs + 8; total <= fs * MAX_FRAG; total += fs) {
+			if (sweep_one(fam, total, fs) != 0)
+				return TEST_FAILED;
+			if (total > fs + 4 &&
+			    sweep_one(fam, total - 4, fs) != 0)
+				return TEST_FAILED;
+		}
+	}
+	return TEST_SUCCESS;
+}
+
+static int test_sweep_v4(void) { return sweep(V4); }
+static int test_sweep_v6(void) { return sweep(V6); }
+
+/* Minimum 8-byte fragments. */
+static int
+test_min_fragment(void)
+{
+	struct frag_desc d[3] = {
+		{ 0, 8, 1 }, { 8, 8, 1 }, { 16, 8, 0 },
+	};
+	int idx[3];
+
+	make_order(REVERSE, 3, idx);
+	TEST_ASSERT_SUCCESS(validate(run_ordered(V4, d, 3, idx), V4, 24),
+				"min 8-byte fragments not reassembled");
+	make_order(ODD_EVEN, 3, idx);
+	TEST_ASSERT_SUCCESS(validate(run_ordered(V6, d, 3, idx), V6, 24),
+				"min 8-byte fragments not reassembled (v6)");
+	return TEST_SUCCESS;
+}
+
+/* Exactly MAX_FRAG fragments reassembles; MAX_FRAG + 1 fails. */
+static int
+test_cap_boundary(void)
+{
+	struct frag_desc d[MAX_FRAG + 1];
+	int idx[MAX_FRAG + 1];
+	uint16_t fs = 8, total = fs * MAX_FRAG;
+	int n, i;
+
+	n = make_datagram(total, fs, d);
+	TEST_ASSERT_EQUAL(n, MAX_FRAG, "expected MAX_FRAG fragments");
+	make_order(IN_ORDER, n, idx);
+	TEST_ASSERT_SUCCESS(validate(run_ordered(V4, d, n, idx), V4, total),
+				"MAX_FRAG fragments should reassemble");
+
+	/* one more fragment than the table can hold */
+	for (i = 0; i <= MAX_FRAG; i++) {
+		d[i].ofs = i * fs;
+		d[i].plen = fs;
+		d[i].mf = (i < MAX_FRAG);
+		idx[i] = i;
+	}
+	TEST_ASSERT_NULL(run_ordered(V4, d, MAX_FRAG + 1, idx),
+			     "MAX_FRAG + 1 fragments should not reassemble");
+	TEST_ASSERT_EQUAL(rte_mempool_avail_count(pkt_pool), NB_MBUF,
+			      "overflowing set leaked mbufs");
+	return TEST_SUCCESS;
+}
+
+/* Incomplete datagram: no output, reaped on timeout. */
+static int
+test_incomplete_timeout(void)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	uint64_t mc = rte_get_tsc_hz(), tms = rte_rdtsc();
+	struct frag_desc d[2] = { { 0, 64, 1 }, { 128, 64, 0 } }; /* gap */
+	struct rte_mbuf *out = NULL;
+	int i;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(mc);
+	TEST_ASSERT_NOT_NULL(tbl, "table create failed");
+	for (i = 0; i < 2; i++) {
+		struct rte_mbuf *r = feed(V4, tbl, &dr, &d[i], tms);
+
+		if (r != NULL)
+			out = r;
+	}
+	TEST_ASSERT_NULL(out, "incomplete datagram reassembled");
+	rte_ip_frag_table_del_expired_entries(tbl, &dr, tms + mc + 1);
+	rte_ip_frag_free_death_row(&dr, 0);
+	rte_ip_frag_table_destroy(tbl);
+	TEST_ASSERT_EQUAL(rte_mempool_avail_count(pkt_pool), NB_MBUF,
+			      "expired fragments not freed");
+	return TEST_SUCCESS;
+}
+
+static int
+test_zero_len(void)
+{
+	struct frag_desc d = { 0, 0, 1 };
+	int idx = 0;
+
+	TEST_ASSERT_NULL(run_ordered(V4, &d, 1, &idx),
+			     "zero-length fragment accepted");
+	TEST_ASSERT_EQUAL(rte_mempool_avail_count(pkt_pool), NB_MBUF,
+			      "zero-length fragment leaked");
+	return TEST_SUCCESS;
+}
+
+/* --------------------- duplicate / overlap / reject --------------------- */
+
+/* A duplicate anywhere in a reordered set must not break reassembly. */
+static int
+test_dup_tolerated(void)
+{
+	/* offsets 0,64,128,192 with 64B frags; inject a dup of frag 1 */
+	struct frag_desc d[5] = {
+		{   0, 64, 1 }, {  64, 64, 1 }, {  64, 64, 1 }, /* dup */
+		{ 128, 64, 1 }, { 192, 64, 0 },
+	};
+	int idx[5] = { 1, 4, 2, 0, 3 };	/* reordered, dup interleaved */
+
+	TEST_ASSERT_SUCCESS(validate(run_ordered(V4, d, 5, idx), V4, 256),
+				"duplicate fragment broke reassembly");
+	return TEST_SUCCESS;
+}
+
+/* Overlap geometries; the datagram must be discarded and every collected
+ * fragment freed. The last fragment is withheld so that on unfixed code the
+ * entry is *retained* (total_size stays UINT32_MAX) rather than torn down by
+ * the frag_size > total_size path: that retention is what we detect. We
+ * capture the mbufs still held in the table after draining the death row,
+ * before destroying the table (destroy frees held mbufs, hiding the leak).
+ */
+static int
+overlap_case(enum family fam, const struct frag_desc *d, int n, const char *what)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	struct rte_mbuf *out = NULL;
+	uint64_t tms = rte_rdtsc();
+	unsigned int held;
+	int i;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(rte_get_tsc_hz());
+	if (tbl == NULL)
+		return -1;
+	for (i = 0; i < n; i++) {
+		struct rte_mbuf *r = feed(fam, tbl, &dr, &d[i], tms);
+
+		if (r != NULL)
+			out = r;
+	}
+	rte_ip_frag_free_death_row(&dr, 0);
+	held = NB_MBUF - rte_mempool_avail_count(pkt_pool);
+	rte_ip_frag_table_destroy(tbl);
+
+	if (out != NULL) {
+		rte_pktmbuf_free(out);
+		printf("  overlap reassembled instead of discarded: %s\n", what);
+		return -1;
+	}
+	if (held != 0) {
+		printf("  overlap kept %u fragment(s) instead of discarding: %s\n",
+		       held, what);
+		return -1;
+	}
+	return 0;
+}
+
+static int
+test_overlap(void)
+{
+	/* last fragment withheld in every case (all MF=1) */
+
+	/* overlapping fragment arrives second */
+	const struct frag_desc tail[2] = { { 0, 600, 1 }, { 300, 600, 1 } };
+	/* overlapping fragment arrives first */
+	const struct frag_desc head[2] = { { 300, 600, 1 }, { 0, 600, 1 } };
+	/* a fragment fully contained in an existing one */
+	const struct frag_desc cont[2] = { { 0, 600, 1 }, { 200, 200, 1 } };
+
+	TEST_ASSERT_SUCCESS(overlap_case(V6, tail, 2, "v6 overlap second"), "");
+	TEST_ASSERT_SUCCESS(overlap_case(V6, head, 2, "v6 overlap first"), "");
+	TEST_ASSERT_SUCCESS(overlap_case(V6, cont, 2, "v6 contained"), "");
+	TEST_ASSERT_SUCCESS(overlap_case(V4, tail, 2, "v4 overlap second"), "");
+	return TEST_SUCCESS;
+}
+
+/* IPv6 fragment with an unfragmentable extension header is dropped, not
+ * stored. Capture whether the fragment is still held in the table after the
+ * death row is drained but before the table is destroyed.
+ */
+static int
+test_v6_ext_header_drop(void)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	struct rte_mbuf *m, *r;
+	struct rte_ipv6_hdr *ip;
+	struct rte_ipv6_fragment_ext *fh;
+	unsigned int held;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(rte_get_tsc_hz());
+	TEST_ASSERT_NOT_NULL(tbl, "table create failed");
+
+	m = build_frag(V6, 0, 64, 1);
+	TEST_ASSERT_NOT_NULL(m, "alloc failed");
+	/* pretend an 8-byte extension header sits before the fragment header */
+	m->l3_len += 8;
+	ip = rte_pktmbuf_mtod(m, struct rte_ipv6_hdr *);
+	fh = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_fragment_ext *,
+				     sizeof(struct rte_ipv6_hdr));
+	r = rte_ipv6_frag_reassemble_packet(tbl, &dr, m, rte_rdtsc(), ip, fh);
+	rte_ip_frag_free_death_row(&dr, 0);
+	held = NB_MBUF - rte_mempool_avail_count(pkt_pool);
+	rte_ip_frag_table_destroy(tbl);
+
+	TEST_ASSERT_NULL(r, "fragment with extension header accepted");
+	TEST_ASSERT_EQUAL(held, 0,
+			  "extension-header fragment stored instead of dropped");
+	return TEST_SUCCESS;
+}
+
+/* A fragment whose end exceeds the max datagram size is dropped, not stored. */
+static int
+oversize_drop_one(enum family fam)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	struct frag_desc d = { 0xFFF8, 64, 0 };	/* offset 65528 + 64 > 65535 */
+	struct rte_mbuf *r;
+	unsigned int held;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(rte_get_tsc_hz());
+	if (tbl == NULL)
+		return -1;
+	r = feed(fam, tbl, &dr, &d, rte_rdtsc());
+	rte_ip_frag_free_death_row(&dr, 0);
+	held = NB_MBUF - rte_mempool_avail_count(pkt_pool);
+	rte_ip_frag_table_destroy(tbl);
+
+	if (r != NULL) {
+		rte_pktmbuf_free(r);
+		return -1;
+	}
+	return held == 0 ? 0 : -1;
+}
+
+static int
+test_oversize_drop(void)
+{
+	TEST_ASSERT_SUCCESS(oversize_drop_one(V4),
+			    "oversized v4 fragment stored instead of dropped");
+	TEST_ASSERT_SUCCESS(oversize_drop_one(V6),
+			    "oversized v6 fragment stored instead of dropped");
+	return TEST_SUCCESS;
+}
+
+static struct unit_test_suite reassembly_testsuite = {
+	.suite_name = "IP Reassembly Unit Test Suite",
+	.setup = testsuite_setup,
+	.teardown = testsuite_teardown,
+	.unit_test_cases = {
+		TEST_CASE_ST(ut_setup, NULL, test_sweep_v4),
+		TEST_CASE_ST(ut_setup, NULL, test_sweep_v6),
+		TEST_CASE_ST(ut_setup, NULL, test_min_fragment),
+		TEST_CASE_ST(ut_setup, NULL, test_cap_boundary),
+		TEST_CASE_ST(ut_setup, NULL, test_incomplete_timeout),
+		TEST_CASE_ST(ut_setup, NULL, test_zero_len),
+		TEST_CASE_ST(ut_setup, NULL, test_dup_tolerated),
+		TEST_CASE_ST(ut_setup, NULL, test_overlap),
+		TEST_CASE_ST(ut_setup, NULL, test_v6_ext_header_drop),
+		TEST_CASE_ST(ut_setup, NULL, test_oversize_drop),
+		TEST_CASES_END()
+	}
+};
+
+static int
+test_reassembly(void)
+{
+	return unit_test_suite_runner(&reassembly_testsuite);
+}
+
+REGISTER_FAST_TEST(reassembly_autotest, NOHUGE_SKIP, ASAN_OK, test_reassembly);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 5/6] ip_frag: reject oversized reassembled datagrams
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, stable, Konstantin Ananyev
In-Reply-To: <20260616210656.464062-1-stephen@networkplumber.org>

The reassembled total length of a packet must not exceed 65535.
A fragment with a high offset could drive the sum past that,
causing silent truncation since IP payload_len/total_length is 16 bits.

When reassembling a packet the total length should not be allowed
to exceed 65535. A fragment with high offset could drive the sum
past that, causing silent truncation.

A valid datagram never exceeds 65535 bytes, so reject any fragment
whose resulting length would exceed that.
Fold the test into the existing zero-length check.

Fixes: cc8f4d020c0b ("examples/ip_reassembly: initial import")
Cc: stable@dpdk.org

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/ip_frag/rte_ipv4_reassembly.c | 9 +++++++--
 lib/ip_frag/rte_ipv6_reassembly.c | 9 +++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/ip_frag/rte_ipv4_reassembly.c b/lib/ip_frag/rte_ipv4_reassembly.c
index 980f7a3b77..727fc58243 100644
--- a/lib/ip_frag/rte_ipv4_reassembly.c
+++ b/lib/ip_frag/rte_ipv4_reassembly.c
@@ -136,8 +136,13 @@ rte_ipv4_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl,
 		tbl, tbl->max_cycles, tbl->entry_mask, tbl->max_entries,
 		tbl->use_entries);
 
-	/* check that fragment length is greater then zero. */
-	if (ip_len <= 0) {
+	/*
+	 * Drop fragments with no payload, and any fragment whose end would
+	 * make the reassembled datagram exceed the maximum IPv4 size. The
+	 * total_length field is 16 bits, so otherwise it is silently
+	 * truncated while the mbuf still holds the full length.
+	 */
+	if (ip_len <= 0 || ip_ofs + ip_len + mb->l3_len > UINT16_MAX) {
 		IP_FRAG_MBUF2DR(dr, mb);
 		return NULL;
 	}
diff --git a/lib/ip_frag/rte_ipv6_reassembly.c b/lib/ip_frag/rte_ipv6_reassembly.c
index 7c1659002b..0b44275b37 100644
--- a/lib/ip_frag/rte_ipv6_reassembly.c
+++ b/lib/ip_frag/rte_ipv6_reassembly.c
@@ -174,8 +174,13 @@ rte_ipv6_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl,
 		tbl, tbl->max_cycles, tbl->entry_mask, tbl->max_entries,
 		tbl->use_entries);
 
-	/* check that fragment length is greater then zero. */
-	if (ip_len <= 0) {
+	/*
+	 * Drop fragments with no payload, and any fragment whose end would
+	 * make the reassembled payload exceed 65535 bytes. The payload_len
+	 * field is 16 bits, so otherwise it is silently truncated while the
+	 * mbuf still holds the full length.
+	 */
+	if (ip_len <= 0 || ip_ofs + ip_len > UINT16_MAX) {
 		IP_FRAG_MBUF2DR(dr, mb);
 		return NULL;
 	}
-- 
2.53.0


^ permalink raw reply related

* [PATCH 4/6] ip_frag: drop IPv6 fragments with unexpected headers
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev
  Cc: Stephen Hemminger, stable, Konstantin Ananyev, Anatoly Burakov,
	Thomas Monjalon
In-Reply-To: <20260616210656.464062-1-stephen@networkplumber.org>

DPDK version of IPv6 reassembly only handles a fragment header placed
directly after the IPv6 header. With other extension headers in the
unfragmentable part, ipv6_frag_reassemble() patches the wrong
next-header field, miscomputes the payload length, and shifts the
wrong bytes, corrupting the result.

Drop the fragment when l3_len covers more than the IPv6 and fragment
headers. RFC 8200 allows a receiver to discard packets whose extension
headers are not in the recommended order, and RFC 9099 recommends
dropping non-conforming fragmented IPv6 packets, so dropping here is
permitted rather than a deviation.

Fixes: 4f1a8f633862 ("ip_frag: add IPv6 reassembly")
Cc: stable@dpdk.org

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/ip_frag/rte_ipv6_reassembly.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/lib/ip_frag/rte_ipv6_reassembly.c b/lib/ip_frag/rte_ipv6_reassembly.c
index 0e809a01e5..7c1659002b 100644
--- a/lib/ip_frag/rte_ipv6_reassembly.c
+++ b/lib/ip_frag/rte_ipv6_reassembly.c
@@ -180,6 +180,19 @@ rte_ipv6_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl,
 		return NULL;
 	}
 
+	/*
+	 * Only a fragment header directly following the IPv6 header is
+	 * supported. Other extension headers in the unfragmentable part are
+	 * not handled: ipv6_frag_reassemble() assumes l3_len covers exactly
+	 * the IPv6 and fragment headers when it patches the next-header field
+	 * and removes the fragment header. Drop the fragment rather than
+	 * produce a corrupt datagram.
+	 */
+	if (mb->l3_len != sizeof(struct rte_ipv6_hdr) + sizeof(*frag_hdr)) {
+		IP_FRAG_MBUF2DR(dr, mb);
+		return NULL;
+	}
+
 	if (unlikely(trim > 0))
 		rte_pktmbuf_trim(mb, trim);
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 3/6] ip_frag: include protocol in IPv4 reassembly key
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, stable, Konstantin Ananyev
In-Reply-To: <20260616210656.464062-1-stephen@networkplumber.org>

DPDK IPv4 reassembly code was not following RFC 791 section 3.2
which says:
    The internet identification field (ID) is used together with the
    source and destination address, and the protocol fields, to identify
    datagram fragments for reassembly.

Omitting the protocol means two datagrams between the
same pair of hosts that share an IP id but carry different protocols
(for example UDP and ICMP) are merged into a single reassembly context,
producing a corrupted datagram.

Fold the protocol into the unused upper bits of the 32-bit id field
of the key. The IPv4 identification is 16 bits and occupies the low
half, so the protocol can be carried in the upper bits without changing
the key layout, the key comparison or the hash.

Fixes: cc8f4d020c0b ("examples/ip_reassembly: initial import")
Cc: stable@dpdk.org

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/ip_frag/rte_ipv4_reassembly.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/ip_frag/rte_ipv4_reassembly.c b/lib/ip_frag/rte_ipv4_reassembly.c
index 3c8ae113ba..980f7a3b77 100644
--- a/lib/ip_frag/rte_ipv4_reassembly.c
+++ b/lib/ip_frag/rte_ipv4_reassembly.c
@@ -111,9 +111,15 @@ rte_ipv4_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl,
 	ip_ofs = (uint16_t)(flag_offset & RTE_IPV4_HDR_OFFSET_MASK);
 	ip_flag = (uint16_t)(flag_offset & RTE_IPV4_HDR_MF_FLAG);
 
+	/*
+	 * RFC 791 requires using: source, destination, identifier field and protocol
+	 */
+
 	/* use first 8 bytes only */
 	memcpy(&key.src_dst[0], &ip_hdr->src_addr, 8);
-	key.id = ip_hdr->packet_id;
+
+	/* packet_id is 16 bits and proto id is 8 bits */
+	key.id = ((uint32_t) ip_hdr->next_proto_id << 16) | ip_hdr->packet_id;
 	key.key_len = IPV4_KEYLEN;
 
 	ip_ofs *= RTE_IPV4_HDR_OFFSET_UNITS;
-- 
2.53.0


^ permalink raw reply related

* [PATCH 2/6] ip_frag: discard datagrams with overlapping fragments
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, stable, Konstantin Ananyev
In-Reply-To: <20260616210656.464062-1-stephen@networkplumber.org>

Existing code does not handle overlapping fragments.

RFC 8200 (IPv6) requires that on overlap all reassembly is abandoned
andall received fragments are dropped. RFC 791 (IPv4) originally called
fortrimming and rewriting, but Linux discards for IPv4 as well, since
overlap has no legitimate use and is a known attack vector.

Depends on the duplicate-tolerance change so that an exact duplicate is
dropped on its own rather than discarding the whole datagram.

Fixes: cc8f4d020c0b ("examples/ip_reassembly: initial import")
Cc: stable@dpdk.org

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/ip_frag/ip_frag_internal.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index 9a03ef995a..2505314a29 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -92,16 +92,34 @@ ip_frag_process(struct ip_frag_pkt *fp, struct rte_ip_frag_death_row *dr,
 	uint32_t i, idx;
 
 	/*
-	 * Discard an exact duplicate fragment. If a previously stored fragment
-	 * already covers the same offset and length, this fragment carries no
-	 * new data. Reassembly is tolerant of duplicates (RFC 791), so drop
-	 * only this mbuf and keep the reassembly entry intact rather than
-	 * treating it as an error. Fragments overlapping an existing one with
-	 * different bounds are not handled here.
+	 * Scan the fragments already collected for this datagram before
+	 * storing the new one. The stored set is kept free of duplicates and
+	 * overlaps, so a single pass is sufficient.
 	 */
 	for (i = 0; i != fp->last_idx; i++) {
-		if (fp->frags[i].mb != NULL && fp->frags[i].ofs == ofs &&
-				fp->frags[i].len == len) {
+		if (fp->frags[i].mb == NULL)
+			continue;
+
+		/*
+		 * Exact duplicate: carries no new data. Reassembly tolerates
+		 * duplicates (RFC 791), so drop only this mbuf and keep the
+		 * entry.
+		 */
+		if (fp->frags[i].ofs == ofs && fp->frags[i].len == len) {
+			IP_FRAG_MBUF2DR(dr, mb);
+			return NULL;
+		}
+
+		/*
+		 * Overlap with an existing fragment. Per RFC 8200 section 4.5
+		 * (and RFC 5722) the datagram must be discarded; the same is
+		 * applied to IPv4. Free all collected fragments, drop this one,
+		 * and invalidate the entry.
+		 */
+		if (ofs < fp->frags[i].ofs + fp->frags[i].len &&
+				fp->frags[i].ofs < ofs + len) {
+			ip_frag_free(fp, dr);
+			ip_frag_key_invalidate(&fp->key);
 			IP_FRAG_MBUF2DR(dr, mb);
 			return NULL;
 		}
-- 
2.53.0


^ permalink raw reply related

* [PATCH 1/6] ip_frag: tolerate duplicate fragments
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, stable, Samyak Jain, Konstantin Ananyev
In-Reply-To: <20260616210656.464062-1-stephen@networkplumber.org>

The reassembly code tracked only a running byte total and reserved slots
for the first and last fragments, with no check for a fragment
duplicating data already received. A single duplicate could destroy a
recoverable datagram:
 - a duplicate first or last fragment collided with the reserved slot and
   sent the whole entry down the error path, freeing every collected
   fragment;
 - a duplicate intermediate fragment was appended to a new slot, inflating
   frag_size past total_size so reassembly never completed.

RFC 791 reassembly tolerates duplicates: a fragment covering bytes
already present carries no new information. Check for an exact duplicate
(stored fragment with the same offset and length) and drop only that
mbuf, before frag_size is updated, leaving the entry's accounting
unchanged.

Overlapping fragments with differing bounds are a separate issue
addressed in the next patch.

Fixes: cc8f4d020c0b ("examples/ip_reassembly: initial import")
Cc: stable@dpdk.org
Reported-by: Samyak Jain <samyak.jain@amantyatech.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/ip_frag/ip_frag_internal.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index 382f42d0e1..9a03ef995a 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -89,7 +89,23 @@ struct rte_mbuf *
 ip_frag_process(struct ip_frag_pkt *fp, struct rte_ip_frag_death_row *dr,
 	struct rte_mbuf *mb, uint16_t ofs, uint16_t len, uint16_t more_frags)
 {
-	uint32_t idx;
+	uint32_t i, idx;
+
+	/*
+	 * Discard an exact duplicate fragment. If a previously stored fragment
+	 * already covers the same offset and length, this fragment carries no
+	 * new data. Reassembly is tolerant of duplicates (RFC 791), so drop
+	 * only this mbuf and keep the reassembly entry intact rather than
+	 * treating it as an error. Fragments overlapping an existing one with
+	 * different bounds are not handled here.
+	 */
+	for (i = 0; i != fp->last_idx; i++) {
+		if (fp->frags[i].mb != NULL && fp->frags[i].ofs == ofs &&
+				fp->frags[i].len == len) {
+			IP_FRAG_MBUF2DR(dr, mb);
+			return NULL;
+		}
+	}
 
 	fp->frag_size += len;
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 0/6] ip_frag: fix reassembly defects and add test
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger

The IP reassembly library tracks only a running byte total and reserved
slots for the first and last fragments, with no coverage map. As a result
it mishandles duplicate, overlapping, oversized, and misheadered
fragments, and the IPv4 key is missing a field RFC 791 requires. There
was also no functional test to catch any of it.

These came out of reviewing a duplicate-fragment report on the list.

Patches 1 and 2 are interdependent: the overlap discard relies on the
duplicate handling so an exact duplicate is dropped on its own rather
than discarding the whole datagram. The rest are independent.

Patch 6 adds a functional test modeled on the Linux selftest ip_defrag.c.
It passes on this series; with any single fix reverted the matching case
fails.

Stephen Hemminger (6):
  ip_frag: tolerate duplicate fragments
  ip_frag: discard datagrams with overlapping fragments
  ip_frag: include protocol in IPv4 reassembly key
  ip_frag: drop IPv6 fragments with unexpected headers
  ip_frag: reject oversized reassembled datagrams
  app/test: add test for IP reassembly

 app/test/meson.build              |   1 +
 app/test/test_reassembly.c        | 644 ++++++++++++++++++++++++++++++
 lib/ip_frag/ip_frag_internal.c    |  36 +-
 lib/ip_frag/rte_ipv4_reassembly.c |  17 +-
 lib/ip_frag/rte_ipv6_reassembly.c |  22 +-
 5 files changed, 714 insertions(+), 6 deletions(-)
 create mode 100644 app/test/test_reassembly.c

-- 
2.53.0


^ permalink raw reply

* Re: DPDK release candidate 26.07-rc1
From: Riley Fletcher @ 2026-06-16 15:48 UTC (permalink / raw)
  To: Thomas Monjalon, dev
In-Reply-To: <EE7NFX47QzqKRvGYzs3QUA@monjalon.net>

IBM - Power Systems Testing
DPDK v26.07-rc1

* Build CI on Fedora 40, 41, 42 and 43 container images for ppc64le
* Basic PF on Mellanox: No issues found
* Performance: TestPMD throughput single core tests
* OS:- RHEL 10.2  kernel: 6.12.0-211.7.3.el10_2.ppc64le
         with gcc version 14.3.1 20251022 (Red Hat 14.3.1-4)

Systems tested:
  - LPARs on IBM Power11 CHRP IBM, 9105-22A
     NICs:
     - Mellanox Technologies MT4129 Family [ConnectX-7 100 GbE 2-Port]
     - Firmware Version: 28.48.1000 (MT_0000000834)
     - OFED 26.04-0.8.6

Regards,

Riley Fletcher

On Thu, 2026-06-11 at 04:52 +0200, Thomas Monjalon wrote:
> A new DPDK release candidate is ready for testing:
> 	https://git.dpdk.org/dpdk/tag/?id=v26.07-rc1
> 
> There are 432 new patches in this snapshot.
> 
> Release notes:
> 	https://doc.dpdk.org/guides/rel_notes/release_26_07.html
> 
> Highlights of 26.07-rc1:
> 	- option --pagesz-mem for per-page-size maximum
> 	- option --no-auto-probing for no device at init
> 	- less mempool cache misses in run-to-completion
> 	- peek style API for staged-ordered ring
> 	- RISC-V vector paths
> 	- PTP helpers and clock relay example
> 	- selective Rx API to save PCI bandwidth
> 	- vhost memory hotplug
> 	- pcap driver enhanced
> 	- LinkData sxe2 NIC driver
> 	- AI review helpers
> 
> Important note:
> Pipelined applications, where ethdev Rx and Tx run on separate
> lcores,
> should adapt to the new algorithm by doubling their configured
> mempool
> cache size, to avoid doubling their mempool cache miss rate.
> 
> Please test and report issues on bugs.dpdk.org.
> 
> We plan to release -rc2 in 2 weeks.
> 
> Thank you everyone
> 

^ permalink raw reply

* RE: [PATCH v2 6/6] net/dpaa2: drop the fake software VLAN strip offload
From: Hemant Agrawal @ 2026-06-16 15:40 UTC (permalink / raw)
  To: Stephen Hemminger, Maxime Leroy; +Cc: dev@dpdk.org, Sachin Saxena
In-Reply-To: <20260616071110.1baf1beb@phoenix.local>



> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: 16 June 2026 19:41
> To: Maxime Leroy <maxime@leroys.fr>
> Cc: dev@dpdk.org; Hemant Agrawal <hemant.agrawal@nxp.com>; Sachin
> Saxena <sachin.saxena@nxp.com>
> Subject: Re: [PATCH v2 6/6] net/dpaa2: drop the fake software VLAN strip
> offload
> Importance: High
> 
> On Tue, 16 Jun 2026 12:27:26 +0200
> Maxime Leroy <maxime@leroys.fr> wrote:
> 
> > RTE_ETH_RX_OFFLOAD_VLAN_STRIP is advertised, but no hardware VLAN
> > strip backs it: when enabled, the Rx burst calls rte_vlan_strip() on
> > every frame, a software op masquerading as a hardware offload.
> >
> > It saves a forwarding application nothing: the datapath reads the L2
> > header anyway to classify or strip. The offload does not remove that
> > read, it relocates it into the driver Rx burst, where it is far more
> > expensive.
> >
> > The cost is a matter of timing. rte_vlan_strip() reaches the L2 header
> > through rte_pktmbuf_mtod(), which dereferences mbuf->buf_addr. On a
> > freshly recycled buffer that mbuf cacheline is cold. eth_fd_to_mbuf()
> > has just written other fields of it (data_off, ol_flags), but buf_addr
> > is a persistent field it does not rewrite. A write does not stall: it
> > posts to the store buffer while the line fills in the background, and
> > the rewritten fields are forwarded straight from there. buf_addr has
> > nothing to forward, so it must be read from the line, whose fill is
> > still in flight, and the read stalls. The ethertype read that follows,
> > on the cold payload line, stalls again. Read later by the application,
> > when the fill has completed, the same read hits. The offload just
> > performs it at the worst possible moment.
> >
> > Measured on a single-core port-to-port forwarding test over two 10G
> > ports (one core at 2 GHz, 64-byte untagged frames):
> >
> >   - throughput 4.22 -> 5.00 Mpps (+18 percent)
> >   - IPC 0.93 -> 1.25: the cost was memory stall, not compute
> >   - L3/DRAM-bound L2 refills 319M -> 200M over 10s (-37 percent)
> >
> > perf confirms it: with the offload, the buf_addr load (the cold mbuf
> > field) and the payload load account for about 84 percent of the Rx
> > burst's L2 refills; removing it, those vanish and only the inherent
> > DQRR dequeue misses remain.
> >
> > Stop advertising VLAN_STRIP and remove the rte_vlan_strip() calls from
> > every Rx path. This is a behavioural change: the tag is left in the
> > frame, so an application must strip it itself, on the L2 header it
> > already reads.
> >
> > Signed-off-by: Maxime Leroy <maxime@leroys.fr>
> 
> Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Hemant Agrawal <hemant.agrawal@nxp.com>

^ permalink raw reply

* Re: [RFC 0/4] alternative capture mechanism
From: Stephen Hemminger @ 2026-06-16 14:28 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev
In-Reply-To: <ajFDjhDc5hXhE8km@bricha3-mobl1.ger.corp.intel.com>

On Tue, 16 Jun 2026 13:37:34 +0100
Bruce Richardson <bruce.richardson@intel.com> wrote:

> On Tue, Jun 09, 2026 at 02:02:01PM -0700, Stephen Hemminger wrote:
> > This is an RFC for an alternative way to capture packets from a DPDK
> > application. I did brief demo of similar mechanism at DPDK summit but
> > this is more complete. Capture runs in the primary process and is driven
> > entirely over telemetry; no secondary process is involved.
> > 
> > A client asks the application to start capturing and passes it a file
> > descriptor to write to. The application writes pcapng to that descriptor.
> > A Wireshark extcap script is the intended front end, but the control path
> > is just telemetry and the output is just a pipe, so other front ends are
> > possible.
> > 
> >   1/4  telemetry: let a command receive file descriptors from the client
> >   2/4  capture: the library
> >   3/4  test: functional test
> >   4/4  app: the Wireshark extcap script and its documentation
> > 
> > Setup and usage are in doc/guides/tools/wireshark_extcap.rst.
> > 
> > Primary process only for now; secondary-process capture is possible as
> > follow-on. Posting as RFC to get feedback on the approach.
> > 
> > The extcap script is dual licensed (BSD-3-Clause OR GPL-2.0-or-later) as
> > it may be more useful in the Wireshark tree.
> >   
> 
> One concern I have though - does this cause system-calls to be made in the
> fast-path because we are writting to a passed in FD? For performance
> reasons, would it not be better to use a memory buffer for this, thereby
> avoiding syscalls? For example, rather than passing in an FD to telemetry,
> we could pass in a key to be passed to shmget (going old-school!), or
> name parameter for shm_open. Thereafter with the memory buffer we can use a
> circular ring or similar to pass the data from app to client.
> 
> /Bruce

The system calls are contained inside the thread spawned when capture starts.
The flow is:
         callback -> ring -> capture thread -> FIFO -> wireshark

^ permalink raw reply

* Re: [RFC 1/4] telemetry: allow commands to receive file descriptors
From: Stephen Hemminger @ 2026-06-16 14:26 UTC (permalink / raw)
  To: Bruce Richardson; +Cc: dev
In-Reply-To: <ajFCXCZfikPJTrLH@bricha3-mobl1.ger.corp.intel.com>

On Tue, 16 Jun 2026 13:32:28 +0100
Bruce Richardson <bruce.richardson@intel.com> wrote:

> On Tue, Jun 09, 2026 at 02:02:02PM -0700, Stephen Hemminger wrote:
> > Add rte_telemetry_register_cmd_fd_arg() to register a command whose
> > callback also receives file descriptors passed by the client as
> > SCM_RIGHTS ancillary data. The callback owns the descriptors and must
> > close them.
> > 
> > This lets a client open a file itself and hand the descriptor to the
> > primary process, so DPDK never opens the path. That avoids path and
> > permission problems and works across container filesystem namespaces.
> > 
> > Existing commands and clients are unaffected. If unsolicited file
> > descriptor is passed, it is closed.
> >   
> 
> This scheme seems reasonable in general. My only concern is whether the
> lack of potential windows support is an issue? For regular telemetry, there
> was always the option of a windows implementation using regular
> TCP/UDP/SCTP sockets bound to localhost. However, AFAIK there is no windows
> implementation of anything that supports file descriptors or handles
> between processes.
> 
> Some other pieces of feedback inline below.
> 
> /Bruce

I have new version (testing) that passes filename as parameter.
That should work without the fd passing.

^ permalink raw reply

* Re: [PATCH v2] app/testpmd: add VLAN priority insert support
From: Stephen Hemminger @ 2026-06-16 14:23 UTC (permalink / raw)
  To: Xingui Yang
  Cc: dev, david.marchand, aman.deep.singh, fengchengwen, yangshuaisong,
	lihuisong, liuyonglong, kangfenglong
In-Reply-To: <20260616131001.2955655-1-yangxingui@huawei.com>

On Tue, 16 Jun 2026 21:10:01 +0800
Xingui Yang <yangxingui@huawei.com> wrote:

> The tx_vlan set and tx_qinq set commands only accepted VLAN ID in range
> [0, 4095]. This prevented users from setting 802.1p priority and CFI
> bits when using hardware VLAN insertion.
> 
> Since mbuf vlan_tci field already supports full 16-bit VLAN Tag Control
> Information (TCI), relax the validation for TX paths to allow priority
> and CFI bits. The vlan_id parameter now accepts:
>   - Bits 0-11:  VLAN ID (0-4095)
>   - Bit 12:    CFI (Canonical Format Indicator)
>   - Bits 13-15: Priority (0-7, 802.1p CoS)
> 
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Suggested-by: fengchengwen <fengchengwen@huawei.com>
> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
> ---
> v2:
> - Removed --enable-vlan-priority option and global variable as suggested
>   by Stephen Hemminger. The feature is now always enabled for TX paths
> - RX VLAN filter continues to enforce strict VLAN ID validation as
>   suggested by fengchengwen
> - Added documentation updates for testpmd_funcs.rst and release notes
> 
>  app/test-pmd/config.c                       | 13 ++++++++-----
>  doc/guides/rel_notes/release_26_07.rst      |  7 +++++++
>  doc/guides/testpmd_app_ug/testpmd_funcs.rst | 17 ++++++++++++++---
>  3 files changed, 29 insertions(+), 8 deletions(-)
> 
> diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
> index 9d457ca88e..38758f9c05 100644
> --- a/app/test-pmd/config.c
> +++ b/app/test-pmd/config.c
> @@ -1241,8 +1241,11 @@ void print_valid_ports(void)
>  }
>  
>  static int
> -vlan_id_is_invalid(uint16_t vlan_id)
> +vlan_id_is_invalid(uint16_t vlan_id, bool is_tx)
>  {
> +	if (is_tx)
> +		return 0;
> +
>  	if (vlan_id < 4096)
>  		return 0;
>  	fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
> @@ -6876,7 +6879,7 @@ rx_vft_set(portid_t port_id, uint16_t vlan_id, int on)
>  
>  	if (port_id_is_invalid(port_id, ENABLED_WARN))
>  		return 1;
> -	if (vlan_id_is_invalid(vlan_id))
> +	if (vlan_id_is_invalid(vlan_id, false))
>  		return 1;
>  	diag = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
>  	if (diag == 0)
> @@ -6923,7 +6926,7 @@ tx_vlan_set(portid_t port_id, uint16_t vlan_id)
>  	struct rte_eth_dev_info dev_info;
>  	int ret;
>  
> -	if (vlan_id_is_invalid(vlan_id))
> +	if (vlan_id_is_invalid(vlan_id, true))
>  		return;

Why have the is_tx flag if it is always used as constant?
Just remove the whole vlan_id_is_invalid() branch test in the transmit path.
Maybe add a comment that any VLAN is allowed on transmit?

Or make a new function. Since VLAN of 0xffff is reserved. Though you might want
to allow it since testpmd is for testing even invalid packets.


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox