DPDK-dev Archive on lore.kernel.org

DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v7 06/21] net/txgbe: fix link status check condition
From: Zaiyu Wang @ 2026-06-17  3:33 UTC (permalink / raw)
  To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260617033400.376-1-zaiyuwang@trustnetic.com>

The original code incorrectly used 'if (link_up)' instead of
'if (*link_up)', causing the condition to always evaluate to true
because the pointer itself is non-NULL. This led to incorrect speed
assignment.

Fixes: fb6eb170dfa2 ("net/txgbe: add basic link configuration for Amber-Lite")
Cc: stable@dpdk.org

Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
 drivers/net/txgbe/base/txgbe_aml.c   | 2 +-
 drivers/net/txgbe/base/txgbe_aml40.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/txgbe/base/txgbe_aml.c b/drivers/net/txgbe/base/txgbe_aml.c
index b376eca5b5..de9a1b1c93 100644
--- a/drivers/net/txgbe/base/txgbe_aml.c
+++ b/drivers/net/txgbe/base/txgbe_aml.c
@@ -67,7 +67,7 @@ s32 txgbe_check_mac_link_aml(struct txgbe_hw *hw, u32 *speed,
 			*link_up = false;
 	}
 
-	if (link_up) {
+	if (*link_up) {
 		switch (links_reg & TXGBE_CFG_PORT_ST_AML_LINK_MASK) {
 		case TXGBE_CFG_PORT_ST_AML_LINK_25G:
 			*speed = TXGBE_LINK_SPEED_25GB_FULL;
diff --git a/drivers/net/txgbe/base/txgbe_aml40.c b/drivers/net/txgbe/base/txgbe_aml40.c
index 733bbac13a..eefd7119fd 100644
--- a/drivers/net/txgbe/base/txgbe_aml40.c
+++ b/drivers/net/txgbe/base/txgbe_aml40.c
@@ -68,7 +68,7 @@ s32 txgbe_check_mac_link_aml40(struct txgbe_hw *hw, u32 *speed,
 			*link_up = false;
 	}
 
-	if (link_up) {
+	if (*link_up) {
 		if ((links_reg & TXGBE_CFG_PORT_ST_AML_LINK_40G) ==
 			TXGBE_CFG_PORT_ST_AML_LINK_40G)
 			*speed = TXGBE_LINK_SPEED_40GB_FULL;
-- 
2.21.0.windows.1


^ permalink raw reply related

* [PATCH v7 05/21] net/txgbe: fix inaccuracy in Tx rate limiting
From: Zaiyu Wang @ 2026-06-17  3:33 UTC (permalink / raw)
  To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260617033400.376-1-zaiyuwang@trustnetic.com>

Amber-lite NIC's TX rate limiting has large deviations for small
packets. To address this, the following changes are made:

1. Set TDM_RL_ADJ (0x1820c) to 21B (includes 7B Ethernet preamble,
   1B SFD, 1B EFD, and 12B IPG).
2) Remove the rate offset in the driver (e.g., 105 / 100, a rough
   compensation value from Linux kernel driver tests).

After these changes, accuracy deviation for 64B packets is within
~5%, while large packets show lower deviation.

Fixes: a309ab43acf3 ("net/txgbe: support Tx queue rate limiting for Amber-Lite")
Cc: stable@dpdk.org

Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
 drivers/net/txgbe/base/txgbe_regs.h | 3 +++
 drivers/net/txgbe/txgbe_ethdev.c    | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/txgbe/base/txgbe_regs.h b/drivers/net/txgbe/base/txgbe_regs.h
index 95c585a025..3d1bc88430 100644
--- a/drivers/net/txgbe/base/txgbe_regs.h
+++ b/drivers/net/txgbe/base/txgbe_regs.h
@@ -1670,6 +1670,9 @@ enum txgbe_5tuple_protocol {
 #define   TXGBE_TDM_FACTOR_INT_SHIFT    16
 #define   TXGBE_TDM_FACTOR_FRA_SHIFT    2
 
+#define TXGBE_TDM_RL_ADJ                0x1820C
+ /* Ethernet framing overhead: 7B Ethernet preamble + 1B SFD + 1B EFD + 12B IPG */
+#define   TXGBE_FRAME_OVERHEAD          21
 #define TXGBE_TDM_RL_VM_IDX             0x018218
 #define TXGBE_TDM_RL_VM_CFG             0x01821C
 #define TXGBE_TDM_RL_CFG                0x018400
diff --git a/drivers/net/txgbe/txgbe_ethdev.c b/drivers/net/txgbe/txgbe_ethdev.c
index 779874aac9..099341b5ab 100644
--- a/drivers/net/txgbe/txgbe_ethdev.c
+++ b/drivers/net/txgbe/txgbe_ethdev.c
@@ -4314,7 +4314,6 @@ txgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 			u16 frac;
 
 			link_speed = dev->data->dev_link.link_speed;
-			tx_rate  = tx_rate * 105 / 100;
 			/* Calculate the rate factor values to set */
 			factor_int = link_speed / tx_rate;
 			frac = (link_speed % tx_rate) * 10000 / tx_rate;
@@ -4324,6 +4323,7 @@ txgbe_set_queue_rate_limit(struct rte_eth_dev *dev,
 				factor_fra = 0;
 			}
 
+			wr32(hw, TXGBE_TDM_RL_ADJ, TXGBE_FRAME_OVERHEAD);
 			wr32(hw, TXGBE_TDM_RL_QUEUE_IDX, queue_idx);
 			wr32m(hw, TXGBE_TDM_RL_QUEUE_CFG,
 			      TXGBE_TDM_FACTOR_INT_MASK, factor_int << TXGBE_TDM_FACTOR_INT_SHIFT);
-- 
2.21.0.windows.1


^ permalink raw reply related

* [PATCH v7 01/21] net/txgbe: remove duplicate xstats counters
From: Zaiyu Wang @ 2026-06-17  3:33 UTC (permalink / raw)
  To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu, Ferruh Yigit
In-Reply-To: <20260617033400.376-1-zaiyuwang@trustnetic.com>

Remove four redundant counters (tx_xon_packets, rx_xon_packets,
tx_xoff_packets and rx_xoff_packets) from xstats, as they were duplicates
of tx_flow_control_xon_packets and others. Both sets were reading the same
registers but being output twice under different names. After removing
these entries, the flow control counters in DPDK now align with those in
our Linux kernel driver.

Fixes: 91fe49c87d76 ("net/txgbe: support device xstats")
Cc: stable@dpdk.org

Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
 drivers/net/txgbe/txgbe_ethdev.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/net/txgbe/txgbe_ethdev.c b/drivers/net/txgbe/txgbe_ethdev.c
index 5d360f8305..779874aac9 100644
--- a/drivers/net/txgbe/txgbe_ethdev.c
+++ b/drivers/net/txgbe/txgbe_ethdev.c
@@ -261,11 +261,6 @@ static const struct rte_txgbe_xstats_name_off rte_txgbe_stats_strings[] = {
 	HW_XSTAT(tx_size_1024_to_max_packets),
 
 	/* Flow Control */
-	HW_XSTAT(tx_xon_packets),
-	HW_XSTAT(rx_xon_packets),
-	HW_XSTAT(tx_xoff_packets),
-	HW_XSTAT(rx_xoff_packets),
-
 	HW_XSTAT_NAME(tx_xon_packets, "tx_flow_control_xon_packets"),
 	HW_XSTAT_NAME(rx_xon_packets, "rx_flow_control_xon_packets"),
 	HW_XSTAT_NAME(tx_xoff_packets, "tx_flow_control_xoff_packets"),
-- 
2.21.0.windows.1


^ permalink raw reply related

* [PATCH v7 04/21] net/ngbe: fix VF promiscuous and allmulticast
From: Zaiyu Wang @ 2026-06-17  3:33 UTC (permalink / raw)
  To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260617033400.376-1-zaiyuwang@trustnetic.com>

The configuration of allmulti and promiscuous modes conflicts
together. For instance, if we enable promiscuous mode, then enable and
disable allmulti, then the promiscuous mode is wrongly disabled.

To resolve this, the following changes are made:
- do nothing when we set/unset allmulti if promiscuous mode is on
- restore the proper mode (none or allmulti) when we disable
  promiscuous mode

Fixes: 7744e90805b5 ("net/ngbe: add promiscuous and allmulticast ops for VF device")
Cc: stable@dpdk.org

Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
 drivers/net/ngbe/ngbe_ethdev_vf.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ngbe/ngbe_ethdev_vf.c b/drivers/net/ngbe/ngbe_ethdev_vf.c
index 6406df40d0..81511fed8a 100644
--- a/drivers/net/ngbe/ngbe_ethdev_vf.c
+++ b/drivers/net/ngbe/ngbe_ethdev_vf.c
@@ -1196,9 +1196,13 @@ static int
 ngbevf_dev_promiscuous_disable(struct rte_eth_dev *dev)
 {
 	struct ngbe_hw *hw = ngbe_dev_hw(dev);
+	int mode = NGBEVF_XCAST_MODE_NONE;
 	int ret;
 
-	switch (hw->mac.update_xcast_mode(hw, NGBEVF_XCAST_MODE_NONE)) {
+	if (dev->data->all_multicast)
+		mode = NGBEVF_XCAST_MODE_ALLMULTI;
+
+	switch (hw->mac.update_xcast_mode(hw, mode)) {
 	case 0:
 		ret = 0;
 		break;
@@ -1219,7 +1223,7 @@ ngbevf_dev_allmulticast_enable(struct rte_eth_dev *dev)
 	struct ngbe_hw *hw = ngbe_dev_hw(dev);
 	int ret;
 
-	if (dev->data->promiscuous == 1)
+	if (dev->data->promiscuous)
 		return 0;
 
 	switch (hw->mac.update_xcast_mode(hw, NGBEVF_XCAST_MODE_ALLMULTI)) {
@@ -1243,6 +1247,9 @@ ngbevf_dev_allmulticast_disable(struct rte_eth_dev *dev)
 	struct ngbe_hw *hw = ngbe_dev_hw(dev);
 	int ret;
 
+	if (dev->data->promiscuous)
+		return 0;
+
 	switch (hw->mac.update_xcast_mode(hw, NGBEVF_XCAST_MODE_MULTI)) {
 	case 0:
 		ret = 0;
-- 
2.21.0.windows.1


^ permalink raw reply related

* [PATCH v7 03/21] net/ngbe: add missing CDR config for YT PHY
From: Zaiyu Wang @ 2026-06-17  3:33 UTC (permalink / raw)
  To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260617033400.376-1-zaiyuwang@trustnetic.com>

According to the PHY vendor, when YT8531S operates in UTP-to-Fiber or
RGMII-to-Fiber mode with auto-negotiation disabled (Force mode),
additional CDR (Clock Data Recovery) configuration is required to
improve link connectivity. Without this config, link may be unstable
or fail to establish.

Fixes: f1268369403d ("net/ngbe: support autoneg on/off for external PHY SFI mode")
Cc: stable@dpdk.org

Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
 drivers/net/ngbe/base/ngbe_phy_yt.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ngbe/base/ngbe_phy_yt.c b/drivers/net/ngbe/base/ngbe_phy_yt.c
index d110fbc8b2..ab0778d246 100644
--- a/drivers/net/ngbe/base/ngbe_phy_yt.c
+++ b/drivers/net/ngbe/base/ngbe_phy_yt.c
@@ -264,6 +264,9 @@ s32 ngbe_setup_phy_link_yt(struct ngbe_hw *hw, u32 speed,
 			value = YT_BCR_RESET | YT_BCR_ANE | YT_BCR_RESTART_AN |
 				YT_BCR_DUPLEX | YT_BCR_SPEED_SELECT1;
 		} else {
+			/* force mode need to config cdr */
+			ngbe_write_phy_reg_sds_ext_yt(hw, 0x3, 0, 0x1434);
+			ngbe_write_phy_reg_sds_ext_yt(hw, 0xe, 0, 0x163);
 			value = YT_BCR_RESET | YT_BCR_DUPLEX;
 			if (speed & NGBE_LINK_SPEED_1GB_FULL)
 				value |= YT_BCR_SPEED_SELECT1;
-- 
2.21.0.windows.1


^ permalink raw reply related

* [PATCH v7 02/21] net/ngbe: remove duplicate xstats counters
From: Zaiyu Wang @ 2026-06-17  3:33 UTC (permalink / raw)
  To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260617033400.376-1-zaiyuwang@trustnetic.com>

Remove four redundant counters (tx_xon_packets, rx_xon_packets,
tx_xoff_packets and rx_xoff_packets) from xstats, as they were duplicates
of tx_flow_control_xon_packets and others. Both sets were reading the same
registers but being output twice under different names. After removing
these entries, the flow control counters in DPDK now align with those in
our Linux kernel driver.

Fixes: 8b433d04adc9 ("net/ngbe: support device xstats")
Cc: stable@dpdk.org

Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
 drivers/net/ngbe/ngbe_ethdev.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/net/ngbe/ngbe_ethdev.c b/drivers/net/ngbe/ngbe_ethdev.c
index 8b9d6371fb..6df53f3266 100644
--- a/drivers/net/ngbe/ngbe_ethdev.c
+++ b/drivers/net/ngbe/ngbe_ethdev.c
@@ -227,11 +227,6 @@ static const struct rte_ngbe_xstats_name_off rte_ngbe_stats_strings[] = {
 	HW_XSTAT(tx_size_1024_to_max_packets),
 
 	/* Flow Control */
-	HW_XSTAT(tx_xon_packets),
-	HW_XSTAT(rx_xon_packets),
-	HW_XSTAT(tx_xoff_packets),
-	HW_XSTAT(rx_xoff_packets),
-
 	HW_XSTAT_NAME(tx_xon_packets, "tx_flow_control_xon_packets"),
 	HW_XSTAT_NAME(rx_xon_packets, "rx_flow_control_xon_packets"),
 	HW_XSTAT_NAME(tx_xoff_packets, "tx_flow_control_xoff_packets"),
-- 
2.21.0.windows.1


^ permalink raw reply related

* [PATCH v7 00/21] Wangxun Fixes
From: Zaiyu Wang @ 2026-06-17  3:33 UTC (permalink / raw)
  To: dev; +Cc: Zaiyu Wang
In-Reply-To: <20260423034024.14404-1-zaiyuwang@trustnetic.com>

This series fixes several issues found on Wangxun Emerald, Sapphire and
Amber-lite NICs, with a focus on link-related problems.
---
v7:
- Fixed inverted semantics of is_flat_mem to match SFF8636
---
v6:
- Fixed more issues identified by AI review
---
v5:
- Fixed issues identified by AI review
---
v4:
- Fixed issues identified by devtools scripts
---
v3:
- Addressed Stephen's comments
---
v2:
- Fixed compilation error and code style issues
---

Zaiyu Wang (21):
  net/txgbe: remove duplicate xstats counters
  net/ngbe: remove duplicate xstats counters
  net/ngbe: add missing CDR config for YT PHY
  net/ngbe: fix VF promiscuous and allmulticast
  net/txgbe: fix inaccuracy in Tx rate limiting
  net/txgbe: fix link status check condition
  net/txgbe: fix Tx desc free logic
  net/txgbe: fix link flow control registers for Amber-Lite
  net/txgbe: fix link flow control config for Sapphire
  net/txgbe: fix a mass of unknown interrupts
  net/txgbe: fix traffic class priority configuration
  net/txgbe: fix link stability for 25G NIC
  net/txgbe: fix link stability for 40G NIC
  net/txgbe: fix link stability for Amber-Lite backplane mode
  net/txgbe: fix FEC mode configuration on 25G NIC
  net/txgbe: fix SFP module identification
  net/txgbe: fix get module info operation
  net/txgbe: fix get EEPROM operation
  net/txgbe: fix to reset Tx write-back pointer
  net/txgbe: fix to enable Tx desc check
  net/txgbe: fix temperature track for AML NIC

 drivers/net/ngbe/base/ngbe_phy_yt.c       |    3 +
 drivers/net/ngbe/ngbe_ethdev.c            |    5 -
 drivers/net/ngbe/ngbe_ethdev_vf.c         |   11 +-
 drivers/net/txgbe/base/meson.build        |    2 +
 drivers/net/txgbe/base/txgbe.h            |    2 +
 drivers/net/txgbe/base/txgbe_aml.c        |  185 +-
 drivers/net/txgbe/base/txgbe_aml.h        |    6 +-
 drivers/net/txgbe/base/txgbe_aml40.c      |  114 +-
 drivers/net/txgbe/base/txgbe_aml40.h      |    6 +-
 drivers/net/txgbe/base/txgbe_dcb_hw.c     |    2 +-
 drivers/net/txgbe/base/txgbe_e56.c        | 3773 +++++++++++++++++++++
 drivers/net/txgbe/base/txgbe_e56.h        | 1753 ++++++++++
 drivers/net/txgbe/base/txgbe_e56_bp.c     | 2597 ++++++++++++++
 drivers/net/txgbe/base/txgbe_e56_bp.h     |  282 ++
 drivers/net/txgbe/base/txgbe_hw.c         |   54 +-
 drivers/net/txgbe/base/txgbe_hw.h         |    4 +-
 drivers/net/txgbe/base/txgbe_osdep.h      |    4 +
 drivers/net/txgbe/base/txgbe_phy.c        |  362 +-
 drivers/net/txgbe/base/txgbe_phy.h        |   46 +-
 drivers/net/txgbe/base/txgbe_regs.h       |   13 +-
 drivers/net/txgbe/base/txgbe_type.h       |   43 +-
 drivers/net/txgbe/txgbe_ethdev.c          |  472 ++-
 drivers/net/txgbe/txgbe_ethdev.h          |    7 +-
 drivers/net/txgbe/txgbe_rxtx.c            |  109 +-
 drivers/net/txgbe/txgbe_rxtx.h            |   36 +
 drivers/net/txgbe/txgbe_rxtx_vec_common.h |   17 +-
 26 files changed, 9464 insertions(+), 444 deletions(-)
 create mode 100644 drivers/net/txgbe/base/txgbe_e56.c
 create mode 100644 drivers/net/txgbe/base/txgbe_e56.h
 create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.c
 create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.h

-- 
2.21.0.windows.1


^ permalink raw reply

* RE: [PATCH v6 00/21] Wangxun Fixes
From: Zaiyu Wang @ 2026-06-17  1:38 UTC (permalink / raw)
  To: 'Stephen Hemminger'; +Cc: dev
In-Reply-To: <20260616144218.3bb7ea85@phoenix.local>



> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Wednesday, June 17, 2026 5:42 AM
> To: Zaiyu Wang <zaiyuwang@trustnetic.com>; Zaiyu Wang <zaiyuwang@trustnetic.com>
> Cc: dev@dpdk.org; dev@dpdk.org
> Subject: Re: [PATCH v6 00/21] Wangxun Fixes
> 
> On Tue, 16 Jun 2026 20:20:08 +0800
> Zaiyu Wang <zaiyuwang@trustnetic.com> wrote:
> 
> > This series fixes several issues found on Wangxun Emerald, Sapphire
> > and Amber-lite NICs, with a focus on link-related problems.
> > ---
> > v6:
> > - Fixed more issues identified by AI review
> > ---
> > v5:
> > - Fixed issues identified by AI review
> > ---
> > v4:
> > - Fixed issues identified by devtools scripts
> > ---
> > v3:
> > - Addressed Stephen's comments
> > ---
> > v2:
> > - Fixed compilation error and code style issues
> > ---
> >
> > Zaiyu Wang (21):
> >   net/txgbe: remove duplicate xstats counters
> >   net/ngbe: remove duplicate xstats counters
> >   net/ngbe: add missing CDR config for YT PHY
> >   net/ngbe: fix VF promiscuous and allmulticast
> >   net/txgbe: fix inaccuracy in Tx rate limiting
> >   net/txgbe: fix link status check condition
> >   net/txgbe: fix Tx desc free logic
> >   net/txgbe: fix link flow control registers for Amber-Lite
> >   net/txgbe: fix link flow control config for Sapphire
> >   net/txgbe: fix a mass of unknown interrupts
> >   net/txgbe: fix traffic class priority configuration
> >   net/txgbe: fix link stability for 25G NIC
> >   net/txgbe: fix link stability for 40G NIC
> >   net/txgbe: fix link stability for Amber-Lite backplane mode
> >   net/txgbe: fix FEC mode configuration on 25G NIC
> >   net/txgbe: fix SFP module identification
> >   net/txgbe: fix get module info operation
> >   net/txgbe: fix get EEPROM operation
> >   net/txgbe: fix to reset Tx write-back pointer
> >   net/txgbe: fix to enable Tx desc check
> >   net/txgbe: fix temperature track for AML NIC
> >
> >  drivers/net/ngbe/base/ngbe_phy_yt.c       |    3 +
> >  drivers/net/ngbe/ngbe_ethdev.c            |    5 -
> >  drivers/net/ngbe/ngbe_ethdev_vf.c         |   11 +-
> >  drivers/net/txgbe/base/meson.build        |    2 +
> >  drivers/net/txgbe/base/txgbe.h            |    2 +
> >  drivers/net/txgbe/base/txgbe_aml.c        |  185 +-
> >  drivers/net/txgbe/base/txgbe_aml.h        |    6 +-
> >  drivers/net/txgbe/base/txgbe_aml40.c      |  114 +-
> >  drivers/net/txgbe/base/txgbe_aml40.h      |    6 +-
> >  drivers/net/txgbe/base/txgbe_dcb_hw.c     |    2 +-
> >  drivers/net/txgbe/base/txgbe_e56.c        | 3773 +++++++++++++++++++++
> >  drivers/net/txgbe/base/txgbe_e56.h        | 1753 ++++++++++
> >  drivers/net/txgbe/base/txgbe_e56_bp.c     | 2597 ++++++++++++++
> >  drivers/net/txgbe/base/txgbe_e56_bp.h     |  282 ++
> >  drivers/net/txgbe/base/txgbe_hw.c         |   54 +-
> >  drivers/net/txgbe/base/txgbe_hw.h         |    4 +-
> >  drivers/net/txgbe/base/txgbe_osdep.h      |    4 +
> >  drivers/net/txgbe/base/txgbe_phy.c        |  362 +-
> >  drivers/net/txgbe/base/txgbe_phy.h        |   46 +-
> >  drivers/net/txgbe/base/txgbe_regs.h       |   13 +-
> >  drivers/net/txgbe/base/txgbe_type.h       |   43 +-
> >  drivers/net/txgbe/txgbe_ethdev.c          |  472 ++-
> >  drivers/net/txgbe/txgbe_ethdev.h          |    7 +-
> >  drivers/net/txgbe/txgbe_rxtx.c            |  109 +-
> >  drivers/net/txgbe/txgbe_rxtx.h            |   36 +
> >  drivers/net/txgbe/txgbe_rxtx_vec_common.h |   17 +-
> >  26 files changed, 9464 insertions(+), 444 deletions(-)  create mode
> > 100644 drivers/net/txgbe/base/txgbe_e56.c
> >  create mode 100644 drivers/net/txgbe/base/txgbe_e56.h
> >  create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.c
> >  create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.h
> >
> 
> AI review was mostly clean, do you want to change this one thing?
> 
> Review of [PATCH v6 00/21] net/{txgbe,ngbe} fixes from Zaiyu Wang
> 
> All findings from the v5 review are addressed:
> 
>   - 07/21: Tx desc free now loads the 32-bit headwb_mem atomically and
>     casts to uint16_t at the use site, removing the type pun.
>   - 14/21: kr_read_poll() switches from usleep() to usec_delay() to
>     match the rest of the txgbe base layer.
>   - 16/21: stray tab+space indent in txgbe_read_i2c_sff8636() is gone.
>   - 17/21: stray tab+space indent in txgbe_get_module_info() is gone.
>     TXGBE_SFF_DDM_IMPLEMENTED is now indented as a bit mask under the
>     8472_SWAP register definition, which reads correctly.
>   - 18/21: page is reset to 0 inside each iteration of the for loop so
>     it no longer accumulates across iterations.  The flat-memory check
>     is now driven by an explicit read of module byte 2 via
>     read_i2c_sff8636() into a local is_flat_mem flag, instead of
>     reading data[0x2] out of the caller's output buffer.  databyte is
>     cleared to 0 at the top of each loop body so the skipped-read
>     branch can no longer leak the previous iteration's value.
>   - 20/21: RTE_LIBRTE_SECURITY is corrected to RTE_LIB_SECURITY so the
>     !txq->using_ipsec guard is actually preprocessed in.
> 
> One small remaining comment on 18/21, not blocking.
> 
> Patch 18/21 (net/txgbe: fix get EEPROM operation)
> 
> Info: is_flat_mem has inverted semantics relative to its name, which is going to confuse
future
> maintainers.  SFF-8636 byte 2 bit 2
> ("Flat_mem") is 1 for flat-memory modules and 0 for paged-memory modules (this is the
same
> convention ethtool uses).  The new code treats the bit the opposite way:
> 
>   +	bool is_flat_mem = true;
>   ...
>   +		if (rdata & 0x4)
>   +			is_flat_mem = false;
>   ...
>   +			if (page == 0 || is_flat_mem) {
>   +				status = hw->phy.read_i2c_sff8636(hw, page, ...);
> 
> So when the module reports flat (bit set), is_flat_mem is set to false; when paged (bit
clear),
> is_flat_mem stays true.  The condition 'page == 0 || is_flat_mem' reads only page 0 in
the flat
> case and any page in the paged case, which is the right behavior, but the variable ends
up meaning
> "paged memory is OK to read" rather than "module is flat".  Suggest either:
> 
> 	bool is_flat_mem = false;
> 	if (rdata & 0x4)
> 		is_flat_mem = true;
> 	...
> 	if (page == 0 || !is_flat_mem) {
> 		...
> 	}
> 
> or renaming to allow_paged_reads (or similar) and keeping the existing default and
condition.
> 
> 
> 
Hi Stephen,
Thanks for your time. I will fix the issue as you suggested in v7 and send it shortly.



^ permalink raw reply

* [PATCH v5 6/6] net/gve: reconstruct HW timestamps from DQO
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

A full 64-bit NIC timestamp is periodically synced via an AdminQ
command and cached in the driver. In the RX datapath, this cached
value is used as a base to expand the 32-bit hardware timestamp into
a full 64-bit value, which is then stored in the mbuf's dynamic
timestamp field.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v4:
    - Clarify in GVE documentation that both read_clock ethdev op and
      per-packet RX timestamp offloading require the DQO queue format.

v2:
    - Scoped timestamp offload capability advertisement strictly
      to DQO queues.
    - Predicated capability advertisement directly on memzone
      allocation.
    - Initialized mbuf_timestamp_offset to -1.
    - Added blank line separating release notes.
---
 doc/guides/nics/features/gve.ini       |  1 +
 doc/guides/nics/gve.rst                | 20 ++++++++++++++++++++
 doc/guides/rel_notes/release_26_07.rst |  4 ++++
 drivers/net/gve/base/gve_desc_dqo.h    |  8 ++++++--
 drivers/net/gve/gve_ethdev.c           | 15 ++++++++++++++-
 drivers/net/gve/gve_ethdev.h           | 25 +++++++++++++++++++++++++
 drivers/net/gve/gve_rx_dqo.c           | 26 ++++++++++++++++++++++++++
 7 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/doc/guides/nics/features/gve.ini b/doc/guides/nics/features/gve.ini
index 89c97fd27a..117ad4fc65 100644
--- a/doc/guides/nics/features/gve.ini
+++ b/doc/guides/nics/features/gve.ini
@@ -13,6 +13,7 @@ RSS hash             = Y
 RSS key update       = Y
 RSS reta update      = Y
 L4 checksum offload  = Y
+Timestamp offload    = Y
 Basic stats          = Y
 FreeBSD              = Y
 Linux                = Y
diff --git a/doc/guides/nics/gve.rst b/doc/guides/nics/gve.rst
index be855b645d..b73375c5c9 100644
--- a/doc/guides/nics/gve.rst
+++ b/doc/guides/nics/gve.rst
@@ -72,6 +72,7 @@ Supported features of the GVE PMD are:
 - Tx UDP/TCP/SCTP Checksum
 - RSS hash configuration
 - RSS redirection table query and update
+- Timestamp offload
 
 Currently, only GQI_QPL and GQI_RDA queue format are supported in PMD.
 Jumbo Frame is not supported in PMD for now.
@@ -132,6 +133,25 @@ Security Protocols
 - Flow priorities are not supported (must be 0).
 - Masking is limited to full matches i.e. ``0x00...0`` or ``0xFF...F``.
 
+Timestamp Offload
+^^^^^^^^^^^^^^^^^
+
+The driver supports hardware-based packet timestamping on supported
+devices via the standard ``RTE_ETH_RX_OFFLOAD_TIMESTAMP`` offload capability.
+Both the ethdev ``.read_clock`` operation and per-packet RX timestamp offloading
+require the DQO queue format.
+
+**Limitations**
+
+- If the driver fails to fetch the NIC hardware clock for 7 consecutive periods,
+  the cached timestamp is marked as stale,
+  and the reconstructed timestamps are no longer propagated to the mbuf.
+- The timestamp reconstruction is only accurate
+  if the time between a packet's reception
+  and the last hardware clock sync is less than approximately 2 seconds.
+  The driver's internal clock sync period is set to respect this limitation.
+
+
 Device Reset
 ^^^^^^^^^^^^
 
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index 2e449d3ee8..73c9a36007 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -91,6 +91,10 @@ New Features
 
   Added network driver for the LinkData network adapters.
 
+* **Updated Google GVE net driver.**
+
+  * Added hardware timestamping support on DQO queues.
+
 * **Updated Intel iavf driver.**
 
   * Added support for transmitting LLDP packets based on mbuf packet type.
diff --git a/drivers/net/gve/base/gve_desc_dqo.h b/drivers/net/gve/base/gve_desc_dqo.h
index 71d9d60bb9..c1534959c2 100644
--- a/drivers/net/gve/base/gve_desc_dqo.h
+++ b/drivers/net/gve/base/gve_desc_dqo.h
@@ -226,7 +226,8 @@ struct gve_rx_compl_desc_dqo {
 
 	u8 status_error1;
 
-	__le16 reserved5;
+	u8 reserved5;
+	u8 ts_sub_nsecs_low;
 	__le16 buf_id; /* Buffer ID which was sent on the buffer queue. */
 
 	union {
@@ -237,9 +238,12 @@ struct gve_rx_compl_desc_dqo {
 	};
 	__le32 hash;
 	__le32 reserved6;
-	__le64 reserved7;
+	__le32 reserved7;
+	__le32 ts; /* timestamp in nanosecs */
 } __packed;
 
+#define GVE_DQO_RX_HWTSTAMP_VALID 0x1
+
 GVE_CHECK_STRUCT_LEN(32, gve_rx_compl_desc_dqo);
 
 /* Ringing the doorbell too often can hurt performance.
diff --git a/drivers/net/gve/gve_ethdev.c b/drivers/net/gve/gve_ethdev.c
index 111f66efa8..0b02dcb3ad 100644
--- a/drivers/net/gve/gve_ethdev.c
+++ b/drivers/net/gve/gve_ethdev.c
@@ -214,6 +214,7 @@ static int
 gve_dev_configure(struct rte_eth_dev *dev)
 {
 	struct gve_priv *priv = dev->data->dev_private;
+	int err;
 
 	if (dev->data->dev_conf.rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG) {
 		dev->data->dev_conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
@@ -223,13 +224,22 @@ gve_dev_configure(struct rte_eth_dev *dev)
 	if (dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_TCP_LRO)
 		priv->enable_rsc = 1;
 
+	if (dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) {
+		err = rte_mbuf_dyn_rx_timestamp_register(&priv->mbuf_timestamp_offset,
+							 &priv->mbuf_timestamp_mask);
+		if (err < 0) {
+			PMD_DRV_LOG(ERR, "Failed to register dynamic timestamp field");
+			return err;
+		}
+	}
+
 	/* Reset RSS RETA in case number of queues changed. */
 	if (priv->rss_config.indir) {
 		struct gve_rss_config update_reta_config;
 		gve_init_rss_config_from_priv(priv, &update_reta_config);
 		gve_generate_rss_reta(dev, &update_reta_config);
 
-		int err = gve_adminq_configure_rss(priv, &update_reta_config);
+		err = gve_adminq_configure_rss(priv, &update_reta_config);
 		if (err)
 			PMD_DRV_LOG(ERR,
 				"Could not reconfigure RSS redirection table.");
@@ -828,6 +838,8 @@ gve_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
 
 	dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_RSS_HASH;
+	if (!gve_is_gqi(priv) && priv->nic_ts_report_mz)
+		dev_info->rx_offload_capa |= RTE_ETH_RX_OFFLOAD_TIMESTAMP;
 	dev_info->tx_offload_capa =
 		RTE_ETH_TX_OFFLOAD_MULTI_SEGS	|
 		RTE_ETH_TX_OFFLOAD_UDP_CKSUM	|
@@ -1683,6 +1695,7 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
 	pthread_mutex_init(&priv->nic_ts_lock, &mutexattr);
 	pthread_mutexattr_destroy(&mutexattr);
 
+	priv->mbuf_timestamp_offset = -1;
 	err = gve_init_priv(priv, false);
 	if (err) {
 		pthread_mutex_destroy(&priv->flow_rule_lock);
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 1e80f3a906..4a7e5ecdf3 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -261,6 +261,7 @@ struct gve_rx_queue {
 	struct rte_mbuf **refill_bufs;
 
 	uint8_t is_gqi_qpl;
+	bool timestamp_enabled;
 };
 
 struct gve_flow {
@@ -372,8 +373,32 @@ struct gve_priv {
 	RTE_ATOMIC(uint8_t) nic_ts_stale;
 	rte_thread_t nic_ts_thread_id;
 	RTE_ATOMIC(uint8_t) nic_ts_thread_should_stop;
+
+	int mbuf_timestamp_offset;
+	uint64_t mbuf_timestamp_mask;
 };
 
+/* Expand the hardware timestamp to the full 64 bits of width.
+ *
+ * This algorithm works by using the passed hardware timestamp to generate a
+ * diff relative to the last read of the nic clock. This diff can be positive or
+ * negative, as it is possible that we have read the clock more recently than
+ * the hardware has received this packet. To detect this, we use the high bit of
+ * the diff, and assume that the read is more recent if the high bit is set. In
+ * this case we invert the process.
+ *
+ * Note that this means if the time delta between packet reception and the last
+ * clock read is greater than ~2 seconds, this will provide invalid results.
+ */
+static inline uint64_t
+gve_reconstruct_ts(uint64_t last_sync, uint32_t ts)
+{
+	uint32_t low = (uint32_t)last_sync;
+	int32_t diff = (int32_t)(ts - low);
+
+	return last_sync + diff;
+}
+
 static inline bool
 gve_is_gqi(struct gve_priv *priv)
 {
diff --git a/drivers/net/gve/gve_rx_dqo.c b/drivers/net/gve/gve_rx_dqo.c
index 8035aee572..cc343f3fd8 100644
--- a/drivers/net/gve/gve_rx_dqo.c
+++ b/drivers/net/gve/gve_rx_dqo.c
@@ -160,6 +160,8 @@ gve_rx_burst_dqo(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 {
 	volatile struct gve_rx_compl_desc_dqo *rx_desc;
 	struct gve_rx_queue *rxq;
+	uint64_t last_sync = 0;
+	struct gve_priv *priv;
 	struct rte_mbuf *rxm;
 	uint16_t rx_buf_id;
 	uint16_t pkt_len;
@@ -171,6 +173,15 @@ gve_rx_burst_dqo(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 	nb_rx = 0;
 	rxq = rx_queue;
 	rx_id = rxq->rx_tail;
+	priv = rxq->hw;
+
+	if (rxq->timestamp_enabled &&
+	    !rte_atomic_load_explicit(&priv->nic_ts_stale,
+				      rte_memory_order_acquire)) {
+		last_sync =
+			rte_atomic_load_explicit(&priv->last_read_nic_timestamp,
+						 rte_memory_order_relaxed);
+	}
 
 	while (nb_rx < nb_pkts) {
 		rx_desc = &rxq->compl_ring[rx_id];
@@ -208,6 +219,16 @@ gve_rx_burst_dqo(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		gve_parse_csum_ol_flags(rxm, rx_desc);
 		rxm->hash.rss = rte_le_to_cpu_32(rx_desc->hash);
 
+		if (last_sync != 0 &&
+		    (rx_desc->ts_sub_nsecs_low & GVE_DQO_RX_HWTSTAMP_VALID) &&
+		    priv->mbuf_timestamp_offset >= 0) {
+			uint32_t ts = rte_le_to_cpu_32(rx_desc->ts);
+			uint64_t full_ts = gve_reconstruct_ts(last_sync, ts);
+
+			*RTE_MBUF_DYNFIELD(rxm, priv->mbuf_timestamp_offset, uint64_t *) = full_ts;
+			rxm->ol_flags |= priv->mbuf_timestamp_mask;
+		}
+
 		rx_pkts[nb_rx++] = rxm;
 		bytes += pkt_len;
 	}
@@ -320,6 +341,11 @@ gve_rx_queue_setup_dqo(struct rte_eth_dev *dev, uint16_t queue_id,
 		return -ENOMEM;
 	}
 
+	/* Setup hardware timestamping if enabled */
+	if ((conf->offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP) ||
+	    (dev->data->dev_conf.rxmode.offloads & RTE_ETH_RX_OFFLOAD_TIMESTAMP))
+		rxq->timestamp_enabled = true;
+
 	/* check free_thresh here */
 	free_thresh = conf->rx_free_thresh ?
 			conf->rx_free_thresh : GVE_DEFAULT_RX_FREE_THRESH;
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 5/6] net/gve: support read clock ethdev op
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Implement the read_clock operation in eth_dev_ops. The function calls
the AdminQ command to fetch the current NIC timestamp synchronously,
updates the cached timestamp used for reconstruction, and returns the
full 64-bit value.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v4:
    - Fix mutex initialization order: initialize nic_ts_lock before
      calling gve_init_priv() (which invokes early setup-time read).
    - Fix mutex teardown order: destroy nic_ts_lock only after the
      background sync thread has been joined and stopped.

v3:
    - Add mutex lock to protect shared NIC timestamp memzone access.
    - Fix missing read_clock assignment to DQO queue ops table
      (accidental omission in v2).

v2:
    - Scoped read_clock ethdev operation strictly to DQO queues.
---
 drivers/net/gve/gve_ethdev.c | 57 +++++++++++++++++++++++++++++++-----
 drivers/net/gve/gve_ethdev.h |  1 +
 2 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/drivers/net/gve/gve_ethdev.c b/drivers/net/gve/gve_ethdev.c
index fa91575b3e..111f66efa8 100644
--- a/drivers/net/gve/gve_ethdev.c
+++ b/drivers/net/gve/gve_ethdev.c
@@ -463,11 +463,13 @@ gve_read_nic_clock(void *arg)
 	if (!priv || !priv->nic_ts_report_mz)
 		return;
 
+	pthread_mutex_lock(&priv->nic_ts_lock);
 	memset(priv->nic_ts_report, 0, sizeof(struct gve_nic_ts_report));
 
 	err = gve_adminq_report_nic_timestamp(priv, priv->nic_ts_report_mz->iova);
 	if (err == 0) {
 		ts = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+		pthread_mutex_unlock(&priv->nic_ts_lock);
 		rte_atomic_store_explicit(&priv->last_read_nic_timestamp, ts,
 					  rte_memory_order_relaxed);
 		PMD_DRV_LOG(DEBUG, "Fetched NIC Timestamp: %" PRIu64, ts);
@@ -476,6 +478,7 @@ gve_read_nic_clock(void *arg)
 		rte_atomic_store_explicit(&priv->nic_ts_stale, 0,
 					  rte_memory_order_release);
 	} else {
+		pthread_mutex_unlock(&priv->nic_ts_lock);
 		PMD_DRV_LOG(ERR, "Failed to read NIC clock, AQ err: %d", err);
 		fails = rte_atomic_fetch_add_explicit(&priv->nic_ts_read_fails, 1,
 						      rte_memory_order_relaxed) + 1;
@@ -705,12 +708,13 @@ gve_dev_close(struct rte_eth_dev *dev)
 	if (gve_get_flow_subsystem_ok(priv))
 		gve_teardown_flow_subsystem(priv);
 
-	pthread_mutex_destroy(&priv->flow_rule_lock);
-
 	gve_free_queues(dev);
 	gve_teardown_device_resources(priv);
 	gve_adminq_free(priv);
 
+	pthread_mutex_destroy(&priv->flow_rule_lock);
+	pthread_mutex_destroy(&priv->nic_ts_lock);
+
 	dev->data->mac_addrs = NULL;
 
 	return err;
@@ -1278,6 +1282,38 @@ gve_flow_ops_get(struct rte_eth_dev *dev, const struct rte_flow_ops **ops)
 	return 0;
 }
 
+static int
+gve_read_clock(struct rte_eth_dev *dev, uint64_t *clock)
+{
+	struct gve_priv *priv = dev->data->dev_private;
+	uint64_t ts;
+	int err;
+
+	if (!priv->nic_timestamp_supported)
+		return -EOPNOTSUPP;
+
+	if (!priv->nic_ts_report_mz)
+		return -EIO;
+
+	pthread_mutex_lock(&priv->nic_ts_lock);
+	err = gve_adminq_report_nic_timestamp(priv, priv->nic_ts_report_mz->iova);
+	if (err != 0) {
+		pthread_mutex_unlock(&priv->nic_ts_lock);
+		return err;
+	}
+
+	ts = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+	pthread_mutex_unlock(&priv->nic_ts_lock);
+	*clock = ts;
+
+	/* Update the cached value */
+	rte_atomic_store_explicit(&priv->last_read_nic_timestamp, ts, rte_memory_order_relaxed);
+	rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0, rte_memory_order_relaxed);
+	rte_atomic_store_explicit(&priv->nic_ts_stale, 0, rte_memory_order_release);
+
+	return 0;
+}
+
 static const struct eth_dev_ops gve_eth_dev_ops = {
 	.dev_configure        = gve_dev_configure,
 	.dev_start            = gve_dev_start,
@@ -1332,6 +1368,7 @@ static const struct eth_dev_ops gve_eth_dev_ops_dqo = {
 	.rss_hash_conf_get    = gve_rss_hash_conf_get,
 	.reta_update          = gve_rss_reta_update,
 	.reta_query           = gve_rss_reta_query,
+	.read_clock           = gve_read_clock,
 };
 
 static int
@@ -1640,9 +1677,18 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
 	priv->max_nb_txq = max_tx_queues;
 	priv->max_nb_rxq = max_rx_queues;
 
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&priv->flow_rule_lock, &mutexattr);
+	pthread_mutex_init(&priv->nic_ts_lock, &mutexattr);
+	pthread_mutexattr_destroy(&mutexattr);
+
 	err = gve_init_priv(priv, false);
-	if (err)
+	if (err) {
+		pthread_mutex_destroy(&priv->flow_rule_lock);
+		pthread_mutex_destroy(&priv->nic_ts_lock);
 		return err;
+	}
 
 	if (gve_is_gqi(priv)) {
 		eth_dev->dev_ops = &gve_eth_dev_ops;
@@ -1656,11 +1702,6 @@ gve_dev_init(struct rte_eth_dev *eth_dev)
 
 	eth_dev->data->mac_addrs = &priv->dev_addr;
 
-	pthread_mutexattr_init(&mutexattr);
-	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
-	pthread_mutex_init(&priv->flow_rule_lock, &mutexattr);
-	pthread_mutexattr_destroy(&mutexattr);
-
 	return 0;
 }
 
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 4dcbaa9971..1e80f3a906 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -366,6 +366,7 @@ struct gve_priv {
 	bool nic_timestamp_supported;
 	const struct rte_memzone *nic_ts_report_mz;
 	struct gve_nic_ts_report *nic_ts_report;
+	pthread_mutex_t nic_ts_lock;
 	RTE_ATOMIC(uint64_t) last_read_nic_timestamp;
 	RTE_ATOMIC(uint32_t) nic_ts_read_fails;
 	RTE_ATOMIC(uint8_t) nic_ts_stale;
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 4/6] net/gve: add periodic NIC clock synchronization
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Introduce a mechanism to periodically fetch the NIC hardware timestamp
using the GVE_ADMINQ_REPORT_NIC_TIMESTAMP AdminQ command. The
synchronization runs every 250ms via a dedicated background control thread
(gve_nic_ts_thread). The thread polls using an incremental sleep loop
checking a stop flag to ensure fast termination during driver teardown.
After 7 consecutive clock read failures, the timestamp is marked as stale,
indicating to the RX path that reconstructed timestamps may be unreliable.

Atomics exist because of the potential for async callers (introduced
here) and async callers (introduced later in the RX datapath) accessing
the cached state.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v5:
    - Reword commit message body to accurately describe the dedicated control
      thread polling mechanism instead of obsolete rte_alarm.

v4:
    - Replace EAL alarm thread with a dedicated control thread to
      prevent blocking shared EAL interrupt thread for up to 2s
      on GVE AdminQ timeouts.
    - Implement incremental sleep loop to support fast termination.

v2:
    - Removed redundant void* casts.
    - Handled alarm reschedule failures by marking timestamp stale.
    - Added transient error logging on memzone allocation failure.
---
 drivers/net/gve/gve_ethdev.c | 122 +++++++++++++++++++++++++++++++++++
 drivers/net/gve/gve_ethdev.h |  12 ++++
 2 files changed, 134 insertions(+)

diff --git a/drivers/net/gve/gve_ethdev.c b/drivers/net/gve/gve_ethdev.c
index 476b2c311f..fa91575b3e 100644
--- a/drivers/net/gve/gve_ethdev.c
+++ b/drivers/net/gve/gve_ethdev.c
@@ -452,6 +452,94 @@ gve_dev_start(struct rte_eth_dev *dev)
 	return 0;
 }
 
+static void
+gve_read_nic_clock(void *arg)
+{
+	struct gve_priv *priv = arg;
+	uint32_t fails;
+	uint64_t ts;
+	int err;
+
+	if (!priv || !priv->nic_ts_report_mz)
+		return;
+
+	memset(priv->nic_ts_report, 0, sizeof(struct gve_nic_ts_report));
+
+	err = gve_adminq_report_nic_timestamp(priv, priv->nic_ts_report_mz->iova);
+	if (err == 0) {
+		ts = be64_to_cpu(priv->nic_ts_report->nic_timestamp);
+		rte_atomic_store_explicit(&priv->last_read_nic_timestamp, ts,
+					  rte_memory_order_relaxed);
+		PMD_DRV_LOG(DEBUG, "Fetched NIC Timestamp: %" PRIu64, ts);
+		rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0,
+					  rte_memory_order_relaxed);
+		rte_atomic_store_explicit(&priv->nic_ts_stale, 0,
+					  rte_memory_order_release);
+	} else {
+		PMD_DRV_LOG(ERR, "Failed to read NIC clock, AQ err: %d", err);
+		fails = rte_atomic_fetch_add_explicit(&priv->nic_ts_read_fails, 1,
+						      rte_memory_order_relaxed) + 1;
+		if (fails >= GVE_NIC_CLOCK_READ_MAX_FAILS) {
+			if (!rte_atomic_load_explicit(&priv->nic_ts_stale,
+						      rte_memory_order_relaxed))
+				PMD_DRV_LOG(ERR,
+					"NIC timestamping marked as stale after %u consecutive failures",
+					GVE_NIC_CLOCK_READ_MAX_FAILS);
+			rte_atomic_store_explicit(&priv->nic_ts_stale, 1,
+						  rte_memory_order_release);
+		}
+	}
+}
+
+static uint32_t
+gve_nic_ts_thread(void *arg)
+{
+	struct gve_priv *priv = arg;
+	unsigned int sleep_cnt;
+
+	while (!rte_atomic_load_explicit(&priv->nic_ts_thread_should_stop,
+					 rte_memory_order_relaxed)) {
+		gve_read_nic_clock(priv);
+		for (sleep_cnt = 0; sleep_cnt < GVE_NIC_CLOCK_READ_PERIOD_MS / 10; sleep_cnt++) {
+			if (rte_atomic_load_explicit(&priv->nic_ts_thread_should_stop,
+						     rte_memory_order_relaxed))
+				break;
+			rte_delay_us_sleep(10000);
+		}
+	}
+	return 0;
+}
+
+static int
+gve_alloc_nic_ts_report(struct gve_priv *priv)
+{
+	char z_name[RTE_MEMZONE_NAMESIZE];
+
+	snprintf(z_name, sizeof(z_name), "gve_%s_nic_ts_report",
+		 priv->pci_dev->device.name);
+	priv->nic_ts_report_mz = rte_memzone_reserve_aligned(z_name,
+			sizeof(struct gve_nic_ts_report), rte_socket_id(),
+			RTE_MEMZONE_IOVA_CONTIG, PAGE_SIZE);
+
+	if (!priv->nic_ts_report_mz) {
+		PMD_DRV_LOG(ERR, "Failed to allocate memzone for NIC TS report");
+		return -ENOMEM;
+	}
+	priv->nic_ts_report = priv->nic_ts_report_mz->addr;
+	rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0, rte_memory_order_relaxed);
+	return 0;
+}
+
+static void
+gve_free_nic_ts_report(struct gve_priv *priv)
+{
+	if (priv->nic_ts_report_mz) {
+		rte_memzone_free(priv->nic_ts_report_mz);
+		priv->nic_ts_report_mz = NULL;
+		priv->nic_ts_report = NULL;
+	}
+}
+
 static int
 gve_dev_stop(struct rte_eth_dev *dev)
 {
@@ -586,6 +674,13 @@ gve_teardown_device_resources(struct gve_priv *priv)
 				err);
 	}
 
+	if (priv->nic_ts_report_mz) {
+		rte_atomic_store_explicit(&priv->nic_ts_thread_should_stop, 1,
+					  rte_memory_order_relaxed);
+		rte_thread_join(priv->nic_ts_thread_id, NULL);
+		gve_free_nic_ts_report(priv);
+	}
+
 	gve_free_ptype_lut_dqo(priv);
 	gve_free_counter_array(priv);
 	gve_free_irq_db(priv);
@@ -1252,6 +1347,32 @@ pci_dev_msix_vec_count(struct rte_pci_device *pdev)
 	return 0;
 }
 
+static void
+gve_setup_nic_timestamp(struct gve_priv *priv)
+{
+	int err;
+
+	if (!priv->nic_timestamp_supported)
+		return;
+
+	rte_atomic_store_explicit(&priv->nic_ts_read_fails, 0, rte_memory_order_relaxed);
+	rte_atomic_store_explicit(&priv->nic_ts_stale, 1, rte_memory_order_relaxed);
+	err = gve_alloc_nic_ts_report(priv);
+	if (err == 0) {
+		gve_read_nic_clock(priv);
+		rte_atomic_store_explicit(&priv->nic_ts_thread_should_stop, 0,
+					  rte_memory_order_relaxed);
+		err = rte_thread_create_internal_control(&priv->nic_ts_thread_id, "gve-ts",
+							 gve_nic_ts_thread, priv);
+		if (err != 0) {
+			PMD_DRV_LOG(ERR, "Failed to create NIC clock sync thread, err=%d", err);
+			gve_free_nic_ts_report(priv);
+		}
+	} else {
+		PMD_DRV_LOG(ERR, "Failed to allocate memory for NIC timestamping subsystem. Please reset device to retry.");
+	}
+}
+
 static int
 gve_setup_device_resources(struct gve_priv *priv)
 {
@@ -1307,6 +1428,7 @@ gve_setup_device_resources(struct gve_priv *priv)
 			goto free_ptype_lut;
 		}
 	}
+	gve_setup_nic_timestamp(priv);
 
 	gve_set_device_resources_ok(priv);
 
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index b67f82c263..4dcbaa9971 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -12,6 +12,8 @@
 #include <rte_pci.h>
 #include <pthread.h>
 #include <rte_bitmap.h>
+#include <rte_memzone.h>
+#include <rte_thread.h>
 
 #include "base/gve.h"
 
@@ -39,6 +41,9 @@
 #define GVE_RSS_HASH_KEY_SIZE 40
 #define GVE_RSS_INDIR_SIZE 128
 
+#define GVE_NIC_CLOCK_READ_PERIOD_MS 250
+#define GVE_NIC_CLOCK_READ_MAX_FAILS 7
+
 #define GVE_TX_CKSUM_OFFLOAD_MASK (		\
 		RTE_MBUF_F_TX_L4_MASK  |	\
 		RTE_MBUF_F_TX_TCP_SEG)
@@ -359,6 +364,13 @@ struct gve_priv {
 
 	/* HW Timestamping Fields */
 	bool nic_timestamp_supported;
+	const struct rte_memzone *nic_ts_report_mz;
+	struct gve_nic_ts_report *nic_ts_report;
+	RTE_ATOMIC(uint64_t) last_read_nic_timestamp;
+	RTE_ATOMIC(uint32_t) nic_ts_read_fails;
+	RTE_ATOMIC(uint8_t) nic_ts_stale;
+	rte_thread_t nic_ts_thread_id;
+	RTE_ATOMIC(uint8_t) nic_ts_thread_should_stop;
 };
 
 static inline bool
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 3/6] net/gve: add AdminQ command for NIC timestamps
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Introduce the necessary definitions and functions for the
GVE_ADMINQ_REPORT_NIC_TIMESTAMP AdminQ command.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v2:
    - Added adminq timestamp counter reset to gve_adminq_alloc.
---
 drivers/net/gve/base/gve_adminq.c | 20 ++++++++++++++++++++
 drivers/net/gve/base/gve_adminq.h | 20 ++++++++++++++++++++
 drivers/net/gve/gve_ethdev.h      |  1 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 1ced1e442e..2b25c7f390 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -263,6 +263,8 @@ int gve_adminq_alloc(struct gve_priv *priv)
 	priv->adminq_get_ptype_map_cnt = 0;
 	priv->adminq_cfg_flow_rule_cnt = 0;
 
+	priv->adminq_report_nic_timestamp_cnt = 0;
+
 	pthread_mutexattr_init(&mutexattr);
 	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
 	pthread_mutex_init(&priv->adminq_lock, &mutexattr);
@@ -522,6 +524,10 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
 	case GVE_ADMINQ_CONFIGURE_FLOW_RULE:
 		priv->adminq_cfg_flow_rule_cnt++;
 		break;
+	case GVE_ADMINQ_REPORT_NIC_TIMESTAMP:
+		priv->adminq_report_nic_timestamp_cnt++;
+		break;
+
 	default:
 		PMD_DRV_LOG(ERR, "unknown AQ command opcode %d", opcode);
 	}
@@ -636,6 +642,20 @@ int gve_adminq_reset_flow_rules(struct gve_priv *priv)
 	return gve_adminq_configure_flow_rule(priv, &flow_rule_cmd);
 }
 
+int gve_adminq_report_nic_timestamp(struct gve_priv *priv, dma_addr_t nic_ts_report_addr)
+{
+	union gve_adminq_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.opcode = cpu_to_be32(GVE_ADMINQ_REPORT_NIC_TIMESTAMP);
+	cmd.report_nic_timestamp = (struct gve_adminq_report_nic_timestamp) {
+		.nic_ts_report_len = cpu_to_be64(sizeof(struct gve_nic_ts_report)),
+		.nic_timestamp_addr = cpu_to_be64(nic_ts_report_addr),
+	};
+
+	return gve_adminq_execute_cmd(priv, &cmd);
+}
+
 /* The device specifies that the management vector can either be the first irq
  * or the last irq. ntfy_blk_msix_base_idx indicates the first irq assigned to
  * the ntfy blks. It if is 0 then the management vector is last, if it is 1 then
diff --git a/drivers/net/gve/base/gve_adminq.h b/drivers/net/gve/base/gve_adminq.h
index eaee5649f2..954be39fbf 100644
--- a/drivers/net/gve/base/gve_adminq.h
+++ b/drivers/net/gve/base/gve_adminq.h
@@ -26,6 +26,7 @@ enum gve_adminq_opcodes {
 	GVE_ADMINQ_REPORT_LINK_SPEED		= 0xD,
 	GVE_ADMINQ_GET_PTYPE_MAP		= 0xE,
 	GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY	= 0xF,
+	GVE_ADMINQ_REPORT_NIC_TIMESTAMP		= 0x11,
 	/* For commands that are larger than 56 bytes */
 	GVE_ADMINQ_EXTENDED_COMMAND		= 0xFF,
 };
@@ -373,6 +374,23 @@ struct gve_stats_report {
 
 GVE_CHECK_STRUCT_LEN(8, gve_stats_report);
 
+struct gve_adminq_report_nic_timestamp {
+	__be64 nic_ts_report_len;
+	__be64 nic_timestamp_addr;
+};
+
+GVE_CHECK_STRUCT_LEN(16, gve_adminq_report_nic_timestamp);
+
+struct gve_nic_ts_report {
+	__be64 nic_timestamp; /* NIC clock in nanoseconds */
+	__be64 pre_cycles; /* System cycle counter before NIC clock read */
+	__be64 post_cycles; /* System cycle counter after NIC clock read */
+	__be64 reserved3;
+	__be64 reserved4;
+};
+
+GVE_CHECK_STRUCT_LEN(40, gve_nic_ts_report);
+
 /* Numbers of gve tx/rx stats in stats report. */
 #define GVE_TX_STATS_REPORT_NUM        6
 #define GVE_RX_STATS_REPORT_NUM        2
@@ -490,6 +508,7 @@ union gve_adminq_command {
 			struct gve_adminq_verify_driver_compatibility
 				verify_driver_compatibility;
 			struct gve_adminq_extended_command extended_command;
+			struct gve_adminq_report_nic_timestamp report_nic_timestamp;
 		};
 	};
 	u8 reserved[64];
@@ -537,5 +556,6 @@ int gve_adminq_add_flow_rule(struct gve_priv *priv,
 			     struct gve_flow_rule_params *rule, u32 loc);
 int gve_adminq_del_flow_rule(struct gve_priv *priv, u32 loc);
 int gve_adminq_reset_flow_rules(struct gve_priv *priv);
+int gve_adminq_report_nic_timestamp(struct gve_priv *priv, dma_addr_t nic_ts_report_addr);
 
 #endif /* _GVE_ADMINQ_H */
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index b9b4688367..b67f82c263 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -328,6 +328,7 @@ struct gve_priv {
 	uint32_t adminq_get_ptype_map_cnt;
 	uint32_t adminq_verify_driver_compatibility_cnt;
 	uint32_t adminq_cfg_flow_rule_cnt;
+	uint32_t adminq_report_nic_timestamp_cnt;
 	volatile uint32_t state_flags;
 
 	/* Gvnic device link speed from hypervisor. */
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 2/6] net/gve: add device option support for HW timestamps
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Introduce the necessary definitions and functions for the device
option flag (GVE_DEV_OPT_ID_NIC_TIMESTAMP) to detect hardware
timestamping support in the gvnic device.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
 drivers/net/gve/base/gve_adminq.c | 41 ++++++++++++++++++++++++++-----
 drivers/net/gve/base/gve_adminq.h |  9 +++++++
 drivers/net/gve/gve_ethdev.h      |  3 +++
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 743ab8e7ae..1ced1e442e 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -38,7 +38,8 @@ void gve_parse_device_option(struct gve_priv *priv,
 			     struct gve_device_option_dqo_rda **dev_op_dqo_rda,
 			     struct gve_device_option_flow_steering **dev_op_flow_steering,
 			     struct gve_device_option_modify_ring **dev_op_modify_ring,
-			     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
+			     struct gve_device_option_jumbo_frames **dev_op_jumbo_frames,
+			     struct gve_device_option_nic_timestamp **dev_op_nic_timestamp)
 {
 	u32 req_feat_mask = be32_to_cpu(option->required_features_mask);
 	u16 option_length = be16_to_cpu(option->option_length);
@@ -168,6 +169,24 @@ void gve_parse_device_option(struct gve_priv *priv,
 		}
 		*dev_op_jumbo_frames = RTE_PTR_ADD(option, sizeof(*option));
 		break;
+	case GVE_DEV_OPT_ID_NIC_TIMESTAMP:
+		if (option_length < sizeof(**dev_op_nic_timestamp) ||
+		    req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP) {
+			PMD_DRV_LOG(WARNING, GVE_DEVICE_OPTION_ERROR_FMT,
+				    "Nic Timestamp",
+				    (int)sizeof(**dev_op_nic_timestamp),
+				    GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP,
+				    option_length, req_feat_mask);
+			break;
+		}
+
+		if (option_length > sizeof(**dev_op_nic_timestamp)) {
+			PMD_DRV_LOG(WARNING,
+				    GVE_DEVICE_OPTION_TOO_BIG_FMT,
+				    "Nic Timestamp");
+		}
+		*dev_op_nic_timestamp = RTE_PTR_ADD(option, sizeof(*option));
+		break;
 	default:
 		/* If we don't recognize the option just continue
 		 * without doing anything.
@@ -186,7 +205,8 @@ gve_process_device_options(struct gve_priv *priv,
 			   struct gve_device_option_dqo_rda **dev_op_dqo_rda,
 			   struct gve_device_option_flow_steering **dev_op_flow_steering,
 			   struct gve_device_option_modify_ring **dev_op_modify_ring,
-			   struct gve_device_option_jumbo_frames **dev_op_jumbo_frames)
+			   struct gve_device_option_jumbo_frames **dev_op_jumbo_frames,
+			   struct gve_device_option_nic_timestamp **dev_op_nic_timestamp)
 {
 	const int num_options = be16_to_cpu(descriptor->num_device_options);
 	struct gve_device_option *dev_opt;
@@ -207,7 +227,8 @@ gve_process_device_options(struct gve_priv *priv,
 		gve_parse_device_option(priv, dev_opt,
 					dev_op_gqi_rda, dev_op_gqi_qpl,
 					dev_op_dqo_rda, dev_op_flow_steering,
-					dev_op_modify_ring, dev_op_jumbo_frames);
+					dev_op_modify_ring, dev_op_jumbo_frames,
+					dev_op_nic_timestamp);
 		dev_opt = next_opt;
 	}
 
@@ -920,7 +941,8 @@ static void gve_enable_supported_features(struct gve_priv *priv,
 	u32 supported_features_mask,
 	const struct gve_device_option_flow_steering *dev_op_flow_steering,
 	const struct gve_device_option_modify_ring *dev_op_modify_ring,
-	const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames)
+	const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames,
+	const struct gve_device_option_nic_timestamp *dev_op_nic_timestamp)
 {
 	if (dev_op_flow_steering &&
 	    (supported_features_mask & GVE_SUP_FLOW_STEERING_MASK) &&
@@ -947,6 +969,11 @@ static void gve_enable_supported_features(struct gve_priv *priv,
 		PMD_DRV_LOG(INFO, "JUMBO FRAMES device option enabled.");
 		priv->max_mtu = be16_to_cpu(dev_op_jumbo_frames->max_mtu);
 	}
+	if (dev_op_nic_timestamp &&
+	    (supported_features_mask & GVE_SUP_NIC_TIMESTAMP_MASK)) {
+		PMD_DRV_LOG(INFO, "NIC TIMESTAMP device option enabled.");
+		priv->nic_timestamp_supported = true;
+	}
 }
 
 int gve_adminq_describe_device(struct gve_priv *priv)
@@ -954,6 +981,7 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 	struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL;
 	struct gve_device_option_modify_ring *dev_op_modify_ring = NULL;
 	struct gve_device_option_flow_steering *dev_op_flow_steering = NULL;
+	struct gve_device_option_nic_timestamp *dev_op_nic_timestamp = NULL;
 	struct gve_device_option_gqi_rda *dev_op_gqi_rda = NULL;
 	struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL;
 	struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL;
@@ -983,7 +1011,8 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 					 &dev_op_gqi_qpl, &dev_op_dqo_rda,
 					 &dev_op_flow_steering,
 					 &dev_op_modify_ring,
-					 &dev_op_jumbo_frames);
+					 &dev_op_jumbo_frames,
+					 &dev_op_nic_timestamp);
 	if (err)
 		goto free_device_descriptor;
 
@@ -1038,7 +1067,7 @@ int gve_adminq_describe_device(struct gve_priv *priv)
 
 	gve_enable_supported_features(priv, supported_features_mask,
 				      dev_op_flow_steering, dev_op_modify_ring,
-				      dev_op_jumbo_frames);
+				      dev_op_jumbo_frames, dev_op_nic_timestamp);
 
 free_device_descriptor:
 	gve_free_dma_mem(&descriptor_dma_mem);
diff --git a/drivers/net/gve/base/gve_adminq.h b/drivers/net/gve/base/gve_adminq.h
index d8e5e6a352..eaee5649f2 100644
--- a/drivers/net/gve/base/gve_adminq.h
+++ b/drivers/net/gve/base/gve_adminq.h
@@ -153,6 +153,12 @@ struct gve_device_option_jumbo_frames {
 
 GVE_CHECK_STRUCT_LEN(8, gve_device_option_jumbo_frames);
 
+struct gve_device_option_nic_timestamp {
+	__be32 supported_features_mask;
+};
+
+GVE_CHECK_STRUCT_LEN(4, gve_device_option_nic_timestamp);
+
 /* Terminology:
  *
  * RDA - Raw DMA Addressing - Buffers associated with SKBs are directly DMA
@@ -169,6 +175,7 @@ enum gve_dev_opt_id {
 	GVE_DEV_OPT_ID_MODIFY_RING = 0x6,
 	GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8,
 	GVE_DEV_OPT_ID_FLOW_STEERING = 0xb,
+	GVE_DEV_OPT_ID_NIC_TIMESTAMP = 0xd,
 };
 
 enum gve_dev_opt_req_feat_mask {
@@ -179,12 +186,14 @@ enum gve_dev_opt_req_feat_mask {
 	GVE_DEV_OPT_REQ_FEAT_MASK_FLOW_STEERING = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0,
+	GVE_DEV_OPT_REQ_FEAT_MASK_NIC_TIMESTAMP = 0x0,
 };
 
 enum gve_sup_feature_mask {
 	GVE_SUP_MODIFY_RING_MASK = 1 << 0,
 	GVE_SUP_JUMBO_FRAMES_MASK = 1 << 2,
 	GVE_SUP_FLOW_STEERING_MASK = 1 << 5,
+	GVE_SUP_NIC_TIMESTAMP_MASK = 1 << 8,
 };
 
 #define GVE_DEV_OPT_LEN_GQI_RAW_ADDRESSING 0x0
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 524e48e723..b9b4688367 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -355,6 +355,9 @@ struct gve_priv {
 	void *avail_flow_rule_bmp_mem; /* Backing memory for the bitmap */
 	pthread_mutex_t flow_rule_lock; /* Lock for bitmap and tailq access */
 	TAILQ_HEAD(, gve_flow) active_flows;
+
+	/* HW Timestamping Fields */
+	bool nic_timestamp_supported;
 };
 
 static inline bool
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 1/6] net/gve: add thread safety to admin queue
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260617000712.2195506-1-blasko@google.com>

Introduce a pthread_mutex to protect the admin queue operations.
Locking was added around gve_adminq_execute_cmd and the batch
queue creation/destruction functions.

Signed-off-by: Mark Blasko <blasko@google.com>
Reviewed-by: Joshua Washington <joshwash@google.com>
Reviewed-by: Jasper Tran O'Leary <jtranoleary@google.com>
---
v2:
    - Dropped ROBUST mutex attribute.
---
 .mailmap                          |  1 +
 drivers/net/gve/base/gve_adminq.c | 67 +++++++++++++++++++++++++------
 drivers/net/gve/gve_ethdev.h      |  1 +
 3 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/.mailmap b/.mailmap
index e052b85213..1b10cfca35 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1016,6 +1016,7 @@ Mario Carrillo <mario.alfredo.c.arevalo@intel.com>
 Mário Kuka <kuka@cesnet.cz>
 Mariusz Drost <mariuszx.drost@intel.com>
 Mark Asselstine <mark.asselstine@windriver.com>
+Mark Blasko <blasko@google.com>
 Mark Bloch <mbloch@nvidia.com> <markb@mellanox.com>
 Mark Gillott <mgillott@vyatta.att-mail.com>
 Mark Kavanagh <mark.b.kavanagh@intel.com>
diff --git a/drivers/net/gve/base/gve_adminq.c b/drivers/net/gve/base/gve_adminq.c
index 9c5316fb00..743ab8e7ae 100644
--- a/drivers/net/gve/base/gve_adminq.c
+++ b/drivers/net/gve/base/gve_adminq.c
@@ -216,6 +216,7 @@ gve_process_device_options(struct gve_priv *priv,
 
 int gve_adminq_alloc(struct gve_priv *priv)
 {
+	pthread_mutexattr_t mutexattr;
 	uint8_t pci_rev_id;
 
 	priv->adminq = gve_alloc_dma_mem(&priv->adminq_dma_mem, PAGE_SIZE);
@@ -241,6 +242,11 @@ int gve_adminq_alloc(struct gve_priv *priv)
 	priv->adminq_get_ptype_map_cnt = 0;
 	priv->adminq_cfg_flow_rule_cnt = 0;
 
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&priv->adminq_lock, &mutexattr);
+	pthread_mutexattr_destroy(&mutexattr);
+
 	/* Setup Admin queue with the device */
 	rte_pci_read_config(priv->pci_dev, &pci_rev_id, sizeof(pci_rev_id),
 			    RTE_PCI_REVISION_ID);
@@ -304,6 +310,7 @@ void gve_adminq_free(struct gve_priv *priv)
 		return;
 	gve_adminq_release(priv);
 	gve_free_dma_mem(&priv->adminq_dma_mem);
+	pthread_mutex_destroy(&priv->adminq_lock);
 	gve_clear_admin_queue_ok(priv);
 }
 
@@ -418,7 +425,10 @@ static int gve_adminq_issue_cmd(struct gve_priv *priv,
 	    (tail & priv->adminq_mask)) {
 		int err;
 
-		/* Flush existing commands to make room. */
+		/* Flush existing commands to make room.
+		 * Note: This kicks the doorbell for all staged commands.
+		 * Any failure here means we failed after attempting to kick.
+		 */
 		err = gve_adminq_kick_and_wait(priv);
 		if (err)
 			return err;
@@ -509,17 +519,24 @@ static int gve_adminq_execute_cmd(struct gve_priv *priv,
 	u32 tail, head;
 	int err;
 
+	pthread_mutex_lock(&priv->adminq_lock);
 	tail = ioread32be(&priv->reg_bar0->adminq_event_counter);
 	head = priv->adminq_prod_cnt;
-	if (tail != head)
+	if (tail != head) {
 		/* This is not a valid path */
-		return -EINVAL;
+		err = -EINVAL;
+		goto unlock_and_return;
+	}
 
 	err = gve_adminq_issue_cmd(priv, cmd_orig);
 	if (err)
-		return err;
+		goto unlock_and_return;
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_execute_extended_cmd(struct gve_priv *priv, u32 opcode,
@@ -693,13 +710,19 @@ int gve_adminq_create_tx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_create_tx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_create_rx_queue(struct gve_priv *priv, u32 queue_index)
@@ -747,13 +770,19 @@ int gve_adminq_create_rx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_create_rx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_destroy_tx_queue(struct gve_priv *priv, u32 queue_index)
@@ -779,13 +808,19 @@ int gve_adminq_destroy_tx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_destroy_tx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_adminq_destroy_rx_queue(struct gve_priv *priv, u32 queue_index)
@@ -811,13 +846,19 @@ int gve_adminq_destroy_rx_queues(struct gve_priv *priv, u32 num_queues)
 	int err;
 	u32 i;
 
+	pthread_mutex_lock(&priv->adminq_lock);
+
 	for (i = 0; i < num_queues; i++) {
 		err = gve_adminq_destroy_rx_queue(priv, i);
 		if (err)
-			return err;
+			goto unlock_and_return;
 	}
 
-	return gve_adminq_kick_and_wait(priv);
+	err = gve_adminq_kick_and_wait(priv);
+
+unlock_and_return:
+	pthread_mutex_unlock(&priv->adminq_lock);
+	return err;
 }
 
 static int gve_set_desc_cnt(struct gve_priv *priv,
diff --git a/drivers/net/gve/gve_ethdev.h b/drivers/net/gve/gve_ethdev.h
index 0577f03974..524e48e723 100644
--- a/drivers/net/gve/gve_ethdev.h
+++ b/drivers/net/gve/gve_ethdev.h
@@ -339,6 +339,7 @@ struct gve_priv {
 	struct gve_tx_queue **txqs;
 	struct gve_rx_queue **rxqs;
 
+	pthread_mutex_t adminq_lock; /* Protects AdminQ command execution */
 	uint32_t stats_report_len;
 	const struct rte_memzone *stats_report_mem;
 	uint16_t stats_start_idx; /* start index of array of stats written by NIC */
-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply related

* [PATCH v5 0/6] net/gve: add hardware timestamping support
From: Mark Blasko @ 2026-06-17  0:07 UTC (permalink / raw)
  To: stephen; +Cc: dev, joshwash, jtranoleary, blasko
In-Reply-To: <20260613042300.3760470-1-blasko@google.com>

This patch series introduces support for GVE hardware timestamping
on DQO queues. To support concurrent access, a mutex lock is introduced
to protect admin queue operations. A mechanism is then added to
periodically synchronize the NIC clock via a dedicated control thread,
and support is introduced for the read_clock ethdev operation.
Finally, the RX datapath is updated to reconstruct full 64-bit
timestamps from the 32-bit values in DQO descriptors.

---
v5:
- Patch 4: Reword commit message to describe control thread implementation.

v4:
- Patch 4: Offload periodic NIC clock reads to a dedicated control thread.
- Patch 5: Correct mutex initialization and teardown ordering.
- Patch 6: Update GVE documentation to clarify queue format requirements.

v3:
- Patch 5:
  - Add mutex lock to protect shared NIC timestamp memzone access.
  - Fix missing read_clock assignment to DQO queue ops table
    (accidental omission in v2).

v2:
- Patch 1: Dropped ROBUST mutex attribute.
- Patch 3: Added adminq timestamp counter reset to gve_adminq_alloc.
- Patch 4:
  - Removed redundant void* casts.
  - Handled alarm reschedule failures by marking timestamp stale.
  - Added transient error logging on memzone allocation failure.
- Patch 5: Scoped read_clock ethdev operation strictly to DQO queues.
- Patch 6:
  - Scoped timestamp offload capability advertisement strictly to
    DQO queues.
  - Predicated capability advertisement directly on memzone
    allocation.
  - Initialized mbuf_timestamp_offset to -1.
  - Added blank line separating release notes.
---

Mark Blasko (6):
  net/gve: add thread safety to admin queue
  net/gve: add device option support for HW timestamps
  net/gve: add AdminQ command for NIC timestamps
  net/gve: add periodic NIC clock synchronization
  net/gve: support read clock ethdev op
  net/gve: reconstruct HW timestamps from DQO

 .mailmap                               |   1 +
 doc/guides/nics/features/gve.ini       |   1 +
 doc/guides/nics/gve.rst                |  20 +++
 doc/guides/rel_notes/release_26_07.rst |   4 +
 drivers/net/gve/base/gve_adminq.c      | 128 +++++++++++++---
 drivers/net/gve/base/gve_adminq.h      |  29 ++++
 drivers/net/gve/base/gve_desc_dqo.h    |   8 +-
 drivers/net/gve/gve_ethdev.c           | 194 +++++++++++++++++++++++--
 drivers/net/gve/gve_ethdev.h           |  43 ++++++
 drivers/net/gve/gve_rx_dqo.c           |  26 ++++
 10 files changed, 424 insertions(+), 30 deletions(-)

-- 
2.54.0.1189.g8c84645362-goog


^ permalink raw reply

* [PATCH] net/bnxt: avoid link flap on flow control set
From: Mohammad Shuab Siddique @ 2026-06-16 23:23 UTC (permalink / raw)
  To: dev; +Cc: kishore.padmanabha, stable, Chenna Arnoori

From: Chenna Arnoori <chenna.arnoori@broadcom.com>

When OVS-DPDK reconciles port state, it calls flow_ctrl_get followed
by flow_ctrl_set if the returned configuration differs from its
database. The driver was reporting autoneg=1 whenever the firmware
had auto_pause set (including the AUTONEG_PAUSE bit 0x4), even
though no pause autoneg was actually requested. This mismatch
caused OVS to repeatedly call flow_ctrl_set, which triggered a
full link reconfig with PHY reset, flapping the link every time
any interface change occurred on the system.

Two problems were fixed. First, flow_ctrl_set was clearing all
autoneg bits instead of only the flow-control autoneg bit,
which also wiped the speed autoneg state. Second, the pause
set path was abandoning its own HWRM request and calling the
full link config function, which built a separate request
without the pause fields that set_pause_common would have
computed.

Port the kernel bnxt_en driver's approach: add a
set_link_common helper that merges link and speed fields into
an existing HWRM request, and rework set_pause to build a
single combined request with both pause and link fields when
a full reconfig is needed. When only pause changes without an
auto-to-force transition, a pause-only PHY config is sent
without a full link reprogram.

Fixes: 8aaf473bbed6 ("net/bnxt: add flow control operations")
Cc: stable@dpdk.org
Signed-off-by: Chenna Arnoori <chenna.arnoori@broadcom.com>
Signed-off-by: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
---
 drivers/net/bnxt/bnxt.h        |   4 +
 drivers/net/bnxt/bnxt_ethdev.c |  12 ++-
 drivers/net/bnxt/bnxt_hwrm.c   | 148 +++++++++++++++++++++++++++++++++
 drivers/net/bnxt/bnxt_hwrm.h   |   1 +
 4 files changed, 163 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bnxt/bnxt.h b/drivers/net/bnxt/bnxt.h
index 7515f0564f..066a6fd478 100644
--- a/drivers/net/bnxt/bnxt.h
+++ b/drivers/net/bnxt/bnxt.h
@@ -333,6 +333,10 @@ struct bnxt_link_info {
 	uint8_t                 active_lanes;
 	uint8_t			option_flags;
 	uint16_t                pmd_speed_lanes;
+	uint8_t			autoneg;
+#define BNXT_AUTONEG_SPEED		1
+#define BNXT_AUTONEG_FLOW_CTRL		2
+	bool			force_link_chng;
 };
 
 #define BNXT_COS_QUEUE_COUNT	8
diff --git a/drivers/net/bnxt/bnxt_ethdev.c b/drivers/net/bnxt/bnxt_ethdev.c
index 1b8cf3a52a..23a6b193b7 100644
--- a/drivers/net/bnxt/bnxt_ethdev.c
+++ b/drivers/net/bnxt/bnxt_ethdev.c
@@ -2565,7 +2565,7 @@ static int bnxt_flow_ctrl_get_op(struct rte_eth_dev *dev,
 		return rc;
 
 	memset(fc_conf, 0, sizeof(*fc_conf));
-	if (bp->link_info->auto_pause)
+	if (bp->link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
 		fc_conf->autoneg = 1;
 	switch (bp->link_info->pause) {
 	case 0:
@@ -2601,6 +2601,14 @@ static int bnxt_flow_ctrl_set_op(struct rte_eth_dev *dev,
 		return -ENOTSUP;
 	}
 
+	if (fc_conf->autoneg) {
+		bp->link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
+	} else {
+		if (bp->link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)
+			bp->link_info->force_link_chng = true;
+		bp->link_info->autoneg &= ~BNXT_AUTONEG_FLOW_CTRL;
+	}
+
 	switch (fc_conf->mode) {
 	case RTE_ETH_FC_NONE:
 		bp->link_info->auto_pause = 0;
@@ -2642,7 +2650,7 @@ static int bnxt_flow_ctrl_set_op(struct rte_eth_dev *dev,
 		}
 		break;
 	}
-	return bnxt_set_hwrm_link_config(bp, true);
+	return bnxt_hwrm_set_pause(bp);
 }
 
 /* Add UDP tunneling port */
diff --git a/drivers/net/bnxt/bnxt_hwrm.c b/drivers/net/bnxt/bnxt_hwrm.c
index 0c82935de9..e333b3d182 100644
--- a/drivers/net/bnxt/bnxt_hwrm.c
+++ b/drivers/net/bnxt/bnxt_hwrm.c
@@ -1876,6 +1876,21 @@ static int bnxt_hwrm_port_phy_qcfg(struct bnxt *bp,
 	link_info->auto_pause = resp->auto_pause;
 	link_info->force_pause = resp->force_pause;
 	link_info->auto_mode = resp->auto_mode;
+
+	if (link_info->auto_mode != HWRM_PORT_PHY_QCFG_OUTPUT_AUTO_MODE_NONE) {
+		link_info->autoneg = BNXT_AUTONEG_SPEED;
+		if (bp->hwrm_spec_code >= 0x10201) {
+			if (link_info->auto_pause &
+			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_AUTONEG_PAUSE)
+				link_info->autoneg |=
+					BNXT_AUTONEG_FLOW_CTRL;
+		} else {
+			link_info->autoneg |= BNXT_AUTONEG_FLOW_CTRL;
+		}
+	} else {
+		link_info->autoneg = 0;
+	}
+
 	link_info->phy_type = resp->phy_type;
 	link_info->media_type = resp->media_type;
 
@@ -4048,6 +4063,139 @@ static int bnxt_hwrm_port_phy_cfg_v2(struct bnxt *bp, struct bnxt_link_info *con
 	return rc;
 }
 
+static void
+bnxt_hwrm_set_pause_common(struct bnxt *bp,
+			   struct hwrm_port_phy_cfg_input *req)
+{
+	struct bnxt_link_info *link_info = bp->link_info;
+
+	if (link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) {
+		if (bp->hwrm_spec_code >= 0x10201)
+			req->auto_pause =
+			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_AUTONEG_PAUSE;
+		if (link_info->auto_pause &
+		    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_RX)
+			req->auto_pause |=
+			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_RX;
+		if (link_info->auto_pause &
+		    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_TX)
+			req->auto_pause |=
+			    HWRM_PORT_PHY_CFG_INPUT_AUTO_PAUSE_TX;
+		req->enables |=
+			rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_PAUSE);
+	} else {
+		if (link_info->force_pause &
+		    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_RX)
+			req->force_pause |=
+			    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_RX;
+		if (link_info->force_pause &
+		    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_TX)
+			req->force_pause |=
+			    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAUSE_TX;
+		req->enables |=
+			rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_FORCE_PAUSE);
+		if (bp->hwrm_spec_code >= 0x10201) {
+			req->auto_pause = req->force_pause;
+			req->enables |=
+				rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_PAUSE);
+		}
+	}
+}
+
+static void
+bnxt_hwrm_set_link_common(struct bnxt *bp, struct hwrm_port_phy_cfg_input *req)
+{
+	struct rte_eth_conf *dev_conf = &bp->eth_dev->data->dev_conf;
+	uint16_t autoneg, speed;
+
+	autoneg = bnxt_check_eth_link_autoneg(dev_conf->link_speeds);
+
+	if (BNXT_CHIP_P5(bp) &&
+	    dev_conf->link_speeds & RTE_ETH_LINK_SPEED_40G)
+		autoneg = 0;
+
+	if (autoneg == 1 && BNXT_CHIP_P5(bp) &&
+	    bp->link_info->auto_mode == 0 &&
+	    bp->link_info->force_pam4_link_speed ==
+	    HWRM_PORT_PHY_CFG_INPUT_FORCE_PAM4_LINK_SPEED_200GB)
+		autoneg = 0;
+
+	speed = bnxt_parse_eth_link_speed(bp, dev_conf->link_speeds,
+					  bp->link_info);
+	req->flags |=
+		rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_RESET_PHY);
+
+	if (autoneg == 1 &&
+	    (bp->link_info->support_auto_speeds ||
+	     bp->link_info->support_pam4_auto_speeds)) {
+		uint16_t spd_mask;
+		uint32_t en;
+
+		req->flags |=
+			rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_RESTART_AUTONEG);
+		spd_mask = bnxt_parse_eth_link_speed_mask(bp,
+							  dev_conf->link_speeds);
+		req->auto_link_speed_mask = rte_cpu_to_le_16(spd_mask);
+		req->auto_link_pam4_speed_mask =
+			rte_cpu_to_le_16(bp->link_info->auto_pam4_link_speed_mask);
+		en = HWRM_PORT_PHY_CFG_IN_EN_AUTO_LINK_SPEED_MASK |
+		     HWRM_PORT_PHY_CFG_IN_EN_AUTO_PAM4_LINK_SPD_MASK |
+		     HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_MODE;
+		req->enables |= rte_cpu_to_le_32(en);
+		req->auto_mode =
+			HWRM_PORT_PHY_CFG_INPUT_AUTO_MODE_SPEED_MASK;
+	} else {
+		uint32_t en;
+
+		req->flags |=
+			rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_FLAGS_FORCE);
+		if (speed) {
+			if (bp->link_info->link_signal_mode) {
+				req->force_pam4_link_speed =
+					rte_cpu_to_le_16(speed);
+				en = HWRM_PORT_PHY_CFG_IN_EN_FORCE_PAM4_LINK_SPEED;
+				req->enables |= rte_cpu_to_le_32(en);
+			} else {
+				req->force_link_speed =
+					rte_cpu_to_le_16(speed);
+			}
+		}
+	}
+	req->auto_duplex =
+		bnxt_parse_eth_link_duplex(dev_conf->link_speeds);
+	req->enables |=
+		rte_cpu_to_le_32(HWRM_PORT_PHY_CFG_INPUT_ENABLES_AUTO_DUPLEX);
+}
+
+int bnxt_hwrm_set_pause(struct bnxt *bp)
+{
+	struct hwrm_port_phy_cfg_input req = {0};
+	struct hwrm_port_phy_cfg_output *resp = bp->hwrm_cmd_resp_addr;
+	struct bnxt_link_info *link_info = bp->link_info;
+	int rc;
+
+	HWRM_PREP(&req, HWRM_PORT_PHY_CFG, BNXT_USE_CHIMP_MB);
+
+	bnxt_hwrm_set_pause_common(bp, &req);
+
+	if ((link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL) ||
+	    link_info->force_link_chng)
+		bnxt_hwrm_set_link_common(bp, &req);
+
+	rc = bnxt_hwrm_send_message(bp, &req, sizeof(req), BNXT_USE_CHIMP_MB);
+
+	HWRM_CHECK_RESULT();
+
+	if (!rc && !(link_info->autoneg & BNXT_AUTONEG_FLOW_CTRL)) {
+		link_info->pause = link_info->force_pause;
+		link_info->auto_pause = 0;
+	}
+	link_info->force_link_chng = false;
+
+	HWRM_UNLOCK();
+	return rc;
+}
+
 static int bnxt_set_hwrm_link_config_v2(struct bnxt *bp, bool link_up)
 {
 	struct rte_eth_conf *dev_conf = &bp->eth_dev->data->dev_conf;
diff --git a/drivers/net/bnxt/bnxt_hwrm.h b/drivers/net/bnxt/bnxt_hwrm.h
index fc56223ab4..3034803023 100644
--- a/drivers/net/bnxt/bnxt_hwrm.h
+++ b/drivers/net/bnxt/bnxt_hwrm.h
@@ -261,6 +261,7 @@ void bnxt_free_hwrm_rx_ring(struct bnxt *bp, int queue_index);
 int bnxt_alloc_hwrm_resources(struct bnxt *bp);
 int bnxt_get_hwrm_link_config(struct bnxt *bp, struct rte_eth_link *link);
 int bnxt_set_hwrm_link_config(struct bnxt *bp, bool link_up);
+int bnxt_hwrm_set_pause(struct bnxt *bp);
 int bnxt_hwrm_func_qcfg(struct bnxt *bp, uint16_t *mtu);
 int bnxt_hwrm_func_resc_qcaps(struct bnxt *bp);
 int bnxt_hwrm_func_reserve_vf_resc(struct bnxt *bp, bool test);
-- 
2.47.3


^ permalink raw reply related

* DPDK next-net will closed for summer holidays in July
From: Stephen Hemminger @ 2026-06-16 22:50 UTC (permalink / raw)
  To: dev

I will not be doing any active review, merging, or contributing
to DPDK from July 4th - July 21st. If your patch is in patchwork
will get back to it after holiday (on pause); just don't panic if
no review or merge happens.

^ permalink raw reply

* Re: [PATCH v2] tools: AI review handle empty Error sections
From: Stephen Hemminger @ 2026-06-16 22:46 UTC (permalink / raw)
  To: Matthew Gee; +Cc: dev, aconole, lylavoie
In-Reply-To: <20260612190807.1020863-1-mgee@iol.unh.edu>

On Fri, 12 Jun 2026 15:08:07 -0400
Matthew Gee <mgee@iol.unh.edu> wrote:

> Previous review-patch.py would detect and report and error or warning
> based off of the occurrence of the headers of the error and warning
> sections. This led to consistent false positives as often AI reviewers
> will create the header but put "none" or similar filler text within
> the following body.
> 
> This patch updates the code in order to check if the AI review has a
> body with error or warnings to fix and not just filler text. This is
> done by querying multiple lines at once and adjusting the regex to
> filter out headers followed only by filler text or formatting in the
> review.
> 
> These changes were tested against 10+ AI review outputs with several
> variations in formatting and filler text in order to catch a good
> variety of cases to make sure code reviews with actual errors or
> warnings are caught and not missed.
> 
> Signed-off-by: Matthew Gee <mgee@iol.unh.edu>
> ---

AI saw some stuff which I missed.

Matthew,

The windowing (tee/islice/chain) is aligned correctly, but the new
matching has a few problems.

The bigger issue is that this only recognizes markdown '#' headers now.
The old scan matched an optional '#' prefix, '**' bold, and '<h1-3>' too.
rgx_should_match requires "#+\s", so plain-text "Errors:" / "**Errors**"
and HTML "<h2>Errors</h2>" no longer match. Default --format is text and
html is supported, so reviews in those formats classify clean (exit 0)
even when they report errors, which silently breaks the 2/3 exit codes
compare-patch-reviews.sh relies on. The '#'/'**'/'<h>' alternatives need
to stay.

The 3-line concatenation also reintroduces the false positive you're
trying to kill. curr+next+next_next are joined with no separator, so

	## Errors
	None
	## Warnings

collapses to "## errorsnone## warnings"; the 'none...$' filler anchor
fails because the next header follows, and has_errors gets set. Any
output that doesn't blank-line-separate sections hits this.

And in the other direction, content more than two lines below a header
is missed: two blank lines between "## Errors" and the body leave
stripped == "## errors", the '$' branch matches, and a real finding is
suppressed. The heuristic ends up sensitive to exact line spacing both
ways.

Minor: iter[str] / iter[str | None] aren't valid generics (use the
already-imported Iterator); the vars are also re-annotated with
conflicting types and annotate the iterators rather than the loop
targets. mypy will reject these. Run black too - the new for/zip line
and the stripped assignment are over width.

Suggest keeping header matching in all three formats, then scanning past
blank lines to the first non-empty body line and suppressing only if
it's filler (none/n/a/empty). That decouples detection from spacing
instead of fixing it to a 3-line shape.

^ permalink raw reply

* Re: [PATCH v2] devtools: add Vertex AI to review scripts
From: Stephen Hemminger @ 2026-06-16 22:44 UTC (permalink / raw)
  To: David Marchand; +Cc: dev, thomas, Aaron Conole
In-Reply-To: <20260602064406.1230601-1-david.marchand@redhat.com>

On Tue,  2 Jun 2026 08:44:06 +0200
David Marchand <david.marchand@redhat.com> wrote:

> OpenAI, xAI) can now use Vertex AI with Application Default Credentials.
> 
> This requires a python dependency google-auth but it is left as
> optional.
> 
> Key features:
> - Auto-detection of authentication method based on environment
> - Manual override via --auth flag (auto, direct, vertex)
> - Automatic model name translation for Vertex format
> - Support for both global and regional Vertex endpoints
> - Proper error handling for Vertex API responses
> 
> Provider-specific implementations:
> - Anthropic: Uses /publishers/anthropic/models/{model}:rawPredict
>   with model name format claude-sonnet-4-5@20250929
> - Google: Uses /publishers/google/models/{model}:generateContent
> - OpenAI/xAI: Use /endpoints/openapi/chat/completions
>   with publisher prefix (e.g., openai/gpt-oss-120b-maas)
> 
> Authentication detection logic:
> - Vertex: Requires google-auth library and ADC configured
> - Direct: Falls back to API key from environment variables
> 
> Available models on Vertex AI:
> - Anthropic: All Claude models
> - Google: All Gemini models
> - OpenAI: gpt-oss-120b-maas, gpt-oss-20b-maas (open-weight only)
> - xAI: grok-4.20-*, grok-4.1-fast-* variants
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---

I have no direct knowledge of Vertex pro/con.
AI review with better model had some insights.

On the whole the refactor is sound: moving "model" insertion into
_build_request_meta and threading an auth string instead of api_key is
consistent across _common.py, review-patch.py and review-doc.py, and the
build_*_request signature changes match their only callers. Two issues and
a couple of minor notes.

Warning: project-id resolution is split and the early error is unreachable-
proof in the wrong direction.

  get_vertex_credentials() does:

      credentials, project = google_auth_default()
      ...
      if not project:
          error("Could not detect GCP project. Set GOOGLE_CLOUD_PROJECT ...")
      return credentials.token, project

  but the caller then applies its own fallback:

      access_token, project_id = get_vertex_credentials()
      project_id = os.environ.get("GOOGLE_CLOUD_PROJECT") \
                   or os.environ.get("GCP_PROJECT") or project_id

  If ADC resolves no project, get_vertex_credentials() calls error() and
  exits before the GCP_PROJECT fallback can run. google.auth.default()
  honors GOOGLE_CLOUD_PROJECT itself, but not GCP_PROJECT, so the
  GCP_PROJECT branch here is dead in exactly the case it exists for, and
  the error message tells the user to set a variable that wouldn't have
  helped. Resolve the project in one place: drop the "if not project"
  error from get_vertex_credentials() and check after the env fallback,
  or move the env lookups into get_vertex_credentials().

Warning: several added lines exceed the 100-column limit.

      _common.py: error("Could not detect GCP project...")        137 cols
      _common.py: error("Vertex AI support requires...")          108 cols
      _common.py: project_id = os.environ.get(...) or ...         102 cols
      _common.py: vertex_base = f"https://aiplatform..."          104 cols
      _common.py: vertex_base = f"https://{location}-..."         115 cols

  pycodestyle/flake8 will flag all five; the project_id line is also
  black-splittable so "black --check" fails on it. Wrap the f-strings and
  split the message strings.

Info: detect_auth_method() catches bare Exception around
google_auth_default(). Acceptable for best-effort autodetection, but
narrowing it (or a "# noqa"/pylint disable with a reason) documents intent.

Info: the HTTPError path now does error_data[0].get('error', ...) after
unwrapping a list. If a Vertex error body is a list of non-dicts this
raises AttributeError outside the JSONDecodeError guard. Low risk, but a
type check would harden it.

Info: --auth is wired into review-patch.py and review-doc.py but
compare-patch-reviews.sh has no passthrough, so Vertex can't be selected
when comparing providers. May be intentional for this patch.

^ permalink raw reply

* Re: [RFC] devtools: add tool calling support to review-patch.py
From: Stephen Hemminger @ 2026-06-16 22:42 UTC (permalink / raw)
  To: Aaron Conole; +Cc: dev, David Marchand
In-Reply-To: <20260609182652.1053422-1-aconole@redhat.com>

On Tue,  9 Jun 2026 14:26:47 -0400
Aaron Conole <aconole@redhat.com> wrote:

> Add an iterative tool-use loop to review-patch.py for the Anthropic
> and OpenAI providers. The reviewer can now look up additional context
> from the DPDK source tree when the patch alone is insufficient,
> rather than having to guess at surrounding code, API contracts, or
> function signatures.
> 
> Tool calling is enabled by default with a limit of 10 rounds. Pass
> '--tool-rounds 0' to disable it and restore the previous single-shot
> behavior.  The round limit prevents runaway cost on large patches
> that when reached will force the model to deliver a final judgement.
> 
> Initial tool set:
>   - grep           Searches for regex across the file system with
>                    optional path restrictions and case-insensitive
>                    matches.
>   - file_read      Line range read of a specific path.
> 
> Both tools are limited to the repository root to prevent path
> traversal.  Path outputs are relative to the repo root.
> 
> The system prompt is extended when tool calling is active to
> encourage the model to use tools only when genuinely needed,
> keeping unnecessary round trips and token costs under control
> and to a minimum.
> 
> Internally, _common.py gains send_request_raw() (returning the
> raw response dict) so the tool-calling loops can inspect
> stop_reason / finish_reason before extracting text.
> 
> Signed-off-by: Aaron Conole <aconole@redhat.com>
> ---

Well AI saw the security bypass here...

Error
=====

devtools/ai/review-patch.py: path containment check is bypassable
(_tool_grep and _tool_file_read)

	repo_resolved = Path(repo_root).resolve()
	search_path = (repo_resolved / rel_path).resolve()
	if not str(search_path).startswith(str(repo_resolved)):
		return "Error: path is outside the repository"

str.startswith() on the resolved path string is a prefix match, not a
path-component match. If repo_root resolves to /home/me/dpdk, then
/home/me/dpdk-private/secrets passes the check, since the string starts
with "/home/me/dpdk". The commit message states both tools are "limited
to the repository root to prevent path traversal" -- that property does
not hold for sibling directories sharing the name prefix. Same bug in
both tool helpers.

	# is_relative_to (3.9+), raises nothing, no string games
	if not search_path.is_relative_to(repo_resolved):
		return "Error: path is outside the repository"

(Absolute rel_path and ../ escapes are already caught by resolve() +
this fix; only the prefix case is currently open.)

Warning
=======

devtools/ai/_common.py: unrelated reformatting mixed into a feature
patch. The docstring re-wrapping in print_token_summary(),
_extract_text(), _print_verbose_usage(), and the classify_review()
regex reflow in review-patch.py are cosmetic churn unrelated to tool
calling. Drop them or split into a separate cleanup; they inflate the
diff and obscure the actual change.

devtools/ai/review-patch.py: grep --include list excludes the files the
tool prompt points the model at. _tool_grep restricts to
*.[ch]/*.py/*.rst/*.ini, but TOOL_PROMPT_EXTENSION explicitly directs
the model to check ".ci/" scripts, meson.build, and MAINTAINERS. Those
are extensionless or shell, so grep silently returns "No matches found"
rather than surfacing the omission. Either widen the include set or
note the restriction in the tool description so the model doesn't draw
false conclusions from empty results.

Info
====

xai is OpenAI wire-compatible and already shares build_openai_request()
elsewhere, but tool calling is gated to provider == "openai" in
call_api() and the verbose banner. xai (and any future
OpenAI-compatible provider) falls back to single-shot with no notice.
Reusing call_api_with_tools_openai() for xai looks free.

send_request_raw() bypasses _extract_text(), which is where the
"error" in result check lived. The tool loops read stop_reason /
choices directly, so a provider error returned with HTTP 200 yields an
empty string and a misleading "clean" review instead of a hard failure.
Worth a guard on api_result.get("error") at the top of each loop body.

Default-on at 10 rounds is a behavior and cost change for every existing
caller, and grants repo-wide read to the model by default. Reasonable
for an RFC to propose, but worth calling out explicitly for the list --
some CI users may want opt-in rather than opt-out.

^ permalink raw reply

* Re: [PATCH 0/6] net/dpaa2: RSS fixes and improvements
From: Stephen Hemminger @ 2026-06-16 22:07 UTC (permalink / raw)
  To: Maxime Leroy; +Cc: dev
In-Reply-To: <20260616104717.723087-1-maxime@leroys.fr>

On Tue, 16 Jun 2026 12:47:09 +0200
Maxime Leroy <maxime@leroys.fr> wrote:

> A set of RSS fixes and improvements for the dpaa2 PMD.
> 
> Patches 1 and 2 fix the RSS hash key: the L4 destination port was never
> added (both extracts used the source port), and SCTP now uses the same L4
> port extraction as TCP/UDP. Both are tagged for stable.
> 
> Patches 3 and 4 are small cleanups in the key builder (a dead num_extracts
> increment, and the unset PPPoE guard flag).
> 
> Patch 5 honours RTE_ETH_RSS_LEVEL_INNERMOST so tunnelled traffic hashes on
> the inner IP header. Patch 6 implements reta_query / reta_update as an
> emulation over the HW distribution-size mechanism, since dpaa2 has no
> software-visible indirection table.
> 
> Tested on LX2160A (lx2160acex7).

Applied to next-net

^ permalink raw reply

* Re: [PATCH v6 00/21] Wangxun Fixes
From: Stephen Hemminger @ 2026-06-16 21:42 UTC (permalink / raw)
  To: Zaiyu Wang; +Cc: dev
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>

On Tue, 16 Jun 2026 20:20:08 +0800
Zaiyu Wang <zaiyuwang@trustnetic.com> wrote:

> This series fixes several issues found on Wangxun Emerald, Sapphire and
> Amber-lite NICs, with a focus on link-related problems.
> ---
> v6:
> - Fixed more issues identified by AI review
> ---
> v5:
> - Fixed issues identified by AI review
> ---
> v4:
> - Fixed issues identified by devtools scripts
> ---
> v3:
> - Addressed Stephen's comments
> ---
> v2:
> - Fixed compilation error and code style issues
> ---
> 
> Zaiyu Wang (21):
>   net/txgbe: remove duplicate xstats counters
>   net/ngbe: remove duplicate xstats counters
>   net/ngbe: add missing CDR config for YT PHY
>   net/ngbe: fix VF promiscuous and allmulticast
>   net/txgbe: fix inaccuracy in Tx rate limiting
>   net/txgbe: fix link status check condition
>   net/txgbe: fix Tx desc free logic
>   net/txgbe: fix link flow control registers for Amber-Lite
>   net/txgbe: fix link flow control config for Sapphire
>   net/txgbe: fix a mass of unknown interrupts
>   net/txgbe: fix traffic class priority configuration
>   net/txgbe: fix link stability for 25G NIC
>   net/txgbe: fix link stability for 40G NIC
>   net/txgbe: fix link stability for Amber-Lite backplane mode
>   net/txgbe: fix FEC mode configuration on 25G NIC
>   net/txgbe: fix SFP module identification
>   net/txgbe: fix get module info operation
>   net/txgbe: fix get EEPROM operation
>   net/txgbe: fix to reset Tx write-back pointer
>   net/txgbe: fix to enable Tx desc check
>   net/txgbe: fix temperature track for AML NIC
> 
>  drivers/net/ngbe/base/ngbe_phy_yt.c       |    3 +
>  drivers/net/ngbe/ngbe_ethdev.c            |    5 -
>  drivers/net/ngbe/ngbe_ethdev_vf.c         |   11 +-
>  drivers/net/txgbe/base/meson.build        |    2 +
>  drivers/net/txgbe/base/txgbe.h            |    2 +
>  drivers/net/txgbe/base/txgbe_aml.c        |  185 +-
>  drivers/net/txgbe/base/txgbe_aml.h        |    6 +-
>  drivers/net/txgbe/base/txgbe_aml40.c      |  114 +-
>  drivers/net/txgbe/base/txgbe_aml40.h      |    6 +-
>  drivers/net/txgbe/base/txgbe_dcb_hw.c     |    2 +-
>  drivers/net/txgbe/base/txgbe_e56.c        | 3773 +++++++++++++++++++++
>  drivers/net/txgbe/base/txgbe_e56.h        | 1753 ++++++++++
>  drivers/net/txgbe/base/txgbe_e56_bp.c     | 2597 ++++++++++++++
>  drivers/net/txgbe/base/txgbe_e56_bp.h     |  282 ++
>  drivers/net/txgbe/base/txgbe_hw.c         |   54 +-
>  drivers/net/txgbe/base/txgbe_hw.h         |    4 +-
>  drivers/net/txgbe/base/txgbe_osdep.h      |    4 +
>  drivers/net/txgbe/base/txgbe_phy.c        |  362 +-
>  drivers/net/txgbe/base/txgbe_phy.h        |   46 +-
>  drivers/net/txgbe/base/txgbe_regs.h       |   13 +-
>  drivers/net/txgbe/base/txgbe_type.h       |   43 +-
>  drivers/net/txgbe/txgbe_ethdev.c          |  472 ++-
>  drivers/net/txgbe/txgbe_ethdev.h          |    7 +-
>  drivers/net/txgbe/txgbe_rxtx.c            |  109 +-
>  drivers/net/txgbe/txgbe_rxtx.h            |   36 +
>  drivers/net/txgbe/txgbe_rxtx_vec_common.h |   17 +-
>  26 files changed, 9464 insertions(+), 444 deletions(-)
>  create mode 100644 drivers/net/txgbe/base/txgbe_e56.c
>  create mode 100644 drivers/net/txgbe/base/txgbe_e56.h
>  create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.c
>  create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.h
> 

AI review was mostly clean, do you want to change this one thing?

Review of [PATCH v6 00/21] net/{txgbe,ngbe} fixes from Zaiyu Wang

All findings from the v5 review are addressed:

  - 07/21: Tx desc free now loads the 32-bit headwb_mem atomically and
    casts to uint16_t at the use site, removing the type pun.
  - 14/21: kr_read_poll() switches from usleep() to usec_delay() to
    match the rest of the txgbe base layer.
  - 16/21: stray tab+space indent in txgbe_read_i2c_sff8636() is gone.
  - 17/21: stray tab+space indent in txgbe_get_module_info() is gone.
    TXGBE_SFF_DDM_IMPLEMENTED is now indented as a bit mask under the
    8472_SWAP register definition, which reads correctly.
  - 18/21: page is reset to 0 inside each iteration of the for loop so
    it no longer accumulates across iterations.  The flat-memory check
    is now driven by an explicit read of module byte 2 via
    read_i2c_sff8636() into a local is_flat_mem flag, instead of
    reading data[0x2] out of the caller's output buffer.  databyte is
    cleared to 0 at the top of each loop body so the skipped-read
    branch can no longer leak the previous iteration's value.
  - 20/21: RTE_LIBRTE_SECURITY is corrected to RTE_LIB_SECURITY so the
    !txq->using_ipsec guard is actually preprocessed in.

One small remaining comment on 18/21, not blocking.

Patch 18/21 (net/txgbe: fix get EEPROM operation)

Info: is_flat_mem has inverted semantics relative to its name, which
is going to confuse future maintainers.  SFF-8636 byte 2 bit 2
("Flat_mem") is 1 for flat-memory modules and 0 for paged-memory
modules (this is the same convention ethtool uses).  The new code
treats the bit the opposite way:

  +	bool is_flat_mem = true;
  ...
  +		if (rdata & 0x4)
  +			is_flat_mem = false;
  ...
  +			if (page == 0 || is_flat_mem) {
  +				status = hw->phy.read_i2c_sff8636(hw, page, ...);

So when the module reports flat (bit set), is_flat_mem is set to
false; when paged (bit clear), is_flat_mem stays true.  The condition
'page == 0 || is_flat_mem' reads only page 0 in the flat case and any
page in the paged case, which is the right behavior, but the variable
ends up meaning "paged memory is OK to read" rather than "module is
flat".  Suggest either:

	bool is_flat_mem = false;
	if (rdata & 0x4)
		is_flat_mem = true;
	...
	if (page == 0 || !is_flat_mem) {
		...
	}

or renaming to allow_paged_reads (or similar) and keeping the existing
default and condition.



^ permalink raw reply

* Re: [PATCH v10 1/1] net/mana: add device reset support
From: Stephen Hemminger @ 2026-06-16 21:22 UTC (permalink / raw)
  To: Wei Hu; +Cc: dev, longli, weh
In-Reply-To: <20260616123158.43583-2-weh@linux.microsoft.com>

On Tue, 16 Jun 2026 05:31:58 -0700
Wei Hu <weh@linux.microsoft.com> wrote:

> teardown immediately (dev_stop, secondary IPC, dev_close, MR cache
> free) before waiting for the hardware recovery timer to fire. This
> avoids blocking the EAL interrupt thread on multi-second IPC
> timeouts and ibverbs calls. After the recovery delay, the thread
> unregisters the interrupt handler, re-probes the PCI device,
> reinitializes MR caches, and restarts queues. Each function owns
> its own lock scope with no lock hand-off between threads.
> 
> Each queue has an atomic burst_state variable where bit 0 is the
> in-burst flag and bit 1 is a blocked flag. The data path uses a
> single compare-and-swap (0 to 1) to enter a burst, which fails
> immediately if the blocked bit is set. The reset path sets the
> blocked bit via atomic fetch-or and polls bit 0 to wait for
> in-flight bursts to drain. This single-variable design avoids the
> need for sequential consistency ordering.
> 
> A per-device mutex serializes the reset path with ethdev
> operations. The mutex uses PTHREAD_PROCESS_SHARED for multi-process
> support and is held across blocking IB verbs calls. A trylock
> helper encapsulates the lock acquisition and device state check
> for all ethdev operation wrappers. Operations that cannot wait
> (configure, queue setup) return -EBUSY during reset, while
> dev_stop and dev_close join the reset thread before acquiring
> the lock to ensure proper sequencing.
> 
> The reset thread keeps reset_thread_active true throughout its
> lifetime. mana_join_reset_thread uses rte_thread_equal to detect
> the self-join case (when a recovery callback calls dev_stop or
> dev_close from the reset thread itself) and calls
> rte_thread_detach instead of join, so thread resources are freed
> on exit. External callers join normally.
> 
> The condvar wait in the reset thread uses a predicate loop that
> checks dev_state under reset_cond_mutex, so a PCI remove signal
> that arrives before the thread enters the wait is not lost. The
> PCI remove callback sets dev_state to RESET_FAILED under the
> same mutex before signaling. A lock/unlock barrier on
> reset_ops_lock in the PCI remove path ensures teardown has
> completed before emitting the INTR_RMV event.
> 
> Multi-process support is included: secondary processes unmap and
> remap doorbell pages via IPC during the reset enter and exit
> phases. The secondary RESET_EXIT handler closes the received fd
> unconditionally after processing, even when the doorbell page is
> already mapped. Data path functions in both primary and secondary
> processes check the device state atomically and return early when
> the device is not active.
> 
> The driver emits RTE_ETH_EVENT_ERR_RECOVERING before entering the
> reset path so that upper layers (e.g. netvsc) can switch their
> data path before queues are stopped. The event is emitted outside
> the reset lock to avoid deadlock if the callback calls dev_stop or
> dev_close. On completion, the driver emits RECOVERY_SUCCESS or
> RECOVERY_FAILED after releasing the lock. If a recovery callback
> triggers dev_stop or dev_close, the self-join detection in
> mana_join_reset_thread detaches the thread to avoid deadlock. If
> the enter phase fails internally, RECOVERY_FAILED is sent
> immediately so the application receives a terminal event. A PCI
> device removal event callback distinguishes hot-remove from
> service reset.
> 
> Documentation for the device reset feature is added in the MANA
> NIC guide and the 26.07 release notes.
> 
> Signed-off-by: Wei Hu <weh@microsoft.com>
> ---
Applied to next-net

^ permalink raw reply

* [PATCH 6/6] app/test: add test for IP reassembly
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger
In-Reply-To: <20260616210656.464062-1-stephen@networkplumber.org>

There was no functional test for IPv4/IPv6 reassembly,
only a performance test. Add a new test  modeled on the Linux kernel
selftest tools/testing/selftests/net/ip_defrag.c. This is new
code so no license conflict.

Test covers:
size and fragment-size sweep across in-order, reverse,
odd/even and block delivery orders with byte-exact payload validation;
minimum 8-byte fragments; the fragment-count limit;
timeout of incomplete datagrams;
and the duplicate, overlap, extension-header and
oversized-fragment cases fixed earlier in this series.

The reassembled packet is checked with rte_mbuf_check() to catch
malformed segment chains in addition to wrong content.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 app/test/meson.build       |   1 +
 app/test/test_reassembly.c | 644 +++++++++++++++++++++++++++++++++++++
 2 files changed, 645 insertions(+)
 create mode 100644 app/test/test_reassembly.c

diff --git a/app/test/meson.build b/app/test/meson.build
index 61024125a7..b8c2208d0b 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -160,6 +160,7 @@ source_file_deps = {
     'test_rawdev.c': ['rawdev', 'bus_vdev', 'raw_skeleton'],
     'test_rcu_qsbr.c': ['rcu', 'hash'],
     'test_rcu_qsbr_perf.c': ['rcu', 'hash'],
+    'test_reassembly.c': ['net', 'ip_frag'],
     'test_reassembly_perf.c': ['net', 'ip_frag'],
     'test_reciprocal_division.c': [],
     'test_reciprocal_division_perf.c': [],
diff --git a/app/test/test_reassembly.c b/app/test/test_reassembly.c
new file mode 100644
index 0000000000..9cada5f3b4
--- /dev/null
+++ b/app/test/test_reassembly.c
@@ -0,0 +1,644 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2026
+ *
+ * Functional unit tests for the IP reassembly path of librte_ip_frag.
+ *
+ * Coverage mirrors the Linux selftest tools/testing/selftests/net/ip_defrag.c
+ * adapted to the library API and to DPDK-specific constraints:
+ *
+ *   - size / fragment-size sweep, bounded by RTE_LIBRTE_IP_FRAG_MAX_FRAG
+ *   - in-order, reverse, odd-then-even, and block-reordered delivery
+ *   - byte-exact validation of the reassembled payload (not just length)
+ *   - minimum (8-byte) fragments
+ *   - fragment-count boundary: exactly MAX reassembles, MAX + 1 fails
+ *   - incomplete datagram reaped on timeout
+ *   - zero-length fragment rejected
+ *   - duplicate fragment tolerated in a reordered set
+ *   - overlapping fragments (leading/trailing/contained) discarded
+ *   - IPv6 fragment with extension headers in the unfragmentable part dropped
+ *   - fragment whose end exceeds the maximum datagram size dropped
+ *
+ * The last four groups depend on the corresponding reassembly fixes
+ * (duplicate tolerance, overlap discard, extension-header drop, oversize
+ * drop); they pass once those are applied and fail on unpatched code. The
+ * remaining cases pass regardless.
+ *
+ * Fragments use l2_len == 0; the library reads the L3 header at offset 0.
+ */
+
+#include "test.h"
+
+#include <string.h>
+
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_ip.h>
+#include <rte_ip_frag.h>
+#include <rte_log.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+
+#define NB_MBUF		1024
+#define MBUF_CACHE	0		/* exact accounting for leak checks */
+#define MBUF_DATA	2048
+#define V4_L3_LEN	((uint16_t)sizeof(struct rte_ipv4_hdr))
+#define V6_L3_LEN	((uint16_t)(sizeof(struct rte_ipv6_hdr) + \
+				    RTE_IPV6_FRAG_HDR_SIZE))
+#define TEST_ID		0x4242
+
+#ifndef RTE_LIBRTE_IP_FRAG_MAX_FRAG
+#define RTE_LIBRTE_IP_FRAG_MAX_FRAG 8
+#endif
+#define MAX_FRAG	RTE_LIBRTE_IP_FRAG_MAX_FRAG
+
+#define MAX_PAYLOAD	(MAX_FRAG * 256)	/* keeps a fragment in one mbuf */
+
+enum family { V4, V6 };
+enum order  { IN_ORDER, REVERSE, ODD_EVEN, BLOCK };
+
+struct frag_desc {
+	uint16_t ofs;	/* byte offset into the payload */
+	uint16_t plen;	/* payload bytes after L3 */
+	uint8_t  mf;
+};
+
+static struct rte_mempool *pkt_pool;
+
+/* position-dependent payload pattern, non-periodic at 256 so a misordered
+ * reassembly is detected even when lengths line up.
+ */
+static inline uint8_t
+pat(uint32_t k)
+{
+	return (uint8_t)(k * 31u + 7u);
+}
+
+/* ------------------------------- harness -------------------------------- */
+
+static int
+testsuite_setup(void)
+{
+	/* the table create/destroy per case is chatty at INFO level */
+	rte_log_set_level_pattern("lib.ip_frag", RTE_LOG_NOTICE);
+
+	pkt_pool = rte_pktmbuf_pool_create("REASM_POOL", NB_MBUF, MBUF_CACHE,
+					   0, MBUF_DATA, SOCKET_ID_ANY);
+	return pkt_pool == NULL ? TEST_FAILED : TEST_SUCCESS;
+}
+
+static void
+testsuite_teardown(void)
+{
+	rte_mempool_free(pkt_pool);
+	pkt_pool = NULL;
+}
+
+/* Every case must start and end with a full pool, so a leak in one case is
+ * pinpointed here rather than silently masking the next one.
+ */
+static int
+ut_setup(void)
+{
+	if (rte_mempool_avail_count(pkt_pool) != NB_MBUF) {
+		printf("pool not full at case start: %u/%u\n",
+		       rte_mempool_avail_count(pkt_pool), NB_MBUF);
+		return TEST_FAILED;
+	}
+	return TEST_SUCCESS;
+}
+
+static struct rte_ip_frag_tbl *
+tbl_new(uint64_t max_cycles)
+{
+	return rte_ip_frag_table_create(16, MAX_FRAG, 16, max_cycles,
+					rte_socket_id());
+}
+
+/* Build one fragment with a position-dependent payload. */
+static struct rte_mbuf *
+build_frag(enum family fam, uint16_t ofs, uint16_t plen, uint8_t mf)
+{
+	struct rte_mbuf *m = rte_pktmbuf_alloc(pkt_pool);
+	uint16_t l3 = (fam == V4) ? V4_L3_LEN : V6_L3_LEN;
+	char *p;
+	uint16_t i;
+
+	if (m == NULL)
+		return NULL;
+	m->data_off = 0;
+
+	if (fam == V4) {
+		struct rte_ipv4_hdr *ip = rte_pktmbuf_mtod(m,
+						struct rte_ipv4_hdr *);
+		uint16_t fo = ofs / RTE_IPV4_HDR_OFFSET_UNITS;
+
+		memset(ip, 0, V4_L3_LEN);
+		if (mf)
+			fo |= RTE_IPV4_HDR_MF_FLAG;
+		ip->version_ihl = 0x45;
+		ip->total_length = rte_cpu_to_be_16(V4_L3_LEN + plen);
+		ip->packet_id = rte_cpu_to_be_16(TEST_ID);
+		ip->fragment_offset = rte_cpu_to_be_16(fo);
+		ip->time_to_live = 64;
+		ip->next_proto_id = IPPROTO_UDP;
+		ip->src_addr = rte_cpu_to_be_32(0x0a000001);
+		ip->dst_addr = rte_cpu_to_be_32(0x0a000002);
+	} else {
+		struct rte_ipv6_hdr *ip = rte_pktmbuf_mtod(m,
+						struct rte_ipv6_hdr *);
+		struct rte_ipv6_fragment_ext *fh =
+			rte_pktmbuf_mtod_offset(m,
+				struct rte_ipv6_fragment_ext *,
+				sizeof(struct rte_ipv6_hdr));
+
+		memset(ip, 0, V6_L3_LEN);
+		ip->vtc_flow = rte_cpu_to_be_32(6u << 28);
+		ip->payload_len = rte_cpu_to_be_16(RTE_IPV6_FRAG_HDR_SIZE + plen);
+		ip->proto = IPPROTO_FRAGMENT;
+		ip->hop_limits = 64;
+		ip->src_addr.a[15] = 1;
+		ip->dst_addr.a[15] = 2;
+		fh->next_header = IPPROTO_UDP;
+		fh->reserved = 0;
+		fh->frag_data = rte_cpu_to_be_16(
+				RTE_IPV6_SET_FRAG_DATA(ofs, mf ? 1 : 0));
+		fh->id = rte_cpu_to_be_32(TEST_ID);
+	}
+
+	p = rte_pktmbuf_mtod_offset(m, char *, l3);
+	for (i = 0; i < plen; i++)
+		p[i] = (char)pat(ofs + i);
+
+	m->data_len = m->pkt_len = l3 + plen;
+	m->l2_len = 0;
+	m->l3_len = l3;
+	return m;
+}
+
+static struct rte_mbuf *
+feed(enum family fam, struct rte_ip_frag_tbl *tbl,
+	struct rte_ip_frag_death_row *dr, const struct frag_desc *d, uint64_t tms)
+{
+	struct rte_mbuf *m = build_frag(fam, d->ofs, d->plen, d->mf);
+
+	if (m == NULL)
+		return NULL;
+	if (fam == V4) {
+		struct rte_ipv4_hdr *ip = rte_pktmbuf_mtod(m, struct rte_ipv4_hdr *);
+		return rte_ipv4_frag_reassemble_packet(tbl, dr, m, tms, ip);
+	} else {
+		struct rte_ipv6_hdr *ip = rte_pktmbuf_mtod(m, struct rte_ipv6_hdr *);
+		struct rte_ipv6_fragment_ext *fh =
+			rte_pktmbuf_mtod_offset(m, struct rte_ipv6_fragment_ext *,
+						sizeof(struct rte_ipv6_hdr));
+		return rte_ipv6_frag_reassemble_packet(tbl, dr, m, tms, ip, fh);
+	}
+}
+
+/* Split a datagram of total_plen into fragments of frag_size (multiple of 8).
+ * Returns the fragment count, or -1 if it would exceed MAX_FRAG.
+ */
+static int
+make_datagram(uint16_t total_plen, uint16_t frag_size, struct frag_desc *out)
+{
+	int n = 0;
+	uint16_t ofs = 0;
+
+	while (ofs < total_plen) {
+		uint16_t rem = total_plen - ofs;
+		uint16_t len = rem <= frag_size ? rem : frag_size;
+
+		if (n >= MAX_FRAG)
+			return -1;
+		out[n].ofs = ofs;
+		out[n].plen = len;
+		out[n].mf = (ofs + len < total_plen);
+		ofs += len;
+		n++;
+	}
+	return n;
+}
+
+/* Produce a delivery order (array of indices into descs). */
+static void
+make_order(enum order ord, int n, int *idx)
+{
+	int i, k = 0;
+
+	switch (ord) {
+	case IN_ORDER:
+		for (i = 0; i < n; i++)
+			idx[i] = i;
+		break;
+	case REVERSE:
+		for (i = 0; i < n; i++)
+			idx[i] = n - 1 - i;
+		break;
+	case ODD_EVEN:
+		for (i = 1; i < n; i += 2)
+			idx[k++] = i;
+		for (i = 0; i < n; i += 2)
+			idx[k++] = i;
+		break;
+	case BLOCK: {
+		int t = n / 3 ? n / 3 : 1;
+
+		for (i = 2 * t; i < n; i++)
+			idx[k++] = i;
+		for (i = t; i < 2 * t && i < n; i++)
+			idx[k++] = i;
+		for (i = 0; i < t && i < n; i++)
+			idx[k++] = i;
+		break;
+	}
+	}
+}
+
+/* Feed descs in the given order; return reassembled mbuf or NULL. */
+static struct rte_mbuf *
+run_ordered(enum family fam, const struct frag_desc *descs, int n,
+	    const int *idx)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	struct rte_mbuf *out = NULL;
+	uint64_t tms = rte_rdtsc();
+	int i;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(rte_get_tsc_hz());
+	if (tbl == NULL)
+		return NULL;
+	for (i = 0; i < n; i++) {
+		struct rte_mbuf *r = feed(fam, tbl, &dr, &descs[idx[i]], tms);
+
+		if (r != NULL)
+			out = r;
+	}
+	rte_ip_frag_free_death_row(&dr, 0);
+	rte_ip_frag_table_destroy(tbl);
+	return out;
+}
+
+/* Validate length and byte-exact payload, then free. Returns 0 on success.
+ * Note: reassembly strips the IPv6 fragment header, so the reassembled v6
+ * header is sizeof(rte_ipv6_hdr), not the V6_L3_LEN the fragments were built
+ * with. v4 has no fragment header to remove.
+ */
+static int
+validate(struct rte_mbuf *m, enum family fam, uint16_t total_plen)
+{
+	uint16_t l3 = (fam == V4) ? V4_L3_LEN :
+				    (uint16_t)sizeof(struct rte_ipv6_hdr);
+	uint8_t buf[MAX_PAYLOAD];
+	const uint8_t *p;
+	const char *reason;
+	uint16_t k;
+	int rc = 0;
+
+	if (m == NULL)
+		return -1;
+	if (rte_mbuf_check(m, 1, &reason) != 0) {
+		printf("  bad mbuf fam=%d total=%u: %s\n", fam, total_plen,
+		       reason);
+		rte_pktmbuf_free(m);
+		return -1;
+	}
+	if (m->pkt_len != (uint32_t)(l3 + total_plen)) {
+		rte_pktmbuf_free(m);
+		return -1;
+	}
+	p = rte_pktmbuf_read(m, l3, total_plen, buf);
+	if (p == NULL) {
+		rte_pktmbuf_free(m);
+		return -1;
+	}
+	for (k = 0; k < total_plen; k++) {
+		if (p[k] != pat(k)) {
+			rc = -1;
+			break;
+		}
+	}
+	rte_pktmbuf_free(m);
+	return rc;
+}
+
+/* --------------------------- baseline / sweep --------------------------- */
+
+static int
+sweep_one(enum family fam, uint16_t total_plen, uint16_t frag_size)
+{
+	struct frag_desc descs[MAX_FRAG];
+	int idx[MAX_FRAG];
+	const enum order orders[] = { IN_ORDER, REVERSE, ODD_EVEN, BLOCK };
+	int n = make_datagram(total_plen, frag_size, descs);
+	unsigned int o;
+
+	if (n < 2)		/* skip single-fragment / oversized for sweep */
+		return 0;
+
+	for (o = 0; o < RTE_DIM(orders); o++) {
+		make_order(orders[o], n, idx);
+		if (validate(run_ordered(fam, descs, n, idx), fam,
+			     total_plen) != 0) {
+			printf("  sweep fail: fam=%d total=%u fs=%u order=%u n=%d\n",
+			       fam, total_plen, frag_size, orders[o], n);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int
+sweep(enum family fam)
+{
+	const uint16_t fsizes[] = { 8, 16, 64, 256 };
+	unsigned int f;
+
+	for (f = 0; f < RTE_DIM(fsizes); f++) {
+		uint16_t fs = fsizes[f];
+		uint16_t total;
+
+		/* cover 2..MAX_FRAG fragments, last fragment partial */
+		for (total = fs + 8; total <= fs * MAX_FRAG; total += fs) {
+			if (sweep_one(fam, total, fs) != 0)
+				return TEST_FAILED;
+			if (total > fs + 4 &&
+			    sweep_one(fam, total - 4, fs) != 0)
+				return TEST_FAILED;
+		}
+	}
+	return TEST_SUCCESS;
+}
+
+static int test_sweep_v4(void) { return sweep(V4); }
+static int test_sweep_v6(void) { return sweep(V6); }
+
+/* Minimum 8-byte fragments. */
+static int
+test_min_fragment(void)
+{
+	struct frag_desc d[3] = {
+		{ 0, 8, 1 }, { 8, 8, 1 }, { 16, 8, 0 },
+	};
+	int idx[3];
+
+	make_order(REVERSE, 3, idx);
+	TEST_ASSERT_SUCCESS(validate(run_ordered(V4, d, 3, idx), V4, 24),
+				"min 8-byte fragments not reassembled");
+	make_order(ODD_EVEN, 3, idx);
+	TEST_ASSERT_SUCCESS(validate(run_ordered(V6, d, 3, idx), V6, 24),
+				"min 8-byte fragments not reassembled (v6)");
+	return TEST_SUCCESS;
+}
+
+/* Exactly MAX_FRAG fragments reassembles; MAX_FRAG + 1 fails. */
+static int
+test_cap_boundary(void)
+{
+	struct frag_desc d[MAX_FRAG + 1];
+	int idx[MAX_FRAG + 1];
+	uint16_t fs = 8, total = fs * MAX_FRAG;
+	int n, i;
+
+	n = make_datagram(total, fs, d);
+	TEST_ASSERT_EQUAL(n, MAX_FRAG, "expected MAX_FRAG fragments");
+	make_order(IN_ORDER, n, idx);
+	TEST_ASSERT_SUCCESS(validate(run_ordered(V4, d, n, idx), V4, total),
+				"MAX_FRAG fragments should reassemble");
+
+	/* one more fragment than the table can hold */
+	for (i = 0; i <= MAX_FRAG; i++) {
+		d[i].ofs = i * fs;
+		d[i].plen = fs;
+		d[i].mf = (i < MAX_FRAG);
+		idx[i] = i;
+	}
+	TEST_ASSERT_NULL(run_ordered(V4, d, MAX_FRAG + 1, idx),
+			     "MAX_FRAG + 1 fragments should not reassemble");
+	TEST_ASSERT_EQUAL(rte_mempool_avail_count(pkt_pool), NB_MBUF,
+			      "overflowing set leaked mbufs");
+	return TEST_SUCCESS;
+}
+
+/* Incomplete datagram: no output, reaped on timeout. */
+static int
+test_incomplete_timeout(void)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	uint64_t mc = rte_get_tsc_hz(), tms = rte_rdtsc();
+	struct frag_desc d[2] = { { 0, 64, 1 }, { 128, 64, 0 } }; /* gap */
+	struct rte_mbuf *out = NULL;
+	int i;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(mc);
+	TEST_ASSERT_NOT_NULL(tbl, "table create failed");
+	for (i = 0; i < 2; i++) {
+		struct rte_mbuf *r = feed(V4, tbl, &dr, &d[i], tms);
+
+		if (r != NULL)
+			out = r;
+	}
+	TEST_ASSERT_NULL(out, "incomplete datagram reassembled");
+	rte_ip_frag_table_del_expired_entries(tbl, &dr, tms + mc + 1);
+	rte_ip_frag_free_death_row(&dr, 0);
+	rte_ip_frag_table_destroy(tbl);
+	TEST_ASSERT_EQUAL(rte_mempool_avail_count(pkt_pool), NB_MBUF,
+			      "expired fragments not freed");
+	return TEST_SUCCESS;
+}
+
+static int
+test_zero_len(void)
+{
+	struct frag_desc d = { 0, 0, 1 };
+	int idx = 0;
+
+	TEST_ASSERT_NULL(run_ordered(V4, &d, 1, &idx),
+			     "zero-length fragment accepted");
+	TEST_ASSERT_EQUAL(rte_mempool_avail_count(pkt_pool), NB_MBUF,
+			      "zero-length fragment leaked");
+	return TEST_SUCCESS;
+}
+
+/* --------------------- duplicate / overlap / reject --------------------- */
+
+/* A duplicate anywhere in a reordered set must not break reassembly. */
+static int
+test_dup_tolerated(void)
+{
+	/* offsets 0,64,128,192 with 64B frags; inject a dup of frag 1 */
+	struct frag_desc d[5] = {
+		{   0, 64, 1 }, {  64, 64, 1 }, {  64, 64, 1 }, /* dup */
+		{ 128, 64, 1 }, { 192, 64, 0 },
+	};
+	int idx[5] = { 1, 4, 2, 0, 3 };	/* reordered, dup interleaved */
+
+	TEST_ASSERT_SUCCESS(validate(run_ordered(V4, d, 5, idx), V4, 256),
+				"duplicate fragment broke reassembly");
+	return TEST_SUCCESS;
+}
+
+/* Overlap geometries; the datagram must be discarded and every collected
+ * fragment freed. The last fragment is withheld so that on unfixed code the
+ * entry is *retained* (total_size stays UINT32_MAX) rather than torn down by
+ * the frag_size > total_size path: that retention is what we detect. We
+ * capture the mbufs still held in the table after draining the death row,
+ * before destroying the table (destroy frees held mbufs, hiding the leak).
+ */
+static int
+overlap_case(enum family fam, const struct frag_desc *d, int n, const char *what)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	struct rte_mbuf *out = NULL;
+	uint64_t tms = rte_rdtsc();
+	unsigned int held;
+	int i;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(rte_get_tsc_hz());
+	if (tbl == NULL)
+		return -1;
+	for (i = 0; i < n; i++) {
+		struct rte_mbuf *r = feed(fam, tbl, &dr, &d[i], tms);
+
+		if (r != NULL)
+			out = r;
+	}
+	rte_ip_frag_free_death_row(&dr, 0);
+	held = NB_MBUF - rte_mempool_avail_count(pkt_pool);
+	rte_ip_frag_table_destroy(tbl);
+
+	if (out != NULL) {
+		rte_pktmbuf_free(out);
+		printf("  overlap reassembled instead of discarded: %s\n", what);
+		return -1;
+	}
+	if (held != 0) {
+		printf("  overlap kept %u fragment(s) instead of discarding: %s\n",
+		       held, what);
+		return -1;
+	}
+	return 0;
+}
+
+static int
+test_overlap(void)
+{
+	/* last fragment withheld in every case (all MF=1) */
+
+	/* overlapping fragment arrives second */
+	const struct frag_desc tail[2] = { { 0, 600, 1 }, { 300, 600, 1 } };
+	/* overlapping fragment arrives first */
+	const struct frag_desc head[2] = { { 300, 600, 1 }, { 0, 600, 1 } };
+	/* a fragment fully contained in an existing one */
+	const struct frag_desc cont[2] = { { 0, 600, 1 }, { 200, 200, 1 } };
+
+	TEST_ASSERT_SUCCESS(overlap_case(V6, tail, 2, "v6 overlap second"), "");
+	TEST_ASSERT_SUCCESS(overlap_case(V6, head, 2, "v6 overlap first"), "");
+	TEST_ASSERT_SUCCESS(overlap_case(V6, cont, 2, "v6 contained"), "");
+	TEST_ASSERT_SUCCESS(overlap_case(V4, tail, 2, "v4 overlap second"), "");
+	return TEST_SUCCESS;
+}
+
+/* IPv6 fragment with an unfragmentable extension header is dropped, not
+ * stored. Capture whether the fragment is still held in the table after the
+ * death row is drained but before the table is destroyed.
+ */
+static int
+test_v6_ext_header_drop(void)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	struct rte_mbuf *m, *r;
+	struct rte_ipv6_hdr *ip;
+	struct rte_ipv6_fragment_ext *fh;
+	unsigned int held;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(rte_get_tsc_hz());
+	TEST_ASSERT_NOT_NULL(tbl, "table create failed");
+
+	m = build_frag(V6, 0, 64, 1);
+	TEST_ASSERT_NOT_NULL(m, "alloc failed");
+	/* pretend an 8-byte extension header sits before the fragment header */
+	m->l3_len += 8;
+	ip = rte_pktmbuf_mtod(m, struct rte_ipv6_hdr *);
+	fh = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_fragment_ext *,
+				     sizeof(struct rte_ipv6_hdr));
+	r = rte_ipv6_frag_reassemble_packet(tbl, &dr, m, rte_rdtsc(), ip, fh);
+	rte_ip_frag_free_death_row(&dr, 0);
+	held = NB_MBUF - rte_mempool_avail_count(pkt_pool);
+	rte_ip_frag_table_destroy(tbl);
+
+	TEST_ASSERT_NULL(r, "fragment with extension header accepted");
+	TEST_ASSERT_EQUAL(held, 0,
+			  "extension-header fragment stored instead of dropped");
+	return TEST_SUCCESS;
+}
+
+/* A fragment whose end exceeds the max datagram size is dropped, not stored. */
+static int
+oversize_drop_one(enum family fam)
+{
+	struct rte_ip_frag_death_row dr;
+	struct rte_ip_frag_tbl *tbl;
+	struct frag_desc d = { 0xFFF8, 64, 0 };	/* offset 65528 + 64 > 65535 */
+	struct rte_mbuf *r;
+	unsigned int held;
+
+	memset(&dr, 0, sizeof(dr));
+	tbl = tbl_new(rte_get_tsc_hz());
+	if (tbl == NULL)
+		return -1;
+	r = feed(fam, tbl, &dr, &d, rte_rdtsc());
+	rte_ip_frag_free_death_row(&dr, 0);
+	held = NB_MBUF - rte_mempool_avail_count(pkt_pool);
+	rte_ip_frag_table_destroy(tbl);
+
+	if (r != NULL) {
+		rte_pktmbuf_free(r);
+		return -1;
+	}
+	return held == 0 ? 0 : -1;
+}
+
+static int
+test_oversize_drop(void)
+{
+	TEST_ASSERT_SUCCESS(oversize_drop_one(V4),
+			    "oversized v4 fragment stored instead of dropped");
+	TEST_ASSERT_SUCCESS(oversize_drop_one(V6),
+			    "oversized v6 fragment stored instead of dropped");
+	return TEST_SUCCESS;
+}
+
+static struct unit_test_suite reassembly_testsuite = {
+	.suite_name = "IP Reassembly Unit Test Suite",
+	.setup = testsuite_setup,
+	.teardown = testsuite_teardown,
+	.unit_test_cases = {
+		TEST_CASE_ST(ut_setup, NULL, test_sweep_v4),
+		TEST_CASE_ST(ut_setup, NULL, test_sweep_v6),
+		TEST_CASE_ST(ut_setup, NULL, test_min_fragment),
+		TEST_CASE_ST(ut_setup, NULL, test_cap_boundary),
+		TEST_CASE_ST(ut_setup, NULL, test_incomplete_timeout),
+		TEST_CASE_ST(ut_setup, NULL, test_zero_len),
+		TEST_CASE_ST(ut_setup, NULL, test_dup_tolerated),
+		TEST_CASE_ST(ut_setup, NULL, test_overlap),
+		TEST_CASE_ST(ut_setup, NULL, test_v6_ext_header_drop),
+		TEST_CASE_ST(ut_setup, NULL, test_oversize_drop),
+		TEST_CASES_END()
+	}
+};
+
+static int
+test_reassembly(void)
+{
+	return unit_test_suite_runner(&reassembly_testsuite);
+}
+
+REGISTER_FAST_TEST(reassembly_autotest, NOHUGE_SKIP, ASAN_OK, test_reassembly);
-- 
2.53.0


^ permalink raw reply related

* [PATCH 5/6] ip_frag: reject oversized reassembled datagrams
From: Stephen Hemminger @ 2026-06-16 21:05 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, stable, Konstantin Ananyev
In-Reply-To: <20260616210656.464062-1-stephen@networkplumber.org>

The reassembled total length of a packet must not exceed 65535.
A fragment with a high offset could drive the sum past that,
causing silent truncation since IP payload_len/total_length is 16 bits.

When reassembling a packet the total length should not be allowed
to exceed 65535. A fragment with high offset could drive the sum
past that, causing silent truncation.

A valid datagram never exceeds 65535 bytes, so reject any fragment
whose resulting length would exceed that.
Fold the test into the existing zero-length check.

Fixes: cc8f4d020c0b ("examples/ip_reassembly: initial import")
Cc: stable@dpdk.org

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 lib/ip_frag/rte_ipv4_reassembly.c | 9 +++++++--
 lib/ip_frag/rte_ipv6_reassembly.c | 9 +++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/ip_frag/rte_ipv4_reassembly.c b/lib/ip_frag/rte_ipv4_reassembly.c
index 980f7a3b77..727fc58243 100644
--- a/lib/ip_frag/rte_ipv4_reassembly.c
+++ b/lib/ip_frag/rte_ipv4_reassembly.c
@@ -136,8 +136,13 @@ rte_ipv4_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl,
 		tbl, tbl->max_cycles, tbl->entry_mask, tbl->max_entries,
 		tbl->use_entries);
 
-	/* check that fragment length is greater then zero. */
-	if (ip_len <= 0) {
+	/*
+	 * Drop fragments with no payload, and any fragment whose end would
+	 * make the reassembled datagram exceed the maximum IPv4 size. The
+	 * total_length field is 16 bits, so otherwise it is silently
+	 * truncated while the mbuf still holds the full length.
+	 */
+	if (ip_len <= 0 || ip_ofs + ip_len + mb->l3_len > UINT16_MAX) {
 		IP_FRAG_MBUF2DR(dr, mb);
 		return NULL;
 	}
diff --git a/lib/ip_frag/rte_ipv6_reassembly.c b/lib/ip_frag/rte_ipv6_reassembly.c
index 7c1659002b..0b44275b37 100644
--- a/lib/ip_frag/rte_ipv6_reassembly.c
+++ b/lib/ip_frag/rte_ipv6_reassembly.c
@@ -174,8 +174,13 @@ rte_ipv6_frag_reassemble_packet(struct rte_ip_frag_tbl *tbl,
 		tbl, tbl->max_cycles, tbl->entry_mask, tbl->max_entries,
 		tbl->use_entries);
 
-	/* check that fragment length is greater then zero. */
-	if (ip_len <= 0) {
+	/*
+	 * Drop fragments with no payload, and any fragment whose end would
+	 * make the reassembled payload exceed 65535 bytes. The payload_len
+	 * field is 16 bits, so otherwise it is silently truncated while the
+	 * mbuf still holds the full length.
+	 */
+	if (ip_len <= 0 || ip_ofs + ip_len > UINT16_MAX) {
 		IP_FRAG_MBUF2DR(dr, mb);
 		return NULL;
 	}
-- 
2.53.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox