DPDK-dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3] net/iavf: fix to consolidate link change event handling
From: Anurag Mandal @ 2026-06-10 14:12 UTC (permalink / raw)
  To: dev
  Cc: bruce.richardson, vladimir.medvedkin, ciara.loftus, Anurag Mandal,
	stable
In-Reply-To: <20260609001022.357509-1-anurag.mandal@intel.com>

Handle link-change events through a common static function that
reads the correct advanced or legacy link fields and updates
no-poll/watchdog/LSC state consistently.

Fixes: 5e03e316c753 ("net/iavf: handle virtchnl event message without interrupt")
Fixes: 48de41ca11f0 ("net/avf: enable link status update")
Cc: stable@dpdk.org

Signed-off-by: Anurag Mandal <anurag.mandal@intel.com>
---
V3: Addressed Ciara Loftus's review comments
 - removed two unnecessary NULL checks
V2: Addressed Ciara Loftus's review comments
 - removed unnecessary NULL checks which were overly defensive checks

 drivers/net/intel/iavf/iavf_vchnl.c | 115 +++++++++++++++-------------
 1 file changed, 63 insertions(+), 52 deletions(-)

diff --git a/drivers/net/intel/iavf/iavf_vchnl.c b/drivers/net/intel/iavf/iavf_vchnl.c
index 0643a835d5..36b7ee9526 100644
--- a/drivers/net/intel/iavf/iavf_vchnl.c
+++ b/drivers/net/intel/iavf/iavf_vchnl.c
@@ -216,6 +216,67 @@ iavf_convert_link_speed(enum virtchnl_link_speed virt_link_speed)
 	return speed;
 }
 
+/*
+ * iavf_handle_link_change_event: common handler for VIRTCHNL link change events
+ *
+ * @dev: pointer to rte_eth_dev for this VF
+ * @vpe: pointer to the virtchnl_pf_event payload received from the PF
+ *
+ * Handle PF link-change event: decode adv/legacy link info, update VF
+ * link state, sync no-poll/watchdog behavior & notify app via LSC event.
+ */
+static void
+iavf_handle_link_change_event(struct rte_eth_dev *dev,
+			      struct virtchnl_pf_event *vpe)
+{
+	struct iavf_adapter *adapter =
+		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
+	struct iavf_info *vf = &adapter->vf;
+	bool adv_link_speed;
+
+	adv_link_speed = (vf->vf_res != NULL) &&
+		((vf->vf_res->vf_cap_flags & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) != 0);
+
+	if (adv_link_speed) {
+		vf->link_up = vpe->event_data.link_event_adv.link_status;
+		vf->link_speed = vpe->event_data.link_event_adv.link_speed;
+	} else {
+		enum virtchnl_link_speed speed;
+
+		vf->link_up = vpe->event_data.link_event.link_status;
+		speed = vpe->event_data.link_event.link_speed;
+		vf->link_speed = iavf_convert_link_speed(speed);
+	}
+
+	iavf_dev_link_update(dev, 0);
+
+	/*
+	 * Update watchdog/no_poll state BEFORE notifying the application via
+	 * the LSC event. Otherwise the application's link-up callback could
+	 * race with stale (link-down) no_poll/watchdog state and either
+	 * continue to drop traffic or trigger a spurious reset detection.
+	 *
+	 * Keeping the watchdog enabled whenever the link cannot be trusted
+	 * (link is down or a VF reset is in progress); the watchdog drives
+	 * auto-reset recovery, so it must remain armed in those cases.
+	 */
+	if (vf->link_up && !vf->vf_reset)
+		iavf_dev_watchdog_disable(adapter);
+	else
+		iavf_dev_watchdog_enable(adapter);
+
+	if (adapter->devargs.no_poll_on_link_down) {
+		iavf_set_no_poll(adapter, true);
+		PMD_DRV_LOG(DEBUG, "VF no poll turned %s",
+			    adapter->no_poll ? "on" : "off");
+	}
+
+	iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
+
+	PMD_DRV_LOG(INFO, "Link status update:%s",
+		vf->link_up ? "up" : "down");
+}
+
 /* Read data in admin queue to get msg from pf driver */
 static enum iavf_aq_result
 iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
@@ -253,34 +314,7 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 		result = IAVF_MSG_SYS;
 		switch (vpe->event) {
 		case VIRTCHNL_EVENT_LINK_CHANGE:
-			vf->link_up =
-				vpe->event_data.link_event.link_status;
-			if (vf->vf_res != NULL &&
-			    vf->vf_res->vf_cap_flags & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) {
-				vf->link_speed =
-				    vpe->event_data.link_event_adv.link_speed;
-			} else {
-				enum virtchnl_link_speed speed;
-				speed = vpe->event_data.link_event.link_speed;
-				vf->link_speed = iavf_convert_link_speed(speed);
-			}
-			iavf_dev_link_update(vf->eth_dev, 0);
-			iavf_dev_event_post(vf->eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
-			if (vf->link_up && !vf->vf_reset) {
-				iavf_dev_watchdog_disable(adapter);
-			} else {
-				if (!vf->link_up)
-					iavf_dev_watchdog_enable(adapter);
-			}
-			if (adapter->devargs.no_poll_on_link_down) {
-				iavf_set_no_poll(adapter, true);
-				if (adapter->no_poll)
-					PMD_DRV_LOG(DEBUG, "VF no poll turned on");
-				else
-					PMD_DRV_LOG(DEBUG, "VF no poll turned off");
-			}
-			PMD_DRV_LOG(INFO, "Link status update:%s",
-					vf->link_up ? "up" : "down");
+			iavf_handle_link_change_event(vf->eth_dev, vpe);
 			break;
 		case VIRTCHNL_EVENT_RESET_IMPENDING:
 			vf->vf_reset = true;
@@ -525,30 +559,7 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
 		break;
 	case VIRTCHNL_EVENT_LINK_CHANGE:
 		PMD_DRV_LOG(DEBUG, "VIRTCHNL_EVENT_LINK_CHANGE event");
-		vf->link_up = pf_msg->event_data.link_event.link_status;
-		if (vf->vf_res->vf_cap_flags & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) {
-			vf->link_speed =
-				pf_msg->event_data.link_event_adv.link_speed;
-		} else {
-			enum virtchnl_link_speed speed;
-			speed = pf_msg->event_data.link_event.link_speed;
-			vf->link_speed = iavf_convert_link_speed(speed);
-		}
-		iavf_dev_link_update(dev, 0);
-		if (vf->link_up && !vf->vf_reset) {
-			iavf_dev_watchdog_disable(adapter);
-		} else {
-			if (!vf->link_up)
-				iavf_dev_watchdog_enable(adapter);
-		}
-		if (adapter->devargs.no_poll_on_link_down) {
-			iavf_set_no_poll(adapter, true);
-			if (adapter->no_poll)
-				PMD_DRV_LOG(DEBUG, "VF no poll turned on");
-			else
-				PMD_DRV_LOG(DEBUG, "VF no poll turned off");
-		}
-		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
+		iavf_handle_link_change_event(dev, pf_msg);
 		break;
 	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
 		PMD_DRV_LOG(DEBUG, "VIRTCHNL_EVENT_PF_DRIVER_CLOSE event");
-- 
2.34.1


^ permalink raw reply related

* RE: [PATCH v2] net/iavf: fix to consolidate link change event handling
From: Mandal, Anurag @ 2026-06-10 14:13 UTC (permalink / raw)
  To: Loftus, Ciara, dev@dpdk.org
  Cc: Richardson, Bruce, Medvedkin, Vladimir, stable@dpdk.org
In-Reply-To: <IA4PR11MB92788A795E02D08A3287C3628E1A2@IA4PR11MB9278.namprd11.prod.outlook.com>

> -----Original Message-----
> From: Loftus, Ciara <ciara.loftus@intel.com>
> Sent: 10 June 2026 15:13
> To: Mandal, Anurag <anurag.mandal@intel.com>; dev@dpdk.org
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Medvedkin, Vladimir
> <vladimir.medvedkin@intel.com>; stable@dpdk.org
> Subject: RE: [PATCH v2] net/iavf: fix to consolidate link change event handling
> 
> > Subject: [PATCH v2] net/iavf: fix to consolidate link change event
> > handling
> >
> 
> [snip]
> 
> > +
> >  /* Read data in admin queue to get msg from pf driver */  static enum
> > iavf_aq_result  iavf_read_msg_from_pf(struct iavf_adapter *adapter,
> > uint16_t buf_len, @@ -249,38 +310,15 @@ iavf_read_msg_from_pf(struct
> > iavf_adapter *adapter, uint16_t buf_len,
> >  	if (opcode == VIRTCHNL_OP_EVENT) {
> >  		struct virtchnl_pf_event *vpe =
> >  			(struct virtchnl_pf_event *)event.msg_buf;
> > +		if (vpe == NULL) {
> > +			PMD_DRV_LOG(ERR, "Invalid PF event message");
> > +			return IAVF_MSG_ERR;
> > +		}
> 
> This check can be removed.
> iavf_read_msg_from_pf is called from iavf_wait_for_msg which performs a
> NULL check on the same location (args->out_buffer) before passing it to
> iavf_read_msg_from_pf
> 
> >
> >  		result = IAVF_MSG_SYS;
> >  		switch (vpe->event) {
> >  		case VIRTCHNL_EVENT_LINK_CHANGE:
> > -			vf->link_up =
> > -				vpe->event_data.link_event.link_status;
> > -			if (vf->vf_res != NULL &&
> > -			    vf->vf_res->vf_cap_flags &
> > VIRTCHNL_VF_CAP_ADV_LINK_SPEED) {
> > -				vf->link_speed =
> > -				    vpe-
> > >event_data.link_event_adv.link_speed;
> > -			} else {
> > -				enum virtchnl_link_speed speed;
> > -				speed = vpe-
> > >event_data.link_event.link_speed;
> > -				vf->link_speed =
> > iavf_convert_link_speed(speed);
> > -			}
> > -			iavf_dev_link_update(vf->eth_dev, 0);
> > -			iavf_dev_event_post(vf->eth_dev,
> > RTE_ETH_EVENT_INTR_LSC, NULL, 0);
> > -			if (vf->link_up && !vf->vf_reset) {
> > -				iavf_dev_watchdog_disable(adapter);
> > -			} else {
> > -				if (!vf->link_up)
> > -					iavf_dev_watchdog_enable(adapter);
> > -			}
> > -			if (adapter->devargs.no_poll_on_link_down) {
> > -				iavf_set_no_poll(adapter, true);
> > -				if (adapter->no_poll)
> > -					PMD_DRV_LOG(DEBUG, "VF no poll
> > turned on");
> > -				else
> > -					PMD_DRV_LOG(DEBUG, "VF no poll
> > turned off");
> > -			}
> > -			PMD_DRV_LOG(INFO, "Link status update:%s",
> > -					vf->link_up ? "up" : "down");
> > +			iavf_handle_link_change_event(vf->eth_dev, vpe);
> >  			break;
> >  		case VIRTCHNL_EVENT_RESET_IMPENDING:
> >  			vf->vf_reset = true;
> > @@ -505,6 +543,12 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev,
> > uint8_t *msg,
> >  		PMD_DRV_LOG(DEBUG, "Error event");
> >  		return;
> >  	}
> > +
> > +	if (pf_msg == NULL) {
> > +		PMD_DRV_LOG(ERR, "Invalid PF event message");
> > +		return;
> > +	}
> 
> This too can be removed.
> pf_msg resolves to vf->aq_resp which is a fixed buffer allocated at driver init
> time. It cannot be NULL here.
> 
> With those two changes I think the patch will be good to go.
> 
> > +
> >  	switch (pf_msg->event) {
> >  	case VIRTCHNL_EVENT_RESET_IMPENDING:
> >  		PMD_DRV_LOG(DEBUG,
> > "VIRTCHNL_EVENT_RESET_IMPENDING event"); @@ -518,30 +562,7 @@
> > iavf_handle_pf_event_msg(struct rte_eth_dev *dev, uint8_t *msg,
> >  		break;
> >  	case VIRTCHNL_EVENT_LINK_CHANGE:
> >  		PMD_DRV_LOG(DEBUG, "VIRTCHNL_EVENT_LINK_CHANGE
> event");
> > -		vf->link_up = pf_msg->event_data.link_event.link_status;
> > -		if (vf->vf_res->vf_cap_flags &
> > VIRTCHNL_VF_CAP_ADV_LINK_SPEED) {
> > -			vf->link_speed =
> > -				pf_msg-
> > >event_data.link_event_adv.link_speed;
> > -		} else {
> > -			enum virtchnl_link_speed speed;
> > -			speed = pf_msg->event_data.link_event.link_speed;
> > -			vf->link_speed = iavf_convert_link_speed(speed);
> > -		}
> > -		iavf_dev_link_update(dev, 0);
> > -		if (vf->link_up && !vf->vf_reset) {
> > -			iavf_dev_watchdog_disable(adapter);
> > -		} else {
> > -			if (!vf->link_up)
> > -				iavf_dev_watchdog_enable(adapter);
> > -		}
> > -		if (adapter->devargs.no_poll_on_link_down) {
> > -			iavf_set_no_poll(adapter, true);
> > -			if (adapter->no_poll)
> > -				PMD_DRV_LOG(DEBUG, "VF no poll turned
> > on");
> > -			else
> > -				PMD_DRV_LOG(DEBUG, "VF no poll turned
> > off");
> > -		}
> > -		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
> > 0);
> > +		iavf_handle_link_change_event(dev, pf_msg);
> >  		break;
> >  	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
> >  		PMD_DRV_LOG(DEBUG,
> > "VIRTCHNL_EVENT_PF_DRIVER_CLOSE event");
> > --
> > 2.34.1

Hi Ciara,

Thank you for the review. Removed those two unwarranted NULL checks.
Sent v3. Please check.

Thanks, 
Anurag


^ permalink raw reply

* Re: [PATCH] bus/fslmc: fix shadowed variables in queue storage macros
From: Thomas Monjalon @ 2026-06-10 14:15 UTC (permalink / raw)
  To: Weijun Pan, Hemant Agrawal, Weijun Pan
  Cc: dev, Sachin Saxena, Jun Yang, stable, Stephen Hemminger
In-Reply-To: <20260407070940.225de771@phoenix.local>

07/04/2026 16:09, Stephen Hemminger:
> Why are these not inline functions?
> Macros with lower case names are a likely place for confusion like this.

Hemant, Sachin, Weijun, please could you consider this comment?



^ permalink raw reply

* Re: [PATCH] fib6: fix error code propagation on next hop update
From: Thomas Monjalon @ 2026-06-10 14:26 UTC (permalink / raw)
  To: Vladimir Medvedkin; +Cc: dev, stable
In-Reply-To: <20260605164757.927661-1-vladimir.medvedkin@intel.com>

05/06/2026 18:47, Vladimir Medvedkin:
> When updating the next hop of an existing prefix, trie_modify() ignored
> the return value of modify_dp() and always returned 0.  An out-of-range
> next hop is rejected by modify_dp() with -EINVAL but was reported to
> the caller as success. Return the actual result.
> 
> Fixes: c3e12e0f0354 ("fib: add dataplane algorithm for IPv6")
> Cc: stable@dpdk.org
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>

Applied, thanks.



^ permalink raw reply

* RE: [PATCH v3] net/iavf: fix to consolidate link change event handling
From: Loftus, Ciara @ 2026-06-10 14:34 UTC (permalink / raw)
  To: Mandal, Anurag, dev@dpdk.org
  Cc: Richardson, Bruce, Medvedkin, Vladimir, stable@dpdk.org
In-Reply-To: <20260610141230.369232-1-anurag.mandal@intel.com>

> Subject: [PATCH v3] net/iavf: fix to consolidate link change event handling
> 
> Handled link-change events through a common static function that
> reads the correct advanced & legacy link fields properly and
> updates no-poll/watchdog/LSC state consistently.
> 
> Fixes: 5e03e316c753 ("net/iavf: handle virtchnl event message without
> interrupt")
> Fixes: 48de41ca11f0 ("net/avf: enable link status update")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Anurag Mandal <anurag.mandal@intel.com>

Thanks Anurag.

Acked-by: Ciara Loftus <ciara.loftus@intel.com>

> ---
> V3: Addressed Ciara Loftus's review comments
>  - removed two unnecessary NULL checks
> V2: Addressed Ciara Loftus's review comments
>  - removed unnecessary NULL checks which were overly defensive checks
> 
>  drivers/net/intel/iavf/iavf_vchnl.c | 115 +++++++++++++++-------------
>  1 file changed, 63 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/net/intel/iavf/iavf_vchnl.c
> b/drivers/net/intel/iavf/iavf_vchnl.c
> index 0643a835d5..36b7ee9526 100644
> --- a/drivers/net/intel/iavf/iavf_vchnl.c
> +++ b/drivers/net/intel/iavf/iavf_vchnl.c
> @@ -216,6 +216,67 @@ iavf_convert_link_speed(enum virtchnl_link_speed
> virt_link_speed)
>  	return speed;
>  }
> 
> +/*
> + * iavf_handle_link_change_event: common handler for VIRTCHNL link
> change events
> + *
> + * @dev: pointer to rte_eth_dev for this VF
> + * @vpe: pointer to the virtchnl_pf_event payload received from the PF
> + *
> + * Handle PF link-change event: decode adv/legacy link info, update VF
> + * link state, sync no-poll/watchdog behavior & notify app via LSC event.
> + */
> +static void
> +iavf_handle_link_change_event(struct rte_eth_dev *dev,
> +			      struct virtchnl_pf_event *vpe)
> +{
> +	struct iavf_adapter *adapter =
> +		IAVF_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
> +	struct iavf_info *vf = &adapter->vf;
> +	bool adv_link_speed;
> +
> +	adv_link_speed = (vf->vf_res != NULL) &&
> +		((vf->vf_res->vf_cap_flags &
> VIRTCHNL_VF_CAP_ADV_LINK_SPEED) != 0);
> +
> +	if (adv_link_speed) {
> +		vf->link_up = vpe->event_data.link_event_adv.link_status;
> +		vf->link_speed = vpe->event_data.link_event_adv.link_speed;
> +	} else {
> +		enum virtchnl_link_speed speed;
> +
> +		vf->link_up = vpe->event_data.link_event.link_status;
> +		speed = vpe->event_data.link_event.link_speed;
> +		vf->link_speed = iavf_convert_link_speed(speed);
> +	}
> +
> +	iavf_dev_link_update(dev, 0);
> +
> +	/*
> +	 * Update watchdog/no_poll state BEFORE notifying the application
> via
> +	 * the LSC event. Otherwise the application's link-up callback could
> +	 * race with stale (link-down) no_poll/watchdog state and either
> +	 * continue to drop traffic or trigger a spurious reset detection.
> +	 *
> +	 * Keeping the watchdog enabled whenever the link cannot be trusted
> +	 * (link is down or a VF reset is in progress); the watchdog drives
> +	 * auto-reset recovery, so it must remain armed in those cases.
> +	 */
> +	if (vf->link_up && !vf->vf_reset)
> +		iavf_dev_watchdog_disable(adapter);
> +	else
> +		iavf_dev_watchdog_enable(adapter);
> +
> +	if (adapter->devargs.no_poll_on_link_down) {
> +		iavf_set_no_poll(adapter, true);
> +		PMD_DRV_LOG(DEBUG, "VF no poll turned %s",
> +			    adapter->no_poll ? "on" : "off");
> +	}
> +
> +	iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL, 0);
> +
> +	PMD_DRV_LOG(INFO, "Link status update:%s",
> +		vf->link_up ? "up" : "down");
> +}
> +
>  /* Read data in admin queue to get msg from pf driver */
>  static enum iavf_aq_result
>  iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
> @@ -253,34 +314,7 @@ iavf_read_msg_from_pf(struct iavf_adapter
> *adapter, uint16_t buf_len,
>  		result = IAVF_MSG_SYS;
>  		switch (vpe->event) {
>  		case VIRTCHNL_EVENT_LINK_CHANGE:
> -			vf->link_up =
> -				vpe->event_data.link_event.link_status;
> -			if (vf->vf_res != NULL &&
> -			    vf->vf_res->vf_cap_flags &
> VIRTCHNL_VF_CAP_ADV_LINK_SPEED) {
> -				vf->link_speed =
> -				    vpe-
> >event_data.link_event_adv.link_speed;
> -			} else {
> -				enum virtchnl_link_speed speed;
> -				speed = vpe-
> >event_data.link_event.link_speed;
> -				vf->link_speed =
> iavf_convert_link_speed(speed);
> -			}
> -			iavf_dev_link_update(vf->eth_dev, 0);
> -			iavf_dev_event_post(vf->eth_dev,
> RTE_ETH_EVENT_INTR_LSC, NULL, 0);
> -			if (vf->link_up && !vf->vf_reset) {
> -				iavf_dev_watchdog_disable(adapter);
> -			} else {
> -				if (!vf->link_up)
> -					iavf_dev_watchdog_enable(adapter);
> -			}
> -			if (adapter->devargs.no_poll_on_link_down) {
> -				iavf_set_no_poll(adapter, true);
> -				if (adapter->no_poll)
> -					PMD_DRV_LOG(DEBUG, "VF no poll
> turned on");
> -				else
> -					PMD_DRV_LOG(DEBUG, "VF no poll
> turned off");
> -			}
> -			PMD_DRV_LOG(INFO, "Link status update:%s",
> -					vf->link_up ? "up" : "down");
> +			iavf_handle_link_change_event(vf->eth_dev, vpe);
>  			break;
>  		case VIRTCHNL_EVENT_RESET_IMPENDING:
>  			vf->vf_reset = true;
> @@ -525,30 +559,7 @@ iavf_handle_pf_event_msg(struct rte_eth_dev *dev,
> uint8_t *msg,
>  		break;
>  	case VIRTCHNL_EVENT_LINK_CHANGE:
>  		PMD_DRV_LOG(DEBUG, "VIRTCHNL_EVENT_LINK_CHANGE
> event");
> -		vf->link_up = pf_msg->event_data.link_event.link_status;
> -		if (vf->vf_res->vf_cap_flags &
> VIRTCHNL_VF_CAP_ADV_LINK_SPEED) {
> -			vf->link_speed =
> -				pf_msg-
> >event_data.link_event_adv.link_speed;
> -		} else {
> -			enum virtchnl_link_speed speed;
> -			speed = pf_msg->event_data.link_event.link_speed;
> -			vf->link_speed = iavf_convert_link_speed(speed);
> -		}
> -		iavf_dev_link_update(dev, 0);
> -		if (vf->link_up && !vf->vf_reset) {
> -			iavf_dev_watchdog_disable(adapter);
> -		} else {
> -			if (!vf->link_up)
> -				iavf_dev_watchdog_enable(adapter);
> -		}
> -		if (adapter->devargs.no_poll_on_link_down) {
> -			iavf_set_no_poll(adapter, true);
> -			if (adapter->no_poll)
> -				PMD_DRV_LOG(DEBUG, "VF no poll turned
> on");
> -			else
> -				PMD_DRV_LOG(DEBUG, "VF no poll turned
> off");
> -		}
> -		iavf_dev_event_post(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
> 0);
> +		iavf_handle_link_change_event(dev, pf_msg);
>  		break;
>  	case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
>  		PMD_DRV_LOG(DEBUG,
> "VIRTCHNL_EVENT_PF_DRIVER_CLOSE event");
> --
> 2.34.1


^ permalink raw reply

* Re: [PATCH v4 02/11] bpf: introduce extensible load API
From: Thomas Monjalon @ 2026-06-10 14:46 UTC (permalink / raw)
  To: Marat Khalili; +Cc: Konstantin Ananyev, Wathsala Vithanage, dev
In-Reply-To: <20260520124922.42445-3-marat.khalili@huawei.com>

20/05/2026 14:49, Marat Khalili:
> Introduce new BPF load parameters struct rte_bpf_prm_ex that can be
> extended without breaking backward or forward compatibility. Introduce
> new function rte_bpf_load_ex consolidating in one code path loading from
> both ELF file and raw memory image, with possibility to add more options
> in the future.

Unfortunately, compilation is failing on this patch:

lib/bpf/bpf_load.c:274:40: error: 'struct rte_bpf_jit' has no member named 'raw'




^ permalink raw reply

* Re: [PATCH v2 2/3] ring: use GCC builtin as alternative to rte_atomic32
From: Thomas Monjalon @ 2026-06-10 15:41 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Konstantin Ananyev, Wathsala Vithanage
In-Reply-To: <20260604163656.1226902-3-stephen@networkplumber.org>

04/06/2026 18:32, Stephen Hemminger:
> This patch replaces use of the deprecated rte_atomic32 code with
> GCC builtin atomic operations.

It compiles fine with GCC, but there is an issue with clang:

ninja: Entering directory `build-gcc-static'
ninja: no work to do.
ninja: Entering directory `build-gcc-shared'
ninja: no work to do.
ninja: Entering directory `build-clang-static'
ninja: no work to do.
ninja: Entering directory `build-clang-shared'
[1/3069] Compiling C object lib/librte_ring.a.p/ring_soring.c.o
rte_ring_gcc_pvt.h:43:2: error: address argument to atomic operation must be a pointer to integer or pointer ('volatile _Atomic(uint32_t) *' invalid)
   43 |         __atomic_store_n(&ht->tail, new_val, __ATOMIC_RELEASE);
      |         ^                ~~~~~~~~~


> Although it would be preferable to use C11 version on all architectures,
> there is a performance loss if we do it that way:
> 
> Measured on i9-13900H, two physical cores MP/MC bulk n=128, 10 runs:
>   with C11 builtin:           5.86 cycles/elem
>   with __sync builtin:        5.36 cycles/elem  (-9.4%)

You don't compare with the current rte_atomic functions?



^ permalink raw reply

* [PATCH v6] net/iavf: fix duplicate VF reset during PF reset recovery
From: Anurag Mandal @ 2026-06-10 15:43 UTC (permalink / raw)
  To: dev
  Cc: bruce.richardson, vladimir.medvedkin, ciara.loftus, Anurag Mandal,
	stable
In-Reply-To: <20260605202911.314359-1-anurag.mandal@intel.com>

During PF initiated reset recovery, iavf_dev_close() sends an
extra 'VIRTCHNL_OP_RESET_VF' while recovery is already in progress.
That second reset can leave PF/VF virtchnl state inconsistent and
cause 'VIRTCHNL_OP_CONFIG_VSI_QUEUES' to fail with 'ERR_PARAM' after
ToR link flap/power-cycle, leaving the VF unable to recover.
This results in connection loss.

This patch introduces a new flag 'pf_reset_in_progress', which
is set only when iavf_handle_hw_reset() is entered for a
PF-initiated reset (vf_initiated_reset is false), and
it is cleared on exit.
The aforesaid flag is used to prevent sending close-time VF
reset and related close-time virtchnl operation messages to the
AdminQ while PF-triggered reset recovery is in progress.
This is done to avoid duplicate VF reset requests while preserving
normal behavior for application-driven close or VF-initiated reinit.

Fixes: 675a104e2e94 ("net/iavf: fix abnormal disable HW interrupt")
Fixes: b34fe66ea893 ("net/iavf: delay VF reset command")
Fixes: 5e03e316c753 ("net/iavf: handle virtchnl event message without interrupt")
Cc: stable@dpdk.org

Signed-off-by: Anurag Mandal <anurag.mandal@intel.com>
---
V6: Addressed Ciara Loftus's review comments
 - changed to concise release note
 - removed unwarranted comment
 - added proper comments in two places
 - aligned commits with latest 'next-net-intel-for-next-net' branch
V5: Addressed Ciara Loftus's review comments
 - added separate flag for PF initiated reset recovery
V4: Addressed Ciara Loftus's review comments
 - split VF reset from other code changes
V3: Addressed latest ai-code-review comments
V2: Addressed ai-code-review comments

 doc/guides/rel_notes/release_26_07.rst |  2 ++
 drivers/net/intel/iavf/iavf.h          |  1 +
 drivers/net/intel/iavf/iavf_ethdev.c   | 42 +++++++++++++++++---------
 drivers/net/intel/iavf/iavf_vchnl.c    | 18 +++++++++--
 4 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index b5285af5fe..3832410363 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -121,6 +121,8 @@ New Features
 
   * Added support for transmitting LLDP packets based on mbuf packet type.
   * Implemented AVX2 context descriptor transmit paths.
+  * Fixed duplicate send of 'VIRTCHNL_OP_RESET_VF' during PF reset recovery
+    which could cause virtchnl state corruption.
 
 * **Updated NVIDIA mlx5 ethernet driver.**
 
diff --git a/drivers/net/intel/iavf/iavf.h b/drivers/net/intel/iavf/iavf.h
index 4444602a30..293adaf6c9 100644
--- a/drivers/net/intel/iavf/iavf.h
+++ b/drivers/net/intel/iavf/iavf.h
@@ -292,6 +292,7 @@ struct iavf_info {
 
 	bool in_reset_recovery;
 	bool reset_pending;
+	bool pf_reset_in_progress;
 
 	uint32_t ptp_caps;
 	rte_spinlock_t phc_time_aq_lock;
diff --git a/drivers/net/intel/iavf/iavf_ethdev.c b/drivers/net/intel/iavf/iavf_ethdev.c
index ec1ad02826..4c8a1895e4 100644
--- a/drivers/net/intel/iavf/iavf_ethdev.c
+++ b/drivers/net/intel/iavf/iavf_ethdev.c
@@ -3168,22 +3168,29 @@ iavf_dev_close(struct rte_eth_dev *dev)
 	ret = iavf_dev_stop(dev);
 
 	/*
-	 * Release redundant queue resource when close the dev
-	 * so that other vfs can re-use the queues.
+	 * Prevent sending close-time virtchnl messages to the AdminQ
+	 * during PF-initiated reset recovery.
 	 */
-	if (vf->lv_enabled) {
-		ret = iavf_request_queues(dev, IAVF_MAX_NUM_QUEUES_DFLT);
-		if (ret)
-			PMD_DRV_LOG(ERR, "Reset the num of queues failed");
+	if (!vf->pf_reset_in_progress) {
 
-		vf->max_rss_qregion = IAVF_MAX_NUM_QUEUES_DFLT;
-	}
+		/*
+		 * Release redundant queue resource when close the dev
+		 * so that other vfs can re-use the queues.
+		 */
+		if (vf->lv_enabled) {
+			ret = iavf_request_queues(dev, IAVF_MAX_NUM_QUEUES_DFLT);
+			if (ret)
+				PMD_DRV_LOG(ERR, "Reset the num of queues failed");
+			vf->max_rss_qregion = IAVF_MAX_NUM_QUEUES_DFLT;
+		}
 
-	/* Disable promiscuous mode before resetting the VF. This is to avoid
-	 * potential issues when the PF is bound to the kernel driver.
-	 */
-	if (vf->promisc_unicast_enabled || vf->promisc_multicast_enabled)
-		iavf_config_promisc(adapter, false, false);
+		/*
+		 * Disable promiscuous mode before resetting the VF. This is to avoid
+		 * potential issues when the PF is bound to the kernel driver.
+		 */
+		if (vf->promisc_unicast_enabled || vf->promisc_multicast_enabled)
+			iavf_config_promisc(adapter, false, false);
+	}
 
 	adapter->closed = true;
 
@@ -3196,7 +3203,12 @@ iavf_dev_close(struct rte_eth_dev *dev)
 	iavf_flow_flush(dev, NULL);
 	iavf_flow_uninit(adapter);
 
-	iavf_vf_reset(hw);
+	/*
+	 * Prevent sending VIRTCHNL_OP_RESET_VF during PF-initiated
+	 * reset recovery.
+	 */
+	if (!vf->pf_reset_in_progress)
+		iavf_vf_reset(hw);
 	/*
 	 * If a reset is pending, wait for the PF to disable the VF's admin
 	 * receive queue (its first reset action) before we shut it down
@@ -3380,6 +3392,7 @@ iavf_handle_hw_reset(struct rte_eth_dev *dev, bool vf_initiated_reset)
 	}
 
 	vf->in_reset_recovery = true;
+	vf->pf_reset_in_progress = !vf_initiated_reset;
 	iavf_set_no_poll(adapter, false);
 
 	/* Call the pre reset callback */
@@ -3430,6 +3443,7 @@ iavf_handle_hw_reset(struct rte_eth_dev *dev, bool vf_initiated_reset)
 		vf->post_reset_cb(dev->data->port_id, ret, vf->post_reset_cb_arg);
 
 	vf->in_reset_recovery = false;
+	vf->pf_reset_in_progress = false;
 	iavf_set_no_poll(adapter, false);
 
 	return;
diff --git a/drivers/net/intel/iavf/iavf_vchnl.c b/drivers/net/intel/iavf/iavf_vchnl.c
index 0643a835d5..08ab11ccf1 100644
--- a/drivers/net/intel/iavf/iavf_vchnl.c
+++ b/drivers/net/intel/iavf/iavf_vchnl.c
@@ -283,9 +283,21 @@ iavf_read_msg_from_pf(struct iavf_adapter *adapter, uint16_t buf_len,
 					vf->link_up ? "up" : "down");
 			break;
 		case VIRTCHNL_EVENT_RESET_IMPENDING:
-			vf->vf_reset = true;
-			iavf_set_no_poll(adapter, false);
-			PMD_DRV_LOG(INFO, "VF is resetting");
+			/*
+			 * Force link down on impending reset to drop
+			 * the cached link-up state; a fresh LSC up
+			 * event will be re-issued by the PF once the
+			 * VF is reinitialised.
+			 */
+			vf->link_up = false;
+			if (!vf->vf_reset) {
+				vf->vf_reset = true;
+				iavf_set_no_poll(adapter, false);
+				iavf_dev_event_post(vf->eth_dev,
+					RTE_ETH_EVENT_INTR_RESET,
+					NULL, 0);
+			}
+			PMD_DRV_LOG(DEBUG, "VF is resetting");
 			break;
 		case VIRTCHNL_EVENT_PF_DRIVER_CLOSE:
 			vf->dev_closed = true;
-- 
2.34.1


^ permalink raw reply related

* RE: [PATCH v5] net/iavf: fix duplicate VF reset during PF reset recovery
From: Mandal, Anurag @ 2026-06-10 15:45 UTC (permalink / raw)
  To: Loftus, Ciara, dev@dpdk.org
  Cc: Richardson, Bruce, Medvedkin, Vladimir, stable@dpdk.org
In-Reply-To: <IA4PR11MB9278C1F04D8BB54089B361E58E1A2@IA4PR11MB9278.namprd11.prod.outlook.com>

> -----Original Message-----
> From: Loftus, Ciara <ciara.loftus@intel.com>
> Sent: 10 June 2026 16:20
> To: Mandal, Anurag <anurag.mandal@intel.com>; dev@dpdk.org
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Medvedkin, Vladimir
> <vladimir.medvedkin@intel.com>; stable@dpdk.org
> Subject: RE: [PATCH v5] net/iavf: fix duplicate VF reset during PF reset recovery
> 
> > Subject: [PATCH v5] net/iavf: fix duplicate VF reset during PF reset
> > recovery
> >
> > During PF initiated reset recovery, iavf_dev_close() sends an extra
> > VIRTCHNL_OP_RESET_VF while recovery is already in progress.
> > That second reset can leave PF/VF virtchnl state inconsistent and
> > cause VIRTCHNL_OP_CONFIG_VSI_QUEUES to fail with ERR_PARAM after ToR
> > link flap/power-cycle, leaving the VF unable to recover.
> > This results in connection loss.
> >
> > This patch introduces a new flag 'pf_reset_in_progress', that is set
> > only when iavf_handle_hw_reset() is entered with vf_initiated_reset as
> > false and is cleared on exit.
> > Also, close-time VF reset and related close-time virtchnl operations
> > are skipped when PF triggered reset recovery is set.
> > This is done to avoid a duplicate VF reset, and keep normal behavior
> > for application-driven close or VF-initiated reinit.
> >
> > Fixes: 675a104e2e94 ("net/iavf: fix abnormal disable HW interrupt")
> > Fixes: b34fe66ea893 ("net/iavf: delay VF reset command")
> > Fixes: 5e03e316c753 ("net/iavf: handle virtchnl event message without
> > interrupt")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Anurag Mandal <anurag.mandal@intel.com>
> 
> Acked-by: Ciara Loftus <ciara.loftus@intel.com>
> 
> I think you may need to respin due to patch application failure.
> I have some suggestions for improving the comments/release notes that you
> could include in the next version. Code looks good to me.
> 
> > ---
> > V5: Addressed Ciara Loftus's comments
> >  - added separate flag for PF initiated reset recovery
> > V4: Addressed Ciara Loftus's comments
> >  - split VF reset from other code changes
> > V3: Addressed latest ai-code-review comments
> > V2: Addressed ai-code-review comments
> >
> >  doc/guides/rel_notes/release_26_07.rst |  3 ++
> >  drivers/net/intel/iavf/iavf.h          |  7 +++++
> >  drivers/net/intel/iavf/iavf_ethdev.c   | 40 +++++++++++++++-----------
> >  drivers/net/intel/iavf/iavf_vchnl.c    | 18 ++++++++++--
> >  4 files changed, 49 insertions(+), 19 deletions(-)
> >
> > diff --git a/doc/guides/rel_notes/release_26_07.rst
> > b/doc/guides/rel_notes/release_26_07.rst
> > index d2563ac503..f6899a78c3 100644
> > --- a/doc/guides/rel_notes/release_26_07.rst
> > +++ b/doc/guides/rel_notes/release_26_07.rst
> > @@ -95,6 +95,9 @@ New Features
> >
> >    * Added support for transmitting LLDP packets based on mbuf packet type.
> >    * Implemented AVX2 context descriptor transmit paths.
> > +  * Prevented duplicate 'VIRTCHNL_OP_RESET_VF' during a PF-initiated
> > +    reset recovery, which earlier caused virtchnl state corruption
> > +    and connection loss after a top-of-rack (ToR) link flap/power-cycle.
> 
> I think something more concise here would be better.
> eg. "Fixed duplicate send of 'VIRTCHNL_OP_RESET_VF' during PF reset recovery
> which could cause virtchnl state corruption"
> 
> >
> >  * **Updated PCAP ethernet driver.**
> >
> > diff --git a/drivers/net/intel/iavf/iavf.h
> > b/drivers/net/intel/iavf/iavf.h index 2615b6f034..67aacbe7a6 100644
> > --- a/drivers/net/intel/iavf/iavf.h
> > +++ b/drivers/net/intel/iavf/iavf.h
> > @@ -292,6 +292,13 @@ struct iavf_info {
> >
> >  	bool in_reset_recovery;
> >
> > +	/*
> > +	 * Set only while iavf_handle_hw_reset()
> > +	 * is processing a PF-initiated reset
> > +	 * (vf_initiated_reset == false).
> > +	 */
> 
> I don't think a comment is warranted here, the variable name is self-
> explanatory.
> 
> > +	bool pf_reset_in_progress;
> > +
> >  	uint32_t ptp_caps;
> >  	rte_spinlock_t phc_time_aq_lock;
> >  };
> > diff --git a/drivers/net/intel/iavf/iavf_ethdev.c
> > b/drivers/net/intel/iavf/iavf_ethdev.c
> > index a8031e23a5..2b6f4daa99 100644
> > --- a/drivers/net/intel/iavf/iavf_ethdev.c
> > +++ b/drivers/net/intel/iavf/iavf_ethdev.c
> > @@ -3166,23 +3166,27 @@ iavf_dev_close(struct rte_eth_dev *dev)
> >
> >  	ret = iavf_dev_stop(dev);
> >
> > -	/*
> > -	 * Release redundant queue resource when close the dev
> > -	 * so that other vfs can re-use the queues.
> > -	 */
> > -	if (vf->lv_enabled) {
> > -		ret = iavf_request_queues(dev,
> > IAVF_MAX_NUM_QUEUES_DFLT);
> > -		if (ret)
> > -			PMD_DRV_LOG(ERR, "Reset the num of queues
> > failed");
> > +	/* Skip RESET_VF on a PF-initiated reset */
> 
> Regarding the comment above, here we're not skipping RESET_VF rather
> preventing sending virtchnl messages to the adminq during the PF-initiated
> reset. I suggest rewording the comment to reflect that.
> 
> > +	if (!vf->pf_reset_in_progress) {
> >
> > -		vf->max_rss_qregion = IAVF_MAX_NUM_QUEUES_DFLT;
> > -	}
> > +		/*
> > +		 * Release redundant queue resource when close the dev
> > +		 * so that other vfs can re-use the queues.
> > +		 */
> > +		if (vf->lv_enabled) {
> > +			ret = iavf_request_queues(dev,
> > IAVF_MAX_NUM_QUEUES_DFLT);
> > +			if (ret)
> > +				PMD_DRV_LOG(ERR, "Reset the num of
> > queues failed");
> > +			vf->max_rss_qregion =

Hi Ciara,

Thank you for the detailed review. I have addressed all the review comments.
Sent v6. Kindly review.

Thanks,
Anurag M

^ permalink raw reply

* Re: [PATCH v5] ethdev: support inline calculating masked item value
From: Stephen Hemminger @ 2026-06-10 15:46 UTC (permalink / raw)
  To: Bing Zhao
  Cc: viacheslavo, dev, rasland, orika, dsosnowski, suanmingm, matan,
	thomas
In-Reply-To: <20260610052729.5637-1-bingz@nvidia.com>

On Wed, 10 Jun 2026 08:27:29 +0300
Bing Zhao <bingz@nvidia.com> wrote:

> In the asynchronous API definition and some drivers, the
> rte_flow_item spec value may not be calculated by the driver due to the
> reason of speed of light rule insertion rate and sometimes the input
> parameters will be copied and changed internally.
> 
> After copying, the spec and last will be protected by the keyword
> const and cannot be changed in the code itself. And also the driver
> needs some extra memory to do the calculation and extra conditions
> to understand the length of each item spec. This is not efficient.
> 
> To solve the issue and support usage of the following fix, a new OP
> was introduced to calculate the spec and last values after applying
> the mask inline.
> 
> Signed-off-by: Bing Zhao <bingz@nvidia.com>
> Acked-by: Dariusz Sosnowski <dsosnowski@nvidia.com>
> ---

Detailed AI review still sees issues here:

On Wed, 10 Jun 2026 08:27:29 +0300, Bing Zhao wrote:
> Subject: [PATCH v5] ethdev: support inline calculating masked item value
> v5: handle some items separately and add test for them

The v5 lib/ethdev/rte_flow.c and app/test/test_ethdev_api.c hunks are
identical to v4 -- the masking loop is unchanged and the test still only
covers ETH. The changelog says items are handled separately and a test was
added, but no such change is present in the diff. The v4 issue is still open.

Error: byte-wise masking corrupts embedded pointers in deep-copy item
types (RAW, FLEX, GENEVE_OPT).

In rte_flow_conv_pattern(), the mask is applied over the fixed item struct:

	size_t item_mask_size = mask ? rte_flow_conv_item_mask_size(src) : 0;
	...
	size_t mask_size = RTE_MIN(ret, item_mask_size);

	for (j = 0; j < mask_size; j++)
		c_spec[j] &= mask[j];

item_mask_size is rte_flow_desc_item[type].size, the fixed item struct size.
For RTE_FLOW_ITEM_TYPE_RAW, FLEX, and GENEVE_OPT, that fixed struct ends in an
embedded pointer that rte_flow_conv_item_spec() has just populated to point at
the deep-copied trailing data (rte_flow_item_raw.pattern,
rte_flow_item_flex.pattern, rte_flow_item_geneve_opt.data). Because the masked
range covers the whole fixed struct, the loop ANDs the bytes of that pointer
with the mask's corresponding bytes (typically a NULL mask pointer), zeroing
or garbling it.

The converted item's pattern/data pointer is clobbered while the copied
payload it should reference is left unreachable. A consumer that follows
conv->pattern then dereferences NULL or a corrupt address. Plain value items
(eth, ipv4, ...) are unaffected; only the deep-copy item types break.

Suggested fix: do not blind-mask the entire fixed struct for items that carry
an embedded pointer / desc_fn deep copy. Either skip masking when
rte_flow_desc_item[type].desc_fn != NULL, or mask only the leading plain-data
region and leave the pointer field (and trailing copied bytes) intact.

Warning: the new test validates only an ETH pattern, so the RAW/FLEX/GENEVE_OPT
path above is untested. A RAW item case would have surfaced the pointer
corruption -- and is what the v5 changelog claims to have added but did not.

Info: the Doxygen block for RTE_FLOW_CONV_OP_PATTERN_MASKED uses @p mask,
@p spec, @p last, but those are item fields, not parameters of the op; the
neighboring enum entries only document the @p src / @p dst types.

^ permalink raw reply

* Re: [PATCH v2] eal: add destructor to unregister tailq on unload
From: Stephen Hemminger @ 2026-06-10 15:57 UTC (permalink / raw)
  To: fengchengwen; +Cc: dev, stable, Bruce Richardson, Neil Horman, David Marchand
In-Reply-To: <bfa31475-848f-42af-bfb4-f796433c3073@huawei.com>

On Wed, 10 Jun 2026 09:19:42 +0800
fengchengwen <fengchengwen@huawei.com> wrote:

> >  
> > +RTE_EXPORT_SYMBOL(rte_eal_tailq_unregister)  
> 
> this should be with EXPERIMENTAL

Not possible, this is part of the EAL_REGISTER_TAILQ macro and its usage
is under the covers. So if anything were marked experimental, it would
break code that does not allow experimental APIs.

> 
> > +void
> > +rte_eal_tailq_unregister(struct rte_tailq_elem *t)
> > +{
> > +	TAILQ_REMOVE(&rte_tailq_elem_head, t, next);  
> 
> We first need to make sure it exists in the tailq, just like the TAILQ_FOREACH in rte_eal_tailq_local_register()

Ok cheap scan since not in critical path.

^ permalink raw reply

* Re: [PATCH 0/2] Pflock downgrade & stress tests for pflock/rwlock libraries
From: Stephen Hemminger @ 2026-06-10 15:59 UTC (permalink / raw)
  To: Eimear Morrissey; +Cc: dev
In-Reply-To: <20260610091147.88412-1-eimear.morrissey@huawei.com>

On Wed, 10 Jun 2026 10:11:45 +0100
Eimear Morrissey <eimear.morrissey@huawei.com> wrote:

> Add new downgrade option for pflock. Add stress tests for this &
> by extension the rest of the pflock/rwlock libraries.
> 
> Eimear Morrissey (1):
>   app/test: add stress tests for rwlock and pflock
> 
> Konstantin Ananyev (1):
>   eal/pflock: add API to downgrade from wr to rd lock
> 
>  app/test/meson.build               |   2 +
>  app/test/test_pflock_stress.c      |  76 ++++++
>  app/test/test_rwlock_stress.c      |  59 +++++
>  app/test/test_rwlock_stress_impl.h | 393 +++++++++++++++++++++++++++++
>  lib/eal/include/rte_pflock.h       |  21 ++
>  5 files changed, 551 insertions(+)
>  create mode 100644 app/test/test_pflock_stress.c
>  create mode 100644 app/test/test_rwlock_stress.c
>  create mode 100644 app/test/test_rwlock_stress_impl.h
> 

Interesting idea, lots of feedback from AI. Mostly about the test.

Patch 1/2 (eal/pflock: add API to downgrade from wr to rd lock)

Warning: new public API is not marked __rte_experimental.
New APIs (including static inline) should carry the experimental tag
for at least one release per the ABI policy:

	__rte_experimental
	static inline void
	rte_pflock_write_downgrade(rte_pflock_t *pf)

Warning: new EAL API added without a release notes entry.
Please add a note to doc/guides/rel_notes/release_26_07.rst.

Info: the hunk adds a double blank line before the #ifdef __cplusplus,
and the two atomic calls use different continuation indentation
(one tab vs two). checkpatch will complain about the blank lines.

Patch 2/2 (app/test: add stress tests for rwlock and pflock)

Error: read lock is leaked on reader error paths, hanging the test.
handle_error() with write_lock=false does not unlock, and its comment
claims the lock is "already unlocked by the calling function" -- but
handle_reader_work() calls it while still holding the read lock (both
the array-mismatch path and the counter-changed path), and the
DOWNGRADE_TEST failure path in handle_writer_work() likewise calls it
while holding the downgraded read lock. The leaked reader keeps rd.out
from ever matching, so any writer blocks forever in write_lock() and
rte_eal_wait_lcore() never returns: a detected failure becomes a hang
instead of a test failure. Simplest fix is to have callers unlock
before calling handle_error() and drop the unlock from it entirely;
that also fixes the downgrade path incrementing reader_errors for a
writer thread.

Error: stop flag uses volatile instead of atomics.
"volatile bool stop" is written by workers (handle_error) and the main
lcore and polled by all workers. volatile provides no ordering or
atomicity guarantee; use RTE_ATOMIC(bool) with
rte_atomic_load_explicit/rte_atomic_store_explicit as
test_ring_stress_impl.h does for wrk_cmd.
The volatile on counter and counter_array is unnecessary -- they are
only accessed under the lock, which already provides ordering -- and
it defeats compiler optimization of the 1024-element verify loops.

Warning: DYNAMIC_ROLES does not switch roles.
should_be_writer() is called once in lcore_function() before the loop,
so each thread's role is fixed for the whole run; the flag's stated
purpose ("Threads can switch between reader/writer roles") never
happens. Move the role decision inside the while loop when
DYNAMIC_ROLES is set.

Warning: should_be_writer() assumes the main lcore has index 0.
	unsigned int idx = rte_lcore_index(rte_lcore_id()) - 1;
With --main-lcore set to a non-lowest core, a worker can have index 0,
so idx underflows to UINT_MAX and the reader/writer split no longer
matches num_readers/num_writers. Compute the worker's position by
iterating RTE_LCORE_FOREACH_WORKER or skip the main lcore's index
explicitly.

Warning: trailing alignment attribute placement.
	} __rte_cache_aligned;
on struct lcore_stats and struct rwlock_test_shared must be written as
	struct __rte_cache_aligned lcore_stats {
checkpatch enforces this (required for MSVC).

Info: "max wait" statistic does not measure lock wait time.
acquire_time spans the entire iteration including the verify loops and
the configured reader/writer delays, so for long_hold it reports the
delay, not contention. Either time only the lock call or rename it.

Info: missing space in the summary printf string concatenation:
	"%"PRIu64" writer ops," "total time: ..."
prints "writer ops,total time". Also the first element of
pflock_specific_tests is mis-indented (opening brace at column 0).

^ permalink raw reply

* Re: [PATCH v8 0/1] net/mana: add device reset support
From: Stephen Hemminger @ 2026-06-10 16:56 UTC (permalink / raw)
  To: Wei Hu; +Cc: dev, longli, weh
In-Reply-To: <cover.1781017284.git.weh@linux.microsoft.com>

On Wed, 10 Jun 2026 00:21:21 -0700
Wei Hu <weh@linux.microsoft.com> wrote:

> From: Wei Hu <weh@microsoft.com>
> 
> Add support for handling hardware service reset events in the
> MANA driver. When the MANA kernel driver receives a hardware
> service event, it initiates a device reset and notifies userspace
> via IBV_EVENT_DEVICE_FATAL. The MANA PMD handles this by
> performing an automatic teardown and recovery sequence.
> 
> The driver uses ethdev recovery events (ERR_RECOVERING,
> RECOVERY_SUCCESS, RECOVERY_FAILED) to notify upper layers of
> the reset lifecycle, and a PCI device removal event callback
> to distinguish hot-remove from service reset.
> 
> Changes since v7:
> - Moved heavy teardown (dev_stop, IPC to secondaries, dev_close,
>   MR btree free) from mana_reset_enter (EAL interrupt thread)
>   to mana_reset_thread (control thread). The interrupt handler
>   now only sets state, drains in-flight bursts, and spawns the
>   thread. Teardown runs immediately in the control thread before
>   the recovery timer wait, avoiding blocking the interrupt thread
>   on multi-second IPC timeouts and ibverbs calls. Each function
>   now owns its own lock scope with no lock hand-off between
>   threads.
> - Fixed self-join deadlock: clear reset_thread_active before
>   emitting RECOVERY_SUCCESS/FAILED callbacks from the reset
>   thread. Without this, if the callback calls dev_stop/dev_close,
>   mana_join_reset_thread attempts to join the current thread.
> - Simplified burst_state from encoding device state in bits 1+
>   to a single blocked flag (bit 1). Only one value was ever
>   stored, so the multi-state encoding was misleading. Added
>   MANA_BURST_BLOCKED constant.
> - Updated mana.rst to reflect that teardown runs on the control
>   thread, not the interrupt handler.
> 
> Changes since v6:
> - Rebased onto latest upstream for-main
> - Replaced removed RTE_ETH_DEV_TO_PCI macro with
>   RTE_CLASS_TO_BUS_DEVICE (upstream commit 4757b8df04
>   removed the old bus-specific ethdev convenience macros)
> 
> Changes since v5:
> - Replaced RCU QSBR with per-queue atomic burst_state using a
>   single-variable CAS design: bit 0 is the in-burst flag, bit 1
>   is the blocked flag. The data path uses CAS(0→1) to enter
>   burst and fetch_and(~1) to exit. The reset path uses fetch_or
>   to set the blocked bit and polls bit 0 to drain in-flight
>   bursts. This eliminates the two-variable Dekker pattern and the
>   need for sequential consistency (seq_cst) ordering.
> - Removed librte_rcu dependency
> - Removed __rte_no_thread_safety_analysis annotations (no longer
>   needed after mutex conversion)
> - Moved ERR_RECOVERING event emission before acquiring
>   reset_ops_lock and before mana_reset_enter, so upper layers
>   (e.g. netvsc) can switch data path before mana stops queues.
>   Emitting outside the lock avoids deadlock if the callback
>   calls dev_stop or dev_close.
> - Replaced MANA_OPS_*_LOCK macros with mana_reset_trylock()
>   helper function and explicit per-operation wrappers
> - Removed unused rte_alarm.h and rte_lock_annotations.h includes
> - Added RECOVERY_FAILED event when mana_reset_enter fails
>   internally, so the application always receives a terminal event
> - Added mana_clear_burst_state() helper to clear per-queue
>   burst_state on failure paths (reset_failed, dev_stop_lock,
>   dev_close_lock) preventing permanent silent packet drop after
>   a failed reset
> 
> Changes since v4:
> - Fixed stale rte_spinlock_unlock call in mana_intr_handler that
>   was missed during the spinlock-to-mutex conversion, causing a
>   -Wincompatible-pointer-types warning
> 
> Changes since v3:
> - Converted reset_ops_lock from rte_spinlock_t to pthread_mutex_t
>   with PTHREAD_PROCESS_SHARED, since the lock is held across
>   blocking IB verbs calls and IPC with 5s timeout
> - Removed rte_dev_event_callback_unregister retry loop to avoid
>   deadlock: the callback itself blocks on reset_ops_lock, so
>   retrying on -EAGAIN while holding the lock is a deadlock
> - Introduced mana_join_reset_thread() helper using CAS on
>   reset_thread_active to prevent double-join undefined behavior
> - Added reset thread join in mana_dev_uninit to prevent thread
>   leak on device removal
> - Fixed ibv handle leak: priv->ib_ctx is now only set to NULL
>   after ibv_close_device succeeds
> - Fixed misleading "All secondary threads are quiescent" log in
>   mana_mp_reset_enter — changed to "Secondary doorbell pages
>   unmapped" since actual quiescence is enforced by the primary's
>   per-queue atomic flag check before IPC is sent
> - Changed event list in mana.rst to RST definition list style
> - Squashed documentation into the feature patch per convention
> 
> Changes since v2:
> - Fixed dev_state_qsv memory leak on device removal
> - Fixed reset thread TCB/stack leak: reset_thread_active is now
>   only cleared by the joiner, not the thread itself
> - Fixed second reset crash: removed reset thread join logic from
>   mana_dev_close (inner function) to avoid corrupting dev_state
>   when called from mana_reset_enter
> - Made reset_thread_active RTE_ATOMIC(bool) with explicit ordering
> - Added retry loop for rte_dev_event_callback_unregister on -EAGAIN
> - Initialized condvar/mutex with PTHREAD_PROCESS_SHARED since priv
>   is in hugepage shared memory
> - Added re-check of dev_state after lock acquisition in
>   mana_intr_handler to prevent racing with pci_remove_event_cb
> - Replaced (void *)0 with NULL in mp.c
> - Added lock ownership comment block at mana_reset_enter
> - Documented rte_dev_event_monitor_start() requirement
> - Added mana.rst documentation and release note
> 
> Changes since v1:
> - Removed net/netvsc patch from this series
> - Simplified reset exit: mana_reset_exit calls
>   mana_reset_exit_delay directly instead of spawning a thread
> - Added __rte_no_thread_safety_analysis annotations for clang
> - Switched to rte_thread_create_internal_control
> - Fixed declaration-after-statement style issues
> - Removed unnecessary blank lines and stale comments
> 
> Wei Hu (1):
>   net/mana: add device reset support
> 
>  doc/guides/nics/mana.rst               |   40 +
>  doc/guides/rel_notes/release_26_07.rst |    8 +
>  drivers/net/mana/mana.c                | 1076 ++++++++++++++++++++++--
>  drivers/net/mana/mana.h                |   52 +-
>  drivers/net/mana/mp.c                  |   89 +-
>  drivers/net/mana/mr.c                  |    6 +-
>  drivers/net/mana/rx.c                  |   23 +-
>  drivers/net/mana/tx.c                  |   44 +-
>  8 files changed, 1230 insertions(+), 108 deletions(-)
> 


AI review thread still sees some issues:

This is close. Teardown is off the interrupt thread now, the cross-thread lock
hand-off is gone, and the burst_state encoding reads cleanly. One real problem
left.

The reset thread leaks on every successful recovery. mana_reset_exit_delay and
the reset_failed path in mana_reset_thread clear reset_thread_active from inside
the thread itself, without joining. The thread is created joinable via
rte_thread_create_internal_control, so terminating it unjoined leaks its
resources, and because the flag is now false none of the join sites
(mana_join_reset_thread, the join-previous block in mana_reset_enter,
mana_dev_uninit) will ever reap it. The PCI-remove abort path leaves the flag
true and is reaped later, which is the inconsistency that exposes this: some
exits expect a join and some do not, and the latter have no reaper.

Simplest fix is to detach the reset thread (rte_thread_detach) and drop the
reset_thread_active / mana_join_reset_thread machinery, using reset_ops_lock and
dev_state for the dev_stop/dev_close sequencing instead. That removes the
self-join hazard too. If you keep the join, don't clear the flag from inside the
thread; have mana_join_reset_thread detect the self case and skip only the join.

Minor: the recovery condvar wait in mana_reset_thread is a bare cond_timedwait.
If pci_remove signals before the thread reaches the wait, the wakeup is lost and
removal isn't seen until the 15s timer expires. Use a dev_state predicate loop
under reset_cond_mutex.

^ permalink raw reply

* Re: [PATCH v1 00/20] net/sxe2: added Linkdata sxe2 ethernet driver
From: Stephen Hemminger @ 2026-06-10 17:11 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260610013936.3634968-1-liujie5@linkdatatechnology.com>

On Wed, 10 Jun 2026 09:39:16 +0800
liujie5@linkdatatechnology.com wrote:

> From: Jie Liu <liujie5@linkdatatechnology.com>
> 
> This patch set implements core functionality for the SXE2 PMD,
> including basic driver framework, data path setup, and advanced
> offload features (VLAN, RSS,TM, PTP etc.).
> 
> Jie Liu (20):
>   net/sxe2: support AVX512 vectorized path for Rx and Tx
>   net/sxe2: add AVX2 vector data path for Rx and Tx
>   drivers: add supported packet types get callback
>   net/sxe2: support L2 filtering and MAC config
>   drivers: support RSS feature
>   net/sxe2: support TM hierarchy and shaping
>   net/sxe2: support IPsec inline protocol offload
>   net/sxe2: support statistics and multi-process
>   drivers: interrupt handling
>   net/sxe2: add NEON vec Rx/Tx burst functions
>   drivers: add support for VF representors
>   net/sxe2: add support for custom UDP tunnel ports
>   net/sxe2: support firmware version reading
>   net/sxe2: implement get monitor address
>   common/sxe2: add shared SFP module definitions
>   net/sxe2: support SFP module info and EEPROM access
>   net/sxe2: implement private dump info
>   net/sxe2: add mbuf validation in Tx debug mode
>   drivers: add testpmd commands for private features
>   net/sxe2: update sxe2 feature matrix docs
> 
>  doc/guides/nics/features/sxe2.ini          |   56 +
>  drivers/common/sxe2/sxe2_common.c          |  156 ++
>  drivers/common/sxe2/sxe2_common.h          |    4 +
>  drivers/common/sxe2/sxe2_flow_public.h     |  633 +++++++
>  drivers/common/sxe2/sxe2_ioctl_chnl.c      |  178 +-
>  drivers/common/sxe2/sxe2_ioctl_chnl_func.h |   18 +
>  drivers/common/sxe2/sxe2_msg.h             |  118 ++
>  drivers/common/sxe2/sxe2_ptype.h           | 1793 ++++++++++++++++++
>  drivers/net/sxe2/meson.build               |   56 +-
>  drivers/net/sxe2/sxe2_cmd_chnl.c           | 1587 +++++++++++++++-
>  drivers/net/sxe2/sxe2_cmd_chnl.h           |  139 ++
>  drivers/net/sxe2/sxe2_drv_cmd.h            |  521 +++++-
>  drivers/net/sxe2/sxe2_dump.c               |  304 +++
>  drivers/net/sxe2/sxe2_dump.h               |   12 +
>  drivers/net/sxe2/sxe2_ethdev.c             | 1531 +++++++++++++++-
>  drivers/net/sxe2/sxe2_ethdev.h             |  112 +-
>  drivers/net/sxe2/sxe2_ethdev_repr.c        |  610 ++++++
>  drivers/net/sxe2/sxe2_ethdev_repr.h        |   32 +
>  drivers/net/sxe2/sxe2_filter.c             |  895 +++++++++
>  drivers/net/sxe2/sxe2_filter.h             |  100 +
>  drivers/net/sxe2/sxe2_flow.c               | 1391 ++++++++++++++
>  drivers/net/sxe2/sxe2_flow.h               |   30 +
>  drivers/net/sxe2/sxe2_flow_define.h        |  144 ++
>  drivers/net/sxe2/sxe2_flow_parse_action.c  | 1182 ++++++++++++
>  drivers/net/sxe2/sxe2_flow_parse_action.h  |   23 +
>  drivers/net/sxe2/sxe2_flow_parse_engine.c  |  106 ++
>  drivers/net/sxe2/sxe2_flow_parse_engine.h  |   13 +
>  drivers/net/sxe2/sxe2_flow_parse_pattern.c | 1935 ++++++++++++++++++++
>  drivers/net/sxe2/sxe2_flow_parse_pattern.h |   46 +
>  drivers/net/sxe2/sxe2_ipsec.c              | 1565 ++++++++++++++++
>  drivers/net/sxe2/sxe2_ipsec.h              |  254 +++
>  drivers/net/sxe2/sxe2_irq.c                | 1026 +++++++++++
>  drivers/net/sxe2/sxe2_irq.h                |   25 +
>  drivers/net/sxe2/sxe2_mac.c                |  535 ++++++
>  drivers/net/sxe2/sxe2_mac.h                |   84 +
>  drivers/net/sxe2/sxe2_mp.c                 |  414 +++++
>  drivers/net/sxe2/sxe2_mp.h                 |   67 +
>  drivers/net/sxe2/sxe2_queue.c              |   17 +-
>  drivers/net/sxe2/sxe2_rss.c                |  584 ++++++
>  drivers/net/sxe2/sxe2_rss.h                |   81 +
>  drivers/net/sxe2/sxe2_rx.c                 |   38 +
>  drivers/net/sxe2/sxe2_rx.h                 |    2 +
>  drivers/net/sxe2/sxe2_security.c           |  335 ++++
>  drivers/net/sxe2/sxe2_security.h           |   77 +
>  drivers/net/sxe2/sxe2_stats.c              |  591 ++++++
>  drivers/net/sxe2/sxe2_stats.h              |   39 +
>  drivers/net/sxe2/sxe2_switchdev.c          |  332 ++++
>  drivers/net/sxe2/sxe2_switchdev.h          |   33 +
>  drivers/net/sxe2/sxe2_testpmd.c            |  733 ++++++++
>  drivers/net/sxe2/sxe2_testpmd_lib.c        |  969 ++++++++++
>  drivers/net/sxe2/sxe2_testpmd_lib.h        |  142 ++
>  drivers/net/sxe2/sxe2_tm.c                 | 1169 ++++++++++++
>  drivers/net/sxe2/sxe2_tm.h                 |   78 +
>  drivers/net/sxe2/sxe2_tx.c                 |    7 +
>  drivers/net/sxe2/sxe2_txrx.c               |  176 +-
>  drivers/net/sxe2/sxe2_txrx.h               |    4 +
>  drivers/net/sxe2/sxe2_txrx_check_mbuf.c    |  595 ++++++
>  drivers/net/sxe2/sxe2_txrx_check_mbuf.h    |   38 +
>  drivers/net/sxe2/sxe2_txrx_poll.c          |  243 ++-
>  drivers/net/sxe2/sxe2_txrx_vec.c           |   46 +-
>  drivers/net/sxe2/sxe2_txrx_vec.h           |   38 +-
>  drivers/net/sxe2/sxe2_txrx_vec_avx2.c      |  776 ++++++++
>  drivers/net/sxe2/sxe2_txrx_vec_avx512.c    |  897 +++++++++
>  drivers/net/sxe2/sxe2_txrx_vec_common.h    |    1 +
>  drivers/net/sxe2/sxe2_txrx_vec_neon.c      |  721 ++++++++
>  drivers/net/sxe2/sxe2_vsi.c                |  146 ++
>  drivers/net/sxe2/sxe2_vsi.h                |   12 +-
>  drivers/net/sxe2/sxe2vf_regs.h             |   85 +
>  68 files changed, 26576 insertions(+), 124 deletions(-)
>  create mode 100644 drivers/common/sxe2/sxe2_flow_public.h
>  create mode 100644 drivers/common/sxe2/sxe2_msg.h
>  create mode 100644 drivers/common/sxe2/sxe2_ptype.h
>  create mode 100644 drivers/net/sxe2/sxe2_dump.c
>  create mode 100644 drivers/net/sxe2/sxe2_dump.h
>  create mode 100644 drivers/net/sxe2/sxe2_ethdev_repr.c
>  create mode 100644 drivers/net/sxe2/sxe2_ethdev_repr.h
>  create mode 100644 drivers/net/sxe2/sxe2_filter.c
>  create mode 100644 drivers/net/sxe2/sxe2_filter.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow.c
>  create mode 100644 drivers/net/sxe2/sxe2_flow.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow_define.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_action.c
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_action.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_engine.c
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_engine.h
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_pattern.c
>  create mode 100644 drivers/net/sxe2/sxe2_flow_parse_pattern.h
>  create mode 100644 drivers/net/sxe2/sxe2_ipsec.c
>  create mode 100644 drivers/net/sxe2/sxe2_ipsec.h
>  create mode 100644 drivers/net/sxe2/sxe2_irq.c
>  create mode 100644 drivers/net/sxe2/sxe2_mac.c
>  create mode 100644 drivers/net/sxe2/sxe2_mac.h
>  create mode 100644 drivers/net/sxe2/sxe2_mp.c
>  create mode 100644 drivers/net/sxe2/sxe2_mp.h
>  create mode 100644 drivers/net/sxe2/sxe2_rss.c
>  create mode 100644 drivers/net/sxe2/sxe2_rss.h
>  create mode 100644 drivers/net/sxe2/sxe2_security.c
>  create mode 100644 drivers/net/sxe2/sxe2_security.h
>  create mode 100644 drivers/net/sxe2/sxe2_stats.c
>  create mode 100644 drivers/net/sxe2/sxe2_stats.h
>  create mode 100644 drivers/net/sxe2/sxe2_switchdev.c
>  create mode 100644 drivers/net/sxe2/sxe2_switchdev.h
>  create mode 100644 drivers/net/sxe2/sxe2_testpmd.c
>  create mode 100644 drivers/net/sxe2/sxe2_testpmd_lib.c
>  create mode 100644 drivers/net/sxe2/sxe2_testpmd_lib.h
>  create mode 100644 drivers/net/sxe2/sxe2_tm.c
>  create mode 100644 drivers/net/sxe2/sxe2_tm.h
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_check_mbuf.c
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_check_mbuf.h
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx2.c
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_avx512.c
>  create mode 100644 drivers/net/sxe2/sxe2_txrx_vec_neon.c
>  create mode 100644 drivers/net/sxe2/sxe2vf_regs.h
> 

I assume you meant v15 for this. The simple things first.
The code needs rebase against main and you need to run at a minimum the
full set of build tests: devtools/test-meson-builds.sh

Also, I am concerned that not all patches will bisect cleanly.
You should run a build at each step; yes that means 20 builds.

The amount of new devargs and testpmd functions concerns me.
Each new option increases the number of options, and increases the probability
of bugs.

AI code review also found:

Deep-dive review of the latest sxe2 respin (01-20/20), including full
compilation of the assembled tree and per-commit bisect builds.

[09/20] drivers: interrupt handling

Error: patches 09/20 through 18/20 do not link. sxe2_irq.c (added by 09/20)
calls SXE2_DEV_TO_PCI() at lines 430 and 512, but that macro no longer exists:
upstream removed it from sxe2_ethdev.h in early June (it was present in the
May 29 tree, gone by June 8). Nothing in this series defines it. The series
only links from 19/20 onward, where the two calls are converted to
container_of(dev->device, struct rte_pci_device, device).

Verified empirically, not by patch inspection: I applied the series to a
June 8 upstream base (applies cleanly) and built at every one of the 20
commits. Commits 9-18 all fail with "undefined reference to SXE2_DEV_TO_PCI";
commits 19-20 link. Ten consecutive broken commits breaks git bisect.

This looks like rebase fallout from the upstream macro removal: the previous
revision removed the macro definition in 19/20 (correct when the base still
had it); after upstream deleted it, that hunk was dropped - but the
container_of conversion stayed in 19/20 instead of moving into 09/20 where
the calls are introduced. Fix: have 09/20 use RTE_DEV_TO_PCI(dev->device)
or container_of directly, and drop the conversion hunk from 19/20.

Also note the series needs a trivial rebase: 11/20 conflicts with the June 10
sxe2_common.c refactor on current main.



^ permalink raw reply

* Re: [PATCH v1 19/20] drivers: add testpmd commands for private features
From: Stephen Hemminger @ 2026-06-10 17:22 UTC (permalink / raw)
  To: liujie5; +Cc: dev
In-Reply-To: <20260610013936.3634968-20-liujie5@linkdatatechnology.com>

On Wed, 10 Jun 2026 09:39:35 +0800
liujie5@linkdatatechnology.com wrote:

> From: Jie Liu <liujie5@linkdatatechnology.com>
> 
> Introduce private testpmd commands and implementation files to enable
> debugging and testing of sxe2-specific hardware features (such as
> packet scheduling reset, UDP tunnel configuration, and IPsec ingress/
> egress offloads) directly within the testpmd application.
> 
> The parameters are parsed using the standard 'rte_kvargs' library during
> the PCI/vdev probing phase. Documentation for these parameters is also
> updated.
> 
> During memory hotplug events, the SXE2 driver needs to track memory
> segment layout changes to maintain internal DMA mappings. However,
> existing memseg walk functions (rte_memseg_walk) acquire memory locks
> and cannot be called from within memory event callbacks, leading to
> potential deadlocks.
> 
> This commit introduces sxe2_memseg_walk_cb() as a helper that walks
> memory segments using the thread-unsafe variant
> rte_memseg_walk_thread_unsafe(), which is safe to call from
> memory-related callbacks [citation:1][citation:3][citation:5].
> 
> The implementation follows the standard rte_memseg_walk_t prototype,
> processing each memseg to update driver-specific data structures.
> 
> Signed-off-by: Jie Liu <liujie5@linkdatatechnology.com>
> ---

This memory stuff looks problematic and needs more review.
At a minimum I see a pattern of not handling values from strtoul()
that are out of range.


I asked AI for a more detailed review, and it found the following:
[PATCH 19/20] drivers: add testpmd commands for private features

There is concern about the amount of driver-private testpmd plumbing and
devargs this patch adds. The raw command count (7) is within precedent
(i40e has 29, mlx5 13, ixgbe 11), but the mechanism and content are not.

Error: the command logic is placed in sxe2_testpmd_lib.c, compiled into the
driver library, and exposed through 14 new RTE_EXPORT_EXPERIMENTAL_SYMBOL
entries (sxe2_ipsec_egress_create, sxe2_ipsec_conf_set, sxe2_flow_rule_dump,
sxe2_udp_tunnel_operations, sxe2_stats_info_show, sxe2_testpmd_sched_reset,
etc). No upstream driver exports symbols for its testpmd commands; all six
existing drivers with testpmd integration compile their *_testpmd.c into
testpmd via testpmd_sources and use internal access. These exports are
vendor public API that any application can link against. The driver .so also
gains application state for the commands: g_tx_session[][], g_rx_session[][],
g_esp_header_offset[], g_sess_pool. SA-manager bookkeeping does not belong
in a PMD. Move the logic into sxe2_testpmd.c and drop all 14 exports; at
most RTE_EXPORT_INTERNAL_SYMBOL is appropriate here.

Error: three commands duplicate standard testpmd functionality the driver
already supports. "sxe2 flow rule dump" exists because the driver does not
implement the rte_flow dev_dump op; implement the op and the standard
"flow dump <port> all" works for every application. "sxe2 <port>
udp_tunnel_port add|rm" duplicates "port config <port> udp_tunnel_port
add|rm", which calls the udp_tunnel ops added in patch 12. "sxe2 show stats"
duplicates "show port xstats"; the driver already implements xstats, and
anything missing from xstats should be added there, not shown by a private
formatter.

Warning: the 9-subcommand ipsec suite (egress/ingress add/rm/show,
session-id and esp-hdr-offset set/get, flush, stats) is an SA management
application embedded in the driver. Inline crypto is exercised with
examples/ipsec-secgw, as done for other inline-crypto PMDs. If interactive
SA management in testpmd is needed, propose it as generic testpmd commands
over rte_security so all drivers benefit.

Warning: seven private devargs are added (flow-duplicate-pattern,
function-flow-direct, fnav-stat-type, drv-sw-stats, high-performance-mode,
sched-layer-mode, rx-low-latency) with no documentation: no Runtime
Configuration section in sxe2.rst and no RTE_PMD_REGISTER_PARAM_STRING, so
they are undiscoverable. Beyond documentation: flow-duplicate-pattern makes
rte_flow duplicate-rule semantics vary per boot option, which is not
acceptable for a standard API; fnav-stat-type and drv-sw-stats select stats
sources and belong in xstats; sched-layer-mode configures TM topology that
the rte_tm hierarchy built by the application should determine;
high-performance-mode accepts only the value 1 and is undocumented - if the
mode is safe make it the default, otherwise document the trade-off. Each
surviving devarg needs documentation and a rationale for why no standard
API covers it.


^ permalink raw reply

* Re: [PATCH v2 0/4] net/bnxt: miscellaneous bug fixes
From: Kishore Padmanabha @ 2026-06-10 18:17 UTC (permalink / raw)
  To: Mohammad Shuab Siddique; +Cc: dev, stable
In-Reply-To: <20260604225622.2285191-1-Mohammad-Shuab.Siddique@broadcom.com>


[-- Attachment #1.1: Type: text/plain, Size: 1305 bytes --]

On Thu, Jun 4, 2026 at 6:54 PM Mohammad Shuab Siddique <
mohammad-shuab.siddique@broadcom.com> wrote:

> From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
>
> This series collects four independent bug fixes for the bnxt PMD:
>
>  - Eliminate unnecessary long TX BDs when only checksum offload is needed
>  - Pass QP1 resource count correctly when configuring backing store
>  - Fix implicit integer sign-extension in the doorbell calculation
>  - Prevent VFs from attempting global RSS configuration
>
> All patches carry Fixes: tags and Cc: stable@dpdk.org.
>
> Changes in v2:
>  - Patch 4/4: add missing Fixes: tag for RSS hash mode fix
>
> Ajit Khaparde (2):
>   net/bnxt: modify check for short Tx BDs
>   net/bnxt: fix QP resource count in backing store config
>
> Mohammad Shuab Siddique (1):
>   net/bnxt: fix RSS hash mode configuration for VFs
>
> Zoe Cheimets (1):
>   net/bnxt: remove implicit integer sign-extension
>
>  drivers/net/bnxt/bnxt_ethdev.c |  4 ++--
>  drivers/net/bnxt/bnxt_hwrm.c   | 18 ++++++++++++++++--
>  drivers/net/bnxt/bnxt_ring.c   |  7 ++++---
>  drivers/net/bnxt/bnxt_txr.c    |  3 +--
>  4 files changed, 23 insertions(+), 9 deletions(-)
>
> patches merged into dpdk-next-net-brcm
Thanks

> --
> 2.47.3
>
>

[-- Attachment #1.2: Type: text/html, Size: 2024 bytes --]

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5493 bytes --]

^ permalink raw reply

* Re: [PATCH v2 0/5] net/bnxt: interrupt handling, external mbuf and stability fixes
From: Kishore Padmanabha @ 2026-06-10 18:17 UTC (permalink / raw)
  To: Mohammad Shuab Siddique; +Cc: dev, stable
In-Reply-To: <20260605005016.2290160-1-Mohammad-Shuab.Siddique@broadcom.com>


[-- Attachment #1.1: Type: text/plain, Size: 2084 bytes --]

On Thu, Jun 4, 2026 at 8:48 PM Mohammad Shuab Siddique <
mohammad-shuab.siddique@broadcom.com> wrote:

> From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
>
> This series addresses interrupt handling, external memory, and crash bugs:
>
>  - Fix incorrect completion validation for NQEs and RX completions causing
>    excess interrupts
>  - Use buf_addr instead of IOVA for mbufs from external memory pools
>  - Skip IOVA range check for external mbuf head nodes to avoid false
> failures
>  - Add null checks to prevent segfaults when accessing uninitialized
> structures
>  - Fix segfault on exit when bonded ports are present, by checking whether
>    ethdev has already freed the RX/TX queue arrays
>
> All patches carry Fixes: tags and Cc: stable@dpdk.org.
>
> Note: this series depends on series "net/bnxt: ULP stats timer and PTP".
>
> Changes in v2:
>  - Patch 1/5: replace printf() with PMD_DRV_LOG_LINE() (DPDK logging
> standard)
>  - Patch 2/5: replace custom bnxt_mbuf_buf_addr() with
> rte_pktmbuf_mtod_offset()
>
> Ajit Khaparde (2):
>   net/bnxt: use buf address for external mbuf
>   net/bnxt: prevent a potential segfault
>
> Keegan Freyhof (2):
>   net/bnxt: fix NQ/CQ processing for interrupt handling
>   net/bnxt: fix for segmentation fault that would occur on exit
>
> Mohammad Shuab Siddique (1):
>   net/bnxt: fix IOVA range check for external mbuf head node
>
>  drivers/net/bnxt/bnxt.h        |  2 +
>  drivers/net/bnxt/bnxt_cpr.c    | 100 ++++++++++++++++++++++++++++++++++
>  drivers/net/bnxt/bnxt_cpr.h    |  34 +++++++++++-
>  drivers/net/bnxt/bnxt_ethdev.c |   3 ++
>  drivers/net/bnxt/bnxt_hwrm.c   |   3 ++
>  drivers/net/bnxt/bnxt_ring.c   |  11 +++-
>  drivers/net/bnxt/bnxt_rxq.c    |  47 +++++++++++++++-
>  drivers/net/bnxt/bnxt_rxr.c    |   2 +-
>  drivers/net/bnxt/bnxt_stats.c  |  17 +++---
>  drivers/net/bnxt/bnxt_txr.c    |  19 +++++--
>  10 files changed, 223 insertions(+), 15 deletions(-)
>
> patches merged into dpdk-next-net-brcm
Thanks

> --
> 2.47.3
>
>

[-- Attachment #1.2: Type: text/html, Size: 2865 bytes --]

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5493 bytes --]

^ permalink raw reply

* Re: [PATCH 00/10] net/bnxt: vector mode V3 implementation and AVX2 improvements
From: Kishore Padmanabha @ 2026-06-10 18:18 UTC (permalink / raw)
  To: Mohammad Shuab Siddique; +Cc: dev, stable
In-Reply-To: <20260604031851.2267548-1-Mohammad-Shuab.Siddique@broadcom.com>


[-- Attachment #1.1: Type: text/plain, Size: 2765 bytes --]

On Wed, Jun 3, 2026 at 11:17 PM Mohammad Shuab Siddique <
mohammad-shuab.siddique@broadcom.com> wrote:

> From: Mohammad Shuab Siddique <mohammad-shuab.siddique@broadcom.com>
>
> This series adds vector mode support for BCM5760X (Thor2 / V3 packets)
> and fixes several AVX2 path issues:
>
>  - Implement AVX2 vector RX for V3 packet completions with VLAN TCI
> reporting
>  - Fix stale nr_bds values that could cause the producer to lag the
> consumer
>  - Fix incorrect advertisement of LRO offload capability
>  - Fix scalar RX path not checking rxcmp flags before setting the PTP mbuf
> flag
>  - Fix missing timestamps for non-PTP traffic when promiscuous
> timestamping is on
>  - Fix Tx ring corruption and burst truncation after an invalid Tx
> descriptor
>  - Optimise the AVX2 RX paths (dead code removal, register reduction for
> V3)
>  - Fix VLAN strip ol_flag being set per-port instead of per-packet for V3
>  - Add burst mode info entry for V3 in bnxt_rx_burst_info
>  - Fix V3 vector mode defaulting to cksum-good instead of cksum-unknown
>
> Most patches carry Fixes: tags. New functionality (V3 vector mode, AVX2
> optimisation) is targeted at 26.07.
>
> Note: this series depends on series "net/bnxt: stability fixes".
>
> Chenna Arnoori (1):
>   net/bnxt: fix RX timestamping for non-PTP packets
>
> Damodharam Ammepalli (1):
>   net/bnxt: fix advertising RX LRO offload capability
>
> Keegan Freyhof (6):
>   net/bnxt: vector mode implementation for V3 packets
>   net/bnxt: stale values in nr_bds are cleared
>   net/bnxt: optimization of the AVX2 RX paths
>   net/bnxt: fix for VLAN stripping being set incorrectly
>   net/bnxt: add vector AVX2 burst mode indicator for v3
>   net/bnxt: fix v3 vector mode not selecting cksum unknown
>
> Mohammad Shuab Siddique (1):
>   net/bnxt: scalar rx path disregarded rxcmp flags for setting ptp mbuf
>     flag
>
> Zoe Cheimets (1):
>   net/bnxt: fix packet burst truncation after invalid Tx descriptor
>
>  .gitignore                              |   1 +
>  drivers/net/bnxt/bnxt.h                 |   1 +
>  drivers/net/bnxt/bnxt_ethdev.c          |   6 +-
>  drivers/net/bnxt/bnxt_hwrm.c            |   7 +-
>  drivers/net/bnxt/bnxt_rxq.c             |   3 +-
>  drivers/net/bnxt/bnxt_rxr.c             |  25 +-
>  drivers/net/bnxt/bnxt_rxr.h             |  14 +-
>  drivers/net/bnxt/bnxt_rxtx_vec_avx2.c   | 444 +++++++++++++++++++++++-
>  drivers/net/bnxt/bnxt_rxtx_vec_common.h |  37 ++
>  drivers/net/bnxt/bnxt_stats.c           |   3 +
>  drivers/net/bnxt/bnxt_txr.c             | 170 ++++++++-
>  11 files changed, 677 insertions(+), 34 deletions(-)
>
> patches merged into dpdk-next-net-brcm
Thanks

> --
> 2.47.3
>
>

[-- Attachment #1.2: Type: text/html, Size: 3574 bytes --]

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5493 bytes --]

^ permalink raw reply

* [PATCH] dts: avoid Scapy MAC resolution in Rx split test
From: Thomas Monjalon @ 2026-06-10 18:32 UTC (permalink / raw)
  To: dev; +Cc: Luca Vizzarro, Patrick Robb

The test gets the Ethernet header length from Scapy with len(Ether()).

When building DTS API documentation, Sphinx imports the test module
and shows this warning:
WARNING: MAC address to reach destination not found. Using broadcast.

Use a dummy MAC address so Scapy no longer performs
destination resolution during import.

Fixes: 01c70544cffd ("dts: add selective Rx tests")

Signed-off-by: Thomas Monjalon <thomas@monjalon.net>
---
 dts/tests/TestSuite_rx_split.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dts/tests/TestSuite_rx_split.py b/dts/tests/TestSuite_rx_split.py
index 0c7913bbd8..5f5a2e6187 100644
--- a/dts/tests/TestSuite_rx_split.py
+++ b/dts/tests/TestSuite_rx_split.py
@@ -27,7 +27,7 @@
 from framework.test_suite import TestSuite, func_test
 
 PAYLOAD = bytes(range(256))
-ETHER_HDR_LEN = len(Ether())
+ETHER_HDR_LEN = len(Ether(dst="00:00:00:00:00:00"))
 IP_HDR_LEN = len(IP())
 ETHER_IP_HDR_LEN = ETHER_HDR_LEN + IP_HDR_LEN
 ETHER_MIN_FRAME_LEN = 60
-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 0/2] ring: replace use of rte_atomic
From: Stephen Hemminger @ 2026-06-10 18:43 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger
In-Reply-To: <20260602171552.686349-1-stephen@networkplumber.org>

This is part of the broader rte_atomic32 deprecation work, sent
separately because it is the most complex part and benefits from
independent review.

Convert lib/ring off rte_atomic32 and onto the C11 memory model,
except for the ring head compare-and-swap, where a special case
is needed: on x86 with GCC, using C11 atomics there produces
measurably worse code.

After this series only __rte_ring_headtail_move_head has separate
C11 and GCC-builtin implementations; everything else uses the same
code on all architectures. The default RTE_USE_C11_MEM_MODEL
selection per architecture is unchanged.

v3:
  - rebase and squash patches
  - keep original code for x86 single thread case

Stephen Hemminger (2):
  ring: split single thread vs multi-thread cases
  ring: replace rte_atomic32 with __sync builtin

 lib/ring/meson.build             |   2 +-
 lib/ring/rte_ring_c11_pvt.h      | 107 +++++++++++++--------
 lib/ring/rte_ring_elem_pvt.h     |  53 ++++++++---
 lib/ring/rte_ring_gcc_pvt.h      | 155 +++++++++++++++++++++++++++++++
 lib/ring/rte_ring_generic_pvt.h  | 119 ------------------------
 lib/ring/rte_ring_hts_elem_pvt.h |   8 +-
 lib/ring/soring.c                |  34 ++++---
 7 files changed, 289 insertions(+), 189 deletions(-)
 create mode 100644 lib/ring/rte_ring_gcc_pvt.h
 delete mode 100644 lib/ring/rte_ring_generic_pvt.h

-- 
2.53.0


^ permalink raw reply

* [PATCH v3 1/2] ring: split single thread vs multi-thread cases
From: Stephen Hemminger @ 2026-06-10 18:43 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Konstantin Ananyev, Wathsala Vithanage
In-Reply-To: <20260610184701.657769-1-stephen@networkplumber.org>

The move head function has an optimization for updating the head
when used on a single-threaded ring. The code is cleaner if the two
cases are split into separate functions.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
Tested-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/ring/rte_ring_c11_pvt.h     | 100 +++++++++++++++++++++++++-------
 lib/ring/rte_ring_elem_pvt.h    |  16 +++--
 lib/ring/rte_ring_generic_pvt.h |  77 ++++++++++++++++++++----
 lib/ring/soring.c               |  24 +++++---
 4 files changed, 171 insertions(+), 46 deletions(-)

diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
index 07b6efc416..5afc14dec9 100644
--- a/lib/ring/rte_ring_c11_pvt.h
+++ b/lib/ring/rte_ring_c11_pvt.h
@@ -46,6 +46,7 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
 
 /**
  * @internal This is a helper function that moves the producer/consumer head
+ *    optimized for single threaded case
  *
  * @param d
  *   A pointer to the headtail structure with head value to be moved
@@ -54,8 +55,6 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
  *   function only reads tail value from it
  * @param capacity
  *   Either ring capacity value (for producer), or zero (for consumer)
- * @param is_st
- *   Indicates whether multi-thread safe path is needed or not
  * @param n
  *   The number of elements we want to move head value on
  * @param behavior
@@ -72,14 +71,77 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
  *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only
  */
 static __rte_always_inline unsigned int
-__rte_ring_headtail_move_head(struct rte_ring_headtail *d,
+__rte_ring_headtail_move_head_st(struct rte_ring_headtail *d,
 		const struct rte_ring_headtail *s, uint32_t capacity,
-		unsigned int is_st, unsigned int n,
+		unsigned int n,
 		enum rte_ring_queue_behavior behavior,
 		uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
 {
 	uint32_t stail;
-	int success;
+
+	/* Single producer: only this thread writes d->head,
+	 * so a relaxed load is sufficient.
+	 */
+	*old_head = rte_atomic_load_explicit(&d->head, rte_memory_order_relaxed);
+
+	/* Acquire pairs with the consumer's release-store of tail in __rte_ring_update_tail,
+	 * ensuring the consumer's ring-element reads are complete before
+	 * we observe the updated tail.
+	 */
+	stail = rte_atomic_load_explicit(&s->tail, rte_memory_order_acquire);
+
+	/* Unsigned subtraction is modulo 2^32, so entries is always in
+	 * [0, capacity) even if old_head > stail.
+	 */
+	*entries = capacity + stail - *old_head;
+
+	/* check that we have enough room in ring */
+	if (unlikely(n > *entries))
+		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries;
+
+	if (n > 0) {
+		*new_head = *old_head + n;
+		rte_atomic_store_explicit(&d->head, *new_head, rte_memory_order_relaxed);
+	}
+
+	return n;
+}
+
+/**
+ * @internal This is a helper function that moves the producer/consumer head
+ *    for use in multi-thread safe path
+ *
+ * @param d
+ *   A pointer to the headtail structure with head value to be moved
+ * @param s
+ *   A pointer to the counter-part headtail structure. Note that this
+ *   function only reads tail value from it
+ * @param capacity
+ *   Either ring capacity value (for producer), or zero (for consumer)
+ * @param n
+ *   The number of elements we want to move head value on
+ * @param behavior
+ *   RTE_RING_QUEUE_FIXED:    Move on a fixed number of items
+ *   RTE_RING_QUEUE_VARIABLE: Move on as many items as possible
+ * @param old_head
+ *   Returns head value as it was before the move
+ * @param new_head
+ *   Returns the new head value
+ * @param entries
+ *   Returns the number of ring entries available BEFORE head was moved
+ * @return
+ *   Actual number of objects the head was moved on
+ *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only
+ */
+static __rte_always_inline unsigned int
+__rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
+		const struct rte_ring_headtail *s, uint32_t capacity,
+		unsigned int n,
+		enum rte_ring_queue_behavior behavior,
+		uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
+{
+	uint32_t stail;
+	bool success;
 	unsigned int max = n;
 
 	/*
@@ -120,25 +182,21 @@ __rte_ring_headtail_move_head(struct rte_ring_headtail *d,
 			return 0;
 
 		*new_head = *old_head + n;
-		if (is_st) {
-			d->head = *new_head;
-			success = 1;
-		} else
-			/* on failure, *old_head is updated */
-			/*
-			 * R1/A2.
-			 * R1: Establishes a synchronizing edge with A0 of a
-			 * different thread.
-			 * A2: Establishes a synchronizing edge with R1 of a
-			 * different thread to observe same value for stail
-			 * observed by that thread on CAS failure (to retry
-			 * with an updated *old_head).
-			 */
-			success = rte_atomic_compare_exchange_strong_explicit(
+		/* on failure, *old_head is updated */
+		/*
+		 * R1/A2.
+		 * R1: Establishes a synchronizing edge with A0 of a
+		 * different thread.
+		 * A2: Establishes a synchronizing edge with R1 of a
+		 * different thread to observe same value for stail
+		 * observed by that thread on CAS failure (to retry
+		 * with an updated *old_head).
+		 */
+		success = rte_atomic_compare_exchange_strong_explicit(
 					&d->head, old_head, *new_head,
 					rte_memory_order_release,
 					rte_memory_order_acquire);
-	} while (unlikely(success == 0));
+	} while (unlikely(!success));
 	return n;
 }
 
diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h
index 6eafae121f..a0fdec9812 100644
--- a/lib/ring/rte_ring_elem_pvt.h
+++ b/lib/ring/rte_ring_elem_pvt.h
@@ -341,8 +341,12 @@ __rte_ring_move_prod_head(struct rte_ring *r, unsigned int is_sp,
 		uint32_t *old_head, uint32_t *new_head,
 		uint32_t *free_entries)
 {
-	return __rte_ring_headtail_move_head(&r->prod, &r->cons, r->capacity,
-			is_sp, n, behavior, old_head, new_head, free_entries);
+	if (is_sp)
+		return __rte_ring_headtail_move_head_st(&r->prod, &r->cons, r->capacity,
+				n, behavior, old_head, new_head, free_entries);
+	else
+		return __rte_ring_headtail_move_head_mt(&r->prod, &r->cons, r->capacity,
+				n, behavior, old_head, new_head, free_entries);
 }
 
 /**
@@ -374,8 +378,12 @@ __rte_ring_move_cons_head(struct rte_ring *r, unsigned int is_sc,
 		uint32_t *old_head, uint32_t *new_head,
 		uint32_t *entries)
 {
-	return __rte_ring_headtail_move_head(&r->cons, &r->prod, 0,
-			is_sc, n, behavior, old_head, new_head, entries);
+	if (is_sc)
+		return __rte_ring_headtail_move_head_st(&r->cons, &r->prod, 0,
+				n, behavior, old_head, new_head, entries);
+	else
+		return __rte_ring_headtail_move_head_mt(&r->cons, &r->prod, 0,
+				n, behavior, old_head, new_head, entries);
 }
 
 /**
diff --git a/lib/ring/rte_ring_generic_pvt.h b/lib/ring/rte_ring_generic_pvt.h
index affd2d5ba7..c044b0824f 100644
--- a/lib/ring/rte_ring_generic_pvt.h
+++ b/lib/ring/rte_ring_generic_pvt.h
@@ -42,6 +42,7 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
 
 /**
  * @internal This is a helper function that moves the producer/consumer head
+ *    for use in multi-thread safe path
  *
  * @param d
  *   A pointer to the headtail structure with head value to be moved
@@ -50,8 +51,6 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
  *   function only reads tail value from it
  * @param capacity
  *   Either ring capacity value (for producer), or zero (for consumer)
- * @param is_st
- *   Indicates whether multi-thread safe path is needed or not
  * @param n
  *   The number of elements we want to move head value on
  * @param behavior
@@ -68,10 +67,9 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
  *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only
  */
 static __rte_always_inline unsigned int
-__rte_ring_headtail_move_head(struct rte_ring_headtail *d,
+__rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
 		const struct rte_ring_headtail *s, uint32_t capacity,
-		unsigned int is_st, unsigned int n,
-		enum rte_ring_queue_behavior behavior,
+		unsigned int n, enum rte_ring_queue_behavior behavior,
 		uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
 {
 	unsigned int max = n;
@@ -105,15 +103,70 @@ __rte_ring_headtail_move_head(struct rte_ring_headtail *d,
 			return 0;
 
 		*new_head = *old_head + n;
-		if (is_st) {
-			d->head = *new_head;
-			success = 1;
-		} else
-			success = rte_atomic32_cmpset(
-					(uint32_t *)(uintptr_t)&d->head,
-					*old_head, *new_head);
+		success = rte_atomic32_cmpset(
+				(uint32_t *)(uintptr_t)&d->head,
+				*old_head, *new_head);
 	} while (unlikely(success == 0));
 	return n;
 }
 
+/**
+ * @internal This is a helper function that moves the producer/consumer head
+ *    optimized for single threaded case
+ *
+ * @param d
+ *   A pointer to the headtail structure with head value to be moved
+ * @param s
+ *   A pointer to the counter-part headtail structure. Note that this
+ *   function only reads tail value from it
+ * @param capacity
+ *   Either ring capacity value (for producer), or zero (for consumer)
+ * @param n
+ *   The number of elements we want to move head value on
+ * @param behavior
+ *   RTE_RING_QUEUE_FIXED:    Move on a fixed number of items
+ *   RTE_RING_QUEUE_VARIABLE: Move on as many items as possible
+ * @param old_head
+ *   Returns head value as it was before the move
+ * @param new_head
+ *   Returns the new head value
+ * @param entries
+ *   Returns the number of ring entries available BEFORE head was moved
+ * @return
+ *   Actual number of objects the head was moved on
+ *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only
+ */
+static __rte_always_inline unsigned int
+__rte_ring_headtail_move_head_st(struct rte_ring_headtail *d,
+		const struct rte_ring_headtail *s, uint32_t capacity,
+		unsigned int n,
+		enum rte_ring_queue_behavior behavior,
+		uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
+{
+	*old_head = d->head;
+
+	/* add rmb barrier to avoid load/load reorder in weak
+	 * memory model. It is noop on x86
+	 */
+	rte_smp_rmb();
+
+	/*
+	 *  The subtraction is done between two unsigned 32bits value
+	 * (the result is always modulo 32 bits even if we have
+	 * *old_head > s->tail). So 'entries' is always between 0
+	 * and capacity (which is < size).
+	 */
+	*entries = (capacity + s->tail - *old_head);
+
+	/* check that we have enough room in ring */
+	if (unlikely(n > *entries))
+		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries;
+
+	if (likely(n > 0)) {
+		*new_head = *old_head + n;
+		d->head = *new_head;
+	}
+	return n;
+}
+
 #endif /* _RTE_RING_GENERIC_PVT_H_ */
diff --git a/lib/ring/soring.c b/lib/ring/soring.c
index e9c75619fe..22f9c60e9c 100644
--- a/lib/ring/soring.c
+++ b/lib/ring/soring.c
@@ -135,9 +135,12 @@ __rte_soring_move_prod_head(struct rte_soring *r, uint32_t num,
 
 	switch (st) {
 	case RTE_RING_SYNC_ST:
+		n = __rte_ring_headtail_move_head_st(&r->prod.ht, &r->cons.ht,
+			r->capacity, num, behavior, head, next, free);
+		break;
 	case RTE_RING_SYNC_MT:
-		n = __rte_ring_headtail_move_head(&r->prod.ht, &r->cons.ht,
-			r->capacity, st, num, behavior, head, next, free);
+		n = __rte_ring_headtail_move_head_mt(&r->prod.ht, &r->cons.ht,
+			r->capacity, num, behavior, head, next, free);
 		break;
 	case RTE_RING_SYNC_MT_RTS:
 		n = __rte_ring_rts_move_head(&r->prod.rts, &r->cons.ht,
@@ -168,9 +171,13 @@ __rte_soring_move_cons_head(struct rte_soring *r, uint32_t stage, uint32_t num,
 
 	switch (st) {
 	case RTE_RING_SYNC_ST:
+		n = __rte_ring_headtail_move_head_st(&r->cons.ht,
+			&r->stage[stage].ht, 0, num, behavior,
+			head, next, avail);
+		break;
 	case RTE_RING_SYNC_MT:
-		n = __rte_ring_headtail_move_head(&r->cons.ht,
-			&r->stage[stage].ht, 0, st, num, behavior,
+		n = __rte_ring_headtail_move_head_mt(&r->cons.ht,
+			&r->stage[stage].ht, 0, num, behavior,
 			head, next, avail);
 		break;
 	case RTE_RING_SYNC_MT_RTS:
@@ -309,9 +316,8 @@ soring_enqueue_start(struct rte_soring *r, uint32_t num,
 
 	switch (st) {
 	case RTE_RING_SYNC_ST:
-		n = __rte_ring_headtail_move_head(&r->prod.ht, &r->cons.ht,
-			r->capacity, RTE_RING_SYNC_ST, num, behavior,
-			&head, &next, &free);
+		n = __rte_ring_headtail_move_head_st(&r->prod.ht, &r->cons.ht,
+			r->capacity, num, behavior, &head, &next, &free);
 		break;
 	case RTE_RING_SYNC_MT_HTS:
 		n = __rte_ring_hts_move_head(&r->prod.hts, &r->cons.ht,
@@ -419,8 +425,8 @@ soring_dequeue_start(struct rte_soring *r, void *objs, void *meta,
 
 	switch (st) {
 	case RTE_RING_SYNC_ST:
-		n = __rte_ring_headtail_move_head(&r->cons.ht, &r->stage[ns].ht,
-			0, RTE_RING_SYNC_ST, num, behavior, &head, &next,
+		n = __rte_ring_headtail_move_head_st(&r->cons.ht, &r->stage[ns].ht,
+			0, num, behavior, &head, &next,
 			&avail);
 		break;
 	case RTE_RING_SYNC_MT_HTS:
-- 
2.53.0


^ permalink raw reply related

* [PATCH v3 2/2] ring: replace rte_atomic32 with __sync builtin
From: Stephen Hemminger @ 2026-06-10 18:43 UTC (permalink / raw)
  To: dev; +Cc: Stephen Hemminger, Konstantin Ananyev, Wathsala Vithanage
In-Reply-To: <20260610184701.657769-1-stephen@networkplumber.org>

Replace use of the deprecated rte_atomic32 code with GCC builtin
atomic operations on x86. The C11 version used on other architectures
is unchanged.

Although it would be preferable to use C11 on all architectures,
there is a performance loss if we do it that way.

On x86 i9-13900H, two physical cores MP/MC (cycles/elem),
ring_perf test with GCC 14.2:

  n      asm    sync     c11
  8    72.86   72.12   89.01
  32   18.74   18.80   24.62
  64   10.07    9.86   12.41
  128   6.99    6.74    9.01
  256   6.38    6.20    7.34

Pure C11 regresses 15-30% due to __atomic_compare_exchange_n's
failure-writeback semantics.

Drop the now-unused enqueue argument.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Konstantin Ananyev <konstantin.ananyev@huawei.com>
---
 lib/ring/meson.build                          |   2 +-
 lib/ring/rte_ring_c11_pvt.h                   |  25 ----
 lib/ring/rte_ring_elem_pvt.h                  |  37 +++--
 ..._ring_generic_pvt.h => rte_ring_gcc_pvt.h} | 141 ++++++++----------
 lib/ring/rte_ring_hts_elem_pvt.h              |   8 +-
 lib/ring/soring.c                             |  10 +-
 6 files changed, 99 insertions(+), 124 deletions(-)
 rename lib/ring/{rte_ring_generic_pvt.h => rte_ring_gcc_pvt.h} (81%)

diff --git a/lib/ring/meson.build b/lib/ring/meson.build
index 21f2c12989..2ba160b178 100644
--- a/lib/ring/meson.build
+++ b/lib/ring/meson.build
@@ -9,7 +9,7 @@ indirect_headers += files (
         'rte_ring_elem.h',
         'rte_ring_elem_pvt.h',
         'rte_ring_c11_pvt.h',
-        'rte_ring_generic_pvt.h',
+        'rte_ring_gcc_pvt.h',
         'rte_ring_hts.h',
         'rte_ring_hts_elem_pvt.h',
         'rte_ring_peek.h',
diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
index 5afc14dec9..a6c14921d3 100644
--- a/lib/ring/rte_ring_c11_pvt.h
+++ b/lib/ring/rte_ring_c11_pvt.h
@@ -19,31 +19,6 @@
  * For more information please refer to <rte_ring.h>.
  */
 
-/**
- * @internal This function updates tail values.
- */
-static __rte_always_inline void
-__rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
-		uint32_t new_val, uint32_t single, uint32_t enqueue)
-{
-	RTE_SET_USED(enqueue);
-
-	/*
-	 * If there are other enqueues/dequeues in progress that preceded us,
-	 * we need to wait for them to complete
-	 */
-	if (!single)
-		rte_wait_until_equal_32((uint32_t *)(uintptr_t)&ht->tail, old_val,
-			rte_memory_order_relaxed);
-
-	/*
-	 * R0: Establishes a synchronizing edge with load-acquire of tail at A1.
-	 * Ensures that memory effects by this thread on ring elements array
-	 * is observed by a different thread of the other type.
-	 */
-	rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release);
-}
-
 /**
  * @internal This is a helper function that moves the producer/consumer head
  *    optimized for single threaded case
diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h
index a0fdec9812..29758d0bb8 100644
--- a/lib/ring/rte_ring_elem_pvt.h
+++ b/lib/ring/rte_ring_elem_pvt.h
@@ -299,17 +299,36 @@ __rte_ring_dequeue_elems(struct rte_ring *r, uint32_t cons_head,
 			cons_head & r->mask, esize, num);
 }
 
-/* Between load and load. there might be cpu reorder in weak model
- * (powerpc/arm).
- * There are 2 choices for the users
- * 1.use rmb() memory barrier
- * 2.use one-direction load_acquire/store_release barrier
- * It depends on performance test results.
+static __rte_always_inline void
+__rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
+		       uint32_t new_val, uint32_t single)
+{
+	/*
+	 * If there are other enqueues/dequeues in progress that preceded us,
+	 * we need to wait for them to complete
+	 */
+	if (!single)
+		rte_wait_until_equal_32((uint32_t *)(uintptr_t)&ht->tail, old_val,
+			rte_memory_order_relaxed);
+
+	/*
+	 * R0: Establishes a synchronizing edge with load-acquire of tail at A1.
+	 * Ensures that memory effects by this thread on ring elements array
+	 * is observed by a different thread of the other type.
+	 */
+	rte_atomic_store_explicit(&ht->tail, new_val, rte_memory_order_release);
+}
+
+/*
+ * The functions __rte_ring_headtail_move_head_{mt,st} each have two versions,
+ * selected by what is most efficient on a given architecture.
+ *
+ * The C11 version is preferred, but with GCC on x86 it has a 10% performance drop.
  */
 #ifdef RTE_USE_C11_MEM_MODEL
 #include "rte_ring_c11_pvt.h"
 #else
-#include "rte_ring_generic_pvt.h"
+#include "rte_ring_gcc_pvt.h"
 #endif
 
 /**
@@ -426,7 +445,7 @@ __rte_ring_do_enqueue_elem(struct rte_ring *r, const void *obj_table,
 
 	__rte_ring_enqueue_elems(r, prod_head, obj_table, esize, n);
 
-	__rte_ring_update_tail(&r->prod, prod_head, prod_next, is_sp, 1);
+	__rte_ring_update_tail(&r->prod, prod_head, prod_next, is_sp);
 end:
 	if (free_space != NULL)
 		*free_space = free_entries - n;
@@ -473,7 +492,7 @@ __rte_ring_do_dequeue_elem(struct rte_ring *r, void *obj_table,
 
 	__rte_ring_dequeue_elems(r, cons_head, obj_table, esize, n);
 
-	__rte_ring_update_tail(&r->cons, cons_head, cons_next, is_sc, 0);
+	__rte_ring_update_tail(&r->cons, cons_head, cons_next, is_sc);
 
 end:
 	if (available != NULL)
diff --git a/lib/ring/rte_ring_generic_pvt.h b/lib/ring/rte_ring_gcc_pvt.h
similarity index 81%
rename from lib/ring/rte_ring_generic_pvt.h
rename to lib/ring/rte_ring_gcc_pvt.h
index c044b0824f..340ece28c7 100644
--- a/lib/ring/rte_ring_generic_pvt.h
+++ b/lib/ring/rte_ring_gcc_pvt.h
@@ -7,42 +7,21 @@
  * Used as BSD-3 Licensed with permission from Kip Macy.
  */
 
-#ifndef _RTE_RING_GENERIC_PVT_H_
-#define _RTE_RING_GENERIC_PVT_H_
+#ifndef _RTE_RING_GCC_PVT_H_
+#define _RTE_RING_GCC_PVT_H_
 
 /**
- * @file rte_ring_generic_pvt.h
+ * @file rte_ring_gcc_pvt.h
  * It is not recommended to include this file directly,
  * include <rte_ring.h> instead.
  * Contains internal helper functions for MP/SP and MC/SC ring modes.
  * For more information please refer to <rte_ring.h>.
  */
 
-/**
- * @internal This function updates tail values.
- */
-static __rte_always_inline void
-__rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
-		uint32_t new_val, uint32_t single, uint32_t enqueue)
-{
-	if (enqueue)
-		rte_smp_wmb();
-	else
-		rte_smp_rmb();
-	/*
-	 * If there are other enqueues/dequeues in progress that preceded us,
-	 * we need to wait for them to complete
-	 */
-	if (!single)
-		rte_wait_until_equal_32((volatile uint32_t *)(uintptr_t)&ht->tail, old_val,
-			rte_memory_order_relaxed);
-
-	ht->tail = new_val;
-}
 
 /**
  * @internal This is a helper function that moves the producer/consumer head
- *    for use in multi-thread safe path
+ *    optimized for single threaded case
  *
  * @param d
  *   A pointer to the headtail structure with head value to be moved
@@ -67,52 +46,43 @@ __rte_ring_update_tail(struct rte_ring_headtail *ht, uint32_t old_val,
  *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only
  */
 static __rte_always_inline unsigned int
-__rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
+__rte_ring_headtail_move_head_st(struct rte_ring_headtail *d,
 		const struct rte_ring_headtail *s, uint32_t capacity,
-		unsigned int n, enum rte_ring_queue_behavior behavior,
+		unsigned int n,
+		enum rte_ring_queue_behavior behavior,
 		uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
 {
-	unsigned int max = n;
-	int success;
-
-	do {
-		/* Reset n to the initial burst count */
-		n = max;
 
-		*old_head = d->head;
+	*old_head = d->head;
 
-		/* add rmb barrier to avoid load/load reorder in weak
-		 * memory model. It is noop on x86
-		 */
-		rte_smp_rmb();
+	/* add rmb barrier to avoid load/load reorder in weak
+	 * memory model. It is noop on x86
+	 */
+	rte_smp_rmb();
 
-		/*
-		 *  The subtraction is done between two unsigned 32bits value
-		 * (the result is always modulo 32 bits even if we have
-		 * *old_head > s->tail). So 'entries' is always between 0
-		 * and capacity (which is < size).
-		 */
-		*entries = (capacity + s->tail - *old_head);
+	/*
+	 *  The subtraction is done between two unsigned 32bits value
+	 * (the result is always modulo 32 bits even if we have
+	 * *old_head > s->tail). So 'entries' is always between 0
+	 * and capacity (which is < size).
+	 */
+	*entries = capacity + s->tail - *old_head;
 
-		/* check that we have enough room in ring */
-		if (unlikely(n > *entries))
-			n = (behavior == RTE_RING_QUEUE_FIXED) ?
-					0 : *entries;
+	/* check that we have enough room in ring */
+	if (unlikely(n > *entries))
+		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries;
 
-		if (n == 0)
-			return 0;
+	if (n == 0)
+		return 0;
 
-		*new_head = *old_head + n;
-		success = rte_atomic32_cmpset(
-				(uint32_t *)(uintptr_t)&d->head,
-				*old_head, *new_head);
-	} while (unlikely(success == 0));
+	*new_head = *old_head + n;
+	d->head = *new_head;
 	return n;
 }
 
 /**
  * @internal This is a helper function that moves the producer/consumer head
- *    optimized for single threaded case
+ *    for use in multi-thread safe path
  *
  * @param d
  *   A pointer to the headtail structure with head value to be moved
@@ -137,36 +107,49 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
  *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only
  */
 static __rte_always_inline unsigned int
-__rte_ring_headtail_move_head_st(struct rte_ring_headtail *d,
+__rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
 		const struct rte_ring_headtail *s, uint32_t capacity,
-		unsigned int n,
-		enum rte_ring_queue_behavior behavior,
+		unsigned int n, enum rte_ring_queue_behavior behavior,
 		uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
 {
-	*old_head = d->head;
+	unsigned int max = n;
+	bool success;
 
-	/* add rmb barrier to avoid load/load reorder in weak
-	 * memory model. It is noop on x86
-	 */
-	rte_smp_rmb();
+	do {
+		/* Reset n to the initial burst count */
+		n = max;
 
-	/*
-	 *  The subtraction is done between two unsigned 32bits value
-	 * (the result is always modulo 32 bits even if we have
-	 * *old_head > s->tail). So 'entries' is always between 0
-	 * and capacity (which is < size).
-	 */
-	*entries = (capacity + s->tail - *old_head);
+		*old_head = d->head;
 
-	/* check that we have enough room in ring */
-	if (unlikely(n > *entries))
-		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries;
+		/* add fence to avoid load/load reorder in weak
+		 * memory model. It is noop on x86
+		 */
+		__atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+		/*
+		 *  The subtraction is done between two unsigned 32bits value
+		 * (the result is always modulo 32 bits even if we have
+		 * *old_head > s->tail). So 'entries' is always between 0
+		 * and capacity (which is < size).
+		 */
+		*entries = (capacity + s->tail - *old_head);
+
+		/* check that we have enough room in ring */
+		if (unlikely(n > *entries))
+			n = (behavior == RTE_RING_QUEUE_FIXED) ?
+					0 : *entries;
+
+		if (n == 0)
+			return 0;
 
-	if (likely(n > 0)) {
 		*new_head = *old_head + n;
-		d->head = *new_head;
-	}
+
+		success = __sync_bool_compare_and_swap(
+				(uint32_t *)(uintptr_t)&d->head,
+				*old_head, *new_head);
+	} while (unlikely(!success));
+
 	return n;
 }
 
-#endif /* _RTE_RING_GENERIC_PVT_H_ */
+#endif /* _RTE_RING_GCC_PVT_H_ */
diff --git a/lib/ring/rte_ring_hts_elem_pvt.h b/lib/ring/rte_ring_hts_elem_pvt.h
index a01089d15d..97ae240e2e 100644
--- a/lib/ring/rte_ring_hts_elem_pvt.h
+++ b/lib/ring/rte_ring_hts_elem_pvt.h
@@ -25,12 +25,10 @@
  */
 static __rte_always_inline void
 __rte_ring_hts_update_tail(struct rte_ring_hts_headtail *ht, uint32_t old_tail,
-	uint32_t num, uint32_t enqueue)
+			   uint32_t num)
 {
 	uint32_t tail;
 
-	RTE_SET_USED(enqueue);
-
 	tail = old_tail + num;
 
 	/*
@@ -217,7 +215,7 @@ __rte_ring_do_hts_enqueue_elem(struct rte_ring *r, const void *obj_table,
 
 	if (n != 0) {
 		__rte_ring_enqueue_elems(r, head, obj_table, esize, n);
-		__rte_ring_hts_update_tail(&r->hts_prod, head, n, 1);
+		__rte_ring_hts_update_tail(&r->hts_prod, head, n);
 	}
 
 	if (free_space != NULL)
@@ -258,7 +256,7 @@ __rte_ring_do_hts_dequeue_elem(struct rte_ring *r, void *obj_table,
 
 	if (n != 0) {
 		__rte_ring_dequeue_elems(r, head, obj_table, esize, n);
-		__rte_ring_hts_update_tail(&r->hts_cons, head, n, 0);
+		__rte_ring_hts_update_tail(&r->hts_cons, head, n);
 	}
 
 	if (available != NULL)
diff --git a/lib/ring/soring.c b/lib/ring/soring.c
index 22f9c60e9c..45292c0f78 100644
--- a/lib/ring/soring.c
+++ b/lib/ring/soring.c
@@ -202,21 +202,21 @@ __rte_soring_move_cons_head(struct rte_soring *r, uint32_t stage, uint32_t num,
 
 static __rte_always_inline void
 __rte_soring_update_tail(struct __rte_ring_headtail *rht,
-	enum rte_ring_sync_type st, uint32_t head, uint32_t next, uint32_t enq)
+		 enum rte_ring_sync_type st, uint32_t head, uint32_t next)
 {
 	uint32_t n;
 
 	switch (st) {
 	case RTE_RING_SYNC_ST:
 	case RTE_RING_SYNC_MT:
-		__rte_ring_update_tail(&rht->ht, head, next, st, enq);
+		__rte_ring_update_tail(&rht->ht, head, next, st);
 		break;
 	case RTE_RING_SYNC_MT_RTS:
 		__rte_ring_rts_update_tail(&rht->rts);
 		break;
 	case RTE_RING_SYNC_MT_HTS:
 		n = next - head;
-		__rte_ring_hts_update_tail(&rht->hts, head, n, enq);
+		__rte_ring_hts_update_tail(&rht->hts, head, n);
 		break;
 	default:
 		/* unsupported mode, shouldn't be here */
@@ -295,7 +295,7 @@ soring_enqueue(struct rte_soring *r, const void *objs,
 			&prod_head, &prod_next, &nb_free);
 	if (n != 0) {
 		__enqueue_elems(r, objs, meta, prod_head, n);
-		__rte_soring_update_tail(&r->prod, st, prod_head, prod_next, 1);
+		__rte_soring_update_tail(&r->prod, st, prod_head, prod_next);
 	}
 
 	if (free_space != NULL)
@@ -401,7 +401,7 @@ soring_dequeue(struct rte_soring *r, void *objs, void *meta,
 	/* we have some elems to consume */
 	if (n != 0) {
 		__dequeue_elems(r, objs, meta, cons_head, n);
-		__rte_soring_update_tail(&r->cons, st, cons_head, cons_next, 0);
+		__rte_soring_update_tail(&r->cons, st, cons_head, cons_next);
 	}
 
 	if (available != NULL)
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH 0/8] telemetry: thread-safe and bounded parameter parsing
From: Thomas Monjalon @ 2026-06-10 20:42 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, Bruce Richardson
In-Reply-To: <aiZ1jB3_MaDP4OTK@bricha3-mobl1.ger.corp.intel.com>

> > Stephen Hemminger (8):
> >   telemetry: fix thread-unsafe command parsing
> >   ethdev: make telemetry parameter parsing thread-safe
> >   dmadev: validate telemetry parameters
> >   security: harden telemetry parameter parsing
> >   eventdev: remove strtok from telemetry handlers
> >   eventdev/eth_rx: fix thread-unsafe telemetry parsing
> >   eventdev/eth_rx: reject out-of-range telemetry adapter ID
> >   eventdev/timer: reject out-of-range ID
> > 
> Series-Acked-by: Bruce Richardson <bruce.richardson@intel.com>

I ran the series through the automatic AI review in Codex,
and its findings do not seem relevant.

Applied, thanks.




^ permalink raw reply

* Re: [PATCH v3 0/2] ring: replace use of rte_atomic
From: Thomas Monjalon @ 2026-06-10 21:38 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev
In-Reply-To: <20260610184701.657769-1-stephen@networkplumber.org>

10/06/2026 20:43, Stephen Hemminger:
> v3:
>   - rebase and squash patches
>   - keep original code for x86 single thread case
> 
> Stephen Hemminger (2):
>   ring: split single thread vs multi-thread cases
>   ring: replace rte_atomic32 with __sync builtin

Applied, thanks.




^ permalink raw reply

* Re: [PATCH] power/amd_pstate: fix frequency matching for continuous scaling
From: Thomas Monjalon @ 2026-06-10 22:25 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: dev, stable, Anatoly Burakov, Sivaprasad Tummala
In-Reply-To: <20260328193419.106100-1-stephen@networkplumber.org>

28/03/2026 20:34, Stephen Hemminger:
> The power_init_for_setting_freq() function fails on systems using the
> amd-pstate-epp driver because the current CPU frequency read from
> scaling_setspeed does not exactly match any of the synthesized
> frequency buckets. Unlike acpi_cpufreq which provides a discrete list
> of frequencies, amd-pstate operates with continuously variable
> frequencies, so an exact match will rarely succeed.
> 
> For example, on a Ryzen 9 7945HX the sysfs file reports 2797172
> which rounds to 2797000, but this value does not appear in the
> generated frequency table.
> 
> Replace the exact match lookup with a nearest-frequency search.
> 
[...]
> -	freq = strtoul(buf, NULL, POWER_CONVERT_TO_DECIMAL);
> +	errno = 0;
> +	freq = strtoul(buf, &endptr, POWER_CONVERT_TO_DECIMAL);
> +	if (errno != 0 || endptr == buf || freq == 0) {
> +		POWER_LOG(ERR, "Failed to parse frequency '%s' for lcore %u",
> +				buf, pi->lcore_id);
> +		goto err;
> +	}
>  
>  	/* convert the frequency to nearest 1000 value
>  	 * Ex: if freq=1396789 then freq_conv=1397000
>  	 * Ex: if freq=800030 then freq_conv=800000
>  	 */
> -	unsigned int freq_conv = 0;
> -	freq_conv = (freq + FREQ_ROUNDING_DELTA)
> -				/ ROUND_FREQ_TO_N_1000;
> +	freq_conv = (freq + FREQ_ROUNDING_DELTA) / ROUND_FREQ_TO_N_1000;
>  	freq_conv = freq_conv * ROUND_FREQ_TO_N_1000;
>  
> -	for (i = 0; i < pi->nb_freqs; i++) {
> -		if (freq_conv == pi->freqs[i]) {
> -			pi->curr_idx = i;
> -			pi->f = f;
> -			return 0;
> +	/* Find the nearest frequency in the table.
> +	 * With amd-pstate the CPU runs at continuously variable
> +	 * frequencies so the current frequency will not exactly
> +	 * match one of the synthesized frequency buckets.
> +	 */
> +	best_idx = 0;
> +	best_diff = abs_diff(freq_conv, pi->freqs[0]);
> +
> +	for (i = 1; i < pi->nb_freqs; i++) {
> +		diff = abs_diff(freq_conv, pi->freqs[i]);
> +		if (diff < best_diff) {
> +			best_diff = diff;
> +			best_idx = i;
>  		}
>  	}

GPT found this problem:

power_init_for_setting_freq() now assigns pi->curr_idx = best_idx
after finding the nearest synthesized frequency bucket.
However, set_freq_internal() skips the sysfs write
whenever idx == pi->curr_idx.

This means that if the current scaling_setspeed value is merely close
to a bucket but not equal to it, a later request to set that bucket
will return success without actually writing the requested frequency.
This can happen during init too: power_amd_pstate_cpufreq_init()
calls freq_max() after initialization, but if the current frequency
is nearest to the max bucket, freq_max() will be skipped even when
the actual sysfs value is not the synthesized max.
The nearest-bucket match should not be treated as an exact programmed
frequency, or the next explicit set to that bucket should be forced.



^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox