Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v2 4/9] net: atlantic: add AQC113 hardware register definitions and accessors
From: sukhdeeps @ 2026-05-08 12:01 UTC (permalink / raw)
  To: netdev
  Cc: irusskikh, epomozov, richardcochran, andrew+netdev, davem,
	edumazet, kuba, pabeni, vadim.fedorenko, linux-kernel,
	Sukhdeep Singh
In-Reply-To: <20260508120156.3060-1-sukhdeeps@marvell.com>

From: Sukhdeep Singh <sukhdeeps@marvell.com>

Add low-level hardware register definitions and accessor functions
for AQC113 (Antigua) chip features:

- L3/L4 filter command, tag, and address registers for IPv4/IPv6
- Ethertype filter tag registers
- TSG (Time Stamp Generator) clock control, modification, and
  GPIO event generation/input timestamp registers
- TX descriptor timestamp writeback, timestamp enable, and AVB
  enable registers
- TX data/descriptor read request limit registers
- TPB highest priority TC registers
- PCIe extended tag enable register
- RX descriptor timestamp request register
- Action resolver section enable getter
- GPIO special mode and TSG external GPIO TS input select

Signed-off-by: Sukhdeep Singh <sukhdeeps@marvell.com>
---
 .../aquantia/atlantic/hw_atl2/hw_atl2_llh.c   | 359 ++++++++++++++++++
 .../aquantia/atlantic/hw_atl2/hw_atl2_llh.h   | 107 +++++-
 .../atlantic/hw_atl2/hw_atl2_llh_internal.h   | 204 +++++++++-
 3 files changed, 663 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c
index cd954b11d24a..21fda387f60e 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c
@@ -7,6 +7,20 @@
 #include "hw_atl2_llh_internal.h"
 #include "aq_hw_utils.h"
 
+void hw_atl2_phi_ext_tag_set(struct aq_hw_s *aq_hw, u32 val)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_PHI_EXT_TAG_EN_ADR,
+			    HW_ATL2_PHI_EXT_TAG_EN_MSK,
+			    HW_ATL2_PHI_EXT_TAG_EN_SHIFT, val);
+}
+
+u32 hw_atl2_phi_ext_tag_get(struct aq_hw_s *aq_hw)
+{
+	return aq_hw_read_reg_bit(aq_hw, HW_ATL2_PHI_EXT_TAG_EN_ADR,
+				  HW_ATL2_PHI_EXT_TAG_EN_MSK,
+				  HW_ATL2_PHI_EXT_TAG_EN_SHIFT);
+}
+
 void hw_atl2_rpf_redirection_table2_select_set(struct aq_hw_s *aq_hw,
 					       u32 select)
 {
@@ -66,6 +80,278 @@ void hw_atl2_rpf_vlan_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter)
 			    tag);
 }
 
+void hw_atl2_rpf_etht_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_ET_TAG_ADR(filter),
+			    HW_ATL2_RPF_ET_TAG_MSK,
+			    HW_ATL2_RPF_ET_TAG_SHIFT, tag);
+}
+
+u32 hw_atl2_rpf_etht_flr_tag_get(struct aq_hw_s *aq_hw, u32 filter)
+{
+	return aq_hw_read_reg_bit(aq_hw, HW_ATL2_RPF_ET_TAG_ADR(filter),
+				  HW_ATL2_RPF_ET_TAG_MSK,
+				  HW_ATL2_RPF_ET_TAG_SHIFT);
+}
+
+void hw_atl2_rpf_l3_v4_dest_addr_set(struct aq_hw_s *aq_hw, u32 filter, u32 val)
+{
+	u32 addr_set = 6 + ((filter < 4) ? 0 : 1);
+	u32 dword = filter % 4;
+
+	aq_hw_write_reg(aq_hw, HW_ATL2_RPF_L3_DA_DW_ADR(addr_set, dword), val);
+}
+
+void hw_atl2_rpf_l3_v4_src_addr_set(struct aq_hw_s *aq_hw, u32 filter, u32 val)
+{
+	u32 addr_set = 6 + ((filter < 4) ? 0 : 1);
+	u32 dword = filter % 4;
+
+	aq_hw_write_reg(aq_hw, HW_ATL2_RPF_L3_SA_DW_ADR(addr_set, dword), val);
+}
+
+void hw_atl2_rpf_l3_v6_dest_addr_set(struct aq_hw_s *aq_hw, u8 location,
+				     u32 *ipv6_dst)
+{
+	int i;
+
+	for (i = 0; i < 4; ++i)
+		aq_hw_write_reg(aq_hw,
+				HW_ATL2_RPF_L3_DA_DW_ADR(location, 3 - i),
+				ipv6_dst[i]);
+}
+
+void hw_atl2_rpf_l3_v6_src_addr_set(struct aq_hw_s *aq_hw, u8 location,
+				    u32 *ipv6_src)
+{
+	int i;
+
+	for (i = 0; i < 4; ++i)
+		aq_hw_write_reg(aq_hw,
+				HW_ATL2_RPF_L3_SA_DW_ADR(location, 3 - i),
+				ipv6_src[i]);
+}
+
+void hw_atl2_rpf_l3_v4_cmd_set(struct aq_hw_s *aq_hw, u32 val, u32 filter)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_L3_V4_CMD_ADR(filter),
+			    HW_ATL2_RPF_L3_V4_CMD_MSK,
+			    HW_ATL2_RPF_L3_V4_CMD_SHIFT, val);
+}
+
+void hw_atl2_rpf_l3_v6_cmd_set(struct aq_hw_s *aq_hw, u32 val, u32 filter)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_L3_V6_CMD_ADR(filter),
+			    HW_ATL2_RPF_L3_V6_CMD_MSK,
+			    HW_ATL2_RPF_L3_V6_CMD_SHIFT, val);
+}
+
+void hw_atl2_rpf_l3_v6_v4_select_set(struct aq_hw_s *aq_hw, u32 val)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_L3_V6_V4_SELECT_ADR,
+			    HW_ATL2_RPF_L3_V6_V4_SELECT_MSK,
+			    HW_ATL2_RPF_L3_V6_V4_SELECT_SHIFT, val);
+}
+
+void hw_atl2_rpf_l3_v4_tag_set(struct aq_hw_s *aq_hw, u32 val, u32 filter)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_L3_V4_TAG_ADR(filter),
+			    HW_ATL2_RPF_L3_V4_TAG_MSK,
+			    HW_ATL2_RPF_L3_V4_TAG_SHIFT, val);
+}
+
+void hw_atl2_rpf_l3_v6_tag_set(struct aq_hw_s *aq_hw, u32 val, u32 filter)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_L3_V6_TAG_ADR(filter),
+			    HW_ATL2_RPF_L3_V6_TAG_MSK,
+			    HW_ATL2_RPF_L3_V6_TAG_SHIFT, val);
+}
+
+void hw_atl2_rpf_l4_tag_set(struct aq_hw_s *aq_hw, u32 val, u32 filter)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_L4_TAG_ADR(filter),
+			    HW_ATL2_RPF_L4_TAG_MSK,
+			    HW_ATL2_RPF_L4_TAG_SHIFT, val);
+}
+
+void hw_atl2_rpf_l4_cmd_set(struct aq_hw_s *aq_hw, u32 val, u32 filter)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_RPF_L4_CMD_ADR(filter),
+			    HW_ATL2_RPF_L4_CMD_MSK,
+			    HW_ATL2_RPF_L4_CMD_SHIFT, val);
+}
+
+/* tsg */
+static void hw_atl2_clock_modif_value_set(struct aq_hw_s *aq_hw,
+					  u32 clock_sel, u64 ns)
+{
+	aq_hw_write_reg64(aq_hw,
+			  HW_ATL2_TSG_REG_ADR(clock_sel, CLOCK_MODIF_VAL_LSW),
+			  ns);
+}
+
+void hw_atl2_tsg_clock_en(struct aq_hw_s *aq_hw,
+			  u32 clock_sel, u32 clock_enable)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TSG_REG_ADR(clock_sel, CLOCK_CFG),
+			    HW_ATL2_TSG_CLOCK_EN_MSK,
+			    HW_ATL2_TSG_CLOCK_EN_SHIFT,
+			    clock_enable);
+}
+
+void hw_atl2_tsg_clock_reset(struct aq_hw_s *aq_hw, u32 clock_sel)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TSG_REG_ADR(clock_sel, CLOCK_CFG),
+			    HW_ATL2_TSG_SYNC_RESET_MSK,
+			    HW_ATL2_TSG_SYNC_RESET_SHIFT, 1);
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TSG_REG_ADR(clock_sel, CLOCK_CFG),
+			    HW_ATL2_TSG_SYNC_RESET_MSK,
+			    HW_ATL2_TSG_SYNC_RESET_SHIFT, 0);
+}
+
+u64 hw_atl2_tsg_clock_read(struct aq_hw_s *aq_hw, u32 clock_sel)
+{
+	return aq_hw_read_reg64(aq_hw,
+				HW_ATL2_TSG_REG_ADR(clock_sel,
+						    READ_CUR_NS_LSW));
+}
+
+void hw_atl2_tsg_clock_add(struct aq_hw_s *aq_hw, u32 clock_sel, u64 ns)
+{
+	hw_atl2_clock_modif_value_set(aq_hw, clock_sel, ns);
+	aq_hw_write_reg(aq_hw,
+			HW_ATL2_TSG_REG_ADR(clock_sel, CLOCK_MODIF_CTRL),
+			HW_ATL2_TSG_ADD_COUNTER_MSK);
+}
+
+void hw_atl2_tsg_clock_sub(struct aq_hw_s *aq_hw, u32 clock_sel, u64 ns)
+{
+	hw_atl2_clock_modif_value_set(aq_hw, clock_sel, ns);
+	aq_hw_write_reg(aq_hw,
+			HW_ATL2_TSG_REG_ADR(clock_sel, CLOCK_MODIF_CTRL),
+			HW_ATL2_TSG_SUBTRACT_COUNTER_MSK);
+}
+
+void hw_atl2_tsg_clock_increment_set(struct aq_hw_s *aq_hw,
+				     u32 clock_sel, u32 ns, u32 fns)
+{
+	u32 nsfns = (ns & 0xff) | (fns & 0xffffff00);
+
+	aq_hw_write_reg(aq_hw,
+			HW_ATL2_TSG_REG_ADR(clock_sel, CLOCK_INC_CFG),
+			nsfns);
+	aq_hw_write_reg(aq_hw,
+			HW_ATL2_TSG_REG_ADR(clock_sel, CLOCK_MODIF_CTRL),
+			HW_ATL2_TSG_LOAD_INC_CFG_MSK);
+}
+
+void hw_atl2_tsg_ext_isr_to_host_set(struct aq_hw_s *aq_hw, int on)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_GLB_CONTROL_2_ADR,
+			    HW_ATL2_MIF_INTERRUPT_2_TO_ITR_MSK,
+			    HW_ATL2_MIF_INTERRUPT_TO_ITR_SHIFT + 2,
+			    !!on);
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_GLB_CONTROL_2_ADR,
+			    HW_ATL2_EN_INTERRUPT_MIF2_TO_ITR_MSK,
+			    HW_ATL2_EN_INTERRUPT_TO_ITR_SHIFT + 2,
+			    !!on);
+}
+
+void hw_atl2_tpb_tps_highest_priority_tc_enable_set(struct aq_hw_s *aq_hw,
+						    u32 tps_highest_prio_tc_en)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPB_HIGHEST_PRIO_TC_EN_ADR,
+			    HW_ATL2_TPB_HIGHEST_PRIO_TC_EN_MSK,
+			    HW_ATL2_TPB_HIGHEST_PRIO_TC_EN_SHIFT,
+			    tps_highest_prio_tc_en);
+}
+
+void hw_atl2_tpb_tps_highest_priority_tc_set(struct aq_hw_s *aq_hw,
+					     u32 tps_highest_prio_tc)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TPB_HIGHEST_PRIO_TC_ADR,
+			    HW_ATL2_TPB_HIGHEST_PRIO_TC_MSK,
+			    HW_ATL2_TPB_HIGHEST_PRIO_TC_SHIFT,
+			    tps_highest_prio_tc);
+}
+
+void hw_atl2_tsg_gpio_isr_to_host_set(struct aq_hw_s *aq_hw,
+				      int on, u32 clock_sel)
+{
+	aq_hw_write_reg_bit(aq_hw,
+			    HW_ATL2_GLOBAL_HIGH_PRIO_INTERRUPT_1_MASK_ADR,
+		clock_sel == 1 ? HW_ATL2_TSG_TSG1_GPIO_INTERRUPT_MSK :
+			HW_ATL2_TSG_TSG0_GPIO_INTERRUPT_MSK,
+		clock_sel == 1 ? HW_ATL2_TSG_TSG1_GPIO_INTERRUPT_SHIFT :
+			HW_ATL2_TSG_TSG0_GPIO_INTERRUPT_SHIFT,
+		!!on);
+}
+
+void hw_atl2_tsg_gpio_clear_status(struct aq_hw_s *aq_hw)
+{
+	aq_hw_read_reg(aq_hw, HW_ATL2_GLOBAL_INTERNAL_ALARMS_1_ADR);
+}
+
+void hw_atl2_tsg_gpio_input_event_info_get(struct aq_hw_s *aq_hw,
+					   u32 clock_sel,
+					   u32 *event_count,
+					   u64 *event_ts)
+{
+	if (event_count)
+		*event_count = aq_hw_read_reg(aq_hw,
+					      HW_ATL2_TSG_REG_ADR(clock_sel,
+								  EXT_CLK_COUNT));
+
+	if (event_ts)
+		*event_ts = aq_hw_read_reg64(aq_hw,
+					     HW_ATL2_TSG_REG_ADR(clock_sel,
+								 GPIO_EVENT_TS_LSW));
+}
+
+void hw_atl2_tsg_ptp_gpio_gen_pulse(struct aq_hw_s *aq_hw, u32 clk_sel,
+				    u64 ts, u32 period, u32 hightime)
+{
+	u32 val = (HW_ATL2_TSG_GPIO_EVENT_MODE_SET_ON_TIME <<
+		   (HW_ATL2_TSG_GPIO_EVENT_MODE_SHIFT -
+		    HW_ATL2_TSG_GPIO_OUTPUT_EN_SHIFT)) |
+		  (HW_ATL2_TSG_GPIO_GEN_OUTPUT_EN_MSK) |
+		  (HW_ATL2_TSG_GPIO_OUTPUT_EN_MSK);
+
+	if (ts != 0) {
+		aq_hw_write_reg64(aq_hw,
+				  HW_ATL2_TSG_REG_ADR(clk_sel,
+						      GPIO_EVENT_GEN_TS_LSW),
+				  ts);
+
+		aq_hw_write_reg64(aq_hw,
+				  HW_ATL2_TSG_REG_ADR(clk_sel,
+						      GPIO_EVENT_HIGH_TIME_LSW),
+				  hightime);
+
+		aq_hw_write_reg64(aq_hw,
+				  HW_ATL2_TSG_REG_ADR(clk_sel,
+						      GPIO_EVENT_LOW_TIME_LSW),
+				  (period - hightime));
+	}
+
+	aq_hw_write_reg_bit(aq_hw,
+			    HW_ATL2_TSG_REG_ADR(clk_sel, GPIO_EVENT_GEN_CFG),
+			    HW_ATL2_TSG_GPIO_EVENT_MODE_MSK |
+				HW_ATL2_TSG_GPIO_OUTPUT_EN_MSK |
+				HW_ATL2_TSG_GPIO_GEN_OUTPUT_EN_MSK,
+			   HW_ATL2_TSG_GPIO_OUTPUT_EN_SHIFT,
+			   (!ts ? 0 : val));
+}
+
+void hw_atl2_rpf_rx_desc_timestamp_req_set(struct aq_hw_s *aq_hw, u32 request,
+					   u32 descriptor)
+{
+	aq_hw_write_reg_bit(aq_hw,
+			    HW_ATL2_RPF_TIMESTAMP_REQ_DESCD_ADR(descriptor),
+			    HW_ATL2_RPF_TIMESTAMP_REQ_DESCD_MSK,
+			    HW_ATL2_RPF_TIMESTAMP_REQ_DESCD_SHIFT, request);
+}
+
 /* TX */
 
 void hw_atl2_tpb_tx_tc_q_rand_map_en_set(struct aq_hw_s *aq_hw,
@@ -93,6 +379,30 @@ void hw_atl2_reg_tx_intr_moder_ctrl_set(struct aq_hw_s *aq_hw,
 			tx_intr_moderation_ctl);
 }
 
+void hw_atl2_tdm_tx_desc_timestamp_writeback_en_set(struct aq_hw_s *aq_hw,
+						    u32 enable, u32 descriptor)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TDM_DESCD_TS_WRB_EN_ADR(descriptor),
+			    HW_ATL2_TDM_DESCD_TS_WRB_EN_MSK,
+			    HW_ATL2_TDM_DESCD_TS_WRB_EN_SHIFT, enable);
+}
+
+void hw_atl2_tdm_tx_desc_timestamp_en_set(struct aq_hw_s *aq_hw, u32 enable,
+					  u32 descriptor)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TDM_DESCD_TS_EN_ADR(descriptor),
+			    HW_ATL2_TDM_DESCD_TS_EN_MSK,
+			    HW_ATL2_TDM_DESCD_TS_EN_SHIFT, enable);
+}
+
+void hw_atl2_tdm_tx_desc_avb_en_set(struct aq_hw_s *aq_hw, u32 enable,
+				    u32 descriptor)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TDM_DESCD_AVB_EN_ADR(descriptor),
+			    HW_ATL2_TDM_DESCD_AVB_EN_MSK,
+			    HW_ATL2_TDM_DESCD_AVB_EN_SHIFT, enable);
+}
+
 void hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(struct aq_hw_s *aq_hw,
 					       const u32 data_arb_mode)
 {
@@ -122,6 +432,20 @@ void hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
 			    weight);
 }
 
+void hw_atl2_tdm_tx_data_read_req_limit_set(struct aq_hw_s *aq_hw, u32 limit)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TDM_TX_DATA_RD_REQ_LIMIT_ADR,
+			    HW_ATL2_TDM_TX_DATA_RD_REQ_LIMIT_MSK,
+			    HW_ATL2_TDM_TX_DATA_RD_REQ_LIMIT_SHIFT, limit);
+}
+
+void hw_atl2_tdm_tx_desc_read_req_limit_set(struct aq_hw_s *aq_hw, u32 limit)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TDM_TX_DESC_RD_REQ_LIMIT_ADR,
+			    HW_ATL2_TDM_TX_DESC_RD_REQ_LIMIT_MSK,
+			    HW_ATL2_TDM_TX_DESC_RD_REQ_LIMIT_SHIFT, limit);
+}
+
 u32 hw_atl2_get_hw_version(struct aq_hw_s *aq_hw)
 {
 	return aq_hw_read_reg(aq_hw, HW_ATL2_FPGA_VER_ADR);
@@ -164,6 +488,13 @@ void hw_atl2_rpf_act_rslvr_section_en_set(struct aq_hw_s *aq_hw, u32 sections)
 			    sections);
 }
 
+u32 hw_atl2_rpf_act_rslvr_section_en_get(struct aq_hw_s *aq_hw)
+{
+	return aq_hw_read_reg_bit(aq_hw, HW_ATL2_RPF_REC_TAB_EN_ADR,
+				  HW_ATL2_RPF_REC_TAB_EN_MSK,
+				  HW_ATL2_RPF_REC_TAB_EN_SHIFT);
+}
+
 void hw_atl2_mif_shared_buf_get(struct aq_hw_s *aq_hw, int offset, u32 *data,
 				int len)
 {
@@ -232,3 +563,31 @@ void hw_atl2_mif_host_req_int_clr(struct aq_hw_s *aq_hw, u32 val)
 	return aq_hw_write_reg(aq_hw, HW_ATL2_MCP_HOST_REQ_INT_CLR_ADR,
 			       val);
 }
+
+void hw_atl2_tsg1_ext_gpio_ts_input_select_set(struct aq_hw_s *aq_hw,
+					       u32 tsg_gpio_ts_select)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TSG1_EXT_GPIO_TS_INPUT_SEL_ADR,
+			    HW_ATL2_TSG1_EXT_GPIO_TS_INPUT_SEL_MSK,
+			    HW_ATL2_TSG1_EXT_GPIO_TS_INPUT_SEL_SHIFT,
+			    tsg_gpio_ts_select);
+}
+
+void hw_atl2_tsg0_ext_gpio_ts_input_select_set(struct aq_hw_s *aq_hw,
+					       u32 gpio_ts_in_select)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_TSG0_EXT_GPIO_TS_INPUT_SEL_ADR,
+			    HW_ATL2_TSG0_EXT_GPIO_TS_INPUT_SEL_MSK,
+			    HW_ATL2_TSG0_EXT_GPIO_TS_INPUT_SEL_SHIFT,
+			    gpio_ts_in_select);
+}
+
+void hw_atl2_gpio_special_mode_set(struct aq_hw_s *aq_hw,
+				   u32 gpio_special_mode,
+				   u32 pin)
+{
+	aq_hw_write_reg_bit(aq_hw, HW_ATL2_GPIO_PIN_SPEC_MODE_ADR(pin),
+			    HW_ATL2_GPIO_PIN_SPEC_MODE_MSK,
+			    HW_ATL2_GPIO_PIN_SPEC_MODE_SHIFT,
+			    gpio_special_mode);
+}
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h
index 98c7a4621297..01aaf701b201 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h
@@ -10,6 +10,11 @@
 
 struct aq_hw_s;
 
+/* Set the PCIe extended tag enable bit (use of tags 32-255). */
+void hw_atl2_phi_ext_tag_set(struct aq_hw_s *aq_hw, u32 val);
+/* Get the PCIe extended tag enable bit (use of tags 32-255). */
+u32 hw_atl2_phi_ext_tag_get(struct aq_hw_s *aq_hw);
+
 /* Set TX Interrupt Moderation Control Register */
 void hw_atl2_reg_tx_intr_moder_ctrl_set(struct aq_hw_s *aq_hw,
 					u32 tx_intr_moderation_ctl,
@@ -19,7 +24,7 @@ void hw_atl2_reg_tx_intr_moder_ctrl_set(struct aq_hw_s *aq_hw,
 void hw_atl2_rpf_redirection_table2_select_set(struct aq_hw_s *aq_hw,
 					       u32 select);
 
-/** Set RSS HASH type */
+/* Set RSS HASH type */
 void hw_atl2_rpf_rss_hash_type_set(struct aq_hw_s *aq_hw, u32 rss_hash_type);
 
 /* set new RPF enable */
@@ -37,14 +42,92 @@ void hw_atl2_new_rpf_rss_redir_set(struct aq_hw_s *aq_hw, u32 tc, u32 index,
 
 /* Set VLAN filter tag */
 void hw_atl2_rpf_vlan_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter);
+/* set ethertype filter tag */
+void hw_atl2_rpf_etht_flr_tag_set(struct aq_hw_s *aq_hw, u32 tag, u32 filter);
+
+/* get ethertype filter tag */
+u32 hw_atl2_rpf_etht_flr_tag_get(struct aq_hw_s *aq_hw, u32 filter);
+
+/* set L3 v4 dest address */
+void hw_atl2_rpf_l3_v4_dest_addr_set(struct aq_hw_s *aq_hw,
+				     u32 filter, u32 val);
+
+/* set L3 v4 src address */
+void hw_atl2_rpf_l3_v4_src_addr_set(struct aq_hw_s *aq_hw, u32 filter, u32 val);
+
+/* set L3 v4 cmd */
+void hw_atl2_rpf_l3_v4_cmd_set(struct aq_hw_s *aq_hw, u32 val, u32 filter);
+
+/* set L3 v6 cmd */
+void hw_atl2_rpf_l3_v6_cmd_set(struct aq_hw_s *aq_hw, u32 val, u32 filter);
+
+/* set L3 v6 dest address */
+void hw_atl2_rpf_l3_v6_dest_addr_set(struct aq_hw_s *aq_hw, u8 location,
+				     u32 *ipv6_dst);
+
+/* set L3 v6 src address */
+void hw_atl2_rpf_l3_v6_src_addr_set(struct aq_hw_s *aq_hw, u8 location,
+				    u32 *ipv6_src);
+
+/* set L3 v6 v4 select */
+void hw_atl2_rpf_l3_v6_v4_select_set(struct aq_hw_s *aq_hw, u32 val);
+
+/* set L3 v4 tag */
+void hw_atl2_rpf_l3_v4_tag_set(struct aq_hw_s *aq_hw, u32 val, u32 filter);
+
+/* set L3 v6 tag */
+void hw_atl2_rpf_l3_v6_tag_set(struct aq_hw_s *aq_hw, u32 val, u32 filter);
+
+/* set L4 cmd */
+void hw_atl2_rpf_l4_cmd_set(struct aq_hw_s *aq_hw, u32 val, u32 filter);
+
+/* set L4 tag */
+void hw_atl2_rpf_l4_tag_set(struct aq_hw_s *aq_hw, u32 val, u32 filter);
 
 /* set tx random TC-queue mapping enable bit */
 void hw_atl2_tpb_tx_tc_q_rand_map_en_set(struct aq_hw_s *aq_hw,
 					 const u32 tc_q_rand_map_en);
 
+void hw_atl2_tpb_tps_highest_priority_tc_enable_set(struct aq_hw_s *aq_hw,
+						    u32 tps_highest_prio_tc_en);
+
+void hw_atl2_tpb_tps_highest_priority_tc_set(struct aq_hw_s *aq_hw,
+					     u32 tps_highest_prio_tc);
+
 /* set tx buffer clock gate enable */
 void hw_atl2_tpb_tx_buf_clk_gate_en_set(struct aq_hw_s *aq_hw, u32 clk_gate_en);
 
+/* tsg */
+
+void hw_atl2_tsg_clock_en(struct aq_hw_s *aq_hw, u32 clock_sel,
+			  u32 clock_enable);
+
+void hw_atl2_tsg_clock_reset(struct aq_hw_s *aq_hw, u32 clock_sel);
+u64 hw_atl2_tsg_clock_read(struct aq_hw_s *aq_hw, u32 clock_sel);
+void hw_atl2_tsg_clock_add(struct aq_hw_s *aq_hw, u32 clock_sel,
+			   u64 ns);
+void hw_atl2_tsg_clock_sub(struct aq_hw_s *aq_hw, u32 clock_sel,
+			   u64 ns);
+void hw_atl2_tsg_clock_increment_set(struct aq_hw_s *aq_hw, u32 clock_sel,
+				     u32 ns, u32 fns);
+void hw_atl2_tsg_gpio_isr_to_host_set(struct aq_hw_s *aq_hw, int on,
+				      u32 clock_sel);
+void hw_atl2_tsg_ext_isr_to_host_set(struct aq_hw_s *aq_hw, int on);
+void hw_atl2_tsg_gpio_clear_status(struct aq_hw_s *aq_hw);
+void hw_atl2_tsg_gpio_input_event_info_get(struct aq_hw_s *aq_hw,
+					   u32 clock_sel,
+					   u32 *event_count,
+					   u64 *event_ts);
+/* Set Rx Descriptor{D} Timestamp request */
+void hw_atl2_rpf_rx_desc_timestamp_req_set(struct aq_hw_s *aq_hw, u32 request,
+					   u32 descriptor);
+/* Set Tx Descriptor Timestamp writeback Enable */
+void hw_atl2_tdm_tx_desc_timestamp_writeback_en_set(struct aq_hw_s *aq_hw,
+						    u32 enable,
+						    u32 descriptor);
+/* Set Tx Descriptor Timestamp enable */
+void hw_atl2_tdm_tx_desc_timestamp_en_set(struct aq_hw_s *aq_hw, u32 enable,
+					  u32 descriptor);
 void hw_atl2_tps_tx_pkt_shed_data_arb_mode_set(struct aq_hw_s *aq_hw,
 					       const u32 data_arb_mode);
 
@@ -57,6 +140,15 @@ void hw_atl2_tps_tx_pkt_shed_tc_data_max_credit_set(struct aq_hw_s *aq_hw,
 void hw_atl2_tps_tx_pkt_shed_tc_data_weight_set(struct aq_hw_s *aq_hw,
 						const u32 tc,
 						const u32 weight);
+/* Set Tx Descriptor AVB enable */
+void hw_atl2_tdm_tx_desc_avb_en_set(struct aq_hw_s *aq_hw, u32 enable,
+				    u32 descriptor);
+void hw_atl2_tsg_ptp_gpio_gen_pulse(struct aq_hw_s *aq_hw, u32 clk_sel,
+				    u64 ts, u32 period, u32 hightime);
+
+void hw_atl2_tdm_tx_data_read_req_limit_set(struct aq_hw_s *aq_hw, u32 limit);
+
+void hw_atl2_tdm_tx_desc_read_req_limit_set(struct aq_hw_s *aq_hw, u32 limit);
 
 u32 hw_atl2_get_hw_version(struct aq_hw_s *aq_hw);
 
@@ -69,6 +161,9 @@ void hw_atl2_rpf_act_rslvr_record_set(struct aq_hw_s *aq_hw, u8 location,
 /* set enable action resolver section */
 void hw_atl2_rpf_act_rslvr_section_en_set(struct aq_hw_s *aq_hw, u32 sections);
 
+/* get enable action resolver section */
+u32 hw_atl2_rpf_act_rslvr_section_en_get(struct aq_hw_s *aq_hw);
+
 /* get data from firmware shared input buffer */
 void hw_atl2_mif_shared_buf_get(struct aq_hw_s *aq_hw, int offset, u32 *data,
 				int len);
@@ -98,5 +193,13 @@ u32 hw_atl2_mif_host_req_int_get(struct aq_hw_s *aq_hw);
 
 /* clear host interrupt request */
 void hw_atl2_mif_host_req_int_clr(struct aq_hw_s *aq_hw, u32 val);
-
+/* Set TSG EXT GPIO TS Input select */
+void hw_atl2_tsg1_ext_gpio_ts_input_select_set(struct aq_hw_s *aq_hw,
+					       u32 tsg_gpio_ts_select);
+/* Set PTP EXT GPIO TS Input select */
+void hw_atl2_tsg0_ext_gpio_ts_input_select_set(struct aq_hw_s *aq_hw,
+					       u32 gpio_ts_in_select);
+/* Set GPIO Special Mode */
+void hw_atl2_gpio_special_mode_set(struct aq_hw_s *aq_hw,
+				   u32 gpio_special_mode, u32 pin);
 #endif /* HW_ATL2_LLH_H */
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h
index e34c5cda061e..9b9be3ef1332 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h
@@ -5,6 +5,11 @@
 
 #ifndef HW_ATL2_LLH_INTERNAL_H
 #define HW_ATL2_LLH_INTERNAL_H
+/* RX timestamp_req_desc{D} [1:0] Bitfield Definitions
+ */
+#define HW_ATL2_RPF_TIMESTAMP_REQ_DESCD_ADR(descr) (0x00005B08 + (descr) * 0x20)
+#define HW_ATL2_RPF_TIMESTAMP_REQ_DESCD_MSK 0x00030000
+#define HW_ATL2_RPF_TIMESTAMP_REQ_DESCD_SHIFT 16
 
 /* RX pif_rpf_redir_2_en_i Bitfield Definitions
  * PORT="pif_rpf_redir_2_en_i"
@@ -114,7 +119,68 @@
 #define HW_ATL2_RPF_VL_TAG_WIDTH 4
 /* default value of bitfield vlan_req_tag0{f}[3:0] */
 #define HW_ATL2_RPF_VL_TAG_DEFAULT 0x0
-
+/* register address for bitfield etype_req_tag0{f}[2:0] */
+#define HW_ATL2_RPF_ET_TAG_ADR(filter) (0x00005340 + (filter) * 0x4)
+/* bitmask for bitfield etype_req_tag0{f}[2:0] */
+#define HW_ATL2_RPF_ET_TAG_MSK 0x00000007
+/* lower bit position of bitfield etype_req_tag0{f}[2:0] */
+#define HW_ATL2_RPF_ET_TAG_SHIFT 0
+/* Lower bit position of bitfield l3_l4_act{F}[2:0] */
+#define HW_ATL2_RPF_L3_L4_ACTF_SHIFT 16
+/* Bitmask for bitfield l3_l4_rxq{F}[4:0] */
+#define HW_ATL2_RPF_L3_L4_RXQF_MSK 0x00001F00u
+/* Lower bit position of bitfield l3_l4_rxq{F}[4:0] */
+#define HW_ATL2_RPF_L3_L4_RXQF_SHIFT 8
+/* Register address for bitfield rpf_l3_v6_sa{F}_dw{D}[1F:0] */
+#define HW_ATL2_RPF_L3_SA_DW_ADR(filter, dword) \
+	(0x00006400u + (filter) * 0x10 + (dword) * 0x4)
+
+/* Register address for bitfield rpf_l3_v6_da{F}_dw{D}[1F:0] */
+#define HW_ATL2_RPF_L3_DA_DW_ADR(filter, dword) \
+	(0x00006480u + (filter) * 0x10 + (dword) * 0x4)
+
+/* Register address for bitfield rpf_l3_cmd{F}[1F:0] */
+#define HW_ATL2_RPF_L3_V4_CMD_ADR(filter) (0x00006500u + (filter) * 0x4)
+/* Bitmask for bitfield rpf_l3_cmd{F}[F:0] */
+#define HW_ATL2_RPF_L3_V4_CMD_MSK 0x0000FFFFu
+/* Lower bit position of bitfield rpf_l3_cmd{F}[1F:0] */
+#define HW_ATL2_RPF_L3_V4_CMD_SHIFT 0
+/* Register address for bitfield rpf_l3_v6_cmd{F}[1F:0] */
+#define HW_ATL2_RPF_L3_V6_CMD_ADR(filter) (0x00006500u + (filter) * 0x4)
+/* Bitmask for rpf_l3_v6_cmd{F}: register bits 31:24 and 22:16 */
+#define HW_ATL2_RPF_L3_V6_CMD_MSK 0xFF7F0000u
+/* Lower bit position of bitfield rpf_l3_v6_cmd{F}[1F:0] */
+#define HW_ATL2_RPF_L3_V6_CMD_SHIFT 0
+/* Register address for bitfield pif_rpf_l3_v6_v4_select */
+#define HW_ATL2_RPF_L3_V6_V4_SELECT_ADR 0x00006500u
+/* Bitmask for bitfield pif_rpf_l3_v6_v4_select */
+#define HW_ATL2_RPF_L3_V6_V4_SELECT_MSK 0x00800000u
+/* Lower bit position of bitfield pif_rpf_l3_v6_v4_select */
+#define HW_ATL2_RPF_L3_V6_V4_SELECT_SHIFT 23
+/* Register address for bitfield rpf_l3_v4_req_tag{F}[2:0] */
+#define HW_ATL2_RPF_L3_V4_TAG_ADR(filter) (0x00006500u + (filter) * 0x4)
+/* Bitmask for bitfield rpf_l3_v4_req_tag{F}[2:0] */
+#define HW_ATL2_RPF_L3_V4_TAG_MSK 0x00000070u
+/* Lower bit position of bitfield rpf_l3_v4_req_tag{F}[2:0] */
+#define HW_ATL2_RPF_L3_V4_TAG_SHIFT 4
+/* Register address for bitfield rpf_l3_v6_req_tag{F}[2:0] */
+#define HW_ATL2_RPF_L3_V6_TAG_ADR(filter) (0x00006500u + (filter) * 0x4)
+/* Bitmask for bitfield rpf_l3_v6_req_tag{F}[2:0] */
+#define HW_ATL2_RPF_L3_V6_TAG_MSK 0x00700000
+/* Lower bit position of bitfield rpf_l3_v6_req_tag{F}[2:0] */
+#define HW_ATL2_RPF_L3_V6_TAG_SHIFT 20
+/* Register address for bitfield rpf_l4_cmd{F}[2:0] */
+#define HW_ATL2_RPF_L4_CMD_ADR(filter) (0x00006520u + (filter) * 0x4)
+/* Bitmask for bitfield rpf_l4_cmd{F}[2:0] */
+#define HW_ATL2_RPF_L4_CMD_MSK 0x00000007u
+/* Lower bit position of bitfield rpf_l4_cmd{F}[2:0] */
+#define HW_ATL2_RPF_L4_CMD_SHIFT 0
+/* Register address for bitfield rpf_l4_tag{F}[2:0] */
+#define HW_ATL2_RPF_L4_TAG_ADR(filter) (0x00006520u + (filter) * 0x4)
+/* Bitmask for bitfield rpf_l4_tag{F}[2:0] */
+#define HW_ATL2_RPF_L4_TAG_MSK 0x00000070u
+/* Lower bit position of bitfield rpf_l4_tag{F}[2:0] */
+#define HW_ATL2_RPF_L4_TAG_SHIFT 4
 /* RX rx_q{Q}_tc_map[2:0] Bitfield Definitions
  * Preprocessor definitions for the bitfield "rx_q{Q}_tc_map[2:0]".
  * Parameter: Queue {Q} | bit-level stride | range [0, 31]
@@ -131,7 +197,24 @@
 #define HW_ATL2_RX_Q_TC_MAP_WIDTH 3
 /* Default value of bitfield rx_q{Q}_tc_map[2:0] */
 #define HW_ATL2_RX_Q_TC_MAP_DEFAULT 0x0
-
+/* TX desc{D}_ts_wrb_en Bitfield Definitions
+ */
+#define HW_ATL2_TDM_DESCD_TS_WRB_EN_ADR(descriptor) \
+	(0x00007C08 + (descriptor) * 0x40)
+#define HW_ATL2_TDM_DESCD_TS_WRB_EN_MSK 0x00040000
+#define HW_ATL2_TDM_DESCD_TS_WRB_EN_SHIFT 18
+/* TX desc{D}_ts_en Bitfield Definitions
+ */
+#define HW_ATL2_TDM_DESCD_TS_EN_ADR(descriptor) \
+	(0x00007C08 + (descriptor) * 0x40)
+#define HW_ATL2_TDM_DESCD_TS_EN_MSK 0x00020000
+#define HW_ATL2_TDM_DESCD_TS_EN_SHIFT 17
+/* TX desc{D}_avb_en Bitfield Definitions
+ */
+#define HW_ATL2_TDM_DESCD_AVB_EN_ADR(descriptor) \
+	(0x00007C08 + (descriptor) * 0x40)
+#define HW_ATL2_TDM_DESCD_AVB_EN_MSK 0x00010000
+#define HW_ATL2_TDM_DESCD_AVB_EN_SHIFT 16
 /* tx tx_tc_q_rand_map_en bitfield definitions
  * preprocessor definitions for the bitfield "tx_tc_q_rand_map_en".
  * port="pif_tpb_tx_tc_q_rand_map_en_i"
@@ -221,7 +304,18 @@
 #define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_WIDTH 16
 /* default value of bitfield data_tc{t}_credit_max[f:0] */
 #define HW_ATL2_TPS_DATA_TCTCREDIT_MAX_DEFAULT 0x0
-
+/* register address for bitfield pif_tpb_highest_prio_tc_en */
+#define HW_ATL2_TPB_HIGHEST_PRIO_TC_EN_ADR 0x00007180
+/* bitmask for bitfield pif_tpb_highest_prio_tc_en */
+#define HW_ATL2_TPB_HIGHEST_PRIO_TC_EN_MSK 0x00000100
+/* lower bit position of bitfield pif_tpb_highest_prio_tc_en */
+#define HW_ATL2_TPB_HIGHEST_PRIO_TC_EN_SHIFT 8
+/* register address for bitfield pif_tpb_highest_prio_tc */
+#define HW_ATL2_TPB_HIGHEST_PRIO_TC_ADR 0x00007180
+/* bitmask for bitfield pif_tpb_highest_prio_tc */
+#define HW_ATL2_TPB_HIGHEST_PRIO_TC_MSK 0x00000007
+/* lower bit position of bitfield pif_tpb_highest_prio_tc */
+#define HW_ATL2_TPB_HIGHEST_PRIO_TC_SHIFT 0
 /* tx data_tc{t}_weight[e:0] bitfield definitions
  * preprocessor definitions for the bitfield "data_tc{t}_weight[e:0]".
  * parameter: tc {t} | stride size 0x4 | range [0, 7]
@@ -248,7 +342,87 @@
  */
 
 #define HW_ATL2_TX_INTR_MODERATION_CTL_ADR(queue) (0x00007c28u + (queue) * 0x40)
-
+/* TX tx_data_rd_req_limit[7:0] Bitfield Definitions
+ */
+#define HW_ATL2_TDM_TX_DATA_RD_REQ_LIMIT_ADR 0x00007B04
+#define HW_ATL2_TDM_TX_DATA_RD_REQ_LIMIT_MSK 0x0000FF00
+#define HW_ATL2_TDM_TX_DATA_RD_REQ_LIMIT_SHIFT 8
+/* TX tx_desc_rd_req_limit[4:0] Bitfield Definitions
+ */
+#define HW_ATL2_TDM_TX_DESC_RD_REQ_LIMIT_ADR 0x00007B04
+#define HW_ATL2_TDM_TX_DESC_RD_REQ_LIMIT_MSK 0x0000001F
+#define HW_ATL2_TDM_TX_DESC_RD_REQ_LIMIT_SHIFT 0
+/* register address for bitfield uP Force Interrupt */
+#define HW_ATL2_GLB_CONTROL_2_ADR 0x00000404
+#define HW_ATL2_MIF_INTERRUPT_2_TO_ITR_MSK 0x00000100
+/* lower bit position of bitfield MIF Interrupt to ITR */
+#define HW_ATL2_MIF_INTERRUPT_TO_ITR_SHIFT 6
+#define HW_ATL2_EN_INTERRUPT_MIF2_TO_ITR_MSK 0x00001000
+/* lower bit position of bitfield Enable MIF Interrupt to ITR */
+#define HW_ATL2_EN_INTERRUPT_TO_ITR_SHIFT 0xA
+#define HW_ATL2_GLOBAL_INTERNAL_ALARMS_1_ADR 0x00000924
+#define HW_ATL2_GLOBAL_HIGH_PRIO_INTERRUPT_1_MASK_ADR 0x00000964
+/* bitmask for bitfield TSG PTM GPIO interrupt */
+#define HW_ATL2_TSG_TSG1_GPIO_INTERRUPT_MSK 0x00000200
+/* lower bit position of bitfield TSG PTM GPIO interrupt */
+#define HW_ATL2_TSG_TSG1_GPIO_INTERRUPT_SHIFT 9
+/* bitmask for bitfield TSG0 GPIO interrupt */
+#define HW_ATL2_TSG_TSG0_GPIO_INTERRUPT_MSK 0x00000020
+/* lower bit position of bitfield TSG0 GPIO interrupt */
+#define HW_ATL2_TSG_TSG0_GPIO_INTERRUPT_SHIFT 5
+/* TSG registers */
+#define HW_ATL2_TSG_REG_ADR(clk, reg_name) \
+	((clk) == 0 ? HW_ATL2_CLK0_##reg_name##_ADR :\
+		 HW_ATL2_CLK1_##reg_name##_ADR)
+
+#define HW_ATL2_CLK0_CLOCK_CFG_ADR 0x00000CA0u
+#define HW_ATL2_CLK1_CLOCK_CFG_ADR 0x00000D50u
+#define HW_ATL2_TSG_SYNC_RESET_MSK 0x00000001
+#define HW_ATL2_TSG_SYNC_RESET_SHIFT 0x00000000
+#define HW_ATL2_TSG_CLOCK_EN_MSK 0x00000002
+#define HW_ATL2_TSG_CLOCK_EN_SHIFT 0x00000001
+#define HW_ATL2_CLK0_CLOCK_MODIF_CTRL_ADR 0x00000CA4u
+#define HW_ATL2_CLK1_CLOCK_MODIF_CTRL_ADR 0x00000D54u
+#define HW_ATL2_TSG_SUBTRACT_COUNTER_MSK 0x00000002
+#define HW_ATL2_TSG_ADD_COUNTER_MSK 0x00000004
+#define HW_ATL2_TSG_LOAD_INC_CFG_MSK 0x00000008
+#define HW_ATL2_CLK0_CLOCK_MODIF_VAL_LSW_ADR 0x00000CA8u
+#define HW_ATL2_CLK1_CLOCK_MODIF_VAL_LSW_ADR 0x00000D58u
+#define HW_ATL2_CLK0_CLOCK_INC_CFG_ADR 0x00000CB0u
+#define HW_ATL2_CLK1_CLOCK_INC_CFG_ADR 0x00000D60u
+#define HW_ATL2_CLK0_READ_CUR_NS_LSW_ADR 0x00000CB8u
+#define HW_ATL2_CLK1_READ_CUR_NS_LSW_ADR 0x00000D68u
+
+#define HW_ATL2_CLK0_GPIO_CFG_ADR 0x00000CC4u
+#define HW_ATL2_CLK1_GPIO_CFG_ADR 0x00000D74u
+#define HW_ATL2_TSG_GPIO_IN_MONITOR_EN_SHIFT 0x00000000
+#define HW_ATL2_TSG_GPIO_IN_MONITOR_EN_MSK 0x00000001
+#define HW_ATL2_TSG_GPIO_IN_MODE_SHIFT 0x00000001
+#define HW_ATL2_TSG_GPIO_IN_MODE_MSK 0x00000006
+#define HW_ATL2_TSG_GPIO_IN_MODE_POSEDGE 0x00000000
+#define HW_ATL2_CLK0_EXT_CLK_COUNT_ADR 0x00000CCCu
+#define HW_ATL2_CLK1_EXT_CLK_COUNT_ADR 0x00000D7Cu
+#define HW_ATL2_CLK0_GPIO_EVENT_TS_LSW_ADR 0x00000CD0u
+#define HW_ATL2_CLK1_GPIO_EVENT_TS_LSW_ADR 0x00000D80u
+#define HW_ATL2_CLK0_GPIO_EVENT_GEN_TS_LSW_ADR 0x00000CE0u
+#define HW_ATL2_CLK1_GPIO_EVENT_GEN_TS_LSW_ADR 0x00000D90u
+#define HW_ATL2_CLK0_GPIO_EVENT_GEN_CFG_ADR 0x00000CE8u
+#define HW_ATL2_CLK1_GPIO_EVENT_GEN_CFG_ADR 0x00000D98u
+#define HW_ATL2_TSG_GPIO_OUTPUT_EN_SHIFT 0x00000000
+#define HW_ATL2_TSG_GPIO_OUTPUT_EN_MSK 0x00000001
+#define HW_ATL2_TSG_GPIO_EVENT_MODE_SHIFT 0x00000001
+#define HW_ATL2_TSG_GPIO_EVENT_MODE_MSK 0x00000006
+#define HW_ATL2_TSG_GPIO_EVENT_MODE_SET_ON_TIME 0x00000003
+#define HW_ATL2_TSG_GPIO_GEN_OUTPUT_EN_MSK 0x00000008
+#define HW_ATL2_CLK0_GPIO_EVENT_HIGH_TIME_LSW_ADR 0x00000CF0u
+#define HW_ATL2_CLK1_GPIO_EVENT_HIGH_TIME_LSW_ADR 0x00000DA0u
+#define HW_ATL2_CLK0_GPIO_EVENT_LOW_TIME_LSW_ADR 0x00000CF8u
+#define HW_ATL2_CLK1_GPIO_EVENT_LOW_TIME_LSW_ADR 0x00000DA8u
+/* PCIE Extended tag enable Bitfield Definitions
+ */
+#define HW_ATL2_PHI_EXT_TAG_EN_ADR 0x00001000
+#define HW_ATL2_PHI_EXT_TAG_EN_MSK 0x00000020
+#define HW_ATL2_PHI_EXT_TAG_EN_SHIFT 5
 /* Launch time control register */
 #define HW_ATL2_LT_CTRL_ADR 0x00007a1c
 
@@ -387,5 +561,25 @@
 #define HW_ATL2_MCP_HOST_REQ_INT_ADR 0x00000F00u
 #define HW_ATL2_MCP_HOST_REQ_INT_SET_ADR 0x00000F04u
 #define HW_ATL2_MCP_HOST_REQ_INT_CLR_ADR 0x00000F08u
-
+/* Register address for bitfield PTP EXT GPIO TS SEL */
+#define HW_ATL2_TSG0_EXT_GPIO_TS_INPUT_SEL_ADR 0x00003664
+/* Bitmask for bitfield PTP EXT GPIO TS SEL */
+#define HW_ATL2_TSG0_EXT_GPIO_TS_INPUT_SEL_MSK 0x00001F00
+/* Lower bit position of bitfield PTP EXT GPIO TS SEL */
+#define HW_ATL2_TSG0_EXT_GPIO_TS_INPUT_SEL_SHIFT 8
+/* Register address for bitfield TSG EXT GPIO TS SEL */
+#define HW_ATL2_TSG1_EXT_GPIO_TS_INPUT_SEL_ADR 0x00003660
+/* Bitmask for bitfield TSG EXT GPIO TS SEL */
+#define HW_ATL2_TSG1_EXT_GPIO_TS_INPUT_SEL_MSK 0x00001F00
+/* Lower bit position of bitfield TSG EXT GPIO TS SEL */
+#define HW_ATL2_TSG1_EXT_GPIO_TS_INPUT_SEL_SHIFT 8
+/* Register address for bitfield GPIO{P} Special Mode */
+#define HW_ATL2_GPIO_PIN_SPEC_MODE_ADR(pin) (0x00003698 + (pin) * 0x4)
+/* Bitmask for bitfield GPIO{P} Special Mode */
+#define HW_ATL2_GPIO_PIN_SPEC_MODE_MSK 0x0000000C
+/* Lower bit position of bitfield GPIO{P} Special Mode */
+#define HW_ATL2_GPIO_PIN_SPEC_MODE_SHIFT 2
+#define HW_ATL2_GPIO_PIN_SPEC_MODE_TSG1_EVENT_OUTPUT 0
+#define HW_ATL2_GPIO_PIN_SPEC_MODE_TSG0_EVENT_OUTPUT 2
+#define HW_ATL2_GPIO_PIN_SPEC_MODE_GPIO 3
 #endif /* HW_ATL2_LLH_INTERNAL_H */
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 2/9] net: atlantic: move active_ipv4/ipv6 bitmap updates after HW write
From: sukhdeeps @ 2026-05-08 12:01 UTC (permalink / raw)
  To: netdev
  Cc: irusskikh, epomozov, richardcochran, andrew+netdev, davem,
	edumazet, kuba, pabeni, vadim.fedorenko, linux-kernel,
	Sukhdeep Singh
In-Reply-To: <20260508120156.3060-1-sukhdeeps@marvell.com>

From: Sukhdeep Singh <sukhdeeps@marvell.com>

Move active_ipv4/active_ipv6 bitmap updates from aq_set_data_fl3l4()
into aq_add_del_fl3l4() after the hardware write succeeds. The bitmaps
track which filter slots are actively programmed in hardware and must
only be updated once the HW write is confirmed.

Also remove bitmap manipulation from aq_nic_reserve_filter() and
aq_nic_release_filter(). These functions manage filter slot reservation
counts, not HW filter state. Setting active_ipv4 bits at reservation
time (before any filter is programmed) and clearing them at release
time (regardless of HW state) results in incorrect state visible to
aq_check_approve_fl3l4() for IPv4/IPv6 mixing validation.

This corrected state management is required for the AQC113 L3L4 filter
path introduced later in this series.

Signed-off-by: Sukhdeep Singh <sukhdeeps@marvell.com>
---
 .../ethernet/aquantia/atlantic/aq_filters.c   | 36 ++++++++++++-------
 .../net/ethernet/aquantia/atlantic/aq_nic.c   |  3 --
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
index eef52f23166d..150a0b1af26a 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
@@ -479,15 +479,8 @@ static int aq_set_data_fl3l4(struct aq_nic_s *aq_nic,
 	data->is_ipv6 = rx_fltrs->fl3l4.is_ipv6;
 	data->location = HW_ATL_GET_REG_LOCATION_FL3L4(fsp->location);
 
-	if (!add) {
-		if (!data->is_ipv6)
-			rx_fltrs->fl3l4.active_ipv4 &= ~BIT(data->location);
-		else
-			rx_fltrs->fl3l4.active_ipv6 &=
-				~BIT((data->location) / 4);
-
+	if (!add)
 		return 0;
-	}
 
 	data->cmd |= HW_ATL_RX_ENABLE_FLTR_L3L4;
 
@@ -515,11 +508,9 @@ static int aq_set_data_fl3l4(struct aq_nic_s *aq_nic,
 			ntohl(fsp->h_u.tcp_ip4_spec.ip4src);
 		data->ip_dst[0] =
 			ntohl(fsp->h_u.tcp_ip4_spec.ip4dst);
-		rx_fltrs->fl3l4.active_ipv4 |= BIT(data->location);
 	} else {
 		int i;
 
-		rx_fltrs->fl3l4.active_ipv6 |= BIT((data->location) / 4);
 		for (i = 0; i < HW_ATL_RX_CNT_REG_ADDR_IPV6; ++i) {
 			data->ip_dst[i] =
 				ntohl(fsp->h_u.tcp_ip6_spec.ip6dst[i]);
@@ -574,16 +565,35 @@ static int aq_set_fl3l4(struct aq_hw_s *aq_hw,
 static int aq_add_del_fl3l4(struct aq_nic_s *aq_nic,
 			    struct aq_rx_filter *aq_rx_fltr, bool add)
 {
+	struct aq_hw_rx_fltrs_s *rx_fltrs = aq_get_hw_rx_fltrs(aq_nic);
 	const struct aq_hw_ops *aq_hw_ops = aq_nic->aq_hw_ops;
 	struct aq_hw_s *aq_hw = aq_nic->aq_hw;
 	struct aq_rx_filter_l3l4 data;
+	int err;
 
 	if (unlikely(aq_rx_fltr->aq_fsp.location < AQ_RX_FIRST_LOC_FL3L4 ||
-		     aq_rx_fltr->aq_fsp.location > AQ_RX_LAST_LOC_FL3L4  ||
-		     aq_set_data_fl3l4(aq_nic, aq_rx_fltr, &data, add)))
+		     aq_rx_fltr->aq_fsp.location > AQ_RX_LAST_LOC_FL3L4))
 		return -EINVAL;
 
-	return aq_set_fl3l4(aq_hw, aq_hw_ops, &data);
+	aq_set_data_fl3l4(aq_nic, aq_rx_fltr, &data, add);
+
+	err = aq_set_fl3l4(aq_hw, aq_hw_ops, &data);
+	if (err)
+		return err;
+
+	if (add) {
+		if (!data.is_ipv6)
+			rx_fltrs->fl3l4.active_ipv4 |= BIT(data.location);
+		else
+			rx_fltrs->fl3l4.active_ipv6 |= BIT(data.location / 4);
+	} else {
+		if (!data.is_ipv6)
+			rx_fltrs->fl3l4.active_ipv4 &= ~BIT(data.location);
+		else
+			rx_fltrs->fl3l4.active_ipv6 &= ~BIT(data.location / 4);
+	}
+
+	return 0;
 }
 
 static int aq_add_del_rule(struct aq_nic_s *aq_nic,
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index ef9447810071..3cec853e9fad 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -1522,8 +1522,6 @@ u8 aq_nic_reserve_filter(struct aq_nic_s *self, enum aq_rx_filter_type type)
 	case aq_rx_filter_l3l4:
 		fltr_cnt = AQ_RX_LAST_LOC_FL3L4 - AQ_RX_FIRST_LOC_FL3L4;
 		n_bit = fltr_cnt - self->aq_hw_rx_fltrs.fl3l4.reserved_count;
-
-		self->aq_hw_rx_fltrs.fl3l4.active_ipv4 |= BIT(n_bit);
 		self->aq_hw_rx_fltrs.fl3l4.reserved_count++;
 		location = n_bit;
 		break;
@@ -1543,7 +1541,6 @@ void aq_nic_release_filter(struct aq_nic_s *self, enum aq_rx_filter_type type,
 		break;
 	case aq_rx_filter_l3l4:
 		self->aq_hw_rx_fltrs.fl3l4.reserved_count--;
-		self->aq_hw_rx_fltrs.fl3l4.active_ipv4 &= ~BIT(location);
 		break;
 	default:
 		break;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 3/9] net: atlantic: decouple aq_set_data_fl3l4() from driver internals
From: sukhdeeps @ 2026-05-08 12:01 UTC (permalink / raw)
  To: netdev
  Cc: irusskikh, epomozov, richardcochran, andrew+netdev, davem,
	edumazet, kuba, pabeni, vadim.fedorenko, linux-kernel,
	Sukhdeep Singh
In-Reply-To: <20260508120156.3060-1-sukhdeeps@marvell.com>

From: Sukhdeep Singh <sukhdeeps@marvell.com>

Refactor aq_set_data_fl3l4() to take an ethtool_rx_flow_spec pointer and
an explicit HW register location instead of driver-internal structures
(aq_nic_s, aq_rx_filter). This makes the function reusable for PTP
filter setup which constructs flow specs independently.

Key changes:
- Add aq_is_ipv6_flow_type() helper to derive IPv6 status from the
  flow_type field, replacing the dependency on rx_fltrs->fl3l4.is_ipv6
  shared state.
- Change aq_set_data_fl3l4() signature to accept (fsp, data, location,
  add) and export it via aq_filters.h.
- Update aq_add_del_fl3l4() to compute the HW register location and
  pass it explicitly.

Signed-off-by: Sukhdeep Singh <sukhdeeps@marvell.com>
---
 .../ethernet/aquantia/atlantic/aq_filters.c   | 31 ++++++++++++++-----
 .../ethernet/aquantia/atlantic/aq_filters.h   |  3 ++
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
index 150a0b1af26a..4be7b629bfac 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
@@ -181,6 +181,20 @@ aq_check_approve_fvlan(struct aq_nic_s *aq_nic,
 	return 0;
 }
 
+static bool aq_is_ipv6_flow_type(const struct ethtool_rx_flow_spec *fsp)
+{
+	switch (fsp->flow_type & ~FLOW_EXT) {
+	case TCP_V6_FLOW:
+	case UDP_V6_FLOW:
+	case SCTP_V6_FLOW:
+	case IPV6_FLOW:
+	case IPV6_USER_FLOW:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int __must_check
 aq_check_filter(struct aq_nic_s *aq_nic,
 		struct ethtool_rx_flow_spec *fsp)
@@ -466,18 +480,16 @@ static int aq_add_del_fvlan(struct aq_nic_s *aq_nic,
 	return aq_filters_vlans_update(aq_nic);
 }
 
-static int aq_set_data_fl3l4(struct aq_nic_s *aq_nic,
-			     struct aq_rx_filter *aq_rx_fltr,
-			     struct aq_rx_filter_l3l4 *data, bool add)
+int aq_set_data_fl3l4(const struct ethtool_rx_flow_spec *fsp,
+		      struct aq_rx_filter_l3l4 *data,
+		      int location, bool add)
 {
-	struct aq_hw_rx_fltrs_s *rx_fltrs = aq_get_hw_rx_fltrs(aq_nic);
-	const struct ethtool_rx_flow_spec *fsp = &aq_rx_fltr->aq_fsp;
 	u32 flow = fsp->flow_type & ~FLOW_EXT;
 
 	memset(data, 0, sizeof(*data));
 
-	data->is_ipv6 = rx_fltrs->fl3l4.is_ipv6;
-	data->location = HW_ATL_GET_REG_LOCATION_FL3L4(fsp->location);
+	data->is_ipv6 = aq_is_ipv6_flow_type(fsp);
+	data->location = location;
 
 	if (!add)
 		return 0;
@@ -569,13 +581,16 @@ static int aq_add_del_fl3l4(struct aq_nic_s *aq_nic,
 	const struct aq_hw_ops *aq_hw_ops = aq_nic->aq_hw_ops;
 	struct aq_hw_s *aq_hw = aq_nic->aq_hw;
 	struct aq_rx_filter_l3l4 data;
+	int location;
 	int err;
 
 	if (unlikely(aq_rx_fltr->aq_fsp.location < AQ_RX_FIRST_LOC_FL3L4 ||
 		     aq_rx_fltr->aq_fsp.location > AQ_RX_LAST_LOC_FL3L4))
 		return -EINVAL;
 
-	aq_set_data_fl3l4(aq_nic, aq_rx_fltr, &data, add);
+	location = HW_ATL_GET_REG_LOCATION_FL3L4(aq_rx_fltr->aq_fsp.location);
+
+	aq_set_data_fl3l4(&aq_rx_fltr->aq_fsp, &data, location, add);
 
 	err = aq_set_fl3l4(aq_hw, aq_hw_ops, &data);
 	if (err)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_filters.h b/drivers/net/ethernet/aquantia/atlantic/aq_filters.h
index 122e06c88a33..96e89c8e52d0 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_filters.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_filters.h
@@ -32,5 +32,8 @@ int aq_clear_rxnfc_all_rules(struct aq_nic_s *aq_nic);
 int aq_reapply_rxnfc_all_rules(struct aq_nic_s *aq_nic);
 int aq_filters_vlans_update(struct aq_nic_s *aq_nic);
 int aq_filters_vlan_offload_off(struct aq_nic_s *aq_nic);
+int aq_set_data_fl3l4(const struct ethtool_rx_flow_spec *fsp,
+		      struct aq_rx_filter_l3l4 *data,
+		      int location, bool add);
 
 #endif /* AQ_FILTERS_H */
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 1/9] net: atlantic: correct L3L4 filter flow_type masking and IPv6 handling
From: sukhdeeps @ 2026-05-08 12:01 UTC (permalink / raw)
  To: netdev
  Cc: irusskikh, epomozov, richardcochran, andrew+netdev, davem,
	edumazet, kuba, pabeni, vadim.fedorenko, linux-kernel,
	Sukhdeep Singh
In-Reply-To: <20260508120156.3060-1-sukhdeeps@marvell.com>

From: Sukhdeep Singh <sukhdeeps@marvell.com>

Correct three issues in aq_set_data_fl3l4() required for the AQC113
PTP filter path introduced later in this series:

1. Mask FLOW_EXT from flow_type before the protocol switch statement.
   Flow types with FLOW_EXT set (e.g. TCP_V4_FLOW | FLOW_EXT) fall
   through to the default case and skip protocol comparison flags.

2. Extend the L3 address comparison check to cover all four IPv6
   words. The original code only checked ip_src[0]/ip_dst[0] and
   required !is_ipv6, so CMP_SRC_ADDR_L3/CMP_DEST_ADDR_L3 were never
   set for IPv6 filters.

3. Use explicit flow type checks for port extraction instead of
   negating IP_USER_FLOW/IPV6_USER_FLOW. The old check did not mask
   FLOW_EXT, so IP_USER_FLOW | FLOW_EXT would incorrectly attempt
   port extraction. Use the actual flow type to pick the correct
   union member directly.

Signed-off-by: Sukhdeep Singh <sukhdeeps@marvell.com>
---
 .../ethernet/aquantia/atlantic/aq_filters.c   | 33 ++++++++++---------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
index e419c73b32ce..eef52f23166d 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_filters.c
@@ -472,6 +472,7 @@ static int aq_set_data_fl3l4(struct aq_nic_s *aq_nic,
 {
 	struct aq_hw_rx_fltrs_s *rx_fltrs = aq_get_hw_rx_fltrs(aq_nic);
 	const struct ethtool_rx_flow_spec *fsp = &aq_rx_fltr->aq_fsp;
+	u32 flow = fsp->flow_type & ~FLOW_EXT;
 
 	memset(data, 0, sizeof(*data));
 
@@ -490,7 +491,7 @@ static int aq_set_data_fl3l4(struct aq_nic_s *aq_nic,
 
 	data->cmd |= HW_ATL_RX_ENABLE_FLTR_L3L4;
 
-	switch (fsp->flow_type) {
+	switch (flow) {
 	case TCP_V4_FLOW:
 	case TCP_V6_FLOW:
 		data->cmd |= HW_ATL_RX_ENABLE_CMP_PROT_L4;
@@ -527,23 +528,23 @@ static int aq_set_data_fl3l4(struct aq_nic_s *aq_nic,
 		}
 		data->cmd |= HW_ATL_RX_ENABLE_L3_IPV6;
 	}
-	if (fsp->flow_type != IP_USER_FLOW &&
-	    fsp->flow_type != IPV6_USER_FLOW) {
-		if (!data->is_ipv6) {
-			data->p_dst =
-				ntohs(fsp->h_u.tcp_ip4_spec.pdst);
-			data->p_src =
-				ntohs(fsp->h_u.tcp_ip4_spec.psrc);
-		} else {
-			data->p_dst =
-				ntohs(fsp->h_u.tcp_ip6_spec.pdst);
-			data->p_src =
-				ntohs(fsp->h_u.tcp_ip6_spec.psrc);
-		}
+	if (flow == TCP_V4_FLOW || flow == UDP_V4_FLOW ||
+	    flow == SCTP_V4_FLOW) {
+		data->p_dst = ntohs(fsp->h_u.tcp_ip4_spec.pdst);
+		data->p_src = ntohs(fsp->h_u.tcp_ip4_spec.psrc);
+	}
+	if (flow == TCP_V6_FLOW || flow == UDP_V6_FLOW ||
+	    flow == SCTP_V6_FLOW) {
+		data->p_dst = ntohs(fsp->h_u.tcp_ip6_spec.pdst);
+		data->p_src = ntohs(fsp->h_u.tcp_ip6_spec.psrc);
 	}
-	if (data->ip_src[0] && !data->is_ipv6)
+	if (data->ip_src[0] ||
+	    (data->is_ipv6 && (data->ip_src[1] || data->ip_src[2] ||
+			       data->ip_src[3])))
 		data->cmd |= HW_ATL_RX_ENABLE_CMP_SRC_ADDR_L3;
-	if (data->ip_dst[0] && !data->is_ipv6)
+	if (data->ip_dst[0] ||
+	    (data->is_ipv6 && (data->ip_dst[1] || data->ip_dst[2] ||
+			       data->ip_dst[3])))
 		data->cmd |= HW_ATL_RX_ENABLE_CMP_DEST_ADDR_L3;
 	if (data->p_dst)
 		data->cmd |= HW_ATL_RX_ENABLE_CMP_DEST_PORT_L4;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 0/9] net: atlantic: add PTP support for AQC113 (Antigua)
From: sukhdeeps @ 2026-05-08 12:01 UTC (permalink / raw)
  To: netdev
  Cc: irusskikh, epomozov, richardcochran, andrew+netdev, davem,
	edumazet, kuba, pabeni, vadim.fedorenko, linux-kernel,
	Sukhdeep Singh
In-Reply-To: <20260506135706.2834-1-sukhdeeps@marvell.com>

From: Sukhdeep Singh <sukhdeeps@marvell.com>

This series adds IEEE 1588 PTP support for the AQC113 (Antigua) network
controller. AQC113 is the successor to the existing AQC107 (Atlantic)
chip already supported by the atlantic driver.

AQC113 uses a substantially different hardware architecture for PTP
compared to AQC107:

  - Dual on-chip TSG clocks with direct register access instead of
    PHY-based timestamping via firmware
  - TX timestamps via descriptor writeback instead of firmware mailbox
  - Hardware L3/L4 RX filters for PTP multicast steering with both
    IPv4 and IPv6 support
  - Reference-counted shared filter slots managed through an Action
    Resolver Table (ART), allowing multiple rules to share L3/L4
    hardware filters when their match criteria are identical

The series is structured in three parts:

Patches 1-3 prepare the existing L3/L4 filter path:

  Patch 1 corrects flow_type masking and IPv6 address handling in
  aq_set_data_fl3l4(). Patch 2 moves the active_ipv4/ipv6 bitmap
  updates to after the hardware write succeeds. Patch 3 decouples
  the function from driver-internal structures so it can be called
  directly by the AQC113 PTP filter setup code.

Patches 4-6 add the AQC113 hardware infrastructure:

  Patch 4 adds the low-level register definitions and accessor
  functions. Patch 5 adds filter data structures and firmware
  capability query. Patch 6 implements the complete L2/L3/L4 RX
  filter management layer including the reference-counted sharing
  and ART integration.

Patches 7-9 add the AQC113 PTP feature:

  Patch 7 reserves the dedicated PTP traffic class buffer and
  configures the TX path. Patch 8 extends the hw_ops interface
  with PTP-specific function pointers and updates AQC107 to the
  new signatures. Patch 9 implements the full PTP subsystem
  integration for AQC113.

The existing AQC107 PTP implementation is not functionally changed
by this series; AQC113-specific code paths are gated on chip
detection throughout.

Tested on AQC113 at 1G, 2.5G, 5G, and 10G link speeds using
ptp4l/phc2sys with hardware timestamping in both L2 and L4
(IPv4/IPv6) modes.

Changes in v2:
- Patch 6: Remove redundant variable initializers for art_mask, h, l
  and err as suggested by Vadim Fedorenko

Sukhdeep Singh (9):
  net: atlantic: correct L3L4 filter flow_type masking and IPv6 handling
  net: atlantic: move active_ipv4/ipv6 bitmap updates after HW write
  net: atlantic: decouple aq_set_data_fl3l4() from driver internals
  net: atlantic: add AQC113 hardware register definitions and accessors
  net: atlantic: add AQC113 filter data structures and firmware query
  net: atlantic: implement AQC113 L2/L3/L4 RX filter management
  net: atlantic: add AQC113 PTP traffic class and TX path setup
  net: atlantic: extend hw_ops and TX descriptor for AQC113 PTP
  net: atlantic: add PTP support for AQC113 (Antigua)

 drivers/net/ethernet/aquantia/atlantic/aq_filters.c          |  64 +-
 drivers/net/ethernet/aquantia/atlantic/aq_filters.h          |   3 +
 drivers/net/ethernet/aquantia/atlantic/aq_hw.h               |  37 +-
 drivers/net/ethernet/aquantia/atlantic/aq_main.c             |  33 +-
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c              |  51 +-
 drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c         |   4 +-
 drivers/net/ethernet/aquantia/atlantic/aq_ptp.c              | 531 +++++++--
 drivers/net/ethernet/aquantia/atlantic/aq_ptp.h              |  15 +-
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c             |  42 +-
 drivers/net/ethernet/aquantia/atlantic/aq_ring.h             |   4 +-
 drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c   |  15 +-
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c    | 813 +++++++++++-
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.h    |  12 +
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_internal.h | 69 +-
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.c | 360 ++++++
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh.h | 107 +-
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_llh_internal.h | 204 ++-
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils.c |  33 +
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils.h |  15 +
 drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c |  52 +
 20 files changed, 2244 insertions(+), 224 deletions(-)

-- 
2.43.0

^ permalink raw reply

* Re: ipv6: ip6mr: Call ip6mr_fib_lookup() under RCU in pim6_rcv() and reg_vif6_xmit()
From: y2k @ 2026-05-08 12:02 UTC (permalink / raw)
  To: kuniyu; +Cc: davem, dsahern, edumazet, idosch, kuba, linux-kernel, netdev,
	pabeni
In-Reply-To: <20260508115204.4068686-1-kuniyu@google.com>

Thank you for the clarification. I missed that IP6MR is not yet
converted to ->exit_rtnl() and therefore does not need this change.

I appreciate the quick response.

y2k

^ permalink raw reply

* Re: ipv6: ip6mr: Call ip6mr_fib_lookup() under RCU in pim6_rcv() and reg_vif6_xmit()
From: Kuniyuki Iwashima @ 2026-05-08 11:51 UTC (permalink / raw)
  To: y2k; +Cc: davem, dsahern, edumazet, idosch, kuba, linux-kernel, netdev,
	pabeni
In-Reply-To: <863d7892459fd7627388d8b0c1670292.y2k@desarrollaria.com>

From: y2k <y2k@desarrollaria.com>
Date: Fri, 08 May 2026 13:21:21 +0200
> Commit 019c892e4654 ("ipmr: Call ipmr_fib_lookup() under RCU.") fixed
> the same issue in IPv4's reg_vif_xmit(). The IPv6 counterpart has the
> same problem in two places.

No.

The change is just for rcu_dereference() added in the commit below
for ->exit_rtnl() conversion, but IP6MR is not yet converted.

---8<---
commit b3b6babf47517fde6b6de2493dea28e8831b9347
Author: Kuniyuki Iwashima <kuniyu@google.com>
Date:   Thu Apr 23 05:34:54 2026

    ipmr: Free mr_table after RCU grace period.
...
    Note that IP6MR is not yet converted to ->exit_rtnl(), so this
    change is not needed for now but will be.
---8<---


> 
> In pim6_rcv() (net/ipv6/ip6mr.c:578) and reg_vif6_xmit()
> (net/ipv6/ip6mr.c:624), ip6mr_fib_lookup() is called without holding
> rcu_read_lock().
> 
> When CONFIG_IP6_MROUTE_MULTIPLE_TABLES=n, ip6mr_fib_lookup() accesses
> net->ipv6.mrt6 directly without rcu_dereference(), while the IPv4
> equivalent correctly uses rcu_dereference(net->ipv4.mrt). This
> inconsistency means IPv6 multicast routing lacks proper RCU protection.
> 
> In reg_vif6_xmit(), rcu_read_lock() is acquired at line 628 after the
> ip6mr_fib_lookup() call at line 624 — too late. In pim6_rcv(), there
> is no rcu_read_lock() before ip6mr_fib_lookup() at line 578 at all.
> 
> Suggested fix for reg_vif6_xmit():
> 
>   + rcu_read_lock();
>     if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) {
>   +   rcu_read_unlock();
>       goto tx_err;
>     }
>     DEV_STATS_ADD(dev, tx_bytes, skb->len);
>     DEV_STATS_INC(dev, tx_packets);
>   - rcu_read_lock();
>     ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
>                        MRT6MSG_WHOLEPKT);
>     rcu_read_unlock();
> 
> Suggested fix for pim6_rcv():
> 
>   + rcu_read_lock();
>     if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) {
>   +   rcu_read_unlock();
>       goto drop;
>     }
> 
> Additionally, net->ipv6.mrt6 should be accessed via rcu_dereference()
> in ip6mr_fib_lookup() to match the IPv4 pattern in ipmr_fib_lookup().

^ permalink raw reply

* [PATCH v8 2/2] net: mana: force full-page RX buffers via ethtool private flag
From: Dipayaan Roy @ 2026-05-08 11:46 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260508115100.488506-1-dipayanroy@linux.microsoft.com>

On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
allocation in the RX refill path can cause 15-20% throughput
regression under high connection counts (>16 TCP streams).

Add an ethtool private flag "full-page-rx" that allows the user to
force one RX buffer per page, bypassing the page_pool fragment path.
This restores line-rate (180+ Gbps) performance on affected platforms.

Usage:
  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag must be explicitly
enabled by the user or udev rule.

The existing single-buffer-per-page logic for XDP and jumbo frames is
consolidated into a new helper mana_use_single_rxbuf_per_page() which
is now the single decision point for both the automatic and
user-controlled paths.

Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 +++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 103 ++++++++++++++++++
 include/net/mana/mana.h                       |   8 ++
 3 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 462a457e7d53..c4bc8bf19d75 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
 	return va;
 }
 
+static bool
+mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
+{
+	/* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
+	 * in the RX refill path (~2kB buffer) can cause significant throughput
+	 * regression under high connection counts. Allow user to force one RX
+	 * buffer per page via ethtool private flag to bypass the fragment
+	 * path.
+	 */
+	if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
+		return true;
+
+	/* For xdp and jumbo frames make sure only one packet fits per page. */
+	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
+		return true;
+
+	return false;
+}
+
 /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
 static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 			       int mtu, u32 *datasize, u32 *alloc_size,
@@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 	/* Calculate datasize first (consistent across all cases) */
 	*datasize = mtu + ETH_HLEN;
 
-	/* For xdp and jumbo frames make sure only one packet fits per page */
-	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
+	if (mana_use_single_rxbuf_per_page(apc, mtu)) {
 		if (mana_xdp_get(apc)) {
 			*headroom = XDP_PACKET_HEADROOM;
 			*alloc_size = PAGE_SIZE;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 7e79681634db..f22bbb325948 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -133,6 +133,10 @@ static const struct mana_stats_desc mana_phy_stats[] = {
 	{ "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) },
 };
 
+static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
+	[MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
+};
+
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -144,6 +148,10 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 		       ARRAY_SIZE(mana_phy_stats) +
 		       ARRAY_SIZE(mana_hc_stats)  +
 		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+
+	case ETH_SS_PRIV_FLAGS:
+		return MANA_PRIV_FLAG_MAX;
+
 	default:
 		return -EINVAL;
 	}
@@ -192,6 +200,14 @@ static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 	}
 }
 
+static void mana_get_strings_priv_flags(u8 **data)
+{
+	int i;
+
+	for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
+		ethtool_puts(data, mana_priv_flags[i]);
+}
+
 static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -200,6 +216,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 	case ETH_SS_STATS:
 		mana_get_strings_stats(apc, &data);
 		break;
+	case ETH_SS_PRIV_FLAGS:
+		mana_get_strings_priv_flags(&data);
+		break;
 	default:
 		break;
 	}
@@ -590,6 +609,88 @@ static int mana_get_link_ksettings(struct net_device *ndev,
 	return 0;
 }
 
+static u32 mana_get_priv_flags(struct net_device *ndev)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	return apc->priv_flags;
+}
+
+static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+	u32 changed = apc->priv_flags ^ priv_flags;
+	u32 old_priv_flags = apc->priv_flags;
+	bool schedule_port_reset = false;
+	int err = 0;
+
+	if (!changed)
+		return 0;
+
+	/* Reject unknown bits */
+	if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
+		return -EINVAL;
+
+	if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
+		apc->priv_flags = priv_flags;
+
+		if (!apc->port_is_up) {
+			/* Port is down, flag updated to apply on next up
+			 * so just return.
+			 */
+			return 0;
+		}
+
+		/* Pre-allocate buffers to prevent failure in mana_attach
+		 * later
+		 */
+		err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
+		if (err) {
+			netdev_err(ndev,
+				   "Insufficient memory for new allocations\n");
+			apc->priv_flags = old_priv_flags;
+			return err;
+		}
+
+		err = mana_detach(ndev, false);
+		if (err) {
+			netdev_err(ndev, "mana_detach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+
+			/* Port is in an inconsistent state. Restore
+			 * 'port_is_up' so that queue reset work handler
+			 * can properly detach and re-attach.
+			 */
+			apc->port_is_up = true;
+			schedule_port_reset = true;
+			goto out;
+		}
+
+		err = mana_attach(ndev);
+		if (err) {
+			netdev_err(ndev, "mana_attach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+
+			/* Restore 'port_is_up' so the reset work handler
+			 * can properly detach/attach. Without this,
+			 * the handler sees port_is_up=false and skips
+			 * queue allocation, leaving the port dead.
+			 */
+			apc->port_is_up = true;
+			schedule_port_reset = true;
+		}
+	}
+
+out:
+	mana_pre_dealloc_rxbufs(apc);
+
+	if (schedule_port_reset)
+		queue_work(apc->ac->per_port_queue_reset_wq,
+			   &apc->queue_reset_work);
+
+	return err;
+}
+
 const struct ethtool_ops mana_ethtool_ops = {
 	.supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES,
 	.get_ethtool_stats	= mana_get_ethtool_stats,
@@ -608,4 +709,6 @@ const struct ethtool_ops mana_ethtool_ops = {
 	.set_ringparam          = mana_set_ringparam,
 	.get_link_ksettings	= mana_get_link_ksettings,
 	.get_link		= ethtool_op_get_link,
+	.get_priv_flags		= mana_get_priv_flags,
+	.set_priv_flags		= mana_set_priv_flags,
 };
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index aa90a858c8e3..1d44a78da520 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -30,6 +30,12 @@ enum TRI_STATE {
 	TRI_STATE_TRUE = 1
 };
 
+/* MANA ethtool private flag bit positions */
+enum mana_priv_flag_bits {
+	MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
+	MANA_PRIV_FLAG_MAX,
+};
+
 /* Number of entries for hardware indirection table must be in power of 2 */
 #define MANA_INDIRECT_TABLE_MAX_SIZE 512
 #define MANA_INDIRECT_TABLE_DEF_SIZE 64
@@ -531,6 +537,8 @@ struct mana_port_context {
 	u32 rxbpre_headroom;
 	u32 rxbpre_frag_count;
 
+	u32 priv_flags;
+
 	struct bpf_prog *bpf_prog;
 
 	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v8 1/2] net: mana: refactor mana_get_strings() and mana_get_sset_count() to use switch
From: Dipayaan Roy @ 2026-05-08 11:46 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260508115100.488506-1-dipayanroy@linux.microsoft.com>

Refactor mana_get_strings() and mana_get_sset_count() from if/else to
switch statements in preparation for adding ethtool private flags
support which requires handling ETH_SS_PRIV_FLAGS.

No functional change.

Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
 .../ethernet/microsoft/mana/mana_ethtool.c    | 75 ++++++++++++-------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 04350973e19e..7e79681634db 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -138,53 +138,70 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 
-	if (stringset != ETH_SS_STATS)
+	switch (stringset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(mana_eth_stats) +
+		       ARRAY_SIZE(mana_phy_stats) +
+		       ARRAY_SIZE(mana_hc_stats)  +
+		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	default:
 		return -EINVAL;
-
-	return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + ARRAY_SIZE(mana_hc_stats) +
-			num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	}
 }
 
-static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 {
-	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 	int i, j;
 
-	if (stringset != ETH_SS_STATS)
-		return;
 	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
-		ethtool_puts(&data, mana_eth_stats[i].name);
+		ethtool_puts(data, mana_eth_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
-		ethtool_puts(&data, mana_hc_stats[i].name);
+		ethtool_puts(data, mana_hc_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
-		ethtool_puts(&data, mana_phy_stats[i].name);
+		ethtool_puts(data, mana_phy_stats[i].name);
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "rx_%d_packets", i);
-		ethtool_sprintf(&data, "rx_%d_bytes", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
-		ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
+		ethtool_sprintf(data, "rx_%d_packets", i);
+		ethtool_sprintf(data, "rx_%d_bytes", i);
+		ethtool_sprintf(data, "rx_%d_xdp_drop", i);
+		ethtool_sprintf(data, "rx_%d_xdp_tx", i);
+		ethtool_sprintf(data, "rx_%d_xdp_redirect", i);
+		ethtool_sprintf(data, "rx_%d_pkt_len0_err", i);
 		for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
-			ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2);
+			ethtool_sprintf(data,
+					"rx_%d_coalesced_cqe_%d",
+					i,
+					j + 2);
 	}
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "tx_%d_packets", i);
-		ethtool_sprintf(&data, "tx_%d_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_xdp_xmit", i);
-		ethtool_sprintf(&data, "tx_%d_tso_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_long_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_short_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_csum_partial", i);
-		ethtool_sprintf(&data, "tx_%d_mana_map_err", i);
+		ethtool_sprintf(data, "tx_%d_packets", i);
+		ethtool_sprintf(data, "tx_%d_bytes", i);
+		ethtool_sprintf(data, "tx_%d_xdp_xmit", i);
+		ethtool_sprintf(data, "tx_%d_tso_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_bytes", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_bytes", i);
+		ethtool_sprintf(data, "tx_%d_long_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_short_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_csum_partial", i);
+		ethtool_sprintf(data, "tx_%d_mana_map_err", i);
+	}
+}
+
+static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		mana_get_strings_stats(apc, &data);
+		break;
+	default:
+		break;
 	}
 }
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v8 0/2] net: mana: add ethtool private flag for full-page RX buffers
From: Dipayaan Roy @ 2026-05-08 11:46 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov

On some ARM64 platforms with 4K PAGE_SIZE, utilizing page_pool 
fragments for allocation in the RX refill path (~2kB buffer per fragment)
causes 15-20% throughput regression under high connection counts
(>16 TCP streams at 180+ Gbps). Using full-page buffers on these
platforms shows no regression and restores line-rate performance.

This behavior is observed on a single platform; other platforms
perform better with page_pool fragments, indicating this is not a
page_pool issue but platform-specific.

This series adds an ethtool private flag "full-page-rx" to let the
user opt in to one RX buffer per page:

  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag can be persisted
via udev rule for affected platforms.

Changes in v8:
  - Fixed queue_reset_work recovery by restoring port_is_up before
    scheduling reset so the handler can properly re-attach.
  - Simplified "err && schedule_port_reset" to "schedule_port_reset".
Changes in v7:
  - Rebased onto net-next.
  - Retained private flag approach after David Wei's testing on
    Grace (ARM64) confirmed that fragment mode outperforms
    full-page mode on other platforms, validating this is a
    single-platform workaround rather than a generic issue.
Changes in v6:
  - Added missed maintainers.
Changes in v5:
  - Split prep refactor into separate patch (patch 1/2)
Changes in v4:
  - Dropping the smbios string parsing and add ethtool priv flag
    to reconfigure the queues with full page rx buffers.
Changes in v3:
  - changed u8* to char*
Changes in v2:
  - separate reading string index and the string, remove inline.

Dipayaan Roy (2):
  net: mana: refactor mana_get_strings() and mana_get_sset_count() to
    use switch
  net: mana: force full-page RX buffers via ethtool private flag

 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 ++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 178 +++++++++++++++---
 include/net/mana/mana.h                       |   8 +
 3 files changed, 177 insertions(+), 31 deletions(-)

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH net-next 10/12] net: stmmac: tc956x: add TC956x/QPS615 support
From: Daniel Thompson @ 2026-05-08 11:25 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Alex Elder, andrew+netdev, davem, edumazet, kuba, pabeni,
	maxime.chevallier, rmk+kernel, andersson, konradybcio, robh,
	krzk+dt, conor+dt, linusw, brgl, arnd, gregkh, mohd.anwar,
	a0987203069, alexandre.torgue, ast, boon.khai.ng, chenchuangyu,
	chenhuacai, daniel, hawk, hkallweit1, inochiama, john.fastabend,
	julianbraha, livelycarpet87, matthew.gerlach, mcoquelin.stm32, me,
	prabhakar.mahadev-lad.rj, richardcochran, rohan.g.thomas, sdf,
	siyanteng, weishangjuan, wens, netdev, bpf, linux-arm-msm,
	devicetree, linux-gpio, linux-stm32, linux-arm-kernel,
	linux-kernel
In-Reply-To: <ef6df85f-11ac-404d-958a-8cf69b3b6bb6@lunn.ch>

On Thu, May 07, 2026 at 06:29:15PM +0200, Andrew Lunn wrote:
> On Thu, May 07, 2026 at 05:03:46PM +0100, Daniel Thompson wrote:
> > On Fri, May 01, 2026 at 09:04:58PM +0200, Andrew Lunn wrote:
> > > > +static struct tc956x_mac_speed mac_speed[] = {
> > > > +	{ PHY_INTERFACE_MODE_2500BASEX,	SPEED_2500,  SP_SEL_SGMII_2500M, },
> > > > +	{ PHY_INTERFACE_MODE_SGMII,	SPEED_2500,  SP_SEL_SGMII_2500M, },
> > > > +	{ PHY_INTERFACE_MODE_SGMII,	SPEED_1000,  SP_SEL_SGMII_1000M, },
> > >
> > > That looks odd. Some vendors implemented 2500BaseX using SGMII
> > > overclocked. But that is not strictly 2500BaseX. Having the 2500BASEX
> > > entry suggests you have real 2500BASEX, so why have an SGMII entry
> > > with SPEED_2500?
> >
> > This is a consequence of the code that uses this lookup table being
> > called both during initialization and from the fix_mac_speed() callback.
> >
> > During initialization we only have the value in plat->phy_interface to
> > go on so we run the lookup table using plat->phy_interface (which is
> > typically PHY_INTERFACE_MODE_SGMII) and with the maximum permitted
> > speed.
>
> Something sounds wrong here. SGMII only supports 10/100/1G. You should
> never be asked to do SGMII at 2500. It should ask for 2500BaseX.

We weren't being asked. It was just an internal driver trick to common
up some code paths.

However, I did a few tests and the internal driver trick doesn't
actually do much that we can't achieve a different way. With that changed I
can (and will) remove the PHY_INTERFACE_MODE_SGMII/SPEED_2500 entry
from the table.


> > I haven't got detailed enough notes to allow me to double check but I
> > think there were problems completing the initial MAC reset if we didn't
> > write something sensible to the hardware during initialization.
>
> > During fix_mac_speed() we get told to adopt 2500base-x. Reviewing the
> > code I can see we don't propagate that and just use
> > plat->phy_interface for fix_mac_speed(). I will fix the code so that
> > the requested interface propagates properly to the lookup table but I
> > think we would still rely on the SGMII entry to get sane initial values
> > to write to the hardware.
>
> Getting sane values into the hardware is good, but 2500 SGMII is not
> sane :-(

BTW if you are bothered by SP_SEL_SGMII_2500M, that name comes directly
from the TRM and I'd prefer to keep it if I can. The enumerated value
we have to write into the SP_SEL for 2500base-X is "SGMII 2500M".


Daniel.

^ permalink raw reply

* Re: [PATCH v1 bpf-next 7/8] bpf: tcp: Add SOCK_OPS rcvlowat hook.
From: Kuniyuki Iwashima @ 2026-05-08 11:30 UTC (permalink / raw)
  To: Jiayuan Chen
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Yonghong Song, John Fastabend, Stanislav Fomichev, Eric Dumazet,
	Neal Cardwell, Willem de Bruijn, Tenzin Ukyab, Kuniyuki Iwashima,
	bpf, netdev
In-Reply-To: <21ee2d5d-fc8c-497e-aa98-e5e4e3fbecf8@linux.dev>

On Fri, May 8, 2026 at 3:37 AM Jiayuan Chen <jiayuan.chen@linux.dev> wrote:
>
>
> On 5/8/26 3:33 PM, Kuniyuki Iwashima wrote:
> > Now, it is time to add the new hooks for BPF_SOCK_OPS_RCVLOWAT_CB.
> >
> > Let's invoke the BPF SOCK_OPS prog when
> >
> >    1. TCP stack enqueues skb to sk->sk_receive_queue
> >       -> tcp_queue_rcv(), tcp_ofo_queue(), and tcp_fastopen_add_skb()
> >
> >    2. TCP recvmsg() completes
> >       -> __tcp_cleanup_rbuf()
> >
> > This will allow the BPF prog to parse each skb and dynamically
> > adjust sk->sk_rcvlowat to suppress unnecessary EPOLLIN wakeups
> > until sufficient data (e.g., a full RPC frame) is available
> > in the receive queue.
> >
> > Note that the direct access to bpf_sock_ops.data is intentionally
> > disabled by passing 0 as end_offset.
> >
> > Instead, the BPF prog is supposed to use bpf_skb_load_bytes()
> > with bpf_sock_ops because payload is not in the linear area
> > with TCP header/data split on and skb may contain a RPC
> > descriptor in skb frag.  This also simplifies the BPF prog.
> >
> > Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> > ---
> >   include/net/tcp.h       | 14 ++++++++++++++
> >   net/ipv4/tcp.c          |  2 ++
> >   net/ipv4/tcp_fastopen.c |  2 ++
> >   net/ipv4/tcp_input.c    | 10 ++++++++++
> >   4 files changed, 28 insertions(+)
> >
> > diff --git a/include/net/tcp.h b/include/net/tcp.h
> > index 4e9e634e276b..003e46c9b500 100644
> > --- a/include/net/tcp.h
> > +++ b/include/net/tcp.h
> > @@ -737,6 +737,20 @@ static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock
> >   }
> >   #endif
> >
> > +#ifdef CONFIG_CGROUP_BPF
> > +void bpf_skops_rcvlowat(struct sock *sk, struct sk_buff *skb);
> > +
> > +static inline void tcp_bpf_rcvlowat(struct sock *sk, struct sk_buff *skb)
> > +{
> > +     if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RCVLOWAT_CB_FLAG))
> > +             bpf_skops_rcvlowat(sk, skb);
> > +}
> > +#else
> > +static inline void tcp_bpf_rcvlowat(struct sock *sk, struct sk_buff *skb)
> > +{
> > +}
> > +#endif
> > +
> >   /* From net/ipv6/syncookies.c */
> >   int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th);
> >   struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
> > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> > index 1d9e52fc454f..80144b97a87a 100644
> > --- a/net/ipv4/tcp.c
> > +++ b/net/ipv4/tcp.c
> > @@ -1602,6 +1602,8 @@ void __tcp_cleanup_rbuf(struct sock *sk, int copied)
> >               tcp_mstamp_refresh(tp);
> >               tcp_send_ack(sk);
> >       }
> > +
> > +     tcp_bpf_rcvlowat(sk, NULL);
> >   }
> >
>
> tcp_read_skb (process frame 1 and __skb_unlink)
> └─ sk_psock_verdict_recv
>      └─ sk_psock_verdict_apply
>          └─ tcp_eat_skb
>              └─ tcp_cleanup_rbuf
>                  └─ __tcp_cleanup_rbuf
>                      └─ BPF RCVLOWAT_CB
>                          └─ bpf_sock_ops_tcp_set_rcvlowat (wakeup=true)
>                              └─ tcp_data_ready
>                                  └─ sk_psock_verdict_data_ready
>                                      └─ tcp_read_skb (frame 2)
>                                          └─ ... → tcp_read_skb (frame 3) ...
>
> For strparser it uses read_sock instead of read_skb and it will become
> more complicated...

To be clear, this feature is NOT to use strparser/sockmap.

>
> I think this will cause a stack overflow with large amounts of skbs in the
> receive queue, or infinite calls (not tested), for sockmap/kTLS/strparser.
>

BPF user is responsible for not doing silly things.

tcp_bpf_strp_read_sock() can have loop detection logic,
but it's only if really needed.

^ permalink raw reply

* [PATCH net] net: neigh: Reallocate headroom if necessary in neigh_hh_bridge()
From: Lorenzo Bianconi @ 2026-05-08 11:25 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	Nikolay Aleksandrov, Ido Schimmel, Bart De Schuymer,
	Patrick McHardy
  Cc: netdev, netfilter-devel, coreteam, bridge, Lorenzo Bianconi

neigh_hh_bridge() assumes the skb always has sufficient headroom to copy
the aligned L2 header. This assumption can trigger the crash reported
below using the following netfilter setup:

$modprobe br_netfilter
$sysctl -w net.bridge.bridge-nf-call-iptables=1

$root@OpenWrt:~# nft list ruleset
table ip nat {
        chain prerouting {
                type nat hook prerouting priority dstnat; policy accept;
                ip daddr 192.168.83.123 dnat to 192.168.83.120
        }
}

- iperf3 client (192.168.83.119) --> bridge (192.168.83.118) --> iperf3 server (192.168.83.120)

The iperf3 client is sending packets for 192.168.83.123 to the bridge device.

[ 1579.036575] Unable to handle kernel write to read-only memory at virtual address ffffff8004d76ffe
[ 1579.045482] Mem abort info:
[ 1579.048273]   ESR = 0x000000009600004f
[ 1579.052024]   EC = 0x25: DABT (current EL), IL = 32 bits
[ 1579.057363]   SET = 0, FnV = 0
[ 1579.060417]   EA = 0, S1PTW = 0
[ 1579.063550]   FSC = 0x0f: level 3 permission fault
[ 1579.068345] Data abort info:
[ 1579.071224]   ISV = 0, ISS = 0x0000004f, ISS2 = 0x00000000
[ 1579.076720]   CM = 0, WnR = 1, TnD = 0, TagAccess = 0
[ 1579.081770]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[ 1579.087092] swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000080dc4000
[ 1579.093794] [ffffff8004d76ffe] pgd=180000009ffff003, p4d=180000009ffff003, pud=180000009ffff003, pmd=180000009ffe3003, pte=0060000084d76787
[ 1579.106343] Internal error: Oops: 000000009600004f [#1] SMP
[ 1579.193824] CPU: 0 UID: 0 PID: 235 Comm: napi/qdma_eth-3 Tainted: G           O       6.12.57 #0
[ 1579.202614] Tainted: [O]=OOT_MODULE
[ 1579.206102] Hardware name: Airoha AN7581 Evaluation Board (DT)
[ 1579.211929] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 1579.218889] pc : br_nf_pre_routing_finish_bridge+0x1ac/0xcc8 [br_netfilter]
[ 1579.225859] lr : br_nf_pre_routing_finish_bridge+0x18c/0xcc8 [br_netfilter]
[ 1579.232822] sp : ffffffc0817cba20
[ 1579.236128] x29: ffffffc0817cba20 x28: 0000000000000000 x27: ffffff8002b89000
[ 1579.243273] x26: ffffff8004d7700e x25: 0000000000000008 x24: 0000000000000000
[ 1579.250416] x23: ffffffc08179d4c0 x22: 0000000000000000 x21: ffffffc08179d4c0
[ 1579.257561] x20: ffffff8004d9b800 x19: ffffff8015010000 x18: 0000000000000014
[ 1579.264704] x17: ffffffbf9e930000 x16: ffffffc0817c8000 x15: 0000000000000070
[ 1579.271848] x14: 0000000000000080 x13: 0000000000000001 x12: 0000000000000000
[ 1579.278993] x11: ffffffc0798caae0 x10: ffffff8014db6fd8 x9 : 0000000000000000
[ 1579.286136] x8 : 0000000000000003 x7 : ffffffc08171f628 x6 : 000000001a3b83d3
[ 1579.293281] x5 : 0000000000000000 x4 : 1beb76f22fee0000 x3 : ffffff8004d7700e
[ 1579.300425] x2 : 0000000000000000 x1 : ffffff8004d9b8bc x0 : ffffff80026ed000
[ 1579.307570] Call trace:
[ 1579.310018]  br_nf_pre_routing_finish_bridge+0x1ac/0xcc8 [br_netfilter]
[ 1579.316632]  br_nf_hook_thresh+0xd4/0x14bc [br_netfilter]
[ 1579.322032]  br_nf_hook_thresh+0x250/0x14bc [br_netfilter]
[ 1579.327517]  br_nf_hook_thresh+0x76c/0x14bc [br_netfilter]
[ 1579.333003]  br_handle_frame+0x180/0x480
[ 1579.336935]  __netif_receive_skb_core.constprop.0+0x540/0xf40
[ 1579.342682]  __netif_receive_skb_one_core+0x28/0x50
[ 1579.347561]  process_backlog+0x98/0x1e0
[ 1579.351398]  __napi_poll+0x34/0x1c4
[ 1579.354887]  net_rx_action+0x178/0x330
[ 1579.358638]  handle_softirqs+0x108/0x2d4
[ 1579.362560]  __do_softirq+0x10/0x18
[ 1579.366051]  ____do_softirq+0xc/0x20
[ 1579.369627]  call_on_irq_stack+0x30/0x4c
[ 1579.373550]  do_softirq_own_stack+0x18/0x20
[ 1579.377734]  do_softirq+0x4c/0x60
[ 1579.381050]  __local_bh_enable_ip+0x88/0x98
[ 1579.385234]  napi_threaded_poll_loop+0x188/0x21c
[ 1579.389853]  napi_threaded_poll+0x70/0x80
[ 1579.393863]  kthread+0xd8/0xdc
[ 1579.396918]  ret_from_fork+0x10/0x20
[ 1579.400499] Code: 88dffc22 3707ffc2 f9406663 f9406684 (f81f0064)
[ 1579.406589] ---[ end trace 0000000000000000 ]---
[ 1579.411209] Kernel panic - not syncing: Oops: Fatal exception in interrupt
[ 1579.418083] SMP: stopping secondary CPUs
[ 1579.422012] Kernel Offset: disabled

Fix the issue by reallocating the skb headroom, if necessary, in the
neigh_hh_bridge() routine.

Fixes: e179e6322ac33 ("netfilter: bridge-netfilter: Fix MAC header handling with IP DNAT")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/net/neighbour.h         | 15 +++++++++++----
 net/bridge/br_netfilter_hooks.c |  5 ++++-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 2dfee6d4258a..4e1222968753 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -487,16 +487,23 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 }
 
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
+static inline struct sk_buff *
+neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
 {
-	unsigned int seq, hh_alen;
+	unsigned int seq, hh_alen = HH_DATA_ALIGN(ETH_HLEN);
+
+	if (unlikely(skb_headroom(skb) < hh_alen)) {
+		skb = skb_expand_head(skb, hh_alen);
+		if (!skb)
+			return NULL;
+	}
 
 	do {
 		seq = read_seqbegin(&hh->hh_lock);
-		hh_alen = HH_DATA_ALIGN(ETH_HLEN);
 		memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN);
 	} while (read_seqretry(&hh->hh_lock, seq));
-	return 0;
+
+	return skb;
 }
 #endif
 
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 0ab1c94db4b9..6b59d7eb7906 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -297,7 +297,10 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_
 				goto free_skb;
 			}
 
-			neigh_hh_bridge(&neigh->hh, skb);
+			skb = neigh_hh_bridge(&neigh->hh, skb);
+			if (!skb)
+				return -ENOMEM;
+
 			skb->dev = br_indev;
 
 			ret = br_handle_frame_finish(net, sk, skb);

---
base-commit: fcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0
change-id: 20260508-nf-neigh_hh_bridge-fix-9ab775ee23c6

Best regards,
-- 
Lorenzo Bianconi <lorenzo@kernel.org>


^ permalink raw reply related

* RE: [PATCH iwl-next v11] ice: add support for unmanaged DPLL on E830 NIC
From: Kubalewski, Arkadiusz @ 2026-05-08 11:24 UTC (permalink / raw)
  To: Keller, Jacob E, intel-wired-lan@lists.osuosl.org
  Cc: netdev@vger.kernel.org, Nguyen, Anthony L, Kitszel, Przemyslaw,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	pmenzel@molgen.mpg.de, Loktionov, Aleksandr, horms@kernel.org,
	Nitka, Grzegorz, Grinberg, Vitaly, Fodor, Zoltan
In-Reply-To: <ca256097-1538-4c08-ba01-777bf646fc33@intel.com>

>From: Keller, Jacob E <jacob.e.keller@intel.com>
>Sent: Tuesday, May 5, 2026 12:31 AM
>
>On 2/17/2026 7:58 AM, Arkadiusz Kubalewski wrote:
>> Hardware variants of E830 may support an unmanaged DPLL where the
>> configuration is hardcoded within the hardware and firmware, meaning
>> users cannot modify settings. However, users are able to check the DPLL
>> lock status and obtain configuration information through the Linux DPLL
>> and devlink health subsystem.
>>
>> Availability of 'loss of lock' health status code determines if such
>> support is available, if true, register single DPLL device with 1 input
>> and 1 output and provide hardcoded/read only properties of a pin and
>> DPLL device. User is only allowed to check DPLL device status and
>> receive
>> notifications on DPLL lock status change.
>>
>> When present, the DPLL device locks to an external signal provided
>> through the PCIe/OCP pin. The expected input signal is 1PPS
>> (1 Pulse Per Second) embedded on a 10MHz reference clock.
>> The DPLL produces output:
>> - for MAC (Media Access Control) & PHY (Physical Layer) clocks,
>> - 1PPS for synchronization of onboard PHC (Precision Hardware Clock)
>> timer.
>>
>> Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
>> Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
>> Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>
>> Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>> ---
>> v11:
>> - rebase and fix conflicts
>
>
>Jakub rightfully pointed out the mistaken inclusion of HAVE_DPLL_ESYNC.
>

True, fixed in v12.

>> diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c
>> b/drivers/net/ethernet/intel/ice/ice_dpll.c
>> index 73a4e28ed75f..64518202dfff 100644
>> --- a/drivers/net/ethernet/intel/ice/ice_dpll.c
>> +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
>> @@ -2588,6 +2601,21 @@ static const struct dpll_pin_ops
>> ice_dpll_output_ops = {
>>  	.esync_get = ice_dpll_output_esync_get,
>>  };
>>
>> +static const struct dpll_pin_ops ice_dpll_input_unmanaged_ops = {
>> +	.frequency_get = ice_dpll_input_frequency_get,
>> +	.direction_get = ice_dpll_input_direction,
>> +	.state_on_dpll_get = ice_dpll_input_state_get,
>> +#if defined(HAVE_DPLL_ESYNC)
>> +	.esync_get = ice_dpll_input_esync_get,
>> +#endif /* HAVE_DPLL_ESYNC */
>> +};
>> +
>Please fix this, and remove any other remaining kcompat cruft as well.
>
>Also could you please review the following Sashiko reports and make sure
>there are no issues?
>

Sure thing.

>The netdev sashiko.dev bot says this:
>
>> commit 7aae4a54d3ef18d8694bc535b6e5959839c9adfd
>> Author: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>> ice: add support for unmanaged DPLL on E830 NIC
>> Hardware variants of E830 may support an unmanaged DPLL where the
>> configuration is hardcoded. This patch allows users to check the DPLL
>> lock status and obtain configuration information through the Linux DPLL
>> and devlink health subsystem.
>>> diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c
>>>b/drivers/net/ethernet/intel/ice/devlink/health.c
>>> --- a/drivers/net/ethernet/intel/ice/devlink/health.c
>>> +++ b/drivers/net/ethernet/intel/ice/devlink/health.c
>> [ ... ]
>>> @@ -242,6 +244,8 @@ void ice_process_health_status_event(struct ice_pf
>>>*pf, struct ice_rq_event_info
>>>  				pf->health_reporters.fw_status = *health_info;
>>>  				devlink_health_report(pf->health_reporters.fw,
>>>  						      "FW syndrome reported", NULL);
>>> +				if (status_code ==
>>> ICE_AQC_HEALTH_STATUS_INFO_LOSS_OF_LOCK)
>>> +					ice_dpll_lock_state_set_unmanaged(pf,
>> health_info, true);
>> Is it possible for this health event to arrive when the DPLL subsystem
>> is not
>> fully initialized or already torn down?
>> Looking at ice_dpll_lock_state_set_unmanaged(), it unconditionally
>> acquires
>> pf->dplls.lock:
>> void ice_dpll_lock_state_set_unmanaged(...)
>> {
>>     ...
>> 	mutex_lock(&pf->dplls.lock);
>>     ...
>> }
>> If the hardware doesn't support DPLL, ice_dpll_init() is never called,
>> leaving
>> the mutex uninitialized. Additionally, if ice_dpll_init_unmanaged()
>> fails,
>> it destroys the mutex and exits.
>> During driver unload, ice_dpll_deinit() destroys the mutex and frees the
>> DPLL structures, but the service task processing these admin queue
>> events
>> might not be stopped until later in the teardown sequence.
>> Could this lead to a kernel panic from locking an uninitialized or
>> destroyed
>> mutex, or a use-after-free when passing the d->dpll pointer to
>> dpll_device_change_ntf()?
>> Should there be a check ensuring DPLL is active (e.g. by checking
>> test_bit(ICE_FLAG_DPLL, pf->flags)) before handling the event?
>
>
>My own execution using claude had this to say:
>

Sure, will fix in v12,
except for the last finding.

>> Now I have everything. Let me compose the report.
>> commit d9b481c1d5d8be9dbbca36f40be64db91eb12caa
>> Author: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
>> ice: add support for unmanaged DPLL on E830 NIC
>> This patch adds support for an unmanaged DPLL on E830 NIC variants,
>> where the DPLL configuration is hardcoded in hardware/firmware.  It
>> registers a single PPS DPLL device with one input and one output,
>> reporting lock status via health status events and the DPLL subsystem.
>> A few things stood out during review.
>>> diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c
>>> b/drivers/net/ethernet/intel/ice/devlink/health.c
>>> index 8e9a8a8178d4..31e6c5107c97 100644
>>> --- a/drivers/net/ethernet/intel/ice/devlink/health.c
>>> +++ b/drivers/net/ethernet/intel/ice/devlink/health.c
>> [ ... ]
>>> @@ -242,6 +244,8 @@ void ice_process_health_status_event(struct ice_pf
>>> *pf, struct ice_rq_event_info
>>> 				pf->health_reporters.fw_status = *health_info;
>>> 				devlink_health_report(pf->health_reporters.fw,
>>> 						      "FW syndrome reported", NULL);
>>>+				if (status_code ==
>>>ICE_AQC_HEALTH_STATUS_INFO_LOSS_OF_LOCK)
>>>+					ice_dpll_lock_state_set_unmanaged(pf,
>>>health_info, true);
>>> 				break;
>> This calls ice_dpll_lock_state_set_unmanaged() for every PF that
>> receives a 0x601 health event, with no check for pf->dplls.unmanaged
>> or test_bit(ICE_FLAG_DPLL, pf->flags).
>> The only guard inside ice_dpll_lock_state_set_unmanaged() is
>> ice_pf_src_tmr_owned(), which checks PTP timer ownership -- not
>> whether the DPLL subsystem was initialized.
>> Consider a device where the firmware supports health code 0x601 but
>> the ZL80640 CGU is not in the netlist.  In ice_init_features():
>>>+	err = ice_is_health_status_code_supported(&pf->hw, code,
>>>+						  &pf->dplls.unmanaged);
>>>+	if (err || !ice_is_unmanaged_cgu_in_netlist(&pf->hw))
>>>+		pf->dplls.unmanaged = false;
>> This sets pf->dplls.unmanaged = false, so ice_dpll_init() is never
>> called and pf->dplls.lock is never initialized via mutex_init().
>> But the firmware can still send the health event, and then:
>>>+void ice_dpll_lock_state_set_unmanaged(struct ice_pf *pf,
>>>+				       const struct ice_aqc_health_status_elem
>>>*buff,
>>>+				       bool notify)
>>>+{
>> [ ... ]
>>>+	mutex_lock(&pf->dplls.lock);
>> Can this mutex_lock() on an uninitialized mutex cause undefined
>> behavior?  With CONFIG_DEBUG_MUTEXES this would trigger a BUG().
>> Also, on managed DPLL devices (e.g. E810 with CGU), the DPLL worker
>> thread maintains pf->dplls.pps.dpll_state via ice_dpll_update_state().
>> Would this unconditional call overwrite the managed state machine?
>>> diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c
>>>b/drivers/net/ethernet/intel/ice/ice_dpll.c
>>> index 3eea83b0cdc9..00704dfeb3b2 100644
>>> --- a/drivers/net/ethernet/intel/ice/ice_dpll.c
>>> +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
>> [ ... ]
>>>+static const struct dpll_pin_ops ice_dpll_input_unmanaged_ops = {
>>>+	.frequency_get = ice_dpll_input_frequency_get,
>>>+	.direction_get = ice_dpll_input_direction,
>>>+	.state_on_dpll_get = ice_dpll_input_state_get,
>>>+#if defined(HAVE_DPLL_ESYNC)
>>>+	.esync_get = ice_dpll_input_esync_get,
>>>+#endif /* HAVE_DPLL_ESYNC */
>>>+};
>> HAVE_DPLL_ESYNC does not appear to be defined anywhere in the upstream
>> kernel tree -- it looks like an out-of-tree compatibility macro.  In
>> upstream builds this means .esync_get is always NULL for unmanaged
>> input pins.
>> The managed ice_dpll_input_ops sets .esync_get unconditionally, and
>> ice_dpll_input_esync_get() already has explicit unmanaged-specific
>> logic added by this patch.  The init code also unconditionally sets
>> ICE_AQC_GET_CGU_IN_CFG_STATUS_ESYNC_CAP and ICE_DPLL_IN_ESYNC_ENABLED
>> on unmanaged input pins.
>> The documentation example at the end of the patch shows
>> esync-frequency and esync-pulse in the pin dump, but those fields
>> will never appear in an upstream build since the callback is compiled
>> out.  Should the ifdef be removed so this matches ice_dpll_input_ops?
>> [ ... ]
>>> @@ -3153,12 +3181,15 @@ ice_dpll_init_direct_pins(struct ice_pf *pf,
>>> bool cgu,
>>>  	int ret;
>>>
>>>  	ret = ice_dpll_get_pins(pf, pins, start_idx, count, pf-
>>> dplls.clock_id);
>>>- 	if (ret)
>>>+	if (!cgu || ret)
>>>  		return ret;
>>>- 	if (cgu) {
>>>+
>>>+	if (first) {
>>>  		ret = ice_dpll_register_pins(first, pins, ops, count);
>>>  		if (ret)
>>>  			goto release_pins;
>>>+	}
>>>+	if (second) {
>>>  		ret = ice_dpll_register_pins(second, pins, ops, count);
>>>  		if (ret)
>>>  			goto unregister_first;
>> ice_dpll_init_direct_pins() was updated with NULL guards for first
>> and second.  However, ice_dpll_deinit_direct_pins() was not:
>> ice_dpll_deinit_direct_pins() {
>>     if (cgu) {
>>         ice_dpll_unregister_pins(first, pins, ops, count);
>>         ice_dpll_unregister_pins(second, pins, ops, count);
>>     }
>>     ...
>> }
>> For the unmanaged case, ice_dpll_init_unmanaged() only creates the
>> PPS dpll -- eec.dpll stays NULL.  If output pin init fails inside
>> ice_dpll_init_pins(pf, true), the error path falls through to
>> deinit_inputs which calls:
>>     ice_dpll_deinit_direct_pins(pf, cgu=true, ...,
>>                                 pf->dplls.pps.dpll,
>>                                 pf->dplls.eec.dpll);
>>                                               ^^^^
>>                                               NULL
>> This passes NULL as second to ice_dpll_unregister_pins(), which
>> calls dpll_pin_unregister(NULL, ...), dereferencing the NULL dpll
>> pointer.  Would it make sense to add the same NULL guards to
>> ice_dpll_deinit_direct_pins() that were added to the init path?
>> [ ... ]
>>> @@ -3923,6 +4001,15 @@ ice_dpll_init_info_direct_pins(struct ice_pf
>>> *pf,
>>>  		pins[i].prop.board_label = ice_cgu_get_pin_name(hw, i, input);
>>>  		pins[i].prop.type = ice_cgu_get_pin_type(hw, i, input);
>>>  		if (input) {
>>>+			if (pf->dplls.unmanaged) {
>>>+				pins[i].freq = ice_dpll_pin_freq_info(hw, i,
>>>+								      input);
>>>+				pins[i].state[0] = DPLL_PIN_STATE_CONNECTED;
>>>+				pins[i].status =
>>>+					ICE_AQC_GET_CGU_IN_CFG_STATUS_ESYNC_CAP;
>>>+				pins[i].flags[0] = ICE_DPLL_IN_ESYNC_ENABLED;
>>>+				continue;
>>>+			}
>> [ ... ]
>>>  		} else {
>>>+			if (pf->dplls.unmanaged) {
>>>+				pins[i].freq = ice_dpll_pin_freq_info(hw, i,
>>>+								      input);
>>>+				pins[i].state[0] = DPLL_PIN_STATE_CONNECTED;
>>>+				continue;
>>>+			}
>> [ ... ]
>>>  		pins[i].prop.freq_supported =
>>>  			ice_cgu_get_pin_freq_supp(hw, i, input, &freq_supp_num);
>>>  		pins[i].prop.freq_supported_num = freq_supp_num;
>>>  		pins[i].pf = pf;
>>>  	}
>> Both unmanaged branches use continue, which skips the assignments of
>> pins[i].prop.freq_supported, pins[i].prop.freq_supported_num, and
>> pins[i].pf at the bottom of the loop.
>> The missing freq_supported / freq_supported_num means the DPLL netlink
>> code cannot report supported frequency ranges to userspace for
>> unmanaged pins, even though the data is available from
>> ice_cgu_get_pin_freq_supp().
>> The missing pins[i].pf = pf assignment leaves the pointer NULL (from
>> kcalloc zeroing).  Current unmanaged callbacks use d->pf from
>> dpll_priv so there is no immediate crash, but if any callback that
>> dereferences p->pf were added to the unmanaged ops in the future it
>> would hit a NULL pointer dereference.  Was the continue intended to
>> skip all of these assignments?
>

The last one is intended behavior, added a comment in v12.

Thank you!
Arkadiusz

^ permalink raw reply

* Re: [PATCH v4 net 3/3] net: hns3: fix CWR handling in drivers to preserve ACE signal
From: Jijie Shao @ 2026-05-08 11:22 UTC (permalink / raw)
  To: Chia-Yu Chang (Nokia), linyunsheng@huawei.com,
	andrew+netdev@lunn.ch, parav@nvidia.com, jasowang@redhat.com,
	mst@redhat.com, shenjian15@huawei.com, salil.mehta@huawei.com,
	saeedm@nvidia.com, tariqt@nvidia.com, mbloch@nvidia.com,
	leonro@nvidia.com, linux-rdma@vger.kernel.org,
	netdev@vger.kernel.org, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, horms@kernel.org,
	ij@kernel.org, ncardwell@google.com, Koen De Schepper (Nokia),
	g.white@cablelabs.com, ingemar.s.johansson@ericsson.com,
	mirja.kuehlewind@ericsson.com, cheshire@apple.com, rs.ietf@gmx.at,
	Jason_Livingood@comcast.com, vidhi_goel@apple.com
  Cc: shaojijie
In-Reply-To: <PAXPR07MB7984A31018E9B85DEC28B68CA3282@PAXPR07MB7984.eurprd07.prod.outlook.com>


on 2026/4/25 22:30, Chia-Yu Chang (Nokia) wrote:
>> -----Original Message-----
>> From: Jijie Shao <shaojijie@huawei.com>
>> Sent: Saturday, April 25, 2026 11:35 AM
>> To: Chia-Yu Chang (Nokia) <chia-yu.chang@nokia-bell-labs.com>; linyunsheng@huawei.com; andrew+netdev@lunn.ch; parav@nvidia.com; jasowang@redhat.com; mst@redhat.com; shenjian15@huawei.com; salil.mehta@huawei.com; saeedm@nvidia.com; tariqt@nvidia.com; mbloch@nvidia.com; leonro@nvidia.com; linux-rdma@vger.kernel.org; netdev@vger.kernel.org; davem@davemloft.net; edumazet@google.com; kuba@kernel.org; pabeni@redhat.com; horms@kernel.org; ij@kernel.org; ncardwell@google.com; Koen De Schepper (Nokia) <koen.de_schepper@nokia-bell-labs.com>; g.white@cablelabs.com; ingemar.s.johansson@ericsson.com; mirja.kuehlewind@ericsson.com; cheshire@apple.com; rs.ietf@gmx.at; Jason_Livingood@comcast.com; vidhi_goel@apple.com
>> Cc: shaojijie@huawei.com
>> Subject: Re: [PATCH v4 net 3/3] net: hns3: fix CWR handling in drivers to preserve ACE signal
>>
>>
>> on 2026/4/17 23:26, chia-yu.chang@nokia-bell-labs.com wrote:
>>> From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
>>>
>>> Currently, hns3 Rx paths use SKB_GSO_TCP_ECN flag when a TCP segment
>>> with the CWR flag set. This is wrong because SKB_GSO_TCP_ECN is only
>>> valid for RFC3168 ECN on Tx, and using it on Rx allows RFC3168 ECN
>>> offload to clear the CWR flag. As a result, incoming TCP segments lose
>>> their ACE signal integrity required for AccECN (RFC9768), especially
>>> when the packet is forwarded and later re-segmented by GSO.
>>>
>>> Fix this by setting SKB_GSO_TCP_ACCECN for any Rx segment with the CWR
>>> flag set. SKB_GSO_TCP_ACCECN ensure that RFC3168 ECN offload will not
>>> clear the CWR flag, therefore preserving the ACE signal.
>>>
>>> Fixes: d474d88f88261 ("net: hns3: add hns3_gro_complete for HW GRO
>>> process")
>>> Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
>>> ---
>>>    drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +-
>>>    1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>>> b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>>> index a3206c97923e..e1b0dba56182 100644
>>> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>>> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>>> @@ -3904,7 +3904,7 @@ static int hns3_gro_complete(struct sk_buff
>>> *skb, u32 l234info)
>>>    
>>>    	skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
>>>    	if (th->cwr)
>>> -		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
>>> +		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
>>>    
>>>    	if (l234info & BIT(HNS3_RXD_GRO_FIXID_B))
>>>    		skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID;
>> I agree with Paolo's previous point;
>> for already released hardware, it is indeed not suitable to modify it.
>> During the hardware aggregation process, the ACE signal may have already been lost.
>>
>> Jijie Shao
> Hi Jijie,
>
> I would disagree with not fixing on released hardware. (Did Paolo explicit mention that?)
> The ACCECN protocol is based on ACE signal, and a broken ACE signal might be due to SKB_GSO_TCP_ECN at the RX path.
> You can see the explicit explanations and examples in the commit message.
> There is already a fix in patch 4e4f7cefb130af6aba6a393b2d13930b49390df9 for tcp_gro_receive() of tcp_offload.c
>
> And In this patch series, we would like to propose the similar fix on hns3 and mlx5e.
> While one main issue is to confirm is how the GRO is done in the corresponding HW-GRO.
> And if the driver can be safely changed from SKB_GSO_TCP_ECN to SKB_GSO_TCP_ACCECN, then we can ensure ECN and AccECN can be supported over existing hardware.

Sorry for the late reply.

It is confirmed that ACC_ECN is not supported.
HW-GRO will set the TOS field to 0.

Jijie Shao



^ permalink raw reply

* ipv6: ip6mr: Call ip6mr_fib_lookup() under RCU in pim6_rcv() and reg_vif6_xmit()
From: y2k @ 2026-05-08 11:21 UTC (permalink / raw)
  To: dsahern; +Cc: idosch, edumazet, kuba, pabeni, davem, netdev, linux-kernel

Commit 019c892e4654 ("ipmr: Call ipmr_fib_lookup() under RCU.") fixed
the same issue in IPv4's reg_vif_xmit(). The IPv6 counterpart has the
same problem in two places.

In pim6_rcv() (net/ipv6/ip6mr.c:578) and reg_vif6_xmit()
(net/ipv6/ip6mr.c:624), ip6mr_fib_lookup() is called without holding
rcu_read_lock().

When CONFIG_IP6_MROUTE_MULTIPLE_TABLES=n, ip6mr_fib_lookup() accesses
net->ipv6.mrt6 directly without rcu_dereference(), while the IPv4
equivalent correctly uses rcu_dereference(net->ipv4.mrt). This
inconsistency means IPv6 multicast routing lacks proper RCU protection.

In reg_vif6_xmit(), rcu_read_lock() is acquired at line 628 after the
ip6mr_fib_lookup() call at line 624 — too late. In pim6_rcv(), there
is no rcu_read_lock() before ip6mr_fib_lookup() at line 578 at all.

Suggested fix for reg_vif6_xmit():

  + rcu_read_lock();
    if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) {
  +   rcu_read_unlock();
      goto tx_err;
    }
    DEV_STATS_ADD(dev, tx_bytes, skb->len);
    DEV_STATS_INC(dev, tx_packets);
  - rcu_read_lock();
    ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
                       MRT6MSG_WHOLEPKT);
    rcu_read_unlock();

Suggested fix for pim6_rcv():

  + rcu_read_lock();
    if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) {
  +   rcu_read_unlock();
      goto drop;
    }

Additionally, net->ipv6.mrt6 should be accessed via rcu_dereference()
in ip6mr_fib_lookup() to match the IPv4 pattern in ipmr_fib_lookup().

Thanks,
y2k
y2k@desarrollaria.com

^ permalink raw reply

* Re: [PATCH 6.12] block: fix memory leak in in bio_map_user_iov()
From: Fedor Pchelkin @ 2026-05-08 11:16 UTC (permalink / raw)
  To: Dmitry Antipov
  Cc: Greg Kroah-Hartman, stable, Jens Axboe, linux-block,
	Christoph Hellwig, lvc-project, netdev
In-Reply-To: <5bd98789901e6bcd2b41d646209deb6e48ffb711.camel@yandex.ru>

On Fri, 08. May 11:30, Dmitry Antipov wrote:
> On Thu, 2026-05-07 at 21:52 +0300, Fedor Pchelkin wrote:
> 
> > In some form the issue is present in current upstream as well.  For
> > example, there is another callsite of iov_iter_extract_pages() in
> > block/bio-integrity.c where the same pattern still persists. 
> 
> Good point, and skb_splice_from_iter() looks suspicious as well:
> 
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 7dad68e3b518..bf053372acb2 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -7343,12 +7343,16 @@ ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
>  
>                 len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);

This function does allocate memory for @pages argument only if *@pages is
NULL.  I don't think it's NULL here, *@pages points to a stack-allocated
array.

>                 if (len <= 0) {
> +                       /* Possible memory leak - ppages should be vfree()'d
> +                          if reallocated (ppages != pages)? */
>                         ret = len ?: -EIO;
>                         break;
>                 }
>  
>                 i = 0;
>                 do {
> +                       /* This looks wrong if reallocated - ppages[i++]
> +                          should be used instead? */
>                         struct page *page = pages[i++];
>                         size_t part = min_t(size_t, PAGE_SIZE - off, len);
> 
> This issue likely crosses the boundaries of block subsystem so netdev
> people are encouraged to look as well.

Not in this case.  The situations where iov_iter_extract_pages() needs to
allocate memory for @pages on its own happen when *@pages is NULL.  In
current mainline it can occur at block/bio-integrity.c and probably
that's all.

Would you mind preparing a patch, please?  There is a better chance of
discussing the problem with a patch at hand than of someone happening
to look at this [PATCH 6.12] thread.

^ permalink raw reply

* [PATCH iwl-next v12] ice: add support for unmanaged DPLL on E830 NIC
From: Arkadiusz Kubalewski @ 2026-05-08 11:08 UTC (permalink / raw)
  To: intel-wired-lan
  Cc: netdev, anthony.l.nguyen, przemyslaw.kitszel, linux-doc,
	linux-kernel, pmenzel, aleksandr.loktionov, horms, grzegorz.nitka,
	vgrinber, zoltan.fodor, Arkadiusz Kubalewski

Hardware variants of E830 may support an unmanaged DPLL where the
configuration is hardcoded within the hardware and firmware, meaning
users cannot modify settings. However, users are able to check the DPLL
lock status and obtain configuration information through the Linux DPLL
and devlink health subsystem.

Availability of the 'loss of lock' health status code determines whether
such support is available. If it is, register a single DPLL device with
1 input and 1 output and provide hardcoded/read-only properties of a pin
and the DPLL device. The user is only allowed to check the DPLL device
status and receive notifications on DPLL lock status changes.

When present, the DPLL device locks to an external signal provided
through the PCIe/OCP pin. The expected input signal is 1PPS
(1 Pulse Per Second) embedded on a 10MHz reference clock.
The DPLL produces output:
- for MAC (Media Access Control) & PHY (Physical Layer) clocks,
- 1PPS for synchronization of onboard PHC (Precision Hardware Clock) timer.

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Grzegorz Nitka <grzegorz.nitka@intel.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
---
v12:
- remove HAVE_DPLL_ESYNC ifdef
- guard ice_dpll_lock_state_set_unmanaged() call in health event handler
  with test_bit(ICE_FLAG_DPLL, pf->flags) and pf->dplls.unmanaged
- add NULL guards for first/second dpll in ice_dpll_deinit_direct_pins()
- add comments explaining intentional continue in
  ice_dpll_init_info_direct_pins() for unmanaged pins
v11:
- rebase and fix conflicts
---
 .../device_drivers/ethernet/intel/ice.rst     |  83 +++++
 .../net/ethernet/intel/ice/devlink/health.c   |   6 +
 .../net/ethernet/intel/ice/ice_adminq_cmd.h   |  12 +
 drivers/net/ethernet/intel/ice/ice_common.c   | 136 ++++++++
 drivers/net/ethernet/intel/ice/ice_common.h   |   8 +
 drivers/net/ethernet/intel/ice/ice_dpll.c     | 314 ++++++++++++++++--
 drivers/net/ethernet/intel/ice/ice_dpll.h     |  10 +
 drivers/net/ethernet/intel/ice/ice_main.c     |  11 +-
 drivers/net/ethernet/intel/ice/ice_ptp_hw.c   |  46 +++
 drivers/net/ethernet/intel/ice/ice_ptp_hw.h   |   1 +
 10 files changed, 604 insertions(+), 23 deletions(-)

diff --git a/Documentation/networking/device_drivers/ethernet/intel/ice.rst b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
index 0bca293cf9cb..09877066b031 100644
--- a/Documentation/networking/device_drivers/ethernet/intel/ice.rst
+++ b/Documentation/networking/device_drivers/ethernet/intel/ice.rst
@@ -941,6 +941,89 @@ To see input signal on those PTP pins, you need to configure DPLL properly.
 Output signal is only visible on DPLL and to send it to the board SMA/U.FL pins,
 DPLL output pins have to be manually configured.
 
+Unmanaged DPLL Support
+----------------------
+Hardware variants of E830 may support an unmanaged DPLL:
+
+- Intel(R) Ethernet Network Adapter E830-XXVDA8F for OCP 3.0,
+
+- Intel(R) Ethernet Network Adapter E830-XXVDA4F.
+
+In the case of the unmanaged DPLL, the configuration is hardcoded within the
+hardware and firmware, meaning users cannot modify settings. However,
+users can check the DPLL lock status and obtain configuration information
+through the Linux DPLL subsystem.
+
+When present, the DPLL device locks to an external signal provided through the
+PCIe/OCP pin. The expected input signal is 1PPS (1 Pulse Per Second) embedded
+on a 10MHz reference clock.
+The DPLL produces output:
+
+- for MAC (Media Access Control) & PHY (Physical Layer) clocks,
+
+- 1PPS for synchronization of onboard PHC (Precision Hardware Clock) timer.
+
+Requirements: The Linux kernel must have support for both the DPLL Subsystem
+and the Embedded Sync patch series.
+
+Example output of querying the Linux DPLL subsystem can be found below.
+
+.. code-block:: console
+  :caption: Dumping the DPLL pins
+
+  $ <ynl> --spec Documentation/netlink/specs/dpll.yaml --dump pin-get
+  [{'board-label': '1588-TIME_SYNC',
+    'capabilities': set(),
+    'clock-id': 282574471561216,
+    'esync-frequency': 1,
+    'esync-frequency-supported': [{'frequency-max': 1, 'frequency-min': 1}],
+    'esync-pulse': 25,
+    'frequency': 10000000,
+    'id': 13,
+    'module-name': 'ice',
+    'parent-device': [{'direction': 'input',
+                       'parent-id': 6,
+                       'state': 'connected'}],
+    'phase-adjust-max': 0,
+    'phase-adjust-min': 0,
+    'type': 'ext'},
+  {'board-label': 'MAC-PHY-CLK',
+    'capabilities': set(),
+    'clock-id': 282574471561216,
+    'frequency': 156250000,
+    'id': 14,
+    'module-name': 'ice',
+    'parent-device': [{'direction': 'output',
+                       'parent-id': 6,
+                       'state': 'connected'}],
+    'phase-adjust-max': 0,
+    'phase-adjust-min': 0,
+    'type': 'synce-eth-port'},
+  {'board-label': '1588-TIME_REF',
+    'capabilities': set(),
+    'clock-id': 282574471561216,
+    'frequency': 1,
+    'id': 15,
+    'module-name': 'ice',
+    'parent-device': [{'direction': 'output',
+                       'parent-id': 6,
+                       'state': 'connected'}],
+    'phase-adjust-max': 0,
+    'phase-adjust-min': 0,
+    'type': 'int-oscillator'}]
+
+.. code-block:: console
+  :caption: Dumping the DPLL devices
+
+  $ <ynl> --spec Documentation/netlink/specs/dpll.yaml --dump device-get
+  [{'clock-id': 282574471561216,
+    'id': 6,
+    'lock-status': 'locked',
+    'mode': 'manual',
+    'mode-supported': ['manual'],
+    'module-name': 'ice',
+    'type': 'pps'}]
+
 GNSS module
 -----------
 Requires kernel compiled with CONFIG_GNSS=y or CONFIG_GNSS=m.
diff --git a/drivers/net/ethernet/intel/ice/devlink/health.c b/drivers/net/ethernet/intel/ice/devlink/health.c
index 8e9a8a8178d4..a83eb9f104c8 100644
--- a/drivers/net/ethernet/intel/ice/devlink/health.c
+++ b/drivers/net/ethernet/intel/ice/devlink/health.c
@@ -101,6 +101,8 @@ static const struct ice_health_status ice_health_status_lookup[] = {
 		"Supplied MIB file is invalid. DCB reverted to default configuration.",
 		"Disable FW-LLDP and check DCBx system configuration.",
 		{ice_port_number_label, "MIB ID"}},
+	{ICE_AQC_HEALTH_STATUS_INFO_LOSS_OF_LOCK, "Local DPLL lock status",
+		NULL,},
 };
 
 static int ice_health_status_lookup_compare(const void *a, const void *b)
@@ -242,6 +244,10 @@ void ice_process_health_status_event(struct ice_pf *pf, struct ice_rq_event_info
 				pf->health_reporters.fw_status = *health_info;
 				devlink_health_report(pf->health_reporters.fw,
 						      "FW syndrome reported", NULL);
+				if (status_code == ICE_AQC_HEALTH_STATUS_INFO_LOSS_OF_LOCK &&
+				    test_bit(ICE_FLAG_DPLL, pf->flags) &&
+				    pf->dplls.unmanaged)
+					ice_dpll_lock_state_set_unmanaged(pf, health_info, true);
 				break;
 			case ICE_AQC_HEALTH_STATUS_PF:
 			case ICE_AQC_HEALTH_STATUS_PORT:
diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
index eeffbcf9480d..07fc72da347c 100644
--- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h
@@ -1498,6 +1498,7 @@ struct ice_aqc_get_link_topo {
 #define ICE_AQC_GET_LINK_TOPO_NODE_NR_PCA9575		0x21
 #define ICE_AQC_GET_LINK_TOPO_NODE_NR_ZL30632_80032	0x24
 #define ICE_AQC_GET_LINK_TOPO_NODE_NR_SI5383_5384	0x25
+#define ICE_AQC_GET_LINK_TOPO_NODE_NR_ZL80640		0x27
 #define ICE_AQC_GET_LINK_TOPO_NODE_NR_E822_PHY		0x30
 #define ICE_AQC_GET_LINK_TOPO_NODE_NR_C827		0x31
 #define ICE_AQC_GET_LINK_TOPO_NODE_NR_GEN_CLK_MUX	0x47
@@ -2481,11 +2482,14 @@ enum ice_aqc_health_status {
 	ICE_AQC_HEALTH_STATUS_ERR_BMC_RESET			= 0x50B,
 	ICE_AQC_HEALTH_STATUS_ERR_LAST_MNG_FAIL			= 0x50C,
 	ICE_AQC_HEALTH_STATUS_ERR_RESOURCE_ALLOC_FAIL		= 0x50D,
+	ICE_AQC_HEALTH_STATUS_INFO_LOSS_OF_LOCK			= 0x601,
 	ICE_AQC_HEALTH_STATUS_ERR_FW_LOOP			= 0x1000,
 	ICE_AQC_HEALTH_STATUS_ERR_FW_PFR_FAIL			= 0x1001,
 	ICE_AQC_HEALTH_STATUS_ERR_LAST_FAIL_AQ			= 0x1002,
 };
 
+#define ICE_AQC_HEALTH_STATUS_CODE_NUM				64
+
 /* Get Health Status (indirect 0xFF22) */
 struct ice_aqc_get_health_status {
 	__le16 health_status_count;
@@ -2512,6 +2516,13 @@ struct ice_aqc_health_status_elem {
 	__le32 internal_data2;
 };
 
+/* Get Supported Health Status Codes response buffer entry (0xFF21),
+ * repeated per supported health status code
+ */
+struct ice_aqc_health_status_supp_elem {
+	__le16 health_status_code;
+};
+
 /* Admin Queue command opcodes */
 enum ice_adminq_opc {
 	/* AQ commands */
@@ -2675,6 +2686,7 @@ enum ice_adminq_opc {
 
 	/* System Diagnostic commands */
 	ice_aqc_opc_set_health_status_cfg		= 0xFF20,
+	ice_aqc_opc_get_supported_health_status_codes	= 0xFF21,
 	ice_aqc_opc_get_health_status			= 0xFF22,
 
 	/* FW Logging Commands */
diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c
index b617a6bff891..ef856d686f0a 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -3048,6 +3048,29 @@ bool ice_is_cgu_in_netlist(struct ice_hw *hw)
 	return false;
 }
 
+/**
+ * ice_is_unmanaged_cgu_in_netlist - check for unmanaged CGU presence
+ * @hw: pointer to the hw struct
+ *
+ * Check if the unmanaged Clock Generation Unit (CGU) device is present in the netlist.
+ * Save the CGU part number in the hw structure for later use.
+ * Return:
+ * * true - unmanaged cgu is present
+ * * false - unmanaged cgu is not present
+ */
+bool ice_is_unmanaged_cgu_in_netlist(struct ice_hw *hw)
+{
+	if (!ice_find_netlist_node(hw, ICE_AQC_LINK_TOPO_NODE_TYPE_CLK_CTRL,
+				   ICE_AQC_LINK_TOPO_NODE_CTX_GLOBAL,
+				   ICE_AQC_GET_LINK_TOPO_NODE_NR_ZL80640,
+				   NULL)) {
+		hw->cgu_part_number = ICE_AQC_GET_LINK_TOPO_NODE_NR_ZL80640;
+		return true;
+	}
+
+	return false;
+}
+
 /**
  * ice_is_gps_in_netlist
  * @hw: pointer to the hw struct
@@ -6310,6 +6333,119 @@ bool ice_is_fw_health_report_supported(struct ice_hw *hw)
 				     ICE_FW_API_HEALTH_REPORT_PATCH);
 }
 
+/**
+ * ice_aq_get_health_status_supported - get supported health status codes
+ * @hw: pointer to the HW struct
+ * @buff: pointer to buffer where health status elements will be stored
+ * @num: number of health status elements buffer can hold
+ *
+ * Return:
+ * * 0 - success,
+ * * negative - AQ error code.
+ */
+static int
+ice_aq_get_health_status_supported(struct ice_hw *hw,
+				   struct ice_aqc_health_status_supp_elem *buff,
+				   int num)
+{
+	u16 code = ice_aqc_opc_get_supported_health_status_codes;
+	struct libie_aq_desc desc;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, code);
+
+	return ice_aq_send_cmd(hw, &desc, buff, num * sizeof(*buff), NULL);
+}
+
+/**
+ * ice_aq_get_health_status - get current health status array from the firmware
+ * @hw: pointer to the HW struct
+ * @buff: pointer to buffer where health status elements will be stored
+ * @num: number of health status elements buffer can hold
+ *
+ * Return:
+ * * 0 - success,
+ * * negative - AQ error code.
+ */
+int ice_aq_get_health_status(struct ice_hw *hw,
+			     struct ice_aqc_health_status_elem *buff, int num)
+{
+	struct libie_aq_desc desc;
+
+	ice_fill_dflt_direct_cmd_desc(&desc,
+				      ice_aqc_opc_get_health_status);
+
+	return ice_aq_send_cmd(hw, &desc, buff, num * sizeof(*buff), NULL);
+}
+
+/**
+ * ice_is_health_status_code_supported - check if health status code is supported
+ * @hw: pointer to the hardware structure
+ * @code: health status code to check
+ * @supported: pointer to boolean result
+ *
+ * Return: 0 on success, negative error code otherwise
+ */
+int ice_is_health_status_code_supported(struct ice_hw *hw, u16 code,
+					bool *supported)
+{
+	const int BUFF_SIZE = ICE_AQC_HEALTH_STATUS_CODE_NUM;
+	struct ice_aqc_health_status_supp_elem *buff;
+	int ret;
+
+	*supported = false;
+	buff = kzalloc_objs(*buff, BUFF_SIZE);
+	if (!buff)
+		return -ENOMEM;
+	ret = ice_aq_get_health_status_supported(hw, buff, BUFF_SIZE);
+	if (ret)
+		goto free_buff;
+	for (int i = 0; i < BUFF_SIZE && buff[i].health_status_code; i++)
+		if (le16_to_cpu(buff[i].health_status_code) == code) {
+			*supported = true;
+			break;
+		}
+
+free_buff:
+	kfree(buff);
+	return ret;
+}
+
+/**
+ * ice_get_last_health_status_code - get last health status for given code
+ * @hw: pointer to the hardware structure
+ * @out: pointer to the health status struct to be filled
+ * @code: health status code to check
+ *
+ * Return: 0 on success, negative error code otherwise
+ */
+int ice_get_last_health_status_code(struct ice_hw *hw,
+				    struct ice_aqc_health_status_elem *out,
+				    u16 code)
+{
+	const int BUFF_SIZE = ICE_AQC_HEALTH_STATUS_CODE_NUM;
+	struct ice_aqc_health_status_elem *buff;
+	int ret, last_status = -1;
+
+	buff = kzalloc_objs(*buff, BUFF_SIZE);
+	if (!buff)
+		return -ENOMEM;
+	ret = ice_aq_get_health_status(hw, buff, BUFF_SIZE);
+	if (ret)
+		goto free_buff;
+	for (int i = 0; i < BUFF_SIZE && buff[i].health_status_code; i++)
+		if (le16_to_cpu(buff[i].health_status_code) == code)
+			last_status = i;
+
+	if (last_status >= 0)
+		memcpy(out, &buff[last_status], sizeof(*out));
+	else
+		memset(out, 0, sizeof(*out));
+
+free_buff:
+	kfree(buff);
+	return ret;
+}
+
 /**
  * ice_aq_set_health_status_cfg - Configure FW health events
  * @hw: pointer to the HW struct
diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h
index ff6393e9be0c..ebced9edd5e3 100644
--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -162,6 +162,7 @@ ice_aq_get_phy_caps(struct ice_port_info *pi, bool qual_mods, u8 report_mode,
 bool ice_is_phy_rclk_in_netlist(struct ice_hw *hw);
 bool ice_is_clock_mux_in_netlist(struct ice_hw *hw);
 bool ice_is_cgu_in_netlist(struct ice_hw *hw);
+bool ice_is_unmanaged_cgu_in_netlist(struct ice_hw *hw);
 bool ice_is_gps_in_netlist(struct ice_hw *hw);
 int
 ice_aq_get_netlist_node(struct ice_hw *hw, struct ice_aqc_get_link_topo *cmd,
@@ -188,6 +189,13 @@ ice_get_link_default_override(struct ice_link_default_override_tlv *ldo,
 			      struct ice_port_info *pi);
 bool ice_is_phy_caps_an_enabled(struct ice_aqc_get_phy_caps_data *caps);
 bool ice_is_fw_health_report_supported(struct ice_hw *hw);
+int ice_aq_get_health_status(struct ice_hw *hw,
+			     struct ice_aqc_health_status_elem *buff, int num);
+int ice_is_health_status_code_supported(struct ice_hw *hw, u16 code,
+					bool *supported);
+int ice_get_last_health_status_code(struct ice_hw *hw,
+				    struct ice_aqc_health_status_elem *out,
+				    u16 code);
 int ice_aq_set_health_status_cfg(struct ice_hw *hw, u8 event_source);
 int ice_aq_get_phy_equalization(struct ice_hw *hw, u16 data_in, u16 op_code,
 				u8 serdes_num, int *output);
diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c
index b9c7df50123d..afe8a1480014 100644
--- a/drivers/net/ethernet/intel/ice/ice_dpll.c
+++ b/drivers/net/ethernet/intel/ice/ice_dpll.c
@@ -18,6 +18,8 @@
 #define ICE_DPLL_SW_PIN_INPUT_BASE_SFP		4
 #define ICE_DPLL_SW_PIN_INPUT_BASE_QSFP		6
 #define ICE_DPLL_SW_PIN_OUTPUT_BASE		0
+#define ICE_DPLL_HEALTH_STATUS_LOCKED		1
+#define ICE_DPLL_HEALTH_STATUS_UNLOCKED		0
 
 #define ICE_DPLL_PIN_SW_INPUT_ABS(in_idx) \
 	(ICE_DPLL_SW_PIN_INPUT_BASE_SFP + (in_idx))
@@ -80,6 +82,10 @@ static const struct dpll_pin_frequency ice_esync_range[] = {
 	DPLL_PIN_FREQUENCY_RANGE(0, DPLL_PIN_FREQUENCY_1_HZ),
 };
 
+static const struct dpll_pin_frequency ice_esync_range_unmanaged[] = {
+	DPLL_PIN_FREQUENCY_1PPS,
+};
+
 /**
  * ice_dpll_is_sw_pin - check if given pin shall be controlled by SW
  * @pf: private board structure
@@ -1089,9 +1095,11 @@ ice_dpll_pin_state_get(const struct dpll_pin *pin, void *pin_priv,
 		return -EBUSY;
 
 	mutex_lock(&pf->dplls.lock);
-	ret = ice_dpll_pin_state_update(pf, p, pin_type, extack);
-	if (ret)
-		goto unlock;
+	if (!pf->dplls.unmanaged) {
+		ret = ice_dpll_pin_state_update(pf, p, pin_type, extack);
+		if (ret)
+			goto unlock;
+	}
 	if (pin_type == ICE_DPLL_PIN_TYPE_INPUT ||
 	    pin_type == ICE_DPLL_PIN_TYPE_OUTPUT)
 		*state = p->state[d->dpll_idx];
@@ -2234,9 +2242,14 @@ ice_dpll_input_esync_get(const struct dpll_pin *pin, void *pin_priv,
 		mutex_unlock(&pf->dplls.lock);
 		return -EOPNOTSUPP;
 	}
-	esync->range = ice_esync_range;
-	esync->range_num = ARRAY_SIZE(ice_esync_range);
-	if (p->flags[0] & ICE_AQC_GET_CGU_IN_CFG_FLG2_ESYNC_EN) {
+	if (pf->dplls.unmanaged) {
+		esync->range = ice_esync_range_unmanaged;
+		esync->range_num = ARRAY_SIZE(ice_esync_range_unmanaged);
+	} else {
+		esync->range = ice_esync_range;
+		esync->range_num = ARRAY_SIZE(ice_esync_range);
+	}
+	if (p->flags[0] & ICE_DPLL_IN_ESYNC_ENABLED) {
 		esync->freq = DPLL_PIN_FREQUENCY_1_HZ;
 		esync->pulse = ICE_DPLL_PIN_ESYNC_PULSE_HIGH_PERCENT;
 	} else {
@@ -2671,6 +2684,19 @@ static const struct dpll_pin_ops ice_dpll_output_ops = {
 	.esync_get = ice_dpll_output_esync_get,
 };
 
+static const struct dpll_pin_ops ice_dpll_input_unmanaged_ops = {
+	.frequency_get = ice_dpll_input_frequency_get,
+	.direction_get = ice_dpll_input_direction,
+	.state_on_dpll_get = ice_dpll_input_state_get,
+	.esync_get = ice_dpll_input_esync_get,
+};
+
+static const struct dpll_pin_ops ice_dpll_output_unmanaged_ops = {
+	.frequency_get = ice_dpll_output_frequency_get,
+	.direction_get = ice_dpll_output_direction,
+	.state_on_dpll_get = ice_dpll_output_state_get,
+};
+
 static const struct dpll_device_ops ice_dpll_ops = {
 	.lock_status_get = ice_dpll_lock_status_get,
 	.mode_get = ice_dpll_mode_get,
@@ -3225,8 +3251,10 @@ ice_dpll_deinit_direct_pins(struct ice_pf *pf, bool cgu,
 			    struct dpll_device *second)
 {
 	if (cgu) {
-		ice_dpll_unregister_pins(first, pins, ops, count);
-		ice_dpll_unregister_pins(second, pins, ops, count);
+		if (first)
+			ice_dpll_unregister_pins(first, pins, ops, count);
+		if (second)
+			ice_dpll_unregister_pins(second, pins, ops, count);
 	}
 	ice_dpll_release_pins(pins, count);
 }
@@ -3258,12 +3286,15 @@ ice_dpll_init_direct_pins(struct ice_pf *pf, bool cgu,
 	int ret;
 
 	ret = ice_dpll_get_pins(pf, pins, start_idx, count, pf->dplls.clock_id);
-	if (ret)
+	if (!cgu || ret)
 		return ret;
-	if (cgu) {
+
+	if (first) {
 		ret = ice_dpll_register_pins(first, pins, ops, count);
 		if (ret)
 			goto release_pins;
+	}
+	if (second) {
 		ret = ice_dpll_register_pins(second, pins, ops, count);
 		if (ret)
 			goto unregister_first;
@@ -3272,7 +3303,8 @@ ice_dpll_init_direct_pins(struct ice_pf *pf, bool cgu,
 	return 0;
 
 unregister_first:
-	ice_dpll_unregister_pins(first, pins, ops, count);
+	if (first)
+		ice_dpll_unregister_pins(first, pins, ops, count);
 release_pins:
 	ice_dpll_release_pins(pins, count);
 	return ret;
@@ -3529,6 +3561,18 @@ static void ice_dpll_deinit_pins(struct ice_pf *pf, bool cgu)
 	struct ice_dpll *de = &d->eec;
 	struct ice_dpll *dp = &d->pps;
 
+	if (d->unmanaged) {
+		ice_dpll_unregister_pins(dp->dpll, inputs,
+					 &ice_dpll_input_unmanaged_ops,
+					 num_inputs);
+		ice_dpll_unregister_pins(dp->dpll, outputs,
+					 &ice_dpll_output_unmanaged_ops,
+					 num_outputs);
+		ice_dpll_release_pins(inputs, num_inputs);
+		ice_dpll_release_pins(outputs, num_outputs);
+		return;
+	}
+
 	ice_dpll_deinit_rclk_pin(pf);
 	if (pf->hw.mac_type == ICE_MAC_GENERIC_3K_E825)
 		ice_dpll_deinit_fwnode_pins(pf, pf->dplls.inputs, 0);
@@ -3713,23 +3757,29 @@ static int ice_dpll_init_pins(struct ice_pf *pf, bool cgu)
 	const struct dpll_pin_ops *input_ops;
 	int ret, count;
 
-	input_ops = &ice_dpll_input_ops;
-	output_ops = &ice_dpll_output_ops;
+	if (!pf->dplls.unmanaged) {
+		input_ops = &ice_dpll_input_ops;
+		output_ops = &ice_dpll_output_ops;
+	} else {
+		input_ops = &ice_dpll_input_unmanaged_ops;
+		output_ops = &ice_dpll_output_unmanaged_ops;
+	}
 
 	ret = ice_dpll_init_direct_pins(pf, cgu, pf->dplls.inputs, 0,
 					pf->dplls.num_inputs, input_ops,
-					pf->dplls.eec.dpll,
-					pf->dplls.pps.dpll);
+					pf->dplls.eec.dpll, pf->dplls.pps.dpll);
 	if (ret)
 		return ret;
 	count = pf->dplls.num_inputs;
-	if (cgu) {
+	if (cgu || pf->dplls.unmanaged) {
 		ret = ice_dpll_init_direct_pins(pf, cgu, pf->dplls.outputs,
 						count, pf->dplls.num_outputs,
 						output_ops, pf->dplls.eec.dpll,
 						pf->dplls.pps.dpll);
 		if (ret)
 			goto deinit_inputs;
+		if (pf->dplls.unmanaged)
+			return 0;
 		count += pf->dplls.num_outputs;
 		if (!pf->dplls.generic) {
 			ret = ice_dpll_init_direct_pins(pf, cgu, pf->dplls.sma,
@@ -3842,7 +3892,8 @@ ice_dpll_init_dpll(struct ice_pf *pf, struct ice_dpll *d, bool cgu,
 
 		if (type == DPLL_TYPE_PPS && ice_dpll_is_pps_phase_monitor(pf))
 			ops =  &ice_dpll_pom_ops;
-		ice_dpll_update_state(pf, d, true);
+		if (!pf->dplls.unmanaged)
+			ice_dpll_update_state(pf, d, true);
 		ret = dpll_device_register(d->dpll, type, ops, d);
 		if (ret) {
 			dpll_device_put(d->dpll, &d->tracker);
@@ -3869,6 +3920,33 @@ static void ice_dpll_deinit_worker(struct ice_pf *pf)
 	kthread_destroy_worker(d->kworker);
 }
 
+/**
+ * ice_dpll_pin_freq_info - find pin frequency from supported ones
+ * @hw: pointer to the hardware structure
+ * @pin_idx: pin index
+ * @input: if input pin
+ *
+ * This function searches through the array of supported frequencies for a
+ * DPLL pin and returns the single frequency the pin supports, if it supports
+ * only one. Shall be used only for a dpll with driver-hardcoded frequency.
+ *
+ * Return:
+ * * 0 - failure, pin uses multiple frequencies,
+ * * frequency - success.
+ */
+static u64 ice_dpll_pin_freq_info(struct ice_hw *hw, u8 pin_idx, bool input)
+{
+	struct dpll_pin_frequency *freqs;
+	u8 freq_num;
+
+	/* Get supported frequencies for this pin */
+	freqs = ice_cgu_get_pin_freq_supp(hw, pin_idx, input, &freq_num);
+	if (!freqs || freq_num != 1 || freqs[0].min != freqs[0].max)
+		return 0;
+
+	return freqs[0].min;
+}
+
 /**
  * ice_dpll_init_worker - Initialize DPLLs periodic worker
  * @pf: board private structure
@@ -4028,6 +4106,19 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
 		pins[i].prop.board_label = ice_cgu_get_pin_name(hw, i, input);
 		pins[i].prop.type = ice_cgu_get_pin_type(hw, i, input);
 		if (input) {
+			if (pf->dplls.unmanaged) {
+				pins[i].freq = ice_dpll_pin_freq_info(hw, i,
+								      input);
+				pins[i].state[0] = DPLL_PIN_STATE_CONNECTED;
+				pins[i].status =
+					ICE_AQC_GET_CGU_IN_CFG_STATUS_ESYNC_CAP;
+				pins[i].flags[0] = ICE_DPLL_IN_ESYNC_ENABLED;
+				/* skip priority, capabilities, phase range,
+				 * pin state AQ query and freq_supported -
+				 * not available for unmanaged DPLL
+				 */
+				continue;
+			}
 			ret = ice_aq_get_cgu_ref_prio(hw, de->dpll_idx, i,
 						      &de->input_prio[i]);
 			if (ret)
@@ -4041,6 +4132,16 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
 			if (ice_dpll_is_sw_pin(pf, i, true))
 				pins[i].hidden = true;
 		} else {
+			if (pf->dplls.unmanaged) {
+				pins[i].freq = ice_dpll_pin_freq_info(hw, i,
+								      input);
+				pins[i].state[0] = DPLL_PIN_STATE_CONNECTED;
+				/* skip output state caps, phase range,
+				 * pin state AQ query and freq_supported -
+				 * not available for unmanaged DPLL
+				 */
+				continue;
+			}
 			ret = ice_cgu_get_output_pin_state_caps(hw, i, &caps);
 			if (ret)
 				return ret;
@@ -4058,10 +4159,13 @@ ice_dpll_init_info_direct_pins(struct ice_pf *pf,
 		pins[i].prop.freq_supported_num = freq_supp_num;
 		pins[i].pf = pf;
 	}
-	if (input)
+	if (input && !pf->dplls.unmanaged) {
 		ret = ice_dpll_init_ref_sync_inputs(pf);
+		if (ret)
+			return ret;
+	}
 
-	return ret;
+	return 0;
 }
 
 /**
@@ -4271,7 +4375,6 @@ static int ice_dpll_init_info_e825c(struct ice_pf *pf)
 
 	d->clock_id = ice_generate_clock_id(pf);
 	d->num_inputs = ICE_SYNCE_CLK_NUM;
-
 	d->inputs = kzalloc_objs(*d->inputs, d->num_inputs);
 	if (!d->inputs)
 		return -ENOMEM;
@@ -4296,6 +4399,82 @@ static int ice_dpll_init_info_e825c(struct ice_pf *pf)
 	return ret;
 }
 
+/**
+ * ice_dpll_lock_state_init_unmanaged - initialize lock state for unmanaged dpll
+ * @pf: board private structure
+ *
+ * Initialize the lock state for unmanaged DPLL by checking health status.
+ * For unmanaged DPLL, we rely on hardware autonomous operation.
+ *
+ * Return:
+ * * 0 - success
+ * * negative - init failure reason
+ */
+static int ice_dpll_lock_state_init_unmanaged(struct ice_pf *pf)
+{
+	u16 code = ICE_AQC_HEALTH_STATUS_INFO_LOSS_OF_LOCK;
+	struct ice_aqc_health_status_elem buff;
+	int ret;
+
+	ret = ice_get_last_health_status_code(&pf->hw, &buff, code);
+	if (ret)
+		return ret;
+	ice_dpll_lock_state_set_unmanaged(pf, &buff, false);
+
+	return ret;
+}
+
+/**
+ * ice_dpll_init_info_unmanaged - init dpll information for unmanaged dpll
+ * @pf: board private structure
+ *
+ * Acquire (from HW) and set basic dpll information (on pf->dplls struct).
+ * For unmanaged dpll mode.
+ *
+ * Return:
+ * * 0 - success
+ * * negative - init failure reason
+ */
+static int ice_dpll_init_info_unmanaged(struct ice_pf *pf)
+{
+	struct ice_dplls *d = &pf->dplls;
+	int ret;
+
+	d->clock_id = ice_generate_clock_id(pf);
+	d->num_inputs = ice_cgu_get_pin_num(&pf->hw, true);
+	d->num_outputs = ice_cgu_get_pin_num(&pf->hw, false);
+	ret = ice_dpll_lock_state_init_unmanaged(pf);
+	if (ret)
+		return ret;
+	d->inputs = kzalloc_objs(*d->inputs, d->num_inputs);
+	if (!d->inputs)
+		return -ENOMEM;
+
+	ret = ice_dpll_init_pins_info(pf, ICE_DPLL_PIN_TYPE_INPUT);
+	if (ret)
+		goto deinit_info;
+
+	d->outputs = kzalloc_objs(*d->outputs, d->num_outputs);
+	if (!d->outputs) {
+		ret = -ENOMEM;
+		goto deinit_info;
+	}
+
+	ret = ice_dpll_init_pins_info(pf, ICE_DPLL_PIN_TYPE_OUTPUT);
+	if (ret)
+		goto deinit_info;
+
+	d->pps.mode = DPLL_MODE_MANUAL;
+	dev_dbg(ice_pf_to_dev(pf), "%s - success, inputs:%u, outputs:%u\n",
+		__func__, d->num_inputs, d->num_outputs);
+	return 0;
+deinit_info:
+	dev_err(ice_pf_to_dev(pf), "%s - fail: d->inputs:%p, d->outputs:%p\n",
+		__func__, d->inputs, d->outputs);
+	ice_dpll_deinit_info(pf);
+	return ret;
+}
+
 /**
  * ice_dpll_init_info - prepare pf's dpll information structure
  * @pf: board private structure
@@ -4395,6 +4574,42 @@ static int ice_dpll_init_info(struct ice_pf *pf, bool cgu)
 	return ret;
 }
 
+/**
+ * ice_dpll_lock_state_set_unmanaged - determine lock state from health status
+ * @pf: board private structure
+ * @buff: health status buffer
+ * @notify: if true, notify dpll device
+ *
+ * Set unmanaged dpll lock state based on health status code and internal data.
+ * Context: Acquires and releases pf->dplls.lock (must release before notify
+ * if called).
+ */
+void ice_dpll_lock_state_set_unmanaged(struct ice_pf *pf,
+				       const struct ice_aqc_health_status_elem *buff,
+				       bool notify)
+{
+	u32 internal_data = le32_to_cpu(buff->internal_data1);
+	struct ice_dpll *d = &pf->dplls.pps;
+
+	if (!ice_pf_src_tmr_owned(pf))
+		return;
+
+	mutex_lock(&pf->dplls.lock);
+	if (buff->health_status_code == 0 ||
+	    internal_data == ICE_DPLL_HEALTH_STATUS_LOCKED)
+		d->dpll_state = DPLL_LOCK_STATUS_LOCKED;
+	else
+		d->dpll_state = DPLL_LOCK_STATUS_UNLOCKED;
+
+	if (d->prev_dpll_state == d->dpll_state)
+		notify = false;
+	else
+		d->prev_dpll_state = d->dpll_state;
+	mutex_unlock(&pf->dplls.lock);
+	if (notify && d->dpll)
+		dpll_device_change_ntf(d->dpll);
+}
+
 /**
  * ice_dpll_deinit - Disable the driver/HW support for dpll subsystem
  * the dpll device.
@@ -4414,15 +4629,55 @@ void ice_dpll_deinit(struct ice_pf *pf)
 	if (cgu)
 		ice_dpll_deinit_worker(pf);
 
-	ice_dpll_deinit_pins(pf, cgu);
+	ice_dpll_deinit_pins(pf, cgu || pf->dplls.unmanaged);
 	if (!IS_ERR_OR_NULL(pf->dplls.pps.dpll))
-		ice_dpll_deinit_dpll(pf, &pf->dplls.pps, cgu);
+		ice_dpll_deinit_dpll(pf, &pf->dplls.pps,
+				     cgu || pf->dplls.unmanaged);
 	if (!IS_ERR_OR_NULL(pf->dplls.eec.dpll))
 		ice_dpll_deinit_dpll(pf, &pf->dplls.eec, cgu);
 	ice_dpll_deinit_info(pf);
 	mutex_destroy(&pf->dplls.lock);
 }
 
+/**
+ * ice_dpll_init_unmanaged - initialize support for unmanaged dpll subsystem
+ * @pf: board private structure
+ *
+ * Set up the device dplls for unmanaged mode, register them and pins connected
+ * within Linux dpll subsystem. Allow userspace to obtain state of DPLL.
+ *
+ * Context: Initializes pf->dplls.lock mutex.
+ */
+static void ice_dpll_init_unmanaged(struct ice_pf *pf)
+{
+	struct ice_dplls *d = &pf->dplls;
+	int err;
+
+	if (!ice_pf_src_tmr_owned(pf))
+		return;
+	mutex_init(&d->lock);
+	err = ice_dpll_init_info_unmanaged(pf);
+	if (err)
+		goto err_exit;
+	err = ice_dpll_init_dpll(pf, &pf->dplls.pps, true, DPLL_TYPE_PPS);
+	if (err)
+		goto deinit_info;
+	err = ice_dpll_init_pins(pf, true);
+	if (err)
+		goto deinit_pps;
+	set_bit(ICE_FLAG_DPLL, pf->flags);
+
+	return;
+
+deinit_pps:
+	ice_dpll_deinit_dpll(pf, &pf->dplls.pps, true);
+deinit_info:
+	ice_dpll_deinit_info(pf);
+err_exit:
+	mutex_destroy(&d->lock);
+	dev_warn(ice_pf_to_dev(pf), "DPLLs init failure err:%d\n", err);
+}
+
 /**
  * ice_dpll_init_e825 - initialize support for dpll subsystem
  * @pf: board private structure
@@ -4510,8 +4765,23 @@ static void ice_dpll_init_e810(struct ice_pf *pf)
 	dev_warn(ice_pf_to_dev(pf), "DPLLs init failure err:%d\n", err);
 }
 
+/**
+ * ice_dpll_init - initialize support for dpll subsystem
+ * @pf: board private structure
+ *
+ * Set up the device dplls, register them and pins connected within Linux dpll
+ * subsystem. Allow userspace to obtain state of DPLL and handling of DPLL
+ * configuration requests.
+ *
+ * Context: Initializes pf->dplls.lock mutex.
+ */
 void ice_dpll_init(struct ice_pf *pf)
 {
+	if (pf->dplls.unmanaged) {
+		ice_dpll_init_unmanaged(pf);
+		return;
+	}
+
 	switch (pf->hw.mac_type) {
 	case ICE_MAC_GENERIC_3K_E825:
 		ice_dpll_init_e825(pf);
diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.h b/drivers/net/ethernet/intel/ice/ice_dpll.h
index 8678575359b9..bb70c4333789 100644
--- a/drivers/net/ethernet/intel/ice/ice_dpll.h
+++ b/drivers/net/ethernet/intel/ice/ice_dpll.h
@@ -23,6 +23,8 @@
 #define ICE_CGU_R11_SYNCE_S_BYP_CLK	GENMASK(6, 1)
 
 #define ICE_CGU_BYPASS_MUX_OFFSET_E825C	3
+#define ICE_DPLL_UNMANAGED_PIN_NUM	4
+#define ICE_DPLL_IN_ESYNC_ENABLED	ICE_AQC_GET_CGU_IN_CFG_FLG2_ESYNC_EN
 
 /**
  * enum ice_dpll_pin_sw - enumerate ice software pin indices:
@@ -162,14 +164,22 @@ struct ice_dplls {
 	s32 output_phase_adj_max;
 	u32 periodic_counter;
 	bool generic;
+	bool unmanaged;
 };
 
 #if IS_ENABLED(CONFIG_PTP_1588_CLOCK)
 void ice_dpll_init(struct ice_pf *pf);
 void ice_dpll_deinit(struct ice_pf *pf);
+void ice_dpll_lock_state_set_unmanaged(struct ice_pf *pf,
+				       const struct ice_aqc_health_status_elem *buff,
+				       bool notify);
 #else
 static inline void ice_dpll_init(struct ice_pf *pf) { }
 static inline void ice_dpll_deinit(struct ice_pf *pf) { }
+static inline void
+ice_dpll_lock_state_set_unmanaged(struct ice_pf *pf,
+				  const struct ice_aqc_health_status_elem *buff,
+				  bool notify) { }
 #endif
 
 #endif
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index b9a421773e91..1748e6e1ed01 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -4716,7 +4716,9 @@ void ice_deinit_dev(struct ice_pf *pf)
 
 static void ice_init_features(struct ice_pf *pf)
 {
+	u16 code = ICE_AQC_HEALTH_STATUS_INFO_LOSS_OF_LOCK;
 	struct device *dev = ice_pf_to_dev(pf);
+	int err;
 
 	if (ice_is_safe_mode(pf))
 		return;
@@ -4728,8 +4730,15 @@ static void ice_init_features(struct ice_pf *pf)
 	if (ice_is_feature_supported(pf, ICE_F_GNSS))
 		ice_gnss_init(pf);
 
+	/* Initialize unmanaged DPLL detection */
+	err = ice_is_health_status_code_supported(&pf->hw, code,
+						  &pf->dplls.unmanaged);
+	if (err || !ice_is_unmanaged_cgu_in_netlist(&pf->hw))
+		pf->dplls.unmanaged = false;
+
 	if (ice_is_feature_supported(pf, ICE_F_CGU) ||
-	    ice_is_feature_supported(pf, ICE_F_PHY_RCLK))
+	    ice_is_feature_supported(pf, ICE_F_PHY_RCLK) ||
+	    pf->dplls.unmanaged)
 		ice_dpll_init(pf);
 
 	/* Note: Flow director init failure is non-fatal to load */
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
index 24fb7a3e14d6..d8d20b1ef209 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c
@@ -20,6 +20,10 @@ static struct dpll_pin_frequency ice_cgu_pin_freq_10_mhz[] = {
 	DPLL_PIN_FREQUENCY_10MHZ,
 };
 
+static struct dpll_pin_frequency ice_cgu_pin_freq_156_25mhz[] = {
+	DPLL_PIN_FREQUENCY_RANGE(156250000, 156250000),
+};
+
 static const struct ice_cgu_pin_desc ice_e810t_sfp_cgu_inputs[] = {
 	{ "CVL-SDP22",	  ZL_REF0P, DPLL_PIN_TYPE_INT_OSCILLATOR,
 		ARRAY_SIZE(ice_cgu_pin_freq_common), ice_cgu_pin_freq_common },
@@ -131,6 +135,18 @@ static const struct ice_cgu_pin_desc ice_e823_zl_cgu_outputs[] = {
 	{ "NONE",	   ZL_OUT5, 0, 0 },
 };
 
+static const struct ice_cgu_pin_desc ice_e830_unmanaged_inputs[] = {
+	{ "1588-TIME_SYNC", 0, DPLL_PIN_TYPE_EXT,
+	  ARRAY_SIZE(ice_cgu_pin_freq_10_mhz), ice_cgu_pin_freq_10_mhz },
+};
+
+static const struct ice_cgu_pin_desc ice_e830_unmanaged_outputs[] = {
+	{ "MAC-PHY-CLK", 0, DPLL_PIN_TYPE_SYNCE_ETH_PORT,
+	  ARRAY_SIZE(ice_cgu_pin_freq_156_25mhz), ice_cgu_pin_freq_156_25mhz },
+	{ "1588-TIME_REF", 1, DPLL_PIN_TYPE_INT_OSCILLATOR,
+	  ARRAY_SIZE(ice_cgu_pin_freq_1_hz), ice_cgu_pin_freq_1_hz},
+};
+
 /* Low level functions for interacting with and managing the device clock used
  * for the Precision Time Protocol.
  *
@@ -5923,6 +5939,24 @@ ice_cgu_get_pin_desc(struct ice_hw *hw, bool input, int *size)
 	case ICE_DEV_ID_E823C_SGMII:
 		t = ice_cgu_get_pin_desc_e823(hw, input, size);
 		break;
+	case ICE_DEV_ID_E830CC_BACKPLANE:
+	case ICE_DEV_ID_E830CC_QSFP56:
+	case ICE_DEV_ID_E830CC_SFP:
+	case ICE_DEV_ID_E830CC_SFP_DD:
+	case ICE_DEV_ID_E830C_BACKPLANE:
+	case ICE_DEV_ID_E830C_QSFP:
+	case ICE_DEV_ID_E830C_SFP:
+	case ICE_DEV_ID_E830_XXV_BACKPLANE:
+	case ICE_DEV_ID_E830_XXV_QSFP:
+	case ICE_DEV_ID_E830_XXV_SFP:
+		if (input) {
+			t = ice_e830_unmanaged_inputs;
+			*size = ARRAY_SIZE(ice_e830_unmanaged_inputs);
+		} else {
+			t = ice_e830_unmanaged_outputs;
+			*size = ARRAY_SIZE(ice_e830_unmanaged_outputs);
+		}
+		break;
 	default:
 		break;
 	}
@@ -5949,6 +5983,18 @@ int ice_cgu_get_num_pins(struct ice_hw *hw, bool input)
 	return 0;
 }
 
+/**
+ * ice_cgu_get_pin_num - get pin description array size
+ * @hw: pointer to the hw struct
+ * @input: if request is done against input or output pins
+ *
+ * Return: size of pin description array for given hw.
+ */
+int ice_cgu_get_pin_num(struct ice_hw *hw, bool input)
+{
+	return ice_cgu_get_num_pins(hw, input);
+}
+
 /**
  * ice_cgu_get_pin_type - get pin's type
  * @hw: pointer to the hw struct
diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h
index 1c9e77dbc770..98bca7cae88d 100644
--- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h
+++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h
@@ -357,6 +357,7 @@ int ice_read_sma_ctrl(struct ice_hw *hw, u8 *data);
 int ice_write_sma_ctrl(struct ice_hw *hw, u8 data);
 int ice_ptp_read_sdp_ac(struct ice_hw *hw, __le16 *entries, uint *num_entries);
 int ice_cgu_get_num_pins(struct ice_hw *hw, bool input);
+int ice_cgu_get_pin_num(struct ice_hw *hw, bool input);
 enum dpll_pin_type ice_cgu_get_pin_type(struct ice_hw *hw, u8 pin, bool input);
 struct dpll_pin_frequency *
 ice_cgu_get_pin_freq_supp(struct ice_hw *hw, u8 pin, bool input, u8 *num);

base-commit: 1a5abe9a93c8f8f0c9d10fc313aff320e4487268
-- 
2.47.0


^ permalink raw reply related

* Re: [PATCH net-next 03/13] dpaa2-switch: change dpaa2_switch_port_set_fdb() function prototype
From: Ioana Ciornei @ 2026-05-08 11:00 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260506151540.1242997-4-ioana.ciornei@nxp.com>

On Wed, May 06, 2026 at 06:15:30PM +0300, Ioana Ciornei wrote:
> Since there dpaa2_switch_port_set_fdb() never fails and its return value
> was never checked, change its prototype to return void.
> 
> Also, instead of determining if the DPAA2 port is joining or leaving an
> upper based on the value of the 'bridge_dev' parameter, add the
> 'linking' parameter to explicitly specify the action. This will enable
> us to pass the upper device that we are joining/leaving in all possible
> cases. This will get used in the next patches to determine what kind of
> device the upper is: a bridge or a bond.
> 
> Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
> ---
>  .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 33 +++++++++----------
>  1 file changed, 15 insertions(+), 18 deletions(-)

(...)

> @@ -83,13 +82,13 @@ static u16 dpaa2_switch_port_set_fdb(struct ethsw_port_priv *port_priv,
>  
>  		if (!fdb) {
>  			port_priv->fdb->bridge_dev = NULL;
> -			return 0;
> +			return;
>  		}
>  
>  		port_priv->fdb = fdb;
>  		port_priv->fdb->in_use = true;
>  		port_priv->fdb->bridge_dev = NULL;
> -		return 0;
> +		return;


Sashiko notes:
	This problem wasn't introduced by this patch, but does this leak
	the previous FDB tracking structure?  When a port leaves a
	bridge or bond (when linking is false), it assigns the newly
	allocated fdb to port_priv->fdb and sets in_use to true.
	However, the in_use flag of the old FDB is never cleared.  If
	this port was the last user of the shared FDB, does that FDB
	remain permanently leaked? Over time, repeated topology changes
	could exhaust all FDBs. Once exhausted,
	dpaa2_switch_fdb_get_unused() would return NULL, which seems
	like it would cause the leaving port to keep its shared FDB,
	potentially leaking traffic between standalone ports and active
	bridges.

That is not true. In case port_priv is the last user of the shared FDB,
aka the last port that leaves the last bridge as noted in the code
comment, then dpaa2_switch_fdb_get_unused() will return NULL indicating
this condition. In this case, port_priv will continue to use the current
FDB which is the correct action since it's certain that this is the only
user of the FDB.


(...)

> @@ -2381,7 +2378,7 @@ static int dpaa2_switch_port_bond_join(struct net_device *netdev,
>  	u8 lag_id;
>  
>  	/* Setup the egress flood policy (broadcast, unknown unicast) */
> -	dpaa2_switch_port_set_fdb(port_priv, bond_dev);
> +	dpaa2_switch_port_set_fdb(port_priv, bond_dev, true);

Sashiko notes:
	This problem wasn't introduced by this patch, but if a port
	joins a bond that is already enslaved to a bridge, does this
	split the bond across different FDBs?

	If dpaa2_switch_port_bond_join() is called, it first executes
	dpaa2_switch_port_set_fdb(..., bond_dev, true), adopting the
	bond's shared FDB.

	Immediately after, it calls dpaa2_switch_port_bridge_join(),
	which executes dpaa2_switch_port_set_fdb(..., bridge_dev, true).

	If the bridge contains another standalone DPAA2 port, the logic
	will match that port and execute: if (other_port_priv) {
	port_priv->fdb->in_use = false; port_priv->fdb->bridge_dev =
	NULL; port_priv->fdb = other_port_priv->fdb; }

	Because port_priv->fdb is the bond's shared FDB, doesn't this
	incorrectly mark the bond's active FDB as unused while other
	bond slaves are still actively using it?


There is no such problem as the one described above.

If a port joins a bond that is already under a bridge, then the joining
port will use the FDB shared by all the ports under that same bridge.

Also, any other bond slaves already under the bridged bond will be
already using the same FDB.

Ioana

^ permalink raw reply

* Re: [PATCH net-next v7 0/2] net: mana: add ethtool private flag for full-page RX buffers
From: Dipayaan Roy @ 2026-05-08 10:48 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260506170034.327907-1-dipayanroy@linux.microsoft.com>

On Wed, May 06, 2026 at 09:58:56AM -0700, Dipayaan Roy wrote:
> On some ARM64 platforms with 4K PAGE_SIZE, utilizing page_pool 
> fragments for allocation in the RX refill path (~2kB buffer per fragment)
> causes 15-20% throughput regression under high connection counts
> (>16 TCP streams at 180+ Gbps). Using full-page buffers on these
> platforms shows no regression and restores line-rate performance.
> 
> This behavior is observed on a single platform; other platforms
> perform better with page_pool fragments, indicating this is not a
> page_pool issue but platform-specific.
> 
> This series adds an ethtool private flag "full-page-rx" to let the
> user opt in to one RX buffer per page:
> 
>   ethtool --set-priv-flags eth0 full-page-rx on
> 
> There is no behavioral change by default. The flag can be persisted
> via udev rule for affected platforms.
> 
> Changes in v7:
>   - Rebased onto net-next.
>   - Retained private flag approach after David Wei's testing on
>     Grace (ARM64) confirmed that fragment mode outperforms
>     full-page mode on other platforms, validating this is a
>     single-platform workaround rather than a generic issue.
> Changes in v6:
>   - Added missed maintainers.
> Changes in v5:
>   - Split prep refactor into separate patch (patch 1/2)
> Changes in v4:
>   - Dropping the smbios string parsing and add ethtool priv flag
>     to reconfigure the queues with full page rx buffers.
> Changes in v3:
>   - changed u8* to char*
> Changes in v2:
>   - separate reading string index and the string, remove inline.
> 
> Dipayaan Roy (2):
>   net: mana: refactor mana_get_strings() and mana_get_sset_count() to
>     use switch
>   net: mana: force full-page RX buffers via ethtool private flag
> 
>  drivers/net/ethernet/microsoft/mana/mana_en.c |  22 ++-
>  .../ethernet/microsoft/mana/mana_ethtool.c    | 164 ++++++++++++++----
>  include/net/mana/mana.h                       |   8 +
>  3 files changed, 163 insertions(+), 31 deletions(-)
> 
> -- 
> 2.43.0
>

Sashiko raised a valid point; I will send a v8 addressing it.
https://netdev-ai.bots.linux.dev/sashiko/#/patchset/20260506170034.327907-1-dipayanroy%40linux.microsoft.com

Thank you 

^ permalink raw reply

* RE: [PATCH net] tipc: avoid sending zero-length stream messages
From: Tung Quang Nguyen @ 2026-05-08 10:38 UTC (permalink / raw)
  To: cassiogabrielcontato@gmail.com
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	stable@vger.kernel.org, Jon Maloy, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman
In-Reply-To: <dfd8fb00-29de-4151-86a7-307a7c721f7d@gmail.com>

>Subject: Re: [PATCH net] tipc: avoid sending zero-length stream messages
>
>Hi!
>
>On 5/6/26 03:41, Tung Quang Nguyen wrote:
>>> Subject: [PATCH net] tipc: avoid sending zero-length stream messages
>>>
>>> TIPC stream send currently enters the transmit loop even when the user
>>> payload length is zero. This can build and transmit a header-only
>connection
>>> message.
>>>
>>> For local TIPC sockets, such messages are delivered synchronously through
>the
>>> loopback receive path. When this happens while socket backlog processing
>is
>>> being flushed, reply transmission can re-enter TIPC receive processing
>>> repeatedly and trigger an RCU stall.
>>>
>> Can you demonstrate this scenario using code ? It is better to point out what
>current code is faulty.
>
>The minimized user-visible trigger is essentially:
>
>      int fd[2];
>      struct msghdr msg = {};
>
>      socketpair(AF_TIPC, SOCK_STREAM, 0, fd);
>
>      /* In parallel, this makes release_sock() flush backlog. */
>      setsockopt(fd[0], SOL_SOCKET, SO_ATTACH_BPF, &bad_fd,
>                 sizeof(bad_fd));
>
>      /* Repeated zero-length MSG_PROBE send on the connected peer. */
>      for (i = 0; i < 64; i++)
>              sendmsg(fd[1], &msg, MSG_PROBE | MSG_MORE);
>
>The faulty current-code path is that TIPC stream send does not handle
>MSG_PROBE before entering __tipc_sendstream(). MSG_PROBE is supposed to
>probe without transmitting data, but the call reaches __tipc_sendstream()
>with dlen == 0.
>
>__tipc_sendstream() uses a do/while loop, so even when dlen is 0 the body
>runs once:
>
>      send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE);
>
>At that point send is 0, but the code can still call tipc_msg_append() or
>tipc_msg_build(), creating a TIPC connection message with only the header.
>It then calls:
>
>      tipc_node_xmit(net, txq, dnode, tsk->portid);
>
>For a local TIPC socketpair, tipc_node_xmit() takes the in_own_node() path
>and synchronously calls tipc_sk_rcv(). When this happens while
>release_sock() is processing backlog, the receive path can generate
>response traffic through tipc_node_distr_xmit(), which re-enters the same
>local receive path.
>
>I should have made that explicit in the changelog and pointed at the
>missing MSG_PROBE handling as the faulty part.
TIPC does not support MSG_PROBE, so it makes no sense to handle this flag.
Even if a user application sends a zero-length data message using this flag, the message will be dropped at the receiving side.
>>>

>>> diff --git a/net/tipc/socket.c b/net/tipc/socket.c index
>>> 9329919fb07f..3c7838713d74 100644
>>> --- a/net/tipc/socket.c
>>> +++ b/net/tipc/socket.c
>>> @@ -1585,6 +1585,8 @@ static int __tipc_sendstream(struct socket *sock,
>>> struct msghdr *m, size_t dlen)
>>> 					 tipc_sk_connected(sk)));
>>> 		if (unlikely(rc))
>>> 			break;
>>> +		if (unlikely(!dlen && sk->sk_type == SOCK_STREAM))
>>> +			break;
>> This change is wrong. It immediately breaks normal connection set up
>because the ACK  (zero in length) has no chance to be sent back from the
>server to the client.
>> Please try to test your patch before submission.
>
>I did test the patch with the syzkaller C repro under QEMU for 10 minutes, and
>it did not trigger the reported RCU stall:
>
>      /tmp/repro & pid=$!; sleep 600; kill $pid
>      dmesg | grep -Ei 'rcu.*stall|rcu_preempt|soft
>lockup|panic|BUG|WARNING' (attached)
>
>The dmesg check did not show any repro-triggered RCU stall, soft lockup,
>panic, BUG, or WARNING. But that test only covered the syzkaller trigger;
>it did not cover normal active/passive TIPC stream connection setup, which
>your review points out is broken by this version.
>
>I re-checked the TIPC connection setup path as well.
>
>tipc_accept() intentionally sends the server-side ACK as a zero-length
>stream message:
>
>      iov_iter_kvec(&m.msg_iter, ITER_SOURCE, NULL, 0, 0);
>      __tipc_sendstream(new_sock, &m, 0);
>
>So blocking all zero-length sends inside __tipc_sendstream() prevents
>that ACK from being transmitted and can break normal SOCK_STREAM
>connection setup.
>
>After re-checking the syzkaller repro, the real trigger seems to be narrower
I am aware of this syzbot report about this issue. I will try to find some time to fix it.

>than zero-length stream send. The repro uses a user sendmsg() with
>MSG_PROBE | MSG_MORE and no payload on an already connected TIPC
>stream
>socket. MSG_PROBE is supposed to probe without sending, but TIPC stream
>send currently lets that path reach __tipc_sendstream(), where the
>do/while body can still run once with dlen == 0 and build/transmit a
>header-only message.
>
>I think we should avoid suppressing the internal __tipc_sendstream() ACK path
>and instead handle the user-originated zero-length MSG_PROBE case before it
>reaches the internal stream send helper.
>
>The v2 fix would look like this:
>
>-- 8< --
>
>diff --git a/net/tipc/socket.c b/net/tipc/socket.c
>index 9329919fb07f..4783df337971 100644
>--- a/net/tipc/socket.c
>+++ b/net/tipc/socket.c
>@@ -1542,6 +1542,10 @@ static int tipc_sendstream(struct socket *sock, struct
>msghdr *m, size_t dsz)
>        struct sock *sk = sock->sk;
>        int ret;
>
>+       /* MSG_PROBE asks only to probe the path, not to transmit data. */
>+       if (unlikely((m->msg_flags & MSG_PROBE) && !dsz))
>+               return 0;
>+
This is wrong. We cannot do this because it just silences syzbot and hides the real nested lock issue in current code.
>        lock_sock(sk);
>        ret = __tipc_sendstream(sock, m, dsz);
>        release_sock(sk);
>-- >8 --
>
>I tested the reworked patch with the syzkaller C reproducer under QEMU.
>The reproducer was run for 10 minutes:
>
>      /tmp/repro & pid=$!; sleep 600; kill $pid
>      dmesg | grep -Ei 'rcu.*stall|rcu_preempt|soft
>lockup|panic|BUG|WARNING' (attached)
>
>The grep only matched boot-time command-line/debug messages; no
>repro-triggered RCU stall, soft lockup, panic, BUG, or WARNING appeared.
>
>What you think?

^ permalink raw reply

* Re: [PATCH net] vsock/virtio: fix skb overhead accounting to preserve full buf_alloc
From: Stefano Garzarella @ 2026-05-08 10:38 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, Eric Dumazet, Stefan Hajnoczi, virtualization,
	David S. Miller, Jason Wang, Simon Horman, linux-kernel,
	Paolo Abeni, Xuan Zhuo, kvm, Jakub Kicinski, Eugenio Pérez
In-Reply-To: <20260508063104-mutt-send-email-mst@kernel.org>

On Fri, May 08, 2026 at 06:33:13AM -0400, Michael S. Tsirkin wrote:
>On Fri, May 08, 2026 at 12:01:50PM +0200, Stefano Garzarella wrote:
>> On Fri, May 08, 2026 at 05:53:07AM -0400, Michael S. Tsirkin wrote:
>> > On Fri, May 08, 2026 at 11:23:30AM +0200, Stefano Garzarella wrote:
>> > > From: Stefano Garzarella <sgarzare@redhat.com>
>> > >
>> > > After commit 059b7dbd20a6 ("vsock/virtio: fix potential unbounded skb
>> > > queue"), virtio_transport_inc_rx_pkt() subtracts per-skb overhead from
>> > > buf_alloc when checking whether a new packet fits. This reduces the
>> > > effective receive buffer below what the user configured via
>> > > SO_VM_SOCKETS_BUFFER_SIZE, causing legitimate data packets to be
>> > > silently dropped and applications that rely on the full buffer size
>> > > to deadlock.
>> > >
>> > > Also, the reduced space is not communicated to the remote peer, so
>> > > its credit calculation accounts more credit than the receiver will
>> > > actually accept, causing data loss (there is no retransmission).
>> > >
>> > > This also causes failures in tools/testing/vsock/vsock_test.c.
>> > > Test 18 sometimes fails, while test 22 always fails in this way:
>> > >     18 - SOCK_STREAM MSG_ZEROCOPY...hash mismatch
>> > >
>> > >     22 - SOCK_STREAM virtio credit update + SO_RCVLOWAT...send failed:
>> > >     Resource temporarily unavailable
>> > >
>> > > Fix this by introducing virtio_transport_rx_buf_size() to calculate the
>> > > size of the RX buffer based on the overhead. Using it in the acceptance
>> > > check, the advertised buf_alloc, and the credit update decision.
>> > > Use buf_alloc * 2 as total budget (payload + overhead), similar to how
>> > > SO_RCVBUF is doubled to reserve space for sk_buff metadata.
>> > > The function returns buf_alloc as long as overhead fits within the
>> > > reservation, then gradually reduces toward 0 as overhead exceeds
>> > > buf_alloc (e.g. under small-packet flooding), informing the peer to
>> > > slow down.
>> > >
>> > > Fixes: 059b7dbd20a6 ("vsock/virtio: fix potential unbounded skb queue")
>> > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
>> >
>> >
>> > unfortunately, this is a bit of a spec violation and there is no guarantee
>> > it helps.
>>
>> Loosing data like we are doing in 059b7dbd20a6 is even worse IMHO.
>>
>> >
>> > a spec violation because the spec says:
>> > Only payload bytes are counted and header bytes are not
>> > included
>> >
>> > and the implication is that a side can not reduce its own buf_alloc.
>> >
>> > no guarantee because the other side is not required to process your
>> > packets, so it might not see your buf alloc reduction.
>> >
>> > as designed in the current spec, you can only increase your buf alloc,
>> > not decrease it.
>>
>> We never enforced this, currently an user can reduce it by
>> SO_VM_SOCKETS_BUFFER_SIZE and we haven't blocked it since virtio-vsock was
>> introduced, should we update the spec?
>
>
>it's not that we need to enforce it, it's that all synchronization
>assumes this. as in, anyone can use an old copy until they run out
>of credits.
>
>
>> >
>> > what can be done:
>> > - more efficient storage for small packets (poc i posted)
>> > - reduce buf alloc ahead of the time
>>
>> That's basically what I'm doing here: I'm using twice the size of
>> `buf_alloc` (just like `SO_RCVBUF` does for other socket types) and telling
>> the other peer just `buf_alloc`.
>>
>> But then, somehow, we have to let the other person know that we're running
>> out of space. With this patch that only happens when the other peer isn't
>> behaving properly, sending so many small packets that the overhead exceeds
>> `buf_alloc`.
>>
>> Stefano
>
>what is "not proper" here, it is up to the application what to send.

Sure, but here we're just slowing down the application by telling it we 
don't have any more space.

Again, without this patch we are just dropping data, which IMO is even 
worse.

So I think we should merge this for now, while we work on handling EOM better.
If you prefer, I can drop the part where we reduce the buf_alloc 
advertised to the other peer, but at least we should drop data after 
`buf_alloc * 2` IMO.


Stefano


^ permalink raw reply

* Re: [PATCH v1 bpf-next 7/8] bpf: tcp: Add SOCK_OPS rcvlowat hook.
From: Jiayuan Chen @ 2026-05-08 10:37 UTC (permalink / raw)
  To: Kuniyuki Iwashima, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Martin KaFai Lau, Eduard Zingerman,
	Kumar Kartikeya Dwivedi
  Cc: Yonghong Song, John Fastabend, Stanislav Fomichev, Eric Dumazet,
	Neal Cardwell, Willem de Bruijn, Tenzin Ukyab, Kuniyuki Iwashima,
	bpf, netdev
In-Reply-To: <20260508073355.3916746-8-kuniyu@google.com>


On 5/8/26 3:33 PM, Kuniyuki Iwashima wrote:
> Now, it is time to add the new hooks for BPF_SOCK_OPS_RCVLOWAT_CB.
>
> Let's invoke the BPF SOCK_OPS prog when
>
>    1. TCP stack enqueues skb to sk->sk_receive_queue
>       -> tcp_queue_rcv(), tcp_ofo_queue(), and tcp_fastopen_add_skb()
>
>    2. TCP recvmsg() completes
>       -> __tcp_cleanup_rbuf()
>
> This will allow the BPF prog to parse each skb and dynamically
> adjust sk->sk_rcvlowat to suppress unnecessary EPOLLIN wakeups
> until sufficient data (e.g., a full RPC frame) is available
> in the receive queue.
>
> Note that the direct access to bpf_sock_ops.data is intentionally
> disabled by passing 0 as end_offset.
>
> Instead, the BPF prog is supposed to use bpf_skb_load_bytes()
> with bpf_sock_ops because payload is not in the linear area
> with TCP header/data split on and skb may contain a RPC
> descriptor in skb frag.  This also simplifies the BPF prog.
>
> Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
> ---
>   include/net/tcp.h       | 14 ++++++++++++++
>   net/ipv4/tcp.c          |  2 ++
>   net/ipv4/tcp_fastopen.c |  2 ++
>   net/ipv4/tcp_input.c    | 10 ++++++++++
>   4 files changed, 28 insertions(+)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 4e9e634e276b..003e46c9b500 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -737,6 +737,20 @@ static inline struct request_sock *cookie_bpf_check(struct net *net, struct sock
>   }
>   #endif
>   
> +#ifdef CONFIG_CGROUP_BPF
> +void bpf_skops_rcvlowat(struct sock *sk, struct sk_buff *skb);
> +
> +static inline void tcp_bpf_rcvlowat(struct sock *sk, struct sk_buff *skb)
> +{
> +	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RCVLOWAT_CB_FLAG))
> +		bpf_skops_rcvlowat(sk, skb);
> +}
> +#else
> +static inline void tcp_bpf_rcvlowat(struct sock *sk, struct sk_buff *skb)
> +{
> +}
> +#endif
> +
>   /* From net/ipv6/syncookies.c */
>   int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th);
>   struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 1d9e52fc454f..80144b97a87a 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -1602,6 +1602,8 @@ void __tcp_cleanup_rbuf(struct sock *sk, int copied)
>   		tcp_mstamp_refresh(tp);
>   		tcp_send_ack(sk);
>   	}
> +
> +	tcp_bpf_rcvlowat(sk, NULL);
>   }
>   

tcp_read_skb (process frame 1 and __skb_unlink)
└─ sk_psock_verdict_recv
     └─ sk_psock_verdict_apply
         └─ tcp_eat_skb
             └─ tcp_cleanup_rbuf
                 └─ __tcp_cleanup_rbuf
                     └─ BPF RCVLOWAT_CB
                         └─ bpf_sock_ops_tcp_set_rcvlowat (wakeup=true)
                             └─ tcp_data_ready
                                 └─ sk_psock_verdict_data_ready
                                     └─ tcp_read_skb (frame 2)
                                         └─ ... → tcp_read_skb (frame 3) ...

For strparser, it uses read_sock instead of read_skb, and it will become
more complicated...

I think this will cause a stack overflow with a large number of skbs in the
receive queue, or infinite recursion (not tested), for sockmap/kTLS/strparser.


^ permalink raw reply

* Re: [PATCH net] vsock/virtio: fix skb overhead accounting to preserve full buf_alloc
From: Michael S. Tsirkin @ 2026-05-08 10:33 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: netdev, Eric Dumazet, Stefan Hajnoczi, virtualization,
	David S. Miller, Jason Wang, Simon Horman, linux-kernel,
	Paolo Abeni, Xuan Zhuo, kvm, Jakub Kicinski, Eugenio Pérez
In-Reply-To: <af2y2Fyp7H-om-ur@sgarzare-redhat>

On Fri, May 08, 2026 at 12:01:50PM +0200, Stefano Garzarella wrote:
> On Fri, May 08, 2026 at 05:53:07AM -0400, Michael S. Tsirkin wrote:
> > On Fri, May 08, 2026 at 11:23:30AM +0200, Stefano Garzarella wrote:
> > > From: Stefano Garzarella <sgarzare@redhat.com>
> > > 
> > > After commit 059b7dbd20a6 ("vsock/virtio: fix potential unbounded skb
> > > queue"), virtio_transport_inc_rx_pkt() subtracts per-skb overhead from
> > > buf_alloc when checking whether a new packet fits. This reduces the
> > > effective receive buffer below what the user configured via
> > > SO_VM_SOCKETS_BUFFER_SIZE, causing legitimate data packets to be
> > > silently dropped and applications that rely on the full buffer size
> > > to deadlock.
> > > 
> > > Also, the reduced space is not communicated to the remote peer, so
> > > its credit calculation accounts more credit than the receiver will
> > > actually accept, causing data loss (there is no retransmission).
> > > 
> > > This also causes failures in tools/testing/vsock/vsock_test.c.
> > > Test 18 sometimes fails, while test 22 always fails in this way:
> > >     18 - SOCK_STREAM MSG_ZEROCOPY...hash mismatch
> > > 
> > >     22 - SOCK_STREAM virtio credit update + SO_RCVLOWAT...send failed:
> > >     Resource temporarily unavailable
> > > 
> > > Fix this by introducing virtio_transport_rx_buf_size() to calculate the
> > > size of the RX buffer based on the overhead. Use it in the acceptance
> > > check, the advertised buf_alloc, and the credit update decision.
> > > Use buf_alloc * 2 as total budget (payload + overhead), similar to how
> > > SO_RCVBUF is doubled to reserve space for sk_buff metadata.
> > > The function returns buf_alloc as long as overhead fits within the
> > > reservation, then gradually reduces toward 0 as overhead exceeds
> > > buf_alloc (e.g. under small-packet flooding), informing the peer to
> > > slow down.
> > > 
> > > Fixes: 059b7dbd20a6 ("vsock/virtio: fix potential unbounded skb queue")
> > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > 
> > 
> > unfortunately, this is a bit of a spec violation and there is no guarantee
> > it helps.
> 
> Losing data like we are doing in 059b7dbd20a6 is even worse IMHO.
> 
> > 
> > a spec violation because the spec says:
> > Only payload bytes are counted and header bytes are not
> > included
> > 
> > and the implication is that a side can not reduce its own buf_alloc.
> > 
> > no guarantee because the other side is not required to process your
> > packets, so it might not see your buf alloc reduction.
> > 
> > as designed in the current spec, you can only increase your buf alloc,
> > not decrease it.
> 
> We never enforced this; currently a user can reduce it via
> SO_VM_SOCKETS_BUFFER_SIZE and we haven't blocked it since virtio-vsock was
> introduced, should we update the spec?


it's not that we need to enforce it, it's that all synchronization
assumes this. as in, anyone can use an old copy until they run out
of credits.


> > 
> > what can be done:
> > - more efficient storage for small packets (poc i posted)
> > - reduce buf alloc ahead of the time
> 
> That's basically what I'm doing here: I'm using twice the size of
> `buf_alloc` (just like `SO_RCVBUF` does for other socket types) and telling
> the other peer just `buf_alloc`.
> 
> But then, somehow, we have to let the other person know that we're running
> out of space. With this patch that only happens when the other peer isn't
> behaving properly, sending so many small packets that the overhead exceeds
> `buf_alloc`.
> 
> Stefano

what is "not proper" here, it is up to the application what to send.


> > 
> > > ---
> > >  net/vmw_vsock/virtio_transport_common.c | 31 +++++++++++++++++++++----
> > >  1 file changed, 27 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
> > > index 9b8014516f4f..94a4beb8fd61 100644
> > > --- a/net/vmw_vsock/virtio_transport_common.c
> > > +++ b/net/vmw_vsock/virtio_transport_common.c
> > > @@ -444,12 +444,32 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
> > >  	return ret;
> > >  }
> > > 
> > > +/* vvs->rx_lock held by the caller */
> > > +static u32 virtio_transport_rx_buf_size(struct virtio_vsock_sock *vvs)
> > > +{
> > > +	u64 skb_overhead = (skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0);
> > > +	/* Use buf_alloc * 2 as total budget (payload + overhead), similar to
> > > +	 * how SO_RCVBUF is doubled to reserve space for sk_buff metadata.
> > > +	 */
> > > +	u64 total_budget = (u64)vvs->buf_alloc * 2;
> > > +
> > > +	/* Overhead within buf_alloc: full buf_alloc available for payload */
> > > +	if (skb_overhead < vvs->buf_alloc)
> > > +		return vvs->buf_alloc;
> > > +
> > > +	/* Overhead exceeded buf_alloc: gradually reduce to bound skb queue */
> > > +	if (skb_overhead < total_budget)
> > > +		return total_budget - skb_overhead;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
> > >  					u32 len)
> > >  {
> > > -	u64 skb_overhead = (skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0);
> > > +	u32 rx_buf_size = virtio_transport_rx_buf_size(vvs);
> > > 
> > > -	if (skb_overhead + vvs->buf_used + len > vvs->buf_alloc)
> > > +	if (!rx_buf_size || vvs->buf_used + len > rx_buf_size)
> > >  		return false;
> > > 
> > >  	vvs->rx_bytes += len;
> > > @@ -472,7 +492,7 @@ void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *
> > >  	spin_lock_bh(&vvs->rx_lock);
> > >  	vvs->last_fwd_cnt = vvs->fwd_cnt;
> > >  	hdr->fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
> > > -	hdr->buf_alloc = cpu_to_le32(vvs->buf_alloc);
> > > +	hdr->buf_alloc = cpu_to_le32(virtio_transport_rx_buf_size(vvs));
> > >  	spin_unlock_bh(&vvs->rx_lock);
> > >  }
> > >  EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
> > > @@ -594,6 +614,7 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
> > >  	bool low_rx_bytes;
> > >  	int err = -EFAULT;
> > >  	size_t total = 0;
> > > +	u32 rx_buf_size;
> > >  	u32 free_space;
> > > 
> > >  	spin_lock_bh(&vvs->rx_lock);
> > > @@ -639,7 +660,9 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
> > >  	}
> > > 
> > >  	fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
> > > -	free_space = vvs->buf_alloc - fwd_cnt_delta;
> > > +	rx_buf_size = virtio_transport_rx_buf_size(vvs);
> > > +	free_space = rx_buf_size > fwd_cnt_delta ?
> > > +		     rx_buf_size - fwd_cnt_delta : 0;
> > >  	low_rx_bytes = (vvs->rx_bytes <
> > >  			sock_rcvlowat(sk_vsock(vsk), 0, INT_MAX));
> > > 
> > > --
> > > 2.54.0
> > 


^ permalink raw reply

* Re: [PATCH v2] mptcp: do not drop partial packets
From: Paolo Abeni @ 2026-05-08 10:23 UTC (permalink / raw)
  To: Shardul Bankar, matttbe, martineau
  Cc: geliang, davem, edumazet, kuba, horms, netdev, mptcp,
	linux-kernel, janak, kalpan.jani, Shardul Bankar
In-Reply-To: <20260422143931.43281-1-shardul.b@mpiricsoftware.com>

On 4/22/26 4:39 PM, Shardul Bankar wrote:
> When a packet arrives with map_seq < ack_seq < end_seq, the beginning
> of the packet has already been acknowledged but the end contains new
> data.  Currently the entire packet is dropped as "old data," forcing
> the sender to retransmit.
> 
> Instead, skip the already-acked bytes by adjusting the skb offset and
> enqueue only the new portion.  Update bytes_received and ack_seq to
> reflect the new data consumed.
> 
> A previous attempt at this fix (commit 1d2ce718811a ("mptcp: do not
> drop partial packets"), reverted in commit bf39160c4218 ("Revert
> "mptcp: do not drop partial packets"")) also added a zero-window
> check and changed rcv_wnd_sent initialization, which caused test
> regressions.  This version addresses only the partial packet handling
> without modifying receive window accounting.
> 
> Fixes: ab174ad8ef76 ("mptcp: move ooo skbs into msk out of order queue.")
> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/600
> Signed-off-by: Shardul Bankar <shardul.b@mpiricsoftware.com>

It would be great if you could send a v3 addressing the AI comment.

If you don't have time or capacity, please LMK, I can send v3 with your
SoB and the needed editing.

Thanks,

Paolo


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox