Linux Documentation
 help / color / mirror / Atom feed
* [PATCH net-next v04 6/6] hinic3: Remove unneeded coalesce parameters
From: Fan Gong @ 2026-04-08  4:03 UTC (permalink / raw)
  To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
	Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775618797.git.zhuyikai1@h-partners.com>

  Remove unneeded coalesce parameters in irq handling.

Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 drivers/net/ethernet/huawei/hinic3/hinic3_irq.c | 6 +-----
 drivers/net/ethernet/huawei/hinic3/hinic3_rx.h  | 3 ---
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
index d3b3927b5408..42464c007174 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
@@ -156,13 +156,9 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
 	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
 
 	err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
-	if (err) {
+	if (err)
 		netdev_err(netdev,
 			   "Failed to modify moderation for Queue: %u\n", q_id);
-	} else {
-		nic_dev->rxqs[q_id].last_coalesc_timer_cfg = coalesc_timer_cfg;
-		nic_dev->rxqs[q_id].last_pending_limit = pending_limit;
-	}
 
 	return err;
 }
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
index c11d080408a7..2ab691ed11a9 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
@@ -111,9 +111,6 @@ struct hinic3_rxq {
 	dma_addr_t             cqe_start_paddr;
 
 	struct dim             dim;
-
-	u8                     last_coalesc_timer_cfg;
-	u8                     last_pending_limit;
 } ____cacheline_aligned;
 
 struct hinic3_dyna_rxq_res {
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v04 3/6] hinic3: Add ethtool coalesce ops
From: Fan Gong @ 2026-04-08  4:03 UTC (permalink / raw)
  To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
	Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775618797.git.zhuyikai1@h-partners.com>

  Implement following ethtool callback function:
.get_coalesce
.set_coalesce

  These callbacks allow users to utilize ethtool for detailed
RX coalesce configuration and monitoring.

Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 .../ethernet/huawei/hinic3/hinic3_ethtool.c   | 232 +++++++++++++++++-
 1 file changed, 230 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index be26698fc658..a4b2d5ba81f8 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -17,6 +17,11 @@
 #include "hinic3_nic_cfg.h"
 
 #define HINIC3_MGMT_VERSION_MAX_LEN     32
+/* Coalesce time properties in microseconds */
+#define COALESCE_PENDING_LIMIT_UNIT     8
+#define COALESCE_TIMER_CFG_UNIT         5
+#define COALESCE_MAX_PENDING_LIMIT      (255 * COALESCE_PENDING_LIMIT_UNIT)
+#define COALESCE_MAX_TIMER_CFG          (255 * COALESCE_TIMER_CFG_UNIT)
 
 static void hinic3_get_drvinfo(struct net_device *netdev,
 			       struct ethtool_drvinfo *info)
@@ -985,9 +990,230 @@ static void hinic3_get_pause_stats(struct net_device *netdev,
 	kfree(ps);
 }
 
+static int hinic3_set_queue_coalesce(struct net_device *netdev, u16 q_id,
+				     struct hinic3_intr_coal_info *coal)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_intr_coal_info *intr_coal;
+	struct hinic3_interrupt_info info = {};
+	int err;
+
+	intr_coal = &nic_dev->intr_coalesce[q_id];
+
+	intr_coal->coalesce_timer_cfg = coal->coalesce_timer_cfg;
+	intr_coal->pending_limit = coal->pending_limit;
+	intr_coal->rx_pending_limit_low = coal->rx_pending_limit_low;
+	intr_coal->rx_pending_limit_high = coal->rx_pending_limit_high;
+
+	if (!test_bit(HINIC3_INTF_UP, &nic_dev->flags) ||
+	    q_id >= nic_dev->q_params.num_qps || nic_dev->adaptive_rx_coal)
+		return 0;
+
+	info.msix_index = nic_dev->q_params.irq_cfg[q_id].msix_entry_idx;
+	info.interrupt_coalesc_set = 1;
+	info.coalesc_timer_cfg = intr_coal->coalesce_timer_cfg;
+	info.pending_limit = intr_coal->pending_limit;
+	info.resend_timer_cfg = intr_coal->resend_timer_cfg;
+	err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
+	if (err) {
+		netdev_warn(netdev, "Failed to set queue%u coalesce\n", q_id);
+		return err;
+	}
+
+	return 0;
+}
+
+static int is_coalesce_exceed_limit(struct net_device *netdev,
+				    const struct ethtool_coalesce *coal)
+{
+	const struct {
+		const char *name;
+		u32 value;
+		u32 limit;
+	} coalesce_limits[] = {
+		{"rx_coalesce_usecs",
+		 coal->rx_coalesce_usecs,
+		 COALESCE_MAX_TIMER_CFG},
+		{"rx_max_coalesced_frames",
+		 coal->rx_max_coalesced_frames,
+		 COALESCE_MAX_PENDING_LIMIT},
+		{"rx_max_coalesced_frames_low",
+		 coal->rx_max_coalesced_frames_low,
+		 COALESCE_MAX_PENDING_LIMIT},
+		{"rx_max_coalesced_frames_high",
+		 coal->rx_max_coalesced_frames_high,
+		 COALESCE_MAX_PENDING_LIMIT},
+	};
+
+	for (int i = 0; i < ARRAY_SIZE(coalesce_limits); i++) {
+		if (coalesce_limits[i].value > coalesce_limits[i].limit) {
+			netdev_err(netdev, "%s out of range %d-%d\n",
+				   coalesce_limits[i].name, 0,
+				   coalesce_limits[i].limit);
+			return -ERANGE;
+		}
+	}
+	return 0;
+}
+
+static int is_coalesce_legal(struct net_device *netdev,
+			     const struct ethtool_coalesce *coal)
+{
+	int err;
+
+	err = is_coalesce_exceed_limit(netdev, coal);
+	if (err)
+		return err;
+
+	if (coal->rx_max_coalesced_frames_low >
+	    coal->rx_max_coalesced_frames_high) {
+		netdev_err(netdev, "invalid coalesce frame high %u, low %u, unit %d\n",
+			   coal->rx_max_coalesced_frames_high,
+			   coal->rx_max_coalesced_frames_low,
+			   COALESCE_PENDING_LIMIT_UNIT);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void check_coalesce_align(struct net_device *netdev,
+				 u32 item, u32 unit, const char *str)
+{
+	if (item % unit)
+		netdev_warn(netdev, "%s in %d units, change to %u\n",
+			    str, unit, item - item % unit);
+}
+
+#define CHECK_COALESCE_ALIGN(member, unit) \
+	check_coalesce_align(netdev, member, unit, #member)
+
+static void check_coalesce_changed(struct net_device *netdev,
+				   u32 item, u32 unit, u32 ori_val,
+				   const char *obj_str, const char *str)
+{
+	if ((item / unit) != ori_val)
+		netdev_dbg(netdev, "Change %s from %d to %u %s\n",
+			   str, ori_val * unit, item - item % unit, obj_str);
+}
+
+#define CHECK_COALESCE_CHANGED(member, unit, ori_val, obj_str) \
+	check_coalesce_changed(netdev, member, unit, ori_val, obj_str, #member)
+
+static int hinic3_set_hw_coal_param(struct net_device *netdev,
+				    struct hinic3_intr_coal_info *intr_coal)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	int err;
+	u16 i;
+
+	for (i = 0; i < nic_dev->max_qps; i++) {
+		err = hinic3_set_queue_coalesce(netdev, i, intr_coal);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int hinic3_get_coalesce(struct net_device *netdev,
+			       struct ethtool_coalesce *coal,
+			       struct kernel_ethtool_coalesce *kernel_coal,
+			       struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_intr_coal_info *interrupt_info;
+
+	interrupt_info = &nic_dev->intr_coalesce[0];
+
+	/* TX/RX uses the same interrupt.
+	 * So we only declare RX ethtool_coalesce parameters.
+	 */
+	coal->rx_coalesce_usecs = interrupt_info->coalesce_timer_cfg *
+				  COALESCE_TIMER_CFG_UNIT;
+	coal->rx_max_coalesced_frames = interrupt_info->pending_limit *
+					COALESCE_PENDING_LIMIT_UNIT;
+
+	coal->use_adaptive_rx_coalesce = nic_dev->adaptive_rx_coal;
+
+	coal->rx_max_coalesced_frames_high =
+		interrupt_info->rx_pending_limit_high *
+		COALESCE_PENDING_LIMIT_UNIT;
+
+	coal->rx_max_coalesced_frames_low =
+		interrupt_info->rx_pending_limit_low *
+		COALESCE_PENDING_LIMIT_UNIT;
+
+	return 0;
+}
+
+static int hinic3_set_coalesce(struct net_device *netdev,
+			       struct ethtool_coalesce *coal,
+			       struct kernel_ethtool_coalesce *kernel_coal,
+			       struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_intr_coal_info *ori_intr_coal;
+	struct hinic3_intr_coal_info intr_coal = {};
+	char obj_str[32];
+	int err;
+
+	err = is_coalesce_legal(netdev, coal);
+	if (err)
+		return err;
+
+	CHECK_COALESCE_ALIGN(coal->rx_coalesce_usecs, COALESCE_TIMER_CFG_UNIT);
+	CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames,
+			     COALESCE_PENDING_LIMIT_UNIT);
+	CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames_high,
+			     COALESCE_PENDING_LIMIT_UNIT);
+	CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames_low,
+			     COALESCE_PENDING_LIMIT_UNIT);
+
+	ori_intr_coal = &nic_dev->intr_coalesce[0];
+	snprintf(obj_str, sizeof(obj_str), "for netdev");
+
+	CHECK_COALESCE_CHANGED(coal->rx_coalesce_usecs, COALESCE_TIMER_CFG_UNIT,
+			       ori_intr_coal->coalesce_timer_cfg, obj_str);
+	CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames,
+			       COALESCE_PENDING_LIMIT_UNIT,
+			       ori_intr_coal->pending_limit, obj_str);
+	CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames_high,
+			       COALESCE_PENDING_LIMIT_UNIT,
+			       ori_intr_coal->rx_pending_limit_high, obj_str);
+	CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames_low,
+			       COALESCE_PENDING_LIMIT_UNIT,
+			       ori_intr_coal->rx_pending_limit_low, obj_str);
+
+	intr_coal.coalesce_timer_cfg =
+		(u8)(coal->rx_coalesce_usecs / COALESCE_TIMER_CFG_UNIT);
+	intr_coal.pending_limit = (u8)(coal->rx_max_coalesced_frames /
+				      COALESCE_PENDING_LIMIT_UNIT);
+
+	nic_dev->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+
+	intr_coal.rx_pending_limit_high =
+		(u8)(coal->rx_max_coalesced_frames_high /
+		     COALESCE_PENDING_LIMIT_UNIT);
+
+	intr_coal.rx_pending_limit_low =
+		(u8)(coal->rx_max_coalesced_frames_low /
+		     COALESCE_PENDING_LIMIT_UNIT);
+
+	/* coalesce timer or pending set to zero will disable coalesce */
+	if (!nic_dev->adaptive_rx_coal &&
+	    (!intr_coal.coalesce_timer_cfg || !intr_coal.pending_limit))
+		netdev_warn(netdev, "Coalesce will be disabled\n");
+
+	return hinic3_set_hw_coal_param(netdev, &intr_coal);
+}
+
 static const struct ethtool_ops hinic3_ethtool_ops = {
-	.supported_coalesce_params      = ETHTOOL_COALESCE_USECS |
-					  ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
+	.supported_coalesce_params      = ETHTOOL_COALESCE_RX_USECS |
+					  ETHTOOL_COALESCE_RX_MAX_FRAMES |
+					  ETHTOOL_COALESCE_USE_ADAPTIVE_RX |
+					  ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW |
+					  ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH,
 	.get_link_ksettings             = hinic3_get_link_ksettings,
 	.get_drvinfo                    = hinic3_get_drvinfo,
 	.get_msglevel                   = hinic3_get_msglevel,
@@ -1003,6 +1229,8 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
 	.get_eth_ctrl_stats             = hinic3_get_eth_ctrl_stats,
 	.get_rmon_stats                 = hinic3_get_rmon_stats,
 	.get_pause_stats                = hinic3_get_pause_stats,
+	.get_coalesce                   = hinic3_get_coalesce,
+	.set_coalesce                   = hinic3_set_coalesce,
 };
 
 void hinic3_set_ethtool_ops(struct net_device *netdev)
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v04 5/6] hinic3: Configure netdev->watchdog_timeo to set nic tx timeout
From: Fan Gong @ 2026-04-08  4:03 UTC (permalink / raw)
  To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
	Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775618797.git.zhuyikai1@h-partners.com>

  Configure netdev watchdog timeout to improve transmission reliability.

Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 drivers/net/ethernet/huawei/hinic3/hinic3_main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
index 3b470978714a..7e09b4b2da9f 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
@@ -33,6 +33,8 @@
 #define HINIC3_RX_PENDING_LIMIT_LOW   2
 #define HINIC3_RX_PENDING_LIMIT_HIGH  8
 
+#define HINIC3_WATCHDOG_TIMEOUT       5
+
 static void init_intr_coal_param(struct net_device *netdev)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
@@ -246,6 +248,8 @@ static void hinic3_assign_netdev_ops(struct net_device *netdev)
 {
 	hinic3_set_netdev_ops(netdev);
 	hinic3_set_ethtool_ops(netdev);
+
+	netdev->watchdog_timeo = HINIC3_WATCHDOG_TIMEOUT * HZ;
 }
 
 static void netdev_feature_init(struct net_device *netdev)
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v04 2/6] hinic3: Add ethtool statistic ops
From: Fan Gong @ 2026-04-08  4:03 UTC (permalink / raw)
  To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
	Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775618797.git.zhuyikai1@h-partners.com>

  Add PF/VF statistics functions in TX and RX processing.
  Implement following ethtool callback function:
.get_sset_count
.get_ethtool_stats
.get_strings
.get_eth_phy_stats
.get_eth_mac_stats
.get_eth_ctrl_stats
.get_rmon_stats
.get_pause_stats

  These callbacks allow users to utilize ethtool for detailed
TX and RX netdev stats monitoring.

Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 .../ethernet/huawei/hinic3/hinic3_ethtool.c   | 485 ++++++++++++++++++
 .../ethernet/huawei/hinic3/hinic3_hw_intf.h   |  13 +-
 .../huawei/hinic3/hinic3_mgmt_interface.h     |  37 ++
 .../ethernet/huawei/hinic3/hinic3_nic_cfg.c   |  64 +++
 .../ethernet/huawei/hinic3/hinic3_nic_cfg.h   | 109 ++++
 .../net/ethernet/huawei/hinic3/hinic3_rx.c    |  59 ++-
 .../net/ethernet/huawei/hinic3/hinic3_rx.h    |  15 +-
 .../net/ethernet/huawei/hinic3/hinic3_tx.c    |  71 ++-
 .../net/ethernet/huawei/hinic3/hinic3_tx.h    |   2 +
 9 files changed, 845 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index e47c3f43e7b9..be26698fc658 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -508,6 +508,483 @@ static int hinic3_set_ringparam(struct net_device *netdev,
 	return 0;
 }
 
+struct hinic3_stats {
+	char name[ETH_GSTRING_LEN];
+	u32  size;
+	int  offset;
+};
+
+#define HINIC3_RXQ_STAT(_stat_item) { \
+	.name   = "rxq%d_"#_stat_item, \
+	.size   = sizeof_field(struct hinic3_rxq_stats, _stat_item), \
+	.offset = offsetof(struct hinic3_rxq_stats, _stat_item) \
+}
+
+#define HINIC3_TXQ_STAT(_stat_item) { \
+	.name   = "txq%d_"#_stat_item, \
+	.size   = sizeof_field(struct hinic3_txq_stats, _stat_item), \
+	.offset = offsetof(struct hinic3_txq_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_rx_queue_stats[] = {
+	HINIC3_RXQ_STAT(csum_errors),
+	HINIC3_RXQ_STAT(other_errors),
+	HINIC3_RXQ_STAT(rx_buf_empty),
+	HINIC3_RXQ_STAT(alloc_skb_err),
+	HINIC3_RXQ_STAT(alloc_rx_buf_err),
+};
+
+static struct hinic3_stats hinic3_tx_queue_stats[] = {
+	HINIC3_TXQ_STAT(busy),
+	HINIC3_TXQ_STAT(skb_pad_err),
+	HINIC3_TXQ_STAT(frag_len_overflow),
+	HINIC3_TXQ_STAT(offload_cow_skb_err),
+	HINIC3_TXQ_STAT(map_frag_err),
+	HINIC3_TXQ_STAT(unknown_tunnel_pkt),
+	HINIC3_TXQ_STAT(frag_size_err),
+};
+
+#define HINIC3_FUNC_STAT(_stat_item) {	\
+	.name   = #_stat_item, \
+	.size   = sizeof_field(struct l2nic_vport_stats, _stat_item), \
+	.offset = offsetof(struct l2nic_vport_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_function_stats[] = {
+	HINIC3_FUNC_STAT(tx_unicast_pkts_vport),
+	HINIC3_FUNC_STAT(tx_unicast_bytes_vport),
+	HINIC3_FUNC_STAT(tx_multicast_pkts_vport),
+	HINIC3_FUNC_STAT(tx_multicast_bytes_vport),
+	HINIC3_FUNC_STAT(tx_broadcast_pkts_vport),
+	HINIC3_FUNC_STAT(tx_broadcast_bytes_vport),
+
+	HINIC3_FUNC_STAT(rx_unicast_pkts_vport),
+	HINIC3_FUNC_STAT(rx_unicast_bytes_vport),
+	HINIC3_FUNC_STAT(rx_multicast_pkts_vport),
+	HINIC3_FUNC_STAT(rx_multicast_bytes_vport),
+	HINIC3_FUNC_STAT(rx_broadcast_pkts_vport),
+	HINIC3_FUNC_STAT(rx_broadcast_bytes_vport),
+
+	HINIC3_FUNC_STAT(tx_discard_vport),
+	HINIC3_FUNC_STAT(rx_discard_vport),
+	HINIC3_FUNC_STAT(tx_err_vport),
+	HINIC3_FUNC_STAT(rx_err_vport),
+};
+
+#define HINIC3_PORT_STAT(_stat_item) { \
+	.name   = #_stat_item, \
+	.size   = sizeof_field(struct mag_cmd_port_stats, _stat_item), \
+	.offset = offsetof(struct mag_cmd_port_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_port_stats[] = {
+	HINIC3_PORT_STAT(mac_tx_fragment_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_undersize_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_undermin_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_1519_max_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_1519_max_good_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_oversize_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_jabber_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_bad_oct_num),
+	HINIC3_PORT_STAT(mac_tx_good_oct_num),
+	HINIC3_PORT_STAT(mac_tx_total_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_uni_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri0_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri1_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri2_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri3_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri4_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri5_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri6_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_pfc_pri7_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_err_all_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_from_app_good_pkt_num),
+	HINIC3_PORT_STAT(mac_tx_from_app_bad_pkt_num),
+
+	HINIC3_PORT_STAT(mac_rx_undermin_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_1519_max_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_1519_max_good_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_bad_oct_num),
+	HINIC3_PORT_STAT(mac_rx_good_oct_num),
+	HINIC3_PORT_STAT(mac_rx_total_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_uni_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri0_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri1_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri2_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri3_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri4_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri5_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri6_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_pfc_pri7_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_send_app_good_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_send_app_bad_pkt_num),
+	HINIC3_PORT_STAT(mac_rx_unfilter_pkt_num),
+};
+
+static int hinic3_get_sset_count(struct net_device *netdev, int sset)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	int count, q_num;
+
+	switch (sset) {
+	case ETH_SS_STATS:
+		q_num = nic_dev->q_params.num_qps;
+		count = ARRAY_SIZE(hinic3_function_stats) +
+			(ARRAY_SIZE(hinic3_tx_queue_stats) +
+			 ARRAY_SIZE(hinic3_rx_queue_stats)) *
+			q_num;
+
+		if (!HINIC3_IS_VF(nic_dev->hwdev))
+			count += ARRAY_SIZE(hinic3_port_stats);
+
+		return count;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static u64 get_val_of_ptr(u32 size, const void *ptr)
+{
+	u64 ret = size == sizeof(u64) ? *(u64 *)ptr :
+		  size == sizeof(u32) ? *(u32 *)ptr :
+		  size == sizeof(u16) ? *(u16 *)ptr :
+		  *(u8 *)ptr;
+
+	return ret;
+}
+
+static void hinic3_get_drv_queue_stats(struct net_device *netdev, u64 *data)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_txq_stats txq_stats = {};
+	struct hinic3_rxq_stats rxq_stats = {};
+	u16 i = 0, j, qid;
+	char *p;
+
+	u64_stats_init(&txq_stats.syncp);
+	u64_stats_init(&rxq_stats.syncp);
+
+	for (qid = 0; qid < nic_dev->q_params.num_qps; qid++) {
+		if (!nic_dev->txqs)
+			break;
+
+		hinic3_txq_get_stats(&nic_dev->txqs[qid], &txq_stats);
+		for (j = 0; j < ARRAY_SIZE(hinic3_tx_queue_stats); j++, i++) {
+			p = (char *)&txq_stats +
+			    hinic3_tx_queue_stats[j].offset;
+			data[i] = get_val_of_ptr(hinic3_tx_queue_stats[j].size,
+						 p);
+		}
+	}
+
+	for (qid = 0; qid < nic_dev->q_params.num_qps; qid++) {
+		if (!nic_dev->rxqs)
+			break;
+
+		hinic3_rxq_get_stats(&nic_dev->rxqs[qid], &rxq_stats);
+		for (j = 0; j < ARRAY_SIZE(hinic3_rx_queue_stats); j++, i++) {
+			p = (char *)&rxq_stats +
+			    hinic3_rx_queue_stats[j].offset;
+			data[i] = get_val_of_ptr(hinic3_rx_queue_stats[j].size,
+						 p);
+		}
+	}
+}
+
+static u16 hinic3_get_ethtool_port_stats(struct net_device *netdev, u64 *data)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	u16 i = 0, j;
+	char *p;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		goto err_zero_stats;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get port stats from fw\n");
+		goto err_zero_stats;
+	}
+
+	for (j = 0; j < ARRAY_SIZE(hinic3_port_stats); j++, i++) {
+		p = (char *)ps + hinic3_port_stats[j].offset;
+		data[i] = get_val_of_ptr(hinic3_port_stats[j].size, p);
+	}
+
+	kfree(ps);
+
+	return i;
+
+err_zero_stats:
+	memset(&data[i], 0, ARRAY_SIZE(hinic3_port_stats) * sizeof(*data));
+
+	return i + ARRAY_SIZE(hinic3_port_stats);
+}
+
+static void hinic3_get_ethtool_stats(struct net_device *netdev,
+				     struct ethtool_stats *stats, u64 *data)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct l2nic_vport_stats vport_stats = {};
+	u16 i = 0, j;
+	char *p;
+	int err;
+
+	err = hinic3_get_vport_stats(nic_dev->hwdev,
+				     hinic3_global_func_id(nic_dev->hwdev),
+				     &vport_stats);
+	if (err)
+		netdev_err(netdev, "Failed to get function stats from fw\n");
+
+	for (j = 0; j < ARRAY_SIZE(hinic3_function_stats); j++, i++) {
+		p = (char *)&vport_stats + hinic3_function_stats[j].offset;
+		data[i] = get_val_of_ptr(hinic3_function_stats[j].size, p);
+	}
+
+	if (!HINIC3_IS_VF(nic_dev->hwdev))
+		i += hinic3_get_ethtool_port_stats(netdev, data + i);
+
+	hinic3_get_drv_queue_stats(netdev, data + i);
+}
+
+static u16 hinic3_get_hw_stats_strings(struct net_device *netdev, char *p)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	u16 i, cnt = 0;
+
+	for (i = 0; i < ARRAY_SIZE(hinic3_function_stats); i++) {
+		memcpy(p, hinic3_function_stats[i].name, ETH_GSTRING_LEN);
+		p += ETH_GSTRING_LEN;
+		cnt++;
+	}
+
+	if (!HINIC3_IS_VF(nic_dev->hwdev)) {
+		for (i = 0; i < ARRAY_SIZE(hinic3_port_stats); i++) {
+			memcpy(p, hinic3_port_stats[i].name, ETH_GSTRING_LEN);
+			p += ETH_GSTRING_LEN;
+			cnt++;
+		}
+	}
+
+	return cnt;
+}
+
+static void hinic3_get_qp_stats_strings(struct net_device *netdev, char *p)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	u8 *data = p;
+	u16 i, j;
+
+	for (i = 0; i < nic_dev->q_params.num_qps; i++) {
+		for (j = 0; j < ARRAY_SIZE(hinic3_tx_queue_stats); j++)
+			ethtool_sprintf(&data,
+					hinic3_tx_queue_stats[j].name, i);
+	}
+
+	for (i = 0; i < nic_dev->q_params.num_qps; i++) {
+		for (j = 0; j < ARRAY_SIZE(hinic3_rx_queue_stats); j++)
+			ethtool_sprintf(&data,
+					hinic3_rx_queue_stats[j].name, i);
+	}
+}
+
+static void hinic3_get_strings(struct net_device *netdev,
+			       u32 stringset, u8 *data)
+{
+	char *p = (char *)data;
+	u16 offset;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		offset = hinic3_get_hw_stats_strings(netdev, p);
+		hinic3_get_qp_stats_strings(netdev,
+					    p + offset * ETH_GSTRING_LEN);
+
+		return;
+	default:
+		netdev_err(netdev, "Invalid string set %u.\n", stringset);
+		return;
+	}
+}
+
+static void hinic3_get_eth_phy_stats(struct net_device *netdev,
+				     struct ethtool_eth_phy_stats *phy_stats)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth phy stats from fw\n");
+		return;
+	}
+
+	phy_stats->SymbolErrorDuringCarrier = ps->mac_rx_sym_err_pkt_num;
+
+	kfree(ps);
+}
+
+static void hinic3_get_eth_mac_stats(struct net_device *netdev,
+				     struct ethtool_eth_mac_stats *mac_stats)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth mac stats from fw\n");
+		return;
+	}
+
+	mac_stats->FramesTransmittedOK = ps->mac_tx_good_pkt_num;
+	mac_stats->FramesReceivedOK = ps->mac_rx_good_pkt_num;
+	mac_stats->FrameCheckSequenceErrors = ps->mac_rx_fcs_err_pkt_num;
+	mac_stats->OctetsTransmittedOK = ps->mac_tx_total_oct_num;
+	mac_stats->OctetsReceivedOK = ps->mac_rx_total_oct_num;
+	mac_stats->MulticastFramesXmittedOK = ps->mac_tx_multi_pkt_num;
+	mac_stats->BroadcastFramesXmittedOK = ps->mac_tx_broad_pkt_num;
+	mac_stats->MulticastFramesReceivedOK = ps->mac_rx_multi_pkt_num;
+	mac_stats->BroadcastFramesReceivedOK = ps->mac_rx_broad_pkt_num;
+
+	kfree(ps);
+}
+
+static void hinic3_get_eth_ctrl_stats(struct net_device *netdev,
+				      struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth ctrl stats from fw\n");
+		return;
+	}
+
+	ctrl_stats->MACControlFramesTransmitted = ps->mac_tx_control_pkt_num;
+	ctrl_stats->MACControlFramesReceived = ps->mac_rx_control_pkt_num;
+
+	kfree(ps);
+}
+
+static const struct ethtool_rmon_hist_range hinic3_rmon_ranges[] = {
+	{     0,    64 },
+	{    65,   127 },
+	{   128,   255 },
+	{   256,   511 },
+	{   512,  1023 },
+	{  1024,  1518 },
+	{  1519,  2047 },
+	{  2048,  4095 },
+	{  4096,  8191 },
+	{  8192,  9216 },
+	{  9217, 12287 },
+	{}
+};
+
+static void hinic3_get_rmon_stats(struct net_device *netdev,
+				  struct ethtool_rmon_stats *rmon_stats,
+				  const struct ethtool_rmon_hist_range **ranges)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth rmon stats from fw\n");
+		return;
+	}
+
+	rmon_stats->undersize_pkts	= ps->mac_rx_undersize_pkt_num;
+	rmon_stats->oversize_pkts	= ps->mac_rx_oversize_pkt_num;
+	rmon_stats->fragments		= ps->mac_rx_fragment_pkt_num;
+	rmon_stats->jabbers		= ps->mac_rx_jabber_pkt_num;
+
+	rmon_stats->hist[0]		= ps->mac_rx_64_oct_pkt_num;
+	rmon_stats->hist[1]		= ps->mac_rx_65_127_oct_pkt_num;
+	rmon_stats->hist[2]		= ps->mac_rx_128_255_oct_pkt_num;
+	rmon_stats->hist[3]		= ps->mac_rx_256_511_oct_pkt_num;
+	rmon_stats->hist[4]		= ps->mac_rx_512_1023_oct_pkt_num;
+	rmon_stats->hist[5]		= ps->mac_rx_1024_1518_oct_pkt_num;
+	rmon_stats->hist[6]		= ps->mac_rx_1519_2047_oct_pkt_num;
+	rmon_stats->hist[7]		= ps->mac_rx_2048_4095_oct_pkt_num;
+	rmon_stats->hist[8]		= ps->mac_rx_4096_8191_oct_pkt_num;
+	rmon_stats->hist[9]		= ps->mac_rx_8192_9216_oct_pkt_num;
+	rmon_stats->hist[10]		= ps->mac_rx_9217_12287_oct_pkt_num;
+
+	rmon_stats->hist_tx[0]		= ps->mac_tx_64_oct_pkt_num;
+	rmon_stats->hist_tx[1]		= ps->mac_tx_65_127_oct_pkt_num;
+	rmon_stats->hist_tx[2]		= ps->mac_tx_128_255_oct_pkt_num;
+	rmon_stats->hist_tx[3]		= ps->mac_tx_256_511_oct_pkt_num;
+	rmon_stats->hist_tx[4]		= ps->mac_tx_512_1023_oct_pkt_num;
+	rmon_stats->hist_tx[5]		= ps->mac_tx_1024_1518_oct_pkt_num;
+	rmon_stats->hist_tx[6]		= ps->mac_tx_1519_2047_oct_pkt_num;
+	rmon_stats->hist_tx[7]		= ps->mac_tx_2048_4095_oct_pkt_num;
+	rmon_stats->hist_tx[8]		= ps->mac_tx_4096_8191_oct_pkt_num;
+	rmon_stats->hist_tx[9]		= ps->mac_tx_8192_9216_oct_pkt_num;
+	rmon_stats->hist_tx[10]		= ps->mac_tx_9217_12287_oct_pkt_num;
+
+	*ranges = hinic3_rmon_ranges;
+
+	kfree(ps);
+}
+
+static void hinic3_get_pause_stats(struct net_device *netdev,
+				   struct ethtool_pause_stats *pause_stats)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct mag_cmd_port_stats *ps;
+	int err;
+
+	ps = kmalloc_obj(*ps);
+	if (!ps)
+		return;
+
+	err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+	if (err) {
+		kfree(ps);
+		netdev_err(netdev, "Failed to get eth pause stats from fw\n");
+		return;
+	}
+
+	pause_stats->tx_pause_frames = ps->mac_tx_pause_num;
+	pause_stats->rx_pause_frames = ps->mac_rx_pause_num;
+
+	kfree(ps);
+}
+
 static const struct ethtool_ops hinic3_ethtool_ops = {
 	.supported_coalesce_params      = ETHTOOL_COALESCE_USECS |
 					  ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
@@ -518,6 +995,14 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
 	.get_link                       = ethtool_op_get_link,
 	.get_ringparam                  = hinic3_get_ringparam,
 	.set_ringparam                  = hinic3_set_ringparam,
+	.get_sset_count                 = hinic3_get_sset_count,
+	.get_ethtool_stats              = hinic3_get_ethtool_stats,
+	.get_strings                    = hinic3_get_strings,
+	.get_eth_phy_stats              = hinic3_get_eth_phy_stats,
+	.get_eth_mac_stats              = hinic3_get_eth_mac_stats,
+	.get_eth_ctrl_stats             = hinic3_get_eth_ctrl_stats,
+	.get_rmon_stats                 = hinic3_get_rmon_stats,
+	.get_pause_stats                = hinic3_get_pause_stats,
 };
 
 void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h b/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
index cfc9daa3034f..0b2ebef04c02 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
@@ -51,7 +51,18 @@ static inline void mgmt_msg_params_init_default(struct mgmt_msg_params *msg_para
 	msg_params->in_size = buf_size;
 	msg_params->expected_out_size = buf_size;
 	msg_params->timeout_ms = 0;
-}
+};
+
+static inline void
+mgmt_msg_params_init_in_out(struct mgmt_msg_params *msg_params, void *in_buf,
+			    void *out_buf, u32 in_buf_size, u32 out_buf_size)
+{
+	msg_params->buf_in = in_buf;
+	msg_params->buf_out = out_buf;
+	msg_params->in_size = in_buf_size;
+	msg_params->expected_out_size = out_buf_size;
+	msg_params->timeout_ms = 0;
+};
 
 enum cfg_cmd {
 	CFG_CMD_GET_DEV_CAP = 0,
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
index c5bca3c4af96..76c691f82703 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
@@ -143,6 +143,41 @@ struct l2nic_cmd_set_dcb_state {
 	u8                   rsvd[7];
 };
 
+struct l2nic_port_stats_info {
+	struct mgmt_msg_head msg_head;
+	u16                  func_id;
+	u16                  rsvd1;
+};
+
+struct l2nic_vport_stats {
+	u64 tx_unicast_pkts_vport;
+	u64 tx_unicast_bytes_vport;
+	u64 tx_multicast_pkts_vport;
+	u64 tx_multicast_bytes_vport;
+	u64 tx_broadcast_pkts_vport;
+	u64 tx_broadcast_bytes_vport;
+
+	u64 rx_unicast_pkts_vport;
+	u64 rx_unicast_bytes_vport;
+	u64 rx_multicast_pkts_vport;
+	u64 rx_multicast_bytes_vport;
+	u64 rx_broadcast_pkts_vport;
+	u64 rx_broadcast_bytes_vport;
+
+	u64 tx_discard_vport;
+	u64 rx_discard_vport;
+	u64 tx_err_vport;
+	u64 rx_err_vport;
+};
+
+struct l2nic_cmd_vport_stats {
+	struct mgmt_msg_head     msg_head;
+	u32                      stats_size;
+	u32                      rsvd1;
+	struct l2nic_vport_stats stats;
+	u64                      rsvd2[6];
+};
+
 struct l2nic_cmd_lro_config {
 	struct mgmt_msg_head msg_head;
 	u16                  func_id;
@@ -234,6 +269,7 @@ enum l2nic_cmd {
 	L2NIC_CMD_SET_VPORT_ENABLE    = 6,
 	L2NIC_CMD_SET_RX_MODE         = 7,
 	L2NIC_CMD_SET_SQ_CI_ATTR      = 8,
+	L2NIC_CMD_GET_VPORT_STAT      = 9,
 	L2NIC_CMD_CLEAR_QP_RESOURCE   = 11,
 	L2NIC_CMD_CFG_RX_LRO          = 13,
 	L2NIC_CMD_CFG_LRO_TIMER       = 14,
@@ -272,6 +308,7 @@ enum mag_cmd {
 	MAG_CMD_SET_PORT_ENABLE = 6,
 	MAG_CMD_GET_LINK_STATUS = 7,
 
+	MAG_CMD_GET_PORT_STAT   = 151,
 	MAG_CMD_GET_PORT_INFO   = 153,
 };
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
index de5a7984d2cb..1b14dc824ce1 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
@@ -639,6 +639,42 @@ int hinic3_get_link_status(struct hinic3_hwdev *hwdev, bool *link_status_up)
 	return 0;
 }
 
+int hinic3_get_phy_port_stats(struct hinic3_hwdev *hwdev,
+			      struct mag_cmd_port_stats *stats)
+{
+	struct mag_cmd_port_stats_info stats_info = {};
+	struct mag_cmd_get_port_stat *ps;
+	struct mgmt_msg_params msg_params = {};
+	int err;
+
+	ps = kzalloc_obj(*ps);
+	if (!ps)
+		return -ENOMEM;
+
+	stats_info.port_id = hinic3_physical_port_id(hwdev);
+
+	mgmt_msg_params_init_in_out(&msg_params, &stats_info, ps,
+				    sizeof(stats_info), sizeof(*ps));
+
+	err = hinic3_send_mbox_to_mgmt(hwdev, MGMT_MOD_HILINK,
+				       MAG_CMD_GET_PORT_STAT, &msg_params);
+
+	if (err || ps->head.status) {
+		dev_err(hwdev->dev,
+			"Failed to get port statistics, err: %d, status: 0x%x\n",
+			err, ps->head.status);
+		err = -EFAULT;
+		goto out;
+	}
+
+	memcpy(stats, &ps->counter, sizeof(*stats));
+
+out:
+	kfree(ps);
+
+	return err;
+}
+
 int hinic3_get_port_info(struct hinic3_hwdev *hwdev,
 			 struct hinic3_nic_port_info *port_info)
 {
@@ -738,3 +774,31 @@ int hinic3_get_pause_info(struct hinic3_nic_dev *nic_dev,
 	return hinic3_cfg_hw_pause(nic_dev->hwdev, MGMT_MSG_CMD_OP_GET,
 				   nic_pause);
 }
+
+int hinic3_get_vport_stats(struct hinic3_hwdev *hwdev, u16 func_id,
+			   struct l2nic_vport_stats *stats)
+{
+	struct l2nic_cmd_vport_stats vport_stats = {};
+	struct l2nic_port_stats_info stats_info = {};
+	struct mgmt_msg_params msg_params = {};
+	int err;
+
+	stats_info.func_id = func_id;
+
+	mgmt_msg_params_init_in_out(&msg_params, &stats_info, &vport_stats,
+				    sizeof(stats_info), sizeof(vport_stats));
+
+	err = hinic3_send_mbox_to_mgmt(hwdev, MGMT_MOD_L2NIC,
+				       L2NIC_CMD_GET_VPORT_STAT, &msg_params);
+
+	if (err || vport_stats.msg_head.status) {
+		dev_err(hwdev->dev,
+			"Failed to get function statistics, err: %d, status: 0x%x\n",
+			err, vport_stats.msg_head.status);
+		return -EFAULT;
+	}
+
+	memcpy(stats, &vport_stats.stats, sizeof(*stats));
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
index 5d52202a8d4e..80573c121539 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
@@ -129,6 +129,110 @@ struct mag_cmd_get_xsfp_present {
 	u8                   rsvd[2];
 };
 
+struct mag_cmd_port_stats {
+	u64 mac_tx_fragment_pkt_num;
+	u64 mac_tx_undersize_pkt_num;
+	u64 mac_tx_undermin_pkt_num;
+	u64 mac_tx_64_oct_pkt_num;
+	u64 mac_tx_65_127_oct_pkt_num;
+	u64 mac_tx_128_255_oct_pkt_num;
+	u64 mac_tx_256_511_oct_pkt_num;
+	u64 mac_tx_512_1023_oct_pkt_num;
+	u64 mac_tx_1024_1518_oct_pkt_num;
+	u64 mac_tx_1519_2047_oct_pkt_num;
+	u64 mac_tx_2048_4095_oct_pkt_num;
+	u64 mac_tx_4096_8191_oct_pkt_num;
+	u64 mac_tx_8192_9216_oct_pkt_num;
+	u64 mac_tx_9217_12287_oct_pkt_num;
+	u64 mac_tx_12288_16383_oct_pkt_num;
+	u64 mac_tx_1519_max_bad_pkt_num;
+	u64 mac_tx_1519_max_good_pkt_num;
+	u64 mac_tx_oversize_pkt_num;
+	u64 mac_tx_jabber_pkt_num;
+	u64 mac_tx_bad_pkt_num;
+	u64 mac_tx_bad_oct_num;
+	u64 mac_tx_good_pkt_num;
+	u64 mac_tx_good_oct_num;
+	u64 mac_tx_total_pkt_num;
+	u64 mac_tx_total_oct_num;
+	u64 mac_tx_uni_pkt_num;
+	u64 mac_tx_multi_pkt_num;
+	u64 mac_tx_broad_pkt_num;
+	u64 mac_tx_pause_num;
+	u64 mac_tx_pfc_pkt_num;
+	u64 mac_tx_pfc_pri0_pkt_num;
+	u64 mac_tx_pfc_pri1_pkt_num;
+	u64 mac_tx_pfc_pri2_pkt_num;
+	u64 mac_tx_pfc_pri3_pkt_num;
+	u64 mac_tx_pfc_pri4_pkt_num;
+	u64 mac_tx_pfc_pri5_pkt_num;
+	u64 mac_tx_pfc_pri6_pkt_num;
+	u64 mac_tx_pfc_pri7_pkt_num;
+	u64 mac_tx_control_pkt_num;
+	u64 mac_tx_err_all_pkt_num;
+	u64 mac_tx_from_app_good_pkt_num;
+	u64 mac_tx_from_app_bad_pkt_num;
+
+	u64 mac_rx_fragment_pkt_num;
+	u64 mac_rx_undersize_pkt_num;
+	u64 mac_rx_undermin_pkt_num;
+	u64 mac_rx_64_oct_pkt_num;
+	u64 mac_rx_65_127_oct_pkt_num;
+	u64 mac_rx_128_255_oct_pkt_num;
+	u64 mac_rx_256_511_oct_pkt_num;
+	u64 mac_rx_512_1023_oct_pkt_num;
+	u64 mac_rx_1024_1518_oct_pkt_num;
+	u64 mac_rx_1519_2047_oct_pkt_num;
+	u64 mac_rx_2048_4095_oct_pkt_num;
+	u64 mac_rx_4096_8191_oct_pkt_num;
+	u64 mac_rx_8192_9216_oct_pkt_num;
+	u64 mac_rx_9217_12287_oct_pkt_num;
+	u64 mac_rx_12288_16383_oct_pkt_num;
+	u64 mac_rx_1519_max_bad_pkt_num;
+	u64 mac_rx_1519_max_good_pkt_num;
+	u64 mac_rx_oversize_pkt_num;
+	u64 mac_rx_jabber_pkt_num;
+	u64 mac_rx_bad_pkt_num;
+	u64 mac_rx_bad_oct_num;
+	u64 mac_rx_good_pkt_num;
+	u64 mac_rx_good_oct_num;
+	u64 mac_rx_total_pkt_num;
+	u64 mac_rx_total_oct_num;
+	u64 mac_rx_uni_pkt_num;
+	u64 mac_rx_multi_pkt_num;
+	u64 mac_rx_broad_pkt_num;
+	u64 mac_rx_pause_num;
+	u64 mac_rx_pfc_pkt_num;
+	u64 mac_rx_pfc_pri0_pkt_num;
+	u64 mac_rx_pfc_pri1_pkt_num;
+	u64 mac_rx_pfc_pri2_pkt_num;
+	u64 mac_rx_pfc_pri3_pkt_num;
+	u64 mac_rx_pfc_pri4_pkt_num;
+	u64 mac_rx_pfc_pri5_pkt_num;
+	u64 mac_rx_pfc_pri6_pkt_num;
+	u64 mac_rx_pfc_pri7_pkt_num;
+	u64 mac_rx_control_pkt_num;
+	u64 mac_rx_sym_err_pkt_num;
+	u64 mac_rx_fcs_err_pkt_num;
+	u64 mac_rx_send_app_good_pkt_num;
+	u64 mac_rx_send_app_bad_pkt_num;
+	u64 mac_rx_unfilter_pkt_num;
+};
+
+struct mag_cmd_port_stats_info {
+	struct mgmt_msg_head head;
+
+	u8                   port_id;
+	u8                   rsvd0[3];
+};
+
+struct mag_cmd_get_port_stat {
+	struct mgmt_msg_head      head;
+
+	struct mag_cmd_port_stats counter;
+	u64                       rsvd1[15];
+};
+
 enum link_err_type {
 	LINK_ERR_MODULE_UNRECOGENIZED,
 	LINK_ERR_NUM,
@@ -209,6 +313,11 @@ int hinic3_get_port_info(struct hinic3_hwdev *hwdev,
 			 struct hinic3_nic_port_info *port_info);
 int hinic3_set_vport_enable(struct hinic3_hwdev *hwdev, u16 func_id,
 			    bool enable);
+int hinic3_get_phy_port_stats(struct hinic3_hwdev *hwdev,
+			      struct mag_cmd_port_stats *stats);
+int hinic3_get_vport_stats(struct hinic3_hwdev *hwdev, u16 func_id,
+			   struct l2nic_vport_stats *stats);
+
 int hinic3_add_vlan(struct hinic3_hwdev *hwdev, u16 vlan_id, u16 func_id);
 int hinic3_del_vlan(struct hinic3_hwdev *hwdev, u16 vlan_id, u16 func_id);
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
index 309ab5901379..7fadb88ff722 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
@@ -29,7 +29,7 @@
 #define HINIC3_LRO_PKT_HDR_LEN_IPV4     66
 #define HINIC3_LRO_PKT_HDR_LEN_IPV6     86
 #define HINIC3_LRO_PKT_HDR_LEN(cqe) \
-	(RQ_CQE_OFFOLAD_TYPE_GET((cqe)->offload_type, IP_TYPE) == \
+	(RQ_CQE_OFFOLAD_TYPE_GET(le32_to_cpu((cqe)->offload_type), IP_TYPE) == \
 	 HINIC3_RX_IPV6_PKT ? HINIC3_LRO_PKT_HDR_LEN_IPV6 : \
 	 HINIC3_LRO_PKT_HDR_LEN_IPV4)
 
@@ -46,7 +46,6 @@ static void hinic3_rxq_clean_stats(struct hinic3_rxq_stats *rxq_stats)
 
 	rxq_stats->alloc_skb_err = 0;
 	rxq_stats->alloc_rx_buf_err = 0;
-	rxq_stats->restore_drop_sge = 0;
 	u64_stats_update_end(&rxq_stats->syncp);
 }
 
@@ -155,8 +154,12 @@ static u32 hinic3_rx_fill_buffers(struct hinic3_rxq *rxq)
 
 		err = rx_alloc_mapped_page(rxq->page_pool, rx_info,
 					   rxq->buf_len);
-		if (unlikely(err))
+		if (unlikely(err)) {
+			u64_stats_update_begin(&rxq->rxq_stats.syncp);
+			rxq->rxq_stats.alloc_rx_buf_err++;
+			u64_stats_update_end(&rxq->rxq_stats.syncp);
 			break;
+		}
 
 		dma_addr = page_pool_get_dma_addr(rx_info->page) +
 			rx_info->page_offset;
@@ -170,6 +173,10 @@ static u32 hinic3_rx_fill_buffers(struct hinic3_rxq *rxq)
 				rxq->next_to_update << HINIC3_NORMAL_RQ_WQE);
 		rxq->delta -= i;
 		rxq->next_to_alloc = rxq->next_to_update;
+	} else if (free_wqebbs == rxq->q_depth - 1) {
+		u64_stats_update_begin(&rxq->rxq_stats.syncp);
+		rxq->rxq_stats.rx_buf_empty++;
+		u64_stats_update_end(&rxq->rxq_stats.syncp);
 	}
 
 	return i;
@@ -330,11 +337,23 @@ static void hinic3_rx_csum(struct hinic3_rxq *rxq, u32 offload_type,
 	struct net_device *netdev = rxq->netdev;
 	bool l2_tunnel;
 
+	if (unlikely(csum_err == HINIC3_RX_CSUM_IPSU_OTHER_ERR)) {
+		u64_stats_update_begin(&rxq->rxq_stats.syncp);
+		rxq->rxq_stats.other_errors++;
+		u64_stats_update_end(&rxq->rxq_stats.syncp);
+	}
+
 	if (!(netdev->features & NETIF_F_RXCSUM))
 		return;
 
 	if (unlikely(csum_err)) {
 		/* pkt type is recognized by HW, and csum is wrong */
+		if (!(csum_err & (HINIC3_RX_CSUM_HW_CHECK_NONE |
+				  HINIC3_RX_CSUM_IPSU_OTHER_ERR))) {
+			u64_stats_update_begin(&rxq->rxq_stats.syncp);
+			rxq->rxq_stats.csum_errors++;
+			u64_stats_update_end(&rxq->rxq_stats.syncp);
+		}
 		skb->ip_summed = CHECKSUM_NONE;
 		return;
 	}
@@ -387,8 +406,12 @@ static int recv_one_pkt(struct hinic3_rxq *rxq, struct hinic3_rq_cqe *rx_cqe,
 	u16 num_lro;
 
 	skb = hinic3_fetch_rx_buffer(rxq, pkt_len);
-	if (unlikely(!skb))
+	if (unlikely(!skb)) {
+		u64_stats_update_begin(&rxq->rxq_stats.syncp);
+		rxq->rxq_stats.alloc_skb_err++;
+		u64_stats_update_end(&rxq->rxq_stats.syncp);
 		return -ENOMEM;
+	}
 
 	/* place header in linear portion of buffer */
 	if (skb_is_nonlinear(skb))
@@ -550,11 +573,28 @@ int hinic3_configure_rxqs(struct net_device *netdev, u16 num_rq,
 	return 0;
 }
 
+void hinic3_rxq_get_stats(struct hinic3_rxq *rxq,
+			  struct hinic3_rxq_stats *stats)
+{
+	struct hinic3_rxq_stats *rxq_stats = &rxq->rxq_stats;
+	unsigned int start;
+
+	do {
+		start = u64_stats_fetch_begin(&rxq_stats->syncp);
+		stats->csum_errors = rxq_stats->csum_errors;
+		stats->other_errors = rxq_stats->other_errors;
+		stats->rx_buf_empty = rxq_stats->rx_buf_empty;
+		stats->alloc_skb_err = rxq_stats->alloc_skb_err;
+		stats->alloc_rx_buf_err = rxq_stats->alloc_rx_buf_err;
+	} while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
+}
+
 int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(rxq->netdev);
 	u32 sw_ci, status, pkt_len, vlan_len;
 	struct hinic3_rq_cqe *rx_cqe;
+	u64 rx_bytes = 0;
 	u32 num_wqe = 0;
 	int nr_pkts = 0;
 	u16 num_lro;
@@ -574,10 +614,14 @@ int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
 		if (recv_one_pkt(rxq, rx_cqe, pkt_len, vlan_len, status))
 			break;
 
+		rx_bytes += pkt_len;
 		nr_pkts++;
 		num_lro = RQ_CQE_STATUS_GET(status, NUM_LRO);
-		if (num_lro)
+		if (num_lro) {
+			rx_bytes += (num_lro - 1) *
+				    HINIC3_LRO_PKT_HDR_LEN(rx_cqe);
 			num_wqe += hinic3_get_sge_num(rxq, pkt_len);
+		}
 
 		rx_cqe->status = 0;
 
@@ -588,5 +632,10 @@ int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
 	if (rxq->delta >= HINIC3_RX_BUFFER_WRITE)
 		hinic3_rx_fill_buffers(rxq);
 
+	u64_stats_update_begin(&rxq->rxq_stats.syncp);
+	rxq->rxq_stats.packets += (u64)nr_pkts;
+	rxq->rxq_stats.bytes += rx_bytes;
+	u64_stats_update_end(&rxq->rxq_stats.syncp);
+
 	return nr_pkts;
 }
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
index 06d1b3299e7c..c11d080408a7 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
@@ -8,6 +8,17 @@
 #include <linux/dim.h>
 #include <linux/netdevice.h>
 
+/* rx cqe checksum err */
+#define HINIC3_RX_CSUM_IP_CSUM_ERR      BIT(0)
+#define HINIC3_RX_CSUM_TCP_CSUM_ERR     BIT(1)
+#define HINIC3_RX_CSUM_UDP_CSUM_ERR     BIT(2)
+#define HINIC3_RX_CSUM_IGMP_CSUM_ERR    BIT(3)
+#define HINIC3_RX_CSUM_ICMPV4_CSUM_ERR  BIT(4)
+#define HINIC3_RX_CSUM_ICMPV6_CSUM_ERR  BIT(5)
+#define HINIC3_RX_CSUM_SCTP_CRC_ERR     BIT(6)
+#define HINIC3_RX_CSUM_HW_CHECK_NONE    BIT(7)
+#define HINIC3_RX_CSUM_IPSU_OTHER_ERR   BIT(8)
+
 #define RQ_CQE_OFFOLAD_TYPE_PKT_TYPE_MASK           GENMASK(4, 0)
 #define RQ_CQE_OFFOLAD_TYPE_IP_TYPE_MASK            GENMASK(6, 5)
 #define RQ_CQE_OFFOLAD_TYPE_TUNNEL_PKT_FORMAT_MASK  GENMASK(11, 8)
@@ -39,7 +50,6 @@ struct hinic3_rxq_stats {
 	u64                   rx_buf_empty;
 	u64                   alloc_skb_err;
 	u64                   alloc_rx_buf_err;
-	u64                   restore_drop_sge;
 	struct u64_stats_sync syncp;
 };
 
@@ -123,6 +133,9 @@ void hinic3_free_rxqs_res(struct net_device *netdev, u16 num_rq,
 			  u32 rq_depth, struct hinic3_dyna_rxq_res *rxqs_res);
 int hinic3_configure_rxqs(struct net_device *netdev, u16 num_rq,
 			  u32 rq_depth, struct hinic3_dyna_rxq_res *rxqs_res);
+
+void hinic3_rxq_get_stats(struct hinic3_rxq *rxq,
+			  struct hinic3_rxq_stats *stats);
 int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget);
 
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
index 9306bf0020ca..3fbbfa5d96b6 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
@@ -97,8 +97,12 @@ static int hinic3_tx_map_skb(struct net_device *netdev, struct sk_buff *skb,
 
 	dma_info[0].dma = dma_map_single(&pdev->dev, skb->data,
 					 skb_headlen(skb), DMA_TO_DEVICE);
-	if (dma_mapping_error(&pdev->dev, dma_info[0].dma))
+	if (dma_mapping_error(&pdev->dev, dma_info[0].dma)) {
+		u64_stats_update_begin(&txq->txq_stats.syncp);
+		txq->txq_stats.map_frag_err++;
+		u64_stats_update_end(&txq->txq_stats.syncp);
 		return -EFAULT;
+	}
 
 	dma_info[0].len = skb_headlen(skb);
 
@@ -117,6 +121,9 @@ static int hinic3_tx_map_skb(struct net_device *netdev, struct sk_buff *skb,
 						     skb_frag_size(frag),
 						     DMA_TO_DEVICE);
 		if (dma_mapping_error(&pdev->dev, dma_info[idx].dma)) {
+			u64_stats_update_begin(&txq->txq_stats.syncp);
+			txq->txq_stats.map_frag_err++;
+			u64_stats_update_end(&txq->txq_stats.syncp);
 			err = -EFAULT;
 			goto err_unmap_page;
 		}
@@ -260,6 +267,9 @@ static int hinic3_tx_csum(struct hinic3_txq *txq, struct hinic3_sq_task *task,
 		if (l4_proto != IPPROTO_UDP ||
 		    ((struct udphdr *)skb_transport_header(skb))->dest !=
 		    VXLAN_OFFLOAD_PORT_LE) {
+			u64_stats_update_begin(&txq->txq_stats.syncp);
+			txq->txq_stats.unknown_tunnel_pkt++;
+			u64_stats_update_end(&txq->txq_stats.syncp);
 			/* Unsupported tunnel packet, disable csum offload */
 			skb_checksum_help(skb);
 			return 0;
@@ -433,6 +443,27 @@ static u32 hinic3_tx_offload(struct sk_buff *skb, struct hinic3_sq_task *task,
 	return offload;
 }
 
+static void hinic3_get_pkt_stats(struct hinic3_txq *txq, struct sk_buff *skb)
+{
+	u32 hdr_len, tx_bytes;
+	unsigned short pkts;
+
+	if (skb_is_gso(skb)) {
+		hdr_len = (skb_shinfo(skb)->gso_segs - 1) *
+			  skb_tcp_all_headers(skb);
+		tx_bytes = skb->len + hdr_len;
+		pkts = skb_shinfo(skb)->gso_segs;
+	} else {
+		tx_bytes = skb->len > ETH_ZLEN ? skb->len : ETH_ZLEN;
+		pkts = 1;
+	}
+
+	u64_stats_update_begin(&txq->txq_stats.syncp);
+	txq->txq_stats.bytes += tx_bytes;
+	txq->txq_stats.packets += pkts;
+	u64_stats_update_end(&txq->txq_stats.syncp);
+}
+
 static u16 hinic3_get_and_update_sq_owner(struct hinic3_io_queue *sq,
 					  u16 curr_pi, u16 wqebb_cnt)
 {
@@ -539,8 +570,12 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
 	int err;
 
 	if (unlikely(skb->len < MIN_SKB_LEN)) {
-		if (skb_pad(skb, MIN_SKB_LEN - skb->len))
+		if (skb_pad(skb, MIN_SKB_LEN - skb->len)) {
+			u64_stats_update_begin(&txq->txq_stats.syncp);
+			txq->txq_stats.skb_pad_err++;
+			u64_stats_update_end(&txq->txq_stats.syncp);
 			goto err_out;
+		}
 
 		skb->len = MIN_SKB_LEN;
 	}
@@ -595,6 +630,7 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
 				  txq->tx_stop_thrs,
 				  txq->tx_start_thrs);
 
+	hinic3_get_pkt_stats(txq, skb);
 	hinic3_prepare_sq_ctrl(&wqe_combo, queue_info, num_sge, owner);
 	hinic3_write_db(txq->sq, 0, DB_CFLAG_DP_SQ,
 			hinic3_get_sq_local_pi(txq->sq));
@@ -604,6 +640,10 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
 err_drop_pkt:
 	dev_kfree_skb_any(skb);
 err_out:
+	u64_stats_update_begin(&txq->txq_stats.syncp);
+	txq->txq_stats.dropped++;
+	u64_stats_update_end(&txq->txq_stats.syncp);
+
 	return NETDEV_TX_OK;
 }
 
@@ -611,12 +651,19 @@ netdev_tx_t hinic3_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
 	u16 q_id = skb_get_queue_mapping(skb);
+	struct hinic3_txq *txq;
 
 	if (unlikely(!netif_carrier_ok(netdev)))
 		goto err_drop_pkt;
 
-	if (unlikely(q_id >= nic_dev->q_params.num_qps))
+	if (unlikely(q_id >= nic_dev->q_params.num_qps)) {
+		txq = &nic_dev->txqs[0];
+		u64_stats_update_begin(&txq->txq_stats.syncp);
+		txq->txq_stats.dropped++;
+		u64_stats_update_end(&txq->txq_stats.syncp);
+
 		goto err_drop_pkt;
+	}
 
 	return hinic3_send_one_skb(skb, netdev, &nic_dev->txqs[q_id]);
 
@@ -754,6 +801,24 @@ int hinic3_configure_txqs(struct net_device *netdev, u16 num_sq,
 	return 0;
 }
 
+void hinic3_txq_get_stats(struct hinic3_txq *txq,
+			  struct hinic3_txq_stats *stats)
+{
+	struct hinic3_txq_stats *txq_stats = &txq->txq_stats;
+	unsigned int start;
+
+	do {
+		start = u64_stats_fetch_begin(&txq_stats->syncp);
+		stats->busy = txq_stats->busy;
+		stats->skb_pad_err = txq_stats->skb_pad_err;
+		stats->frag_len_overflow = txq_stats->frag_len_overflow;
+		stats->offload_cow_skb_err = txq_stats->offload_cow_skb_err;
+		stats->map_frag_err = txq_stats->map_frag_err;
+		stats->unknown_tunnel_pkt = txq_stats->unknown_tunnel_pkt;
+		stats->frag_size_err = txq_stats->frag_size_err;
+	} while (u64_stats_fetch_retry(&txq_stats->syncp, start));
+}
+
 bool hinic3_tx_poll(struct hinic3_txq *txq, int budget)
 {
 	struct net_device *netdev = txq->netdev;
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
index 00194f2a1bcc..0a21c423618f 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
@@ -157,6 +157,8 @@ int hinic3_configure_txqs(struct net_device *netdev, u16 num_sq,
 			  u32 sq_depth, struct hinic3_dyna_txq_res *txqs_res);
 
 netdev_tx_t hinic3_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
+void hinic3_txq_get_stats(struct hinic3_txq *txq,
+			  struct hinic3_txq_stats *stats);
 bool hinic3_tx_poll(struct hinic3_txq *txq, int budget);
 void hinic3_flush_txqs(struct net_device *netdev);
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v04 1/6] hinic3: Add ethtool queue ops
From: Fan Gong @ 2026-04-08  4:03 UTC (permalink / raw)
  To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
	Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775618797.git.zhuyikai1@h-partners.com>

  Implement following ethtool callback function:
.get_ringparam
.set_ringparam

  These callbacks allow users to utilize ethtool for detailed
queue depth configuration and monitoring.

Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 .../ethernet/huawei/hinic3/hinic3_ethtool.c   | 101 +++++++++++++++++
 .../net/ethernet/huawei/hinic3/hinic3_irq.c   |  10 +-
 .../net/ethernet/huawei/hinic3/hinic3_main.c  |  11 ++
 .../huawei/hinic3/hinic3_netdev_ops.c         | 103 +++++++++++++++++-
 .../ethernet/huawei/hinic3/hinic3_nic_dev.h   |  16 +++
 .../ethernet/huawei/hinic3/hinic3_nic_io.h    |   4 +
 6 files changed, 240 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index 90fc16288de9..e47c3f43e7b9 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -409,6 +409,105 @@ hinic3_get_link_ksettings(struct net_device *netdev,
 	return 0;
 }
 
+static void hinic3_get_ringparam(struct net_device *netdev,
+				 struct ethtool_ringparam *ring,
+				 struct kernel_ethtool_ringparam *kernel_ring,
+				 struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+
+	ring->rx_max_pending = HINIC3_MAX_RX_QUEUE_DEPTH;
+	ring->tx_max_pending = HINIC3_MAX_TX_QUEUE_DEPTH;
+	ring->rx_pending = nic_dev->rxqs[0].q_depth;
+	ring->tx_pending = nic_dev->txqs[0].q_depth;
+}
+
+static void hinic3_update_qp_depth(struct net_device *netdev,
+				   u32 sq_depth, u32 rq_depth)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	u16 i;
+
+	nic_dev->q_params.sq_depth = sq_depth;
+	nic_dev->q_params.rq_depth = rq_depth;
+	for (i = 0; i < nic_dev->max_qps; i++) {
+		nic_dev->txqs[i].q_depth = sq_depth;
+		nic_dev->txqs[i].q_mask = sq_depth - 1;
+		nic_dev->rxqs[i].q_depth = rq_depth;
+		nic_dev->rxqs[i].q_mask = rq_depth - 1;
+	}
+}
+
+static int hinic3_check_ringparam_valid(struct net_device *netdev,
+					const struct ethtool_ringparam *ring)
+{
+	if (ring->rx_jumbo_pending || ring->rx_mini_pending) {
+		netdev_err(netdev, "Unsupported rx_jumbo_pending/rx_mini_pending\n");
+		return -EINVAL;
+	}
+
+	if (ring->tx_pending > HINIC3_MAX_TX_QUEUE_DEPTH ||
+	    ring->tx_pending < HINIC3_MIN_QUEUE_DEPTH ||
+	    ring->rx_pending > HINIC3_MAX_RX_QUEUE_DEPTH ||
+	    ring->rx_pending < HINIC3_MIN_QUEUE_DEPTH) {
+		netdev_err(netdev,
+			   "Queue depth out of range tx[%d-%d] rx[%d-%d]\n",
+			   HINIC3_MIN_QUEUE_DEPTH, HINIC3_MAX_TX_QUEUE_DEPTH,
+			   HINIC3_MIN_QUEUE_DEPTH, HINIC3_MAX_RX_QUEUE_DEPTH);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int hinic3_set_ringparam(struct net_device *netdev,
+				struct ethtool_ringparam *ring,
+				struct kernel_ethtool_ringparam *kernel_ring,
+				struct netlink_ext_ack *extack)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_dyna_txrxq_params q_params = {};
+	u32 new_sq_depth, new_rq_depth;
+	int err;
+
+	err = hinic3_check_ringparam_valid(netdev, ring);
+	if (err)
+		return err;
+
+	new_sq_depth = 1U << ilog2(ring->tx_pending);
+	new_rq_depth = 1U << ilog2(ring->rx_pending);
+	if (new_sq_depth == nic_dev->q_params.sq_depth &&
+	    new_rq_depth == nic_dev->q_params.rq_depth)
+		return 0;
+
+	if (new_sq_depth != ring->tx_pending)
+		netdev_info(netdev, "Requested Tx depth trimmed to %d\n",
+			    new_sq_depth);
+	if (new_rq_depth != ring->rx_pending)
+		netdev_info(netdev, "Requested Rx depth trimmed to %d\n",
+			    new_rq_depth);
+
+	netdev_info(netdev, "Change Tx/Rx ring depth from %u/%u to %u/%u\n",
+		    nic_dev->q_params.sq_depth, nic_dev->q_params.rq_depth,
+		    new_sq_depth, new_rq_depth);
+
+	if (!netif_running(netdev)) {
+		hinic3_update_qp_depth(netdev, new_sq_depth, new_rq_depth);
+	} else {
+		q_params = nic_dev->q_params;
+		q_params.sq_depth = new_sq_depth;
+		q_params.rq_depth = new_rq_depth;
+
+		err = hinic3_change_channel_settings(netdev, &q_params);
+		if (err) {
+			netdev_err(netdev, "Failed to change channel settings\n");
+			return err;
+		}
+	}
+
+	return 0;
+}
+
 static const struct ethtool_ops hinic3_ethtool_ops = {
 	.supported_coalesce_params      = ETHTOOL_COALESCE_USECS |
 					  ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
@@ -417,6 +516,8 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
 	.get_msglevel                   = hinic3_get_msglevel,
 	.set_msglevel                   = hinic3_set_msglevel,
 	.get_link                       = ethtool_op_get_link,
+	.get_ringparam                  = hinic3_get_ringparam,
+	.set_ringparam                  = hinic3_set_ringparam,
 };
 
 void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
index e7d6c2033b45..d3b3927b5408 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
@@ -135,10 +135,16 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
 	struct hinic3_interrupt_info info = {};
+	unsigned long flags;
 	int err;
 
-	if (q_id >= nic_dev->q_params.num_qps)
+	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+
+	if (!HINIC3_CHANNEL_RES_VALID(nic_dev) ||
+	    q_id >= nic_dev->q_params.num_qps) {
+		spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
 		return 0;
+	}
 
 	info.interrupt_coalesc_set = 1;
 	info.coalesc_timer_cfg = coalesc_timer_cfg;
@@ -147,6 +153,8 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
 	info.resend_timer_cfg =
 		nic_dev->intr_coalesce[q_id].resend_timer_cfg;
 
+	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
 	err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
 	if (err) {
 		netdev_err(netdev,
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
index 0a888fe4c975..3b470978714a 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
@@ -179,6 +179,8 @@ static int hinic3_sw_init(struct net_device *netdev)
 	int err;
 
 	mutex_init(&nic_dev->port_state_mutex);
+	mutex_init(&nic_dev->channel_cfg_lock);
+	spin_lock_init(&nic_dev->channel_res_lock);
 
 	nic_dev->q_params.sq_depth = HINIC3_SQ_DEPTH;
 	nic_dev->q_params.rq_depth = HINIC3_RQ_DEPTH;
@@ -314,6 +316,15 @@ static void hinic3_link_status_change(struct net_device *netdev,
 				      bool link_status_up)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	unsigned long flags;
+	bool valid;
+
+	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+	valid = HINIC3_CHANNEL_RES_VALID(nic_dev);
+	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+	if (!valid)
+		return;
 
 	if (link_status_up) {
 		if (netif_carrier_ok(netdev))
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
index da73811641a9..cec501a9dd43 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
@@ -428,6 +428,84 @@ static void hinic3_vport_down(struct net_device *netdev)
 	}
 }
 
+int
+hinic3_change_channel_settings(struct net_device *netdev,
+			       struct hinic3_dyna_txrxq_params *trxq_params)
+{
+	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+	struct hinic3_dyna_txrxq_params old_qp_params = {};
+	struct hinic3_dyna_qp_params new_qp_params = {};
+	struct hinic3_dyna_qp_params cur_qp_params = {};
+	bool need_teardown = false;
+	unsigned long flags;
+	int err;
+
+	mutex_lock(&nic_dev->channel_cfg_lock);
+
+	hinic3_config_num_qps(netdev, trxq_params);
+
+	err = hinic3_alloc_channel_resources(netdev, &new_qp_params,
+					     trxq_params);
+	if (err) {
+		netdev_err(netdev, "Failed to alloc channel resources\n");
+		mutex_unlock(&nic_dev->channel_cfg_lock);
+		return err;
+	}
+
+	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+	if (!test_and_set_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags))
+		need_teardown = true;
+	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+	if (need_teardown) {
+		hinic3_vport_down(netdev);
+		hinic3_close_channel(netdev);
+		hinic3_uninit_qps(nic_dev, &cur_qp_params);
+		hinic3_free_channel_resources(netdev, &cur_qp_params,
+					      &nic_dev->q_params);
+	}
+
+	if (nic_dev->num_qp_irq > trxq_params->num_qps)
+		hinic3_qp_irq_change(netdev, trxq_params->num_qps);
+
+	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+	old_qp_params = nic_dev->q_params;
+	nic_dev->q_params = *trxq_params;
+	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+	hinic3_init_qps(nic_dev, &new_qp_params);
+
+	err = hinic3_open_channel(netdev);
+	if (err)
+		goto err_uninit_qps;
+
+	err = hinic3_vport_up(netdev);
+	if (err)
+		goto err_close_channel;
+
+	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+	clear_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags);
+	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+	mutex_unlock(&nic_dev->channel_cfg_lock);
+
+	return 0;
+
+err_close_channel:
+	hinic3_close_channel(netdev);
+err_uninit_qps:
+	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+	nic_dev->q_params = old_qp_params;
+	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+	hinic3_uninit_qps(nic_dev, &new_qp_params);
+	hinic3_free_channel_resources(netdev, &new_qp_params, trxq_params);
+
+	mutex_unlock(&nic_dev->channel_cfg_lock);
+
+	return err;
+}
+
 static int hinic3_open(struct net_device *netdev)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
@@ -487,16 +565,33 @@ static int hinic3_close(struct net_device *netdev)
 {
 	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
 	struct hinic3_dyna_qp_params qp_params;
+	bool need_teardown = false;
+	unsigned long flags;
 
 	if (!test_and_clear_bit(HINIC3_INTF_UP, &nic_dev->flags)) {
 		netdev_dbg(netdev, "Netdev already close, do nothing\n");
 		return 0;
 	}
 
-	hinic3_vport_down(netdev);
-	hinic3_close_channel(netdev);
-	hinic3_uninit_qps(nic_dev, &qp_params);
-	hinic3_free_channel_resources(netdev, &qp_params, &nic_dev->q_params);
+	mutex_lock(&nic_dev->channel_cfg_lock);
+
+	spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+	if (!test_and_set_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags))
+		need_teardown = true;
+	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+	if (need_teardown) {
+		hinic3_vport_down(netdev);
+		hinic3_close_channel(netdev);
+		hinic3_uninit_qps(nic_dev, &qp_params);
+		hinic3_free_channel_resources(netdev, &qp_params,
+					      &nic_dev->q_params);
+	}
+
+	hinic3_free_nicio_res(nic_dev);
+	hinic3_destroy_num_qps(netdev);
+
+	mutex_unlock(&nic_dev->channel_cfg_lock);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
index 9502293ff710..55b280888ad8 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
@@ -10,6 +10,9 @@
 #include "hinic3_hw_cfg.h"
 #include "hinic3_hwdev.h"
 #include "hinic3_mgmt_interface.h"
+#include "hinic3_nic_io.h"
+#include "hinic3_tx.h"
+#include "hinic3_rx.h"
 
 #define HINIC3_VLAN_BITMAP_BYTE_SIZE(nic_dev)  (sizeof(*(nic_dev)->vlan_bitmap))
 #define HINIC3_VLAN_BITMAP_SIZE(nic_dev)  \
@@ -20,8 +23,13 @@ enum hinic3_flags {
 	HINIC3_MAC_FILTER_CHANGED,
 	HINIC3_RSS_ENABLE,
 	HINIC3_UPDATE_MAC_FILTER,
+	HINIC3_CHANGE_RES_INVALID,
 };
 
+#define HINIC3_CHANNEL_RES_VALID(nic_dev) \
+	(test_bit(HINIC3_INTF_UP, &(nic_dev)->flags) && \
+	 !test_bit(HINIC3_CHANGE_RES_INVALID, &(nic_dev)->flags))
+
 enum hinic3_event_work_flags {
 	HINIC3_EVENT_WORK_TX_TIMEOUT,
 };
@@ -129,6 +137,10 @@ struct hinic3_nic_dev {
 	struct work_struct              rx_mode_work;
 	/* lock for enable/disable port */
 	struct mutex                    port_state_mutex;
+	/* lock for channel configuration */
+	struct mutex                    channel_cfg_lock;
+	/* lock for channel resources */
+	spinlock_t                      channel_res_lock;
 
 	struct list_head                uc_filter_list;
 	struct list_head                mc_filter_list;
@@ -143,6 +155,10 @@ struct hinic3_nic_dev {
 
 void hinic3_set_netdev_ops(struct net_device *netdev);
 int hinic3_set_hw_features(struct net_device *netdev);
+int
+hinic3_change_channel_settings(struct net_device *netdev,
+			       struct hinic3_dyna_txrxq_params *trxq_params);
+
 int hinic3_qps_irq_init(struct net_device *netdev);
 void hinic3_qps_irq_uninit(struct net_device *netdev);
 
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
index 12eefabcf1db..3791b9bc865b 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
@@ -14,6 +14,10 @@ struct hinic3_nic_dev;
 #define HINIC3_RQ_WQEBB_SHIFT      3
 #define HINIC3_SQ_WQEBB_SIZE       BIT(HINIC3_SQ_WQEBB_SHIFT)
 
+#define HINIC3_MAX_TX_QUEUE_DEPTH  65536
+#define HINIC3_MAX_RX_QUEUE_DEPTH  16384
+#define HINIC3_MIN_QUEUE_DEPTH     128
+
 /* ******************** RQ_CTRL ******************** */
 enum hinic3_rq_wqe_type {
 	HINIC3_NORMAL_RQ_WQE = 1,
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v04 0/6] net: hinic3: PF initialization
From: Fan Gong @ 2026-04-08  4:03 UTC (permalink / raw)
  To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
	Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier

This is [3/3] part of hinic3 Ethernet driver second submission.
With this patch hinic3 becomes a complete Ethernet driver with
pf and vf.

Add 20 ethtool ops for information of queue, rss, coalesce and eth data.
Add MTU size validation
Config netdev watchdog timeout.
Remove unneed coalesce parameters.

Changes:

PATCH 03 V01: https://lore.kernel.org/netdev/cover.1773387649.git.zhuyikai1@h-partners.com/
* Add rmon/pause/phy/mac/ctrl stats (Ioana Ciornei)

PATCH 03 V02: https://lore.kernel.org/netdev/cover.1774684571.git.zhuyikai1@h-partners.com/
* Modify "return -EINVAL" intension problem (AI review)
* Use le16_to_cpu for rss_indir pair.out->buf (AI review)
* Use u32 instead of int in coalesce_limits to avoid overflow (AI review)
* Remove redundant u64_stats_update_begin/end when reading stats without
  concurrent reader (AI review)
* Modify nic_dev->stats.syncp logic (AI review)
* Complete rxq/txq stats stats fileds in hinic3_rx/txq_get_stats (AI review)
* Remove statistics values in rtnl_link_stats64 from ethtool statistics
  values (AI review)
* Add channel_cfg_lock & channel_res_lock to protect resources access (AI review)
* Remove OutOfRangeLengthField, FrameToolong and InRangeLengthErrors (Ioana Ciornei)
* Remove redundant mtu commit (Maxime Chevialler)

PATCH 03 V03: https://lore.kernel.org/netdev/cover.1774940117.git.zhuyikai1@h-partners.com/
* Change unnedd to unneeded (AI review)
* Remove packets,bytes,errors and dropped in hinic3_rx/tx_queue_stats (AI review)
* Remove duplicated entried in hinic3_port_stats[] (AI review)
* change stats_info.head.status to ps->head.status (AI review)

PATCH 03 V04:
* Remove restore_drop_sge in hinic3_rx_queue_stats (AI review)
* Remove hinic3_nic_stats (AI review)
* Use old_q_param to store old config and use it in error handling (Mohsin Bashir)
* Add netdev_info to inform the user that depth is trimmed (Mohsin Bashir)
* Remove const in hinic3_get_qp_stats_strings parameters (Mohsin Bashir)
* Change EOPNOTSUPP to ERANGE in is_coalesce_exceed_limit (Mohsin Bashir)
* Update nic_dev->rss_type after hinic3_set_rss_type (Mohsin Bashir)
* Modify MGMT_STATUS_CMD_UNSUPPORTED to EOPNOTSUPP for complying with the
  error code specifications  (Mohsin Bashir)

Fan Gong (6):
  hinic3: Add ethtool queue ops
  hinic3: Add ethtool statistic ops
  hinic3: Add ethtool coalesce ops
  hinic3: Add ethtool rss ops
  hinic3: Configure netdev->watchdog_timeo to set nic tx timeout
  hinic3: Remove unneeded coalesce parameters

 .../ethernet/huawei/hinic3/hinic3_ethtool.c   | 827 +++++++++++++++++-
 .../ethernet/huawei/hinic3/hinic3_hw_intf.h   |  13 +-
 .../net/ethernet/huawei/hinic3/hinic3_irq.c   |  16 +-
 .../net/ethernet/huawei/hinic3/hinic3_main.c  |  15 +
 .../huawei/hinic3/hinic3_mgmt_interface.h     |  39 +
 .../huawei/hinic3/hinic3_netdev_ops.c         | 103 ++-
 .../ethernet/huawei/hinic3/hinic3_nic_cfg.c   |  64 ++
 .../ethernet/huawei/hinic3/hinic3_nic_cfg.h   | 109 +++
 .../ethernet/huawei/hinic3/hinic3_nic_dev.h   |  16 +
 .../ethernet/huawei/hinic3/hinic3_nic_io.h    |   4 +
 .../net/ethernet/huawei/hinic3/hinic3_rss.c   | 487 ++++++++++-
 .../net/ethernet/huawei/hinic3/hinic3_rss.h   |  19 +
 .../net/ethernet/huawei/hinic3/hinic3_rx.c    |  59 +-
 .../net/ethernet/huawei/hinic3/hinic3_rx.h    |  18 +-
 .../net/ethernet/huawei/hinic3/hinic3_tx.c    |  71 +-
 .../net/ethernet/huawei/hinic3/hinic3_tx.h    |   2 +
 16 files changed, 1835 insertions(+), 27 deletions(-)


base-commit: 8e7adcf81564a3fe886a6270eea7558f063e5538
-- 
2.43.0


^ permalink raw reply

* [PATCH] doc: watchdog: fix typos etc.
From: Randy Dunlap @ 2026-04-08  3:12 UTC (permalink / raw)
  To: linux-kernel
  Cc: Randy Dunlap, Andrew Morton, Jonathan Corbet, Shuah Khan,
	linux-doc

Correct grammar, plurality, and typos in lockup-watchdogs.rst.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
---
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: linux-doc@vger.kernel.org

 Documentation/admin-guide/lockup-watchdogs.rst |    6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

--- linux-next-20260406.orig/Documentation/admin-guide/lockup-watchdogs.rst
+++ linux-next-20260406/Documentation/admin-guide/lockup-watchdogs.rst
@@ -18,7 +18,7 @@ provided for this.
 A 'hardlockup' is defined as a bug that causes the CPU to loop in
 kernel mode for several seconds (see "Implementation" below for
 details), without letting other interrupts have a chance to run.
-Similarly to the softlockup case, the current stack trace is displayed
+Similar to the softlockup case, the current stack trace is displayed
 upon detection and the system will stay locked up unless the default
 behavior is changed, which can be done through a sysctl,
 'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC",
@@ -41,7 +41,7 @@ is a trade-off between fast response to
 Implementation
 ==============
 
-The soft and hard lockup detectors are built around a hrtimer.
+The soft and hard lockup detectors are built around an hrtimer.
 In addition, the softlockup detector regularly schedules a job, and
 the hard lockup detector might use Perf/NMI events on architectures
 that support it.
@@ -49,7 +49,7 @@ that support it.
 Frequency and Heartbeats
 ------------------------
 
-The core of the detectors in a hrtimer. It servers multiple purpose:
+The core of the detectors is an hrtimer. It servers multiple purposes:
 
 - schedules watchdog job for the softlockup detector
 - bumps the interrupt counter for hardlockup detectors (heartbeat)

^ permalink raw reply

* Re: (sashiko review) [PATCH v6 1/1] mm/damon: add node_eligible_mem_bp and node_ineligible_mem_bp goal metrics
From: Ravi Jonnalagadda @ 2026-04-08  2:33 UTC (permalink / raw)
  To: SeongJae Park
  Cc: damon, linux-mm, linux-kernel, linux-doc, akpm, corbet, bijan311,
	ajayjoshi, honggyu.kim, yunjeong.mun
In-Reply-To: <20260407160546.52220-1-sj@kernel.org>

On Tue, Apr 7, 2026 at 9:05 AM SeongJae Park <sj@kernel.org> wrote:
>
> Adding another thought at the end of the mail without cutting the previous
> unrelated questions, so that Ravi can answer all my questions at once.
>
> On Mon,  6 Apr 2026 17:13:08 -0700 SeongJae Park <sj@kernel.org> wrote:
>
> > On Mon, 6 Apr 2026 12:47:56 -0700 Ravi Jonnalagadda <ravis.opensrc@gmail.com> wrote:
> >
> > > On Sun, Apr 5, 2026 at 3:45 PM SeongJae Park <sj@kernel.org> wrote:
> > > >
> > > >
> > > > Ravi, thank you for reposting this patch after the rebase.  This time sashiko
> > > > was able to review this, and found good points including things that deserve
> > > > another revision of this patch.
> > > >
> > > > Forwarding full sashiko review in a reply format with my inline comments below,
> > > > for sharing details of my view and doing followup discussions via mails.  Ravi,
> > > > could you please reply?
> > > >
> > >
> > > Thanks SJ, providing your comments on top of sashiko's review is very helpful.
> >
> > I'm glad to hear that it is working for you :)
> >
> > [...]
> > > > > +static unsigned long damos_calc_eligible_bytes(struct damon_ctx *c,
> > > > > > +           struct damos *s, int nid, unsigned long *total)
> > > > > > +{
> > [...]
> > > > > > +                           struct folio *folio;
> > > > > > +                           unsigned long folio_sz, counted;
> > > > > > +
> > > > > > +                           folio = damon_get_folio(PHYS_PFN(addr));
> > > > >
> > > > > What happens if this metric is assigned to a DAMON context configured for
> > > > > virtual address space monitoring? If the context uses DAMON_OPS_VADDR,
> > > > > passing a user-space virtual address to PHYS_PFN() might cause invalid
> > > > > memory accesses or out-of-bounds page struct reads. Should this code
> > > > > explicitly verify the operations type first?
> > > >
> > > > Good finding.  We intend to support only paddr ops.  But there is no guard for
> > > > using this on vaddr ops configuration.  Ravi, could we add underlying ops
> > > > check?  I think damon_commit_ctx() is a good place to add that.  The check
> > > > could be something like below?
> > > >
> > >
> > > I plan to add the ops type check directly in the metric functions
> > > (damos_get_node_eligible_mem_bp and its counterpart) rather than in
> > > damon_commit_ctx(). The functions will return 0 early
> > > if c->ops.id != DAMON_OPS_PADDR.
> > >
> > > That said, if you prefer the damon_commit_ctx() validation approach to
> > > reject the configuration outright, I can implement it that way instead.
> > > Please let me know your preference.
> >
> > I'd prefer damon_commit_ctx() validation approach since it would give users
> > more clear message of the failure.
> >
> > >
> > > > '''
> > > > --- a/mm/damon/core.c
> > > > +++ b/mm/damon/core.c
> > > > @@ -1515,10 +1515,23 @@ static int damon_commit_sample_control(
> > > >  int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src)
> > > >  {
> > > >         int err;
> > > > +       struct damos *scheme;
> > > > +       struct damos_quota_goal *goal;
> > > >
> > > >         dst->maybe_corrupted = true;
> > > >         if (!is_power_of_2(src->min_region_sz))
> > > >                 return -EINVAL;
> > > > +       if (src->ops.id != DAMON_OPS_PADDR) {
> > > > +               damon_for_each_scheme(scheme, src) {
> > > > +                       damos_for_each_quota_goal(goal, &scheme->quota) {
> > > > +                               switch (goal->metric) {
> > > > +                               case DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP:
> > > > +                               case DAMOS_QUOTA_NODE_INELIGIBLE_MEMPBP:
> > > > +                                       return -EINVAL;
> > > > +                               }
> > > > +                       }
> > > > +               }
> > > > +       }
> > > >
> > > >         err = damon_commit_schemes(dst, src);
> > > >         if (err)
> > > > '''
> > [...]
> > > > > > +   /* Compute ineligible ratio directly: 10000 - eligible_bp */
> > > > > > +   return 10000 - mult_frac(node_eligible, 10000, total_eligible);
> > > > > > +}
> > > > >
> > > > > Does this return value match the documented metric? The formula computes the
> > > > > percentage of the system's eligible memory located on other NUMA nodes,
> > > > > rather than the amount of actual ineligible (filtered out) memory residing
> > > > > on the target node. Could this semantic mismatch cause confusion when
> > > > > configuring quota policies?
> > > >
> > > > Nice catch.  The name and the documentation are confusing.  We actually
> > > > confused a few times in previous revisions, and I'm again confused now.  IIUC,
> > > > the current implementation is the intended and right one for the given use
> > > > case, though.  If my understanding is correct, how about renaming
> > > > DAMOS_QUOTA_NODE_INELIGIBLE_MEM_BP to
> > > > DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP_COMPLEMENT, and updating the documentation
> > > > together?  Ravi, what do you think?
> > > >
> > >
> > > Agreed, the current name is confusing. How about
> > > DAMOS_QUOTA_NODE_ELIGIBLE_MEM_BP_OFFNODE?
> > >
> > > The rationale is that this metric measures "eligible memory that is off
> > > this node" (i.e., on other nodes).
> > >
> > >  I think "offnode" conveys the physical meaning more directly than "complement".
> > > That said, I'm happy to go with "complement" if you prefer.
> > > both are clearer than "ineligible".
> >
> > Thank you for the nice suggestion.  I like "offnode" term.  But I think having
> > "node" twice on the name is not really efficient for people who print code on
> > papers.  What about DAMOS_QUOTA_OFFNODE_ELIGIBLE_MEM_BP?
> >
> > But...  Maybe more importantly...  Now I realize this means that
> > offnode_eligible_mem_bp with target nid 0 is just same to node_eligible_mem_bp
> > with target nid 1, on your test setup.  Maybe we don't really need
> > offnode_eligible_mem_bp?  That is, your test setup could be like below.
> >
> > '''
> > For maintaining hot memory on DRAM (node 0) and CXL (node 1) in a 7:3
> > ratio:
> >
> >     PUSH scheme: migrate_hot from node 0 -> node 1
> >       goal: node_eligible_mem_bp, nid=1, target=3000
> >       "Move hot pages from DRAM to CXL if less thatn 30% of hot data is
> >        in CXL"
> >
> >     PULL scheme: migrate_hot from node 1 -> node 0
> >       goal: node_eligible_mem_bp, nid=0, target=7000
> >       "Move hot pages from CXL to DRAM if less than 70% of hot data is
> >        in DRAM"
> > '''
> >
> > And the schemes are more easy to read and understand for me.  This seems even
> > straightforward to scale for >2 nodes.  For example, if we want hot memory
> > distribution of 5:3:2 to nodes 0:1:2,
> >
> >       Two schemes for migrating hot pages out of node 0
> >       - migrate_hot from node 0 -> node 1
> >         - goal: node_eligible_mem_bp, nid=1, target=3000
> >       - migrate_hot from node 0 -> node 2
> >         - goal: node_eligible_mem_bp, nid=2, target=2000
> >
> >       Two schemes for migrating hot pages out of node 1
> >       - migrate_hot from node 1 -> node 0
> >         - goal: node_eligible_mem_bp, nid=0, target=5000
> >       - migrate_hot from node 1 -> node 2
> >         - goal: node_eligible_mem_bp, nid=2, target=2000
> >
> >       Two schemes for migrating hot pages out of node 2
> >       - migrate_hot from node 2 -> node 0
> >         - goal: node_eligible_mem_bp, nid=0, target=5000
> >       - migrate_hot from node 2 -> node 1
> >         - goal: node_eligible_mem_bp, nid=1, target=3000
> >
> > Do you think this makes sense?  If it makes sense and works for your use case,
> > what about dropping the offnode goal type?
>
> Now I recall I suggested the offnode metric because I suggested to run a
> kdamond per node.  That is, having one kdamond that monitors only node 0 and
> migrate hot memory to node 1, and another kdamond that monitors only node 1 and
> migrate hot memory to node 0.  And I suggested to do so because I knew it is
> suboptimal to run DAMOS schemes with node filter.
>
> We made a change [1] for making that more optimum, though.  The change is now
> in mm-stable, so hopefully it will be available from 7.1-rc1.  So I believe the
> single quota goal metric should work now.  Ravi, could you share what you
> think?
>

Yes SJ. I think we can make it work with single goal now that the
below commit is part of mainline. will give it a try and post an
update.

> [1] commit e1ace69c33ec ("mm/damon/core: set quota-score histogram with core filters")
>
>
> Thanks,
> SJ
>
> [...]

^ permalink raw reply

* Re: [PATCH v2 00/16] fs,x86/resctrl: Add kernel-mode (e.g., PLZA) support to the resctrl subsystem
From: Babu Moger @ 2026-04-08  1:01 UTC (permalink / raw)
  To: Reinette Chatre, corbet@lwn.net, tony.luck@intel.com,
	Dave.Martin@arm.com, james.morse@arm.com, tglx@kernel.org,
	mingo@redhat.com, bp@alien8.de, dave.hansen@linux.intel.com
  Cc: skhan@linuxfoundation.org, x86@kernel.org, hpa@zytor.com,
	peterz@infradead.org, juri.lelli@redhat.com,
	vincent.guittot@linaro.org, dietmar.eggemann@arm.com,
	rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de,
	vschneid@redhat.com, kas@kernel.org, rick.p.edgecombe@intel.com,
	akpm@linux-foundation.org, pmladek@suse.com,
	rdunlap@infradead.org, dapeng1.mi@linux.intel.com,
	kees@kernel.org, elver@google.com, paulmck@kernel.org,
	lirongqing@baidu.com, safinaskar@gmail.com, fvdl@google.com,
	seanjc@google.com, pawan.kumar.gupta@linux.intel.com,
	xin@zytor.com, tiala@microsoft.com, Neeraj.Upadhyay@amd.com,
	chang.seok.bae@intel.com, Lendacky, Thomas,
	elena.reshetova@intel.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-coco@lists.linux.dev,
	kvm@vger.kernel.org, eranian@google.com, peternewman@google.com
In-Reply-To: <3305c18e-9e50-4df0-b9f1-c61028628967@intel.com>

Hi Reinette,

On 4/7/26 12:48, Reinette Chatre wrote:
> Hi Babu,
> 
> On 4/6/26 3:45 PM, Babu Moger wrote:
>> Hi Reinette,
>>
>> Sorry for the late response. I was trying to get confirmation about the use case.
> 
> No problem. I appreciate that you did this so that we can make sure resctrl supports
> needed use cases.
> 
>>
>> On 3/31/26 17:24, Reinette Chatre wrote:
>>> On 3/30/26 11:46 AM, Babu Moger wrote:
>>>> On 3/27/26 17:11, Reinette Chatre wrote:
>>>>> On 3/26/26 10:12 AM, Babu Moger wrote:
>>>>>> On 3/24/26 17:51, Reinette Chatre wrote:
>>>>>>> On 3/12/26 1:36 PM, Babu Moger wrote:
> 
>>> can have domains that span different CPUs. There thus seem to be a built in assumption of what a "domain"
>>> means for PQR_PLZA_ASSOC so it sounds to me as though, instead of saying that "PQR_PLZA_ASSOC needs
>>> to be the same in QoS domain" it may be more accurate to, for example, say that "PQR_PLZA_ASSOC has L3 scope"?
>>
>> Yes.
> 
> Above is about L3 scope ...

Yes. The scope for PQR_PLZA_ASSOC is L3.

Is that what you are asking here?

>   
>>>
>>> This seems to be what this implementation does since it hardcodes PQR_PLZA_ASSOC scope to the L3
>>> resource but that creates dependency to the L3 resource that would make PLZA unusable if, for example,
>>> the user boots with "rdt=!l3cat" while wanting to use PLZA to manage MBA allocations when in kernel?
>>
>> Yes. that is correct. It should not be attached to one resource. We need to change it to global scope.
> 
> Can I interpret "global scope" as "all online CPUs"? Doing so will simplify

Yes. That is correct.


> supporting this feature. It does not sound practical for a user wanting to assign
> different resource groups to kernel work done in different domains ... the guidance should
> instead be to just set the allocations of one resource group to what is needed in the different
> domains? There may be more flexibility when supporting per-domain RMIDs though but so far
> it sounds as though the focus is global. We can consider what needs to be done to support
> some type of "per-domain" assignment as exercise whether current interface could support it
> in the future.

Yes. Makes sense.

> 
> ...
> 
>>>> There are multiple ways this feature can be applied. For simplicity, the discussion below focuses only on CLOSID.
>>>>
>>>>
>>>>        1. Global PLZA enablement
>>>>
>>>> PLZA can be configured as a global feature by setting |PQR_PLZA_ASSOC.closid = CLOSID| and |PQR_PLZA_ASSOC.plza_en = 1| on all threads in the system. A dedicated CLOSID is reserved for this purpose,
>>>
>>> Also discussed during v1 is that there is no need to dedicate a CLOSID for this purpose.
>>> There could be an "unthrottled" CLOSID to which all high priority user space tasks as
>>> well as all kernel work of all tasks are assigned.
>>> If user space chooses to dedicate a CLOSID for kernel work then that should supported and
>>> interface can allow that, but there is no need for resctrl to enforce this.
> 
> (above is comment about dedicated group - please see below)
> 
>   
>> Yes. I agree. The changes in context switch code is a concern.
>>
>> You covered some of the cases I was thinking(xx_set_individual).
>>
>> How about this idea?
>>
>> I suggest splitting the PLZA into two distinct aspects:
>>
>> 1. How PLZA is applied within a resource group
>>
>> 2. How PLZA is monitored
> 
> I think I see where you are going here. While the "How PLZA is monitored" naming
> refers to "monitoring" I *think* what you are separating here is (a) how PLZA is configured
> (CLOSID and RMID settings) and (b) how that PLZA configuration is assigned to tasks/CPUs,
> not just within a resource group but across the system. Please see below.
> 
> 
>> Introduce a new file, "info/kmode_type", to describe how kmode applies in the system.
> 
> ack. "in the system" as you have above, not "within a resource group" as mentioned
> before that.
> 
>>
>> # cat info/kmode_type
>> [global] <- Kernel mode applies to the entire system (all CPUs/tasks)
>>    cpus   <- Kernel mode applies only to the CPUs in the group
>>    tasks  <- Kernel mode applies only to the tasks in the group
>>
>> The "global" option is the default right now and it is current common use-case.
>>
>> The "info/kmode_type -> cpus" option introduces new files
>> "kmode_cpus" and "kmode_cpus_list" for users to apply kmode to
>> specific set of CPUs. This lets users change the CPU set for PLZA.
> Where were you thinking about placing these files in the hierarchy?

It needs to be inside the resctrl group (in struct rdtgroup).


> 
>> The PLZA MSR is updated when user changes the association to the
>> file. No context switch code changes are needed. This will be
>> dedicated group. The current resctrl group files, "cpus, cpus_list
> 
> Why does this have to be a dedicated group? One of the conclusions from v1
> discussion was that the "PLZA group" need *not* be a dedicated group. I repeated that
> in my earlier response that I left quoted above. You did not respond to these
> conclusions and statements in this regard while you keep coming back to this
> needing to be a dedicated group without providing a motivation to do so.
> Could you please elaborate why a dedicated group is required?

If the same group applies identical limits to both user and kernel 
space, it essentially behaves like a current resctrl group. In that 
sense, it’s not really a PLZA group. PLZA’s key value is the ability to 
separate allocations between user space and kernel space. A single CPU 
can belong to two groups: one group manages the user-space allocation 
for that CPU, while another manages the kernel-mode allocation.
This approach also simplifies file handling, which is another reason I 
prefer it.

That said, I’m open to not having a dedicated group if we can still 
support all the features that PLZA provides without it.


> 
> 
>> and tasks" will not be accessible in this mode. This option give
> 
> These files can continue to be accessible.

ok.

> 
>> some flexibility for the user without the context switch overhead.
> 
> Dedicating a resource group to PLZA removes flexibility though, no?

Yes. But makes it easy to handle the files as I mentioned above.

> 
>>
>> The "info/kmode_type -> tasks" option introduces a new file,
>> "kmode_tasks", for users to apply kmode to specific set of tasks.
>> This requires context switch changes. This will be dedicated group.
>> The current resctrl group files, "cpus, cpus_list and tasks" will
>> not be accessible in this mode. We currently have no use case for
>> this, so it will not be supported now.
> 
> Thank you for confirming. This is a relief.
> 
>>
>>
>> Add a file, "info/kmode_monitor", to describe how kmode is monitored.
>>
>> # cat info/kmode_monitor
>> [inherit_ctrl_and_mon] <- Kernel uses the same CLOSID/RMID as user. Default option for the "global"
>> assign_ctrl_inherit_mon <- One CLOSID for all kernel work; RMID inherited from user.
>> assign_ctrl_assign_mon <- One resource group (CLOSID+RMID) for all kernel work. Default option for "cpu" type.
> 
> My first thought is that the naming is confusing. resctrl has a very strong relationship between
> "RMID" and "monitoring" so naming a file "monitor" that deals with allocation/ctrl/CLOSID is
> potentially confusion.
> 
> Apart from that, while I think I understand where you are going by separating the mode into
> two files I am concerned about future complications needing to accommodate all different
> combinations of the (now) essentially two modes. My preference is thus to keep this simple by
> keeping the mode within one file.
> 
> Even so, when stepping back, it does not really look like we need to separate the "global"
> and "per CPU" modes. We could just have a single "per CPU" mode and the "global" is just
> its default of "all CPUs", no?

Yes. That correct.

> 
> Consider, for example, the implementation just consisting of:
> 
> 	# cat info/kernel_mode
> 	[inherit_ctrl_and_mon]
> 	global_assign_ctrl_inherit_mon_per_cpu
> 	global_assign_ctrl_assign_mon_per_cpu
>   
>>
>> Rename “kernel_mode_assignment” to “kmode_group” to assign the specific group to kmode. This file usage is same as before.
>>
>> #cat info/kmode_groups (Renamed "kernel_mode_assignment")
>> //
> 
> Please consider the intent of this file when thinking about names. The idea is that "info/kernel_mode"
> specifies the "mode" of how kernel work is handled and it determines the configuration files used in that
> mode as well as the syntax when interacting with those files. By renaming "kernel_mode_assignment" to
> "kmode_groups" it implicitly requires all future kernel mode enhancements to need some data related to "groups".
> 
> In summary, I think this can be simplified by introducing just two new files in info/ that enables the
> user to (a) select and (b) configure the "kernel mode". To start there can be just two modes,
> global_assign_ctrl_inherit_mon_per_cpu and global_assign_ctrl_assign_mon_per_cpu.
> global_assign_ctrl_inherit_mon_per_cpu mode requires a control group in kernel_mode_assignment while
> global_assign_ctrl_assign_mon_per_cpu requires a control and monitoring group.
> 
> The resource group in info/kernel_mode_assignment gets two additional files "kernel_mode_cpus" and
> "kernel_mode_cpus_list" that contains the CPUs enabled with the kernel mode configuration, by default
> it will be all online CPUs. The resource group can continue to be used to manage allocations of and
> monitor user space tasks. Specifically, the "cpus", "cpus_list", and "tasks" files remain.
> 
> A user wanting just "global" settings will get just that when writing the group to
> info/kernel_mode_assignment. A user wanting "per CPU" settings can follow the
> info/kernel_mode_assignment setting with changes to that resource group's kernel_mode_cpus/kernel_mode_cpus_list
> files. Any task running on a CPU that is *not* in kernel_mode_cpus/kernel_mode_cpus_list can be
> expected to inherit both CLOSID and RMID from user space for all kernel work.

After further consideration, I don’t think the info/kernel_mode file is 
necessary. There’s no need to enforce a specific mode for all the PLZA 
groups. Avoiding this constraint makes the design more flexible, 
particularly as we move toward supporting multiple PLZA groups in the 
future. MPAM already appears capable of handling more than one group—for 
example, one group could use inherit_ctrl_and_mon, while another could 
use global_assign_ctrl_inherit_mon_per_cpu.

The mode can simply be determined on a per-group basis. We can introduce 
two new files—kernel_mode_cpus and kernel_mode_cpus_list—within each 
resctrl group when kmode (or PLZA) is supported.

The info/kernel_mode_assignment file would indicate which resctrl 
group(or groups) is used for PLZA. The files—kernel_mode_cpus and 
kernel_mode_cpus_list would indicate how the plza is applied which each 
group.

Files and behavior:
- cpus / cpus_list:

CPUs listed here use the same allocation for both user and kernel space.
There is no change to the current semantics of these files.
If these files are empty, the group effectively becomes a PLZA-dedicated 
group.

- kernel_mode_cpus / kernel_mode_cpus_list:

These files determine whether a separate kernel allocation is applied.
If empty, user and kernel share the same allocation.
If non-empty, the kernel uses a separate allocation.

The group can be CTL_MON or MON group. Based on type the group the 
CLOSID and RMID will be used to enable PLZA. If it is MON, then rmid_en 
= 1 when writing PLZA MSR.


Here’s the proposed flow:

# mount -t resctrl resctrl /sys/fs/resctrl/
# cd /sys/fs/resctrl/
# cat info/kernel_mode_assignment
//

By default, the root (default) group is PLZA-enabled when resctrl is 
mounted. All CPUs use CLOSID 0 for both user and kernel-mode allocation.

# cat cpus_list
1-64
# cat kmode_cpus_list
1-64

Next, create a new group for PLZA:

# mkdir plza_group

# echo "plza_group//" > info/kernel_mode_assignment

At this point, plza_group becomes the new PLZA-enabled group, and the 
PLZA-related MSRs are updated accordingly.

# cat plza_group/cpus_list
<empty>

# cat plza_group/kmode_cpus_list
1-64

The user can then update kmode_cpus_list to apply PLZA only to a 
specific subset of CPUs, if desired.


What do you think of this approach?


Thanks
Babu

^ permalink raw reply

* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Jim Mattson @ 2026-04-08  0:47 UTC (permalink / raw)
  To: Dave Hansen
  Cc: Pawan Gupta, x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin,
	Josh Poimboeuf, David Kaplan, Sean Christopherson,
	Borislav Petkov, Dave Hansen, Peter Zijlstra, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, KP Singh, Jiri Olsa,
	David S. Miller, David Laight, Andy Lutomirski, Thomas Gleixner,
	Ingo Molnar, David Ahern, Martin KaFai Lau, Eduard Zingerman,
	Song Liu, Yonghong Song, John Fastabend, Stanislav Fomichev,
	Hao Luo, Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm,
	Asit Mallick, Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <a605fb45-f8e3-45ad-8924-1da43b9a9e05@intel.com>

On Tue, Apr 7, 2026 at 4:41 PM Dave Hansen <dave.hansen@intel.com> wrote:
>
> On 4/7/26 16:27, Jim Mattson wrote:
> > What is your proposed BHI_DIS_S override mechanism, then?
>
> Let me make sure I get this right. The desire is to:
>
> 1. Have hypervisors lie to guests about the CPU they are running on (for
>    the benefit of large/diverse migration pools)
> 2. Have guests be allowed to boot with BHI_DIS_S for performance
> 3. Have apps in those guests that care about security to opt back in to
>    BHI_DIS_S for themselves?

I just want guests on heterogeneous migration pools to properly
protect themselves from native BHI when running on host kernels at
least as far back as Linux v6.6.

To that end, I would be satisfied with using the longer BHB clearing
sequence when HYPERVISOR is true and BHI_CTRL is false.

^ permalink raw reply

* Re: [PATCH 14/24] nfsd: add tracepoint to dir_event handler
From: Steven Rostedt @ 2026-04-08  0:41 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Masami Hiramatsu, Mathieu Desnoyers,
	Jonathan Corbet, Shuah Khan, NeilBrown, Olga Kornievskaia,
	Dai Ngo, Tom Talpey, Trond Myklebust, Anna Schumaker,
	Amir Goldstein, Calum Mackay, linux-fsdevel, linux-kernel,
	linux-trace-kernel, linux-doc, linux-nfs
In-Reply-To: <20260407-dir-deleg-v1-14-aaf68c478abd@kernel.org>

On Tue, 07 Apr 2026 09:21:27 -0400
Jeff Layton <jlayton@kernel.org> wrote:

> Add some extra visibility around the fsnotify handlers.
> 
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4state.c |  2 ++
>  fs/nfsd/trace.h     | 20 ++++++++++++++++++++
>  2 files changed, 22 insertions(+)

Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>

-- Steve

^ permalink raw reply

* Re: [PATCH RFC v4 10/44] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES2
From: Sean Christopherson @ 2026-04-08  0:33 UTC (permalink / raw)
  To: Michael Roth
  Cc: Vishal Annapurve, Ackerley Tng, aik, andrew.jones, binbin.wu,
	brauner, chao.p.peng, david, ira.weiny, jmattson, jthoughton,
	oupton, pankaj.gupta, qperret, rick.p.edgecombe, rientjes,
	shivankg, steven.price, tabba, willy, wyihan, yan.y.zhao,
	forkloop, pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm
In-Reply-To: <nmy5polcxfnn3hoircsiqarxmzlulwwq7w34okanccntp32v56@h2eac44agovv>

On Tue, Apr 07, 2026, Michael Roth wrote:
> On Tue, Apr 07, 2026 at 02:50:58PM -0700, Vishal Annapurve wrote:
> > On Tue, Apr 7, 2026 at 2:09 PM Michael Roth <michael.roth@amd.com> wrote:
> > >
> > > > TLDR:
> > > >
> > > > + Think of populate ioctls not as KVM touching memory, but platform
> > > >   handling population.
> > > > + KVM code (kvm_gmem_populate) still doesn't touch memory contents
> > > > + post_populate is platform-specific code that handles loading into
> > > >   private destination memory just to support legacy non-in-place
> > > >   conversion.
> > > > + Don't complicate populate ioctls by doing conversion just to support
> > > >   legacy use-cases where platform-specific code has to do copying on
> > > >   the host.
> > >
> > > That's a good point: these are only considerations in the context of
> > > actually copying from src->dst, but with in-place conversion the
> > > primary/more-performant approach will be for userspace to initial
> > > directly. I.e. if we enforced that, then gmem could right ascertain that
> > > it isn't even writing to private pages via these hooks and any
> > > manipulation of that memory is purely on the part of the trusted entity
> > > handling initial encryption/etc.
> > >
> > > I understand that we decided to keep the option of allowing separate
> > > src/dst even with in-place conversion, but it doesn't seem worthwhile if
> > > that necessarily means we need to glue population+conversion together in
> > > 1 clumsy interface that needs to handle partial return/error responses to
> > > userspace (or potentially get stuck forever in the conversion path).
> > 
> > I think ARM needs userspace to specify separate source and destination
> > memory ranges for initial population as ARM doesn't support in-place
> > memory encryption. [1]
> > 
> > [1] https://lore.kernel.org/kvm/20260318155413.793430-25-steven.price@arm.com/
> > 
> > >
> > > So I agree with Ackerley's proposal (which I guess is the same as what's
> > > in this series).
> > >
> > > However, 1 other alternative would be to do what was suggested on the
> > > call, but require userspace to subsequently handle the shared->private
> > > conversion. I think that would be workable too.
> > 
> > IIUC, Converting memory ranges to private after it essentially is
> > treated as private by the KVM CC backend will expose the
> > implementation to the same risk of userspace being able to access
> > private memory and compromise host safety which guest_memfd was
> > invented to address.
> 
> Doh, fair point. Doing conversion as part of the populate call would allow
> us to use the filemap write-lock to avoid userspace being able to fault
> in private (as tracked by trusted entity) pages before they are
> transitioned to private (as tracked by KVM), so it's safer than having
> userspace drive it.
> 
> But obviously I still think Ackerley's original proposal has more
> upsides than the alternatives mentioned so far.

I'm a bit lost.  What exactly is/was Ackerley's original proposal?  If the answer
is "convert pages from shared=>private when populating via in-place conversion",
then I agree, because AFAICT, that's the only sane option.

^ permalink raw reply

* Re: [PATCH RFC v4 10/44] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES2
From: Sean Christopherson @ 2026-04-08  0:30 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Michael Roth, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, ira.weiny, jmattson, jthoughton, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, Paolo Bonzini, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Jonathan Corbet, Shuah Khan, Shuah Khan, Vishal Annapurve,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm
In-Reply-To: <CAEvNRgEtigp7+PVDkyu_DH947CUqDt312d+P+hWjjd2fHONiag@mail.gmail.com>

On Fri, Apr 03, 2026, Ackerley Tng wrote:
> Currently, in TDX's populate flow, KVM doesn't do any copying, it only
> instructs TDX to do the copying.

I disagree with this statement.  For all intents and purposes, the TDX-Module is
firmware.  If Intel had elected to implement TDX via XuCode, and presented it to
software as ISA (see SGX), then under the hood "firmware" would still be doing the
actual copy, but KVM would be execute some form of "copy" instruction.

Saying "KVM doesn't do any copying" is (very loosely) analogous to saying that
KVM doesn't copy anything when it does REP MOVSQ.  It wasn't me your honor, Intel's
string engine did it!

I don't think it changes anything in practice, but I don't want to treat TDX
SEAMCALLs (or SNP PSP commands) as something completely different than what we
usually think of as "hardware".

^ permalink raw reply

* Re: [PATCH] KVM: x86: nSVM: Redirect IA32_PAT accesses to either hPAT or gPAT
From: Sean Christopherson @ 2026-04-07 23:52 UTC (permalink / raw)
  To: Jim Mattson
  Cc: Paolo Bonzini, Jonathan Corbet, Shuah Khan, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	kvm, linux-doc, linux-kernel, Yosry Ahmed
In-Reply-To: <CALMp9eS+XiJ=u2618Hke9ePyzeTuChU=dLt+e=x2nXLTMVH5mw@mail.gmail.com>

On Tue, Apr 07, 2026, Jim Mattson wrote:
> On Tue, Apr 7, 2026 at 12:24 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Tue, Apr 07, 2026, Jim Mattson wrote:
> > > When KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT is disabled and the vCPU is in
> > > guest mode with nested NPT enabled, guest accesses to IA32_PAT are
> > > redirected to the gPAT register, which is stored in VMCB02's g_pat field.
> > >
> > > Non-guest accesses (e.g. from userspace) to IA32_PAT are always redirected
> > > to hPAT, which is stored in vcpu->arch.pat.
> > >
> > > Directing host-initiated accesses to hPAT ensures that KVM_GET/SET_MSRS and
> > > KVM_GET/SET_NESTED_STATE are independent of each other and can be ordered
> > > arbitrarily during save and restore. gPAT is saved and restored separately
> > > via KVM_GET/SET_NESTED_STATE.
> > >
> > > Use WARN_ON_ONCE to flag any host-initiated accesses originating from KVM
> > > itself rather than userspace.
> > >
> > > Use pr_warn_once to flag any use of the common MSR-handling code (now
> > > shared by VMX and TDX) for IA32_PAT by a vCPU that is SVM-capable.
> >
> > Changelog is stale, but otherwise this LGTM.  I'll fixup the changelog when
> > applying (in a few weeks).
> 
> Oh, crud. This was supposed to be 5/8, but I made some changes after
> checkpatch.pl complained and then tried to just regenerate this one,
> but I totally flubbed it.

Huh.  The patch shows up when I grab the thread via b4 mbox and open it with mutt,
but b4 am skips it.  I'm guessing there's version-based filtering somewhere in b4.

No need for a v9 on my account, I can splice in 5/8 when applying.

^ permalink raw reply

* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Dave Hansen @ 2026-04-07 23:41 UTC (permalink / raw)
  To: Jim Mattson, Pawan Gupta
  Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
	David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
	Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
	David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
	Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
	Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <CALMp9eQjSqwnvJz4JVzYpMkkTiucSJtW48zC4Hj9GBiUhOH-Eg@mail.gmail.com>

On 4/7/26 16:27, Jim Mattson wrote:
> What is your proposed BHI_DIS_S override mechanism, then?

Let me make sure I get this right. The desire is to:

1. Have hypervisors lie to guests about the CPU they are running on (for
   the benefit of large/diverse migration pools)
2. Have guests be allowed to boot with BHI_DIS_S for performance
3. Have apps in those guests that care about security to opt back in to
   BHI_DIS_S for themselves?



^ permalink raw reply

* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Jim Mattson @ 2026-04-07 23:27 UTC (permalink / raw)
  To: Pawan Gupta
  Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
	David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
	Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
	David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
	Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
	Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <20260407222738.lrartp6evfp7yhti@desk>

On Tue, Apr 7, 2026 at 3:27 PM Pawan Gupta
<pawan.kumar.gupta@linux.intel.com> wrote:
>
> On Tue, Apr 07, 2026 at 01:53:24PM -0700, Jim Mattson wrote:
> > On Tue, Apr 7, 2026 at 12:11 PM Pawan Gupta
> > <pawan.kumar.gupta@linux.intel.com> wrote:
> > >
> > > On Tue, Apr 07, 2026 at 11:40:57AM -0700, Jim Mattson wrote:
> > > > My proposal is as follows:
> > > >
> > > > 1. The (advanced) hypervisor can advertise to the guest (via CPUID bit
> > > > or MSR bit) that the short BHB clearing sequence is adequate. This may
> > > > mean either that the VM will only be hosted on pre-Alder Lake hardware
> > > > or that the hypervisor will set BHI_DIS_S behind the back of the
> > > > guest. Presumably, this bit would not be reported if BHI_CTRL is
> > > > advertised to the guest.
> > > > 2. If the guest sees this bit, then it can use the short sequence. If
> > > > it doesn't see this bit, it must use the long sequence.
> > >
> > > Thats a good middle ground. Let me check with folks internally what they
> > > think about defining a new software-only bit.
> > >
> > > Third case, for a guest that doesn't want BHI_DIS_S, userspace should be
> > > allowed to override setting BHI_DIS_S. Then this proposed bit can indicate
> > > that long sequence is required.
> >
> > That case can be handled by the paravirtual mitigation MSRs, right?
>
> Yes. But, that was the part that received the most pushback.

What is your proposed BHI_DIS_S override mechanism, then?

^ permalink raw reply

* Re: [PATCH v2 0/5] Disregard patch series - docs: pt_BR: Complete PGP maintainer guide
From: Jonathan Corbet @ 2026-04-07 22:42 UTC (permalink / raw)
  To: Daniel Pereira, linux-doc
In-Reply-To: <CAMAsx6dF=FH7+DHK5+s6z8dc197ATn-seG7ZYYeUbcRb8xsuqg@mail.gmail.com>

Daniel Pereira <danielmaraboo@gmail.com> writes:

> Hi Jhonatan,
>
> To keep my recent contribution more organized, please disregard the
> patch series I sent a few days ago: [PATCH v2 0/5] docs: pt_BR:
> Complete PGP maintainer guide translation. I will be sending a single,
> consolidated patch in a few days instead.

OK ... bear in mind that we're heading into the merge window, so this
series will be 7.2 material regardless.

Thanks,

jon

^ permalink raw reply

* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Pawan Gupta @ 2026-04-07 22:27 UTC (permalink / raw)
  To: Jim Mattson
  Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
	David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
	Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
	David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
	Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
	Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <CALMp9eTK0o7Z7-oTB8ohvmoh-vy-Y2qjdUvbqD6HaEhOEPZmhw@mail.gmail.com>

On Tue, Apr 07, 2026 at 01:53:24PM -0700, Jim Mattson wrote:
> On Tue, Apr 7, 2026 at 12:11 PM Pawan Gupta
> <pawan.kumar.gupta@linux.intel.com> wrote:
> >
> > On Tue, Apr 07, 2026 at 11:40:57AM -0700, Jim Mattson wrote:
> > > My proposal is as follows:
> > >
> > > 1. The (advanced) hypervisor can advertise to the guest (via CPUID bit
> > > or MSR bit) that the short BHB clearing sequence is adequate. This may
> > > mean either that the VM will only be hosted on pre-Alder Lake hardware
> > > or that the hypervisor will set BHI_DIS_S behind the back of the
> > > guest. Presumably, this bit would not be reported if BHI_CTRL is
> > > advertised to the guest.
> > > 2. If the guest sees this bit, then it can use the short sequence. If
> > > it doesn't see this bit, it must use the long sequence.
> >
> > Thats a good middle ground. Let me check with folks internally what they
> > think about defining a new software-only bit.
> >
> > Third case, for a guest that doesn't want BHI_DIS_S, userspace should be
> > allowed to override setting BHI_DIS_S. Then this proposed bit can indicate
> > that long sequence is required.
> 
> That case can be handled by the paravirtual mitigation MSRs, right?

Yes. But, that was the part that received the most pushback.

^ permalink raw reply

* [PATCH v2 0/5] Disregard patch series - docs: pt_BR: Complete PGP maintainer guide
From: Daniel Pereira @ 2026-04-07 22:26 UTC (permalink / raw)
  To: Jonathan Corbet, linux-doc

Hi Jhonatan,

To keep my recent contribution more organized, please disregard the
patch series I sent a few days ago: [PATCH v2 0/5] docs: pt_BR:
Complete PGP maintainer guide translation. I will be sending a single,
consolidated patch in a few days instead.

Thanks!

^ permalink raw reply

* Re: [PATCH RFC v4 10/44] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES2
From: Michael Roth @ 2026-04-07 22:09 UTC (permalink / raw)
  To: Vishal Annapurve
  Cc: Ackerley Tng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, ira.weiny, jmattson, jthoughton, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm
In-Reply-To: <CAGtprH-kgRByFvvCYeWMXtsvpb6qpaWAo8k-3PEnioyPg-LEvA@mail.gmail.com>

On Tue, Apr 07, 2026 at 02:50:58PM -0700, Vishal Annapurve wrote:
> On Tue, Apr 7, 2026 at 2:09 PM Michael Roth <michael.roth@amd.com> wrote:
> >
> > > TLDR:
> > >
> > > + Think of populate ioctls not as KVM touching memory, but platform
> > >   handling population.
> > > + KVM code (kvm_gmem_populate) still doesn't touch memory contents
> > > + post_populate is platform-specific code that handles loading into
> > >   private destination memory just to support legacy non-in-place
> > >   conversion.
> > > + Don't complicate populate ioctls by doing conversion just to support
> > >   legacy use-cases where platform-specific code has to do copying on
> > >   the host.
> >
> > That's a good point: these are only considerations in the context of
> > actually copying from src->dst, but with in-place conversion the
> > primary/more-performant approach will be for userspace to initial
> > directly. I.e. if we enforced that, then gmem could right ascertain that
> > it isn't even writing to private pages via these hooks and any
> > manipulation of that memory is purely on the part of the trusted entity
> > handling initial encryption/etc.
> >
> > I understand that we decided to keep the option of allowing separate
> > src/dst even with in-place conversion, but it doesn't seem worthwhile if
> > that necessarily means we need to glue population+conversion together in
> > 1 clumsy interface that needs to handle partial return/error responses to
> > userspace (or potentially get stuck forever in the conversion path).
> 
> I think ARM needs userspace to specify separate source and destination
> memory ranges for initial population as ARM doesn't support in-place
> memory encryption. [1]
> 
> [1] https://lore.kernel.org/kvm/20260318155413.793430-25-steven.price@arm.com/
> 
> >
> > So I agree with Ackerley's proposal (which I guess is the same as what's
> > in this series).
> >
> > However, 1 other alternative would be to do what was suggested on the
> > call, but require userspace to subsequently handle the shared->private
> > conversion. I think that would be workable too.
> 
> IIUC, Converting memory ranges to private after it essentially is
> treated as private by the KVM CC backend will expose the
> implementation to the same risk of userspace being able to access
> private memory and compromise host safety which guest_memfd was
> invented to address.

Doh, fair point. Doing conversion as part of the populate call would allow
us to use the filemap write-lock to avoid userspace being able to fault
in private (as tracked by trusted entity) pages before they are
transitioned to private (as tracked by KVM), so it's safer than having
userspace drive it.

But obviously I still think Ackerley's original proposal has more
upsides than the alternatives mentioned so far.

-Mike

> 
> >
> > One other benefit to Ackerley's/current approach however is that it allows
> > us to potentially keep hugepages intact in the populate path, since
> > prep'ing/encrypting everything while it's in a shared state means gmem will
> > split the hugepage and all the firmware/RMP/etc. data structures will only
> > be able to handle individual 4K pages. I still suspect doing things like
> > encoding the initial 2MB OVMF image as a single hugepage might yield
> > enough benefit to explore this (at some point). So there's some niceness
> > in knowing that Ackerley's approach would allow for that eventually and
> > not require a complete rethink on these same topics.
> >
> > Thanks,
> >
> > Mike
> >
> > >
> > > >>>
> > > >>> [...snip...]
> > > >>>

^ permalink raw reply

* Re: [PATCH RFC v4 10/44] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES2
From: Vishal Annapurve @ 2026-04-07 21:50 UTC (permalink / raw)
  To: Michael Roth
  Cc: Ackerley Tng, aik, andrew.jones, binbin.wu, brauner, chao.p.peng,
	david, ira.weiny, jmattson, jthoughton, oupton, pankaj.gupta,
	qperret, rick.p.edgecombe, rientjes, shivankg, steven.price,
	tabba, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, Paolo Bonzini, Sean Christopherson,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm
In-Reply-To: <yvfwexsub7nrogh67hzcsupbrkzer6a7kbeao5tlq4elrzc2iz@xrwdjd7p32pp>

On Tue, Apr 7, 2026 at 2:09 PM Michael Roth <michael.roth@amd.com> wrote:
>
> > TLDR:
> >
> > + Think of populate ioctls not as KVM touching memory, but platform
> >   handling population.
> > + KVM code (kvm_gmem_populate) still doesn't touch memory contents
> > + post_populate is platform-specific code that handles loading into
> >   private destination memory just to support legacy non-in-place
> >   conversion.
> > + Don't complicate populate ioctls by doing conversion just to support
> >   legacy use-cases where platform-specific code has to do copying on
> >   the host.
>
> That's a good point: these are only considerations in the context of
> actually copying from src->dst, but with in-place conversion the
> primary/more-performant approach will be for userspace to initial
> directly. I.e. if we enforced that, then gmem could right ascertain that
> it isn't even writing to private pages via these hooks and any
> manipulation of that memory is purely on the part of the trusted entity
> handling initial encryption/etc.
>
> I understand that we decided to keep the option of allowing separate
> src/dst even with in-place conversion, but it doesn't seem worthwhile if
> that necessarily means we need to glue population+conversion together in
> 1 clumsy interface that needs to handle partial return/error responses to
> userspace (or potentially get stuck forever in the conversion path).

I think ARM needs userspace to specify separate source and destination
memory ranges for initial population as ARM doesn't support in-place
memory encryption. [1]

[1] https://lore.kernel.org/kvm/20260318155413.793430-25-steven.price@arm.com/

>
> So I agree with Ackerley's proposal (which I guess is the same as what's
> in this series).
>
> However, 1 other alternative would be to do what was suggested on the
> call, but require userspace to subsequently handle the shared->private
> conversion. I think that would be workable too.

IIUC, Converting memory ranges to private after it essentially is
treated as private by the KVM CC backend will expose the
implementation to the same risk of userspace being able to access
private memory and compromise host safety which guest_memfd was
invented to address.

>
> One other benefit to Ackerley's/current approach however is that it allows
> us to potentially keep hugepages intact in the populate path, since
> prep'ing/encrypting everything while it's in a shared state means gmem will
> split the hugepage and all the firmware/RMP/etc. data structures will only
> be able to handle individual 4K pages. I still suspect doing things like
> encoding the initial 2MB OVMF image as a single hugepage might yield
> enough benefit to explore this (at some point). So there's some niceness
> in knowing that Ackerley's approach would allow for that eventually and
> not require a complete rethink on these same topics.
>
> Thanks,
>
> Mike
>
> >
> > >>>
> > >>> [...snip...]
> > >>>

^ permalink raw reply

* [PATCH] docs: fix typo in zoran driver documentation
From: Gleb Golovko @ 2026-04-07 21:28 UTC (permalink / raw)
  To: corbet; +Cc: linux-doc, Gleb Golovko

Replace "an a few" with "and a few" in
Documentation/driver-api/media/drivers/zoran.rst.

Signed-off-by: Gleb Golovko <gaben123001@gmail.com>
---
 Documentation/driver-api/media/drivers/zoran.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/driver-api/media/drivers/zoran.rst b/Documentation/driver-api/media/drivers/zoran.rst
index 3e05b7f0442a..2538473c3233 100644
--- a/Documentation/driver-api/media/drivers/zoran.rst
+++ b/Documentation/driver-api/media/drivers/zoran.rst
@@ -222,7 +222,7 @@ The CCIR - I uses the PAL colorsystem, and is used in Great Britain, Hong Kong,
 Ireland, Nigeria, South Africa.
 
 The CCIR - N uses the PAL colorsystem and PAL frame size but the NTSC framerate,
-and is used in Argentina, Uruguay, an a few others
+and is used in Argentina, Uruguay, and a few others
 
 We do not talk about how the audio is broadcast !
 
-- 
2.42.0.windows.2


^ permalink raw reply related

* Re: [PATCH RFC v4 10/44] KVM: guest_memfd: Add support for KVM_SET_MEMORY_ATTRIBUTES2
From: Michael Roth @ 2026-04-07 21:09 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Jonathan Corbet, Shuah Khan, Shuah Khan, Vishal Annapurve,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Baoquan He, Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm
In-Reply-To: <CAEvNRgEtigp7+PVDkyu_DH947CUqDt312d+P+hWjjd2fHONiag@mail.gmail.com>

On Fri, Apr 03, 2026 at 07:50:16AM -0700, Ackerley Tng wrote:
> Ackerley
> dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnng <ackerleytng@google.com> writes:
> 
> >
> > [...snip...]
> >
> > guest_memfd's populate will first check that the memory is shared, then
> > also set the memory to private after the populate.
> >
> > [...snip...]
> >
> Looking at this again, the above basically means that the entire
> conversion process needs to be performed within populate.
> 
> In addition to setting the attributes in guest_memfd as private, for
> consistency, populate will also have to do all the associated
> operations, especially unmapping from the host, checking refcounts,
> and the list of work in conversion will only increase in future with
> direct map removal/restoration and huge page merging.
> 
> The complexity of conversion also means possible errors (EAGAIN for
> elevated refcounts and ENOMEM when we're out of memory), and error
> information like the offset where the elevated refcount was.
> 
> It doesn't look like there's room for this information to be plumbed out
> through the platform-specific ioctls, and even if we make space, it
> seems odd to have conversion-related error information returned through
> the platform-specific call.
> 
> 
> I agree with the goal of not having KVM touch private memory contents as
> tracked by guest_memfd, so I'd like to propose that we distinguish:
> 
> 1. private as tracked by KVM (guest_memfd/vm_memory_attributes)
> 2. private as tracked by trusted entity

I think this is a good distinction to keep in mind, because if we adopt
the proposal from the call of having userspace set the destination memory
to shared prior to calling kvm_gmem_populate(), then they don't really
stay shared until gmem convert them to private: instead, they get set to
"private as tracked by trusted entity", but at the same time still have
'shared' memory attributes as far as KVM is concerned. Normally (SNP,
at least), the 'private (as tracked by KVM)' state is an intermediate state
on the way to 'private (as tracked by KVM + trusted entity)'.

So we introduce some inconsistencies on that side, in order to address
the inconsistency of kvm_gmem_populate() writing to 'private (as tracked
by KVM)' memory. But as you point out...

> + destination address: private (as tracked by guest_memfd)
> + source address: shared (as tracked by guest_memfd) or NULL
> 
> KVM doesn't touch private memory contents, even in this case, because
> it's really a platform-specific ioctl that handles loading, and the
> platform does expect the destination is private for both TDX and SNP
> at the firmware boundary.

...yah, it's not really gmem that's writing to that memory, it's the
platform-specific hooks that 'prepare' the memory as part of population
and puts that in a 'private (as tracked by trusted entity)' state, just as
it's the platform-specific hooks that 'prepare' the memory as part of vCPU
page fault path at run-time and put them into a private (as tracked by
trusted entity). You could even imagine a naive CoCo implementation that
encrypts memory in-place at initial fault time via kvm_gmem_prepare()
hooks... we likely wouldn't insist on some new flow because this results
in gmem calling something that writes to 'private (as tracked by KVM)'
pages and would consider that to be more of a platform-specific
implementation detail that should be handled the same as other
architectures. And that seems like it would be roughly analogous to what
is being discussed here WRT the kvm_gmem_populate() path, so I think it
makes sense to continue to expecting the pages to be marked private in
advance of platform-specific preparation, whether that be via the
populate path or the runtime/fault-time path.


And for recent KVM,

; all the things we exp
about how the callbacks



As far as the copying goes, 

By expecting 'private' (as tracked by KVM) as the initial state for
kvm_gmem_populate(), a lot of invariants about private memory (safe
refcount, directmap removal expectations, etc.) remain consistent even
in the populate path, where any special handling for private memory can be
accounted for in the same way rather than "shared, but..." or "private,
but...".
> 
> Since SNP (platform-specific) only allows in-place launch update, and
> KVM had to provide an interface that allows a different source address
> for support before in-place conversion, then SNP has to continue
> supporting the to-be-deprecated path by doing the copying within
> platform-specific code.
> 
> For consistency, guest_memfd can continue to check that it tracks the
> destination address as private, and sev_gmem_populate will then hide
> the copying code away just to support the legacy case.
> 
> 
> The flow before in-place conversion is
> 
> 1. Load memory (shared or non-guest_memfd memory)
> 2. KVM_SEV_SNP_LAUNCH_UPDATE or KVM_TDX_INIT_MEM_REGION (destination:
>    gfn for separate private memory, source: shared memory)
> 
> The proposed flow for in-place conversion is
> 
> 1. INIT_SHARED or convert to shared
> 2. Load memory while it is shared
> 3. Convert to private (PRESERVE, or new flag?)
> 4. KVM_SEV_SNP_LAUNCH_UPDATE or KVM_TDX_INIT_MEM_REGION (destination:
>    gfn for converted private memory, source: NULL)
> 
> 
> TLDR:
> 
> + Think of populate ioctls not as KVM touching memory, but platform
>   handling population.
> + KVM code (kvm_gmem_populate) still doesn't touch memory contents
> + post_populate is platform-specific code that handles loading into
>   private destination memory just to support legacy non-in-place
>   conversion.
> + Don't complicate populate ioctls by doing conversion just to support
>   legacy use-cases where platform-specific code has to do copying on
>   the host.

That's a good point: these are only considerations in the context of
actually copying from src->dst, but with in-place conversion the
primary/more-performant approach will be for userspace to initial
directly. I.e. if we enforced that, then gmem could right ascertain that
it isn't even writing to private pages via these hooks and any
manipulation of that memory is purely on the part of the trusted entity
handling initial encryption/etc.

I understand that we decided to keep the option of allowing separate
src/dst even with in-place conversion, but it doesn't seem worthwhile if
that necessarily means we need to glue population+conversion together in
1 clumsy interface that needs to handle partial return/error responses to
userspace (or potentially get stuck forever in the conversion path).

So I agree with Ackerley's proposal (which I guess is the same as what's
in this series).

However, 1 other alternative would be to do what was suggested on the
call, but require userspace to subsequently handle the shared->private
conversion. I think that would be workable too.

One other benefit to Ackerley's/current approach however is that it allows
us to potentially keep hugepages intact in the populate path, since
prep'ing/encrypting everything while it's in a shared state means gmem will
split the hugepage and all the firmware/RMP/etc. data structures will only
be able to handle individual 4K pages. I still suspect doing things like
encoding the initial 2MB OVMF image as a single hugepage might yield
enough benefit to explore this (at some point). So there's some niceness
in knowing that Ackerley's approach would allow for that eventually and
not require a complete rethink on these same topics.

Thanks,

Mike

> 
> >>>
> >>> [...snip...]
> >>>

^ permalink raw reply

* Re: [PATCH v9 02/10] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Jim Mattson @ 2026-04-07 20:53 UTC (permalink / raw)
  To: Pawan Gupta
  Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
	David Kaplan, Sean Christopherson, Borislav Petkov, Dave Hansen,
	Peter Zijlstra, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, KP Singh, Jiri Olsa, David S. Miller,
	David Laight, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	David Ahern, Martin KaFai Lau, Eduard Zingerman, Song Liu,
	Yonghong Song, John Fastabend, Stanislav Fomichev, Hao Luo,
	Paolo Bonzini, Jonathan Corbet, linux-kernel, kvm, Asit Mallick,
	Tao Zhang, bpf, netdev, linux-doc, chao.gao
In-Reply-To: <20260407191128.b2hr2ttkdpyunhrr@desk>

On Tue, Apr 7, 2026 at 12:11 PM Pawan Gupta
<pawan.kumar.gupta@linux.intel.com> wrote:
>
> On Tue, Apr 07, 2026 at 11:40:57AM -0700, Jim Mattson wrote:
> > My proposal is as follows:
> >
> > 1. The (advanced) hypervisor can advertise to the guest (via CPUID bit
> > or MSR bit) that the short BHB clearing sequence is adequate. This may
> > mean either that the VM will only be hosted on pre-Alder Lake hardware
> > or that the hypervisor will set BHI_DIS_S behind the back of the
> > guest. Presumably, this bit would not be reported if BHI_CTRL is
> > advertised to the guest.
> > 2. If the guest sees this bit, then it can use the short sequence. If
> > it doesn't see this bit, it must use the long sequence.
>
> Thats a good middle ground. Let me check with folks internally what they
> think about defining a new software-only bit.
>
> Third case, for a guest that doesn't want BHI_DIS_S, userspace should be
> allowed to override setting BHI_DIS_S. Then this proposed bit can indicate
> that long sequence is required.

That case can be handled by the paravirtual mitigation MSRs, right?

^ permalink raw reply

* Re: [PATCH] KVM: x86: nSVM: Redirect IA32_PAT accesses to either hPAT or gPAT
From: Jim Mattson @ 2026-04-07 20:51 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Paolo Bonzini, Jonathan Corbet, Shuah Khan, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	kvm, linux-doc, linux-kernel, Yosry Ahmed
In-Reply-To: <adVZ7-EiekghvDMD@google.com>

On Tue, Apr 7, 2026 at 12:24 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Apr 07, 2026, Jim Mattson wrote:
> > When KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT is disabled and the vCPU is in
> > guest mode with nested NPT enabled, guest accesses to IA32_PAT are
> > redirected to the gPAT register, which is stored in VMCB02's g_pat field.
> >
> > Non-guest accesses (e.g. from userspace) to IA32_PAT are always redirected
> > to hPAT, which is stored in vcpu->arch.pat.
> >
> > Directing host-initiated accesses to hPAT ensures that KVM_GET/SET_MSRS and
> > KVM_GET/SET_NESTED_STATE are independent of each other and can be ordered
> > arbitrarily during save and restore. gPAT is saved and restored separately
> > via KVM_GET/SET_NESTED_STATE.
> >
> > Use WARN_ON_ONCE to flag any host-initiated accesses originating from KVM
> > itself rather than userspace.
> >
> > Use pr_warn_once to flag any use of the common MSR-handling code (now
> > shared by VMX and TDX) for IA32_PAT by a vCPU that is SVM-capable.
>
> Changelog is stale, but otherwise this LGTM.  I'll fixup the changelog when
> applying (in a few weeks).

Oh, crud. This was supposed to be 5/8, but I made some changes after
checkpatch.pl complained and then tried to just regenerate this one,
but I totally flubbed it.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox