Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* [PATCH net-next v13 2/2] net: mana: force full-page RX buffers via ethtool private flag
From: Dipayaan Roy @ 2026-07-29  6:21 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, linux-hyperv, netdev, linux-kernel, linux-rdma,
	stephen, jacob.e.keller, dipayanroy, leitao, kees, john.fastabend,
	hawk, bpf, daniel, ast, sdf, yury.norov, pavan.chebbi,
	schakrabarti, gargaditya
In-Reply-To: <20260729063347.3388035-1-dipayanroy@linux.microsoft.com>

On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
allocation in the RX refill path can cause 15-20% throughput
regression under high connection counts (>16 TCP streams).

Add an ethtool private flag "full-page-rx" that allows the user to
force one RX buffer per page, bypassing the page_pool fragment path.
This restores line-rate (180+ Gbps) performance on affected platforms.

Usage:
  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag must be explicitly
enabled by the user or udev rule.

The existing single-buffer-per-page logic for XDP and jumbo frames is
consolidated into a new helper mana_use_single_rxbuf_per_page() which
is now the single decision point for both the automatic and
user-controlled paths.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 22 ++++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 94 +++++++++++++++++++
 include/net/mana/mana.h                       |  8 ++
 3 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 89e7f59f635d..d859c2eae0a5 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -755,6 +755,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
 	return va;
 }
 
+static bool
+mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
+{
+	/* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
+	 * in the RX refill path (~2kB buffer) can cause significant throughput
+	 * regression under high connection counts. Allow user to force one RX
+	 * buffer per page via ethtool private flag to bypass the fragment
+	 * path.
+	 */
+	if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
+		return true;
+
+	/* For xdp and jumbo frames make sure only one packet fits per page. */
+	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
+		return true;
+
+	return false;
+}
+
 /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
 static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 			       int mtu, u32 *datasize, u32 *alloc_size,
@@ -765,8 +784,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 	/* Calculate datasize first (consistent across all cases) */
 	*datasize = mtu + ETH_HLEN;
 
-	/* For xdp and jumbo frames make sure only one packet fits per page */
-	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
+	if (mana_use_single_rxbuf_per_page(apc, mtu)) {
 		if (mana_xdp_get(apc)) {
 			*headroom = XDP_PACKET_HEADROOM;
 			*alloc_size = PAGE_SIZE;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 482cd16009ab..7e441d6ae5dc 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -133,6 +133,10 @@ static const struct mana_stats_desc mana_phy_stats[] = {
 	{ "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) },
 };
 
+static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
+	[MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
+};
+
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -144,6 +148,10 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 		       ARRAY_SIZE(mana_phy_stats) +
 		       ARRAY_SIZE(mana_hc_stats)  +
 		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+
+	case ETH_SS_PRIV_FLAGS:
+		return MANA_PRIV_FLAG_MAX;
+
 	default:
 		return -EINVAL;
 	}
@@ -192,6 +200,14 @@ static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 	}
 }
 
+static void mana_get_strings_priv_flags(u8 **data)
+{
+	int i;
+
+	for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
+		ethtool_puts(data, mana_priv_flags[i]);
+}
+
 static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -200,6 +216,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 	case ETH_SS_STATS:
 		mana_get_strings_stats(apc, &data);
 		break;
+	case ETH_SS_PRIV_FLAGS:
+		mana_get_strings_priv_flags(&data);
+		break;
 	default:
 		break;
 	}
@@ -756,6 +775,78 @@ static int mana_get_link_ksettings(struct net_device *ndev,
 	return 0;
 }
 
+static u32 mana_get_priv_flags(struct net_device *ndev)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	return apc->priv_flags;
+}
+
+static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+	u32 changed = apc->priv_flags ^ priv_flags;
+	u32 old_priv_flags = apc->priv_flags;
+	int err = 0;
+
+	if (!changed)
+		return 0;
+
+	/* Reject unknown bits */
+	if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
+		return -EINVAL;
+
+	apc->priv_flags = priv_flags;
+
+	if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
+		if (!apc->port_is_up)
+			return 0;
+
+		/* If XDP is attached or MTU is jumbo, single-buffer-per-page
+		 * is already forced regardless of this flag. Skip the
+		 * expensive detach/attach cycle since nothing changes.
+		 */
+		if (ndev->mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 ||
+		    mana_xdp_get(apc))
+			return 0;
+
+		/* Block RDMA from grabbing the vport during detach/attach */
+		mutex_lock(&apc->vport_mutex);
+		apc->channel_changing = true;
+		mutex_unlock(&apc->vport_mutex);
+
+		err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
+		if (err) {
+			netdev_err(ndev,
+				   "Insufficient memory for new allocations\n");
+			apc->priv_flags = old_priv_flags;
+			goto clear_flag;
+		}
+
+		err = mana_detach(ndev, false);
+		if (err) {
+			netdev_err(ndev, "mana_detach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+			goto out;
+		}
+
+		err = mana_attach(ndev);
+		if (err) {
+			netdev_err(ndev, "mana_attach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+		}
+	}
+
+out:
+	mana_pre_dealloc_rxbufs(apc);
+clear_flag:
+	mutex_lock(&apc->vport_mutex);
+	apc->channel_changing = false;
+	mutex_unlock(&apc->vport_mutex);
+
+	return err;
+}
+
 const struct ethtool_ops mana_ethtool_ops = {
 	.supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES |
 				     ETHTOOL_COALESCE_RX_USECS |
@@ -766,6 +857,7 @@ const struct ethtool_ops mana_ethtool_ops = {
 				     ETHTOOL_COALESCE_USE_ADAPTIVE_TX,
 	.op_needs_rtnl		= ETHTOOL_OP_NEEDS_RTNL_SCHANNELS |
 				  ETHTOOL_OP_NEEDS_RTNL_SRINGPARAM |
+				  ETHTOOL_OP_NEEDS_RTNL_SPFLAGS |
 				  ETHTOOL_OP_NEEDS_RTNL_GLINK,
 	.get_ethtool_stats	= mana_get_ethtool_stats,
 	.get_sset_count		= mana_get_sset_count,
@@ -783,4 +875,6 @@ const struct ethtool_ops mana_ethtool_ops = {
 	.set_ringparam          = mana_set_ringparam,
 	.get_link_ksettings	= mana_get_link_ksettings,
 	.get_link		= ethtool_op_get_link,
+	.get_priv_flags		= mana_get_priv_flags,
+	.set_priv_flags		= mana_set_priv_flags,
 };
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 226b61504596..768d9f9bf167 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -31,6 +31,12 @@ enum TRI_STATE {
 	TRI_STATE_TRUE = 1
 };
 
+/* MANA ethtool private flag bit positions */
+enum mana_priv_flag_bits {
+	MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
+	MANA_PRIV_FLAG_MAX,
+};
+
 /* Number of entries for hardware indirection table must be in power of 2 */
 #define MANA_INDIRECT_TABLE_MAX_SIZE 512
 #define MANA_INDIRECT_TABLE_DEF_SIZE 64
@@ -565,6 +571,8 @@ struct mana_port_context {
 	u32 rxbpre_headroom;
 	u32 rxbpre_frag_count;
 
+	u32 priv_flags;
+
 	struct bpf_prog *bpf_prog;
 
 	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v13 1/2] net: mana: refactor mana_get_strings() and mana_get_sset_count() to use switch
From: Dipayaan Roy @ 2026-07-29  6:21 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, linux-hyperv, netdev, linux-kernel, linux-rdma,
	stephen, jacob.e.keller, dipayanroy, leitao, kees, john.fastabend,
	hawk, bpf, daniel, ast, sdf, yury.norov, pavan.chebbi,
	schakrabarti, gargaditya
In-Reply-To: <20260729063347.3388035-1-dipayanroy@linux.microsoft.com>

Refactor mana_get_strings() and mana_get_sset_count() from if/else to
switch statements in preparation for adding ethtool private flags
support which requires handling ETH_SS_PRIV_FLAGS.

No functional change.

Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
 .../ethernet/microsoft/mana/mana_ethtool.c    | 75 ++++++++++++-------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 9e31e2595ae3..482cd16009ab 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -138,53 +138,70 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 
-	if (stringset != ETH_SS_STATS)
+	switch (stringset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(mana_eth_stats) +
+		       ARRAY_SIZE(mana_phy_stats) +
+		       ARRAY_SIZE(mana_hc_stats)  +
+		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	default:
 		return -EINVAL;
-
-	return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + ARRAY_SIZE(mana_hc_stats) +
-			num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	}
 }
 
-static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 {
-	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 	int i, j;
 
-	if (stringset != ETH_SS_STATS)
-		return;
 	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
-		ethtool_puts(&data, mana_eth_stats[i].name);
+		ethtool_puts(data, mana_eth_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
-		ethtool_puts(&data, mana_hc_stats[i].name);
+		ethtool_puts(data, mana_hc_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
-		ethtool_puts(&data, mana_phy_stats[i].name);
+		ethtool_puts(data, mana_phy_stats[i].name);
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "rx_%d_packets", i);
-		ethtool_sprintf(&data, "rx_%d_bytes", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
-		ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
+		ethtool_sprintf(data, "rx_%d_packets", i);
+		ethtool_sprintf(data, "rx_%d_bytes", i);
+		ethtool_sprintf(data, "rx_%d_xdp_drop", i);
+		ethtool_sprintf(data, "rx_%d_xdp_tx", i);
+		ethtool_sprintf(data, "rx_%d_xdp_redirect", i);
+		ethtool_sprintf(data, "rx_%d_pkt_len0_err", i);
 		for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
-			ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2);
+			ethtool_sprintf(data,
+					"rx_%d_coalesced_cqe_%d",
+					i,
+					j + 2);
 	}
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "tx_%d_packets", i);
-		ethtool_sprintf(&data, "tx_%d_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_xdp_xmit", i);
-		ethtool_sprintf(&data, "tx_%d_tso_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_long_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_short_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_csum_partial", i);
-		ethtool_sprintf(&data, "tx_%d_mana_map_err", i);
+		ethtool_sprintf(data, "tx_%d_packets", i);
+		ethtool_sprintf(data, "tx_%d_bytes", i);
+		ethtool_sprintf(data, "tx_%d_xdp_xmit", i);
+		ethtool_sprintf(data, "tx_%d_tso_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_bytes", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_bytes", i);
+		ethtool_sprintf(data, "tx_%d_long_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_short_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_csum_partial", i);
+		ethtool_sprintf(data, "tx_%d_mana_map_err", i);
+	}
+}
+
+static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		mana_get_strings_stats(apc, &data);
+		break;
+	default:
+		break;
 	}
 }
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v13 0/2] net: mana: add ethtool private flag for full-page RX buffers
From: Dipayaan Roy @ 2026-07-29  6:21 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, linux-hyperv, netdev, linux-kernel, linux-rdma,
	stephen, jacob.e.keller, dipayanroy, leitao, kees, john.fastabend,
	hawk, bpf, daniel, ast, sdf, yury.norov, pavan.chebbi,
	schakrabarti, gargaditya

On some ARM64 platforms with 4K PAGE_SIZE, utilizing page_pool
fragments for allocation in the RX refill path (~2kB buffer per
fragment) causes 15-20% throughput regression under high connection
counts (>16 TCP streams at 180+ Gbps). Using full-page buffers on
these platforms shows no regression and restores line-rate
performance.

This behavior is observed on a single platform; other platforms
perform better with page_pool fragments, indicating this is not a
page_pool issue but platform-specific.

This series adds an ethtool private flag "full-page-rx" to let the
user opt in to one RX buffer per page:

  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag can be persisted
via udev rule for affected platforms.

This series depends on the following fixes now merged in net-next:
  commit 17bfe0a8c014 ("net: mana: Add NULL guards in teardown path to prevent panic on attach failure")
  commit 5b05aa36ee24 ("net: mana: Skip redundant detach on already-detached port")

Changes in v13:
  - Dropped patches 2/4 (mana_detach dealloc bail-out fix) and 4/4
    (attach failure recovery). Both will be sent as separate series:
    the dealloc fix as a standalone fix, and the recovery mechanism
    after concluding pre-allocate-and-replace approach discussion
    with the HW team.
  - Removed schedule_port_reset logic from mana_set_priv_flags();
    on mana_attach() failure, the old priv_flags are restored and
    the error is returned without scheduling async recovery. 
Changes in v12:
  - Added patch 2 to ensure mana_detach() always completes its full
    teardown even if mana_dealloc_queues() fails, keeping port state
    consistent for recovery.
  - Added patch 4 to schedule queue_reset_work when mana_attach()
    fails during ethtool ring size or channel count changes, with
    fallback values that maximize recovery chances.
Changes in v11:
  - Rebased on net-next
Changes in v10:
  - Rebased on net-next which now includes the prerequisite fixes.
  - Recovery logic in mana_set_priv_flags() leverages the idempotent
    mana_detach() from the merged fixes.
Changes in v9:
  - Added correct tree.
Changes in v8:
  - Fixed queue_reset_work recovery by restoring port_is_up before
    scheduling reset so the handler can properly re-attach.
  - Simplified "err && schedule_port_reset" to "schedule_port_reset".
Changes in v7:
  - Rebased onto net-next.
  - Retained private flag approach after David Wei's testing on
    Grace (ARM64) confirmed that fragment mode outperforms
    full-page mode on other platforms, validating this is a
    single-platform workaround rather than a generic issue.
Changes in v6:
  - Added missed maintainers.
Changes in v5:
  - Split prep refactor into separate patch (patch 1/2)
Changes in v4:
  - Dropping the smbios string parsing and add ethtool priv flag
    to reconfigure the queues with full page rx buffers.
Changes in v3:
  - changed u8* to char*
Changes in v2:
  - separate reading string index and the string, remove inline.

Dipayaan Roy (2):
  net: mana: refactor mana_get_strings() and mana_get_sset_count() to
    use switch
  net: mana: force full-page RX buffers via ethtool private flag

 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 ++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 169 +++++++++++++++---
 include/net/mana/mana.h                       |   8 +
 3 files changed, 168 insertions(+), 31 deletions(-)

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH net-next v12 4/4] net: mana: recover port on attach failure in ethtool operations
From: Dipayaan Roy @ 2026-07-29  6:14 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, leon, longli, kotaranov, horms, shradhagupta, ssengar,
	ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov,
	pavan.chebbi, schakrabarti, gargaditya
In-Reply-To: <d2014044-d410-466a-ae66-2a173dbdba82@redhat.com>

On Thu, Jul 23, 2026 at 12:18:22PM +0200, Paolo Abeni wrote:
> On 7/11/26 6:10 AM, Dipayaan Roy wrote:
> > When mana_attach() fails during ethtool ring size or channel count
> > changes, the port is left in a broken state with no recovery
> > mechanism, requiring manual intervention to bring the port back up.
> > 
> > On VM SKUs without a netvsc fallback interface, this results in
> > complete loss of network connectivity to the VM.
> > 
> > Fix by scheduling queue_reset_work when mana_attach() fails. The
> > preceding patch ensures mana_detach() always completes its full
> > teardown (netif_device_detach + cleanup), so the reset handler's
> > mana_detach() takes the "already detached" early return, preserving
> > port_st_save for a successful mana_attach() recovery.
> > 
> > When mana_attach() fails, choose retry values that maximize recovery
> > chances: if the operation was an increase, fall back to the previous
> > working values; if it was a decrease but still above default, fall
> > back to defaults; otherwise use the minimum supported values.
> > 
> > Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> > Tested-by: Aditya Garg <gargaditya@linux.microsoft.com>
> > Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
> > sashiko gemini sees quite a few problems with the above approach:
> 
> https://sashiko.dev/#/patchset/20260711041415.3008868-1-dipayanroy%40linux.microsoft.com
> 
> Also, changing user-setting in case of fallback is quite a bad thing to
> be avoided.
> 
> Skimming over the code, it looks like pre-allocating all the
> resources and doing a replace instead of detach/attach() is feasible. I
> suggest investigating such a path. I also think that this last patch (and
> possibly 2/4) could/should be in a separate series.
> 
> Finally and most relevant thing; AI review comments should be explicitly
> acknowledged by either debating them or addressing them. In both cases a
> reply on the ML is expected.
> 
> /P

Hi Paolo,

Thank you for the review.

I accept the feedback on 4/4 and am working on the
pre-allocate-and-replace approach you suggested. We are discussing with
the HW/NIC team and investigating whether queue resources can be
pre-allocated from the HW while a set already exists, since with this patch
we are trying to handle and recover from failures caused by HW/NIC
resource exhaustion for queues.

I will drop 4/4 from this series in v13 and stick to only the ethtool
private flag changes. Will send out a separate series for the problem
currently discussed in 4/4 once all the above points are concluded.

For 2/4 I will send it as a fixes patch to net as it will be common for
all functions calling mana_detach.

Regards
Dipayaan Roy

^ permalink raw reply

* Re: [PATCH] Drivers: hv: Use meaningful errnos for hypercall status codes
From: Naman Jain @ 2026-07-29  5:18 UTC (permalink / raw)
  To: Easwar Hariharan, Hardik Garg
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel,
	ssengar
In-Reply-To: <bc819b23-49c5-43d1-9a69-d34dba8f4762@linux.microsoft.com>



On 7/28/2026 2:53 AM, Easwar Hariharan wrote:
> Hi Hardik,
> 
> On 7/27/2026 14:11, Hardik Garg wrote:
>> Commit 3817854ba892 ("hyperv: Log hypercall status codes as strings")
>> converted hv_result_to_errno() from a switch to a table and added status
>> codes used for string logging. Statuses without an existing specific
>> mapping were assigned the generic -EIO fallback even when a more specific
>> errno was available.
>>
>> Map HV_STATUS_ACCESS_DENIED and HV_STATUS_OPERATION_DENIED to -EACCES,
>> HV_STATUS_UNKNOWN_PROPERTY and HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE to
>> -EINVAL, and HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED to -EOPNOTSUPP.
>> This lets callers distinguish permission, argument, and capability
>> failures from generic I/O errors.
>>
>> The table conversion also added duplicate HV_STATUS_INVALID_LP_INDEX and
>> HV_STATUS_INVALID_REGISTER_VALUE entries. Remove the later -EIO entries,
>> which are unreachable because find_hv_status_info() returns the first
>> match.
>>
>> Signed-off-by: Hardik Garg <hargar@linux.microsoft.com>
>> ---
>>   drivers/hv/hv_common.c | 12 +++++-------
>>   1 file changed, 5 insertions(+), 7 deletions(-)
>>
> Glad to see that this was something that bothered you as well. If you like, you
> can pick up feedback from Nuno on my attempt to solve this [1] and do a de-facto v3.
> 
> [1] https://lore.kernel.org/all/479242d4-ae08-442c-b61b-c9408ba2e9b0@linux.microsoft.com/
> 
> Thanks,
> Easwar (he/him)

I see no harm but only benefits in returning a meaningful error code
corresponding to HV_STATUS_*, where we can, especially when such an error
code exists. So I agree with the motivation of this exercise.

On this patch or a new v3 of previous patch, feel free to pick my review:

Reviewed-by: Naman Jain <namjain@linux.microsoft.com>

Regards,
Naman

^ permalink raw reply

* RE: [EXTERNAL] Re: [PATCH net-next v2 0/7] net: mana: harden the HWC and add dynamic queue depth
From: Long Li @ 2026-07-28 23:58 UTC (permalink / raw)
  To: Paolo Abeni, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Eric Dumazet, Andrew Lunn, Jason Gunthorpe, Leon Romanovsky,
	Haiyang Zhang, KY Srinivasan, Wei Liu, Dexuan Cui,
	shradhagupta@linux.microsoft.com, Simon Horman,
	ernis@linux.microsoft.com, stephen@networkplumber.org
  Cc: netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <7108c005-6b8d-4dda-82cc-e665cbe3b6a4@redhat.com>



> -----Original Message-----
> From: Paolo Abeni <pabeni@redhat.com>
> Sent: Tuesday, July 28, 2026 3:53 AM
> To: Long Li <longli@microsoft.com>; Konstantin Taranov
> <kotaranov@microsoft.com>; Jakub Kicinski <kuba@kernel.org>; David S .
> Miller <davem@davemloft.net>; Eric Dumazet <edumazet@google.com>;
> Andrew Lunn <andrew+netdev@lunn.ch>; Jason Gunthorpe <jgg@ziepe.ca>;
> Leon Romanovsky <leon@kernel.org>; Haiyang Zhang
> <haiyangz@microsoft.com>; KY Srinivasan <kys@microsoft.com>; Wei Liu
> <wei.liu@kernel.org>; Dexuan Cui <DECUI@microsoft.com>;
> shradhagupta@linux.microsoft.com; Simon Horman <horms@kernel.org>;
> ernis@linux.microsoft.com; stephen@networkplumber.org
> Cc: netdev@vger.kernel.org; linux-rdma@vger.kernel.org; linux-
> hyperv@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: [EXTERNAL] Re: [PATCH net-next v2 0/7] net: mana: harden the HWC
> and add dynamic queue depth
> 
> On 7/22/26 1:43 AM, Long Li wrote:
> > This series hardens the MANA Hardware Channel (HWC) control-plane path
> > and then builds on that to support a dynamic HWC queue depth.
> >
> > The HWC is the command channel the driver uses to talk to the device.
> > Today it is created at a fixed depth of one outstanding request, and
> > several of its lookup and teardown paths predate the RCU and
> > DMA-lifetime rules they now need to follow.  Raising the queue depth
> > and allowing concurrent commands makes those latent races reachable,
> > so the fixes come first and the feature builds on them.
> >
> > Patches 1-5 are fixes for pre-existing HWC bugs, each with a Fixes: tag:
> >
> >   1: cq_table was a plain pointer array freed with no grace period while
> >      the EQ interrupt handler dereferenced it; put it under RCU.
> >   2: the HWC RQ and SQ were sized with each other's message size, so a
> >      response could overflow the RQ buffer and the RX slot stride was
> >      computed with the wrong size.
> >   3: comp_buf was freed before the EQ was destroyed, so a late completion
> >      handler could touch freed memory.
> >   4: the RX path consumed device-supplied lengths and indices without
> >      validation; validate them before use (this matters for confidential
> >      VMs, where the DMA buffer is shared with the host).
> >   5: a failed mana_hwc_establish_channel() could leave live MST entries
> >      while the driver freed the queue buffers, and destroy_channel() freed
> >      the TXQ/RXQ before the EQ was quiesced; add a setup_active teardown
> >      gate and destroy the CQ first.
> 
> Fixes should go via the net tree. Targeting net-next to avoid waiting for
> the merge to post the dependent features is not a valid reason.
> 
> Also both sashikos agree pcie_fpr() in patch 5 is not the correct function to
> be used there.
> 
> Also please note the expectations WRT LLMs feedback, commit c82ff94592fb.
> 
> /P

I'll post Patch 1-5 to net.

Thank you,
Long


^ permalink raw reply

* Re: [PATCH 1/3] PCI: hv: Set irq_retrigger callback for the Hyper-V PCI MSI irqchip
From: sashiko-bot @ 2026-07-28 14:04 UTC (permalink / raw)
  To: Naman Jain; +Cc: linux-pci, linux-hyperv
In-Reply-To: <20260728135117.2888175-2-namjain@linux.microsoft.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:

Pre-existing issues:
- [High] The fix for dropped pending interrupts during CPU hot-unplug was missed in the sister Hyper-V root partition MSI domain.
--

--- Patch [1]: [PATCH 1/3] PCI: hv: Set irq_retrigger callback for the Hyper-V PCI MSI irqchip ---
commit b8b1c843c457dd145f0d360b44f4b1bf34bd8cf5
Author: Naman Jain <namjain@linux.microsoft.com>

PCI: hv: Set irq_retrigger callback for the Hyper-V PCI MSI irqchip

The Hyper-V vPCI MSI irqchip never installs an irq_retrigger() callback.
On CPU hot-unplug fixup_irqs() migrates the interrupts which are affine to
the outgoing CPU to a new target. As the Hyper-V PCI/MSI chip does not
provide that callback, the pending interrupt is silently dropped, which
can result in lost interrupts, stalls and "No irq handler for vector"
messages.

Fixes: 4daace0d8ce85 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
Cc: stable@vger.kernel.org
Suggested-by: Long Li <longli@microsoft.com>
Suggested-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>

> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index cfc8fa403dad6..89816a2bd7cd3 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -2119,6 +2119,7 @@ static bool hv_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *dom
>  	info->ops->msi_prepare = hv_msi_prepare;
>  
>  	chip->irq_set_affinity = irq_chip_set_affinity_parent;
> +	chip->irq_retrigger = irq_chip_retrigger_hierarchy;

[Severity: High]
This isn't a bug introduced by this patch, but does the sister Hyper-V root
partition MSI domain in arch/x86/hyperv/irqdomain.c also need this fix?

Looking at hv_init_dev_msi_info(), it appears to be missing the exact same
callback initialization for the root partition's top-level irq_chip:

arch/x86/hyperv/irqdomain.c:hv_init_dev_msi_info() {
	...
	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
		return false;

	chip->flags |= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED;

	info->ops->msi_prepare = pci_msi_prepare;

	return true;
}

If a CPU is hot-unplugged while a device MSI interrupt is pending in its
local APIC IRR, could fixup_irqs() fail to resend the pending interrupt
to the new target CPU because this callback is NULL?

This could potentially lead to the same lost interrupts and I/O stalls 
described in this commit message, just in the root partition MSI domain
instead of the guest paravirtual MSI domain.

>  
>  	if (IS_ENABLED(CONFIG_X86))
>  		chip->flags |= IRQCHIP_MOVE_DEFERRED;
>

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260728135117.2888175-1-namjain@linux.microsoft.com?part=1

^ permalink raw reply

* Re: [PATCH 2/3] x86/irq: Use irq_chip_retrigger_hierarchy() in fixup_irqs()
From: sashiko-bot @ 2026-07-28 14:03 UTC (permalink / raw)
  To: Naman Jain; +Cc: linux-pci, linux-hyperv
In-Reply-To: <20260728135117.2888175-3-namjain@linux.microsoft.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] The patch replaces the direct invocation of the outermost chip's irq_retrigger() callback with irq_chip_retrigger_hierarchy(). However, irq_chip_retrigger_hierarchy() initializes its loop with `data = data->parent_data`, completely skipping the outermost chip. This breaks interrupt retriggering for flat domains (like native lapic_controller) and any hierarchical domain where the outermost chip directly implements the retrigger callback.
--

--- Patch [2]: [PATCH 2/3] x86/irq: Use irq_chip_retrigger_hierarchy() in fixup_irqs() ---
commit f97e883de2f6626561de48c426194caa41c3e062
Author: Naman Jain <namjain@linux.microsoft.com>
x86/irq: Use irq_chip_retrigger_hierarchy() in fixup_irqs()

This commit modifies fixup_irqs() to use irq_chip_retrigger_hierarchy()
instead of invoking the outermost domain chip's irq_retrigger() callback
directly. It also refactors the loop to use scoped guards for the irq
descriptor lock.

> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
> index 30122f0b3af96..ef1bdd3c4659a 100644
> --- a/arch/x86/kernel/irq.c
> +++ b/arch/x86/kernel/irq.c
[ ... ]
> @@ -489,22 +484,18 @@ void fixup_irqs(void)
[ ... ]
> -	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
> -		if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
> +	for (unsigned int vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
> +		struct irq_desc *desc = __this_cpu_read(vector_irq[vector]);
> +
> +		if (IS_ERR_OR_NULL(desc))
>  			continue;
>  
>  		if (is_vector_pending(vector)) {
> -			desc = __this_cpu_read(vector_irq[vector]);
> -
> -			raw_spin_lock(&desc->lock);
> -			data = irq_desc_get_irq_data(desc);
> -			chip = irq_data_get_irq_chip(data);
> -			if (chip->irq_retrigger) {
> -				chip->irq_retrigger(data);
> +			guard(raw_spinlock)(&desc->lock);
> +			if (irq_chip_retrigger_hierarchy(&desc->irq_data))
[Severity: High]
Will this skip the outermost chip and cause a regression for flat domains?

Looking at kernel/irq/chip.c:irq_chip_retrigger_hierarchy(), the loop
initializes by immediately moving to the parent_data:

    int irq_chip_retrigger_hierarchy(struct irq_data *data)
    {
        for (data = data->parent_data; data; data = data->parent_data)
            if (data->chip && data->chip->irq_retrigger)
                return data->chip->irq_retrigger(data);

        return 0;
    }

By passing &desc->irq_data directly to this function, it seems any retrigger
callback on the outermost chip itself will be bypassed.

Could this lead to dropped hardware interrupts during CPU hot-unplug for
flat domains (like native lapic_controller) or any hierarchical domain where
the outermost chip implements the retrigger callback?

>  				__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
> -			}
> -			raw_spin_unlock(&desc->lock);
>  		}

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260728135117.2888175-1-namjain@linux.microsoft.com?part=2

^ permalink raw reply

* Re: [PATCH 3/3] x86/irq, iommu/amd, PCI: Drop redundant irq_retrigger inits
From: sashiko-bot @ 2026-07-28 13:59 UTC (permalink / raw)
  To: Naman Jain; +Cc: linux-pci, linux-hyperv
In-Reply-To: <20260728135117.2888175-4-namjain@linux.microsoft.com>

> With fixup_irqs() now using irq_chip_retrigger_hierarchy() and the generic
> resend path (try_retrigger()) already falling back to it, installing
> irq_chip_retrigger_hierarchy() as the irq_retrigger() callback on the
> outermost domain chips is no longer required. The hierarchy walk reaches
> the x86 vector domain's apic_retrigger_irq() for all of these stacked
> domain chips.
> 
> Remove the now redundant irq_retrigger() initializations from the outermost
> domain interrupt chips.
> msi_set_affinity() is the only site that invoked the outermost chip's
> irq_retrigger() directly. Convert it to irq_chip_retrigger_hierarchy() so
> the x86_init_dev_msi_info() initialization can be dropped as well.
> 
> No functional change intended.
> 
> Suggested-by: Thomas Gleixner <tglx@kernel.org>
> Signed-off-by: Naman Jain <namjain@linux.microsoft.com>

Sashiko has reviewed this patch and found no issues. It looks great!

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260728135117.2888175-1-namjain@linux.microsoft.com?part=3


^ permalink raw reply

* [PATCH 3/3] x86/irq, iommu/amd, PCI: Drop redundant irq_retrigger inits
From: Naman Jain @ 2026-07-28 13:51 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring, Bjorn Helgaas,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Joerg Roedel (AMD), Suravee Suthikulpanit,
	Vasant Hegde, Will Deacon, Robin Murphy
  Cc: x86, Jake Oshins, linux-hyperv, linux-pci, iommu, linux-kernel
In-Reply-To: <20260728135117.2888175-1-namjain@linux.microsoft.com>

With fixup_irqs() now using irq_chip_retrigger_hierarchy() and the generic
resend path (try_retrigger()) already falling back to it, installing
irq_chip_retrigger_hierarchy() as the irq_retrigger() callback on the
outermost domain chips is no longer required. The hierarchy walk reaches
the x86 vector domain's apic_retrigger_irq() for all of these stacked
domain chips.

Remove the now redundant irq_retrigger() initializations from the outermost
domain interrupt chips.
msi_set_affinity() is the only site that invoked the outermost chip's
irq_retrigger() directly. Convert it to irq_chip_retrigger_hierarchy() so
the x86_init_dev_msi_info() initialization can be dropped as well.

No functional change intended.

Suggested-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
---
 arch/x86/kernel/apic/io_apic.c      | 2 --
 arch/x86/kernel/apic/msi.c          | 4 +---
 arch/x86/kernel/hpet.c              | 1 -
 drivers/iommu/amd/init.c            | 1 -
 drivers/pci/controller/pci-hyperv.c | 1 -
 5 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7d7175d012288..18f49a251f23b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1857,7 +1857,6 @@ static struct irq_chip ioapic_chip __read_mostly = {
 	.irq_ack		= irq_chip_ack_parent,
 	.irq_eoi		= ioapic_ack_level,
 	.irq_set_affinity	= ioapic_set_affinity,
-	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_get_irqchip_state	= ioapic_irq_get_chip_state,
 	.flags			= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
 				  IRQCHIP_AFFINITY_PRE_STARTUP,
@@ -1871,7 +1870,6 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
 	.irq_ack		= irq_chip_ack_parent,
 	.irq_eoi		= ioapic_ir_ack_level,
 	.irq_set_affinity	= ioapic_set_affinity,
-	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_get_irqchip_state	= ioapic_irq_get_chip_state,
 	.flags			= IRQCHIP_SKIP_SET_WAKE |
 				  IRQCHIP_AFFINITY_PRE_STARTUP,
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 66bc5d3e79db3..aad2f15eb2176 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -137,7 +137,7 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
 	 * IRR.
 	 */
 	if (lapic_vector_set_in_irr(cfg->vector))
-		irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+		irq_chip_retrigger_hierarchy(irqd);
 
 	return ret;
 }
@@ -246,7 +246,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
 	info->ops->msi_prepare		= x86_msi_prepare;
 
 	info->chip->irq_ack		= irq_chip_ack_parent;
-	info->chip->irq_retrigger	= irq_chip_retrigger_hierarchy;
 	info->chip->flags		|= IRQCHIP_SKIP_SET_WAKE |
 					   IRQCHIP_AFFINITY_PRE_STARTUP;
 
@@ -313,7 +312,6 @@ static struct irq_chip dmar_msi_controller = {
 	.irq_mask		= dmar_msi_mask,
 	.irq_ack		= irq_chip_ack_parent,
 	.irq_set_affinity	= msi_domain_set_affinity,
-	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_compose_msi_msg	= dmar_msi_compose_msg,
 	.irq_write_msi_msg	= dmar_msi_write_msg,
 	.flags			= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 8dc7b710e1250..2bb7a6f466af3 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -509,7 +509,6 @@ static struct irq_chip hpet_msi_controller __ro_after_init = {
 	.irq_mask = hpet_msi_mask,
 	.irq_ack = irq_chip_ack_parent,
 	.irq_set_affinity = msi_domain_set_affinity,
-	.irq_retrigger = irq_chip_retrigger_hierarchy,
 	.irq_write_msi_msg = hpet_msi_write_msg,
 	.flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP,
 };
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index ea3377e61aa5c..0c362e86a09d9 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -2468,7 +2468,6 @@ static struct irq_chip intcapxt_controller = {
 	.irq_unmask		= intcapxt_unmask_irq,
 	.irq_mask		= intcapxt_mask_irq,
 	.irq_ack		= irq_chip_ack_parent,
-	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_set_affinity       = intcapxt_set_affinity,
 	.irq_set_wake		= intcapxt_set_wake,
 	.flags			= IRQCHIP_MASK_ON_SUSPEND | IRQCHIP_MOVE_DEFERRED,
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 89816a2bd7cd3..cfc8fa403dad6 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2119,7 +2119,6 @@ static bool hv_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *dom
 	info->ops->msi_prepare = hv_msi_prepare;
 
 	chip->irq_set_affinity = irq_chip_set_affinity_parent;
-	chip->irq_retrigger = irq_chip_retrigger_hierarchy;
 
 	if (IS_ENABLED(CONFIG_X86))
 		chip->flags |= IRQCHIP_MOVE_DEFERRED;
-- 
2.43.0


^ permalink raw reply related

* [PATCH 2/3] x86/irq: Use irq_chip_retrigger_hierarchy() in fixup_irqs()
From: Naman Jain @ 2026-07-28 13:51 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring, Bjorn Helgaas,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Joerg Roedel (AMD), Suravee Suthikulpanit,
	Vasant Hegde, Will Deacon, Robin Murphy
  Cc: x86, Jake Oshins, linux-hyperv, linux-pci, iommu, linux-kernel
In-Reply-To: <20260728135117.2888175-1-namjain@linux.microsoft.com>

fixup_irqs() re-injects a pending interrupt on its new target CPU by
looking at the outermost domain chip and invoking its irq_retrigger()
callback directly. That only works when the outermost chip happens to
install an irq_retrigger() callback, which is not guaranteed for every
irqchip and could lead to lost interrupts on CPU hot-unplug.

Use irq_chip_retrigger_hierarchy() instead, which walks up the interrupt
hierarchy until it finds a chip that implements irq_retrigger().

While at it, move the loop-local variables into the loop scope and use a
scoped guard for desc->lock.

No functional change intended for chips which already provide an
irq_retrigger() callback on the outermost domain.

Suggested-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
---
 arch/x86/kernel/irq.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 30122f0b3af96..ef1bdd3c4659a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -466,11 +466,6 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
 /* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
 void fixup_irqs(void)
 {
-	unsigned int vector;
-	struct irq_desc *desc;
-	struct irq_data *data;
-	struct irq_chip *chip;
-
 	irq_migrate_all_off_this_cpu();
 
 	/*
@@ -489,22 +484,18 @@ void fixup_irqs(void)
 	 * vector_lock because the cpu is already marked !online, so
 	 * nothing else will touch it.
 	 */
-	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
+	for (unsigned int vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+		struct irq_desc *desc = __this_cpu_read(vector_irq[vector]);
+
+		if (IS_ERR_OR_NULL(desc))
 			continue;
 
 		if (is_vector_pending(vector)) {
-			desc = __this_cpu_read(vector_irq[vector]);
-
-			raw_spin_lock(&desc->lock);
-			data = irq_desc_get_irq_data(desc);
-			chip = irq_data_get_irq_chip(data);
-			if (chip->irq_retrigger) {
-				chip->irq_retrigger(data);
+			guard(raw_spinlock)(&desc->lock);
+			if (irq_chip_retrigger_hierarchy(&desc->irq_data))
 				__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
-			}
-			raw_spin_unlock(&desc->lock);
 		}
+
 		if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
 			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 	}
-- 
2.43.0


^ permalink raw reply related

* [PATCH 1/3] PCI: hv: Set irq_retrigger callback for the Hyper-V PCI MSI irqchip
From: Naman Jain @ 2026-07-28 13:51 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring, Bjorn Helgaas,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Joerg Roedel (AMD), Suravee Suthikulpanit,
	Vasant Hegde, Will Deacon, Robin Murphy
  Cc: x86, Jake Oshins, linux-hyperv, linux-pci, iommu, linux-kernel
In-Reply-To: <20260728135117.2888175-1-namjain@linux.microsoft.com>

The Hyper-V vPCI MSI irqchip never installs an irq_retrigger() callback.

On CPU hot-unplug fixup_irqs() migrates the interrupts which are affine to
the outgoing CPU to a new target. If an interrupt still has its pending bit
set in the outgoing CPU's IRR at that point, fixup_irqs() resends it on the
new target through the irqchip's irq_retrigger() callback. As the Hyper-V
PCI/MSI chip does not provide that callback, the pending interrupt is
silently dropped, which can result in lost interrupts, stalls and "No irq
handler for vector" messages during CPU hotplug.

Install irq_chip_retrigger_hierarchy() as the irq_retrigger() callback for
the Hyper-V PCI/MSI irqchip, so that a pending interrupt is resent on its
new target CPU via the parent x86 vector domain.

Fixes: 4daace0d8ce85 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
Cc: stable@vger.kernel.org
Suggested-by: Long Li <longli@microsoft.com>
Suggested-by: Thomas Gleixner <tglx@kernel.org>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
---
 drivers/pci/controller/pci-hyperv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index cfc8fa403dad6..89816a2bd7cd3 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -2119,6 +2119,7 @@ static bool hv_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *dom
 	info->ops->msi_prepare = hv_msi_prepare;

 	chip->irq_set_affinity = irq_chip_set_affinity_parent;
+	chip->irq_retrigger = irq_chip_retrigger_hierarchy;

 	if (IS_ENABLED(CONFIG_X86))
 		chip->flags |= IRQCHIP_MOVE_DEFERRED;
-- 
2.43.0

^ permalink raw reply related

* [PATCH 0/3] x86/irq: Fix lost interrupts on CPU hot-unplug for Hyper-V PCI/MSI
From: Naman Jain @ 2026-07-28 13:51 UTC (permalink / raw)
  To: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring, Bjorn Helgaas,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Joerg Roedel (AMD), Suravee Suthikulpanit,
	Vasant Hegde, Will Deacon, Robin Murphy
  Cc: x86, Jake Oshins, linux-hyperv, linux-pci, iommu, linux-kernel

On CPU hot-unplug, fixup_irqs() re-injects interrupts that are still
pending in the outgoing CPU's IRR via the irqchip's irq_retrigger()
callback. The Hyper-V vPCI MSI irqchip never installs one, so such a
pending interrupt is silently dropped, leading to lost interrupts, stalls
and "No irq handler for vector" messages.

Patch 1 is the minimal, backportable fix (Fixes:/Cc: stable) that installs
irq_chip_retrigger_hierarchy() on the Hyper-V PCI/MSI chip. Patch 2 then
switches fixup_irqs() to call irq_chip_retrigger_hierarchy() directly, so
the retrigger works for any stacked irqchip without a per-chip callback.
Patch 3 drops the now-redundant irq_retrigger() initializations.
This way, patches 2 and 3 do not need to be backported.

The series follows the original RFC:
Link: https://lore.kernel.org/all/87y0f3jhd6.ffs@fw13/

Naman Jain (3):
  PCI: hv: Set irq_retrigger callback for the Hyper-V PCI MSI irqchip
  x86/irq: Use irq_chip_retrigger_hierarchy() in fixup_irqs()
  x86/irq, iommu/amd, PCI: Drop redundant irq_retrigger inits

 arch/x86/kernel/apic/io_apic.c |  2 --
 arch/x86/kernel/apic/msi.c     |  4 +---
 arch/x86/kernel/hpet.c         |  1 -
 arch/x86/kernel/irq.c          | 23 +++++++----------------
 drivers/iommu/amd/init.c       |  1 -
 5 files changed, 8 insertions(+), 23 deletions(-)

base-commit: 0d33d21e47d9dc66f91e44da3fc9220c74d93df7
-- 
2.43.0

^ permalink raw reply

* Re: [PATCH net v2] net: mana: Return error code from mana_create_rxq()
From: sashiko-bot @ 2026-07-28 11:38 UTC (permalink / raw)
  To: Aditya Garg; +Cc: linux-hyperv
In-Reply-To: <20260727113759.2881500-1-gargaditya@linux.microsoft.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:

Pre-existing issues:
- [High] TOCTOU race and Use-After-Free in mana_get_stats64() and mana_xdp_xmit() due to lockless access of queues.
--

--- Patch [1]: [PATCH net v2] net: mana: Return error code from mana_create_rxq() ---
commit 61de155c59ba6f86af33fcadf7943152726f385f
Author: Aditya Garg <gargaditya@linux.microsoft.com>

net: mana: Return error code from mana_create_rxq()

mana_create_rxq() returns a struct mana_rxq pointer and returns NULL on
any failure. The caller, mana_add_rx_queues(), cannot tell what went
wrong and hardcodes the error as -ENOMEM. As a result the actual failure
reported by the lower layers is masked.

Link: https://lore.kernel.org/all/20260718024818.560552-1-gargaditya@linux.microsoft.com/

> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 9d9bfd116dabc..92bb55935c1c4 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c

[ ... ]

> @@ -2964,8 +2964,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc,
>  
>  	for (i = 0; i < apc->num_queues; i++) {
>  		rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev);
> -		if (!rxq) {
> -			err = -ENOMEM;
> +		if (IS_ERR(rxq)) {
> +			err = PTR_ERR(rxq);
>  			netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err);
>  			goto out;
>  		}

[Severity: High]
This is a pre-existing issue, but is there a TOCTOU race and Use-After-Free
in mana_get_stats64() and mana_xdp_xmit() due to lockless access of queues?

If an administrator changes the MTU or closes the interface, mana_detach()
is triggered. It locklessly clears apc->port_is_up, disables TX, and then
frees the RX and TX queues using kvfree() directly via mana_dealloc_queues(),
without an RCU grace period:

mana_detach()
	apc->port_is_up = false;

	/* Ensure port state updated before txq state */
	smp_wmb();

	netif_tx_disable(ndev);

	if (apc->port_st_save) {
		err = mana_dealloc_queues(ndev);

Concurrently, can unprivileged userspace reading /proc/net/dev (triggering
mana_get_stats64()) check apc->port_is_up, evaluate it as true, and then
access the freed queues?

mana_get_stats64()
	if (!apc->port_is_up)
		return;
...
	for (q = 0; q < num_queues; q++) {
		rx_stats = &apc->rxqs[q]->stats;

Similarly, in mana_xdp_xmit(), the TX lock is temporarily acquired and
dropped for each packet in the loop, allowing mana_detach() to complete
netif_tx_disable() and proceed with freeing the queue before
mana_xdp_xmit() updates the stats:

mana_xdp_xmit()
	for (i = 0; i < n; i++) {
		if (mana_xdp_xmit_fm(ndev, frames[i], q_idx))
			break;

		count++;
	}

	tx_stats = &apc->tx_qp[q_idx]->txq.stats;

	u64_stats_update_begin(&tx_stats->syncp);

Can this lead to a Use-After-Free where memory corruption or a kernel
panic occurs?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260727113759.2881500-1-gargaditya@linux.microsoft.com?part=1

^ permalink raw reply

* Re: [PATCH net-next v2 0/7] net: mana: harden the HWC and add dynamic queue depth
From: Paolo Abeni @ 2026-07-28 10:53 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Eric Dumazet, Andrew Lunn, Jason Gunthorpe, Leon Romanovsky,
	Haiyang Zhang, K . Y . Srinivasan, Wei Liu, Dexuan Cui,
	shradhagupta, Simon Horman, ernis, stephen
  Cc: netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260721234339.1476932-1-longli@microsoft.com>

On 7/22/26 1:43 AM, Long Li wrote:
> This series hardens the MANA Hardware Channel (HWC) control-plane path
> and then builds on that to support a dynamic HWC queue depth.
> 
> The HWC is the command channel the driver uses to talk to the device.
> Today it is created at a fixed depth of one outstanding request, and
> several of its lookup and teardown paths predate the RCU and DMA-lifetime
> rules they now need to follow.  Raising the queue depth and allowing
> concurrent commands makes those latent races reachable, so the fixes come
> first and the feature builds on them.
> 
> Patches 1-5 are fixes for pre-existing HWC bugs, each with a Fixes: tag:
> 
>   1: cq_table was a plain pointer array freed with no grace period while
>      the EQ interrupt handler dereferenced it; put it under RCU.
>   2: the HWC RQ and SQ were sized with each other's message size, so a
>      response could overflow the RQ buffer and the RX slot stride was
>      computed with the wrong size.
>   3: comp_buf was freed before the EQ was destroyed, so a late completion
>      handler could touch freed memory.
>   4: the RX path consumed device-supplied lengths and indices without
>      validation; validate them before use (this matters for confidential
>      VMs, where the DMA buffer is shared with the host).
>   5: a failed mana_hwc_establish_channel() could leave live MST entries
>      while the driver freed the queue buffers, and destroy_channel() freed
>      the TXQ/RXQ before the EQ was quiesced; add a setup_active teardown
>      gate and destroy the CQ first.

Fixes should go via the net tree. Targeting net-next to avoid waiting
for the merge before posting the dependent features is not a valid reason.

Also both sashikos agree pcie_fpr() in patch 5 is not the correct
function to be used there.

Also please note the expectations WRT LLMs feedback, commit c82ff94592fb.

/P


^ permalink raw reply

* Re: [PATCH V4 5/9] mshv: Import data structs around device passthru from hyperv headers
From: Mukesh R @ 2026-07-28  0:08 UTC (permalink / raw)
  To: sashiko-reviews; +Cc: linux-hyperv, linux-pci
In-Reply-To: <20260718023047.E8EC51F000E9@smtp.kernel.org>

On 7/17/26 19:30, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [High] The memory layout of `union hv_device_domain_id` is inverted compared to the expected Hyper-V ABI, resulting in corrupted device domain IDs being passed to the hypervisor.
> --
> 
> --- Patch [5]: [PATCH V4 5/9] mshv: Import data structs around device passthru from hyperv headers ---
> commit 02ed2cd292e85f0b6e754b6a29df473f95394315
> Author: Mukesh R <mrathor@linux.microsoft.com>
> 
> mshv: Import data structs around device passthru from hyperv headers
> 
> This commit copies and imports definitions and declarations from Hyper-V
> public headers. These additions are related to creating domains in the
> hypervisor and attaching devices to them.
> 
>> diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
>> index b4cb2fa26e9bc..90497635c3cbd 100644
>> --- a/include/hyperv/hvhdk_mini.h
>> +++ b/include/hyperv/hvhdk_mini.h
> 
> [ ... ]
> 
>> +/* 3 domain types: stage 1, stage 2, and SOC */
>> +#define HV_DEVICE_DOMAIN_TYPE_S2  0 /* HV_DEVICE_DOMAIN_ID_TYPE_S2 */
>> +#define HV_DEVICE_DOMAIN_TYPE_S1  1 /* HV_DEVICE_DOMAIN_ID_TYPE_S1 */
>> +#define HV_DEVICE_DOMAIN_TYPE_SOC 2 /* HV_DEVICE_DOMAIN_ID_TYPE_SOC */
>> +
>> +/* ID for stage 2 default domain and NULL domain */
>> +#define HV_DEVICE_DOMAIN_ID_S2_DEFAULT 0
>> +#define HV_DEVICE_DOMAIN_ID_S2_NULL    0xFFFFFFFFULL
>> +
>> +union hv_device_domain_id {
>> +	u64 as_uint64;
>> +	struct {
>> +		u32 type : 4;
>> +		u32 reserved : 28;
>> +		u32 id;
>> +	};
>> +} __packed;
> 
> [Severity: High]
> Is the memory layout of union hv_device_domain_id inverted compared to the
> expected Hyper-V ABI?

No.

> The macro HV_DEVICE_DOMAIN_ID_S2_NULL is defined as 0xFFFFFFFFULL, which
> places 0xFFFFFFFF in the lower 32 bits and 0 in the upper 32 bits. Given
> that Stage 2 (S2) domains have a type of 0, this macro implies that the id
> should be in the lower 32 bits and the type should be in the upper 32 bits.

HV_DEVICE_DOMAIN_ID_S2_NULL is never assigned to any data struct of
type "union hv_device_domain_id".


> However, union hv_device_domain_id defines type as the first bitfield
> (occupying the lower 32 bits) and id as the second (occupying the upper 32
> bits).
> 
> Under this layout, casting 0xFFFFFFFFULL evaluates to type = 15 and id = 0,
> instead of the expected type = 0 and id = 0xFFFFFFFF. Could this cause
> malformed domain IDs to be sent to the hypervisor?
> 


^ permalink raw reply

* Re: [PATCH] Drivers: hv: Use meaningful errnos for hypercall status codes
From: Easwar Hariharan @ 2026-07-27 21:23 UTC (permalink / raw)
  To: Hardik Garg
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv,
	easwar.hariharan, linux-kernel, ssengar, namjain
In-Reply-To: <20260727211101.2199630-1-hargar@linux.microsoft.com>

Hi Hardik,

On 7/27/2026 14:11, Hardik Garg wrote:
> Commit 3817854ba892 ("hyperv: Log hypercall status codes as strings")
> converted hv_result_to_errno() from a switch to a table and added status
> codes used for string logging. Statuses without an existing specific
> mapping were assigned the generic -EIO fallback even when a more specific
> errno was available.
> 
> Map HV_STATUS_ACCESS_DENIED and HV_STATUS_OPERATION_DENIED to -EACCES,
> HV_STATUS_UNKNOWN_PROPERTY and HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE to
> -EINVAL, and HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED to -EOPNOTSUPP.
> This lets callers distinguish permission, argument, and capability
> failures from generic I/O errors.
> 
> The table conversion also added duplicate HV_STATUS_INVALID_LP_INDEX and
> HV_STATUS_INVALID_REGISTER_VALUE entries. Remove the later -EIO entries,
> which are unreachable because find_hv_status_info() returns the first
> match.
> 
> Signed-off-by: Hardik Garg <hargar@linux.microsoft.com>
> ---
>  drivers/hv/hv_common.c | 12 +++++-------
>  1 file changed, 5 insertions(+), 7 deletions(-)
> 
Glad to see that this was something that bothered you as well. If you like, you
can pick up feedback from Nuno on my attempt to solve this [1] and do a de-facto v3.

[1] https://lore.kernel.org/all/479242d4-ae08-442c-b61b-c9408ba2e9b0@linux.microsoft.com/

Thanks,
Easwar (he/him)

^ permalink raw reply

* [PATCH] Drivers: hv: Use meaningful errnos for hypercall status codes
From: Hardik Garg @ 2026-07-27 21:11 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv
  Cc: linux-kernel, ssengar, namjain

Commit 3817854ba892 ("hyperv: Log hypercall status codes as strings")
converted hv_result_to_errno() from a switch to a table and added status
codes used for string logging. Statuses without an existing specific
mapping were assigned the generic -EIO fallback even when a more specific
errno was available.

Map HV_STATUS_ACCESS_DENIED and HV_STATUS_OPERATION_DENIED to -EACCES,
HV_STATUS_UNKNOWN_PROPERTY and HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE to
-EINVAL, and HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED to -EOPNOTSUPP.
This lets callers distinguish permission, argument, and capability
failures from generic I/O errors.

The table conversion also added duplicate HV_STATUS_INVALID_LP_INDEX and
HV_STATUS_INVALID_REGISTER_VALUE entries. Remove the later -EIO entries,
which are unreachable because find_hv_status_info() returns the first
match.

Signed-off-by: Hardik Garg <hargar@linux.microsoft.com>
---
 drivers/hv/hv_common.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 6b67ac6167891..31256cb22b39e 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -787,11 +787,11 @@ static const struct hv_status_info hv_status_infos[] = {
 	_STATUS_INFO(HV_STATUS_INVALID_HYPERCALL_INPUT,		-EINVAL),
 	_STATUS_INFO(HV_STATUS_INVALID_ALIGNMENT,		-EIO),
 	_STATUS_INFO(HV_STATUS_INVALID_PARAMETER,		-EINVAL),
-	_STATUS_INFO(HV_STATUS_ACCESS_DENIED,			-EIO),
+	_STATUS_INFO(HV_STATUS_ACCESS_DENIED,			-EACCES),
 	_STATUS_INFO(HV_STATUS_INVALID_PARTITION_STATE,		-EIO),
-	_STATUS_INFO(HV_STATUS_OPERATION_DENIED,		-EIO),
-	_STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY,		-EIO),
-	_STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE,	-EIO),
+	_STATUS_INFO(HV_STATUS_OPERATION_DENIED,		-EACCES),
+	_STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY,		-EINVAL),
+	_STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE,	-EINVAL),
 	_STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY,		-ENOMEM),
 	_STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY,	-ENOMEM),
 	_STATUS_INFO(HV_STATUS_INSUFFICIENT_ROOT_MEMORY,	-ENOMEM),
@@ -805,11 +805,9 @@ static const struct hv_status_info hv_status_infos[] = {
 	_STATUS_INFO(HV_STATUS_NOT_ACKNOWLEDGED,		-EIO),
 	_STATUS_INFO(HV_STATUS_INVALID_VP_STATE,		-EIO),
 	_STATUS_INFO(HV_STATUS_NO_RESOURCES,			-EIO),
-	_STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED,	-EIO),
+	_STATUS_INFO(HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED,	-EOPNOTSUPP),
 	_STATUS_INFO(HV_STATUS_INVALID_LP_INDEX,		-EINVAL),
 	_STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE,		-EINVAL),
-	_STATUS_INFO(HV_STATUS_INVALID_LP_INDEX,		-EIO),
-	_STATUS_INFO(HV_STATUS_INVALID_REGISTER_VALUE,		-EIO),
 	_STATUS_INFO(HV_STATUS_OPERATION_FAILED,		-EIO),
 	_STATUS_INFO(HV_STATUS_TIME_OUT,			-EIO),
 	_STATUS_INFO(HV_STATUS_CALL_PENDING,			-EIO),
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net v2] net: mana: Return error code from mana_create_rxq()
From: Joe Damato @ 2026-07-27 17:36 UTC (permalink / raw)
  To: Aditya Garg
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ernis, horms, dipayanroy, shacharr,
	stephen, linux-hyperv, netdev, linux-kernel, ssengar, gargaditya
In-Reply-To: <20260727113759.2881500-1-gargaditya@linux.microsoft.com>

On Mon, Jul 27, 2026 at 04:37:59AM -0700, Aditya Garg wrote:
> mana_create_rxq() returns a struct mana_rxq pointer and returns NULL on
> any failure. The caller, mana_add_rx_queues(), cannot tell what went
> wrong and hardcodes the error as -ENOMEM. As a result the actual failure
> reported by the lower layers (for example -EPROTO from a failed HW
> request) is masked and every RX queue creation failure looks like an
> out-of-memory error.
> 
> Return an ERR_PTR() encoded error code from mana_create_rxq() on failure
> instead of NULL. The caller now propagates the returned error code
> directly instead of substituting -ENOMEM.
> 
> Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
> Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
> ---
> Changes in v2:
>  - Use ERR_PTR() to return the error from mana_create_rxq() instead of
>    adding an output parameter, keeping the pointer return type.
> 
> v1: https://lore.kernel.org/all/20260718024818.560552-1-gargaditya@linux.microsoft.com/
> 
>  drivers/net/ethernet/microsoft/mana/mana_en.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 

Reviewed-by: Joe Damato <joe@dama.to>

^ permalink raw reply

* Re: [PATCH V2 1/1] x86/VMBus: Confidential VMBus for dynamic DMA transfers
From: sashiko-bot @ 2026-07-27 15:39 UTC (permalink / raw)
  To: Tianyu Lan; +Cc: linux-hyperv, linux-scsi
In-Reply-To: <20260727152711.2045-1-tiala@microsoft.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:

Pre-existing issues:
- [High] If storvsc_do_io() returns an error other than -EAGAIN, storvsc_queuecommand() incorrectly returns 0 to the SCSI midlayer, leaking the dynamically allocated payload and causing a command timeout/hang.
--

--- Patch [1]: [PATCH V2 1/1] x86/VMBus: Confidential VMBus for dynamic DMA transfers ---
commit 0fcd974d529c3a7f6e32e98d08ecd19b7a1e7b0a
Author: Tianyu Lan <ltykernel@gmail.com>

x86/VMBus: Confidential VMBus for dynamic DMA transfers

This commit modifies the VMBus driver to support dynamic DMA transfers
for Confidential VMs. It introduces the co_external_memory flag to bypass
the swiotlb and DMA API, allowing direct DMA to private encrypted memory.

> diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
> index 571ea549152b1..656fe73e0d35e 100644
> --- a/drivers/scsi/storvsc_drv.c
> +++ b/drivers/scsi/storvsc_drv.c

[ ... ]

> @@ -1870,7 +1882,7 @@ static enum scsi_qc_status storvsc_queuecommand(struct Scsi_Host *host,
>  	ret = storvsc_do_io(dev, cmd_request, smp_processor_id());
>  	migrate_enable();
>  
> -	if (ret)
> +	if (ret && (!dev->channel->co_external_memory))
>  		scsi_dma_unmap(scmnd);
>  
>  	if (ret == -EAGAIN) {

[Severity: High]
This is a pre-existing issue, but if storvsc_do_io() returns an error
other than -EAGAIN, does storvsc_queuecommand() leak the payload buffer
and silently drop the error?

If storvsc_do_io() returns a failure like -ENODEV, the code bypasses the
err_free_payload block and returns 0 to the SCSI midlayer:

storvsc_queuecommand() {
    ...
    ret = storvsc_do_io(dev, cmd_request, smp_processor_id());
    ...
    if (ret == -EAGAIN) {
        ret = SCSI_MLQUEUE_DEVICE_BUSY;
        goto err_free_payload;
    }

    return 0;

err_free_payload:
    if (payload_sz > sizeof(cmd_request->mpb))
        kfree(payload);

    return ret;
}

Since 0 is returned, the midlayer believes the command queued successfully,
but because it was never sent to the host, scsi_done() is never called.
Will this cause the command to linger until a SCSI command timeout occurs
while also permanently leaking the dynamically allocated payload?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260727152711.2045-1-tiala@microsoft.com?part=1

^ permalink raw reply

* [PATCH V2 1/1] x86/VMBus: Confidential VMBus for dynamic DMA transfers
From: Tianyu Lan @ 2026-07-27 15:27 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, James.Bottomley,
	martin.petersen
  Cc: Tianyu Lan, linux-hyperv, linux-kernel, linux-scsi, hch,
	robin.murphy, vdso, mhklinux

Hyper-V provides Confidential VMBus to communicate between
the device model and the guest device driver via
encrypted/private memory in a Confidential VM. The device
model lives in OpenHCL
(https://openvmm.dev/guide/user_guide/openhcl.html), which
plays the paravisor role.

For a VMBus device, there are two communication methods to
talk with Host/Hypervisor. 1) VMBUS Ring buffer 2) Dynamic
DMA transfer.

The Confidential VMBus ring buffer has been upstreamed by
Roman Kisel (commit 6802d8af47d1).

The dynamic DMA transfer of a VMBus device normally goes
through the DMA core, which uses SWIOTLB as a bounce buffer
in a CoCo VM.

A Confidential VMBus device can DMA directly to
private/encrypted memory. Because the swiotlb is decrypted
memory, the DMA transfer must not be bounced through the
swiotlb, so as to preserve confidentiality. This differs
from the default for Linux CoCo VMs, so do not use the DMA
(SWIOTLB) API in the VMBus driver when the confidential
dynamic DMA transfer capability is present.

Signed-off-by: Tianyu Lan <tiala@microsoft.com>
---
Change since v1:
	* Fix code style issue
	* Use co_external_memory flag in struct vmbus_channel
	   directly instead of adding new flag in the struct
	   hv_device.
---
 drivers/scsi/storvsc_drv.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c
index 6977ca8a0658..a3b22cba3421 100644
--- a/drivers/scsi/storvsc_drv.c
+++ b/drivers/scsi/storvsc_drv.c
@@ -1326,7 +1326,8 @@ static void storvsc_on_channel_callback(void *context)
 					continue;
 				}
 				request = (struct storvsc_cmd_request *)scsi_cmd_priv(scmnd);
-				scsi_dma_unmap(scmnd);
+				if (!device->channel->co_external_memory)
+					scsi_dma_unmap(scmnd);
 			}
 
 			storvsc_on_receive(stor_device, packet, request);
@@ -1815,7 +1816,7 @@ static enum scsi_qc_status storvsc_queuecommand(struct Scsi_Host *host,
 		unsigned long offset_in_hvpg = offset_in_hvpage(sgl->offset);
 		unsigned int hvpg_count = HVPFN_UP(offset_in_hvpg + length);
 		struct scatterlist *sg;
-		unsigned long hvpfn, hvpfns_to_add;
+		unsigned long hvpfn, hvpfns_to_add, hvpgoff;
 		int j, i = 0, sg_count;
 
 		payload_sz = (hvpg_count * sizeof(u64) +
@@ -1831,10 +1832,14 @@ static enum scsi_qc_status storvsc_queuecommand(struct Scsi_Host *host,
 		payload->range.len = length;
 		payload->range.offset = offset_in_hvpg;
 
-		sg_count = scsi_dma_map(scmnd);
-		if (sg_count < 0) {
-			ret = SCSI_MLQUEUE_DEVICE_BUSY;
-			goto err_free_payload;
+		if (dev->channel->co_external_memory) {
+			sg_count = scsi_sg_count(scmnd);
+		} else {
+			sg_count = scsi_dma_map(scmnd);
+			if (sg_count < 0) {
+				ret = SCSI_MLQUEUE_DEVICE_BUSY;
+				goto err_free_payload;
+			}
 		}
 
 		for_each_sg(sgl, sg, sg_count, j) {
@@ -1846,9 +1851,16 @@ static enum scsi_qc_status storvsc_queuecommand(struct Scsi_Host *host,
 			 * Such offsets are handled even on other than the first
 			 * sgl entry, provided they are a multiple of PAGE_SIZE.
 			 */
-			hvpfn = HVPFN_DOWN(sg_dma_address(sg));
-			hvpfns_to_add = HVPFN_UP(sg_dma_address(sg) +
-						 sg_dma_len(sg)) - hvpfn;
+			if (dev->channel->co_external_memory) {
+				hvpgoff = HVPFN_DOWN(sg->offset);
+				hvpfn = page_to_hvpfn(sg_page(sg)) + hvpgoff;
+				hvpfns_to_add =	HVPFN_UP(sg->offset + sg->length) -
+							hvpgoff;
+			} else {
+				hvpfn = HVPFN_DOWN(sg_dma_address(sg));
+				hvpfns_to_add = HVPFN_UP(sg_dma_address(sg) +
+							 sg_dma_len(sg)) - hvpfn;
+			}
 
 			/*
 			 * Fill the next portion of the PFN array with
@@ -1870,7 +1882,7 @@ static enum scsi_qc_status storvsc_queuecommand(struct Scsi_Host *host,
 	ret = storvsc_do_io(dev, cmd_request, smp_processor_id());
 	migrate_enable();
 
-	if (ret)
+	if (ret && (!dev->channel->co_external_memory))
 		scsi_dma_unmap(scmnd);
 
 	if (ret == -EAGAIN) {
-- 
2.50.1


^ permalink raw reply related

* [PATCH net v2] net: mana: Return error code from mana_create_rxq()
From: Aditya Garg @ 2026-07-27 11:37 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, ernis, horms, dipayanroy, gargaditya,
	shacharr, stephen, linux-hyperv, netdev, linux-kernel, ssengar,
	gargaditya

mana_create_rxq() returns a struct mana_rxq pointer and returns NULL on
any failure. The caller, mana_add_rx_queues(), cannot tell what went
wrong and hardcodes the error as -ENOMEM. As a result the actual failure
reported by the lower layers (for example -EPROTO from a failed HW
request) is masked and every RX queue creation failure looks like an
out-of-memory error.

Return an ERR_PTR() encoded error code from mana_create_rxq() on failure
instead of NULL. The caller now propagates the returned error code
directly instead of substituting -ENOMEM.

Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
---
Changes in v2:
 - Use ERR_PTR() to return the error from mana_create_rxq() instead of
   adding an output parameter, keeping the pointer return type.

v1: https://lore.kernel.org/all/20260718024818.560552-1-gargaditya@linux.microsoft.com/

 drivers/net/ethernet/microsoft/mana/mana_en.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 9d9bfd116dab..92bb55935c1c 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2829,7 +2829,7 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 
 	rxq = kvzalloc_flex(*rxq, rx_oobs, apc->rx_queue_size);
 	if (!rxq)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	rxq->ndev = ndev;
 	rxq->num_rx_buf = apc->rx_queue_size;
@@ -2930,7 +2930,7 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
 
 	mana_destroy_rxq(apc, rxq, false);
 
-	return NULL;
+	return ERR_PTR(err);
 }
 
 static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx)
@@ -2964,8 +2964,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc,
 
 	for (i = 0; i < apc->num_queues; i++) {
 		rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev);
-		if (!rxq) {
-			err = -ENOMEM;
+		if (IS_ERR(rxq)) {
+			err = PTR_ERR(rxq);
 			netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err);
 			goto out;
 		}
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net] net: mana: Return error code from mana_create_rxq()
From: Aditya Garg @ 2026-07-27 11:24 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, pabeni, ernis, horms, dipayanroy, shacharr, stephen,
	linux-hyperv, netdev, linux-kernel, ssengar, gargaditya
In-Reply-To: <20260723090621.6261a3e8@kernel.org>

On 23-07-2026 21:36, Jakub Kicinski wrote:
> On Fri, 17 Jul 2026 19:48:18 -0700 Aditya Garg wrote:
>> Change mana_create_rxq() to return an int and pass the created rxq back
>> to the caller through an output parameter. The caller now propagates the
>> returned error code directly instead of substituting -ENOMEM.
> 
> We generally prefer the use of ERR_PTR to output arguments in
> the kernel. Looks like it'd work here nicely.

Thanks for the review Jakub,
I'll post a v2 using ERR_PTR/PTR_ERR

Regards,
Aditya

^ permalink raw reply

* Re: [PATCH V4 1/9] mshv: Provide a way to get partition ID if running in a VMM process
From: Mukesh R @ 2026-07-25  1:48 UTC (permalink / raw)
  To: Jacob Pan
  Cc: hpa, robin.murphy, robh, wei.liu, mhklinux, muislam, namjain,
	magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch, kys, haiyangz, decui, longli, tglx, mingo,
	bp, dave.hansen, x86, joro, will, lpieralisi, kwilczynski,
	bhelgaas, arnd
In-Reply-To: <20260724150824.000075a7@linux.microsoft.com>

On 7/24/26 15:08, Jacob Pan wrote:
> Hi Mukesh,
> 
> On Fri, 24 Jul 2026 12:14:08 -0700
> Mukesh R <mrathor@linux.microsoft.com> wrote:
> 
>> On 7/23/26 15:21, Jacob Pan wrote:
>>> Hi Mukesh,
>>
>> Hey Jacob, pl see inline..
>>
>>> On Fri, 17 Jul 2026 19:19:41 -0700
>>> Mukesh R <mrathor@linux.microsoft.com> wrote:
>>
>>     ... snip...
>>
>>>>    static int
>>>>    add_partition(struct mshv_partition *partition)
>>>>    {
>>>> @@ -2073,6 +2094,7 @@ mshv_ioctl_create_partition(void __user
>>>> *user_arg, struct device *module_dev) goto cleanup_irq_srcu;
>>>>    
>>>>    	partition->pt_id = pt_id;
>>>> +	partition->pt_vmm_tgid = current->tgid;
>>> I wonder how robust this mechanism is to identify target partition
>>> via tgid.
>>> 1) what prevents a VMM process create more than one partition? in
>>> that case each partition would have the same tgid.
>>
>> Currently, none of the VMMs we support do that, and doesn't look like
>> there is much of a demand for it.
>>
> My point is that from kernel UAPI pov, we cannot count on user behavior
> nor current VMM's implementation.

True, but as UAPI implementors, I thought we could list its
limitations. For example: an fwrite() implementation can say if you
don't fflush(), fwrite() can lose data!

>>> 2) IIUC, the lifetime of the partition is tied to FD, which is
>>> different than the lifetime of a PID. The partition FD can be
>>> inherited or passed to another process. The VMM tg can exit while
>>> the FDs can be alive. Then the tgid can be reused by another
>>> unrelated process, right?
>>
>> yeah, AI keeps telling me that, but not super accurate imo.
>>
>> we are using tgid and not pid. tgid is process group id, and that will
>> stay around as long as there is at least one process in it. if we used
>> pid, then that would be the case.
>>
> I understand it is tgid but don't think tgid makes difference in terms
> of FD lifetime.
> e.g. process A creates a partition, then passes the partition fd to
> process B over a Unix socket. Process A can then exit while process B
> still holds a reference to the partition file. At that point the tgid
> that was stored in pt_vmm_tgid can be reused by an unrelated process C,
> while the partition's file object remains alive.

I'm not sure whether you are saying that C should still be able to
manage the VM without errors, or that C could cause harm because it has
access to a possibly recycled pt_vmm_tgid. If the latter, C cannot use
pt_vmm_tgid in a harmful way because:

         if (pt->pt_vmm_tgid == current->tgid) {
                 ret_ptid = pt->pt_id;

         else return : HV_PARTITION_ID_INVALID

So, because C has different tgid, it will return HV_PARTITION_ID_INVALID
causing hypercalls to fail with invalid parameter.

>>> Would it be more robust to based this on the partition FD instead of
>>> tgid?
>>    
>> it might be, but problem with that is we need pt-id in other cases
>> where that is not available: for example in
>> hv_iommu_domain_alloc_paging and in irq remapping paths for direct
>> attached devices. if we can sort that out somehow, then we can do
>> that. but i suspect, it would take some time to figure that out, so i
>> hope we can make that a future enhancement.
>>
>> For now, i've been thinking of just putting a check and returning
>> ENOTSUPP if a vmm tries to create another partition.
>>
> IMHO, that only solves the uniqueness issue of partition ID but not the
> lifetime issue.
> maybe EEXIST instead of ENOTSUPP?

   EEXIST: A VM already exists for this process group
ENOTSUPP: more than one VM is not supported for a proc group

I think either is ok, but if you prefer EEXIST, that's fine too.


>> Thanks,
>> -Mukesh
>>
>>>>    	ret = add_partition(partition);
>>>>    	if (ret)
>>>> diff --git a/include/asm-generic/mshyperv.h
>>>> b/include/asm-generic/mshyperv.h index bf601d67cecb..e8cbc4e3f7ad
>>>> 100644 --- a/include/asm-generic/mshyperv.h
>>>> +++ b/include/asm-generic/mshyperv.h
>>>> @@ -350,6 +350,7 @@ int hv_call_add_logical_proc(int node, u32
>>>> lp_index, u32 acpi_id); int
>>>> hv_call_notify_all_processors_started(void); bool hv_lp_exists(u32
>>>> lp_index); int hv_call_create_vp(int node, u64 partition_id, u32
>>>> vp_index, u32 flags); +u64 mshv_current_partid(void);
>>>>    
>>>>    #else /* CONFIG_MSHV_ROOT */
>>>>    static inline bool hv_root_partition(void) { return false; }
>>>> @@ -380,6 +381,10 @@ static inline int hv_call_create_vp(int node,
>>>> u64 partition_id, u32 vp_index, u3 {
>>>>    	return -EOPNOTSUPP;
>>>>    }
>>>> +static inline u64 mshv_current_partid(void)
>>>> +{
>>>> +	return HV_PARTITION_ID_INVALID;
>>>> +}
>>>>    #endif /* CONFIG_MSHV_ROOT */
>>>>    
>>>>    static inline int hv_deposit_memory(u64 partition_id, u64
>>>> status)


^ permalink raw reply

* Re: [PATCH RFC 2/2] PCI: hv: Add vPCI device reset support
From: sashiko-bot @ 2026-07-24 23:25 UTC (permalink / raw)
  To: wei.liu; +Cc: linux-hyperv, linux-pci
In-Reply-To: <20260724230844.3259741-3-wei.liu@kernel.org>

> From: Wei Liu <wei.liu@kernel.org>
> 
> Hyper-V vPCI protocol version 1.5 adds a RESET_DEVICE request for projected
> PCI functions. Negotiate the new protocol version and issue the request
> through the vPCI VMBus channel from the PCI controller reset callback.
> 
> Use the existing VMBus response path. Return -ENOTTY when the host reports
> STATUS_NOT_SUPPORTED so PCI core may try another reset method.
> 
> Signed-off-by: Wei Liu <wei.liu@kernel.org>

Sashiko has reviewed this patch and found no issues. It looks great!

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260724230844.3259741-1-wei.liu@kernel.org?part=2


^ permalink raw reply

page: next (older)
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox