Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* [PATCH V2,net-next, 2/2] net: mana: Add ethtool counters for RX CQEs in coalesced type
From: Haiyang Zhang @ 2026-01-06 20:46 UTC (permalink / raw)
  To: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Konstantin Taranov, Simon Horman,
	Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
	Aditya Garg, Dipayaan Roy, Shiraz Saleem, linux-kernel,
	linux-rdma
  Cc: paulros
In-Reply-To: <1767732407-12389-1-git-send-email-haiyangz@linux.microsoft.com>

From: Haiyang Zhang <haiyangz@microsoft.com>

For RX CQEs with type CQE_RX_COALESCED_4, to measure the coalescing
efficiency, add counters to count how many contain 2, 3, or 4 packets
respectively.
Also, add a counter for the error case of the first packet having length == 0.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 25 +++++++++++++++++--
 .../ethernet/microsoft/mana/mana_ethtool.c    | 17 ++++++++++---
 include/net/mana/mana.h                       | 10 +++++---
 3 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a46a1adf83bc..78824567d80b 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2083,8 +2083,22 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 
 nextpkt:
 	pktlen = oob->ppi[i].pkt_len;
-	if (pktlen == 0)
+	if (pktlen == 0) {
+		/* Collect coalesced CQE count based on packets processed.
+		 * Coalesced CQEs have at least 2 packets, so index is i - 2.
+		 */
+		if (i > 1) {
+			u64_stats_update_begin(&rxq->stats.syncp);
+			rxq->stats.coalesced_cqe[i - 2]++;
+			u64_stats_update_end(&rxq->stats.syncp);
+		} else if (i == 0) {
+			/* Error case stat */
+			u64_stats_update_begin(&rxq->stats.syncp);
+			rxq->stats.pkt_len0_err++;
+			u64_stats_update_end(&rxq->stats.syncp);
+		}
 		return;
+	}
 
 	curr = rxq->buf_index;
 	rxbuf_oob = &rxq->rx_oobs[curr];
@@ -2102,8 +2116,15 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 
 	mana_post_pkt_rxq(rxq);
 
-	if (coalesced && (++i < MANA_RXCOMP_OOB_NUM_PPI))
+	if (!coalesced)
+		return;
+
+	if (++i < MANA_RXCOMP_OOB_NUM_PPI)
 		goto nextpkt;
+
+	u64_stats_update_begin(&rxq->stats.syncp);
+	rxq->stats.coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 2]++;
+	u64_stats_update_end(&rxq->stats.syncp);
 }
 
 static void mana_poll_rx_cq(struct mana_cq *cq)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index b2b9bfb50396..635796bfdaf1 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -20,8 +20,6 @@ static const struct mana_stats_desc mana_eth_stats[] = {
 					tx_cqe_unknown_type)},
 	{"tx_linear_pkt_cnt", offsetof(struct mana_ethtool_stats,
 				       tx_linear_pkt_cnt)},
-	{"rx_coalesced_err", offsetof(struct mana_ethtool_stats,
-					rx_coalesced_err)},
 	{"rx_cqe_unknown_type", offsetof(struct mana_ethtool_stats,
 					rx_cqe_unknown_type)},
 };
@@ -151,7 +149,7 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
-	int i;
+	int i, j;
 
 	if (stringset != ETH_SS_STATS)
 		return;
@@ -170,6 +168,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 		ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
 		ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
 		ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
+		ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
+		for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
+			ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2);
 	}
 
 	for (i = 0; i < num_queues; i++) {
@@ -203,6 +204,8 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	u64 xdp_xmit;
 	u64 xdp_drop;
 	u64 xdp_tx;
+	u64 pkt_len0_err;
+	u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1];
 	u64 tso_packets;
 	u64 tso_bytes;
 	u64 tso_inner_packets;
@@ -211,7 +214,7 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	u64 short_pkt_fmt;
 	u64 csum_partial;
 	u64 mana_map_err;
-	int q, i = 0;
+	int q, i = 0, j;
 
 	if (!apc->port_is_up)
 		return;
@@ -241,6 +244,9 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 			xdp_drop = rx_stats->xdp_drop;
 			xdp_tx = rx_stats->xdp_tx;
 			xdp_redirect = rx_stats->xdp_redirect;
+			pkt_len0_err = rx_stats->pkt_len0_err;
+			for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
+				coalesced_cqe[j] = rx_stats->coalesced_cqe[j];
 		} while (u64_stats_fetch_retry(&rx_stats->syncp, start));
 
 		data[i++] = packets;
@@ -248,6 +254,9 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 		data[i++] = xdp_drop;
 		data[i++] = xdp_tx;
 		data[i++] = xdp_redirect;
+		data[i++] = pkt_len0_err;
+		for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
+			data[i++] = coalesced_cqe[j];
 	}
 
 	for (q = 0; q < num_queues; q++) {
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 51d26ebeff6c..f8dd19860103 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -61,8 +61,11 @@ enum TRI_STATE {
 
 #define MAX_PORTS_IN_MANA_DEV 256
 
+/* Maximum number of packets per coalesced CQE */
+#define MANA_RXCOMP_OOB_NUM_PPI 4
+
 /* Update this count whenever the respective structures are changed */
-#define MANA_STATS_RX_COUNT 5
+#define MANA_STATS_RX_COUNT (6 + MANA_RXCOMP_OOB_NUM_PPI - 1)
 #define MANA_STATS_TX_COUNT 11
 
 #define MANA_RX_FRAG_ALIGNMENT 64
@@ -73,6 +76,8 @@ struct mana_stats_rx {
 	u64 xdp_drop;
 	u64 xdp_tx;
 	u64 xdp_redirect;
+	u64 pkt_len0_err;
+	u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1];
 	struct u64_stats_sync syncp;
 };
 
@@ -227,8 +232,6 @@ struct mana_rxcomp_perpkt_info {
 	u32 pkt_hash;
 }; /* HW DATA */
 
-#define MANA_RXCOMP_OOB_NUM_PPI 4
-
 /* Receive completion OOB */
 struct mana_rxcomp_oob {
 	struct mana_cqe_header cqe_hdr;
@@ -378,7 +381,6 @@ struct mana_ethtool_stats {
 	u64 tx_cqe_err;
 	u64 tx_cqe_unknown_type;
 	u64 tx_linear_pkt_cnt;
-	u64 rx_coalesced_err;
 	u64 rx_cqe_unknown_type;
 };
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH V2,net-next, 1/2] net: mana: Add support for coalesced RX packets on CQE
From: Haiyang Zhang @ 2026-01-06 20:46 UTC (permalink / raw)
  To: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Konstantin Taranov, Simon Horman,
	Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
	Aditya Garg, Dipayaan Roy, Shiraz Saleem, linux-kernel,
	linux-rdma
  Cc: paulros
In-Reply-To: <1767732407-12389-1-git-send-email-haiyangz@linux.microsoft.com>

From: Haiyang Zhang <haiyangz@microsoft.com>

Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
check and process the type CQE_RX_COALESCED_4. The default setting is
disabled, to avoid possible regression on latency.

And add ethtool handler to switch this feature. To turn it on, run:
  ethtool -C <nic> rx-frames 4
To turn it off:
  ethtool -C <nic> rx-frames 1

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
V2:
  Updated extack msg, as recommended by Jakub Kicinski, and Simon Horman.

---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 32 ++++++-----
 .../ethernet/microsoft/mana/mana_ethtool.c    | 55 +++++++++++++++++++
 include/net/mana/mana.h                       |  2 +
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 1ad154f9db1a..a46a1adf83bc 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1330,7 +1330,7 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
 	req->update_hashkey = update_key;
 	req->update_indir_tab = update_tab;
 	req->default_rxobj = apc->default_rxobj;
-	req->cqe_coalescing_enable = 0;
+	req->cqe_coalescing_enable = apc->cqe_coalescing_enable;
 
 	if (update_key)
 		memcpy(&req->hashkey, apc->hashkey, MANA_HASH_KEY_SIZE);
@@ -1864,11 +1864,12 @@ static struct sk_buff *mana_build_skb(struct mana_rxq *rxq, void *buf_va,
 }
 
 static void mana_rx_skb(void *buf_va, bool from_pool,
-			struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq)
+			struct mana_rxcomp_oob *cqe, struct mana_rxq *rxq,
+			int i)
 {
 	struct mana_stats_rx *rx_stats = &rxq->stats;
 	struct net_device *ndev = rxq->ndev;
-	uint pkt_len = cqe->ppi[0].pkt_len;
+	uint pkt_len = cqe->ppi[i].pkt_len;
 	u16 rxq_idx = rxq->rxq_idx;
 	struct napi_struct *napi;
 	struct xdp_buff xdp = {};
@@ -1912,7 +1913,7 @@ static void mana_rx_skb(void *buf_va, bool from_pool,
 	}
 
 	if (cqe->rx_hashtype != 0 && (ndev->features & NETIF_F_RXHASH)) {
-		hash_value = cqe->ppi[0].pkt_hash;
+		hash_value = cqe->ppi[i].pkt_hash;
 
 		if (cqe->rx_hashtype & MANA_HASH_L4)
 			skb_set_hash(skb, hash_value, PKT_HASH_TYPE_L4);
@@ -2047,9 +2048,11 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	struct mana_recv_buf_oob *rxbuf_oob;
 	struct mana_port_context *apc;
 	struct device *dev = gc->dev;
+	bool coalesced = false;
 	void *old_buf = NULL;
 	u32 curr, pktlen;
 	bool old_fp;
+	int i = 0;
 
 	apc = netdev_priv(ndev);
 
@@ -2064,9 +2067,8 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 		goto drop;
 
 	case CQE_RX_COALESCED_4:
-		netdev_err(ndev, "RX coalescing is unsupported\n");
-		apc->eth_stats.rx_coalesced_err++;
-		return;
+		coalesced = true;
+		break;
 
 	case CQE_RX_OBJECT_FENCE:
 		complete(&rxq->fence_event);
@@ -2079,14 +2081,10 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 		return;
 	}
 
-	pktlen = oob->ppi[0].pkt_len;
-
-	if (pktlen == 0) {
-		/* data packets should never have packetlength of zero */
-		netdev_err(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n",
-			   rxq->gdma_id, cq->gdma_id, rxq->rxobj);
+nextpkt:
+	pktlen = oob->ppi[i].pkt_len;
+	if (pktlen == 0)
 		return;
-	}
 
 	curr = rxq->buf_index;
 	rxbuf_oob = &rxq->rx_oobs[curr];
@@ -2097,12 +2095,15 @@ static void mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq,
 	/* Unsuccessful refill will have old_buf == NULL.
 	 * In this case, mana_rx_skb() will drop the packet.
 	 */
-	mana_rx_skb(old_buf, old_fp, oob, rxq);
+	mana_rx_skb(old_buf, old_fp, oob, rxq, i);
 
 drop:
 	mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu);
 
 	mana_post_pkt_rxq(rxq);
+
+	if (coalesced && (++i < MANA_RXCOMP_OOB_NUM_PPI))
+		goto nextpkt;
 }
 
 static void mana_poll_rx_cq(struct mana_cq *cq)
@@ -3276,6 +3277,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	apc->port_handle = INVALID_MANA_HANDLE;
 	apc->pf_filter_handle = INVALID_MANA_HANDLE;
 	apc->port_idx = port_idx;
+	apc->cqe_coalescing_enable = 0;
 
 	mutex_init(&apc->vport_mutex);
 	apc->vport_use_count = 0;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 0e2f4343ac67..b2b9bfb50396 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -397,6 +397,58 @@ static void mana_get_channels(struct net_device *ndev,
 	channel->combined_count = apc->num_queues;
 }
 
+static int mana_get_coalesce(struct net_device *ndev,
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	ec->rx_max_coalesced_frames =
+		apc->cqe_coalescing_enable ? MANA_RXCOMP_OOB_NUM_PPI : 1;
+
+	return 0;
+}
+
+static int mana_set_coalesce(struct net_device *ndev,
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+	u8 saved_cqe_coalescing_enable;
+	int err;
+
+	if (ec->rx_max_coalesced_frames != 1 &&
+	    ec->rx_max_coalesced_frames != MANA_RXCOMP_OOB_NUM_PPI) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "rx-frames must be 1 or %u, got %u",
+				   MANA_RXCOMP_OOB_NUM_PPI,
+				   ec->rx_max_coalesced_frames);
+		return -EINVAL;
+	}
+
+	saved_cqe_coalescing_enable = apc->cqe_coalescing_enable;
+	apc->cqe_coalescing_enable =
+		ec->rx_max_coalesced_frames == MANA_RXCOMP_OOB_NUM_PPI;
+
+	if (!apc->port_is_up)
+		return 0;
+
+	err = mana_config_rss(apc, TRI_STATE_TRUE, false, false);
+
+	if (err) {
+		netdev_err(ndev, "Set rx-frames to %u failed:%d\n",
+			   ec->rx_max_coalesced_frames, err);
+		NL_SET_ERR_MSG_FMT(extack, "Set rx-frames to %u failed",
+				   ec->rx_max_coalesced_frames);
+
+		apc->cqe_coalescing_enable = saved_cqe_coalescing_enable;
+	}
+
+	return err;
+}
+
 static int mana_set_channels(struct net_device *ndev,
 			     struct ethtool_channels *channels)
 {
@@ -517,6 +569,7 @@ static int mana_get_link_ksettings(struct net_device *ndev,
 }
 
 const struct ethtool_ops mana_ethtool_ops = {
+	.supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
 	.get_ethtool_stats	= mana_get_ethtool_stats,
 	.get_sset_count		= mana_get_sset_count,
 	.get_strings		= mana_get_strings,
@@ -527,6 +580,8 @@ const struct ethtool_ops mana_ethtool_ops = {
 	.set_rxfh		= mana_set_rxfh,
 	.get_channels		= mana_get_channels,
 	.set_channels		= mana_set_channels,
+	.get_coalesce		= mana_get_coalesce,
+	.set_coalesce		= mana_set_coalesce,
 	.get_ringparam          = mana_get_ringparam,
 	.set_ringparam          = mana_set_ringparam,
 	.get_link_ksettings	= mana_get_link_ksettings,
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index d7e089c6b694..51d26ebeff6c 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -556,6 +556,8 @@ struct mana_port_context {
 	bool port_is_up;
 	bool port_st_save; /* Saved port state */
 
+	u8 cqe_coalescing_enable;
+
 	struct mana_ethtool_stats eth_stats;
 
 	struct mana_ethtool_phy_stats phy_stats;
-- 
2.34.1


^ permalink raw reply related

* [PATCH V2,net-next, 0/2] net: mana: Add support for coalesced RX packets
From: Haiyang Zhang @ 2026-01-06 20:46 UTC (permalink / raw)
  To: linux-hyperv, netdev; +Cc: haiyangz, paulros

From: Haiyang Zhang <haiyangz@microsoft.com>

Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
update the RX code path, and ethtool handler. Also add counters for it.

Haiyang Zhang (2):
  net: mana: Add support for coalesced RX packets on CQE
  net: mana: Add ethtool counters for RX CQEs in coalesced type

 drivers/net/ethernet/microsoft/mana/mana_en.c | 49 +++++++++----
 .../ethernet/microsoft/mana/mana_ethtool.c    | 72 +++++++++++++++++--
 include/net/mana/mana.h                       | 12 ++--
 3 files changed, 112 insertions(+), 21 deletions(-)

-- 
2.34.1


^ permalink raw reply

* Re: [PATCH v2 2/2] mshv: handle gpa intercepts for arm64
From: Nuno Das Neves @ 2026-01-06 18:59 UTC (permalink / raw)
  To: Stanislav Kinsburskii, Anirudh Rayabharam
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aV05_2Lw6x8Qr_Je@skinsburskii.localdomain>

On 1/6/2026 8:36 AM, Stanislav Kinsburskii wrote:
> On Tue, Jan 06, 2026 at 07:21:41AM +0000, Anirudh Rayabharam wrote:
>> On Mon, Jan 05, 2026 at 09:04:02AM -0800, Stanislav Kinsburskii wrote:
>>> On Mon, Jan 05, 2026 at 12:28:37PM +0000, Anirudh Rayabharam wrote:
>>>> From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
>>>>
>>>> The mshv driver now uses movable pages for guests. For arm64 guests
>>>> to be functional, handle gpa intercepts for arm64 too (the current
>>>> code implements handling only for x86).
>>>>
>>>> Move some arch-agnostic functions out of #ifdefs so that they can be
>>>> re-used.
>>>>
>>>> Fixes: b9a66cd5ccbb ("mshv: Add support for movable memory regions")
>>>
>>> I'm not sure that this patch needs "Fixes" tag as it introduced new
>>> functionality rather than fixing a bug.
>>
>> This does fix a bug. The commit mentioned here regressed arm64 guests because
>> it didn't have GPA intercept handling for arm64.
>>
> 
> Were ARM guests functional before this commit? If yes, then I agree that
> this patch fixes a bug. If no, then this is just adding new
> functionality.
> I had an impression ARM is not yet supported in MSHV, so please clarify.
> 

Chiming in, I had a similar discussion with Michael regarding correcting
the value of VpRootDispatchThreadBlocked for ARM64:
https://lore.kernel.org/linux-hyperv/SN6PR02MB41574240F18B87346C659DBFD4BBA@SN6PR02MB4157.namprd02.prod.outlook.com/T/#u

Michael didn't see the need for a separate patch for that fix (with "Fixes"
tag, I assumed, though we didn't discuss it). This is because the code
is currently not doing anything due to ARM64 not yet being supported.

Keep in mind adding a "Fixes" tag marks the patch for backporting to LTS
kernels (if the original patch is in one of those, of course). There is
no need for such backporting as it has no impact for features that do not
yet exist. This 'fix' is really a precursor for ARM64 support, and we don't
have a special way of tagging those.

I can see that "Fixes" may help someone who is backporting entire features
to help them identify dependencies (e.g. backporting ARM64 support), but
the benefit of that is conjectural so I don't see it as a strong enough
reason given the above.

Nuno

> Thanks,
> Stanislav
> 
>> Thanks,
>> Anirudh.
>>
>>>
>>> Thanks,
>>> Stanislav
>>>
>>>> Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
>>>> ---
>>>>  drivers/hv/mshv_root_main.c | 15 ++++++++-------
>>>>  1 file changed, 8 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>>>> index 9cf28a3f12fe..f8c4c2ae2cc9 100644
>>>> --- a/drivers/hv/mshv_root_main.c
>>>> +++ b/drivers/hv/mshv_root_main.c
>>>> @@ -608,7 +608,6 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>>>>  	return NULL;
>>>>  }
>>>>  
>>>> -#ifdef CONFIG_X86_64
>>>>  static struct mshv_mem_region *
>>>>  mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>>>>  {
>>>> @@ -640,12 +639,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>>>  {
>>>>  	struct mshv_partition *p = vp->vp_partition;
>>>>  	struct mshv_mem_region *region;
>>>> -	struct hv_x64_memory_intercept_message *msg;
>>>>  	bool ret;
>>>>  	u64 gfn;
>>>> -
>>>> -	msg = (struct hv_x64_memory_intercept_message *)
>>>> +#if defined(CONFIG_X86_64)
>>>> +	struct hv_x64_memory_intercept_message *msg =
>>>> +		(struct hv_x64_memory_intercept_message *)
>>>> +		vp->vp_intercept_msg_page->u.payload;
>>>> +#elif defined(CONFIG_ARM64)
>>>> +	struct hv_arm64_memory_intercept_message *msg =
>>>> +		(struct hv_arm64_memory_intercept_message *)
>>>>  		vp->vp_intercept_msg_page->u.payload;
>>>> +#endif
>>>>  
>>>>  	gfn = HVPFN_DOWN(msg->guest_physical_address);
>>>>  
>>>> @@ -663,9 +667,6 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>>>  
>>>>  	return ret;
>>>>  }
>>>> -#else  /* CONFIG_X86_64 */
>>>> -static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>>>> -#endif /* CONFIG_X86_64 */
>>>>  
>>>>  static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>>>>  {
>>>> -- 
>>>> 2.34.1
>>>>


^ permalink raw reply

* Re: [PATCH v2 2/2] mshv: handle gpa intercepts for arm64
From: Stanislav Kinsburskii @ 2026-01-06 16:36 UTC (permalink / raw)
  To: Anirudh Rayabharam
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aVy4BUk9X18KiPCO@anirudh-surface.localdomain>

On Tue, Jan 06, 2026 at 07:21:41AM +0000, Anirudh Rayabharam wrote:
> On Mon, Jan 05, 2026 at 09:04:02AM -0800, Stanislav Kinsburskii wrote:
> > On Mon, Jan 05, 2026 at 12:28:37PM +0000, Anirudh Rayabharam wrote:
> > > From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> > > 
> > > The mshv driver now uses movable pages for guests. For arm64 guests
> > > to be functional, handle gpa intercepts for arm64 too (the current
> > > code implements handling only for x86).
> > > 
> > > Move some arch-agnostic functions out of #ifdefs so that they can be
> > > re-used.
> > > 
> > > Fixes: b9a66cd5ccbb ("mshv: Add support for movable memory regions")
> > 
> > I'm not sure that this patch needs "Fixes" tag as it introduced new
> > functionality rather than fixing a bug.
> 
> This does fix a bug. The commit mentioned here regressed arm64 guests because
> it didn't have GPA intercept handling for arm64.
> 

Were ARM guests functional before this commit? If yes, then I agree that
this patch fixes a bug. If no, then this is just adding new
functionality.
I was under the impression that ARM is not yet supported in MSHV, so please clarify.

Thanks,
Stanislav

> Thanks,
> Anirudh.
> 
> > 
> > Thanks,
> > Stanislav
> > 
> > > Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> > > ---
> > >  drivers/hv/mshv_root_main.c | 15 ++++++++-------
> > >  1 file changed, 8 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > index 9cf28a3f12fe..f8c4c2ae2cc9 100644
> > > --- a/drivers/hv/mshv_root_main.c
> > > +++ b/drivers/hv/mshv_root_main.c
> > > @@ -608,7 +608,6 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > >  	return NULL;
> > >  }
> > >  
> > > -#ifdef CONFIG_X86_64
> > >  static struct mshv_mem_region *
> > >  mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > >  {
> > > @@ -640,12 +639,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > >  {
> > >  	struct mshv_partition *p = vp->vp_partition;
> > >  	struct mshv_mem_region *region;
> > > -	struct hv_x64_memory_intercept_message *msg;
> > >  	bool ret;
> > >  	u64 gfn;
> > > -
> > > -	msg = (struct hv_x64_memory_intercept_message *)
> > > +#if defined(CONFIG_X86_64)
> > > +	struct hv_x64_memory_intercept_message *msg =
> > > +		(struct hv_x64_memory_intercept_message *)
> > > +		vp->vp_intercept_msg_page->u.payload;
> > > +#elif defined(CONFIG_ARM64)
> > > +	struct hv_arm64_memory_intercept_message *msg =
> > > +		(struct hv_arm64_memory_intercept_message *)
> > >  		vp->vp_intercept_msg_page->u.payload;
> > > +#endif
> > >  
> > >  	gfn = HVPFN_DOWN(msg->guest_physical_address);
> > >  
> > > @@ -663,9 +667,6 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > >  
> > >  	return ret;
> > >  }
> > > -#else  /* CONFIG_X86_64 */
> > > -static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > > -#endif /* CONFIG_X86_64 */
> > >  
> > >  static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > >  {
> > > -- 
> > > 2.34.1
> > > 

^ permalink raw reply

* Re: [PATCH v2 2/2] mshv: handle gpa intercepts for arm64
From: Anirudh Rayabharam @ 2026-01-06  7:21 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aVvvAlsohGEdC6Wv@skinsburskii.localdomain>

On Mon, Jan 05, 2026 at 09:04:02AM -0800, Stanislav Kinsburskii wrote:
> On Mon, Jan 05, 2026 at 12:28:37PM +0000, Anirudh Rayabharam wrote:
> > From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> > 
> > The mshv driver now uses movable pages for guests. For arm64 guests
> > to be functional, handle gpa intercepts for arm64 too (the current
> > code implements handling only for x86).
> > 
> > Move some arch-agnostic functions out of #ifdefs so that they can be
> > re-used.
> > 
> > Fixes: b9a66cd5ccbb ("mshv: Add support for movable memory regions")
> 
> I'm not sure that this patch needs "Fixes" tag as it introduced new
> functionality rather than fixing a bug.

This does fix a bug. The commit mentioned here regressed arm64 guests because
it didn't have GPA intercept handling for arm64.

Thanks,
Anirudh.

> 
> Thanks,
> Stanislav
> 
> > Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> > ---
> >  drivers/hv/mshv_root_main.c | 15 ++++++++-------
> >  1 file changed, 8 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > index 9cf28a3f12fe..f8c4c2ae2cc9 100644
> > --- a/drivers/hv/mshv_root_main.c
> > +++ b/drivers/hv/mshv_root_main.c
> > @@ -608,7 +608,6 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> >  	return NULL;
> >  }
> >  
> > -#ifdef CONFIG_X86_64
> >  static struct mshv_mem_region *
> >  mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> >  {
> > @@ -640,12 +639,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> >  {
> >  	struct mshv_partition *p = vp->vp_partition;
> >  	struct mshv_mem_region *region;
> > -	struct hv_x64_memory_intercept_message *msg;
> >  	bool ret;
> >  	u64 gfn;
> > -
> > -	msg = (struct hv_x64_memory_intercept_message *)
> > +#if defined(CONFIG_X86_64)
> > +	struct hv_x64_memory_intercept_message *msg =
> > +		(struct hv_x64_memory_intercept_message *)
> > +		vp->vp_intercept_msg_page->u.payload;
> > +#elif defined(CONFIG_ARM64)
> > +	struct hv_arm64_memory_intercept_message *msg =
> > +		(struct hv_arm64_memory_intercept_message *)
> >  		vp->vp_intercept_msg_page->u.payload;
> > +#endif
> >  
> >  	gfn = HVPFN_DOWN(msg->guest_physical_address);
> >  
> > @@ -663,9 +667,6 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> >  
> >  	return ret;
> >  }
> > -#else  /* CONFIG_X86_64 */
> > -static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > -#endif /* CONFIG_X86_64 */
> >  
> >  static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> >  {
> > -- 
> > 2.34.1
> > 

^ permalink raw reply

* Re: [PATCH net-next, v6] net: mana: Implement ndo_tx_timeout and serialize queue resets per port.
From: Jakub Kicinski @ 2026-01-06  1:30 UTC (permalink / raw)
  To: Dipayaan Roy
  Cc: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	pabeni, longli, kotaranov, horms, shradhagupta, ssengar, ernis,
	shirazsaleem, linux-hyperv, netdev, linux-kernel, linux-rdma,
	dipayanroy
In-Reply-To: <20260103045705.GA3757@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>

On Fri, 2 Jan 2026 20:57:05 -0800 Dipayaan Roy wrote:
> +		apc = netdev_priv(ndev);
> +		disable_work_sync(&apc->queue_reset_work.work);

AI code review points out:

  In mana_remove(), disable_work_sync() is called for each port's
  queue_reset_work. However, when resuming=true, mana_probe() creates a new
  workqueue but does not call mana_probe_port() (which contains INIT_WORK),
  and there is no enable_work() call for queue_reset_work in the resume path.

  The existing link_change_work handles this correctly: it is disabled in
  mana_remove() and re-enabled with enable_work(&ac->link_change_work) in
  mana_probe() when resuming=true.

  Should enable_work(&apc->queue_reset_work.work) be called for each port in
  the resuming path of mana_probe(), similar to how link_change_work is
  handled? Otherwise TX timeout recovery appears to remain disabled after a
  suspend/resume cycle.
-- 
pw-bot: cr

^ permalink raw reply

* Re: [PATCH] mshv: Align huge page stride with guest mapping
From: Stanislav Kinsburskii @ 2026-01-05 19:47 UTC (permalink / raw)
  To: Michael Kelley
  Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <SN6PR02MB4157C177954C42B9C026DB27D486A@SN6PR02MB4157.namprd02.prod.outlook.com>

On Mon, Jan 05, 2026 at 06:07:00PM +0000, Michael Kelley wrote:
> From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Monday, January 5, 2026 9:25 AM
> > 
> > On Sat, Jan 03, 2026 at 01:16:51AM +0000, Michael Kelley wrote:
> > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 3:35 PM
> > > >
> > > > On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote:
> > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM
> > > > > >
> 
> [snip]
> 
> > > > > >
> > > > > > I think I see your point, but I also think this issue doesn't exist,
> > > > > > because mshv_chunk_stride() returns a huge page stride only if:
> > > > > > 1. the folio order is PMD_ORDER and
> > > > > > 2. GFN is huge page aligned and
> > > > > > 3. number of 4K pages is huge pages aligned.
> > > > > >
> > > > > > In other words, a host huge page won't be mapped as huge if the page
> > > > > > can't be mapped as huge in the guest.
> > > > >
> > > > > OK, I'm missing how what you say is true. For pinned regions,
> > > > > the memory is allocated and mapped into the host userspace address
> > > > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(),
> > > > > which calls pin_user_pages_fast(). This is all done without considering
> > > > > the GFN or GFN alignment. So one or more 2M pages might be allocated
> > > > > and mapped in the host before any guest mapping is looked at. Agreed?
> > > > >
> > > >
> > > > Agreed.
> > > >
> > > > > Then mshv_prepare_pinned_region() calls mshv_region_map() to do the
> > > > > guest mapping. This eventually gets down to mshv_chunk_stride(). In
> > > > > mshv_chunk_stride() when your conditions #2 and #3 are met, the
> > > > > corresponding struct page argument to mshv_chunk_stride() may be a
> > > > > 4K page that is in the middle of a 2M page instead of at the beginning
> > > > > (if the region is mis-aligned). But the key point is that the 4K page in
> > > > > the middle is part of a folio that will return a folio order of PMD_ORDER.
> > > > > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the
> > > > > struct page arg is at the *start* of a 2M-aligned physical memory range
> > > > > that can be mapped into the guest as a 2M page.
> > > > >
> > > >
> > > > I'm trying to understand how this can even happen, so please bear with
> > > > me.
> > > > In other words (and AFAIU), what you are saying is the following:
> > > >
> > > > 1. VMM creates a mapping with a huge page(s) (this implies that virtual
> > > >    address is huge page aligned, size is huge page aligned and physical
> > > >    pages are consecutive).
> > > > 2. VMM tries to create a region via ioctl, but instead of passing the
> > > >    start of the region, it passes an offset into one of the region's
> > > >    huge pages, while at the same time keeping the base GFN and the size
> > > >    huge page aligned (to meet the #2 and #3 conditions).
> > > > 3. mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map
> > > >    the corresponding pages as huge, which will be rejected by the
> > > >    hypervisor.
> > > >
> > > > Is this accurate?
> > >
> > > Yes, pretty much. In Step 1, the VMM may just allocate some virtual
> > > address space, and not do anything to populate it with physical pages.
> > > So populating with any 2M pages may not happen until Step 2 when
> > > the ioctl calls pin_user_pages_fast(). But *when* the virtual address
> > > space gets populated with physical pages doesn't really matter. We
> > > just know that it happens before the ioctl tries to map the memory
> > > into the guest -- i.e., mshv_prepare_pinned_region() calls
> > > mshv_region_map().
> > >
> > > And yes, the problem is what you call out in Step 2: as input to the
> > > ioctl, the fields "userspace_addr" and "guest_pfn" in struct
> > > mshv_user_mem_region could have different alignments modulo 2M
> > > boundaries. When they are different, that's what I'm calling a "mis-aligned
> > > region", (referring to a struct mshv_mem_region that is created and
> > > setup by the ioctl).
> > >
> > > > > A subsequent question: if it is accurate, why does the driver need to
> > > > support this case? It looks like a VMM bug to me.
> > >
> > > I don't know if the driver needs to support this case. That's a question
> > > for the VMM people to answer. I wouldn't necessarily assume that the
> > > VMM always allocates virtual address space with exactly the size and
> > > alignment that matches the regions it creates with the ioctl. The
> > > kernel ioctl doesn't care how the VMM allocates and manages its
> > > virtual address space, so the VMM is free to do whatever it wants
> > > in that regard, as long as it meets the requirements of the ioctl. So
> > > the requirements of the ioctl in this case are something to be
> > > negotiated with the VMM.
> > >
> > > > Also, how should it support it? By rejecting such requests in the ioctl?
> > >
> > > Rejecting requests to create a mis-aligned region is certainly one option
> > > if the VMM agrees that's OK. The ioctl currently requires only that
> > > "userspace_addr" and "size" be page aligned, so those requirements
> > > could be tightened.
> > >
> > > The other approach is to fix mshv_chunk_stride() to handle the
> > > > mis-aligned case. Doing so is even easier than I first envisioned.
> > > I think this works:
> > >
> > > @@ -49,7 +49,8 @@ static int mshv_chunk_stride(struct page *page,
> > >          */
> > >         if (page_order &&
> > >             IS_ALIGNED(gfn, PTRS_PER_PMD) &&
> > > -           IS_ALIGNED(page_count, PTRS_PER_PMD))
> > > +           IS_ALIGNED(page_count, PTRS_PER_PMD) &&
> > > +           IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD))
> > >                 return 1 << page_order;
> > >
> > >         return 1;
> > >
> > > But as we discussed earlier, this fix means never getting 2M mappings
> > > in the guest for a region that is mis-aligned.
> > >
> > 
> > Although I understand the logic behind this fix, I’m hesitant to add it
> > because it looks like a workaround for a VMM bug that could bite back.
> > The approach you propose will silently map a huge page as a collection
> > of 4K pages, impacting guest performance (this will be especially
> > visible for a region containing a single huge page).
> > 
> > This fix silently allows such behavior instead of reporting it as an
> > error to user space. It’s worth noting that pinned-region population and
> > mapping happen upon ioctl invocation, so the VMM will either get an
> > error from the hypervisor (current behavior) or get a region mapped with
> > 4K pages (proposed behavior).
> > 
> > The first case is an explicit error; the second — although it allows
> > adding a region — will be less performant, significantly increase region
> > > mapping time and thus potentially guest spin-up (creation) time, and be
> > less noticeable to customers, especially those who don’t really
> > understand what’s happening under the hood and simply stumbled upon some
> > VMM bug.
> > 
> > What’s your take?
> > 
> 
> Yes, I agree with everything you say. Silently dropping into a mode where
> guest performance might be noticeably affected is usually not a good
> thing. So if the VMM code is OK with the restriction, then I'm fine with
> adding an explicit alignment check in the ioctl path code to disallow the
> mis-aligned case.
> 

But the explicit alignment check in the ioctl is already there. The only
difference is that it's done in the hypervisor and not in the kernel.

> An explicit check is needed because the code "as is" is somewhat flakey
> as I pointed out earlier. Mis-aligned pinned regions will succeed if the
> > host doesn't allocate any 2M pages, but will fail if it does. And mis-aligned
> movable regions silently go into the mode of doing all 4K mappings. An
> explicit check in the ioctl path avoids the flakiness and makes pinned
> and movable regions have consistent requirements.
> 
> On the flip side: The ioctl that creates a region is only used by the VMM,
> not by random end-user provided code like the system call API or general
> ioctls. As such, I could see the VMM wanting mis-aligned regions to work,
> with the understanding that there is potential perf impact. The VMM is
> sophisticated system software, and it may want to take the responsibility
> for making that tradeoff rather than have the kernel enforce a requirement.
> There may be cases where it makes sense to create small regions that are
> mis-aligned. I just don't know what the VMM needs or wants to do in
> creating regions.
> 

That's a fair point. Let me loop back with the VMM folks and see what
they think.

Thanks,
Stanislav

> So it's hard for me to lean either way.  I think the question must go
> to the VMM folks.
> 
> Michael
> 
> 
> 
> 
> 
> 
> 
> 

^ permalink raw reply

* Re: [PATCH v2 1/2] hyperv: add definitions for arm64 gpa intercepts
From: mrathor @ 2026-01-05 19:44 UTC (permalink / raw)
  To: Anirudh Rayabharam, vdso
  Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <atnnav7x4gzbeghpuh4fjpdig3i4zxzb56kpfvx3stgelajbm6@52lzmsycwzss>

On 1/5/26 11:27, Anirudh Rayabharam wrote:
> On Mon, Jan 05, 2026 at 08:06:02AM -0800, vdso@mailbox.org wrote:
>>
>>> On 01/05/2026 4:28 AM  Anirudh Rayabharam <anirudh@anirudhrb.com> wrote:
>>>
>>
>> [...]
>>
>>>   
>>> +#if IS_ENABLED(CONFIG_ARM64)
>>> +union hv_arm64_vp_execution_state {
>>> +	u16 as_uint16;
>>> +	struct {
>>> +		u16 cpl:2; /* Exception Level (EL) */
>>
>> Anirudh,
>>
>> Appreciate following up on the CPL field in that ARM64 structure
>> and adding the comment!
> 
> My bad, actually I was gonna explain this in a reply to the previous
> thread but it slipped my mind.
> 
>>
>> Still, using something from the x86 parlance (CPL) and adding a comment
>> stating that this is actually ARM64 EL certainly needs an explanation
>> as to _why_ using an x86 term here is beneficial, why not just call
>> the field "el"? As an analogy, here is a thought experiment of writing
>>
>> #ifdef CONFIG_ARM64
>> u64 rax; /* This is X0 */
>> #endif
>>
>> where an x86 register name would be used to refer to X0 on ARM64, and
>> that doesn't look natural.
> 
> Well, in this case neither CPL nor EL is an architecturally defined
> register name. These are just architectural concepts.
> 
>>
>> So far, I can't seem to find drawbacks in naming this field "el", only
>> benefits:
>> * ARM64 folks will immediately know what this field is, and
>> * the comment isn't required to explain the situation to the reader.
>>
>> Do you foresee any drawbacks of calling the field "el" and dropping
>> the comment? If you do, would these drawbacks outweigh the benefits?
> 
> As a general rule we want to keep these headers exactly same as the
> hypervisor headers so that we can directly ingest them at some point in
> the future.

Having said that, we've communicated the concern to the hyp team, and
there is no opposition to changing it. After the change is made on
that side, it will propagate to this side in future.

Thanks for your diligence.

-Mukesh



> I am not seeing a substantial benefit in breaking that rule. The CPL ->
> EL analogy is not a huge leap to make IMO and the comment helps. One
> could think of "current privilege level" as a generic term here.
> 
> Thanks,
> Anirudh.
> 
>>
>> [...]
>>
>> --
>> Cheers,
>> Roman


^ permalink raw reply

* Re: [PATCH v2 1/2] hyperv: add definitions for arm64 gpa intercepts
From: Anirudh Rayabharam @ 2026-01-05 19:27 UTC (permalink / raw)
  To: vdso
  Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <993970797.13531.1767629162352@app.mailbox.org>

On Mon, Jan 05, 2026 at 08:06:02AM -0800, vdso@mailbox.org wrote:
> 
> > On 01/05/2026 4:28 AM  Anirudh Rayabharam <anirudh@anirudhrb.com> wrote:
> > 
> 
> [...]
> 
> >  
> > +#if IS_ENABLED(CONFIG_ARM64)
> > +union hv_arm64_vp_execution_state {
> > +	u16 as_uint16;
> > +	struct {
> > +		u16 cpl:2; /* Exception Level (EL) */
> 
> Anirudh,
> 
> Appreciate following up on the CPL field in that ARM64 structure
> and adding the comment!

My bad, actually I was gonna explain this in a reply to the previous
thread but it slipped my mind.

> 
> Still, using something from the x86 parlance (CPL) and adding a comment
> stating that this is actually ARM64 EL certainly needs an explanation
> as to _why_ using an x86 term here is beneficial, why not just call
> the field "el"? As an analogy, here is a thought experiment of writing
> 
> #ifdef CONFIG_ARM64
> u64 rax; /* This is X0 */
> #endif
> 
> where an x86 register name would be used to refer to X0 on ARM64, and
> that doesn't look natural.

Well, in this case neither CPL nor EL is an architecturally defined
register name. These are just architectural concepts.

> 
> So far, I can't seem to find drawbacks in naming this field "el", only
> benefits:
> * ARM64 folks will immediately know what this field is, and
> * the comment isn't required to explain the situation to the reader.
> 
> Do you foresee any drawbacks of calling the field "el" and dropping
> the comment? If you do, would these drawbacks outweigh the benefits?

As a general rule we want to keep these headers exactly same as the
hypervisor headers so that we can directly ingest them at some point in
the future.

I am not seeing a substantial benefit in breaking that rule. The CPL ->
EL analogy is not a huge leap to make IMO and the comment helps. One
could think of "current privilege level" as a generic term here.

Thanks,
Anirudh.

> 
> [...]
> 
> --
> Cheers,
> Roman

^ permalink raw reply

* RE: [PATCH] mshv: Align huge page stride with guest mapping
From: Michael Kelley @ 2026-01-05 18:07 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <aVv0ALacPukXIHTw@skinsburskii.localdomain>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Monday, January 5, 2026 9:25 AM
> 
> On Sat, Jan 03, 2026 at 01:16:51AM +0000, Michael Kelley wrote:
> > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 3:35 PM
> > >
> > > On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote:
> > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM
> > > > >

[snip]

> > > > >
> > > > > I think I see your point, but I also think this issue doesn't exist,
> > > > > because mshv_chunk_stride() returns a huge page stride iff:
> > > > > 1. the folio order is PMD_ORDER and
> > > > > 2. GFN is huge page aligned and
> > > > > 3. number of 4K pages is huge pages aligned.
> > > > >
> > > > > In other words, a host huge page won't be mapped as huge if the page
> > > > > can't be mapped as huge in the guest.
> > > >
> > > > OK, I'm missing how what you say is true. For pinned regions,
> > > > the memory is allocated and mapped into the host userspace address
> > > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(),
> > > > which calls pin_user_pages_fast(). This is all done without considering
> > > > the GFN or GFN alignment. So one or more 2M pages might be allocated
> > > > and mapped in the host before any guest mapping is looked at. Agreed?
> > > >
> > >
> > > Agreed.
> > >
> > > > Then mshv_prepare_pinned_region() calls mshv_region_map() to do the
> > > > guest mapping. This eventually gets down to mshv_chunk_stride(). In
> > > > mshv_chunk_stride() when your conditions #2 and #3 are met, the
> > > > corresponding struct page argument to mshv_chunk_stride() may be a
> > > > 4K page that is in the middle of a 2M page instead of at the beginning
> > > > (if the region is mis-aligned). But the key point is that the 4K page in
> > > > the middle is part of a folio that will return a folio order of PMD_ORDER.
> > > > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the
> > > > struct page arg is at the *start* of a 2M-aligned physical memory range
> > > > that can be mapped into the guest as a 2M page.
> > > >
> > >
> > > > I'm trying to understand how this can even happen, so please bear with
> > > > me.
> > > > In other words (and AFAIU), what you are saying is the following:
> > > >
> > > > 1. VMM creates a mapping with a huge page(s) (this implies that virtual
> > > >    address is huge page aligned, size is huge page aligned and physical
> > > >    pages are consecutive).
> > > > 2. VMM tries to create a region via ioctl, but instead of passing the
> > > >    start of the region, it passes an offset into one of the region's
> > > >    huge pages, while at the same time keeping the base GFN and the size
> > > >    huge page aligned (to meet the #2 and #3 conditions).
> > > 3. mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map
> > >    the corresponding pages as huge, which will be rejected by the
> > >    hypervisor.
> > >
> > > Is this accurate?
> >
> > Yes, pretty much. In Step 1, the VMM may just allocate some virtual
> > address space, and not do anything to populate it with physical pages.
> > So populating with any 2M pages may not happen until Step 2 when
> > the ioctl calls pin_user_pages_fast(). But *when* the virtual address
> > space gets populated with physical pages doesn't really matter. We
> > just know that it happens before the ioctl tries to map the memory
> > into the guest -- i.e., mshv_prepare_pinned_region() calls
> > mshv_region_map().
> >
> > And yes, the problem is what you call out in Step 2: as input to the
> > ioctl, the fields "userspace_addr" and "guest_pfn" in struct
> > mshv_user_mem_region could have different alignments modulo 2M
> > boundaries. When they are different, that's what I'm calling a "mis-aligned
> > region", (referring to a struct mshv_mem_region that is created and
> > setup by the ioctl).
> >
> > > > A subsequent question: if it is accurate, why does the driver need to
> > > support this case? It looks like a VMM bug to me.
> >
> > I don't know if the driver needs to support this case. That's a question
> > for the VMM people to answer. I wouldn't necessarily assume that the
> > VMM always allocates virtual address space with exactly the size and
> > alignment that matches the regions it creates with the ioctl. The
> > kernel ioctl doesn't care how the VMM allocates and manages its
> > virtual address space, so the VMM is free to do whatever it wants
> > in that regard, as long as it meets the requirements of the ioctl. So
> > the requirements of the ioctl in this case are something to be
> > negotiated with the VMM.
> >
> > > Also, how should it support it? By rejecting such requests in the ioctl?
> >
> > Rejecting requests to create a mis-aligned region is certainly one option
> > if the VMM agrees that's OK. The ioctl currently requires only that
> > "userspace_addr" and "size" be page aligned, so those requirements
> > could be tightened.
> >
> > The other approach is to fix mshv_chunk_stride() to handle the
> > > mis-aligned case. Doing so is even easier than I first envisioned.
> > I think this works:
> >
> > @@ -49,7 +49,8 @@ static int mshv_chunk_stride(struct page *page,
> >          */
> >         if (page_order &&
> >             IS_ALIGNED(gfn, PTRS_PER_PMD) &&
> > -           IS_ALIGNED(page_count, PTRS_PER_PMD))
> > +           IS_ALIGNED(page_count, PTRS_PER_PMD) &&
> > +           IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD))
> >                 return 1 << page_order;
> >
> >         return 1;
> >
> > But as we discussed earlier, this fix means never getting 2M mappings
> > in the guest for a region that is mis-aligned.
> >
> 
> Although I understand the logic behind this fix, I’m hesitant to add it
> because it looks like a workaround for a VMM bug that could bite back.
> The approach you propose will silently map a huge page as a collection
> of 4K pages, impacting guest performance (this will be especially
> visible for a region containing a single huge page).
> 
> This fix silently allows such behavior instead of reporting it as an
> error to user space. It’s worth noting that pinned-region population and
> mapping happen upon ioctl invocation, so the VMM will either get an
> error from the hypervisor (current behavior) or get a region mapped with
> 4K pages (proposed behavior).
> 
> The first case is an explicit error; the second — although it allows
> adding a region — will be less performant, significantly increase region
> mapping time and thus potentially guest spin-up (creation) time, and be
> less noticeable to customers, especially those who don’t really
> understand what’s happening under the hood and simply stumbled upon some
> VMM bug.
> 
> What’s your take?
> 

Yes, I agree with everything you say. Silently dropping into a mode where
guest performance might be noticeably affected is usually not a good
thing. So if the VMM code is OK with the restriction, then I'm fine with
adding an explicit alignment check in the ioctl path code to disallow the
mis-aligned case.

An explicit check is needed because the code "as is" is somewhat flakey
as I pointed out earlier. Mis-aligned pinned regions will succeed if the
host doesn't allocate any 2M pages, but will fail if it does. And mis-aligned
movable regions silently go into the mode of doing all 4K mappings. An
explicit check in the ioctl path avoids the flakiness and makes pinned
and movable regions have consistent requirements.

On the flip side: The ioctl that creates a region is only used by the VMM,
not by random end-user provided code like the system call API or general
ioctls. As such, I could see the VMM wanting mis-aligned regions to work,
with the understanding that there is potential perf impact. The VMM is
sophisticated system software, and it may want to take the responsibility
for making that tradeoff rather than have the kernel enforce a requirement.
There may be cases where it makes sense to create small regions that are
mis-aligned. I just don't know what the VMM needs or wants to do in
creating regions.

So it's hard for me to lean either way.  I think the question must go
to the VMM folks.

Michael









^ permalink raw reply

* Re: [PATCH] mshv: Align huge page stride with guest mapping
From: Stanislav Kinsburskii @ 2026-01-05 17:25 UTC (permalink / raw)
  To: Michael Kelley
  Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <SN6PR02MB415756A0783B634297F51320D4B8A@SN6PR02MB4157.namprd02.prod.outlook.com>

On Sat, Jan 03, 2026 at 01:16:51AM +0000, Michael Kelley wrote:
> From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 3:35 PM
> > 
> > On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote:
> > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM
> > > >
> > > > On Fri, Jan 02, 2026 at 06:04:56PM +0000, Michael Kelley wrote:
> > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 9:43 AM
> > > > > >
> > > > > > On Tue, Dec 23, 2025 at 07:17:23PM +0000, Michael Kelley wrote:
> > > > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, December 23, 2025 8:26 AM
> > > > > > > >
> > > > > > > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote:
> > > > > > > > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM
> > > > > > > > > >
> > > > > > > > > [snip]
> > > > > > > > > >
> > > > > > > > > > Separately, in looking at this, I spotted another potential problem with
> > > > > > > > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm
> > > > > > > > > > not clear on. To create a new region, the user space VMM issues the
> > > > > > > > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the
> > > > > > > > > > size, and the guest PFN. The only requirement on these values is that the
> > > > > > > > > > userspace address and size be page aligned. But suppose a 4 Meg region is
> > > > > > > > > > specified where the userspace address and the guest PFN have different
> > > > > > > > > > offsets modulo 2 Meg. The userspace address range gets populated first,
> > > > > > > > > > and may contain a 2 Meg large page. Then when mshv_chunk_stride()
> > > > > > > > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told
> > > > > > > > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in
> > > > > > > > > > the page array may not be 2 Meg aligned. What does the hypervisor do in
> > > > > > > > > > this case? It can't create a 2 Meg mapping, right? So does it silently fallback
> > > > > > > > > > to creating 4K mappings, or does it return an error? Returning an error would
> > > > > > > > > > seem to be problematic for movable pages because the error wouldn't
> > > > > > > > > > occur until the guest VM is running and takes a range fault on the region.
> > > > > > > > > > Silently falling back to creating 4K mappings has performance implications,
> > > > > > > > > > though I guess it would work. My question is whether the
> > > > > > > > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an
> > > > > > > > > > error immediately.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > In thinking about this more, I can answer my own question about the
> > > > > > > > > hypervisor behavior. When HVCALL_MAP_GPA_PAGES is set, the full
> > > > > > > > > list of 4K system PFNs is not provided as an input to the hypercall, so
> > > > > > > > > the hypervisor cannot silently fall back to 4K mappings. Assuming
> > > > > > > > > sequential PFNs would be wrong, so it must return an error if the
> > > > > > > > > alignment of a system PFN isn't on a 2 Meg boundary.
> > > > > > > > >
> > > > > > > > > For a pinned region, this error happens in mshv_region_map() as
> > > > > > > > > called from  mshv_prepare_pinned_region(), so will propagate back
> > > > > > > > > to the ioctl. But the error happens only if pin_user_pages_fast()
> > > > > > > > > allocates one or more 2 Meg pages. So creating a pinned region
> > > > > > > > > where the guest PFN and userspace address have different offsets
> > > > > > > > > modulo 2 Meg might or might not succeed.
> > > > > > > > >
> > > > > > > > > For a movable region, the error probably can't occur.
> > > > > > > > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk
> > > > > > > > > around the faulting guest PFN. mshv_region_range_fault() then
> > > > > > > > > determines the corresponding userspace addr, which won't be on
> > > > > > > > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg
> > > > > > > > > page. With no 2 Meg pages, mshv_region_remap_pages() will
> > > > > > > > > always do 4K mappings and will succeed. The downside is that a
> > > > > > > > > movable region with a guest PFN and userspace address with
> > > > > > > > > different offsets never gets any 2 Meg pages or mappings.
> > > > > > > > >
> > > > > > > > > My conclusion is the same -- such misalignment should not be
> > > > > > > > > allowed when creating a region that has the potential to use 2 Meg
> > > > > > > > > pages. Regions less than 2 Meg in size could be excluded from such
> > > > > > > > > a requirement if there is benefit in doing so. It's possible to have
> > > > > > > > > regions up to (but not including) 4 Meg where the alignment prevents
> > > > > > > > > having a 2 Meg page, and those could also be excluded from the
> > > > > > > > > requirement.
> > > > > > > > >
> > > > > > > >
> > > > > > > > I'm not sure I understand the problem.
> > > > > > > > There are three cases to consider:
> > > > > > > > 1. Guest mapping, where page sizes are controlled by the guest.
> > > > > > > > 2. Host mapping, where page sizes are controlled by the host.
> > > > > > >
> > > > > > > And by "host", you mean specifically the Linux instance running in the
> > > > > > > root partition. It hosts the VMM processes and creates the memory
> > > > > > > regions for each guest.
> > > > > > >
> > > > > > > > 3. Hypervisor mapping, where page sizes are controlled by the hypervisor.
> > > > > > > >
> > > > > > > > The first case is not relevant here and is included for completeness.
> > > > > > >
> > > > > > > Agreed.
> > > > > > >
> > > > > > > >
> > > > > > > > The second and third cases (host and hypervisor) share the memory layout,
> > > > > > >
> > > > > > > Right. More specifically, they are both operating on the same set of physical
> > > > > > > memory pages, and hence "share" a set of what I've referred to as
> > > > > > > "system PFNs" (to distinguish from guest PFNs, or GFNs).
> > > > > > >
> > > > > > > > but it is up
> > > > > > > > to each entity to decide which page sizes to use. For example, the host might map the
> > > > > > > > proposed 4M region with only 4K pages, even if a 2M page is available in the middle.
> > > > > > >
> > > > > > > Agreed.
> > > > > > >
> > > > > > > > In this case, the host will map the memory as represented by 4K pages, but the hypervisor
> > > > > > > > can still discover the 2M page in the middle and adjust its page tables to use a 2M page.
> > > > > > >
> > > > > > > Yes, that's possible, but subject to significant requirements. A 2M page can be
> > > > > > > used only if the underlying physical memory is a physically contiguous 2M chunk.
> > > > > > > Furthermore, that contiguous 2M chunk must start on a physical 2M boundary,
> > > > > > > and the virtual address to which it is being mapped must be on a 2M boundary.
> > > > > > > In the case of the host, that virtual address is the user space address in the
> > > > > > > user space process. In the case of the hypervisor, that "virtual address" is the
> > > > > > > location in guest physical address space; i.e., the guest PFN left-shifted 9
> > > > > > > to be a guest physical address.
> > > > > > >
> > > > > > > These requirements are from the physical processor and its requirements on
> > > > > > > page table formats as specified by the hardware architecture. Whereas the
> > > > > > > page table entry for a 4K page contains the entire PFN, the page table entry
> > > > > > > for a 2M page omits the low order 9 bits of the PFN -- those bits must be zero,
> > > > > > > which is equivalent to requiring that the PFN be on a 2M boundary. These
> > > > > > > requirements apply to both host and hypervisor mappings.
> > > > > > >
> > > > > > > When MSHV code in the host creates a new pinned region via the ioctl,
> > > > > > > MSHV code first allocates memory for the region using pin_user_pages_fast(),
> > > > > > > which returns the system PFN for each page of physical memory that is
> > > > > > > allocated. If the host, at its discretion, allocates a 2M page, then a series
> > > > > > > of 512 sequential 4K PFNs is returned for that 2M page, and the first of
> > > > > > > the 512 sequential PFNs must have its low order 9 bits be zero.
> > > > > > >
> > > > > > > Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for
> > > > > > > the hypervisor to map the allocated memory into the guest physical
> > > > > > > address space at a particular guest PFN. If the allocated memory contains
> > > > > > > a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page,
> > > > > > > causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that
> > > > > > > the hypervisor do that mapping as a 2M large page. The hypercall does not
> > > > > > > have the option of dropping back to 4K page mappings in this case. If
> > > > > > > the 2M alignment of the system PFN is different from the 2M alignment
> > > > > > > of the target guest PFN, it's not possible to create the mapping and the
> > > > > > > hypercall fails.
> > > > > > >
> > > > > > > The core problem is that the same 2M of physical memory wants to be
> > > > > > > mapped by the host as a 2M page and by the hypervisor as a 2M page.
> > > > > > > That can't be done unless the host alignment (in the VMM virtual address
> > > > > > > space) and the guest physical address (i.e., the target guest PFN) alignment
> > > > > > > match and are both on 2M boundaries.
> > > > > > >
> > > > > >
> > > > > > But why is it a problem? If both the host and the hypervisor can map a
> > > > > > huge page, but the guest can't, it's still a win, no?
> > > > > > In other words, if VMM passes a host huge page aligned region as a guest
> > > > > > unaligned, it's a VMM problem, not a hypervisor problem. And I don't
> > > > > > understand why we would want to prevent such cases.
> > > > > >
> > > > >
> > > > > Fair enough -- mostly. If you want to allow the misaligned case and live
> > > > > with not getting the 2M mapping in the guest, that works except in the
> > > > > situation that I described above, where the HVCALL_MAP_GPA_PAGES
> > > > > hypercall fails when creating a pinned region.
> > > > >
> > > > > The failure is flakey in that if the Linux in the root partition does not
> > > > > map any of the region as a 2M page, the hypercall succeeds and the
> > > > > MSHV_GET_GUEST_MEMORY ioctl succeeds. But if the root partition
> > > > > happens to map any of the region as a 2M page, the hypercall will fail,
> > > > > and the MSHV_GET_GUEST_MEMORY ioctl will fail. Presumably such
> > > > > flakey behavior is bad for the VMM.
> > > > >
> > > > > One solution is that mshv_chunk_stride() must return a stride > 1 only
> > > > > if both the gfn (which it currently checks) AND the corresponding
> > > > > userspace_addr are 2M aligned. Then the HVCALL_MAP_GPA_PAGES
> > > > > hypercall will never have HV_MAP_GPA_LARGE_PAGE set for the
> > > > > misaligned case, and the failure won't occur.
> > > > >
> > > >
> > > > I think I see your point, but I also think this issue doesn't exist,
> > > > because mshv_chunk_stride() returns a huge page stride iff:
> > > > 1. the folio order is PMD_ORDER and
> > > > 2. GFN is huge page aligned and
> > > > 3. number of 4K pages is huge pages aligned.
> > > >
> > > > In other words, a host huge page won't be mapped as huge if the page
> > > > can't be mapped as huge in the guest.
> > >
> > > OK, I'm missing how what you say is true. For pinned regions,
> > > the memory is allocated and mapped into the host userspace address
> > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(),
> > > which calls pin_user_pages_fast(). This is all done without considering
> > > the GFN or GFN alignment. So one or more 2M pages might be allocated
> > > and mapped in the host before any guest mapping is looked at. Agreed?
> > >
> > 
> > Agreed.
> > 
> > > Then mshv_prepare_pinned_region() calls mshv_region_map() to do the
> > > guest mapping. This eventually gets down to mshv_chunk_stride(). In
> > > mshv_chunk_stride() when your conditions #2 and #3 are met, the
> > > corresponding struct page argument to mshv_chunk_stride() may be a
> > > 4K page that is in the middle of a 2M page instead of at the beginning
> > > (if the region is mis-aligned). But the key point is that the 4K page in
> > > the middle is part of a folio that will return a folio order of PMD_ORDER.
> > > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the
> > > struct page arg is at the *start* of a 2M-aligned physical memory range
> > > that can be mapped into the guest as a 2M page.
> > >
> > 
> > I'm trying to understand how this can even happen, so please bear with
> > me.
> > In other words (and AFAIU), what you are saying is the following:
> > 
> > 1. VMM creates a mapping with a huge page(s) (this implies that virtual
> >    address is huge page aligned, size is huge page aligned and physical
> >    pages are consecutive).
> > 2. VMM tries to create a region via ioctl, but instead of passing the
> >    start of the region, it passes an offset into one of the region's
> >    huge pages, while at the same time keeping the base GFN and the size
> >    huge page aligned (to meet the #2 and #3 conditions).
> > 3. mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map
> >    the corresponding pages as huge, which will be rejected by the
> >    hypervisor.
> > 
> > Is this accurate?
> 
> Yes, pretty much. In Step 1, the VMM may just allocate some virtual
> address space, and not do anything to populate it with physical pages.
> So populating with any 2M pages may not happen until Step 2 when
> the ioctl calls pin_user_pages_fast(). But *when* the virtual address
> space gets populated with physical pages doesn't really matter. We
> just know that it happens before the ioctl tries to map the memory
> into the guest -- i.e., mshv_prepare_pinned_region() calls
> mshv_region_map().
> 
> And yes, the problem is what you call out in Step 2: as input to the
> ioctl, the fields "userspace_addr" and "guest_pfn" in struct
> mshv_user_mem_region could have different alignments modulo 2M
> boundaries. When they are different, that's what I'm calling a "mis-aligned
> region", (referring to a struct mshv_mem_region that is created and
> setup by the ioctl).
> 
> > A subsequent question: if it is accurate, why does the driver need to
> > support this case? It looks like a VMM bug to me.
> 
> I don't know if the driver needs to support this case. That's a question
> for the VMM people to answer. I wouldn't necessarily assume that the
> VMM always allocates virtual address space with exactly the size and
> alignment that matches the regions it creates with the ioctl. The
> kernel ioctl doesn't care how the VMM allocates and manages its
> virtual address space, so the VMM is free to do whatever it wants
> in that regard, as long as it meets the requirements of the ioctl. So
> the requirements of the ioctl in this case are something to be
> negotiated with the VMM.
> 
> > Also, how should it support it? By rejecting such requests in the ioctl?
> 
> Rejecting requests to create a mis-aligned region is certainly one option
> if the VMM agrees that's OK. The ioctl currently requires only that
> "userspace_addr" and "size" be page aligned, so those requirements
> could be tightened.
> 
> The other approach is to fix mshv_chunk_stride() to handle the
> mis-aligned case. Doing so is even easier than I first envisioned.
> I think this works:
> 
> @@ -49,7 +49,8 @@ static int mshv_chunk_stride(struct page *page,
>          */
>         if (page_order &&
>             IS_ALIGNED(gfn, PTRS_PER_PMD) &&
> -           IS_ALIGNED(page_count, PTRS_PER_PMD))
> +           IS_ALIGNED(page_count, PTRS_PER_PMD) &&
> +           IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD))
>                 return 1 << page_order;
> 
>         return 1;
> 
> But as we discussed earlier, this fix means never getting 2M mappings
> in the guest for a region that is mis-aligned.
> 

Although I understand the logic behind this fix, I’m hesitant to add it
because it looks like a workaround for a VMM bug that could bite back.
The approach you propose will silently map a huge page as a collection
of 4K pages, impacting guest performance (this will be especially
visible for a region containing a single huge page).

This fix silently allows such behavior instead of reporting it as an
error to user space. It’s worth noting that pinned-region population and
mapping happen upon ioctl invocation, so the VMM will either get an
error from the hypervisor (current behavior) or get a region mapped with
4K pages (proposed behavior).

The first case is an explicit error; the second — although it allows
adding a region — will be less performant, significantly increase region
mapping time and thus potentially guest spin-up (creation) time, and be
less noticeable to customers, especially those who don’t really
understand what’s happening under the hood and simply stumbled upon some
VMM bug.

What’s your take?

Thanks,
Stanislav

> Michael
> 
> > 
> > Thanks,
> > Stanislav
> > 
> > > The problem does *not* happen with a movable region, but the reasoning
> > > is different. hmm_range_fault() is always called with a 2M range aligned
> > > to the GFN, which in a mis-aligned region means that the host userspace
> > > address is never 2M aligned. So hmm_range_fault() is never able to allocate
> > > and map a 2M page. mshv_chunk_stride() will never get a folio order > 1,
> > > and the hypercall is never asked to do a 2M mapping. Both host and guest
> > > mappings will always be 4K and everything works.
> > >
> > > Michael
> > >
> > > > And this function is called for
> > > > both movable and pinned regions, so the hypercall should never fail due to
> > > > huge page alignment issues.
> > > >
> > > > What do I miss here?
> > > >
> > > > Thanks,
> > > > Stanislav
> > > >
> > > >
> > > > > Michael
> > > > >
> > > > > >
> > > > > > > Movable regions behave a bit differently because the memory for the
> > > > > > > region is not allocated on the host "up front" when the region is created.
> > > > > > > The memory is faulted in as the guest runs, and the vagaries of the current
> > > > > > > MSHV in Linux code are such that 2M pages are never created on the host
> > > > > > > if the alignments don't match. HV_MAP_GPA_LARGE_PAGE is never passed
> > > > > > > to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K
> > > > > > > mappings, which works even with the misalignment.
> > > > > > >
> > > > > > > >
> > > > > > > > This adjustment happens at runtime. Could this be the missing detail here?
> > > > > > >
> > > > > > > Adjustments at runtime are a different topic from the issue I'm raising,
> > > > > > > though eventually there's some relationship. My issue occurs in the
> > > > > > > creation of a new region, and the setting up of the initial hypervisor
> > > > > > > mapping. I haven't thought through the details of adjustments at runtime.
> > > > > > >
> > > > > > > My usual caveats apply -- this is all "thought experiment". If I had the
> > > > > > > means to do some runtime testing to confirm, I would. It's possible the
> > > > > > > hypervisor is playing some trick I haven't envisioned, but I'm skeptical of
> > > > > > > that given the basics of how physical processors work with page tables.
> > > > > > >
> > > > > > > Michael

^ permalink raw reply

* Re: [PATCH v2 2/2] mshv: handle gpa intercepts for arm64
From: Stanislav Kinsburskii @ 2026-01-05 17:04 UTC (permalink / raw)
  To: Anirudh Rayabharam
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <20260105122837.1083896-3-anirudh@anirudhrb.com>

On Mon, Jan 05, 2026 at 12:28:37PM +0000, Anirudh Rayabharam wrote:
> From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> 
> The mshv driver now uses movable pages for guests. For arm64 guests
> to be functional, handle gpa intercepts for arm64 too (the current
> code implements handling only for x86).
> 
> Move some arch-agnostic functions out of #ifdefs so that they can be
> re-used.
> 
> Fixes: b9a66cd5ccbb ("mshv: Add support for movable memory regions")

I'm not sure that this patch needs a "Fixes" tag as it introduces new
functionality rather than fixing a bug.

Thanks,
Stanislav

> Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> ---
>  drivers/hv/mshv_root_main.c | 15 ++++++++-------
>  1 file changed, 8 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index 9cf28a3f12fe..f8c4c2ae2cc9 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -608,7 +608,6 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>  	return NULL;
>  }
>  
> -#ifdef CONFIG_X86_64
>  static struct mshv_mem_region *
>  mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>  {
> @@ -640,12 +639,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>  {
>  	struct mshv_partition *p = vp->vp_partition;
>  	struct mshv_mem_region *region;
> -	struct hv_x64_memory_intercept_message *msg;
>  	bool ret;
>  	u64 gfn;
> -
> -	msg = (struct hv_x64_memory_intercept_message *)
> +#if defined(CONFIG_X86_64)
> +	struct hv_x64_memory_intercept_message *msg =
> +		(struct hv_x64_memory_intercept_message *)
> +		vp->vp_intercept_msg_page->u.payload;
> +#elif defined(CONFIG_ARM64)
> +	struct hv_arm64_memory_intercept_message *msg =
> +		(struct hv_arm64_memory_intercept_message *)
>  		vp->vp_intercept_msg_page->u.payload;
> +#endif
>  
>  	gfn = HVPFN_DOWN(msg->guest_physical_address);
>  
> @@ -663,9 +667,6 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>  
>  	return ret;
>  }
> -#else  /* CONFIG_X86_64 */
> -static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> -#endif /* CONFIG_X86_64 */
>  
>  static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>  {
> -- 
> 2.34.1
> 

^ permalink raw reply

* RE: [EXTERNAL] Re: [PATCH RFC 1/2] net: mana: Add support for coalesced RX packets on CQE
From: Haiyang Zhang @ 2026-01-05 17:02 UTC (permalink / raw)
  To: Simon Horman, Haiyang Zhang
  Cc: linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	KY Srinivasan, Wei Liu, Dexuan Cui, Andrew Lunn, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Long Li,
	Konstantin Taranov, Erni Sri Satya Vennela, Shradha Gupta,
	Saurabh Sengar, Aditya Garg, Dipayaan Roy, Shiraz Saleem,
	linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org,
	Paul Rosswurm
In-Reply-To: <20260105114929.GA330625@horms.kernel.org>



> -----Original Message-----
> From: Simon Horman <horms@kernel.org>
> Sent: Monday, January 5, 2026 6:49 AM
> To: Haiyang Zhang <haiyangz@linux.microsoft.com>
> Cc: linux-hyperv@vger.kernel.org; netdev@vger.kernel.org; KY Srinivasan
> <kys@microsoft.com>; Haiyang Zhang <haiyangz@microsoft.com>; Wei Liu
> <wei.liu@kernel.org>; Dexuan Cui <DECUI@microsoft.com>; Andrew Lunn
> <andrew+netdev@lunn.ch>; David S. Miller <davem@davemloft.net>; Eric
> Dumazet <edumazet@google.com>; Jakub Kicinski <kuba@kernel.org>; Paolo
> Abeni <pabeni@redhat.com>; Long Li <longli@microsoft.com>; Konstantin
> Taranov <kotaranov@microsoft.com>; Erni Sri Satya Vennela
> <ernis@linux.microsoft.com>; Shradha Gupta
> <shradhagupta@linux.microsoft.com>; Saurabh Sengar
> <ssengar@linux.microsoft.com>; Aditya Garg
> <gargaditya@linux.microsoft.com>; Dipayaan Roy
> <dipayanroy@linux.microsoft.com>; Shiraz Saleem
> <shirazsaleem@microsoft.com>; linux-kernel@vger.kernel.org; linux-
> rdma@vger.kernel.org; Paul Rosswurm <paulros@microsoft.com>
> Subject: [EXTERNAL] Re: [PATCH RFC 1/2] net: mana: Add support for
> coalesced RX packets on CQE
> 
> On Tue, Dec 16, 2025 at 07:57:54AM -0800, Haiyang Zhang wrote:
> > From: Haiyang Zhang <haiyangz@microsoft.com>
> >
> > Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
> > check and process the type CQE_RX_COALESCED_4. The default setting is
> > disabled, to avoid possible regression on latency.
> >
> > And add ethtool handler to switch this feature. To turn it on, run:
> >   ethtool -C <nic> rx-frames 4
> > To turn it off:
> >   ethtool -C <nic> rx-frames 1
> >
> > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> 
> ...
> 
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > index 0e2f4343ac67..1b9ed5c9bbff 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> > @@ -397,6 +397,58 @@ static void mana_get_channels(struct net_device
> *ndev,
> >  	channel->combined_count = apc->num_queues;
> >  }
> >
> > +static int mana_get_coalesce(struct net_device *ndev,
> > +			     struct ethtool_coalesce *ec,
> > +			     struct kernel_ethtool_coalesce *kernel_coal,
> > +			     struct netlink_ext_ack *extack)
> 
> ...
> 
> > +	if (err) {
> > +		netdev_err(ndev, "Set rx-frames to %u failed:%d\n",
> > +			   ec->rx_max_coalesced_frames, err);
> > +		NL_SET_ERR_MSG_FMT(extack, "Set rx-frames to %u failed:%d\n",
> > +				   ec->rx_max_coalesced_frames, err);
> 
> nit: I don't think the trailing '\n' is necessary here.
> 
Will update it.

Thanks,
- Haiyang

^ permalink raw reply

* Re: [PATCH v2 1/2] hyperv: add definitions for arm64 gpa intercepts
From: Stanislav Kinsburskii @ 2026-01-05 17:01 UTC (permalink / raw)
  To: Anirudh Rayabharam
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <20260105122837.1083896-2-anirudh@anirudhrb.com>

On Mon, Jan 05, 2026 at 12:28:36PM +0000, Anirudh Rayabharam wrote:
> From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> 
> Add definitions required for handling GPA intercepts on arm64.
> 
> Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>

Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>

> ---
>  include/hyperv/hvhdk.h | 47 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 47 insertions(+)
> 
> diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
> index 469186df7826..08965970c17d 100644
> --- a/include/hyperv/hvhdk.h
> +++ b/include/hyperv/hvhdk.h
> @@ -800,6 +800,53 @@ struct hv_x64_memory_intercept_message {
>  	u8 instruction_bytes[16];
>  } __packed;
>  
> +#if IS_ENABLED(CONFIG_ARM64)
> +union hv_arm64_vp_execution_state {
> +	u16 as_uint16;
> +	struct {
> +		u16 cpl:2; /* Exception Level (EL) */
> +		u16 debug_active:1;
> +		u16 interruption_pending:1;
> +		u16 vtl:4;
> +		u16 virtualization_fault_active:1;
> +		u16 reserved:7;
> +	} __packed;
> +};
> +
> +struct hv_arm64_intercept_message_header {
> +	u32 vp_index;
> +	u8 instruction_length;
> +	u8 intercept_access_type;
> +	union hv_arm64_vp_execution_state execution_state;
> +	u64 pc;
> +	u64 cpsr;
> +} __packed;
> +
> +union hv_arm64_memory_access_info {
> +	u8 as_uint8;
> +	struct {
> +		u8 gva_valid:1;
> +		u8 gva_gpa_valid:1;
> +		u8 hypercall_output_pending:1;
> +		u8 reserved:5;
> +	} __packed;
> +};
> +
> +struct hv_arm64_memory_intercept_message {
> +	struct hv_arm64_intercept_message_header header;
> +	u32 cache_type; /* enum hv_cache_type */
> +	u8 instruction_byte_count;
> +	union hv_arm64_memory_access_info memory_access_info;
> +	u16 reserved1;
> +	u8 instruction_bytes[4];
> +	u32 reserved2;
> +	u64 guest_virtual_address;
> +	u64 guest_physical_address;
> +	u64 syndrome;
> +} __packed;
> +
> +#endif /* CONFIG_ARM64 */
> +
>  /*
>   * Dispatch state for the VP communicated by the hypervisor to the
>   * VP-dispatching thread in the root on return from HVCALL_DISPATCH_VP.
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [PATCH v2 1/2] hyperv: add definitions for arm64 gpa intercepts
From: vdso @ 2026-01-05 16:06 UTC (permalink / raw)
  To: Anirudh Rayabharam
  Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260105122837.1083896-2-anirudh@anirudhrb.com>

> On 01/05/2026 4:28 AM  Anirudh Rayabharam <anirudh@anirudhrb.com> wrote:
> 

[...]

>  
> +#if IS_ENABLED(CONFIG_ARM64)
> +union hv_arm64_vp_execution_state {
> +	u16 as_uint16;
> +	struct {
> +		u16 cpl:2; /* Exception Level (EL) */

Anirudh,

Appreciate following up on the CPL field in that ARM64 structure
and adding the comment!

Still, using something from the x86 parlance (CPL) and adding a comment
stating that this is actually ARM64 EL certainly needs an explanation
as to _why_ using an x86 term here is beneficial, why not just call
the field "el"? As an analogy, here is a thought experiment of writing

#ifdef CONFIG_ARM64
u64 rax; /* This is X0 */
#endif

where an x86 register name would be used to refer to X0 on ARM64, and
that doesn't look natural.

So far, I can't seem to find drawbacks in naming this field "el", only
benefits:
* ARM64 folks will immediately know what this field is, and
* the comment isn't required to explain the situation to the reader.

Do you foresee any drawbacks of calling the field "el" and dropping
the comment? If you do, would these drawbacks outweigh the benefits?

[...]

--
Cheers,
Roman

^ permalink raw reply

* Re: [PATCH v2 8/8] KVM: SVM: Assert that Hyper-V's HV_SVM_EXITCODE_ENL == SVM_EXIT_SW
From: Sean Christopherson @ 2026-01-05 15:52 UTC (permalink / raw)
  To: Vitaly Kuznetsov
  Cc: Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, kvm, linux-hyperv, linux-kernel, Jim Mattson,
	Yosry Ahmed
In-Reply-To: <87eco8bajg.fsf@redhat.com>

On Fri, Jan 02, 2026, Vitaly Kuznetsov wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > Add a build-time assertion that Hyper-V's "enlightened" exit code is the
> > same as the AMD-defined "Reserved for Host" exit code, mostly to help
> > readers connect the dots and understand why synthesizing a software-defined
> > exit code is safe/ok.
> >
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >  arch/x86/kvm/svm/hyperv.c | 6 ++++++
> >  1 file changed, 6 insertions(+)
> >
> > diff --git a/arch/x86/kvm/svm/hyperv.c b/arch/x86/kvm/svm/hyperv.c
> > index 3ec580d687f5..4f24dcb45116 100644
> > --- a/arch/x86/kvm/svm/hyperv.c
> > +++ b/arch/x86/kvm/svm/hyperv.c
> > @@ -10,6 +10,12 @@ void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu)
> >  {
> >  	struct vcpu_svm *svm = to_svm(vcpu);
> >  
> > +	/*
> > +	 * The exit code used by Hyper-V for software-defined exits is reserved
> > +	 * by AMD specifically for such use cases.
> > +	 */
> > +	BUILD_BUG_ON(HV_SVM_EXITCODE_ENL != SVM_EXIT_SW);
> > +
> >  	svm->vmcb->control.exit_code = HV_SVM_EXITCODE_ENL;
> >  	svm->vmcb->control.exit_info_1 = HV_SVM_ENL_EXITCODE_TRAP_AFTER_FLUSH;
> >  	svm->vmcb->control.exit_info_2 = 0;
> 
> Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
> 
> Alternatively (or additionally?) to BUG_ON, I guess we could've
> 
> #define HV_SVM_EXITCODE_ENL SVM_EXIT_SW 
> 
> unless including SVM's headers into include/hyperv/hvgdk.h is too big of
> a mess.

Heh, I had the same thought[*], but Wei pointed out that the definitions in hvgdk.h
mirror internal Microsoft headers:

  On Fri, Nov 14, 2025, Wei Liu wrote:
  > On Fri, Nov 14, 2025 at 07:22:41AM -0800, Sean Christopherson wrote:
  > > On Fri, Nov 14, 2025, Michael Kelley wrote:
  > > > From: Sean Christopherson <seanjc@google.com> Sent: Thursday, November 13, 2025 2:56 PM
  > > > > @@ -281,7 +281,7 @@ struct hv_vmcb_enlightenments {
  > > > >  #define HV_VMCB_NESTED_ENLIGHTENMENTS		31
  > > > > 
  > > > >  /* Synthetic VM-Exit */
  > > > > -#define HV_SVM_EXITCODE_ENL			0xf0000000
  > > > > +#define HV_SVM_EXITCODE_ENL			0xf0000000u
  > > > 
  > > > Is there a reason for making this Hyper-V code just "u", while
  > > > making the SVM_VMGEXIT_* values "ull"? I don't think
  > > > "u" vs. "ull" should make any difference when assigning to a
  > > > u64, but the inconsistency piqued my interest ....
  > > 
  > > I hedged and went for a more "minimal" change because it isn't KVM code, and at
  > > the time because I thought the value isn't defined by the APM.  Though looking
  > > again at the APM, it does reserve that value for software
  > > 
  > >   F000_000h    Unused    Reserved for Host.
  > > 
  > > and I can't find anything in the TLFS.  Ah, my PDF copy is just stale, it's indeed
  > > defined as a synthetic exit.
  > > 
  > >   https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/nested-virtualization#synthetic-vm-exit
  > > 
  > > Anyways, I'm in favor of making HV_SVM_EXITCODE_ENL an ull, though part of me
  > > wonders if we should do:
  > > 
  > >   #define HV_SVM_EXITCODE_ENL	SVM_EXIT_SW
  > 
  > I know this is very tempting, but these headers are supposed to mirror
  > Microsoft's internal headers, so we would like to keep them
  > self-contained for ease of tracking.
  > 
  > It should be fine to add the "ull" suffix here. I briefly talked to a
  > hypervisor developer and they agreed.

[*] https://lore.kernel.org/all/aRdJQQ7_j6RcHwjJ@google.com

^ permalink raw reply

* [PATCH v2 2/2] mshv: handle gpa intercepts for arm64
From: Anirudh Rayabharam @ 2026-01-05 12:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel; +Cc: anirudh
In-Reply-To: <20260105122837.1083896-1-anirudh@anirudhrb.com>

From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>

The mshv driver now uses movable pages for guests. For arm64 guests
to be functional, handle gpa intercepts for arm64 too (the current
code implements handling only for x86).

Move some arch-agnostic functions out of #ifdefs so that they can be
re-used.

Fixes: b9a66cd5ccbb ("mshv: Add support for movable memory regions")
Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
---
 drivers/hv/mshv_root_main.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 9cf28a3f12fe..f8c4c2ae2cc9 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -608,7 +608,6 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
 	return NULL;
 }
 
-#ifdef CONFIG_X86_64
 static struct mshv_mem_region *
 mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
 {
@@ -640,12 +639,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
 {
 	struct mshv_partition *p = vp->vp_partition;
 	struct mshv_mem_region *region;
-	struct hv_x64_memory_intercept_message *msg;
 	bool ret;
 	u64 gfn;
-
-	msg = (struct hv_x64_memory_intercept_message *)
+#if defined(CONFIG_X86_64)
+	struct hv_x64_memory_intercept_message *msg =
+		(struct hv_x64_memory_intercept_message *)
+		vp->vp_intercept_msg_page->u.payload;
+#elif defined(CONFIG_ARM64)
+	struct hv_arm64_memory_intercept_message *msg =
+		(struct hv_arm64_memory_intercept_message *)
 		vp->vp_intercept_msg_page->u.payload;
+#endif
 
 	gfn = HVPFN_DOWN(msg->guest_physical_address);
 
@@ -663,9 +667,6 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
 
 	return ret;
 }
-#else  /* CONFIG_X86_64 */
-static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
-#endif /* CONFIG_X86_64 */
 
 static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
 {
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 1/2] hyperv: add definitions for arm64 gpa intercepts
From: Anirudh Rayabharam @ 2026-01-05 12:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel; +Cc: anirudh
In-Reply-To: <20260105122837.1083896-1-anirudh@anirudhrb.com>

From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>

Add definitions required for handling GPA intercepts on arm64.

Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
---
 include/hyperv/hvhdk.h | 47 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index 469186df7826..08965970c17d 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -800,6 +800,53 @@ struct hv_x64_memory_intercept_message {
 	u8 instruction_bytes[16];
 } __packed;
 
+#if IS_ENABLED(CONFIG_ARM64)
+union hv_arm64_vp_execution_state {
+	u16 as_uint16;
+	struct {
+		u16 cpl:2; /* Exception Level (EL) */
+		u16 debug_active:1;
+		u16 interruption_pending:1;
+		u16 vtl:4;
+		u16 virtualization_fault_active:1;
+		u16 reserved:7;
+	} __packed;
+};
+
+struct hv_arm64_intercept_message_header {
+	u32 vp_index;
+	u8 instruction_length;
+	u8 intercept_access_type;
+	union hv_arm64_vp_execution_state execution_state;
+	u64 pc;
+	u64 cpsr;
+} __packed;
+
+union hv_arm64_memory_access_info {
+	u8 as_uint8;
+	struct {
+		u8 gva_valid:1;
+		u8 gva_gpa_valid:1;
+		u8 hypercall_output_pending:1;
+		u8 reserved:5;
+	} __packed;
+};
+
+struct hv_arm64_memory_intercept_message {
+	struct hv_arm64_intercept_message_header header;
+	u32 cache_type; /* enum hv_cache_type */
+	u8 instruction_byte_count;
+	union hv_arm64_memory_access_info memory_access_info;
+	u16 reserved1;
+	u8 instruction_bytes[4];
+	u32 reserved2;
+	u64 guest_virtual_address;
+	u64 guest_physical_address;
+	u64 syndrome;
+} __packed;
+
+#endif /* CONFIG_ARM64 */
+
 /*
  * Dispatch state for the VP communicated by the hypervisor to the
  * VP-dispatching thread in the root on return from HVCALL_DISPATCH_VP.
-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 0/2] Fixes for movable pages
From: Anirudh Rayabharam @ 2026-01-05 12:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel; +Cc: anirudh

From: "Anirudh Rayabharam (Microsoft)" <anirudh@anirudhrb.com>

Fix movable pages for arm64 guests by implementing a GPA intercept
handler.

v2:
  - Added "Fixes:" tag
  - Got rid of the utility function to get intercept GPA and instead
    integrated the rather small logic into the GPA intercept handling
    function.
  - Dropped patch 3 since it was applied to the fixes tree.

Anirudh Rayabharam (Microsoft) (2):
  hyperv: add definitions for arm64 gpa intercepts
  mshv: handle gpa intercepts for arm64

 drivers/hv/mshv_root_main.c | 15 ++++++------
 include/hyperv/hvhdk.h      | 47 +++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 7 deletions(-)

-- 
2.34.1


^ permalink raw reply

* Re: [PATCH RFC 1/2] net: mana: Add support for coalesced RX packets on CQE
From: Simon Horman @ 2026-01-05 11:49 UTC (permalink / raw)
  To: Haiyang Zhang
  Cc: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Long Li, Konstantin Taranov,
	Erni Sri Satya Vennela, Shradha Gupta, Saurabh Sengar,
	Aditya Garg, Dipayaan Roy, Shiraz Saleem, linux-kernel,
	linux-rdma, paulros
In-Reply-To: <1765900682-22114-1-git-send-email-haiyangz@linux.microsoft.com>

On Tue, Dec 16, 2025 at 07:57:54AM -0800, Haiyang Zhang wrote:
> From: Haiyang Zhang <haiyangz@microsoft.com>
> 
> Our NIC can have up to 4 RX packets on 1 CQE. To support this feature,
> check and process the type CQE_RX_COALESCED_4. The default setting is
> disabled, to avoid possible regression on latency.
> 
> And add ethtool handler to switch this feature. To turn it on, run:
>   ethtool -C <nic> rx-frames 4
> To turn it off:
>   ethtool -C <nic> rx-frames 1
> 
> Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>

...

> diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> index 0e2f4343ac67..1b9ed5c9bbff 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> @@ -397,6 +397,58 @@ static void mana_get_channels(struct net_device *ndev,
>  	channel->combined_count = apc->num_queues;
>  }
>  
> +static int mana_get_coalesce(struct net_device *ndev,
> +			     struct ethtool_coalesce *ec,
> +			     struct kernel_ethtool_coalesce *kernel_coal,
> +			     struct netlink_ext_ack *extack)

...

> +	if (err) {
> +		netdev_err(ndev, "Set rx-frames to %u failed:%d\n",
> +			   ec->rx_max_coalesced_frames, err);
> +		NL_SET_ERR_MSG_FMT(extack, "Set rx-frames to %u failed:%d\n",
> +				   ec->rx_max_coalesced_frames, err);

nit: I don't think the trailing '\n' is necessary here.

     Flagged by coccinelle.

> +
> +		apc->cqe_coalescing_enable = saved_cqe_coalescing_enable;
> +	}
> +
> +	return err;
> +}

...

^ permalink raw reply

* [PATCH v5 21/21] x86/pvlocks: Move paravirt spinlock functions into own header
From: Juergen Gross @ 2026-01-05 11:05 UTC (permalink / raw)
  To: linux-kernel, x86, linux-hyperv, virtualization, kvm
  Cc: Juergen Gross, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, Ajay Kaher,
	Alexey Makhalov, Broadcom internal kernel review list,
	Paolo Bonzini, Vitaly Kuznetsov, Boris Ostrovsky, Josh Poimboeuf,
	Peter Zijlstra, xen-devel
In-Reply-To: <20260105110520.21356-1-jgross@suse.com>

Instead of having the pv spinlock function definitions in paravirt.h,
move them into the new header paravirt-spinlock.h.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- use new header instead of qspinlock.h
- use dedicated pv_ops_lock array
- move more paravirt related lock code
V3:
- hide native_pv_lock_init() with CONFIG_SMP (kernel test robot)
V4:
- don't reference pv_ops_lock without CONFIG_PARAVIRT_SPINLOCKS
  (kernel test robot)
V5:
- move paravirt_set_cap() declaration into paravirt-base.h
  (kernel test robot)
---
 arch/x86/hyperv/hv_spinlock.c            |  10 +-
 arch/x86/include/asm/paravirt-base.h     |   6 +
 arch/x86/include/asm/paravirt-spinlock.h | 145 +++++++++++++++++++++++
 arch/x86/include/asm/paravirt.h          |  61 ----------
 arch/x86/include/asm/paravirt_types.h    |  17 ---
 arch/x86/include/asm/qspinlock.h         |  87 +-------------
 arch/x86/kernel/Makefile                 |   2 +-
 arch/x86/kernel/kvm.c                    |  12 +-
 arch/x86/kernel/paravirt-spinlocks.c     |  26 +++-
 arch/x86/kernel/paravirt.c               |  21 ----
 arch/x86/xen/spinlock.c                  |  10 +-
 tools/objtool/check.c                    |   1 +
 12 files changed, 198 insertions(+), 200 deletions(-)
 create mode 100644 arch/x86/include/asm/paravirt-spinlock.h

diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index 2a3c2afb0154..210b494e4de0 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -78,11 +78,11 @@ void __init hv_init_spinlocks(void)
 	pr_info("PV spinlocks enabled\n");
 
 	__pv_init_lock_hash();
-	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
-	pv_ops.lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
-	pv_ops.lock.wait = hv_qlock_wait;
-	pv_ops.lock.kick = hv_qlock_kick;
-	pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted);
+	pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_ops_lock.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+	pv_ops_lock.wait = hv_qlock_wait;
+	pv_ops_lock.kick = hv_qlock_kick;
+	pv_ops_lock.vcpu_is_preempted = PV_CALLEE_SAVE(hv_vcpu_is_preempted);
 }
 
 static __init int hv_parse_nopvspin(char *arg)
diff --git a/arch/x86/include/asm/paravirt-base.h b/arch/x86/include/asm/paravirt-base.h
index 3827ea20de18..982a0b93bc76 100644
--- a/arch/x86/include/asm/paravirt-base.h
+++ b/arch/x86/include/asm/paravirt-base.h
@@ -26,4 +26,10 @@ u64 _paravirt_ident_64(u64);
 #endif
 #define paravirt_nop	((void *)nop_func)
 
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void paravirt_set_cap(void);
+#else
+static inline void paravirt_set_cap(void) { }
+#endif
+
 #endif /* _ASM_X86_PARAVIRT_BASE_H */
diff --git a/arch/x86/include/asm/paravirt-spinlock.h b/arch/x86/include/asm/paravirt-spinlock.h
new file mode 100644
index 000000000000..a5011ef3a6cc
--- /dev/null
+++ b/arch/x86/include/asm/paravirt-spinlock.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_X86_PARAVIRT_SPINLOCK_H
+#define _ASM_X86_PARAVIRT_SPINLOCK_H
+
+#include <asm/paravirt_types.h>
+
+#ifdef CONFIG_SMP
+#include <asm/spinlock_types.h>
+#endif
+
+struct qspinlock;
+
+struct pv_lock_ops {
+	void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+	struct paravirt_callee_save queued_spin_unlock;
+
+	void (*wait)(u8 *ptr, u8 val);
+	void (*kick)(int cpu);
+
+	struct paravirt_callee_save vcpu_is_preempted;
+} __no_randomize_layout;
+
+extern struct pv_lock_ops pv_ops_lock;
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+extern bool nopvspin;
+
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+							 u32 val)
+{
+	PVOP_VCALL2(pv_ops_lock, queued_spin_lock_slowpath, lock, val);
+}
+
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	PVOP_ALT_VCALLEE1(pv_ops_lock, queued_spin_unlock, lock,
+			  "movb $0, (%%" _ASM_ARG1 ");",
+			  ALT_NOT(X86_FEATURE_PVUNLOCK));
+}
+
+static __always_inline bool pv_vcpu_is_preempted(long cpu)
+{
+	return PVOP_ALT_CALLEE1(bool, pv_ops_lock, vcpu_is_preempted, cpu,
+				"xor %%" _ASM_AX ", %%" _ASM_AX ";",
+				ALT_NOT(X86_FEATURE_VCPUPREEMPT));
+}
+
+#define queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+	smp_store_release(&lock->locked, 0);
+}
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	pv_queued_spin_lock_slowpath(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	kcsan_release();
+	pv_queued_spin_unlock(lock);
+}
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(long cpu)
+{
+	return pv_vcpu_is_preempted(cpu);
+}
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+	PVOP_VCALL2(pv_ops_lock, wait, ptr, val);
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+	PVOP_VCALL1(pv_ops_lock, kick, cpu);
+}
+
+void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
+bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+void __init native_pv_lock_init(void);
+__visible void __native_queued_spin_unlock(struct qspinlock *lock);
+bool pv_is_native_spin_unlock(void);
+__visible bool __native_vcpu_is_preempted(long cpu);
+bool pv_is_native_vcpu_is_preempted(void);
+
+/*
+ * virt_spin_lock_key - disables by default the virt_spin_lock() hijack.
+ *
+ * Native (and PV wanting native due to vCPU pinning) should keep this key
+ * disabled. Native does not touch the key.
+ *
+ * When in a guest then native_pv_lock_init() enables the key first and
+ * KVM/XEN might conditionally disable it later in the boot process again.
+ */
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
+
+/*
+ * Shortcut for the queued_spin_lock_slowpath() function that allows
+ * virt to hijack it.
+ *
+ * Returns:
+ *   true - lock has been negotiated, all done;
+ *   false - queued_spin_lock_slowpath() will do its thing.
+ */
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
+{
+	int val;
+
+	if (!static_branch_likely(&virt_spin_lock_key))
+		return false;
+
+	/*
+	 * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+	 * back to a Test-and-Set spinlock, because fair locks have
+	 * horrible lock 'holder' preemption issues.
+	 */
+
+ __retry:
+	val = atomic_read(&lock->val);
+
+	if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+		cpu_relax();
+		goto __retry;
+	}
+
+	return true;
+}
+
+#endif /* _ASM_X86_PARAVIRT_SPINLOCK_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ec274d13bae0..b21072af731d 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -19,15 +19,6 @@
 #include <linux/cpumask.h>
 #include <asm/frame.h>
 
-__visible void __native_queued_spin_unlock(struct qspinlock *lock);
-bool pv_is_native_spin_unlock(void);
-__visible bool __native_vcpu_is_preempted(long cpu);
-bool pv_is_native_vcpu_is_preempted(void);
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-void __init paravirt_set_cap(void);
-#endif
-
 /* The paravirtualized I/O functions */
 static inline void slow_down_io(void)
 {
@@ -522,46 +513,7 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 {
 	pv_ops.mmu.set_fixmap(idx, phys, flags);
 }
-#endif
-
-#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-
-static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
-							u32 val)
-{
-	PVOP_VCALL2(pv_ops, lock.queued_spin_lock_slowpath, lock, val);
-}
-
-static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
-{
-	PVOP_ALT_VCALLEE1(pv_ops, lock.queued_spin_unlock, lock,
-			  "movb $0, (%%" _ASM_ARG1 ");",
-			  ALT_NOT(X86_FEATURE_PVUNLOCK));
-}
-
-static __always_inline void pv_wait(u8 *ptr, u8 val)
-{
-	PVOP_VCALL2(pv_ops, lock.wait, ptr, val);
-}
-
-static __always_inline void pv_kick(int cpu)
-{
-	PVOP_VCALL1(pv_ops, lock.kick, cpu);
-}
-
-static __always_inline bool pv_vcpu_is_preempted(long cpu)
-{
-	return PVOP_ALT_CALLEE1(bool, pv_ops, lock.vcpu_is_preempted, cpu,
-				"xor %%" _ASM_AX ", %%" _ASM_AX ";",
-				ALT_NOT(X86_FEATURE_VCPUPREEMPT));
-}
 
-void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
-bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
-
-#endif /* SMP && PARAVIRT_SPINLOCKS */
-
-#ifdef CONFIG_PARAVIRT_XXL
 static __always_inline unsigned long arch_local_save_flags(void)
 {
 	return PVOP_ALT_CALLEE0(unsigned long, pv_ops, irq.save_fl, "pushf; pop %%rax;",
@@ -588,8 +540,6 @@ static __always_inline unsigned long arch_local_irq_save(void)
 }
 #endif
 
-void native_pv_lock_init(void) __init;
-
 #else  /* __ASSEMBLER__ */
 
 #ifdef CONFIG_X86_64
@@ -613,12 +563,6 @@ void native_pv_lock_init(void) __init;
 #endif /* __ASSEMBLER__ */
 #else  /* CONFIG_PARAVIRT */
 # define default_banner x86_init_noop
-
-#ifndef __ASSEMBLER__
-static inline void native_pv_lock_init(void)
-{
-}
-#endif
 #endif /* !CONFIG_PARAVIRT */
 
 #ifndef __ASSEMBLER__
@@ -634,10 +578,5 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
 }
 #endif
 
-#ifndef CONFIG_PARAVIRT_SPINLOCKS
-static inline void paravirt_set_cap(void)
-{
-}
-#endif
 #endif /* __ASSEMBLER__ */
 #endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b36d425d099b..7ccd41628d36 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -184,22 +184,6 @@ struct pv_mmu_ops {
 #endif
 } __no_randomize_layout;
 
-#ifdef CONFIG_SMP
-#include <asm/spinlock_types.h>
-#endif
-
-struct qspinlock;
-
-struct pv_lock_ops {
-	void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
-	struct paravirt_callee_save queued_spin_unlock;
-
-	void (*wait)(u8 *ptr, u8 val);
-	void (*kick)(int cpu);
-
-	struct paravirt_callee_save vcpu_is_preempted;
-} __no_randomize_layout;
-
 /* This contains all the paravirt structures: we get a convenient
  * number for each function using the offset which we use to indicate
  * what to patch. */
@@ -207,7 +191,6 @@ struct paravirt_patch_template {
 	struct pv_cpu_ops	cpu;
 	struct pv_irq_ops	irq;
 	struct pv_mmu_ops	mmu;
-	struct pv_lock_ops	lock;
 } __no_randomize_layout;
 
 extern struct paravirt_patch_template pv_ops;
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 68da67df304d..25a1919542d9 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -7,6 +7,9 @@
 #include <asm-generic/qspinlock_types.h>
 #include <asm/paravirt.h>
 #include <asm/rmwcc.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt-spinlock.h>
+#endif
 
 #define _Q_PENDING_LOOPS	(1 << 9)
 
@@ -27,90 +30,10 @@ static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lo
 	return val;
 }
 
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __pv_init_lock_hash(void);
-extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
-extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
-extern bool nopvspin;
-
-#define	queued_spin_unlock queued_spin_unlock
-/**
- * queued_spin_unlock - release a queued spinlock
- * @lock : Pointer to queued spinlock structure
- *
- * A smp_store_release() on the least-significant byte.
- */
-static inline void native_queued_spin_unlock(struct qspinlock *lock)
-{
-	smp_store_release(&lock->locked, 0);
-}
-
-static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
-{
-	pv_queued_spin_lock_slowpath(lock, val);
-}
-
-static inline void queued_spin_unlock(struct qspinlock *lock)
-{
-	kcsan_release();
-	pv_queued_spin_unlock(lock);
-}
-
-#define vcpu_is_preempted vcpu_is_preempted
-static inline bool vcpu_is_preempted(long cpu)
-{
-	return pv_vcpu_is_preempted(cpu);
-}
+#ifndef CONFIG_PARAVIRT
+static inline void native_pv_lock_init(void) { }
 #endif
 
-#ifdef CONFIG_PARAVIRT
-/*
- * virt_spin_lock_key - disables by default the virt_spin_lock() hijack.
- *
- * Native (and PV wanting native due to vCPU pinning) should keep this key
- * disabled. Native does not touch the key.
- *
- * When in a guest then native_pv_lock_init() enables the key first and
- * KVM/XEN might conditionally disable it later in the boot process again.
- */
-DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
-
-/*
- * Shortcut for the queued_spin_lock_slowpath() function that allows
- * virt to hijack it.
- *
- * Returns:
- *   true - lock has been negotiated, all done;
- *   false - queued_spin_lock_slowpath() will do its thing.
- */
-#define virt_spin_lock virt_spin_lock
-static inline bool virt_spin_lock(struct qspinlock *lock)
-{
-	int val;
-
-	if (!static_branch_likely(&virt_spin_lock_key))
-		return false;
-
-	/*
-	 * On hypervisors without PARAVIRT_SPINLOCKS support we fall
-	 * back to a Test-and-Set spinlock, because fair locks have
-	 * horrible lock 'holder' preemption issues.
-	 */
-
- __retry:
-	val = atomic_read(&lock->val);
-
-	if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
-		cpu_relax();
-		goto __retry;
-	}
-
-	return true;
-}
-
-#endif /* CONFIG_PARAVIRT */
-
 #include <asm-generic/qspinlock.h>
 
 #endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index bc184dd38d99..e9aeeeafad17 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -126,7 +126,7 @@ obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
-obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT)		+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 21b4de55f823..de550b12d9ab 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -829,8 +829,10 @@ static void __init kvm_guest_init(void)
 		has_steal_clock = 1;
 		static_call_update(pv_steal_clock, kvm_steal_clock);
 
-		pv_ops.lock.vcpu_is_preempted =
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+		pv_ops_lock.vcpu_is_preempted =
 			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+#endif
 	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -1126,11 +1128,11 @@ void __init kvm_spinlock_init(void)
 	pr_info("PV spinlocks enabled\n");
 
 	__pv_init_lock_hash();
-	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
-	pv_ops.lock.queued_spin_unlock =
+	pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_ops_lock.queued_spin_unlock =
 		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
-	pv_ops.lock.wait = kvm_wait;
-	pv_ops.lock.kick = kvm_kick_cpu;
+	pv_ops_lock.wait = kvm_wait;
+	pv_ops_lock.kick = kvm_kick_cpu;
 
 	/*
 	 * When PV spinlock is enabled which is preferred over
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 9e1ea99ad9df..95452444868f 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -3,12 +3,22 @@
  * Split spinlock implementation out into its own file, so it can be
  * compiled in a FTRACE-compatible way.
  */
+#include <linux/static_call.h>
 #include <linux/spinlock.h>
 #include <linux/export.h>
 #include <linux/jump_label.h>
 
-#include <asm/paravirt.h>
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
 
+#ifdef CONFIG_SMP
+void __init native_pv_lock_init(void)
+{
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		static_branch_enable(&virt_spin_lock_key);
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
 __visible void __native_queued_spin_unlock(struct qspinlock *lock)
 {
 	native_queued_spin_unlock(lock);
@@ -17,7 +27,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
 
 bool pv_is_native_spin_unlock(void)
 {
-	return pv_ops.lock.queued_spin_unlock.func ==
+	return pv_ops_lock.queued_spin_unlock.func ==
 		__raw_callee_save___native_queued_spin_unlock;
 }
 
@@ -29,7 +39,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
 
 bool pv_is_native_vcpu_is_preempted(void)
 {
-	return pv_ops.lock.vcpu_is_preempted.func ==
+	return pv_ops_lock.vcpu_is_preempted.func ==
 		__raw_callee_save___native_vcpu_is_preempted;
 }
 
@@ -41,3 +51,13 @@ void __init paravirt_set_cap(void)
 	if (!pv_is_native_vcpu_is_preempted())
 		setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT);
 }
+
+struct pv_lock_ops pv_ops_lock = {
+	.queued_spin_lock_slowpath	= native_queued_spin_lock_slowpath,
+	.queued_spin_unlock		= PV_CALLEE_SAVE(__native_queued_spin_unlock),
+	.wait				= paravirt_nop,
+	.kick				= paravirt_nop,
+	.vcpu_is_preempted		= PV_CALLEE_SAVE(__native_vcpu_is_preempted),
+};
+EXPORT_SYMBOL(pv_ops_lock);
+#endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5dfbd3f55792..a6ed52cae003 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -57,14 +57,6 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
 DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
 #endif
 
-DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
-
-void __init native_pv_lock_init(void)
-{
-	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
-		static_branch_enable(&virt_spin_lock_key);
-}
-
 static noinstr void pv_native_safe_halt(void)
 {
 	native_safe_halt();
@@ -221,19 +213,6 @@ struct paravirt_patch_template pv_ops = {
 
 	.mmu.set_fixmap		= native_set_fixmap,
 #endif /* CONFIG_PARAVIRT_XXL */
-
-#if defined(CONFIG_PARAVIRT_SPINLOCKS)
-	/* Lock ops. */
-#ifdef CONFIG_SMP
-	.lock.queued_spin_lock_slowpath	= native_queued_spin_lock_slowpath,
-	.lock.queued_spin_unlock	=
-				PV_CALLEE_SAVE(__native_queued_spin_unlock),
-	.lock.wait			= paravirt_nop,
-	.lock.kick			= paravirt_nop,
-	.lock.vcpu_is_preempted		=
-				PV_CALLEE_SAVE(__native_vcpu_is_preempted),
-#endif /* SMP */
-#endif
 };
 
 #ifdef CONFIG_PARAVIRT_XXL
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index fe56646d6919..83ac24ead289 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -134,10 +134,10 @@ void __init xen_init_spinlocks(void)
 	printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
 
 	__pv_init_lock_hash();
-	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
-	pv_ops.lock.queued_spin_unlock =
+	pv_ops_lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_ops_lock.queued_spin_unlock =
 		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
-	pv_ops.lock.wait = xen_qlock_wait;
-	pv_ops.lock.kick = xen_qlock_kick;
-	pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
+	pv_ops_lock.wait = xen_qlock_wait;
+	pv_ops_lock.kick = xen_qlock_kick;
+	pv_ops_lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index b3fec88d5bd3..c2952df6842c 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -527,6 +527,7 @@ static struct {
 	int idx_off;
 } pv_ops_tables[] = {
 	{ .name = "pv_ops", },
+	{ .name = "pv_ops_lock", },
 	{ .name = NULL, .idx_off = -1 }
 };
 
-- 
2.51.0


^ permalink raw reply related

* [PATCH v5 12/21] x86/paravirt: Move paravirt_sched_clock() related code into tsc.c
From: Juergen Gross @ 2026-01-05 11:05 UTC (permalink / raw)
  To: linux-kernel, x86, virtualization, kvm, linux-hyperv
  Cc: Juergen Gross, Ajay Kaher, Alexey Makhalov,
	Broadcom internal kernel review list, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, H. Peter Anvin,
	Paolo Bonzini, Vitaly Kuznetsov, Boris Ostrovsky,
	K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Daniel Lezcano, xen-devel, Peter Zijlstra (Intel)
In-Reply-To: <20260105110520.21356-1-jgross@suse.com>

The only user of paravirt_sched_clock() is in tsc.c, so move the code
from paravirt.c and paravirt.h to tsc.c.

Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/include/asm/paravirt.h    | 12 ------------
 arch/x86/include/asm/timer.h       |  1 +
 arch/x86/kernel/kvmclock.c         |  1 +
 arch/x86/kernel/paravirt.c         |  7 -------
 arch/x86/kernel/tsc.c              | 10 +++++++++-
 arch/x86/xen/time.c                |  1 +
 drivers/clocksource/hyperv_timer.c |  2 ++
 7 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 766a7cee3d64..b69e75a5c872 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -14,20 +14,8 @@
 #ifndef __ASSEMBLER__
 #include <linux/types.h>
 #include <linux/cpumask.h>
-#include <linux/static_call_types.h>
 #include <asm/frame.h>
 
-u64 dummy_sched_clock(void);
-
-DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
-
-void paravirt_set_sched_clock(u64 (*func)(void));
-
-static __always_inline u64 paravirt_sched_clock(void)
-{
-	return static_call(pv_sched_clock)();
-}
-
 __visible void __native_queued_spin_unlock(struct qspinlock *lock);
 bool pv_is_native_spin_unlock(void);
 __visible bool __native_vcpu_is_preempted(long cpu);
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 23baf8c9b34c..fda18bcb19b4 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -12,6 +12,7 @@ extern void recalibrate_cpu_khz(void);
 extern int no_timer_check;
 
 extern bool using_native_sched_clock(void);
+void paravirt_set_sched_clock(u64 (*func)(void));
 
 /*
  * We use the full linear equation: f(x) = a + b*x, in order to allow
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca0a49eeac4a..b5991d53fc0e 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -19,6 +19,7 @@
 #include <linux/cc_platform.h>
 
 #include <asm/hypervisor.h>
+#include <asm/timer.h>
 #include <asm/x86_init.h>
 #include <asm/kvmclock.h>
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 42991d471bf3..4e37db8073f9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -60,13 +60,6 @@ void __init native_pv_lock_init(void)
 		static_branch_enable(&virt_spin_lock_key);
 }
 
-DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);
-
-void paravirt_set_sched_clock(u64 (*func)(void))
-{
-	static_call_update(pv_sched_clock, func);
-}
-
 static noinstr void pv_native_safe_halt(void)
 {
 	native_safe_halt();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7d3e13e14eab..d5d0b500d13e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -267,19 +267,27 @@ u64 native_sched_clock_from_tsc(u64 tsc)
 /* We need to define a real function for sched_clock, to override the
    weak default version */
 #ifdef CONFIG_PARAVIRT
+DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);
+
 noinstr u64 sched_clock_noinstr(void)
 {
-	return paravirt_sched_clock();
+	return static_call(pv_sched_clock)();
 }
 
 bool using_native_sched_clock(void)
 {
 	return static_call_query(pv_sched_clock) == native_sched_clock;
 }
+
+void paravirt_set_sched_clock(u64 (*func)(void))
+{
+	static_call_update(pv_sched_clock, func);
+}
 #else
 u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
 
 bool using_native_sched_clock(void) { return true; }
+void paravirt_set_sched_clock(u64 (*func)(void)) { }
 #endif
 
 notrace u64 sched_clock(void)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index e4754b2fa900..6f9f665bb7ae 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -19,6 +19,7 @@
 #include <linux/sched/cputime.h>
 
 #include <asm/pvclock.h>
+#include <asm/timer.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/cpuid.h>
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 10356d4ec55c..e9f5034a1bc8 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -535,6 +535,8 @@ static __always_inline void hv_setup_sched_clock(void *sched_clock)
 	sched_clock_register(sched_clock, 64, NSEC_PER_SEC);
 }
 #elif defined CONFIG_PARAVIRT
+#include <asm/timer.h>
+
 static __always_inline void hv_setup_sched_clock(void *sched_clock)
 {
 	/* We're on x86/x64 *and* using PV ops */
-- 
2.51.0


^ permalink raw reply related

* [PATCH v5 01/21] x86/paravirt: Remove not needed includes of paravirt.h
From: Juergen Gross @ 2026-01-05 11:05 UTC (permalink / raw)
  To: linux-kernel, x86, linux-hyperv
  Cc: Juergen Gross, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Peter Zijlstra,
	Will Deacon, Boqun Feng, Waiman Long, Jiri Kosina, Josh Poimboeuf,
	Pawan Gupta, Boris Ostrovsky, xen-devel
In-Reply-To: <20260105110520.21356-1-jgross@suse.com>

In some places asm/paravirt.h is included without really being needed.

Remove the related #include statements.

Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
V3:
- reinstate the include in mmu_context.h (kernel test robot)
V4:
- reinstate the include in arch/x86/kernel/x86_init.c (Boris Petkov)
---
 arch/x86/entry/entry_64.S             | 1 -
 arch/x86/entry/vsyscall/vsyscall_64.c | 1 -
 arch/x86/hyperv/hv_spinlock.c         | 1 -
 arch/x86/include/asm/apic.h           | 4 ----
 arch/x86/include/asm/highmem.h        | 1 -
 arch/x86/include/asm/mshyperv.h       | 1 -
 arch/x86/include/asm/pgtable_32.h     | 1 -
 arch/x86/include/asm/spinlock.h       | 1 -
 arch/x86/include/asm/tlbflush.h       | 4 ----
 arch/x86/kernel/apm_32.c              | 1 -
 arch/x86/kernel/callthunks.c          | 1 -
 arch/x86/kernel/cpu/bugs.c            | 1 -
 arch/x86/kernel/vsmp_64.c             | 1 -
 arch/x86/lib/cache-smp.c              | 1 -
 arch/x86/mm/init.c                    | 1 -
 arch/x86/xen/spinlock.c               | 1 -
 16 files changed, 22 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f9983a1907bf..42447b1e1dff 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -31,7 +31,6 @@
 #include <asm/hw_irq.h>
 #include <asm/page_types.h>
 #include <asm/irqflags.h>
-#include <asm/paravirt.h>
 #include <asm/percpu.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 6e6c0a740837..4bd1e271bb22 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -37,7 +37,6 @@
 #include <asm/unistd.h>
 #include <asm/fixmap.h>
 #include <asm/traps.h>
-#include <asm/paravirt.h>
 
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index 81b006601370..2a3c2afb0154 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -13,7 +13,6 @@
 #include <linux/spinlock.h>
 
 #include <asm/mshyperv.h>
-#include <asm/paravirt.h>
 #include <asm/apic.h>
 #include <asm/msr.h>
 
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index a26e66d66444..9cd493d467d4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -90,10 +90,6 @@ static inline bool apic_from_smp_config(void)
 /*
  * Basic functions accessing APICs.
  */
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-
 static inline void native_apic_mem_write(u32 reg, u32 v)
 {
 	volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 585bdadba47d..decfaaf52326 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -24,7 +24,6 @@
 #include <linux/interrupt.h>
 #include <linux/threads.h>
 #include <asm/tlbflush.h>
-#include <asm/paravirt.h>
 #include <asm/fixmap.h>
 #include <asm/pgtable_areas.h>
 
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index eef4c3a5ba28..f64393e853ee 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -8,7 +8,6 @@
 #include <linux/io.h>
 #include <linux/static_call.h>
 #include <asm/nospec-branch.h>
-#include <asm/paravirt.h>
 #include <asm/msr.h>
 #include <hyperv/hvhdk.h>
 #include <asm/fpu/types.h>
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index b612cc57a4d3..acea0cfa2460 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -16,7 +16,6 @@
 #ifndef __ASSEMBLER__
 #include <asm/processor.h>
 #include <linux/threads.h>
-#include <asm/paravirt.h>
 
 #include <linux/bitops.h>
 #include <linux/list.h>
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 5b6bc7016c22..934632b78d09 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -7,7 +7,6 @@
 #include <asm/page.h>
 #include <asm/processor.h>
 #include <linux/compiler.h>
-#include <asm/paravirt.h>
 #include <asm/bitops.h>
 
 /*
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 00daedfefc1b..238a6b807da5 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -300,10 +300,6 @@ static inline void mm_clear_asid_transition(struct mm_struct *mm) { }
 static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; }
 #endif /* CONFIG_BROADCAST_TLB_FLUSH */
 
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
-
 #define flush_tlb_mm(mm)						\
 		flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
 
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index b37ab1095707..3175d7c134e9 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -229,7 +229,6 @@
 #include <linux/uaccess.h>
 #include <asm/desc.h>
 #include <asm/olpc.h>
-#include <asm/paravirt.h>
 #include <asm/reboot.h>
 #include <asm/nospec-branch.h>
 #include <asm/ibt.h>
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index a951333c5995..e37728f70322 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -15,7 +15,6 @@
 #include <asm/insn.h>
 #include <asm/kexec.h>
 #include <asm/nospec-branch.h>
-#include <asm/paravirt.h>
 #include <asm/sections.h>
 #include <asm/switch_to.h>
 #include <asm/sync_core.h>
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index d0a2847a4bb0..83f51cab0b1e 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -26,7 +26,6 @@
 #include <asm/fpu/api.h>
 #include <asm/msr.h>
 #include <asm/vmx.h>
-#include <asm/paravirt.h>
 #include <asm/cpu_device_id.h>
 #include <asm/e820/api.h>
 #include <asm/hypervisor.h>
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 73511332bb67..25625e3fc183 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -18,7 +18,6 @@
 #include <asm/apic.h>
 #include <asm/pci-direct.h>
 #include <asm/io.h>
-#include <asm/paravirt.h>
 #include <asm/setup.h>
 
 #define TOPOLOGY_REGISTER_OFFSET 0x10
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
index 824664c0ecbd..7d3edd6deb6b 100644
--- a/arch/x86/lib/cache-smp.c
+++ b/arch/x86/lib/cache-smp.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <asm/paravirt.h>
 #include <linux/smp.h>
 #include <linux/export.h>
 #include <linux/kvm_types.h>
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8bf6ad4b9400..76537d40493c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -27,7 +27,6 @@
 #include <asm/pti.h>
 #include <asm/text-patching.h>
 #include <asm/memtype.h>
-#include <asm/paravirt.h>
 #include <asm/mmu_context.h>
 
 /*
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 8e4efe0fb6f9..fe56646d6919 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -8,7 +8,6 @@
 #include <linux/slab.h>
 #include <linux/atomic.h>
 
-#include <asm/paravirt.h>
 #include <asm/qspinlock.h>
 
 #include <xen/events.h>
-- 
2.51.0


^ permalink raw reply related

* [PATCH v5 00/21] paravirt: cleanup and reorg
From: Juergen Gross @ 2026-01-05 11:04 UTC (permalink / raw)
  To: linux-kernel, x86, linux-hyperv, virtualization, loongarch,
	linuxppc-dev, linux-riscv, kvm
  Cc: Juergen Gross, Andy Lutomirski, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Peter Zijlstra,
	Will Deacon, Boqun Feng, Waiman Long, Jiri Kosina, Josh Poimboeuf,
	Pawan Gupta, Boris Ostrovsky, xen-devel, Ajay Kaher,
	Alexey Makhalov, Broadcom internal kernel review list,
	Russell King, Catalin Marinas, Huacai Chen, WANG Xuerui,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy (CS GROUP), Paul Walmsley, Palmer Dabbelt,
	Albert Ou, Alexandre Ghiti, Juri Lelli, Vincent Guittot,
	Dietmar Eggemann, Steven Rostedt, Ben Segall, Mel Gorman,
	Valentin Schneider, linux-arm-kernel, Paolo Bonzini,
	Vitaly Kuznetsov, Stefano Stabellini, Oleksandr Tyshchenko,
	Daniel Lezcano, Oleg Nesterov

Some cleanups and reorg of paravirt code and headers:

- The first 2 patches should not be controversial at all, as they
  just remove some no longer needed #includes and struct forward
  declarations.

- The 3rd patch is removing CONFIG_PARAVIRT_DEBUG, which IMO has
  no real value, as it just changes a crash to a BUG() (the stack
  trace will basically be the same). As the maintainer of the main
  paravirt user (Xen) I have never seen this crash/BUG() happen.

- The 4th patch is just a movement of code.

- I don't know for what reason asm/paravirt_api_clock.h was added,
  as all archs supporting it do it in exactly the same way. Patch
  5 removes it.

- Patches 6-14 are streamlining the paravirt clock interfaces by
  using a common implementation across architectures where possible
  and by moving the related code into common sched code, as this is
  where it should live.

- Patches 15-20 are more like RFC material preparing the paravirt
  infrastructure to support multiple pv_ops function arrays.
  As a prerequisite for that, dropping the Xen static initializers
  of the pv_ops sub-structures makes life in objtool much easier;
  this is done in patches 15-17.
  Patches 18-20 are doing the real preparations for multiple pv_ops
  arrays and using those arrays in multiple headers.

- Patch 21 is an example of how the new scheme can look, using the
  PV-spinlocks.

Changes in V2:
- new patches 13-18 and 20
- complete rework of patch 21

Changes in V3:
- fixed 2 issues detected by kernel test robot

Changes in V4:
- fixed one build issue

Changes in V5:
- fixed another build issue
- rebase

Juergen Gross (21):
  x86/paravirt: Remove not needed includes of paravirt.h
  x86/paravirt: Remove some unneeded struct declarations
  x86/paravirt: Remove PARAVIRT_DEBUG config option
  x86/paravirt: Move thunk macros to paravirt_types.h
  paravirt: Remove asm/paravirt_api_clock.h
  sched: Move clock related paravirt code to kernel/sched
  arm/paravirt: Use common code for paravirt_steal_clock()
  arm64/paravirt: Use common code for paravirt_steal_clock()
  loongarch/paravirt: Use common code for paravirt_steal_clock()
  riscv/paravirt: Use common code for paravirt_steal_clock()
  x86/paravirt: Use common code for paravirt_steal_clock()
  x86/paravirt: Move paravirt_sched_clock() related code into tsc.c
  x86/paravirt: Introduce new paravirt-base.h header
  x86/paravirt: Move pv_native_*() prototypes to paravirt.c
  x86/xen: Drop xen_irq_ops
  x86/xen: Drop xen_cpu_ops
  x86/xen: Drop xen_mmu_ops
  objtool: Allow multiple pv_ops arrays
  x86/paravirt: Allow pv-calls outside paravirt.h
  x86/paravirt: Specify pv_ops array in paravirt macros
  x86/pvlocks: Move paravirt spinlock functions into own header

 arch/Kconfig                                  |   3 +
 arch/arm/Kconfig                              |   1 +
 arch/arm/include/asm/paravirt.h               |  22 --
 arch/arm/include/asm/paravirt_api_clock.h     |   1 -
 arch/arm/kernel/Makefile                      |   1 -
 arch/arm/kernel/paravirt.c                    |  23 --
 arch/arm64/Kconfig                            |   1 +
 arch/arm64/include/asm/paravirt.h             |  14 -
 arch/arm64/include/asm/paravirt_api_clock.h   |   1 -
 arch/arm64/kernel/paravirt.c                  |  11 +-
 arch/loongarch/Kconfig                        |   1 +
 arch/loongarch/include/asm/paravirt.h         |  13 -
 .../include/asm/paravirt_api_clock.h          |   1 -
 arch/loongarch/kernel/paravirt.c              |  10 +-
 arch/powerpc/include/asm/paravirt.h           |   3 -
 arch/powerpc/include/asm/paravirt_api_clock.h |   2 -
 arch/powerpc/platforms/pseries/setup.c        |   4 +-
 arch/riscv/Kconfig                            |   1 +
 arch/riscv/include/asm/paravirt.h             |  14 -
 arch/riscv/include/asm/paravirt_api_clock.h   |   1 -
 arch/riscv/kernel/paravirt.c                  |  11 +-
 arch/x86/Kconfig                              |   8 +-
 arch/x86/entry/entry_64.S                     |   1 -
 arch/x86/entry/vsyscall/vsyscall_64.c         |   1 -
 arch/x86/hyperv/hv_spinlock.c                 |  11 +-
 arch/x86/include/asm/apic.h                   |   4 -
 arch/x86/include/asm/highmem.h                |   1 -
 arch/x86/include/asm/mshyperv.h               |   1 -
 arch/x86/include/asm/paravirt-base.h          |  35 ++
 arch/x86/include/asm/paravirt-spinlock.h      | 145 ++++++++
 arch/x86/include/asm/paravirt.h               | 331 +++++-------------
 arch/x86/include/asm/paravirt_api_clock.h     |   1 -
 arch/x86/include/asm/paravirt_types.h         | 269 +++++++-------
 arch/x86/include/asm/pgtable_32.h             |   1 -
 arch/x86/include/asm/ptrace.h                 |   2 +-
 arch/x86/include/asm/qspinlock.h              |  87 +----
 arch/x86/include/asm/spinlock.h               |   1 -
 arch/x86/include/asm/timer.h                  |   1 +
 arch/x86/include/asm/tlbflush.h               |   4 -
 arch/x86/kernel/Makefile                      |   2 +-
 arch/x86/kernel/apm_32.c                      |   1 -
 arch/x86/kernel/callthunks.c                  |   1 -
 arch/x86/kernel/cpu/bugs.c                    |   1 -
 arch/x86/kernel/cpu/vmware.c                  |   1 +
 arch/x86/kernel/kvm.c                         |  13 +-
 arch/x86/kernel/kvmclock.c                    |   1 +
 arch/x86/kernel/paravirt-spinlocks.c          |  26 +-
 arch/x86/kernel/paravirt.c                    |  42 +--
 arch/x86/kernel/tsc.c                         |  10 +-
 arch/x86/kernel/vsmp_64.c                     |   1 -
 arch/x86/lib/cache-smp.c                      |   1 -
 arch/x86/mm/init.c                            |   1 -
 arch/x86/xen/enlighten_pv.c                   |  82 ++---
 arch/x86/xen/irq.c                            |  20 +-
 arch/x86/xen/mmu_pv.c                         | 100 ++----
 arch/x86/xen/spinlock.c                       |  11 +-
 arch/x86/xen/time.c                           |   2 +
 drivers/clocksource/hyperv_timer.c            |   2 +
 drivers/xen/time.c                            |   2 +-
 include/linux/sched/cputime.h                 |  18 +
 kernel/sched/core.c                           |   5 +
 kernel/sched/cputime.c                        |  13 +
 kernel/sched/sched.h                          |   3 +-
 tools/objtool/arch/x86/decode.c               |   8 +-
 tools/objtool/check.c                         |  78 ++++-
 tools/objtool/include/objtool/check.h         |   1 +
 66 files changed, 662 insertions(+), 827 deletions(-)
 delete mode 100644 arch/arm/include/asm/paravirt.h
 delete mode 100644 arch/arm/include/asm/paravirt_api_clock.h
 delete mode 100644 arch/arm/kernel/paravirt.c
 delete mode 100644 arch/arm64/include/asm/paravirt_api_clock.h
 delete mode 100644 arch/loongarch/include/asm/paravirt_api_clock.h
 delete mode 100644 arch/powerpc/include/asm/paravirt_api_clock.h
 delete mode 100644 arch/riscv/include/asm/paravirt_api_clock.h
 create mode 100644 arch/x86/include/asm/paravirt-base.h
 create mode 100644 arch/x86/include/asm/paravirt-spinlock.h
 delete mode 100644 arch/x86/include/asm/paravirt_api_clock.h

-- 
2.51.0


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox