Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH net-next 1/9] net: benet: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Since ETHTOOL_GRXRINGS was the only command handled by be_get_rxnfc(),
remove the function entirely.

Since the be_multi_rxq() check in be_get_rxnfc() previously blocked RSS
configuration on single-queue setups (via ethtool core validation), add
an equivalent check to be_set_rxfh() to preserve this behavior, as
suggested by Jakub.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/emulex/benet/be_ethtool.c | 37 ++++++++++----------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c
index f55f1fd5d90fd..87dbbd5b7f4e6 100644
--- a/drivers/net/ethernet/emulex/benet/be_ethtool.c
+++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c
@@ -1073,6 +1073,13 @@ static void be_set_msg_level(struct net_device *netdev, u32 level)
 	adapter->msg_enable = level;
 }
 
+static u32 be_get_rx_ring_count(struct net_device *netdev)
+{
+	struct be_adapter *adapter = netdev_priv(netdev);
+
+	return adapter->num_rx_qs;
+}
+
 static int be_get_rxfh_fields(struct net_device *netdev,
 			      struct ethtool_rxfh_fields *cmd)
 {
@@ -1117,28 +1124,6 @@ static int be_get_rxfh_fields(struct net_device *netdev,
 	return 0;
 }
 
-static int be_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd,
-			u32 *rule_locs)
-{
-	struct be_adapter *adapter = netdev_priv(netdev);
-
-	if (!be_multi_rxq(adapter)) {
-		dev_info(&adapter->pdev->dev,
-			 "ethtool::get_rxnfc: RX flow hashing is disabled\n");
-		return -EINVAL;
-	}
-
-	switch (cmd->cmd) {
-	case ETHTOOL_GRXRINGS:
-		cmd->data = adapter->num_rx_qs;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 static int be_set_rxfh_fields(struct net_device *netdev,
 			      const struct ethtool_rxfh_fields *cmd,
 			      struct netlink_ext_ack *extack)
@@ -1293,6 +1278,12 @@ static int be_set_rxfh(struct net_device *netdev,
 	u8 *hkey = rxfh->key;
 	u8 rsstable[RSS_INDIR_TABLE_LEN];
 
+	if (!be_multi_rxq(adapter)) {
+		dev_info(&adapter->pdev->dev,
+			 "ethtool::set_rxfh: RX flow hashing is disabled\n");
+		return -EINVAL;
+	}
+
 	/* We do not allow change in unsupported parameters */
 	if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
 	    rxfh->hfunc != ETH_RSS_HASH_TOP)
@@ -1441,7 +1432,7 @@ const struct ethtool_ops be_ethtool_ops = {
 	.get_ethtool_stats = be_get_ethtool_stats,
 	.flash_device = be_do_flash,
 	.self_test = be_self_test,
-	.get_rxnfc = be_get_rxnfc,
+	.get_rx_ring_count = be_get_rx_ring_count,
 	.get_rxfh_fields = be_get_rxfh_fields,
 	.set_rxfh_fields = be_set_rxfh_fields,
 	.get_rxfh_indir_size = be_get_rxfh_indir_size,

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 2/9] net: atlantic: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
index 6fef47ba0a59b..d8b5491c9cb2b 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
@@ -500,20 +500,22 @@ static int aq_ethtool_set_rss(struct net_device *netdev,
 	return err;
 }
 
+static u32 aq_ethtool_get_rx_ring_count(struct net_device *ndev)
+{
+	struct aq_nic_s *aq_nic = netdev_priv(ndev);
+	struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(aq_nic);
+
+	return cfg->vecs;
+}
+
 static int aq_ethtool_get_rxnfc(struct net_device *ndev,
 				struct ethtool_rxnfc *cmd,
 				u32 *rule_locs)
 {
 	struct aq_nic_s *aq_nic = netdev_priv(ndev);
-	struct aq_nic_cfg_s *cfg;
 	int err = 0;
 
-	cfg = aq_nic_get_cfg(aq_nic);
-
 	switch (cmd->cmd) {
-	case ETHTOOL_GRXRINGS:
-		cmd->data = cfg->vecs;
-		break;
 	case ETHTOOL_GRXCLSRLCNT:
 		cmd->rule_cnt = aq_get_rxnfc_count_all_rules(aq_nic);
 		break;
@@ -1072,6 +1074,7 @@ const struct ethtool_ops aq_ethtool_ops = {
 	.set_rxfh            = aq_ethtool_set_rss,
 	.get_rxnfc           = aq_ethtool_get_rxnfc,
 	.set_rxnfc           = aq_ethtool_set_rxnfc,
+	.get_rx_ring_count   = aq_ethtool_get_rx_ring_count,
 	.get_msglevel        = aq_get_msg_level,
 	.set_msglevel        = aq_set_msg_level,
 	.get_sset_count      = aq_ethtool_get_sset_count,

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 3/9] net: nfp: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 16c828dd5c1a3..e88b1c4732a57 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -1435,15 +1435,19 @@ static int nfp_net_get_fs_loc(struct nfp_net *nn, u32 *rule_locs)
 	return 0;
 }
 
+static u32 nfp_net_get_rx_ring_count(struct net_device *netdev)
+{
+	struct nfp_net *nn = netdev_priv(netdev);
+
+	return nn->dp.num_rx_rings;
+}
+
 static int nfp_net_get_rxnfc(struct net_device *netdev,
 			     struct ethtool_rxnfc *cmd, u32 *rule_locs)
 {
 	struct nfp_net *nn = netdev_priv(netdev);
 
 	switch (cmd->cmd) {
-	case ETHTOOL_GRXRINGS:
-		cmd->data = nn->dp.num_rx_rings;
-		return 0;
 	case ETHTOOL_GRXCLSRLCNT:
 		cmd->rule_cnt = nn->fs.count;
 		return 0;
@@ -2501,6 +2505,7 @@ static const struct ethtool_ops nfp_net_ethtool_ops = {
 	.get_sset_count		= nfp_net_get_sset_count,
 	.get_rxnfc		= nfp_net_get_rxnfc,
 	.set_rxnfc		= nfp_net_set_rxnfc,
+	.get_rx_ring_count	= nfp_net_get_rx_ring_count,
 	.get_rxfh_indir_size	= nfp_net_get_rxfh_indir_size,
 	.get_rxfh_key_size	= nfp_net_get_rxfh_key_size,
 	.get_rxfh		= nfp_net_get_rxfh,

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 4/9] net: mana: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Since ETHTOOL_GRXRINGS was the only command handled by mana_get_rxnfc(),
remove the function entirely.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 0e2f4343ac67f..f2d220b371b5d 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -282,18 +282,11 @@ static void mana_get_ethtool_stats(struct net_device *ndev,
 	}
 }
 
-static int mana_get_rxnfc(struct net_device *ndev, struct ethtool_rxnfc *cmd,
-			  u32 *rules)
+static u32 mana_get_rx_ring_count(struct net_device *ndev)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 
-	switch (cmd->cmd) {
-	case ETHTOOL_GRXRINGS:
-		cmd->data = apc->num_queues;
-		return 0;
-	}
-
-	return -EOPNOTSUPP;
+	return apc->num_queues;
 }
 
 static u32 mana_get_rxfh_key_size(struct net_device *ndev)
@@ -520,7 +513,7 @@ const struct ethtool_ops mana_ethtool_ops = {
 	.get_ethtool_stats	= mana_get_ethtool_stats,
 	.get_sset_count		= mana_get_sset_count,
 	.get_strings		= mana_get_strings,
-	.get_rxnfc		= mana_get_rxnfc,
+	.get_rx_ring_count	= mana_get_rx_ring_count,
 	.get_rxfh_key_size	= mana_get_rxfh_key_size,
 	.get_rxfh_indir_size	= mana_rss_indir_size,
 	.get_rxfh		= mana_get_rxfh,

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 5/9] net: fbnic: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c
index 61b8005a0db5f..11745a2d8a443 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c
@@ -825,6 +825,13 @@ static int fbnic_get_cls_rule(struct fbnic_net *fbn, struct ethtool_rxnfc *cmd)
 	return 0;
 }
 
+static u32 fbnic_get_rx_ring_count(struct net_device *netdev)
+{
+	struct fbnic_net *fbn = netdev_priv(netdev);
+
+	return fbn->num_rx_queues;
+}
+
 static int fbnic_get_rxnfc(struct net_device *netdev,
 			   struct ethtool_rxnfc *cmd, u32 *rule_locs)
 {
@@ -833,10 +840,6 @@ static int fbnic_get_rxnfc(struct net_device *netdev,
 	u32 special = 0;
 
 	switch (cmd->cmd) {
-	case ETHTOOL_GRXRINGS:
-		cmd->data = fbn->num_rx_queues;
-		ret = 0;
-		break;
 	case ETHTOOL_GRXCLSRULE:
 		ret = fbnic_get_cls_rule(fbn, cmd);
 		break;
@@ -1895,6 +1898,7 @@ static const struct ethtool_ops fbnic_ethtool_ops = {
 	.get_sset_count			= fbnic_get_sset_count,
 	.get_rxnfc			= fbnic_get_rxnfc,
 	.set_rxnfc			= fbnic_set_rxnfc,
+	.get_rx_ring_count		= fbnic_get_rx_ring_count,
 	.get_rxfh_key_size		= fbnic_get_rxfh_key_size,
 	.get_rxfh_indir_size		= fbnic_get_rxfh_indir_size,
 	.get_rxfh			= fbnic_get_rxfh,

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 6/9] net: ionic: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Since ETHTOOL_GRXRINGS was the only command handled by ionic_get_rxnfc(),
remove the function entirely.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/pensando/ionic/ionic_ethtool.c | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
index 2d9efadb5d2ae..b0a459eeaa640 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
@@ -843,23 +843,11 @@ static int ionic_set_channels(struct net_device *netdev,
 	return err;
 }
 
-static int ionic_get_rxnfc(struct net_device *netdev,
-			   struct ethtool_rxnfc *info, u32 *rules)
+static u32 ionic_get_rx_ring_count(struct net_device *netdev)
 {
 	struct ionic_lif *lif = netdev_priv(netdev);
-	int err = 0;
-
-	switch (info->cmd) {
-	case ETHTOOL_GRXRINGS:
-		info->data = lif->nxqs;
-		break;
-	default:
-		netdev_dbg(netdev, "Command parameter %d is not supported\n",
-			   info->cmd);
-		err = -EOPNOTSUPP;
-	}
 
-	return err;
+	return lif->nxqs;
 }
 
 static u32 ionic_get_rxfh_indir_size(struct net_device *netdev)
@@ -1152,7 +1140,7 @@ static const struct ethtool_ops ionic_ethtool_ops = {
 	.get_strings		= ionic_get_strings,
 	.get_ethtool_stats	= ionic_get_stats,
 	.get_sset_count		= ionic_get_sset_count,
-	.get_rxnfc		= ionic_get_rxnfc,
+	.get_rx_ring_count	= ionic_get_rx_ring_count,
 	.get_rxfh_indir_size	= ionic_get_rxfh_indir_size,
 	.get_rxfh_key_size	= ionic_get_rxfh_key_size,
 	.get_rxfh		= ionic_get_rxfh,

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 7/9] net: sfc: efx: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/sfc/ef100_ethtool.c  |  1 +
 drivers/net/ethernet/sfc/ethtool.c        |  1 +
 drivers/net/ethernet/sfc/ethtool_common.c | 11 +++++++----
 drivers/net/ethernet/sfc/ethtool_common.h |  1 +
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef100_ethtool.c b/drivers/net/ethernet/sfc/ef100_ethtool.c
index 6c3b74000d3b6..05dc7b10c8855 100644
--- a/drivers/net/ethernet/sfc/ef100_ethtool.c
+++ b/drivers/net/ethernet/sfc/ef100_ethtool.c
@@ -54,6 +54,7 @@ const struct ethtool_ops ef100_ethtool_ops = {
 	.get_ethtool_stats	= efx_ethtool_get_stats,
 	.get_rxnfc              = efx_ethtool_get_rxnfc,
 	.set_rxnfc              = efx_ethtool_set_rxnfc,
+	.get_rx_ring_count	= efx_ethtool_get_rx_ring_count,
 	.reset                  = efx_ethtool_reset,
 
 	.get_rxfh_indir_size	= efx_ethtool_get_rxfh_indir_size,
diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
index 18fe5850a9786..362388754a292 100644
--- a/drivers/net/ethernet/sfc/ethtool.c
+++ b/drivers/net/ethernet/sfc/ethtool.c
@@ -261,6 +261,7 @@ const struct ethtool_ops efx_ethtool_ops = {
 	.reset			= efx_ethtool_reset,
 	.get_rxnfc		= efx_ethtool_get_rxnfc,
 	.set_rxnfc		= efx_ethtool_set_rxnfc,
+	.get_rx_ring_count	= efx_ethtool_get_rx_ring_count,
 	.get_rxfh_indir_size	= efx_ethtool_get_rxfh_indir_size,
 	.get_rxfh_key_size	= efx_ethtool_get_rxfh_key_size,
 	.rxfh_per_ctx_fields	= true,
diff --git a/drivers/net/ethernet/sfc/ethtool_common.c b/drivers/net/ethernet/sfc/ethtool_common.c
index fa303e171d98b..2fc42b1a2bfb7 100644
--- a/drivers/net/ethernet/sfc/ethtool_common.c
+++ b/drivers/net/ethernet/sfc/ethtool_common.c
@@ -850,6 +850,13 @@ int efx_ethtool_get_rxfh_fields(struct net_device *net_dev,
 	return rc;
 }
 
+u32 efx_ethtool_get_rx_ring_count(struct net_device *net_dev)
+{
+	struct efx_nic *efx = efx_netdev_priv(net_dev);
+
+	return efx->n_rx_channels;
+}
+
 int efx_ethtool_get_rxnfc(struct net_device *net_dev,
 			  struct ethtool_rxnfc *info, u32 *rule_locs)
 {
@@ -858,10 +865,6 @@ int efx_ethtool_get_rxnfc(struct net_device *net_dev,
 	s32 rc = 0;
 
 	switch (info->cmd) {
-	case ETHTOOL_GRXRINGS:
-		info->data = efx->n_rx_channels;
-		return 0;
-
 	case ETHTOOL_GRXCLSRLCNT:
 		info->data = efx_filter_get_rx_id_limit(efx);
 		if (info->data == 0)
diff --git a/drivers/net/ethernet/sfc/ethtool_common.h b/drivers/net/ethernet/sfc/ethtool_common.h
index 24db4fccbe78a..f96db42534546 100644
--- a/drivers/net/ethernet/sfc/ethtool_common.h
+++ b/drivers/net/ethernet/sfc/ethtool_common.h
@@ -40,6 +40,7 @@ int efx_ethtool_set_fecparam(struct net_device *net_dev,
 			     struct ethtool_fecparam *fecparam);
 int efx_ethtool_get_rxnfc(struct net_device *net_dev,
 			  struct ethtool_rxnfc *info, u32 *rule_locs);
+u32 efx_ethtool_get_rx_ring_count(struct net_device *net_dev);
 int efx_ethtool_set_rxnfc(struct net_device *net_dev,
 			  struct ethtool_rxnfc *info);
 u32 efx_ethtool_get_rxfh_indir_size(struct net_device *net_dev);

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 8/9] net: sfc: siena: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/sfc/siena/ethtool.c        |  1 +
 drivers/net/ethernet/sfc/siena/ethtool_common.c | 11 +++++++----
 drivers/net/ethernet/sfc/siena/ethtool_common.h |  1 +
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/siena/ethtool.c b/drivers/net/ethernet/sfc/siena/ethtool.c
index 8c3ebd0617fb1..36feedffe4444 100644
--- a/drivers/net/ethernet/sfc/siena/ethtool.c
+++ b/drivers/net/ethernet/sfc/siena/ethtool.c
@@ -261,6 +261,7 @@ const struct ethtool_ops efx_siena_ethtool_ops = {
 	.reset			= efx_siena_ethtool_reset,
 	.get_rxnfc		= efx_siena_ethtool_get_rxnfc,
 	.set_rxnfc		= efx_siena_ethtool_set_rxnfc,
+	.get_rx_ring_count	= efx_siena_ethtool_get_rx_ring_count,
 	.get_rxfh_indir_size	= efx_siena_ethtool_get_rxfh_indir_size,
 	.get_rxfh_key_size	= efx_siena_ethtool_get_rxfh_key_size,
 	.get_rxfh		= efx_siena_ethtool_get_rxfh,
diff --git a/drivers/net/ethernet/sfc/siena/ethtool_common.c b/drivers/net/ethernet/sfc/siena/ethtool_common.c
index 47cd16a113cf1..c56e0b54d8541 100644
--- a/drivers/net/ethernet/sfc/siena/ethtool_common.c
+++ b/drivers/net/ethernet/sfc/siena/ethtool_common.c
@@ -841,6 +841,13 @@ int efx_siena_ethtool_get_rxfh_fields(struct net_device *net_dev,
 	return 0;
 }
 
+u32 efx_siena_ethtool_get_rx_ring_count(struct net_device *net_dev)
+{
+	struct efx_nic *efx = netdev_priv(net_dev);
+
+	return efx->n_rx_channels;
+}
+
 int efx_siena_ethtool_get_rxnfc(struct net_device *net_dev,
 				struct ethtool_rxnfc *info, u32 *rule_locs)
 {
@@ -849,10 +856,6 @@ int efx_siena_ethtool_get_rxnfc(struct net_device *net_dev,
 	s32 rc = 0;
 
 	switch (info->cmd) {
-	case ETHTOOL_GRXRINGS:
-		info->data = efx->n_rx_channels;
-		return 0;
-
 	case ETHTOOL_GRXCLSRLCNT:
 		info->data = efx_filter_get_rx_id_limit(efx);
 		if (info->data == 0)
diff --git a/drivers/net/ethernet/sfc/siena/ethtool_common.h b/drivers/net/ethernet/sfc/siena/ethtool_common.h
index 278d69e920d9f..7b445b0ba38aa 100644
--- a/drivers/net/ethernet/sfc/siena/ethtool_common.h
+++ b/drivers/net/ethernet/sfc/siena/ethtool_common.h
@@ -37,6 +37,7 @@ int efx_siena_ethtool_set_fecparam(struct net_device *net_dev,
 				   struct ethtool_fecparam *fecparam);
 int efx_siena_ethtool_get_rxnfc(struct net_device *net_dev,
 				struct ethtool_rxnfc *info, u32 *rule_locs);
+u32 efx_siena_ethtool_get_rx_ring_count(struct net_device *net_dev);
 int efx_siena_ethtool_set_rxnfc(struct net_device *net_dev,
 				struct ethtool_rxnfc *info);
 u32 efx_siena_ethtool_get_rxfh_indir_size(struct net_device *net_dev);

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 9/9] net: sfc: falcon: convert to use .get_rx_ring_count
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao
In-Reply-To: <20260121-grxring_big_v4-v1-0-07655be56bcf@debian.org>

Use the newly introduced .get_rx_ring_count ethtool ops callback instead
of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 drivers/net/ethernet/sfc/falcon/ethtool.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/falcon/ethtool.c b/drivers/net/ethernet/sfc/falcon/ethtool.c
index 27d1cd6f24ca1..0493640315454 100644
--- a/drivers/net/ethernet/sfc/falcon/ethtool.c
+++ b/drivers/net/ethernet/sfc/falcon/ethtool.c
@@ -974,6 +974,13 @@ ef4_ethtool_get_rxfh_fields(struct net_device *net_dev,
 	return 0;
 }
 
+static u32 ef4_ethtool_get_rx_ring_count(struct net_device *net_dev)
+{
+	struct ef4_nic *efx = netdev_priv(net_dev);
+
+	return efx->n_rx_channels;
+}
+
 static int
 ef4_ethtool_get_rxnfc(struct net_device *net_dev,
 		      struct ethtool_rxnfc *info, u32 *rule_locs)
@@ -981,10 +988,6 @@ ef4_ethtool_get_rxnfc(struct net_device *net_dev,
 	struct ef4_nic *efx = netdev_priv(net_dev);
 
 	switch (info->cmd) {
-	case ETHTOOL_GRXRINGS:
-		info->data = efx->n_rx_channels;
-		return 0;
-
 	case ETHTOOL_GRXCLSRLCNT:
 		info->data = ef4_filter_get_rx_id_limit(efx);
 		if (info->data == 0)
@@ -1348,6 +1351,7 @@ const struct ethtool_ops ef4_ethtool_ops = {
 	.reset			= ef4_ethtool_reset,
 	.get_rxnfc		= ef4_ethtool_get_rxnfc,
 	.set_rxnfc		= ef4_ethtool_set_rxnfc,
+	.get_rx_ring_count	= ef4_ethtool_get_rx_ring_count,
 	.get_rxfh_indir_size	= ef4_ethtool_get_rxfh_indir_size,
 	.get_rxfh		= ef4_ethtool_get_rxfh,
 	.set_rxfh		= ef4_ethtool_set_rxfh,

-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH net-next v15 01/12] vsock: add netns to vsock core
From: Paolo Abeni @ 2026-01-21 16:32 UTC (permalink / raw)
  To: Stefano Garzarella, Bobby Eshleman
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Simon Horman,
	Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
	Eugenio Pérez, Xuan Zhuo, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	Broadcom internal kernel review list, Shuah Khan, Long Li,
	Jonathan Corbet, linux-kernel, virtualization, netdev, kvm,
	linux-hyperv, linux-kselftest, berrange, Sargun Dhillon,
	linux-doc, Bobby Eshleman
In-Reply-To: <aXDYfYy3f1NQm5A0@sgarzare-redhat>

On 1/21/26 3:48 PM, Stefano Garzarella wrote:
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>> index a8d0afde7f85..b6e3bfe365a1 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -8253,6 +8253,20 @@ Kernel parameters
>> 			            them quite hard to use for exploits but
>> 			            might break your system.
>>
>> +	vsock_init_ns_mode=
>> +			[KNL,NET] Set the vsock namespace mode for the init
>> +			(root) network namespace.
>> +
>> +			global      [default] The init namespace operates in
>> +			            global mode where CIDs are system-wide and
>> +			            sockets can communicate across global
>> +			            namespaces.
>> +
>> +			local       The init namespace operates in local mode
>> +			            where CIDs are private to the namespace and
>> +			            sockets can only communicate within the same
>> +			            namespace.
>> +
> 
> My comment on v14 was more to start a discussion :-) sorry to not be 
> clear.
> 
> I briefly discussed it with Paolo in chat to better understand our 
> policy between cmdline parameters and module parameters, and it seems 
> that both are discouraged.

Double checking the git log it looks like __setup() usage is less
constrained/restricted than what I thought.

> So he asked me if we have a use case for this, and thinking about it, I 
> don't have one at the moment. Also, if a user decides to set all netns 
> to local, whether init_net is local or global doesn't really matter, 
> right?
> 
> So perhaps before adding this, we should have a real use case.
> Perhaps more than this feature, I would add a way to change the default 
> of all netns (including init_net) from global to local. But we can do 
> that later, since all netns have a way to understand what mode they are 
> in, so we don't break anything and the user has to explicitly change it, 
> knowing that they are breaking compatibility with pre-netns support.

Lacking a clear use-case for vsock_init_ns_mode I tend to think it would
be better to postpone its introduction. It should be easier to add it
later than vice-versa.

If there is a clear/well defined/known use-case, I guess the series can
go as-is.

/P


^ permalink raw reply

* Re: [PATCH net-next 6/9] net: ionic: convert to use .get_rx_ring_count
From: Creeley, Brett @ 2026-01-21 16:47 UTC (permalink / raw)
  To: Breno Leitao, Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Igor Russkikh, Simon Horman, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Alexander Duyck,
	kernel-team, Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers
In-Reply-To: <20260121-grxring_big_v4-v1-6-07655be56bcf@debian.org>



On 1/21/2026 7:54 AM, Breno Leitao wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Use the newly introduced .get_rx_ring_count ethtool ops callback instead
> of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().
>
> Since ETHTOOL_GRXRINGS was the only command handled by ionic_get_rxnfc(),
> remove the function entirely.
>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>   drivers/net/ethernet/pensando/ionic/ionic_ethtool.c | 18 +++---------------
>   1 file changed, 3 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
> index 2d9efadb5d2ae..b0a459eeaa640 100644
> --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
> +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c
> @@ -843,23 +843,11 @@ static int ionic_set_channels(struct net_device *netdev,
>          return err;
>   }
>
> -static int ionic_get_rxnfc(struct net_device *netdev,
> -                          struct ethtool_rxnfc *info, u32 *rules)
> +static u32 ionic_get_rx_ring_count(struct net_device *netdev)
>   {
>          struct ionic_lif *lif = netdev_priv(netdev);
> -       int err = 0;
> -
> -       switch (info->cmd) {
> -       case ETHTOOL_GRXRINGS:
> -               info->data = lif->nxqs;
> -               break;
> -       default:
> -               netdev_dbg(netdev, "Command parameter %d is not supported\n",
> -                          info->cmd);
> -               err = -EOPNOTSUPP;
> -       }
>
> -       return err;
> +       return lif->nxqs;
>   }
>
>   static u32 ionic_get_rxfh_indir_size(struct net_device *netdev)
> @@ -1152,7 +1140,7 @@ static const struct ethtool_ops ionic_ethtool_ops = {
>          .get_strings            = ionic_get_strings,
>          .get_ethtool_stats      = ionic_get_stats,
>          .get_sset_count         = ionic_get_sset_count,
> -       .get_rxnfc              = ionic_get_rxnfc,
> +       .get_rx_ring_count      = ionic_get_rx_ring_count,

LGTM. Thanks.

Reviewed-by: Brett Creeley <brett.creeley@amd.com>

>          .get_rxfh_indir_size    = ionic_get_rxfh_indir_size,
>          .get_rxfh_key_size      = ionic_get_rxfh_key_size,
>          .get_rxfh               = ionic_get_rxfh,
>
> --
> 2.47.3
>


^ permalink raw reply

* Re: [PATCH net-next 2/9] net: atlantic: convert to use .get_rx_ring_count
From: Creeley, Brett @ 2026-01-21 16:49 UTC (permalink / raw)
  To: Breno Leitao, Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Igor Russkikh, Simon Horman, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Alexander Duyck,
	kernel-team, Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers
In-Reply-To: <20260121-grxring_big_v4-v1-2-07655be56bcf@debian.org>



On 1/21/2026 7:54 AM, Breno Leitao wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> Use the newly introduced .get_rx_ring_count ethtool ops callback instead
> of handling ETHTOOL_GRXRINGS directly in .get_rxnfc().
>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
>   drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c | 15 +++++++++------
>   1 file changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
> index 6fef47ba0a59b..d8b5491c9cb2b 100644
> --- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
> +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
> @@ -500,20 +500,22 @@ static int aq_ethtool_set_rss(struct net_device *netdev,
>          return err;
>   }
>
> +static u32 aq_ethtool_get_rx_ring_count(struct net_device *ndev)
> +{
> +       struct aq_nic_s *aq_nic = netdev_priv(ndev);
> +       struct aq_nic_cfg_s *cfg = aq_nic_get_cfg(aq_nic);
> +
> +       return cfg->vecs;
> +}
> +

Tiny nit, but reverse Christmas tree (RCT) ordering of the local
variable declarations is not maintained here.

Thanks,

Brett
>   static int aq_ethtool_get_rxnfc(struct net_device *ndev,
>                                  struct ethtool_rxnfc *cmd,
>                                  u32 *rule_locs)
>   {
>          struct aq_nic_s *aq_nic = netdev_priv(ndev);
> -       struct aq_nic_cfg_s *cfg;
>          int err = 0;
>
> -       cfg = aq_nic_get_cfg(aq_nic);
> -
>          switch (cmd->cmd) {
> -       case ETHTOOL_GRXRINGS:
> -               cmd->data = cfg->vecs;
> -               break;
>          case ETHTOOL_GRXCLSRLCNT:
>                  cmd->rule_cnt = aq_get_rxnfc_count_all_rules(aq_nic);
>                  break;
> @@ -1072,6 +1074,7 @@ const struct ethtool_ops aq_ethtool_ops = {
>          .set_rxfh            = aq_ethtool_set_rss,
>          .get_rxnfc           = aq_ethtool_get_rxnfc,
>          .set_rxnfc           = aq_ethtool_set_rxnfc,
> +       .get_rx_ring_count   = aq_ethtool_get_rx_ring_count,
>          .get_msglevel        = aq_get_msg_level,
>          .set_msglevel        = aq_set_msg_level,
>          .get_sset_count      = aq_ethtool_get_sset_count,
>
> --
> 2.47.3
>


^ permalink raw reply

* [PATCH net-next 0/9] net: convert drivers to .get_rx_ring_count (last part)
From: Breno Leitao @ 2026-01-21 15:54 UTC (permalink / raw)
  To: Ajit Khaparde, Sriharsha Basavapatna, Somnath Kotur, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Igor Russkikh, Simon Horman, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Alexander Duyck, kernel-team,
	Edward Cree, Brett Creeley
  Cc: netdev, linux-kernel, oss-drivers, linux-hyperv,
	linux-net-drivers, Breno Leitao

Commit 84eaf4359c36 ("net: ethtool: add get_rx_ring_count callback to
optimize RX ring queries") added specific support for GRXRINGS callback,
simplifying .get_rxnfc.

Remove the handling of GRXRINGS in .get_rxnfc() by moving it to the new
.get_rx_ring_count().

This simplifies the RX ring count retrieval and aligns the following
drivers with the new ethtool API for querying RX ring parameters.

 * sfc
 * ionic
 * sfc/siena
 * sfc/ef100
 * fbnic
 * mana
 * nfp
 * atlantic
 * benet (this is v2 in fact, where v1 had some discussions that
   required a v2). See link [0]

Link: https://lore.kernel.org/all/20260119094514.5b12a097@kernel.org/ [0]

This is covering the last drivers, and as soon as this lands, I will
change the ethtool framework to avoid calling .get_rxnfc for
ETHTOOL_GRXRINGS, simplifying the ethtool core framework.

Part 1 is already merged in net-next and can be seen in
https://lore.kernel.org/all/20260109-grxring_big_v1-v1-0-a0f77f732006@debian.org/

Part 2 is already merged in net-next except benet driver, which is now included
in here
https://lore.kernel.org/all/20260115-grxring_big_v2-v1-0-b3e1b58bced5@debian.org/

PS: all of these changes were compile-tested only.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
Breno Leitao (9):
      net: benet: convert to use .get_rx_ring_count
      net: atlantic: convert to use .get_rx_ring_count
      net: nfp: convert to use .get_rx_ring_count
      net: mana: convert to use .get_rx_ring_count
      net: fbnic: convert to use .get_rx_ring_count
      net: ionic: convert to use .get_rx_ring_count
      net: sfc: efx: convert to use .get_rx_ring_count
      net: sfc: siena: convert to use .get_rx_ring_count
      net: sfc: falcon: convert to use .get_rx_ring_count

 .../net/ethernet/aquantia/atlantic/aq_ethtool.c    | 15 +++++----
 drivers/net/ethernet/emulex/benet/be_ethtool.c     | 37 ++++++++--------------
 drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c    | 12 ++++---
 drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 13 ++------
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   | 11 +++++--
 .../net/ethernet/pensando/ionic/ionic_ethtool.c    | 18 ++---------
 drivers/net/ethernet/sfc/ef100_ethtool.c           |  1 +
 drivers/net/ethernet/sfc/ethtool.c                 |  1 +
 drivers/net/ethernet/sfc/ethtool_common.c          | 11 ++++---
 drivers/net/ethernet/sfc/ethtool_common.h          |  1 +
 drivers/net/ethernet/sfc/falcon/ethtool.c          | 12 ++++---
 drivers/net/ethernet/sfc/siena/ethtool.c           |  1 +
 drivers/net/ethernet/sfc/siena/ethtool_common.c    | 11 ++++---
 drivers/net/ethernet/sfc/siena/ethtool_common.h    |  1 +
 14 files changed, 72 insertions(+), 73 deletions(-)
---
base-commit: d8f87aa5fa0a4276491fa8ef436cd22605a3f9ba
change-id: 20260121-grxring_big_v4-55037f9e001e

Best regards,
--  
Breno Leitao <leitao@debian.org>


^ permalink raw reply

* Re: [PATCH net-next v15 01/12] vsock: add netns to vsock core
From: Bobby Eshleman @ 2026-01-21 17:36 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Stefano Garzarella, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Simon Horman, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
	Eugenio Pérez, Xuan Zhuo, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	Broadcom internal kernel review list, Shuah Khan, Long Li,
	Jonathan Corbet, linux-kernel, virtualization, netdev, kvm,
	linux-hyperv, linux-kselftest, berrange, Sargun Dhillon,
	linux-doc, Bobby Eshleman
In-Reply-To: <4997118e-471c-45fe-bc1f-8f6140199db5@redhat.com>

On Wed, Jan 21, 2026 at 05:32:34PM +0100, Paolo Abeni wrote:
> On 1/21/26 3:48 PM, Stefano Garzarella wrote:
> >> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> >> index a8d0afde7f85..b6e3bfe365a1 100644
> >> --- a/Documentation/admin-guide/kernel-parameters.txt
> >> +++ b/Documentation/admin-guide/kernel-parameters.txt
> >> @@ -8253,6 +8253,20 @@ Kernel parameters
> >> 			            them quite hard to use for exploits but
> >> 			            might break your system.
> >>
> >> +	vsock_init_ns_mode=
> >> +			[KNL,NET] Set the vsock namespace mode for the init
> >> +			(root) network namespace.
> >> +
> >> +			global      [default] The init namespace operates in
> >> +			            global mode where CIDs are system-wide and
> >> +			            sockets can communicate across global
> >> +			            namespaces.
> >> +
> >> +			local       The init namespace operates in local mode
> >> +			            where CIDs are private to the namespace and
> >> +			            sockets can only communicate within the same
> >> +			            namespace.
> >> +
> > 
> > My comment on v14 was more to start a discussion :-) sorry to not be 
> > clear.
> > 
> > I briefly discussed it with Paolo in chat to better understand our 
> > policy between cmdline parameters and module parameters, and it seems 
> > that both are discouraged.
> 
> Double checking the git log it looks like __setup() usage is less
> constrained/restricted than what I thought.
> 
> > So he asked me if we have a use case for this, and thinking about it, I 
> > don't have one at the moment. Also, if a user decides to set all netns 
> > to local, whether init_net is local or global doesn't really matter, 
> > right?
> > 
> > So perhaps before adding this, we should have a real use case.
> > Perhaps more than this feature, I would add a way to change the default 
> > of all netns (including init_net) from global to local. But we can do 
> > that later, since all netns have a way to understand what mode they are 
> > in, so we don't break anything and the user has to explicitly change it, 
> > knowing that they are breaking compatibility with pre-netns support.
> 
> Lacking a clear use-case for vsock_init_ns_mode I tend to think it would
> be better to postpone its introduction. It should be easier to add it
> later than vice-versa.
> 
> If there is a clear/well defined/known use-case, I guess the series can
> go as-is.
> 
> /P
> 

Our use case also does not need the ability to set the init ns mode, so
I'll revert this bit.

Thanks,
Bobby

^ permalink raw reply

* Re: [PATCH net-next v15 01/12] vsock: add netns to vsock core
From: Bobby Eshleman @ 2026-01-21 17:49 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Stefan Hajnoczi, Michael S. Tsirkin, Jason Wang,
	Eugenio Pérez, Xuan Zhuo, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	Broadcom internal kernel review list, Shuah Khan, Long Li,
	Jonathan Corbet, linux-kernel, virtualization, netdev, kvm,
	linux-hyperv, linux-kselftest, berrange, Sargun Dhillon,
	linux-doc, Bobby Eshleman
In-Reply-To: <aXDYfYy3f1NQm5A0@sgarzare-redhat>

On Wed, Jan 21, 2026 at 03:48:13PM +0100, Stefano Garzarella wrote:
> On Fri, Jan 16, 2026 at 01:28:41PM -0800, Bobby Eshleman wrote:
> > From: Bobby Eshleman <bobbyeshleman@meta.com>
> > 
> > Add netns logic to vsock core. Additionally, modify transport hook
> > prototypes to be used by later transport-specific patches (e.g.,
> > *_seqpacket_allow()).
> > 
> > Namespaces are supported primarily by changing socket lookup functions
> > (e.g., vsock_find_connected_socket()) to take into account the socket
> > namespace and the namespace mode before considering a candidate socket a
> > "match".
> > 
> > This patch also introduces the sysctl /proc/sys/net/vsock/ns_mode to
> > report the mode and /proc/sys/net/vsock/child_ns_mode to set the mode
> > for new namespaces.
> > 
> > Add netns functionality (initialization, passing to transports, procfs,
> > etc...) to the af_vsock socket layer. Later patches that add netns
> > support to transports depend on this patch.
> 
> nit: maybe we should mention here why we changed the random port allocation
> 
> (not a big deal, only if you need to resend)
> 
> > 
> > dgram_allow(), stream_allow(), and seqpacket_allow() callbacks are
> > modified to take a vsk in order to perform logic on namespace modes. In
> > future patches, the net will also be used for socket
> > lookups in these functions.
> > 
> > Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
> > ---
> > Changes in v15:
> > - make static port in __vsock_bind_connectible per-netns
> > - remove __net_initdata because we want the ops beyond just boot
> > - add vsock_init_ns_mode kernel cmdline parameter to set init ns mode
> > - use if (ret || !write) in __vsock_net_mode_string() (Stefano)
> > - add vsock_net_mode_global() (Stefano)
> > - hide !net == VSOCK_NET_MODE_GLOBAL inside vsock_net_mode() (Stefano)
> > - clarify af_vsock.c comments on ns_mode/child_ns_mode (Stefano)
> > 
> > Changes in v14:
> > - include linux/sysctl.h in af_vsock.c
> > - squash patch 'vsock: add per-net vsock NS mode state' into this patch
> >  (prior version can be found here):
> >  https://lore.kernel.org/all/20251223-vsock-vmtest-v13-1-9d6db8e7c80b@meta.com/)
> > 
> > Changes in v13:
> > - remove net_mode and replace with direct accesses to net->vsock.mode,
> >  since this is now immutable.
> > - update comments about mode behavior and mutability, and sysctl API
> > - only pass NULL for net when wanting global, instead of net_mode ==
> >  VSOCK_NET_MODE_GLOBAL. This reflects the new logic
> >  of vsock_net_check_mode() that only requires net pointers (not
> >  net_mode).
> > - refactor sysctl string code into a re-usable function, because
> >  child_ns_mode and ns_mode both handle the same strings.
> > - remove redundant vsock_net_init(&init_net) call in module init because
> >  pernet registration calls the callback on the init_net too
> > 
> > Changes in v12:
> > - return true in dgram_allow(), stream_allow(), and seqpacket_allow()
> >  only if net_mode == VSOCK_NET_MODE_GLOBAL (Stefano)
> > - document bind(VMADDR_CID_ANY) case in af_vsock.c (Stefano)
> > - change order of stream_allow() call in vmci so we can pass vsk
> >  to it
> > 
> > Changes in v10:
> > - add file-level comment about what happens to sockets/devices
> >  when the namespace mode changes (Stefano)
> > - change the 'if (write)' boolean in vsock_net_mode_string() to
> >  if (!write), this simplifies a later patch which adds "goto"
> >  for mutex unlocking on function exit.
> > 
> > Changes in v9:
> > - remove virtio_vsock_alloc_rx_skb() (Stefano)
> > - remove vsock_global_dummy_net, not needed as net=NULL +
> >  net_mode=VSOCK_NET_MODE_GLOBAL achieves identical result
> > 
> > Changes in v7:
> > - hv_sock: fix hyperv build error
> > - explain why vhost does not use the dummy
> > - explain usage of __vsock_global_dummy_net
> > - explain why VSOCK_NET_MODE_STR_MAX is 8 characters
> > - use switch-case in vsock_net_mode_string()
> > - avoid changing transports as much as possible
> > - add vsock_find_{bound,connected}_socket_net()
> > - rename `vsock_hdr` to `sysctl_hdr`
> > - add virtio_vsock_alloc_linear_skb() wrapper for setting dummy net and
> >  global mode for virtio-vsock, move skb->cb zero-ing into wrapper
> > - explain seqpacket_allow() change
> > - move net setting to __vsock_create() instead of vsock_create() so
> >  that child sockets also have their net assigned upon accept()
> > 
> > Changes in v6:
> > - unregister sysctl ops in vsock_exit()
> > - af_vsock: clarify description of CID behavior
> > - af_vsock: fix buf vs buffer naming, and length checking
> > - af_vsock: fix length checking w/ correct ctl_table->maxlen
> > 
> > Changes in v5:
> > - vsock_global_net() -> vsock_global_dummy_net()
> > - update comments for new uAPI
> > - use /proc/sys/net/vsock/ns_mode instead of /proc/net/vsock_ns_mode
> > - add prototype changes so patch remains compilable
> > ---
> > Documentation/admin-guide/kernel-parameters.txt |  14 +
> > MAINTAINERS                                     |   1 +
> > drivers/vhost/vsock.c                           |   6 +-
> > include/linux/virtio_vsock.h                    |   4 +-
> > include/net/af_vsock.h                          |  61 ++++-
> > include/net/net_namespace.h                     |   4 +
> > include/net/netns/vsock.h                       |  21 ++
> > net/vmw_vsock/af_vsock.c                        | 328 ++++++++++++++++++++++--
> > net/vmw_vsock/hyperv_transport.c                |   7 +-
> > net/vmw_vsock/virtio_transport.c                |   9 +-
> > net/vmw_vsock/virtio_transport_common.c         |   6 +-
> > net/vmw_vsock/vmci_transport.c                  |  26 +-
> > net/vmw_vsock/vsock_loopback.c                  |   8 +-
> > 13 files changed, 444 insertions(+), 51 deletions(-)
> > 
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index a8d0afde7f85..b6e3bfe365a1 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -8253,6 +8253,20 @@ Kernel parameters
> > 			            them quite hard to use for exploits but
> > 			            might break your system.
> > 
> > +	vsock_init_ns_mode=
> > +			[KNL,NET] Set the vsock namespace mode for the init
> > +			(root) network namespace.
> > +
> > +			global      [default] The init namespace operates in
> > +			            global mode where CIDs are system-wide and
> > +			            sockets can communicate across global
> > +			            namespaces.
> > +
> > +			local       The init namespace operates in local mode
> > +			            where CIDs are private to the namespace and
> > +			            sockets can only communicate within the same
> > +			            namespace.
> > +
> 
> My comment on v14 was more to start a discussion :-) sorry to not be clear.

No worries, resending with this included started a good discussion, so
it was not for naught.

> 
> I briefly discussed it with Paolo in chat to better understand our policy
> between cmdline parameters and module parameters, and it seems that both are
> discouraged.
> 
> So he asked me if we have a use case for this, and thinking about it, I
> don't have one at the moment. Also, if a user decides to set all netns to
> local, whether init_net is local or global doesn't really matter, right?
> 
> So perhaps before adding this, we should have a real use case.
> Perhaps more than this feature, I would add a way to change the default of
> all netns (including init_net) from global to local. But we can do that
> later, since all netns have a way to understand what mode they are in, so we
> don't break anything and the user has to explicitly change it, knowing that
> they are breaking compatibility with pre-netns support.
> 
> 
> That said, at this point, maybe we can remove this, documenting that
> init_net is always global, and if we have a use case in the future, we can
> add this (or something else) to set the init_net mode (or change the default
> for all netns).
> 
> Let's wait a bit before next version to wait a comment from Paolo or Jakub
> on this. But I'm almost fine with both ways, so:
> 
> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
> 
> > 	vt.color=	[VT] Default text color.
> > 			Format: 0xYX, X = foreground, Y = background.
> > 			Default: 0x07 = light gray on black.
> 
> [...]
> 
> > diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> > index a3505a4dcee0..3fc8160d51df 100644
> > --- a/net/vmw_vsock/af_vsock.c
> > +++ b/net/vmw_vsock/af_vsock.c
> 
> [...]
> 
> > @@ -235,33 +303,42 @@ static void __vsock_remove_connected(struct
> > vsock_sock *vsk)
> > 	sock_put(&vsk->sk);
> > }
> > 
> 
> In the v14 I suggested to add some documentation on top of the vsock_find*()
> vs vsock_find_*_net() to explain better which one should be used by
> transports.
> 
> Again is not a big deal, we can fix later if you don't need to resend.
> 
> Thanks,
> Stefano

Sorry about that slipping through the cracks, will add to v16.

I'll resend with:

1. revert init ns cmdline
2. update this message about why the port allocation changes
3. fix the vmtest missing ns arg bug that Kuba mentioned
4. update documentation on top of vsock_find* / vsock_find_*_net
5. update documentation on top of af_vsock.c w/ note about init_ns
having its mode fixed to global

Unless any prior feedback slipped, I think this captures everything
pending? 

Best,
Bobby

^ permalink raw reply

* [PATCH v4 0/7] mshv: Debugfs interface for mshv_root
From: Nuno Das Neves @ 2026-01-21 21:46 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, mhklinux, skinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, prapal, mrathor,
	paekkaladevi, Nuno Das Neves

Expose hypervisor, logical processor, partition, and virtual processor
statistics via debugfs. These are provided by mapping 'stats' pages via
hypercall.

Patch #1: Update hv_call_map_stats_page() to return success when
          HV_STATS_AREA_PARENT is unavailable, which is the case on some
          hypervisor versions, where it can fall back to HV_STATS_AREA_SELF
Patch #2: Use struct hv_stats_page pointers instead of void *
Patch #3: Make mshv_vp_stats_map/unmap() more flexible to use with debugfs code
Patch #4: Always map vp stats page regardless of scheduler, to reuse in debugfs
Patch #5: Change to hv_stats_page definition and VpRootDispatchThreadBlocked
Patch #6: Introduce the definitions needed for the various stats pages
Patch #7: Add mshv_debugfs.c, and integrate it with the mshv_root driver to
          expose the partition and VP stats.

---
Changes in v4:
- Put the counters definitions in static arrays in hv_counters.c, instead of as
  enums in hvhdk.h [Michael]
- Due to the above, add an additional patch (#5) to simplify hv_stats_page, and
  retain the enum definition at the top of mshv_root_main.c for use with
  VpRootDispatchThreadBlocked. That is the only remaining use of the counter
  enum.
- Due to the above, use num_present_cpus() as the number of LPs to map stats
  pages for - this number shouldn't change at runtime because the hypervisor
  doesn't support hotplug for root partition.

Changes in v3:
- Add 3 small refactor/cleanup patches (patches 2,3,4) from Stanislav. These
  simplify some of the debugfs code, and fix issues with mapping VP stats on
  L1VH.
- Fix cleanup of parent stats dentries on module removal (via squashing some
  internal patches into patch #6) [Praveen]
- Remove unused goto label [Stanislav, kernel bot]
- Use struct hv_stats_page * instead of void * in mshv_debugfs.c [Stanislav]
- Remove some redundant variables [Stanislav]
- Rename debugfs dentry fields for brevity [Stanislav]
- Use ERR_CAST() for the dentry error pointer returned from
  lp_debugfs_stats_create() [Stanislav]
- Fix leak of pages allocated for lp stats mappings by storing them in an array
  [Michael]
- Add comments to clarify PARENT vs SELF usage and edge cases [Michael]
- Add VpLoadAvg for x86 and print the stat [Michael]
- Add NUM_STATS_AREAS for array sizing in mshv_debugfs.c [Michael]

Changes in v2:
- Remove unnecessary pr_debug_once() in patch 1 [Stanislav Kinsburskii]
- CONFIG_X86 -> CONFIG_X86_64 in patch 2 [Stanislav Kinsburskii]

---
Nuno Das Neves (3):
  mshv: Update hv_stats_page definitions
  mshv: Add data for printing stats page counters
  mshv: Add debugfs to view hypervisor statistics

Purna Pavan Chandra Aekkaladevi (1):
  mshv: Ignore second stats page map result failure

Stanislav Kinsburskii (3):
  mshv: Use typed hv_stats_page pointers
  mshv: Improve mshv_vp_stats_map/unmap(), add them to mshv_root.h
  mshv: Always map child vp stats pages regardless of scheduler type

 drivers/hv/Makefile            |   1 +
 drivers/hv/hv_counters.c       | 489 +++++++++++++++++++++++
 drivers/hv/hv_synic.c          | 177 +++++++++
 drivers/hv/mshv_debugfs.c      | 703 +++++++++++++++++++++++++++++++++
 drivers/hv/mshv_root.h         |  49 ++-
 drivers/hv/mshv_root_hv_call.c |  64 ++-
 drivers/hv/mshv_root_main.c    | 135 ++++---
 include/hyperv/hvhdk.h         |   8 +
 8 files changed, 1564 insertions(+), 62 deletions(-)
 create mode 100644 drivers/hv/hv_counters.c
 create mode 100644 drivers/hv/hv_synic.c
 create mode 100644 drivers/hv/mshv_debugfs.c

-- 
2.34.1


^ permalink raw reply

* [PATCH v4 1/7] mshv: Ignore second stats page map result failure
From: Nuno Das Neves @ 2026-01-21 21:46 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, mhklinux, skinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, prapal, mrathor,
	paekkaladevi, Nuno Das Neves
In-Reply-To: <20260121214623.76374-1-nunodasneves@linux.microsoft.com>

From: Purna Pavan Chandra Aekkaladevi <paekkaladevi@linux.microsoft.com>

Older versions of the hypervisor do not have a concept of separate SELF
and PARENT stats areas. In this case, mapping the HV_STATS_AREA_SELF page
is sufficient - it's the only page and it contains all available stats.

Mapping HV_STATS_AREA_PARENT returns HV_STATUS_INVALID_PARAMETER which
currently causes module init to fail on older hypervisor versions.

Detect this case and gracefully fall back to populating
stats_pages[HV_STATS_AREA_PARENT] with the already-mapped SELF page.

Add comments to clarify the behavior, including a clarification of why
this isn't needed for hv_call_map_stats_page2() which always supports
PARENT and SELF areas.

Signed-off-by: Purna Pavan Chandra Aekkaladevi <paekkaladevi@linux.microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root_hv_call.c | 52 +++++++++++++++++++++++++++++++---
 drivers/hv/mshv_root_main.c    |  3 ++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index 598eaff4ff29..1f93b94d7580 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -813,6 +813,13 @@ hv_call_notify_port_ring_empty(u32 sint_index)
 	return hv_result_to_errno(status);
 }
 
+/*
+ * Equivalent of hv_call_map_stats_page() for cases when the caller provides
+ * the map location.
+ *
+ * NOTE: This is a newer hypercall that always supports SELF and PARENT stats
+ * areas, unlike hv_call_map_stats_page().
+ */
 static int hv_call_map_stats_page2(enum hv_stats_object_type type,
 				   const union hv_stats_object_identity *identity,
 				   u64 map_location)
@@ -855,6 +862,34 @@ static int hv_call_map_stats_page2(enum hv_stats_object_type type,
 	return ret;
 }
 
+static int
+hv_stats_get_area_type(enum hv_stats_object_type type,
+		       const union hv_stats_object_identity *identity)
+{
+	switch (type) {
+	case HV_STATS_OBJECT_HYPERVISOR:
+		return identity->hv.stats_area_type;
+	case HV_STATS_OBJECT_LOGICAL_PROCESSOR:
+		return identity->lp.stats_area_type;
+	case HV_STATS_OBJECT_PARTITION:
+		return identity->partition.stats_area_type;
+	case HV_STATS_OBJECT_VP:
+		return identity->vp.stats_area_type;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Map a stats page, where the page location is provided by the hypervisor.
+ *
+ * NOTE: The concept of separate SELF and PARENT stats areas does not exist on
+ * older hypervisor versions. All the available stats information can be found
+ * on the SELF page. When attempting to map the PARENT area on a hypervisor
+ * that doesn't support it, return "success" but with a NULL address. The
+ * caller should check for this case and instead fallback to the SELF area
+ * alone.
+ */
 static int hv_call_map_stats_page(enum hv_stats_object_type type,
 				  const union hv_stats_object_identity *identity,
 				  void **addr)
@@ -863,7 +898,7 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type,
 	struct hv_input_map_stats_page *input;
 	struct hv_output_map_stats_page *output;
 	u64 status, pfn;
-	int ret = 0;
+	int hv_status, ret = 0;
 
 	do {
 		local_irq_save(flags);
@@ -878,11 +913,20 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type,
 		pfn = output->map_location;
 
 		local_irq_restore(flags);
-		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
-			ret = hv_result_to_errno(status);
+
+		hv_status = hv_result(status);
+		if (hv_status != HV_STATUS_INSUFFICIENT_MEMORY) {
 			if (hv_result_success(status))
 				break;
-			return ret;
+
+			if (hv_stats_get_area_type(type, identity) == HV_STATS_AREA_PARENT &&
+			    hv_status == HV_STATUS_INVALID_PARAMETER) {
+				*addr = NULL;
+				return 0;
+			}
+
+			hv_status_debug(status, "\n");
+			return hv_result_to_errno(status);
 		}
 
 		ret = hv_call_deposit_pages(NUMA_NO_NODE,
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 1134a82c7881..1777778f84b8 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -992,6 +992,9 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
 	if (err)
 		goto unmap_self;
 
+	if (!stats_pages[HV_STATS_AREA_PARENT])
+		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
+
 	return 0;
 
 unmap_self:
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 2/7] mshv: Use typed hv_stats_page pointers
From: Nuno Das Neves @ 2026-01-21 21:46 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, mhklinux, skinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, prapal, mrathor,
	paekkaladevi, Nuno Das Neves
In-Reply-To: <20260121214623.76374-1-nunodasneves@linux.microsoft.com>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>

Refactor all relevant functions to use struct hv_stats_page pointers
instead of void pointers for stats page mapping and unmapping thus
improving type safety and code clarity across the Hyper-V stats mapping
APIs.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 drivers/hv/mshv_root.h         |  5 +++--
 drivers/hv/mshv_root_hv_call.c | 12 +++++++-----
 drivers/hv/mshv_root_main.c    |  8 ++++----
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 3c1d88b36741..05ba1f716f9e 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -307,8 +307,9 @@ int hv_call_disconnect_port(u64 connection_partition_id,
 int hv_call_notify_port_ring_empty(u32 sint_index);
 int hv_map_stats_page(enum hv_stats_object_type type,
 		      const union hv_stats_object_identity *identity,
-		      void **addr);
-int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
+		      struct hv_stats_page **addr);
+int hv_unmap_stats_page(enum hv_stats_object_type type,
+			struct hv_stats_page *page_addr,
 			const union hv_stats_object_identity *identity);
 int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
 				   u64 page_struct_count, u32 host_access,
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index 1f93b94d7580..daee036e48bc 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -890,9 +890,10 @@ hv_stats_get_area_type(enum hv_stats_object_type type,
  * caller should check for this case and instead fallback to the SELF area
  * alone.
  */
-static int hv_call_map_stats_page(enum hv_stats_object_type type,
-				  const union hv_stats_object_identity *identity,
-				  void **addr)
+static int
+hv_call_map_stats_page(enum hv_stats_object_type type,
+		       const union hv_stats_object_identity *identity,
+		       struct hv_stats_page **addr)
 {
 	unsigned long flags;
 	struct hv_input_map_stats_page *input;
@@ -942,7 +943,7 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type,
 
 int hv_map_stats_page(enum hv_stats_object_type type,
 		      const union hv_stats_object_identity *identity,
-		      void **addr)
+		      struct hv_stats_page **addr)
 {
 	int ret;
 	struct page *allocated_page = NULL;
@@ -990,7 +991,8 @@ static int hv_call_unmap_stats_page(enum hv_stats_object_type type,
 	return hv_result_to_errno(status);
 }
 
-int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr,
+int hv_unmap_stats_page(enum hv_stats_object_type type,
+			struct hv_stats_page *page_addr,
 			const union hv_stats_object_identity *identity)
 {
 	int ret;
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 1777778f84b8..be5ad0fbfbee 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -957,7 +957,7 @@ mshv_vp_release(struct inode *inode, struct file *filp)
 }
 
 static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
-				void *stats_pages[])
+				struct hv_stats_page *stats_pages[])
 {
 	union hv_stats_object_identity identity = {
 		.vp.partition_id = partition_id,
@@ -972,7 +972,7 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
 }
 
 static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
-			     void *stats_pages[])
+			     struct hv_stats_page *stats_pages[])
 {
 	union hv_stats_object_identity identity = {
 		.vp.partition_id = partition_id,
@@ -1010,7 +1010,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 	struct mshv_create_vp args;
 	struct mshv_vp *vp;
 	struct page *intercept_msg_page, *register_page, *ghcb_page;
-	void *stats_pages[2];
+	struct hv_stats_page *stats_pages[2];
 	long ret;
 
 	if (copy_from_user(&args, arg, sizeof(args)))
@@ -1729,7 +1729,7 @@ static void destroy_partition(struct mshv_partition *partition)
 
 			if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
 				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
-						    (void **)vp->vp_stats_pages);
+						    vp->vp_stats_pages);
 
 			if (vp->vp_register_page) {
 				(void)hv_unmap_vp_state_page(partition->pt_id,
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 3/7] mshv: Improve mshv_vp_stats_map/unmap(), add them to mshv_root.h
From: Nuno Das Neves @ 2026-01-21 21:46 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, mhklinux, skinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, prapal, mrathor,
	paekkaladevi, Nuno Das Neves
In-Reply-To: <20260121214623.76374-1-nunodasneves@linux.microsoft.com>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>

These functions are currently only used to map child partition VP stats,
on root partition. However, they will soon be used on L1VH, and also
used for mapping the host's own VP stats.

Introduce a helper is_l1vh_parent() to determine whether we are mapping
our own VP stats. In this case, do not attempt to map the PARENT area.
Note this is a different case than mapping PARENT on an older hypervisor
where it is not available at all, so must be handled separately.

On unmap, pass the stats pages since on L1VH the kernel allocates them
and they must be freed in hv_unmap_stats_page().

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 drivers/hv/mshv_root.h      | 10 ++++++
 drivers/hv/mshv_root_main.c | 61 ++++++++++++++++++++++++++-----------
 2 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 05ba1f716f9e..e4912b0618fa 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -254,6 +254,16 @@ struct mshv_partition *mshv_partition_get(struct mshv_partition *partition);
 void mshv_partition_put(struct mshv_partition *partition);
 struct mshv_partition *mshv_partition_find(u64 partition_id) __must_hold(RCU);
 
+static inline bool is_l1vh_parent(u64 partition_id)
+{
+	return hv_l1vh_partition() && (partition_id == HV_PARTITION_ID_SELF);
+}
+
+int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
+		      struct hv_stats_page **stats_pages);
+void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
+			 struct hv_stats_page **stats_pages);
+
 /* hypercalls */
 
 int hv_call_withdraw_memory(u64 count, int node, u64 partition_id);
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index be5ad0fbfbee..faca3cc63e79 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -956,23 +956,36 @@ mshv_vp_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
-				struct hv_stats_page *stats_pages[])
+void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
+			 struct hv_stats_page *stats_pages[])
 {
 	union hv_stats_object_identity identity = {
 		.vp.partition_id = partition_id,
 		.vp.vp_index = vp_index,
 	};
+	int err;
 
 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
-	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
-
-	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
-	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
+	err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
+				  stats_pages[HV_STATS_AREA_SELF],
+				  &identity);
+	if (err)
+		pr_err("%s: failed to unmap partition %llu vp %u self stats, err: %d\n",
+		       __func__, partition_id, vp_index, err);
+
+	if (stats_pages[HV_STATS_AREA_PARENT] != stats_pages[HV_STATS_AREA_SELF]) {
+		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
+		err = hv_unmap_stats_page(HV_STATS_OBJECT_VP,
+					  stats_pages[HV_STATS_AREA_PARENT],
+					  &identity);
+		if (err)
+			pr_err("%s: failed to unmap partition %llu vp %u parent stats, err: %d\n",
+			       __func__, partition_id, vp_index, err);
+	}
 }
 
-static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
-			     struct hv_stats_page *stats_pages[])
+int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
+		      struct hv_stats_page *stats_pages[])
 {
 	union hv_stats_object_identity identity = {
 		.vp.partition_id = partition_id,
@@ -983,23 +996,37 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
 	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
 				&stats_pages[HV_STATS_AREA_SELF]);
-	if (err)
+	if (err) {
+		pr_err("%s: failed to map partition %llu vp %u self stats, err: %d\n",
+		       __func__, partition_id, vp_index, err);
 		return err;
+	}
 
-	identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
-	err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
-				&stats_pages[HV_STATS_AREA_PARENT]);
-	if (err)
-		goto unmap_self;
-
-	if (!stats_pages[HV_STATS_AREA_PARENT])
+	/*
+	 * L1VH partition cannot access its vp stats in parent area.
+	 */
+	if (is_l1vh_parent(partition_id)) {
 		stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
+	} else {
+		identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
+		err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+					&stats_pages[HV_STATS_AREA_PARENT]);
+		if (err) {
+			pr_err("%s: failed to map partition %llu vp %u parent stats, err: %d\n",
+			       __func__, partition_id, vp_index, err);
+			goto unmap_self;
+		}
+		if (!stats_pages[HV_STATS_AREA_PARENT])
+			stats_pages[HV_STATS_AREA_PARENT] = stats_pages[HV_STATS_AREA_SELF];
+	}
 
 	return 0;
 
 unmap_self:
 	identity.vp.stats_area_type = HV_STATS_AREA_SELF;
-	hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
+	hv_unmap_stats_page(HV_STATS_OBJECT_VP,
+			    stats_pages[HV_STATS_AREA_SELF],
+			    &identity);
 	return err;
 }
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 4/7] mshv: Always map child vp stats pages regardless of scheduler type
From: Nuno Das Neves @ 2026-01-21 21:46 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, mhklinux, skinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, prapal, mrathor,
	paekkaladevi, Nuno Das Neves
In-Reply-To: <20260121214623.76374-1-nunodasneves@linux.microsoft.com>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>

Currently vp->vp_stats_pages is only used by the root scheduler for fast
interrupt injection.

Soon, vp_stats_pages will also be needed for exposing child VP stats to
userspace via debugfs. Mapping the pages a second time to a different
address causes an error on L1VH.

Remove the scheduler requirement and always map the vp stats pages.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index faca3cc63e79..fbfc9e7d9fa4 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1077,16 +1077,10 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 			goto unmap_register_page;
 	}
 
-	/*
-	 * This mapping of the stats page is for detecting if dispatch thread
-	 * is blocked - only relevant for root scheduler
-	 */
-	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) {
-		ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
-					stats_pages);
-		if (ret)
-			goto unmap_ghcb_page;
-	}
+	ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
+				stats_pages);
+	if (ret)
+		goto unmap_ghcb_page;
 
 	vp = kzalloc(sizeof(*vp), GFP_KERNEL);
 	if (!vp)
@@ -1110,8 +1104,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
 		vp->vp_ghcb_page = page_to_virt(ghcb_page);
 
-	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
-		memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
+	memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
 
 	/*
 	 * Keep anon_inode_getfd last: it installs fd in the file struct and
@@ -1133,8 +1126,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 free_vp:
 	kfree(vp);
 unmap_stats_pages:
-	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
-		mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
+	mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
 unmap_ghcb_page:
 	if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
 		hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
@@ -1754,9 +1746,8 @@ static void destroy_partition(struct mshv_partition *partition)
 			if (!vp)
 				continue;
 
-			if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
-				mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
-						    vp->vp_stats_pages);
+			mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
+					    vp->vp_stats_pages);
 
 			if (vp->vp_register_page) {
 				(void)hv_unmap_vp_state_page(partition->pt_id,
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 5/7] mshv: Update hv_stats_page definitions
From: Nuno Das Neves @ 2026-01-21 21:46 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, mhklinux, skinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, prapal, mrathor,
	paekkaladevi, Nuno Das Neves
In-Reply-To: <20260121214623.76374-1-nunodasneves@linux.microsoft.com>

hv_stats_page belongs in hvhdk.h, move it there.

It does not require a union to access the data for different counters;
use a single u64 array instead, for simplicity and to match the Windows
definitions.

While at it, correct the ARM64 value for VpRootDispatchThreadBlocked.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c | 22 ++++++----------------
 include/hyperv/hvhdk.h      |  8 ++++++++
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index fbfc9e7d9fa4..12825666e21b 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -39,23 +39,14 @@ MODULE_AUTHOR("Microsoft");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
 
-/* TODO move this to another file when debugfs code is added */
 enum hv_stats_vp_counters {			/* HV_THREAD_COUNTER */
 #if defined(CONFIG_X86)
-	VpRootDispatchThreadBlocked			= 202,
+	VpRootDispatchThreadBlocked = 202,
 #elif defined(CONFIG_ARM64)
-	VpRootDispatchThreadBlocked			= 94,
+	VpRootDispatchThreadBlocked = 95,
 #endif
-	VpStatsMaxCounter
 };
 
-struct hv_stats_page {
-	union {
-		u64 vp_cntrs[VpStatsMaxCounter];		/* VP counters */
-		u8 data[HV_HYP_PAGE_SIZE];
-	};
-} __packed;
-
 struct mshv_root mshv_root;
 
 enum hv_scheduler_type hv_scheduler_type;
@@ -485,12 +476,11 @@ static u64 mshv_vp_interrupt_pending(struct mshv_vp *vp)
 static bool mshv_vp_dispatch_thread_blocked(struct mshv_vp *vp)
 {
 	struct hv_stats_page **stats = vp->vp_stats_pages;
-	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->vp_cntrs;
-	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->vp_cntrs;
+	u64 *self_vp_cntrs = stats[HV_STATS_AREA_SELF]->data;
+	u64 *parent_vp_cntrs = stats[HV_STATS_AREA_PARENT]->data;
 
-	if (self_vp_cntrs[VpRootDispatchThreadBlocked])
-		return self_vp_cntrs[VpRootDispatchThreadBlocked];
-	return parent_vp_cntrs[VpRootDispatchThreadBlocked];
+	return parent_vp_cntrs[VpRootDispatchThreadBlocked] ||
+	       self_vp_cntrs[VpRootDispatchThreadBlocked];
 }
 
 static int
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index 469186df7826..ac501969105c 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -10,6 +10,14 @@
 #include "hvhdk_mini.h"
 #include "hvgdk.h"
 
+/*
+ * Hypervisor statistics page format
+ */
+struct hv_stats_page {
+	u64 data[HV_HYP_PAGE_SIZE / sizeof(u64)];
+} __packed;
+
+
 /* Bits for dirty mask of hv_vp_register_page */
 #define HV_X64_REGISTER_CLASS_GENERAL	0
 #define HV_X64_REGISTER_CLASS_IP	1
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 6/7] mshv: Add data for printing stats page counters
From: Nuno Das Neves @ 2026-01-21 21:46 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, mhklinux, skinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, prapal, mrathor,
	paekkaladevi, Nuno Das Neves
In-Reply-To: <20260121214623.76374-1-nunodasneves@linux.microsoft.com>

Introduce hv_counters.c, containing static data corresponding to
HV_*_COUNTER enums in the hypervisor source. Defining the enum
members as an array instead makes more sense, since it will be
iterated over to print counter information to debugfs.

Include hypervisor, logical processor, partition, and virtual
processor counters.

Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
---
 drivers/hv/hv_counters.c | 488 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 488 insertions(+)
 create mode 100644 drivers/hv/hv_counters.c

diff --git a/drivers/hv/hv_counters.c b/drivers/hv/hv_counters.c
new file mode 100644
index 000000000000..a8e07e72cc29
--- /dev/null
+++ b/drivers/hv/hv_counters.c
@@ -0,0 +1,488 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2026, Microsoft Corporation.
+ *
+ * Data for printing stats page counters via debugfs.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+struct hv_counter_entry {
+	char *name;
+	int idx;
+};
+
+/* HV_HYPERVISOR_COUNTER */
+static struct hv_counter_entry hv_hypervisor_counters[] = {
+	{ "HvLogicalProcessors", 1 },
+	{ "HvPartitions", 2 },
+	{ "HvTotalPages", 3 },
+	{ "HvVirtualProcessors", 4 },
+	{ "HvMonitoredNotifications", 5 },
+	{ "HvModernStandbyEntries", 6 },
+	{ "HvPlatformIdleTransitions", 7 },
+	{ "HvHypervisorStartupCost", 8 },
+
+	{ "HvIOSpacePages", 10 },
+	{ "HvNonEssentialPagesForDump", 11 },
+	{ "HvSubsumedPages", 12 },
+};
+
+/* HV_CPU_COUNTER */
+static struct hv_counter_entry hv_lp_counters[] = {
+	{ "LpGlobalTime", 1 },
+	{ "LpTotalRunTime", 2 },
+	{ "LpHypervisorRunTime", 3 },
+	{ "LpHardwareInterrupts", 4 },
+	{ "LpContextSwitches", 5 },
+	{ "LpInterProcessorInterrupts", 6 },
+	{ "LpSchedulerInterrupts", 7 },
+	{ "LpTimerInterrupts", 8 },
+	{ "LpInterProcessorInterruptsSent", 9 },
+	{ "LpProcessorHalts", 10 },
+	{ "LpMonitorTransitionCost", 11 },
+	{ "LpContextSwitchTime", 12 },
+	{ "LpC1TransitionsCount", 13 },
+	{ "LpC1RunTime", 14 },
+	{ "LpC2TransitionsCount", 15 },
+	{ "LpC2RunTime", 16 },
+	{ "LpC3TransitionsCount", 17 },
+	{ "LpC3RunTime", 18 },
+	{ "LpRootVpIndex", 19 },
+	{ "LpIdleSequenceNumber", 20 },
+	{ "LpGlobalTscCount", 21 },
+	{ "LpActiveTscCount", 22 },
+	{ "LpIdleAccumulation", 23 },
+	{ "LpReferenceCycleCount0", 24 },
+	{ "LpActualCycleCount0", 25 },
+	{ "LpReferenceCycleCount1", 26 },
+	{ "LpActualCycleCount1", 27 },
+	{ "LpProximityDomainId", 28 },
+	{ "LpPostedInterruptNotifications", 29 },
+	{ "LpBranchPredictorFlushes", 30 },
+#if IS_ENABLED(CONFIG_X86_64)
+	{ "LpL1DataCacheFlushes", 31 },
+	{ "LpImmediateL1DataCacheFlushes", 32 },
+	{ "LpMbFlushes", 33 },
+	{ "LpCounterRefreshSequenceNumber", 34 },
+	{ "LpCounterRefreshReferenceTime", 35 },
+	{ "LpIdleAccumulationSnapshot", 36 },
+	{ "LpActiveTscCountSnapshot", 37 },
+	{ "LpHwpRequestContextSwitches", 38 },
+	{ "LpPlaceholder1", 39 },
+	{ "LpPlaceholder2", 40 },
+	{ "LpPlaceholder3", 41 },
+	{ "LpPlaceholder4", 42 },
+	{ "LpPlaceholder5", 43 },
+	{ "LpPlaceholder6", 44 },
+	{ "LpPlaceholder7", 45 },
+	{ "LpPlaceholder8", 46 },
+	{ "LpPlaceholder9", 47 },
+	{ "LpSchLocalRunListSize", 48 },
+	{ "LpReserveGroupId", 49 },
+	{ "LpRunningPriority", 50 },
+	{ "LpPerfmonInterruptCount", 51 },
+#elif IS_ENABLED(CONFIG_ARM64)
+	{ "LpCounterRefreshSequenceNumber", 31 },
+	{ "LpCounterRefreshReferenceTime", 32 },
+	{ "LpIdleAccumulationSnapshot", 33 },
+	{ "LpActiveTscCountSnapshot", 34 },
+	{ "LpHwpRequestContextSwitches", 35 },
+	{ "LpPlaceholder2", 36 },
+	{ "LpPlaceholder3", 37 },
+	{ "LpPlaceholder4", 38 },
+	{ "LpPlaceholder5", 39 },
+	{ "LpPlaceholder6", 40 },
+	{ "LpPlaceholder7", 41 },
+	{ "LpPlaceholder8", 42 },
+	{ "LpPlaceholder9", 43 },
+	{ "LpSchLocalRunListSize", 44 },
+	{ "LpReserveGroupId", 45 },
+	{ "LpRunningPriority", 46 },
+#endif
+};
+
+/* HV_PROCESS_COUNTER */
+static struct hv_counter_entry hv_partition_counters[] = {
+	{ "PtVirtualProcessors", 1 },
+
+	{ "PtTlbSize", 3 },
+	{ "PtAddressSpaces", 4 },
+	{ "PtDepositedPages", 5 },
+	{ "PtGpaPages", 6 },
+	{ "PtGpaSpaceModifications", 7 },
+	{ "PtVirtualTlbFlushEntires", 8 },
+	{ "PtRecommendedTlbSize", 9 },
+	{ "PtGpaPages4K", 10 },
+	{ "PtGpaPages2M", 11 },
+	{ "PtGpaPages1G", 12 },
+	{ "PtGpaPages512G", 13 },
+	{ "PtDevicePages4K", 14 },
+	{ "PtDevicePages2M", 15 },
+	{ "PtDevicePages1G", 16 },
+	{ "PtDevicePages512G", 17 },
+	{ "PtAttachedDevices", 18 },
+	{ "PtDeviceInterruptMappings", 19 },
+	{ "PtIoTlbFlushes", 20 },
+	{ "PtIoTlbFlushCost", 21 },
+	{ "PtDeviceInterruptErrors", 22 },
+	{ "PtDeviceDmaErrors", 23 },
+	{ "PtDeviceInterruptThrottleEvents", 24 },
+	{ "PtSkippedTimerTicks", 25 },
+	{ "PtPartitionId", 26 },
+#if IS_ENABLED(CONFIG_X86_64)
+	{ "PtNestedTlbSize", 27 },
+	{ "PtRecommendedNestedTlbSize", 28 },
+	{ "PtNestedTlbFreeListSize", 29 },
+	{ "PtNestedTlbTrimmedPages", 30 },
+	{ "PtPagesShattered", 31 },
+	{ "PtPagesRecombined", 32 },
+	{ "PtHwpRequestValue", 33 },
+	{ "PtAutoSuspendEnableTime", 34 },
+	{ "PtAutoSuspendTriggerTime", 35 },
+	{ "PtAutoSuspendDisableTime", 36 },
+	{ "PtPlaceholder1", 37 },
+	{ "PtPlaceholder2", 38 },
+	{ "PtPlaceholder3", 39 },
+	{ "PtPlaceholder4", 40 },
+	{ "PtPlaceholder5", 41 },
+	{ "PtPlaceholder6", 42 },
+	{ "PtPlaceholder7", 43 },
+	{ "PtPlaceholder8", 44 },
+	{ "PtHypervisorStateTransferGeneration", 45 },
+	{ "PtNumberofActiveChildPartitions", 46 },
+#elif IS_ENABLED(CONFIG_ARM64)
+	{ "PtHwpRequestValue", 27 },
+	{ "PtAutoSuspendEnableTime", 28 },
+	{ "PtAutoSuspendTriggerTime", 29 },
+	{ "PtAutoSuspendDisableTime", 30 },
+	{ "PtPlaceholder1", 31 },
+	{ "PtPlaceholder2", 32 },
+	{ "PtPlaceholder3", 33 },
+	{ "PtPlaceholder4", 34 },
+	{ "PtPlaceholder5", 35 },
+	{ "PtPlaceholder6", 36 },
+	{ "PtPlaceholder7", 37 },
+	{ "PtPlaceholder8", 38 },
+	{ "PtHypervisorStateTransferGeneration", 39 },
+	{ "PtNumberofActiveChildPartitions", 40 },
+#endif
+};
+
+/* HV_THREAD_COUNTER */
+static struct hv_counter_entry hv_vp_counters[] = {
+	{ "VpTotalRunTime", 1 },
+	{ "VpHypervisorRunTime", 2 },
+	{ "VpRemoteNodeRunTime", 3 },
+	{ "VpNormalizedRunTime", 4 },
+	{ "VpIdealCpu", 5 },
+
+	{ "VpHypercallsCount", 7 },
+	{ "VpHypercallsTime", 8 },
+#if IS_ENABLED(CONFIG_X86_64)
+	{ "VpPageInvalidationsCount", 9 },
+	{ "VpPageInvalidationsTime", 10 },
+	{ "VpControlRegisterAccessesCount", 11 },
+	{ "VpControlRegisterAccessesTime", 12 },
+	{ "VpIoInstructionsCount", 13 },
+	{ "VpIoInstructionsTime", 14 },
+	{ "VpHltInstructionsCount", 15 },
+	{ "VpHltInstructionsTime", 16 },
+	{ "VpMwaitInstructionsCount", 17 },
+	{ "VpMwaitInstructionsTime", 18 },
+	{ "VpCpuidInstructionsCount", 19 },
+	{ "VpCpuidInstructionsTime", 20 },
+	{ "VpMsrAccessesCount", 21 },
+	{ "VpMsrAccessesTime", 22 },
+	{ "VpOtherInterceptsCount", 23 },
+	{ "VpOtherInterceptsTime", 24 },
+	{ "VpExternalInterruptsCount", 25 },
+	{ "VpExternalInterruptsTime", 26 },
+	{ "VpPendingInterruptsCount", 27 },
+	{ "VpPendingInterruptsTime", 28 },
+	{ "VpEmulatedInstructionsCount", 29 },
+	{ "VpEmulatedInstructionsTime", 30 },
+	{ "VpDebugRegisterAccessesCount", 31 },
+	{ "VpDebugRegisterAccessesTime", 32 },
+	{ "VpPageFaultInterceptsCount", 33 },
+	{ "VpPageFaultInterceptsTime", 34 },
+	{ "VpGuestPageTableMaps", 35 },
+	{ "VpLargePageTlbFills", 36 },
+	{ "VpSmallPageTlbFills", 37 },
+	{ "VpReflectedGuestPageFaults", 38 },
+	{ "VpApicMmioAccesses", 39 },
+	{ "VpIoInterceptMessages", 40 },
+	{ "VpMemoryInterceptMessages", 41 },
+	{ "VpApicEoiAccesses", 42 },
+	{ "VpOtherMessages", 43 },
+	{ "VpPageTableAllocations", 44 },
+	{ "VpLogicalProcessorMigrations", 45 },
+	{ "VpAddressSpaceEvictions", 46 },
+	{ "VpAddressSpaceSwitches", 47 },
+	{ "VpAddressDomainFlushes", 48 },
+	{ "VpAddressSpaceFlushes", 49 },
+	{ "VpGlobalGvaRangeFlushes", 50 },
+	{ "VpLocalGvaRangeFlushes", 51 },
+	{ "VpPageTableEvictions", 52 },
+	{ "VpPageTableReclamations", 53 },
+	{ "VpPageTableResets", 54 },
+	{ "VpPageTableValidations", 55 },
+	{ "VpApicTprAccesses", 56 },
+	{ "VpPageTableWriteIntercepts", 57 },
+	{ "VpSyntheticInterrupts", 58 },
+	{ "VpVirtualInterrupts", 59 },
+	{ "VpApicIpisSent", 60 },
+	{ "VpApicSelfIpisSent", 61 },
+	{ "VpGpaSpaceHypercalls", 62 },
+	{ "VpLogicalProcessorHypercalls", 63 },
+	{ "VpLongSpinWaitHypercalls", 64 },
+	{ "VpOtherHypercalls", 65 },
+	{ "VpSyntheticInterruptHypercalls", 66 },
+	{ "VpVirtualInterruptHypercalls", 67 },
+	{ "VpVirtualMmuHypercalls", 68 },
+	{ "VpVirtualProcessorHypercalls", 69 },
+	{ "VpHardwareInterrupts", 70 },
+	{ "VpNestedPageFaultInterceptsCount", 71 },
+	{ "VpNestedPageFaultInterceptsTime", 72 },
+	{ "VpPageScans", 73 },
+	{ "VpLogicalProcessorDispatches", 74 },
+	{ "VpWaitingForCpuTime", 75 },
+	{ "VpExtendedHypercalls", 76 },
+	{ "VpExtendedHypercallInterceptMessages", 77 },
+	{ "VpMbecNestedPageTableSwitches", 78 },
+	{ "VpOtherReflectedGuestExceptions", 79 },
+	{ "VpGlobalIoTlbFlushes", 80 },
+	{ "VpGlobalIoTlbFlushCost", 81 },
+	{ "VpLocalIoTlbFlushes", 82 },
+	{ "VpLocalIoTlbFlushCost", 83 },
+	{ "VpHypercallsForwardedCount", 84 },
+	{ "VpHypercallsForwardingTime", 85 },
+	{ "VpPageInvalidationsForwardedCount", 86 },
+	{ "VpPageInvalidationsForwardingTime", 87 },
+	{ "VpControlRegisterAccessesForwardedCount", 88 },
+	{ "VpControlRegisterAccessesForwardingTime", 89 },
+	{ "VpIoInstructionsForwardedCount", 90 },
+	{ "VpIoInstructionsForwardingTime", 91 },
+	{ "VpHltInstructionsForwardedCount", 92 },
+	{ "VpHltInstructionsForwardingTime", 93 },
+	{ "VpMwaitInstructionsForwardedCount", 94 },
+	{ "VpMwaitInstructionsForwardingTime", 95 },
+	{ "VpCpuidInstructionsForwardedCount", 96 },
+	{ "VpCpuidInstructionsForwardingTime", 97 },
+	{ "VpMsrAccessesForwardedCount", 98 },
+	{ "VpMsrAccessesForwardingTime", 99 },
+	{ "VpOtherInterceptsForwardedCount", 100 },
+	{ "VpOtherInterceptsForwardingTime", 101 },
+	{ "VpExternalInterruptsForwardedCount", 102 },
+	{ "VpExternalInterruptsForwardingTime", 103 },
+	{ "VpPendingInterruptsForwardedCount", 104 },
+	{ "VpPendingInterruptsForwardingTime", 105 },
+	{ "VpEmulatedInstructionsForwardedCount", 106 },
+	{ "VpEmulatedInstructionsForwardingTime", 107 },
+	{ "VpDebugRegisterAccessesForwardedCount", 108 },
+	{ "VpDebugRegisterAccessesForwardingTime", 109 },
+	{ "VpPageFaultInterceptsForwardedCount", 110 },
+	{ "VpPageFaultInterceptsForwardingTime", 111 },
+	{ "VpVmclearEmulationCount", 112 },
+	{ "VpVmclearEmulationTime", 113 },
+	{ "VpVmptrldEmulationCount", 114 },
+	{ "VpVmptrldEmulationTime", 115 },
+	{ "VpVmptrstEmulationCount", 116 },
+	{ "VpVmptrstEmulationTime", 117 },
+	{ "VpVmreadEmulationCount", 118 },
+	{ "VpVmreadEmulationTime", 119 },
+	{ "VpVmwriteEmulationCount", 120 },
+	{ "VpVmwriteEmulationTime", 121 },
+	{ "VpVmxoffEmulationCount", 122 },
+	{ "VpVmxoffEmulationTime", 123 },
+	{ "VpVmxonEmulationCount", 124 },
+	{ "VpVmxonEmulationTime", 125 },
+	{ "VpNestedVMEntriesCount", 126 },
+	{ "VpNestedVMEntriesTime", 127 },
+	{ "VpNestedSLATSoftPageFaultsCount", 128 },
+	{ "VpNestedSLATSoftPageFaultsTime", 129 },
+	{ "VpNestedSLATHardPageFaultsCount", 130 },
+	{ "VpNestedSLATHardPageFaultsTime", 131 },
+	{ "VpInvEptAllContextEmulationCount", 132 },
+	{ "VpInvEptAllContextEmulationTime", 133 },
+	{ "VpInvEptSingleContextEmulationCount", 134 },
+	{ "VpInvEptSingleContextEmulationTime", 135 },
+	{ "VpInvVpidAllContextEmulationCount", 136 },
+	{ "VpInvVpidAllContextEmulationTime", 137 },
+	{ "VpInvVpidSingleContextEmulationCount", 138 },
+	{ "VpInvVpidSingleContextEmulationTime", 139 },
+	{ "VpInvVpidSingleAddressEmulationCount", 140 },
+	{ "VpInvVpidSingleAddressEmulationTime", 141 },
+	{ "VpNestedTlbPageTableReclamations", 142 },
+	{ "VpNestedTlbPageTableEvictions", 143 },
+	{ "VpFlushGuestPhysicalAddressSpaceHypercalls", 144 },
+	{ "VpFlushGuestPhysicalAddressListHypercalls", 145 },
+	{ "VpPostedInterruptNotifications", 146 },
+	{ "VpPostedInterruptScans", 147 },
+	{ "VpTotalCoreRunTime", 148 },
+	{ "VpMaximumRunTime", 149 },
+	{ "VpHwpRequestContextSwitches", 150 },
+	{ "VpWaitingForCpuTimeBucket0", 151 },
+	{ "VpWaitingForCpuTimeBucket1", 152 },
+	{ "VpWaitingForCpuTimeBucket2", 153 },
+	{ "VpWaitingForCpuTimeBucket3", 154 },
+	{ "VpWaitingForCpuTimeBucket4", 155 },
+	{ "VpWaitingForCpuTimeBucket5", 156 },
+	{ "VpWaitingForCpuTimeBucket6", 157 },
+	{ "VpVmloadEmulationCount", 158 },
+	{ "VpVmloadEmulationTime", 159 },
+	{ "VpVmsaveEmulationCount", 160 },
+	{ "VpVmsaveEmulationTime", 161 },
+	{ "VpGifInstructionEmulationCount", 162 },
+	{ "VpGifInstructionEmulationTime", 163 },
+	{ "VpEmulatedErrataSvmInstructions", 164 },
+	{ "VpPlaceholder1", 165 },
+	{ "VpPlaceholder2", 166 },
+	{ "VpPlaceholder3", 167 },
+	{ "VpPlaceholder4", 168 },
+	{ "VpPlaceholder5", 169 },
+	{ "VpPlaceholder6", 170 },
+	{ "VpPlaceholder7", 171 },
+	{ "VpPlaceholder8", 172 },
+	{ "VpContentionTime", 173 },
+	{ "VpWakeUpTime", 174 },
+	{ "VpSchedulingPriority", 175 },
+	{ "VpRdpmcInstructionsCount", 176 },
+	{ "VpRdpmcInstructionsTime", 177 },
+	{ "VpPerfmonPmuMsrAccessesCount", 178 },
+	{ "VpPerfmonLbrMsrAccessesCount", 179 },
+	{ "VpPerfmonIptMsrAccessesCount", 180 },
+	{ "VpPerfmonInterruptCount", 181 },
+	{ "VpVtl1DispatchCount", 182 },
+	{ "VpVtl2DispatchCount", 183 },
+	{ "VpVtl2DispatchBucket0", 184 },
+	{ "VpVtl2DispatchBucket1", 185 },
+	{ "VpVtl2DispatchBucket2", 186 },
+	{ "VpVtl2DispatchBucket3", 187 },
+	{ "VpVtl2DispatchBucket4", 188 },
+	{ "VpVtl2DispatchBucket5", 189 },
+	{ "VpVtl2DispatchBucket6", 190 },
+	{ "VpVtl1RunTime", 191 },
+	{ "VpVtl2RunTime", 192 },
+	{ "VpIommuHypercalls", 193 },
+	{ "VpCpuGroupHypercalls", 194 },
+	{ "VpVsmHypercalls", 195 },
+	{ "VpEventLogHypercalls", 196 },
+	{ "VpDeviceDomainHypercalls", 197 },
+	{ "VpDepositHypercalls", 198 },
+	{ "VpSvmHypercalls", 199 },
+	{ "VpBusLockAcquisitionCount", 200 },
+	{ "VpLoadAvg", 201 },
+	{ "VpRootDispatchThreadBlocked", 202 },
+	{ "VpIdleCpuTime", 203 },
+	{ "VpWaitingForCpuTimeBucket7", 204 },
+	{ "VpWaitingForCpuTimeBucket8", 205 },
+	{ "VpWaitingForCpuTimeBucket9", 206 },
+	{ "VpWaitingForCpuTimeBucket10", 207 },
+	{ "VpWaitingForCpuTimeBucket11", 208 },
+	{ "VpWaitingForCpuTimeBucket12", 209 },
+	{ "VpHierarchicalSuspendTime", 210 },
+	{ "VpExpressSchedulingAttempts", 211 },
+	{ "VpExpressSchedulingCount", 212 },
+	{ "VpBusLockAcquisitionTime", 213 },
+#elif IS_ENABLED(CONFIG_ARM64)
+	{ "VpSysRegAccessesCount", 9 },
+	{ "VpSysRegAccessesTime", 10 },
+	{ "VpSmcInstructionsCount", 11 },
+	{ "VpSmcInstructionsTime", 12 },
+	{ "VpOtherInterceptsCount", 13 },
+	{ "VpOtherInterceptsTime", 14 },
+	{ "VpExternalInterruptsCount", 15 },
+	{ "VpExternalInterruptsTime", 16 },
+	{ "VpPendingInterruptsCount", 17 },
+	{ "VpPendingInterruptsTime", 18 },
+	{ "VpGuestPageTableMaps", 19 },
+	{ "VpLargePageTlbFills", 20 },
+	{ "VpSmallPageTlbFills", 21 },
+	{ "VpReflectedGuestPageFaults", 22 },
+	{ "VpMemoryInterceptMessages", 23 },
+	{ "VpOtherMessages", 24 },
+	{ "VpLogicalProcessorMigrations", 25 },
+	{ "VpAddressDomainFlushes", 26 },
+	{ "VpAddressSpaceFlushes", 27 },
+	{ "VpSyntheticInterrupts", 28 },
+	{ "VpVirtualInterrupts", 29 },
+	{ "VpApicSelfIpisSent", 30 },
+	{ "VpGpaSpaceHypercalls", 31 },
+	{ "VpLogicalProcessorHypercalls", 32 },
+	{ "VpLongSpinWaitHypercalls", 33 },
+	{ "VpOtherHypercalls", 34 },
+	{ "VpSyntheticInterruptHypercalls", 35 },
+	{ "VpVirtualInterruptHypercalls", 36 },
+	{ "VpVirtualMmuHypercalls", 37 },
+	{ "VpVirtualProcessorHypercalls", 38 },
+	{ "VpHardwareInterrupts", 39 },
+	{ "VpNestedPageFaultInterceptsCount", 40 },
+	{ "VpNestedPageFaultInterceptsTime", 41 },
+	{ "VpLogicalProcessorDispatches", 42 },
+	{ "VpWaitingForCpuTime", 43 },
+	{ "VpExtendedHypercalls", 44 },
+	{ "VpExtendedHypercallInterceptMessages", 45 },
+	{ "VpMbecNestedPageTableSwitches", 46 },
+	{ "VpOtherReflectedGuestExceptions", 47 },
+	{ "VpGlobalIoTlbFlushes", 48 },
+	{ "VpGlobalIoTlbFlushCost", 49 },
+	{ "VpLocalIoTlbFlushes", 50 },
+	{ "VpLocalIoTlbFlushCost", 51 },
+	{ "VpFlushGuestPhysicalAddressSpaceHypercalls", 52 },
+	{ "VpFlushGuestPhysicalAddressListHypercalls", 53 },
+	{ "VpPostedInterruptNotifications", 54 },
+	{ "VpPostedInterruptScans", 55 },
+	{ "VpTotalCoreRunTime", 56 },
+	{ "VpMaximumRunTime", 57 },
+	{ "VpWaitingForCpuTimeBucket0", 58 },
+	{ "VpWaitingForCpuTimeBucket1", 59 },
+	{ "VpWaitingForCpuTimeBucket2", 60 },
+	{ "VpWaitingForCpuTimeBucket3", 61 },
+	{ "VpWaitingForCpuTimeBucket4", 62 },
+	{ "VpWaitingForCpuTimeBucket5", 63 },
+	{ "VpWaitingForCpuTimeBucket6", 64 },
+	{ "VpHwpRequestContextSwitches", 65 },
+	{ "VpPlaceholder2", 66 },
+	{ "VpPlaceholder3", 67 },
+	{ "VpPlaceholder4", 68 },
+	{ "VpPlaceholder5", 69 },
+	{ "VpPlaceholder6", 70 },
+	{ "VpPlaceholder7", 71 },
+	{ "VpPlaceholder8", 72 },
+	{ "VpContentionTime", 73 },
+	{ "VpWakeUpTime", 74 },
+	{ "VpSchedulingPriority", 75 },
+	{ "VpVtl1DispatchCount", 76 },
+	{ "VpVtl2DispatchCount", 77 },
+	{ "VpVtl2DispatchBucket0", 78 },
+	{ "VpVtl2DispatchBucket1", 79 },
+	{ "VpVtl2DispatchBucket2", 80 },
+	{ "VpVtl2DispatchBucket3", 81 },
+	{ "VpVtl2DispatchBucket4", 82 },
+	{ "VpVtl2DispatchBucket5", 83 },
+	{ "VpVtl2DispatchBucket6", 84 },
+	{ "VpVtl1RunTime", 85 },
+	{ "VpVtl2RunTime", 86 },
+	{ "VpIommuHypercalls", 87 },
+	{ "VpCpuGroupHypercalls", 88 },
+	{ "VpVsmHypercalls", 89 },
+	{ "VpEventLogHypercalls", 90 },
+	{ "VpDeviceDomainHypercalls", 91 },
+	{ "VpDepositHypercalls", 92 },
+	{ "VpSvmHypercalls", 93 },
+	{ "VpLoadAvg", 94 },
+	{ "VpRootDispatchThreadBlocked", 95 },
+	{ "VpIdleCpuTime", 96 },
+	{ "VpWaitingForCpuTimeBucket7", 97 },
+	{ "VpWaitingForCpuTimeBucket8", 98 },
+	{ "VpWaitingForCpuTimeBucket9", 99 },
+	{ "VpWaitingForCpuTimeBucket10", 100 },
+	{ "VpWaitingForCpuTimeBucket11", 101 },
+	{ "VpWaitingForCpuTimeBucket12", 102 },
+	{ "VpHierarchicalSuspendTime", 103 },
+	{ "VpExpressSchedulingAttempts", 104 },
+	{ "VpExpressSchedulingCount", 105 },
+#endif
+};
+
-- 
2.34.1


^ permalink raw reply related

* [PATCH v4 7/7] mshv: Add debugfs to view hypervisor statistics
From: Nuno Das Neves @ 2026-01-21 21:46 UTC (permalink / raw)
  To: linux-hyperv, linux-kernel, mhklinux, skinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, prapal, mrathor,
	paekkaladevi, Nuno Das Neves, Jinank Jain
In-Reply-To: <20260121214623.76374-1-nunodasneves@linux.microsoft.com>

Introduce a debugfs interface to expose root and child partition stats
when running with mshv_root.

Create a debugfs directory "mshv" containing 'stats' files organized by
type and id. A stats file contains a number of counters depending on
its type. For example, an excerpt from a VP stats file:

TotalRunTime                  : 1997602722
HypervisorRunTime             : 649671371
RemoteNodeRunTime             : 0
NormalizedRunTime             : 1997602721
IdealCpu                      : 0
HypercallsCount               : 1708169
HypercallsTime                : 111914774
PageInvalidationsCount        : 0
PageInvalidationsTime         : 0

On a root partition with some active child partitions, the entire
directory structure may look like:

mshv/
  stats             # hypervisor stats
  lp/               # logical processors
    0/              # LP id
      stats         # LP 0 stats
    1/
    2/
    3/
  partition/        # partition stats
    1/              # root partition id
      stats         # root partition stats
      vp/           # root virtual processors
        0/          # root VP id
          stats     # root VP 0 stats
        1/
        2/
        3/
    42/             # child partition id
      stats         # child partition stats
      vp/           # child VPs
        0/          # child VP id
          stats     # child VP 0 stats
        1/
    43/
    55/

On L1VH, some stats are not present as it does not own the hardware
like the root partition does:
- The hypervisor and lp stats are not present
- L1VH's partition directory is named "self" because it can't get its
  own id
- Some of L1VH's partition and VP stats fields are not populated, because
  it can't map its own HV_STATS_AREA_PARENT page.

Co-developed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Co-developed-by: Praveen K Paladugu <prapal@linux.microsoft.com>
Signed-off-by: Praveen K Paladugu <prapal@linux.microsoft.com>
Co-developed-by: Mukesh Rathor <mrathor@linux.microsoft.com>
Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
Co-developed-by: Purna Pavan Chandra Aekkaladevi <paekkaladevi@linux.microsoft.com>
Signed-off-by: Purna Pavan Chandra Aekkaladevi <paekkaladevi@linux.microsoft.com>
Co-developed-by: Jinank Jain <jinankjain@microsoft.com>
Signed-off-by: Jinank Jain <jinankjain@microsoft.com>
Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/Makefile         |   1 +
 drivers/hv/hv_counters.c    |   1 +
 drivers/hv/hv_synic.c       | 177 +++++++++
 drivers/hv/mshv_debugfs.c   | 703 ++++++++++++++++++++++++++++++++++++
 drivers/hv/mshv_root.h      |  34 ++
 drivers/hv/mshv_root_main.c |  26 +-
 6 files changed, 940 insertions(+), 2 deletions(-)
 create mode 100644 drivers/hv/hv_synic.c
 create mode 100644 drivers/hv/mshv_debugfs.c

diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index a49f93c2d245..2593711c3628 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -15,6 +15,7 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING)	+= hv_debugfs.o
 hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
 mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
 	       mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
+mshv_root-$(CONFIG_DEBUG_FS) += mshv_debugfs.o
 mshv_vtl-y := mshv_vtl_main.o
 
 # Code that must be built-in
diff --git a/drivers/hv/hv_counters.c b/drivers/hv/hv_counters.c
index a8e07e72cc29..45ff3d663e56 100644
--- a/drivers/hv/hv_counters.c
+++ b/drivers/hv/hv_counters.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2026, Microsoft Corporation.
  *
  * Data for printing stats page counters via debugfs.
+ * Included directly in mshv_debugfs.c.
  *
  * Authors: Microsoft Linux virtualization team
  */
diff --git a/drivers/hv/hv_synic.c b/drivers/hv/hv_synic.c
new file mode 100644
index 000000000000..cc81d78887f2
--- /dev/null
+++ b/drivers/hv/hv_synic.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+/*
+	root	l1vh	vtl
+vmbus
+
+guest
+vmbus, nothing else
+
+vtl
+mshv_vtl uses intercept SINT, VTL2_VMBUS_SINT_INDEX (7; not defined in hvgdk_mini.h)
+vmbus
+
+bm root
+mshv_root, no vmbus
+
+nested root
+mshv_root uses L1
+vmbus uses L0 (NESTED regs)
+
+l1vh
+mshv_root and vmbus use same regs
+
+*/
+
+struct hv_synic_page {
+	u64 msr;
+	void *ptr;
+	struct kref refcount;
+};
+
+void *hv_get_synic_page(u32 msr) {
+	struct hv_synic_page *page_obj;
+	page_obj = kmalloc
+}
+
+
+#define HV_SYNIC_PAGE_STRUCT(type, name) \
+struct 
+
+/* UGH */
+struct hv_percpu_synic_cxt {
+	struct {
+		struct hv_message_page *ptr;
+		refcount_t pt_ref_count;
+	} hv_simp;
+	struct hv_message_page *hv_simp;
+	struct hv_synic_event_flags_page *hv_siefp;
+	struct hv_synic_event_ring_page *hv_sierp;
+};
+
+int hv_setup_sint(u32 sint_msr)
+{
+	union hv_synic_sint sint;
+
+	// TODO validate sint_msr
+
+	sint.as_uint64 = hv_get_msr(sint_msr);
+	sint.vector = vmbus_interrupt;
+	sint.masked = false;
+	sint.auto_eoi = hv_recommend_using_aeoi();
+
+	hv_set_msr(sint_msr, sint.as_uint64);
+
+	return 0;
+}
+
+/* Enable the SynIC page register @msr; returns the page's kernel address or NULL. */
+void *hv_setup_synic_page(u32 msr)
+{
+	void *addr;
+	struct hv_synic_page synic_page;
+
+	// TODO validate msr
+	// NOTE(review): struct hv_synic_page has no as_uint64/enabled/gpa members
+	synic_page.as_uint64 = hv_get_msr(msr);
+	synic_page.enabled = 1;
+
+	if (ms_hyperv.paravisor_present || hv_root_partition()) {
+		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
+		u64 base = (synic_page.gpa << HV_HYP_PAGE_SHIFT) &
+			    ~ms_hyperv.shared_gpa_boundary;
+		addr = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
+		if (!addr) {
+			pr_err("%s: Fail to map synic page from %#x.\n",
+			       __func__, msr);
+			return NULL;
+		}
+	} else {
+		/* Otherwise allocate a zeroed page and program its GPA into the register */
+		addr = (void *)__get_free_page(GFP_KERNEL);
+		if (!addr)
+			return NULL;
+		memset(addr, 0, PAGE_SIZE);
+		synic_page.gpa = virt_to_phys(addr) >> HV_HYP_PAGE_SHIFT;
+	}
+	hv_set_msr(msr, synic_page.as_uint64);
+	return addr;
+}
+
+/*
+ * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller
+ * with the hypervisor.
+ */
+void hv_hyp_synic_enable_regs(unsigned int cpu)
+{
+	struct hv_per_cpu_context *hv_cpu =
+		per_cpu_ptr(hv_context.cpu_context, cpu);
+	union hv_synic_simp simp;
+	union hv_synic_siefp siefp;
+	union hv_synic_sint shared_sint;
+
+	/* Setup the Synic's message page with the hypervisor. */
+	simp.as_uint64 = hv_get_msr(HV_MSR_SIMP);
+	simp.simp_enabled = 1;
+
+	if (ms_hyperv.paravisor_present || hv_root_partition()) {
+		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
+		u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
+				~ms_hyperv.shared_gpa_boundary;
+		hv_cpu->hyp_synic_message_page =
+			(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
+		if (!hv_cpu->hyp_synic_message_page)
+			pr_err("Fail to map synic message page.\n");
+	} else {
+		simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page)
+			>> HV_HYP_PAGE_SHIFT;
+	}
+
+	hv_set_msr(HV_MSR_SIMP, simp.as_uint64);
+
+	/* Setup the Synic's event page with the hypervisor. */
+	siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP);
+	siefp.siefp_enabled = 1;
+
+	if (ms_hyperv.paravisor_present || hv_root_partition()) {
+		/* Mask out vTOM bit. ioremap_cache() maps decrypted */
+		u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
+				~ms_hyperv.shared_gpa_boundary;
+		hv_cpu->hyp_synic_event_page =
+			(void *)ioremap_cache(base, HV_HYP_PAGE_SIZE);
+		if (!hv_cpu->hyp_synic_event_page)
+			pr_err("Fail to map synic event page.\n");
+	} else {
+		siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page)
+			>> HV_HYP_PAGE_SHIFT;
+	}
+
+	hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64);
+	hv_enable_coco_interrupt(cpu, vmbus_interrupt, true);
+
+	/* Setup the shared SINT. */
+	if (vmbus_irq != -1)
+		enable_percpu_irq(vmbus_irq, 0);
+	shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT);
+
+	shared_sint.vector = vmbus_interrupt;
+	shared_sint.masked = false;
+	shared_sint.auto_eoi = hv_recommend_using_aeoi();
+	hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
+}
+
+static void hv_hyp_synic_enable_interrupts(void)
+{
+	union hv_synic_scontrol sctrl;
+
+	/* Enable the global synic bit */
+	sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL);
+	sctrl.enable = 1;
+
+	hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64);
+}
diff --git a/drivers/hv/mshv_debugfs.c b/drivers/hv/mshv_debugfs.c
new file mode 100644
index 000000000000..72eb0ae44e4b
--- /dev/null
+++ b/drivers/hv/mshv_debugfs.c
@@ -0,0 +1,703 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2026, Microsoft Corporation.
+ *
+ * The /sys/kernel/debug/mshv directory contents.
+ * Contains various statistics data, provided by the hypervisor.
+ *
+ * Authors: Microsoft Linux virtualization team
+ */
+
+#include <linux/debugfs.h>
+#include <linux/stringify.h>
+#include <asm/mshyperv.h>
+#include <linux/slab.h>
+
+#include "mshv.h"
+#include "mshv_root.h"
+
+#include "hv_counters.c"
+
+#define U32_BUF_SZ 11
+#define U64_BUF_SZ 21
+#define NUM_STATS_AREAS (HV_STATS_AREA_PARENT + 1)
+
+static struct dentry *mshv_debugfs;
+static struct dentry *mshv_debugfs_partition;
+static struct dentry *mshv_debugfs_lp;
+static struct dentry **parent_vp_stats;
+static struct dentry *parent_partition_stats;
+
+static u64 mshv_lps_count;
+static struct hv_stats_page **mshv_lps_stats;
+
+static int lp_stats_show(struct seq_file *m, void *v)
+{
+	const struct hv_stats_page *stats = m->private;
+	struct hv_counter_entry *entry = hv_lp_counters;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(hv_lp_counters); i++, entry++)
+		seq_printf(m, "%-29s: %llu\n", entry->name,
+			   stats->data[entry->idx]);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(lp_stats);
+
+static void mshv_lp_stats_unmap(u32 lp_index)
+{
+	union hv_stats_object_identity identity = {
+		.lp.lp_index = lp_index,
+		.lp.stats_area_type = HV_STATS_AREA_SELF,
+	};
+	int err;
+
+	err = hv_unmap_stats_page(HV_STATS_OBJECT_LOGICAL_PROCESSOR,
+				  mshv_lps_stats[lp_index], &identity);
+	if (err)
+		pr_err("%s: failed to unmap logical processor %u stats, err: %d\n",
+		       __func__, lp_index, err);
+}
+
+static struct hv_stats_page * __init mshv_lp_stats_map(u32 lp_index)
+{
+	union hv_stats_object_identity identity = {
+		.lp.lp_index = lp_index,
+		.lp.stats_area_type = HV_STATS_AREA_SELF,
+	};
+	struct hv_stats_page *stats;
+	int err;
+
+	err = hv_map_stats_page(HV_STATS_OBJECT_LOGICAL_PROCESSOR, &identity,
+				&stats);
+	if (err) {
+		pr_err("%s: failed to map logical processor %u stats, err: %d\n",
+		       __func__, lp_index, err);
+		return ERR_PTR(err);
+	}
+	mshv_lps_stats[lp_index] = stats;
+
+	return stats;
+}
+
+static struct hv_stats_page * __init lp_debugfs_stats_create(u32 lp_index,
+							     struct dentry *parent)
+{
+	struct dentry *dentry;
+	struct hv_stats_page *stats;
+
+	stats = mshv_lp_stats_map(lp_index);
+	if (IS_ERR(stats))
+		return stats;
+
+	dentry = debugfs_create_file("stats", 0400, parent,
+				     stats, &lp_stats_fops);
+	if (IS_ERR(dentry)) {
+		mshv_lp_stats_unmap(lp_index);
+		return ERR_CAST(dentry);
+	}
+	return stats;
+}
+
+static int __init lp_debugfs_create(u32 lp_index, struct dentry *parent)
+{
+	struct dentry *idx;
+	char lp_idx_str[U32_BUF_SZ];
+	struct hv_stats_page *stats;
+	int err;
+
+	sprintf(lp_idx_str, "%u", lp_index);
+
+	idx = debugfs_create_dir(lp_idx_str, parent);
+	if (IS_ERR(idx))
+		return PTR_ERR(idx);
+
+	stats = lp_debugfs_stats_create(lp_index, idx);
+	if (IS_ERR(stats)) {
+		err = PTR_ERR(stats);
+		goto remove_debugfs_lp_idx;
+	}
+
+	return 0;
+
+remove_debugfs_lp_idx:
+	debugfs_remove_recursive(idx);
+	return err;
+}
+
+static void mshv_debugfs_lp_remove(void)
+{
+	int lp_index;
+
+	debugfs_remove_recursive(mshv_debugfs_lp);
+
+	for (lp_index = 0; lp_index < mshv_lps_count; lp_index++)
+		mshv_lp_stats_unmap(lp_index);
+
+	kfree(mshv_lps_stats);
+	mshv_lps_stats = NULL;
+}
+
+/* Create lp/<index>/stats under @parent for each of the mshv_lps_count LPs. */
+static int __init mshv_debugfs_lp_create(struct dentry *parent)
+{
+	struct dentry *lp_dir;
+	int err, lp_index;
+
+	mshv_lps_stats = kcalloc(mshv_lps_count,
+				 sizeof(*mshv_lps_stats),
+				 GFP_KERNEL_ACCOUNT);
+	if (!mshv_lps_stats)
+		return -ENOMEM;
+
+	lp_dir = debugfs_create_dir("lp", parent);
+	if (IS_ERR(lp_dir)) {
+		err = PTR_ERR(lp_dir);
+		goto free_lp_stats;
+	}
+
+	for (lp_index = 0; lp_index < mshv_lps_count; lp_index++) {
+		err = lp_debugfs_create(lp_index, lp_dir);
+		if (err)
+			goto remove_debugfs_lps;
+	}
+
+	mshv_debugfs_lp = lp_dir;
+
+	return 0;
+
+remove_debugfs_lps:
+	for (lp_index -= 1; lp_index >= 0; lp_index--)
+		mshv_lp_stats_unmap(lp_index);
+	debugfs_remove_recursive(lp_dir);
+free_lp_stats:
+	kfree(mshv_lps_stats);
+	mshv_lps_stats = NULL;
+	return err;
+}
+
+static int vp_stats_show(struct seq_file *m, void *v)
+{
+	const struct hv_stats_page **pstats = m->private;
+	struct hv_counter_entry *entry = hv_vp_counters;
+	int i;
+
+	/*
+	 * For VP and partition stats, there may be two stats areas mapped,
+	 * SELF and PARENT. These refer to the privilege level of the data in
+	 * each page. Some fields may be 0 in SELF and nonzero in PARENT, or
+	 * vice versa.
+	 *
+	 * Hence, prioritize printing from the PARENT page (more privileged
+	 * data), but use the value from the SELF page if the PARENT value is
+	 * 0.
+	 */
+
+	for (i = 0; i < ARRAY_SIZE(hv_vp_counters); i++, entry++) {
+		u64 parent_val = pstats[HV_STATS_AREA_PARENT]->data[entry->idx];
+		u64 self_val = pstats[HV_STATS_AREA_SELF]->data[entry->idx];
+
+		seq_printf(m, "%-43s: %llu\n", entry->name,
+			   parent_val ? parent_val : self_val);
+	}
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(vp_stats);
+
+static void vp_debugfs_remove(struct dentry *vp_stats)
+{
+	debugfs_remove_recursive(vp_stats->d_parent);
+}
+
+static int vp_debugfs_create(u64 partition_id, u32 vp_index,
+			     struct hv_stats_page **pstats,
+			     struct dentry **vp_stats_ptr,
+			     struct dentry *parent)
+{
+	struct dentry *vp_idx_dir, *d;
+	char vp_idx_str[U32_BUF_SZ];
+	int err;
+
+	sprintf(vp_idx_str, "%u", vp_index);
+
+	vp_idx_dir = debugfs_create_dir(vp_idx_str, parent);
+	if (IS_ERR(vp_idx_dir))
+		return PTR_ERR(vp_idx_dir);
+
+	d = debugfs_create_file("stats", 0400, vp_idx_dir,
+				     pstats, &vp_stats_fops);
+	if (IS_ERR(d)) {
+		err = PTR_ERR(d);
+		goto remove_debugfs_vp_idx;
+	}
+
+	*vp_stats_ptr = d;
+
+	return 0;
+
+remove_debugfs_vp_idx:
+	debugfs_remove_recursive(vp_idx_dir);
+	return err;
+}
+
+static int partition_stats_show(struct seq_file *m, void *v)
+{
+	const struct hv_stats_page **pstats = m->private;
+	struct hv_counter_entry *entry = hv_partition_counters;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(hv_partition_counters); i++, entry++) {
+		u64 parent_val = pstats[HV_STATS_AREA_PARENT]->data[entry->idx];
+		u64 self_val = pstats[HV_STATS_AREA_SELF]->data[entry->idx];
+
+		seq_printf(m, "%-32s: %llu\n", entry->name,
+			   parent_val ? parent_val : self_val);
+	}
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(partition_stats);
+
+/* Unmap one stats area page of @partition_id; a failure is only logged. */
+static void mshv_partition_stats_unmap(u64 partition_id,
+				       struct hv_stats_page *stats_page,
+				       enum hv_stats_area_type stats_area_type)
+{
+	union hv_stats_object_identity identity = {
+		.partition.partition_id = partition_id,
+		.partition.stats_area_type = stats_area_type,
+	};
+	int err = hv_unmap_stats_page(HV_STATS_OBJECT_PARTITION, stats_page,
+				      &identity);
+
+	if (err)
+		pr_err("%s: failed to unmap partition %llu %s stats, err: %d\n",
+		       __func__, partition_id,
+		       (stats_area_type == HV_STATS_AREA_SELF) ? "self" : "parent",
+		       err);
+}
+
+/* Map one stats area of @partition_id; returns the mapped page or ERR_PTR(err). */
+static struct hv_stats_page *mshv_partition_stats_map(u64 partition_id,
+						      enum hv_stats_area_type stats_area_type)
+{
+	union hv_stats_object_identity identity = {
+		.partition.partition_id = partition_id,
+		.partition.stats_area_type = stats_area_type,
+	};
+	struct hv_stats_page *stats;
+	int err = hv_map_stats_page(HV_STATS_OBJECT_PARTITION, &identity, &stats);
+
+	if (err) {
+		pr_err("%s: failed to map partition %llu %s stats, err: %d\n",
+		       __func__, partition_id,
+		       (stats_area_type == HV_STATS_AREA_SELF) ? "self" : "parent",
+		       err);
+		return ERR_PTR(err);
+	}
+	return stats;
+}
+
+static int mshv_debugfs_partition_stats_create(u64 partition_id,
+					    struct dentry **partition_stats_ptr,
+					    struct dentry *parent)
+{
+	struct dentry *dentry;
+	struct hv_stats_page **pstats;
+	int err;
+
+	pstats = kcalloc(NUM_STATS_AREAS, sizeof(struct hv_stats_page *),
+			 GFP_KERNEL_ACCOUNT);
+	if (!pstats)
+		return -ENOMEM;
+
+	pstats[HV_STATS_AREA_SELF] = mshv_partition_stats_map(partition_id,
+							      HV_STATS_AREA_SELF);
+	if (IS_ERR(pstats[HV_STATS_AREA_SELF])) {
+		err = PTR_ERR(pstats[HV_STATS_AREA_SELF]);
+		goto cleanup;
+	}
+
+	/*
+	 * L1VH partition cannot access its partition stats in parent area.
+	 */
+	if (is_l1vh_parent(partition_id)) {
+		pstats[HV_STATS_AREA_PARENT] = pstats[HV_STATS_AREA_SELF];
+	} else {
+		pstats[HV_STATS_AREA_PARENT] = mshv_partition_stats_map(partition_id,
+									HV_STATS_AREA_PARENT);
+		if (IS_ERR(pstats[HV_STATS_AREA_PARENT])) {
+			err = PTR_ERR(pstats[HV_STATS_AREA_PARENT]);
+			goto unmap_self;
+		}
+		if (!pstats[HV_STATS_AREA_PARENT])
+			pstats[HV_STATS_AREA_PARENT] = pstats[HV_STATS_AREA_SELF];
+	}
+
+	dentry = debugfs_create_file("stats", 0400, parent,
+				     pstats, &partition_stats_fops);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto unmap_partition_stats;
+	}
+
+	*partition_stats_ptr = dentry;
+	return 0;
+
+unmap_partition_stats:
+	if (pstats[HV_STATS_AREA_PARENT] != pstats[HV_STATS_AREA_SELF])
+		mshv_partition_stats_unmap(partition_id, pstats[HV_STATS_AREA_PARENT],
+					   HV_STATS_AREA_PARENT);
+unmap_self:
+	mshv_partition_stats_unmap(partition_id, pstats[HV_STATS_AREA_SELF],
+				   HV_STATS_AREA_SELF);
+cleanup:
+	kfree(pstats);
+	return err;
+}
+
+static void partition_debugfs_remove(u64 partition_id, struct dentry *dentry)
+{
+	struct hv_stats_page **pstats = NULL;
+
+	pstats = dentry->d_inode->i_private;
+
+	debugfs_remove_recursive(dentry->d_parent);
+
+	if (pstats[HV_STATS_AREA_PARENT] != pstats[HV_STATS_AREA_SELF]) {
+		mshv_partition_stats_unmap(partition_id,
+					   pstats[HV_STATS_AREA_PARENT],
+					   HV_STATS_AREA_PARENT);
+	}
+
+	mshv_partition_stats_unmap(partition_id,
+				   pstats[HV_STATS_AREA_SELF],
+				   HV_STATS_AREA_SELF);
+
+	kfree(pstats);
+}
+
+static int partition_debugfs_create(u64 partition_id,
+				    struct dentry **vp_dir_ptr,
+				    struct dentry **partition_stats_ptr,
+				    struct dentry *parent)
+{
+	char part_id_str[U64_BUF_SZ];
+	struct dentry *part_id_dir, *vp_dir;
+	int err;
+
+	if (is_l1vh_parent(partition_id))
+		sprintf(part_id_str, "self");
+	else
+		sprintf(part_id_str, "%llu", partition_id);
+
+	part_id_dir = debugfs_create_dir(part_id_str, parent);
+	if (IS_ERR(part_id_dir))
+		return PTR_ERR(part_id_dir);
+
+	vp_dir = debugfs_create_dir("vp", part_id_dir);
+	if (IS_ERR(vp_dir)) {
+		err = PTR_ERR(vp_dir);
+		goto remove_debugfs_partition_id;
+	}
+
+	err = mshv_debugfs_partition_stats_create(partition_id,
+						  partition_stats_ptr,
+						  part_id_dir);
+	if (err)
+		goto remove_debugfs_partition_id;
+
+	*vp_dir_ptr = vp_dir;
+
+	return 0;
+
+remove_debugfs_partition_id:
+	debugfs_remove_recursive(part_id_dir);
+	return err;
+}
+
+static void parent_vp_debugfs_remove(u32 vp_index,
+				     struct dentry *vp_stats_ptr)
+{
+	struct hv_stats_page **pstats;
+
+	pstats = vp_stats_ptr->d_inode->i_private;
+	vp_debugfs_remove(vp_stats_ptr);
+	mshv_vp_stats_unmap(hv_current_partition_id, vp_index, pstats);
+	kfree(pstats);
+}
+
+/* Remove the parent partition's VP entries (keyed by hv_vp_index[]) and its stats. */
+static void mshv_debugfs_parent_partition_remove(void)
+{
+	int idx;
+
+	for_each_online_cpu(idx)
+		parent_vp_debugfs_remove(hv_vp_index[idx],
+					 parent_vp_stats[idx]);
+
+	partition_debugfs_remove(hv_current_partition_id,
+				 parent_partition_stats);
+	kfree(parent_vp_stats);
+	parent_vp_stats = NULL;
+	parent_partition_stats = NULL;
+}
+
+static int __init parent_vp_debugfs_create(u32 vp_index,
+					   struct dentry **vp_stats_ptr,
+					   struct dentry *parent)
+{
+	struct hv_stats_page **pstats;
+	int err;
+
+	pstats = kcalloc(2, sizeof(struct hv_stats_page *), GFP_KERNEL_ACCOUNT);
+	if (!pstats)
+		return -ENOMEM;
+
+	err = mshv_vp_stats_map(hv_current_partition_id, vp_index, pstats);
+	if (err)
+		goto cleanup;
+
+	err = vp_debugfs_create(hv_current_partition_id, vp_index, pstats,
+				vp_stats_ptr, parent);
+	if (err)
+		goto unmap_vp_stats;
+
+	return 0;
+
+unmap_vp_stats:
+	mshv_vp_stats_unmap(hv_current_partition_id, vp_index, pstats);
+cleanup:
+	kfree(pstats);
+	return err;
+}
+
+/* Create partition/ with the parent partition's stats and one vp entry per online CPU. */
+static int __init mshv_debugfs_parent_partition_create(void)
+{
+	struct dentry *vp_dir;
+	int err, idx, i;
+
+	mshv_debugfs_partition = debugfs_create_dir("partition", mshv_debugfs);
+	if (IS_ERR(mshv_debugfs_partition))
+		return PTR_ERR(mshv_debugfs_partition);
+
+	err = partition_debugfs_create(hv_current_partition_id,
+				       &vp_dir,
+				       &parent_partition_stats,
+				       mshv_debugfs_partition);
+	if (err)
+		goto remove_debugfs_partition;
+
+	parent_vp_stats = kcalloc(nr_cpu_ids,
+				  sizeof(*parent_vp_stats),
+				  GFP_KERNEL);
+	if (!parent_vp_stats) {
+		err = -ENOMEM;
+		goto remove_partition_stats;
+	}
+
+	for_each_online_cpu(idx) {
+		err = parent_vp_debugfs_create(hv_vp_index[idx],
+					       &parent_vp_stats[idx],
+					       vp_dir);
+		if (err)
+			goto remove_debugfs_partition_vp;
+	}
+
+	return 0;
+
+remove_debugfs_partition_vp:
+	for_each_online_cpu(i) {
+		if (i >= idx)
+			break;
+		parent_vp_debugfs_remove(hv_vp_index[i], parent_vp_stats[i]);
+	}
+	kfree(parent_vp_stats);
+	parent_vp_stats = NULL;
+remove_partition_stats:
+	/* Also unmaps the partition stats pages and frees their array. */
+	partition_debugfs_remove(hv_current_partition_id,
+				 parent_partition_stats);
+	parent_partition_stats = NULL;
+remove_debugfs_partition:
+	debugfs_remove_recursive(mshv_debugfs_partition);
+	mshv_debugfs_partition = NULL;
+	return err;
+}
+
+static int hv_stats_show(struct seq_file *m, void *v)
+{
+	const struct hv_stats_page *stats = m->private;
+	struct hv_counter_entry *entry = hv_hypervisor_counters;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(hv_hypervisor_counters); i++, entry++)
+		seq_printf(m, "%-25s: %llu\n", entry->name,
+			   stats->data[entry->idx]);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(hv_stats);
+
+static void mshv_hv_stats_unmap(void)
+{
+	union hv_stats_object_identity identity = {
+		.hv.stats_area_type = HV_STATS_AREA_SELF,
+	};
+	int err;
+
+	err = hv_unmap_stats_page(HV_STATS_OBJECT_HYPERVISOR, NULL, &identity);
+	if (err)
+		pr_err("%s: failed to unmap hypervisor stats: %d\n",
+		       __func__, err);
+}
+
+static void * __init mshv_hv_stats_map(void)
+{
+	union hv_stats_object_identity identity = {
+		.hv.stats_area_type = HV_STATS_AREA_SELF,
+	};
+	struct hv_stats_page *stats;
+	int err;
+
+	err = hv_map_stats_page(HV_STATS_OBJECT_HYPERVISOR, &identity, &stats);
+	if (err) {
+		pr_err("%s: failed to map hypervisor stats: %d\n",
+		       __func__, err);
+		return ERR_PTR(err);
+	}
+	return stats;
+}
+
+static int __init mshv_debugfs_hv_stats_create(struct dentry *parent)
+{
+	struct dentry *dentry;
+	u64 *stats;
+	int err;
+
+	stats = mshv_hv_stats_map();
+	if (IS_ERR(stats))
+		return PTR_ERR(stats);
+
+	dentry = debugfs_create_file("stats", 0400, parent,
+				     stats, &hv_stats_fops);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		pr_err("%s: failed to create hypervisor stats dentry: %d\n",
+		       __func__, err);
+		goto unmap_hv_stats;
+	}
+
+	mshv_lps_count = num_present_cpus();
+
+	return 0;
+
+unmap_hv_stats:
+	mshv_hv_stats_unmap();
+	return err;
+}
+
+int mshv_debugfs_vp_create(struct mshv_vp *vp)
+{
+	struct mshv_partition *p = vp->vp_partition;
+
+	if (!mshv_debugfs)
+		return 0;
+
+	return vp_debugfs_create(p->pt_id, vp->vp_index,
+				 vp->vp_stats_pages,
+				 &vp->vp_stats_dentry,
+				 p->pt_vp_dentry);
+}
+
+void mshv_debugfs_vp_remove(struct mshv_vp *vp)
+{
+	if (!mshv_debugfs)
+		return;
+
+	vp_debugfs_remove(vp->vp_stats_dentry);
+}
+
+int mshv_debugfs_partition_create(struct mshv_partition *partition)
+{
+	int err;
+
+	if (!mshv_debugfs)
+		return 0;
+
+	err = partition_debugfs_create(partition->pt_id,
+				       &partition->pt_vp_dentry,
+				       &partition->pt_stats_dentry,
+				       mshv_debugfs_partition);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+void mshv_debugfs_partition_remove(struct mshv_partition *partition)
+{
+	if (!mshv_debugfs)
+		return;
+
+	partition_debugfs_remove(partition->pt_id,
+				 partition->pt_stats_dentry);
+}
+
+int __init mshv_debugfs_init(void)
+{
+	int err;
+
+	mshv_debugfs = debugfs_create_dir("mshv", NULL);
+	if (IS_ERR(mshv_debugfs)) {
+		pr_err("%s: failed to create debugfs directory\n", __func__);
+		return PTR_ERR(mshv_debugfs);
+	}
+
+	if (hv_root_partition()) {
+		err = mshv_debugfs_hv_stats_create(mshv_debugfs);
+		if (err)
+			goto remove_mshv_dir;
+
+		err = mshv_debugfs_lp_create(mshv_debugfs);
+		if (err)
+			goto unmap_hv_stats;
+	}
+
+	err = mshv_debugfs_parent_partition_create();
+	if (err)
+		goto unmap_lp_stats;
+
+	return 0;
+
+unmap_lp_stats:
+	if (hv_root_partition()) {
+		mshv_debugfs_lp_remove();
+		mshv_debugfs_lp = NULL;
+	}
+unmap_hv_stats:
+	if (hv_root_partition())
+		mshv_hv_stats_unmap();
+remove_mshv_dir:
+	debugfs_remove_recursive(mshv_debugfs);
+	mshv_debugfs = NULL;
+	return err;
+}
+
+void mshv_debugfs_exit(void)
+{
+	mshv_debugfs_parent_partition_remove();
+
+	if (hv_root_partition()) {
+		mshv_debugfs_lp_remove();
+		mshv_debugfs_lp = NULL;
+		mshv_hv_stats_unmap();
+	}
+
+	debugfs_remove_recursive(mshv_debugfs);
+	mshv_debugfs = NULL;
+	mshv_debugfs_partition = NULL;
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index e4912b0618fa..7332d9af8373 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -52,6 +52,9 @@ struct mshv_vp {
 		unsigned int kicked_by_hv;
 		wait_queue_head_t vp_suspend_queue;
 	} run;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+	struct dentry *vp_stats_dentry;
+#endif
 };
 
 #define vp_fmt(fmt) "p%lluvp%u: " fmt
@@ -136,6 +139,10 @@ struct mshv_partition {
 	u64 isolation_type;
 	bool import_completed;
 	bool pt_initialized;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+	struct dentry *pt_stats_dentry;
+	struct dentry *pt_vp_dentry;
+#endif
 };
 
 #define pt_fmt(fmt) "p%llu: " fmt
@@ -327,6 +334,33 @@ int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
 int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg,
 				      void *property_value, size_t property_value_sz);
 
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+int __init mshv_debugfs_init(void);
+void mshv_debugfs_exit(void);
+
+int mshv_debugfs_partition_create(struct mshv_partition *partition);
+void mshv_debugfs_partition_remove(struct mshv_partition *partition);
+int mshv_debugfs_vp_create(struct mshv_vp *vp);
+void mshv_debugfs_vp_remove(struct mshv_vp *vp);
+#else
+static inline int __init mshv_debugfs_init(void)
+{
+	return 0;
+}
+static inline void mshv_debugfs_exit(void) { }
+
+static inline int mshv_debugfs_partition_create(struct mshv_partition *partition)
+{
+	return 0;
+}
+static inline void mshv_debugfs_partition_remove(struct mshv_partition *partition) { }
+static inline int mshv_debugfs_vp_create(struct mshv_vp *vp)
+{
+	return 0;
+}
+static inline void mshv_debugfs_vp_remove(struct mshv_vp *vp) { }
+#endif
+
 extern struct mshv_root mshv_root;
 extern enum hv_scheduler_type hv_scheduler_type;
 extern u8 * __percpu *hv_synic_eventring_tail;
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 12825666e21b..f4654fb8cd23 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1096,6 +1096,10 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 
 	memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
 
+	ret = mshv_debugfs_vp_create(vp);
+	if (ret)
+		goto put_partition;
+
 	/*
 	 * Keep anon_inode_getfd last: it installs fd in the file struct and
 	 * thus makes the state accessible in user space.
@@ -1103,7 +1107,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
 			       O_RDWR | O_CLOEXEC);
 	if (ret < 0)
-		goto put_partition;
+		goto remove_debugfs_vp;
 
 	/* already exclusive with the partition mutex for all ioctls */
 	partition->pt_vp_count++;
@@ -1111,6 +1115,8 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 
 	return ret;
 
+remove_debugfs_vp:
+	mshv_debugfs_vp_remove(vp);
 put_partition:
 	mshv_partition_put(partition);
 free_vp:
@@ -1553,10 +1559,16 @@ mshv_partition_ioctl_initialize(struct mshv_partition *partition)
 	if (ret)
 		goto withdraw_mem;
 
+	ret = mshv_debugfs_partition_create(partition);
+	if (ret)
+		goto finalize_partition;
+
 	partition->pt_initialized = true;
 
 	return 0;
 
+finalize_partition:
+	hv_call_finalize_partition(partition->pt_id);
 withdraw_mem:
 	hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE, partition->pt_id);
 
@@ -1736,6 +1748,7 @@ static void destroy_partition(struct mshv_partition *partition)
 			if (!vp)
 				continue;
 
+			mshv_debugfs_vp_remove(vp);
 			mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
 					    vp->vp_stats_pages);
 
@@ -1769,6 +1782,8 @@ static void destroy_partition(struct mshv_partition *partition)
 			partition->pt_vp_array[i] = NULL;
 		}
 
+		mshv_debugfs_partition_remove(partition);
+
 		/* Deallocates and unmaps everything including vcpus, GPA mappings etc */
 		hv_call_finalize_partition(partition->pt_id);
 
@@ -2314,10 +2329,14 @@ static int __init mshv_parent_partition_init(void)
 
 	mshv_init_vmm_caps(dev);
 
-	ret = mshv_irqfd_wq_init();
+	ret = mshv_debugfs_init();
 	if (ret)
 		goto exit_partition;
 
+	ret = mshv_irqfd_wq_init();
+	if (ret)
+		goto exit_debugfs;
+
 	spin_lock_init(&mshv_root.pt_ht_lock);
 	hash_init(mshv_root.pt_htable);
 
@@ -2325,6 +2344,8 @@ static int __init mshv_parent_partition_init(void)
 
 	return 0;
 
+exit_debugfs:
+	mshv_debugfs_exit();
 exit_partition:
 	if (hv_root_partition())
 		mshv_root_partition_exit();
@@ -2341,6 +2362,7 @@ static void __exit mshv_parent_partition_exit(void)
 {
 	hv_setup_mshv_handler(NULL);
 	mshv_port_table_fini();
+	mshv_debugfs_exit();
 	misc_deregister(&mshv_dev);
 	mshv_irqfd_wq_cleanup();
 	if (hv_root_partition())
-- 
2.34.1


^ permalink raw reply related

* [PATCH net-next v16 01/12] vsock: add netns to vsock core
From: Bobby Eshleman @ 2026-01-21 22:11 UTC (permalink / raw)
  To: Stefano Garzarella, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Stefan Hajnoczi, Michael S. Tsirkin,
	Jason Wang, Eugenio Pérez, Xuan Zhuo, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	Broadcom internal kernel review list, Shuah Khan, Long Li,
	Jonathan Corbet
  Cc: linux-kernel, virtualization, netdev, kvm, linux-hyperv,
	linux-kselftest, berrange, Sargun Dhillon, linux-doc,
	Bobby Eshleman, Bobby Eshleman
In-Reply-To: <20260121-vsock-vmtest-v16-0-2859a7512097@meta.com>

From: Bobby Eshleman <bobbyeshleman@meta.com>

Add netns logic to vsock core. Additionally, modify transport hook
prototypes to be used by later transport-specific patches (e.g.,
*_seqpacket_allow()).

Namespaces are supported primarily by changing socket lookup functions
(e.g., vsock_find_connected_socket()) to take into account the socket
namespace and the namespace mode before considering a candidate socket a
"match".

This patch also introduces the sysctl /proc/sys/net/vsock/ns_mode to
report the mode and /proc/sys/net/vsock/child_ns_mode to set the mode
for new namespaces.

Add netns functionality (initialization, passing to transports, procfs,
etc...) to the af_vsock socket layer. Later patches that add netns
support to transports depend on this patch.

This patch changes the allocation of random ports for connectible vsocks
in order to avoid leaking the random port range starting point to other
namespaces.

dgram_allow(), stream_allow(), and seqpacket_allow() callbacks are
modified to take a vsk in order to perform logic on namespace modes. In
future patches, the net will also be used for socket
lookups in these functions.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
---
Changes in v16:
- remove vsock_init_ns_mode kernel cmdline
- add documentation at top of af_vsock.c about init_net being global
  and unmodifiable
- add comments for vsock_find* and vsock_find*_net functions to clarify
  which to choose for which transports

Changes in v15:
- make static port in __vsock_bind_connectible per-netns
- remove __net_initdata because we want the ops beyond just boot
- add vsock_init_ns_mode kernel cmdline parameter to set init ns mode
- use if (ret || !write) in __vsock_net_mode_string() (Stefano)
- add vsock_net_mode_global() (Stefano)
- hide !net == VSOCK_NET_MODE_GLOBAL inside vsock_net_mode() (Stefano)
- clarify af_vsock.c comments on ns_mode/child_ns_mode (Stefano)

Changes in v14:
- include linux/sysctl.h in af_vsock.c
- squash patch 'vsock: add per-net vsock NS mode state' into this patch
  (prior version can be found here:
  https://lore.kernel.org/all/20251223-vsock-vmtest-v13-1-9d6db8e7c80b@meta.com/)

Changes in v13:
- remove net_mode and replace with direct accesses to net->vsock.mode,
  since this is now immutable.
- update comments about mode behavior and mutability, and sysctl API
- only pass NULL for net when wanting global, instead of net_mode ==
  VSOCK_NET_MODE_GLOBAL. This reflects the new logic
  of vsock_net_check_mode() that only requires net pointers (not
  net_mode).
- refactor sysctl string code into a re-usable function, because
  child_ns_mode and ns_mode both handle the same strings.
- remove redundant vsock_net_init(&init_net) call in module init because
  pernet registration calls the callback on the init_net too

Changes in v12:
- return true in dgram_allow(), stream_allow(), and seqpacket_allow()
  only if net_mode == VSOCK_NET_MODE_GLOBAL (Stefano)
- document bind(VMADDR_CID_ANY) case in af_vsock.c (Stefano)
- change order of stream_allow() call in vmci so we can pass vsk
  to it

Changes in v10:
- add file-level comment about what happens to sockets/devices
  when the namespace mode changes (Stefano)
- change the 'if (write)' boolean in vsock_net_mode_string() to
  if (!write), this simplifies a later patch which adds "goto"
  for mutex unlocking on function exit.

Changes in v9:
- remove virtio_vsock_alloc_rx_skb() (Stefano)
- remove vsock_global_dummy_net, not needed as net=NULL +
  net_mode=VSOCK_NET_MODE_GLOBAL achieves identical result

Changes in v7:
- hv_sock: fix hyperv build error
- explain why vhost does not use the dummy
- explain usage of __vsock_global_dummy_net
- explain why VSOCK_NET_MODE_STR_MAX is 8 characters
- use switch-case in vsock_net_mode_string()
- avoid changing transports as much as possible
- add vsock_find_{bound,connected}_socket_net()
- rename `vsock_hdr` to `sysctl_hdr`
- add virtio_vsock_alloc_linear_skb() wrapper for setting dummy net and
  global mode for virtio-vsock, move skb->cb zero-ing into wrapper
- explain seqpacket_allow() change
- move net setting to __vsock_create() instead of vsock_create() so
  that child sockets also have their net assigned upon accept()

Changes in v6:
- unregister sysctl ops in vsock_exit()
- af_vsock: clarify description of CID behavior
- af_vsock: fix buf vs buffer naming, and length checking
- af_vsock: fix length checking w/ correct ctl_table->maxlen

Changes in v5:
- vsock_global_net() -> vsock_global_dummy_net()
- update comments for new uAPI
- use /proc/sys/net/vsock/ns_mode instead of /proc/net/vsock_ns_mode
- add prototype changes so patch remains compilable
---
 MAINTAINERS                             |   1 +
 drivers/vhost/vsock.c                   |   6 +-
 include/linux/virtio_vsock.h            |   4 +-
 include/net/af_vsock.h                  |  61 +++++-
 include/net/net_namespace.h             |   4 +
 include/net/netns/vsock.h               |  21 ++
 net/vmw_vsock/af_vsock.c                | 335 +++++++++++++++++++++++++++++---
 net/vmw_vsock/hyperv_transport.c        |   7 +-
 net/vmw_vsock/virtio_transport.c        |   9 +-
 net/vmw_vsock/virtio_transport_common.c |   6 +-
 net/vmw_vsock/vmci_transport.c          |  26 ++-
 net/vmw_vsock/vsock_loopback.c          |   8 +-
 12 files changed, 437 insertions(+), 51 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 92768bceb929..8341a451570b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27549,6 +27549,7 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/vhost/vsock.c
 F:	include/linux/virtio_vsock.h
+F:	include/net/netns/vsock.h
 F:	include/uapi/linux/virtio_vsock.h
 F:	net/vmw_vsock/virtio_transport.c
 F:	net/vmw_vsock/virtio_transport_common.c
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 552cfb53498a..647ded6f6ea5 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -407,7 +407,8 @@ static bool vhost_transport_msgzerocopy_allow(void)
 	return true;
 }
 
-static bool vhost_transport_seqpacket_allow(u32 remote_cid);
+static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk,
+					    u32 remote_cid);
 
 static struct virtio_transport vhost_transport = {
 	.transport = {
@@ -463,7 +464,8 @@ static struct virtio_transport vhost_transport = {
 	.send_pkt = vhost_transport_send_pkt,
 };
 
-static bool vhost_transport_seqpacket_allow(u32 remote_cid)
+static bool vhost_transport_seqpacket_allow(struct vsock_sock *vsk,
+					    u32 remote_cid)
 {
 	struct vhost_vsock *vsock;
 	bool seqpacket_allow = false;
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 0c67543a45c8..1845e8d4f78d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -256,10 +256,10 @@ void virtio_transport_notify_buffer_size(struct vsock_sock *vsk, u64 *val);
 
 u64 virtio_transport_stream_rcvhiwat(struct vsock_sock *vsk);
 bool virtio_transport_stream_is_active(struct vsock_sock *vsk);
-bool virtio_transport_stream_allow(u32 cid, u32 port);
+bool virtio_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port);
 int virtio_transport_dgram_bind(struct vsock_sock *vsk,
 				struct sockaddr_vm *addr);
-bool virtio_transport_dgram_allow(u32 cid, u32 port);
+bool virtio_transport_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port);
 
 int virtio_transport_connect(struct vsock_sock *vsk);
 
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index d40e978126e3..d3ff48a2fbe0 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -10,6 +10,7 @@
 
 #include <linux/kernel.h>
 #include <linux/workqueue.h>
+#include <net/netns/vsock.h>
 #include <net/sock.h>
 #include <uapi/linux/vm_sockets.h>
 
@@ -124,7 +125,7 @@ struct vsock_transport {
 			     size_t len, int flags);
 	int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
 			     struct msghdr *, size_t len);
-	bool (*dgram_allow)(u32 cid, u32 port);
+	bool (*dgram_allow)(struct vsock_sock *vsk, u32 cid, u32 port);
 
 	/* STREAM. */
 	/* TODO: stream_bind() */
@@ -136,14 +137,14 @@ struct vsock_transport {
 	s64 (*stream_has_space)(struct vsock_sock *);
 	u64 (*stream_rcvhiwat)(struct vsock_sock *);
 	bool (*stream_is_active)(struct vsock_sock *);
-	bool (*stream_allow)(u32 cid, u32 port);
+	bool (*stream_allow)(struct vsock_sock *vsk, u32 cid, u32 port);
 
 	/* SEQ_PACKET. */
 	ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
 				     int flags);
 	int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
 				 size_t len);
-	bool (*seqpacket_allow)(u32 remote_cid);
+	bool (*seqpacket_allow)(struct vsock_sock *vsk, u32 remote_cid);
 	u32 (*seqpacket_has_data)(struct vsock_sock *vsk);
 
 	/* Notification. */
@@ -216,6 +217,11 @@ void vsock_remove_connected(struct vsock_sock *vsk);
 struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
 struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
 					 struct sockaddr_vm *dst);
+struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr,
+					 struct net *net);
+struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src,
+					     struct sockaddr_vm *dst,
+					     struct net *net);
 void vsock_remove_sock(struct vsock_sock *vsk);
 void vsock_for_each_connected_socket(struct vsock_transport *transport,
 				     void (*fn)(struct sock *sk));
@@ -256,4 +262,53 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
 {
 	return t->msgzerocopy_allow && t->msgzerocopy_allow();
 }
+
+static inline enum vsock_net_mode vsock_net_mode(struct net *net)
+{
+	if (!net)
+		return VSOCK_NET_MODE_GLOBAL;
+
+	return READ_ONCE(net->vsock.mode);
+}
+
+static inline bool vsock_net_mode_global(struct vsock_sock *vsk)
+{
+	return vsock_net_mode(sock_net(sk_vsock(vsk))) == VSOCK_NET_MODE_GLOBAL;
+}
+
+static inline void vsock_net_set_child_mode(struct net *net,
+					    enum vsock_net_mode mode)
+{
+	WRITE_ONCE(net->vsock.child_ns_mode, mode);
+}
+
+static inline enum vsock_net_mode vsock_net_child_mode(struct net *net)
+{
+	return READ_ONCE(net->vsock.child_ns_mode);
+}
+
+/* Return true if two namespaces pass the mode rules. Otherwise, return false.
+ *
+ * A NULL namespace is treated as VSOCK_NET_MODE_GLOBAL.
+ *
+ * Read more about modes in the comment header of net/vmw_vsock/af_vsock.c.
+ */
+static inline bool vsock_net_check_mode(struct net *ns0, struct net *ns1)
+{
+	enum vsock_net_mode mode0, mode1;
+
+	/* Any vsocks within the same network namespace are always reachable,
+	 * regardless of the mode.
+	 */
+	if (net_eq(ns0, ns1))
+		return true;
+
+	mode0 = vsock_net_mode(ns0);
+	mode1 = vsock_net_mode(ns1);
+
+	/* Different namespaces are only reachable if they are both
+	 * global mode.
+	 */
+	return mode0 == VSOCK_NET_MODE_GLOBAL && mode0 == mode1;
+}
 #endif /* __AF_VSOCK_H__ */
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index cb664f6e3558..66d3de1d935f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -37,6 +37,7 @@
 #include <net/netns/smc.h>
 #include <net/netns/bpf.h>
 #include <net/netns/mctp.h>
+#include <net/netns/vsock.h>
 #include <net/net_trackers.h>
 #include <linux/ns_common.h>
 #include <linux/idr.h>
@@ -196,6 +197,9 @@ struct net {
 	/* Move to a better place when the config guard is removed. */
 	struct mutex		rtnl_mutex;
 #endif
+#if IS_ENABLED(CONFIG_VSOCKETS)
+	struct netns_vsock	vsock;
+#endif
 } __randomize_layout;
 
 #include <linux/seq_file_net.h>
diff --git a/include/net/netns/vsock.h b/include/net/netns/vsock.h
new file mode 100644
index 000000000000..b34d69a22fa8
--- /dev/null
+++ b/include/net/netns/vsock.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_NET_NAMESPACE_VSOCK_H
+#define __NET_NET_NAMESPACE_VSOCK_H
+
+#include <linux/types.h>
+
+enum vsock_net_mode {
+	VSOCK_NET_MODE_GLOBAL,
+	VSOCK_NET_MODE_LOCAL,
+};
+
+struct netns_vsock {
+	struct ctl_table_header *sysctl_hdr;
+
+	/* protected by the vsock_table_lock in af_vsock.c */
+	u32 port;
+
+	enum vsock_net_mode mode;
+	enum vsock_net_mode child_ns_mode;
+};
+#endif /* __NET_NET_NAMESPACE_VSOCK_H */
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index a3505a4dcee0..20ad2b2dc17b 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -83,6 +83,50 @@
  *   TCP_ESTABLISHED - connected
  *   TCP_CLOSING - disconnecting
  *   TCP_LISTEN - listening
+ *
+ * - Namespaces in vsock support two different modes: "local" and "global".
+ *   Each mode defines how the namespace interacts with CIDs.
+ *   Each namespace exposes two sysctl files:
+ *
+ *   - /proc/sys/net/vsock/ns_mode (read-only) reports the current namespace's
+ *     mode, which is set at namespace creation and immutable thereafter.
+ *   - /proc/sys/net/vsock/child_ns_mode (writable) controls what mode future
+ *     child namespaces will inherit when created. The default is "global".
+ *
+ *   Changing child_ns_mode only affects newly created namespaces, not the
+ *   current namespace or existing children. At namespace creation, ns_mode
+ *   is inherited from the parent's child_ns_mode.
+ *
+ *   The init_net mode is "global" and cannot be modified.
+ *
+ *   The modes affect the allocation and accessibility of CIDs as follows:
+ *
+ *   - global - access and allocation are all system-wide
+ *      - all CID allocations from global namespaces draw from the same
+ *        system-wide pool.
+ *      - if one global namespace has already allocated some CID, another
+ *        global namespace will not be able to allocate the same CID.
+ *      - global mode AF_VSOCK sockets can reach any VM or socket in any global
+ *        namespace; they are not confined to their own namespace.
+ *      - AF_VSOCK sockets in a global mode namespace cannot reach VMs or
+ *        sockets in any local mode namespace.
+ *   - local - access and allocation are contained within the namespace
+ *     - CID allocation draws only from a private pool local only to the
+ *       namespace, and does not affect the CIDs available for allocation in any
+ *       other namespace (global or local).
+ *     - VMs in a local namespace do not collide with CIDs in any other local
+ *       namespace or any global namespace. For example, if a VM in a local mode
+ *       namespace is given CID 10, then CID 10 is still available for
+ *       allocation in any other namespace, but not in the same namespace.
+ *     - AF_VSOCK sockets in a local mode namespace can connect only to VMs or
+ *       other sockets within their own namespace.
+ *     - sockets bound to VMADDR_CID_ANY in local namespaces will never resolve
+ *       to any transport that is not compatible with local mode. There is no
+ *       error that propagates to the user (as there is for connection attempts)
+ *       because it is possible for some packet to reach this socket from
+ *       a different transport that *does* support local mode. For
+ *       example, virtio-vsock may not support local mode, but the socket
+ *       may still accept a connection from vhost-vsock which does.
  */
 
 #include <linux/compat.h>
@@ -100,20 +144,31 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/net.h>
+#include <linux/proc_fs.h>
 #include <linux/poll.h>
 #include <linux/random.h>
 #include <linux/skbuff.h>
 #include <linux/smp.h>
 #include <linux/socket.h>
 #include <linux/stddef.h>
+#include <linux/sysctl.h>
 #include <linux/unistd.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
 #include <net/sock.h>
 #include <net/af_vsock.h>
+#include <net/netns/vsock.h>
 #include <uapi/linux/vm_sockets.h>
 #include <uapi/asm-generic/ioctls.h>
 
+#define VSOCK_NET_MODE_STR_GLOBAL "global"
+#define VSOCK_NET_MODE_STR_LOCAL "local"
+
+/* 6 chars for "global", 1 for null-terminator, and 1 more for '\n'.
+ * The newline is added by proc_dostring() for read operations.
+ */
+#define VSOCK_NET_MODE_STR_MAX 8
+
 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
 static void vsock_sk_destruct(struct sock *sk);
 static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
@@ -235,33 +290,42 @@ static void __vsock_remove_connected(struct vsock_sock *vsk)
 	sock_put(&vsk->sk);
 }
 
-static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
+static struct sock *__vsock_find_bound_socket_net(struct sockaddr_vm *addr,
+						  struct net *net)
 {
 	struct vsock_sock *vsk;
 
 	list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
-		if (vsock_addr_equals_addr(addr, &vsk->local_addr))
-			return sk_vsock(vsk);
+		struct sock *sk = sk_vsock(vsk);
+
+		if (vsock_addr_equals_addr(addr, &vsk->local_addr) &&
+		    vsock_net_check_mode(sock_net(sk), net))
+			return sk;
 
 		if (addr->svm_port == vsk->local_addr.svm_port &&
 		    (vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
-		     addr->svm_cid == VMADDR_CID_ANY))
-			return sk_vsock(vsk);
+		     addr->svm_cid == VMADDR_CID_ANY) &&
+		     vsock_net_check_mode(sock_net(sk), net))
+			return sk;
 	}
 
 	return NULL;
 }
 
-static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
-						  struct sockaddr_vm *dst)
+static struct sock *
+__vsock_find_connected_socket_net(struct sockaddr_vm *src,
+				  struct sockaddr_vm *dst, struct net *net)
 {
 	struct vsock_sock *vsk;
 
 	list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
 			    connected_table) {
+		struct sock *sk = sk_vsock(vsk);
+
 		if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
-		    dst->svm_port == vsk->local_addr.svm_port) {
-			return sk_vsock(vsk);
+		    dst->svm_port == vsk->local_addr.svm_port &&
+		    vsock_net_check_mode(sock_net(sk), net)) {
+			return sk;
 		}
 	}
 
@@ -304,12 +368,18 @@ void vsock_remove_connected(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(vsock_remove_connected);
 
-struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
+/* Find a bound socket, filtering by namespace and namespace mode.
+ *
+ * Use this in transports that are namespace-aware and can provide the
+ * network namespace context.
+ */
+struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr,
+					 struct net *net)
 {
 	struct sock *sk;
 
 	spin_lock_bh(&vsock_table_lock);
-	sk = __vsock_find_bound_socket(addr);
+	sk = __vsock_find_bound_socket_net(addr, net);
 	if (sk)
 		sock_hold(sk);
 
@@ -317,15 +387,32 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
 
 	return sk;
 }
+EXPORT_SYMBOL_GPL(vsock_find_bound_socket_net);
+
+/* Find a bound socket without namespace filtering.
+ *
+ * Use this in transports that lack namespace context. All sockets are
+ * treated as if in global mode.
+ */
+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
+{
+	return vsock_find_bound_socket_net(addr, NULL);
+}
 EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
 
-struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
-					 struct sockaddr_vm *dst)
+/* Find a connected socket, filtering by namespace and namespace mode.
+ *
+ * Use this in transports that are namespace-aware and can provide the
+ * network namespace context.
+ */
+struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src,
+					     struct sockaddr_vm *dst,
+					     struct net *net)
 {
 	struct sock *sk;
 
 	spin_lock_bh(&vsock_table_lock);
-	sk = __vsock_find_connected_socket(src, dst);
+	sk = __vsock_find_connected_socket_net(src, dst, net);
 	if (sk)
 		sock_hold(sk);
 
@@ -333,6 +420,18 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
 
 	return sk;
 }
+EXPORT_SYMBOL_GPL(vsock_find_connected_socket_net);
+
+/* Find a connected socket without namespace filtering.
+ *
+ * Use this in transports that lack namespace context. All sockets are
+ * treated as if in global mode.
+ */
+struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
+					 struct sockaddr_vm *dst)
+{
+	return vsock_find_connected_socket_net(src, dst, NULL);
+}
 EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
 
 void vsock_remove_sock(struct vsock_sock *vsk)
@@ -528,7 +627,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
 
 	if (sk->sk_type == SOCK_SEQPACKET) {
 		if (!new_transport->seqpacket_allow ||
-		    !new_transport->seqpacket_allow(remote_cid)) {
+		    !new_transport->seqpacket_allow(vsk, remote_cid)) {
 			module_put(new_transport->module);
 			return -ESOCKTNOSUPPORT;
 		}
@@ -676,11 +775,11 @@ static void vsock_pending_work(struct work_struct *work)
 static int __vsock_bind_connectible(struct vsock_sock *vsk,
 				    struct sockaddr_vm *addr)
 {
-	static u32 port;
+	struct net *net = sock_net(sk_vsock(vsk));
 	struct sockaddr_vm new_addr;
 
-	if (!port)
-		port = get_random_u32_above(LAST_RESERVED_PORT);
+	if (!net->vsock.port)
+		net->vsock.port = get_random_u32_above(LAST_RESERVED_PORT);
 
 	vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
 
@@ -689,13 +788,13 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
 		unsigned int i;
 
 		for (i = 0; i < MAX_PORT_RETRIES; i++) {
-			if (port == VMADDR_PORT_ANY ||
-			    port <= LAST_RESERVED_PORT)
-				port = LAST_RESERVED_PORT + 1;
+			if (net->vsock.port == VMADDR_PORT_ANY ||
+			    net->vsock.port <= LAST_RESERVED_PORT)
+				net->vsock.port = LAST_RESERVED_PORT + 1;
 
-			new_addr.svm_port = port++;
+			new_addr.svm_port = net->vsock.port++;
 
-			if (!__vsock_find_bound_socket(&new_addr)) {
+			if (!__vsock_find_bound_socket_net(&new_addr, net)) {
 				found = true;
 				break;
 			}
@@ -712,7 +811,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
 			return -EACCES;
 		}
 
-		if (__vsock_find_bound_socket(&new_addr))
+		if (__vsock_find_bound_socket_net(&new_addr, net))
 			return -EADDRINUSE;
 	}
 
@@ -1314,7 +1413,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 		goto out;
 	}
 
-	if (!transport->dgram_allow(remote_addr->svm_cid,
+	if (!transport->dgram_allow(vsk, remote_addr->svm_cid,
 				    remote_addr->svm_port)) {
 		err = -EINVAL;
 		goto out;
@@ -1355,7 +1454,7 @@ static int vsock_dgram_connect(struct socket *sock,
 	if (err)
 		goto out;
 
-	if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
+	if (!vsk->transport->dgram_allow(vsk, remote_addr->svm_cid,
 					 remote_addr->svm_port)) {
 		err = -EINVAL;
 		goto out;
@@ -1585,7 +1684,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr,
 		 * endpoints.
 		 */
 		if (!transport ||
-		    !transport->stream_allow(remote_addr->svm_cid,
+		    !transport->stream_allow(vsk, remote_addr->svm_cid,
 					     remote_addr->svm_port)) {
 			err = -ENETUNREACH;
 			goto out;
@@ -2662,6 +2761,180 @@ static struct miscdevice vsock_device = {
 	.fops		= &vsock_device_ops,
 };
 
+static int __vsock_net_mode_string(const struct ctl_table *table, int write,
+				   void *buffer, size_t *lenp, loff_t *ppos,
+				   enum vsock_net_mode mode,
+				   enum vsock_net_mode *new_mode)
+{
+	char data[VSOCK_NET_MODE_STR_MAX] = {0};
+	struct ctl_table tmp;
+	int ret;
+
+	if (!table->data || !table->maxlen || !*lenp) {
+		*lenp = 0;
+		return 0;
+	}
+
+	tmp = *table;
+	tmp.data = data;
+
+	if (!write) {
+		const char *p;
+
+		switch (mode) {
+		case VSOCK_NET_MODE_GLOBAL:
+			p = VSOCK_NET_MODE_STR_GLOBAL;
+			break;
+		case VSOCK_NET_MODE_LOCAL:
+			p = VSOCK_NET_MODE_STR_LOCAL;
+			break;
+		default:
+			WARN_ONCE(true, "netns has invalid vsock mode");
+			*lenp = 0;
+			return 0;
+		}
+
+		strscpy(data, p, sizeof(data));
+		tmp.maxlen = strlen(p);
+	}
+
+	ret = proc_dostring(&tmp, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	if (*lenp >= sizeof(data))
+		return -EINVAL;
+
+	if (!strncmp(data, VSOCK_NET_MODE_STR_GLOBAL, sizeof(data)))
+		*new_mode = VSOCK_NET_MODE_GLOBAL;
+	else if (!strncmp(data, VSOCK_NET_MODE_STR_LOCAL, sizeof(data)))
+		*new_mode = VSOCK_NET_MODE_LOCAL;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+static int vsock_net_mode_string(const struct ctl_table *table, int write,
+				 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct net *net;
+
+	if (write)
+		return -EPERM;
+
+	net = current->nsproxy->net_ns;
+
+	return __vsock_net_mode_string(table, write, buffer, lenp, ppos,
+				       vsock_net_mode(net), NULL);
+}
+
+static int vsock_net_child_mode_string(const struct ctl_table *table, int write,
+				       void *buffer, size_t *lenp, loff_t *ppos)
+{
+	enum vsock_net_mode new_mode;
+	struct net *net;
+	int ret;
+
+	net = current->nsproxy->net_ns;
+
+	ret = __vsock_net_mode_string(table, write, buffer, lenp, ppos,
+				      vsock_net_child_mode(net), &new_mode);
+	if (ret)
+		return ret;
+
+	if (write)
+		vsock_net_set_child_mode(net, new_mode);
+
+	return 0;
+}
+
+static struct ctl_table vsock_table[] = {
+	{
+		.procname	= "ns_mode",
+		.data		= &init_net.vsock.mode,
+		.maxlen		= VSOCK_NET_MODE_STR_MAX,
+		.mode		= 0444,
+		.proc_handler	= vsock_net_mode_string
+	},
+	{
+		.procname	= "child_ns_mode",
+		.data		= &init_net.vsock.child_ns_mode,
+		.maxlen		= VSOCK_NET_MODE_STR_MAX,
+		.mode		= 0644,
+		.proc_handler	= vsock_net_child_mode_string
+	},
+};
+
+static int __net_init vsock_sysctl_register(struct net *net)
+{
+	struct ctl_table *table;
+
+	if (net_eq(net, &init_net)) {
+		table = vsock_table;
+	} else {
+		table = kmemdup(vsock_table, sizeof(vsock_table), GFP_KERNEL);
+		if (!table)
+			goto err_alloc;
+
+		table[0].data = &net->vsock.mode;
+		table[1].data = &net->vsock.child_ns_mode;
+	}
+
+	net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table,
+						       ARRAY_SIZE(vsock_table));
+	if (!net->vsock.sysctl_hdr)
+		goto err_reg;
+
+	return 0;
+
+err_reg:
+	if (!net_eq(net, &init_net))
+		kfree(table);
+err_alloc:
+	return -ENOMEM;
+}
+
+static void vsock_sysctl_unregister(struct net *net)
+{
+	const struct ctl_table *table;
+
+	table = net->vsock.sysctl_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(net->vsock.sysctl_hdr);
+	if (!net_eq(net, &init_net))
+		kfree(table);
+}
+
+static void vsock_net_init(struct net *net)
+{
+	if (net_eq(net, &init_net))
+		net->vsock.mode = VSOCK_NET_MODE_GLOBAL;
+	else
+		net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns);
+
+	net->vsock.child_ns_mode = VSOCK_NET_MODE_GLOBAL;
+}
+
+static __net_init int vsock_sysctl_init_net(struct net *net)
+{
+	vsock_net_init(net);
+
+	if (vsock_sysctl_register(net))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static __net_exit void vsock_sysctl_exit_net(struct net *net)
+{
+	vsock_sysctl_unregister(net);
+}
+
+static struct pernet_operations vsock_sysctl_ops = {
+	.init = vsock_sysctl_init_net,
+	.exit = vsock_sysctl_exit_net,
+};
+
 static int __init vsock_init(void)
 {
 	int err = 0;
@@ -2689,10 +2962,17 @@ static int __init vsock_init(void)
 		goto err_unregister_proto;
 	}
 
+	if (register_pernet_subsys(&vsock_sysctl_ops)) {
+		err = -ENOMEM;
+		goto err_unregister_sock;
+	}
+
 	vsock_bpf_build_proto();
 
 	return 0;
 
+err_unregister_sock:
+	sock_unregister(AF_VSOCK);
 err_unregister_proto:
 	proto_unregister(&vsock_proto);
 err_deregister_misc:
@@ -2706,6 +2986,7 @@ static void __exit vsock_exit(void)
 	misc_deregister(&vsock_device);
 	sock_unregister(AF_VSOCK);
 	proto_unregister(&vsock_proto);
+	unregister_pernet_subsys(&vsock_sysctl_ops);
 }
 
 const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 432fcbbd14d4..c3010c874308 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -570,7 +570,7 @@ static int hvs_dgram_enqueue(struct vsock_sock *vsk,
 	return -EOPNOTSUPP;
 }
 
-static bool hvs_dgram_allow(u32 cid, u32 port)
+static bool hvs_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port)
 {
 	return false;
 }
@@ -745,8 +745,11 @@ static bool hvs_stream_is_active(struct vsock_sock *vsk)
 	return hvs->chan != NULL;
 }
 
-static bool hvs_stream_allow(u32 cid, u32 port)
+static bool hvs_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port)
 {
+	if (!vsock_net_mode_global(vsk))
+		return false;
+
 	if (cid == VMADDR_CID_HOST)
 		return true;
 
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 8c867023a2e5..f0a9e51118f3 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -536,7 +536,8 @@ static bool virtio_transport_msgzerocopy_allow(void)
 	return true;
 }
 
-static bool virtio_transport_seqpacket_allow(u32 remote_cid);
+static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk,
+					     u32 remote_cid);
 
 static struct virtio_transport virtio_transport = {
 	.transport = {
@@ -593,11 +594,15 @@ static struct virtio_transport virtio_transport = {
 	.can_msgzerocopy = virtio_transport_can_msgzerocopy,
 };
 
-static bool virtio_transport_seqpacket_allow(u32 remote_cid)
+static bool
+virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
 {
 	struct virtio_vsock *vsock;
 	bool seqpacket_allow;
 
+	if (!vsock_net_mode_global(vsk))
+		return false;
+
 	seqpacket_allow = false;
 	rcu_read_lock();
 	vsock = rcu_dereference(the_virtio_vsock);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index dcc8a1d5851e..fdb8f5b3fa60 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1043,9 +1043,9 @@ bool virtio_transport_stream_is_active(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active);
 
-bool virtio_transport_stream_allow(u32 cid, u32 port)
+bool virtio_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port)
 {
-	return true;
+	return vsock_net_mode(sock_net(sk_vsock(vsk))) == VSOCK_NET_MODE_GLOBAL;
 }
 EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
 
@@ -1056,7 +1056,7 @@ int virtio_transport_dgram_bind(struct vsock_sock *vsk,
 }
 EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind);
 
-bool virtio_transport_dgram_allow(u32 cid, u32 port)
+bool virtio_transport_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port)
 {
 	return false;
 }
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 7eccd6708d66..00f6bbdb035a 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -646,13 +646,17 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
 	return VMCI_SUCCESS;
 }
 
-static bool vmci_transport_stream_allow(u32 cid, u32 port)
+static bool vmci_transport_stream_allow(struct vsock_sock *vsk, u32 cid,
+					u32 port)
 {
 	static const u32 non_socket_contexts[] = {
 		VMADDR_CID_LOCAL,
 	};
 	int i;
 
+	if (!vsock_net_mode_global(vsk))
+		return false;
+
 	BUILD_BUG_ON(sizeof(cid) != sizeof(*non_socket_contexts));
 
 	for (i = 0; i < ARRAY_SIZE(non_socket_contexts); i++) {
@@ -682,12 +686,10 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
 	err = VMCI_SUCCESS;
 	bh_process_pkt = false;
 
-	/* Ignore incoming packets from contexts without sockets, or resources
-	 * that aren't vsock implementations.
+	/* Ignore incoming packets from resources that aren't vsock
+	 * implementations.
 	 */
-
-	if (!vmci_transport_stream_allow(dg->src.context, -1)
-	    || vmci_transport_peer_rid(dg->src.context) != dg->src.resource)
+	if (vmci_transport_peer_rid(dg->src.context) != dg->src.resource)
 		return VMCI_ERROR_NO_ACCESS;
 
 	if (VMCI_DG_SIZE(dg) < sizeof(*pkt))
@@ -749,6 +751,12 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
 		goto out;
 	}
 
+	/* Ignore incoming packets from contexts without sockets. */
+	if (!vmci_transport_stream_allow(vsk, dg->src.context, -1)) {
+		err = VMCI_ERROR_NO_ACCESS;
+		goto out;
+	}
+
 	/* We do most everything in a work queue, but let's fast path the
 	 * notification of reads and writes to help data transfer performance.
 	 * We can only do this if there is no process context code executing
@@ -1784,8 +1792,12 @@ static int vmci_transport_dgram_dequeue(struct vsock_sock *vsk,
 	return err;
 }
 
-static bool vmci_transport_dgram_allow(u32 cid, u32 port)
+static bool vmci_transport_dgram_allow(struct vsock_sock *vsk, u32 cid,
+				       u32 port)
 {
+	if (!vsock_net_mode_global(vsk))
+		return false;
+
 	if (cid == VMADDR_CID_HYPERVISOR) {
 		/* Registrations of PBRPC Servers do not modify VMX/Hypervisor
 		 * state and are allowed.
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index bc2ff918b315..deff68c64a09 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -46,7 +46,8 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
 	return 0;
 }
 
-static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
+static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk,
+					   u32 remote_cid);
 static bool vsock_loopback_msgzerocopy_allow(void)
 {
 	return true;
@@ -106,9 +107,10 @@ static struct virtio_transport loopback_transport = {
 	.send_pkt = vsock_loopback_send_pkt,
 };
 
-static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
+static bool
+vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
 {
-	return true;
+	return vsock_net_mode_global(vsk);
 }
 
 static void vsock_loopback_work(struct work_struct *work)

-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next v16 02/12] virtio: set skb owner of virtio_transport_reset_no_sock() reply
From: Bobby Eshleman @ 2026-01-21 22:11 UTC (permalink / raw)
  To: Stefano Garzarella, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Stefan Hajnoczi, Michael S. Tsirkin,
	Jason Wang, Eugenio Pérez, Xuan Zhuo, K. Y. Srinivasan,
	Haiyang Zhang, Wei Liu, Dexuan Cui, Bryan Tan, Vishnu Dasa,
	Broadcom internal kernel review list, Shuah Khan, Long Li,
	Jonathan Corbet
  Cc: linux-kernel, virtualization, netdev, kvm, linux-hyperv,
	linux-kselftest, berrange, Sargun Dhillon, linux-doc,
	Bobby Eshleman, Bobby Eshleman
In-Reply-To: <20260121-vsock-vmtest-v16-0-2859a7512097@meta.com>

From: Bobby Eshleman <bobbyeshleman@meta.com>

Associate reply packets with the sending socket. When vsock must reply
with an RST packet and there exists a sending socket (e.g., for
loopback), setting the skb owner to the socket correctly handles
reference counting between the skb and sk (i.e., the sk stays alive
until the skb is freed).

This allows the net namespace to be used for socket lookups for the
duration of the reply skb's lifetime, preventing race conditions between
the namespace lifecycle and vsock socket search using the namespace
pointer.

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
---
Changes in v11:
- move before adding netns support (Stefano)

Changes in v10:
- break this out into its own patch for easy revert (Stefano)
---
 net/vmw_vsock/virtio_transport_common.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index fdb8f5b3fa60..718be9f33274 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1165,6 +1165,12 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
 		.op = VIRTIO_VSOCK_OP_RST,
 		.type = le16_to_cpu(hdr->type),
 		.reply = true,
+
+		/* Set sk owner to socket we are replying to (may be NULL for
+		 * non-loopback). This keeps a reference to the sock and
+		 * sock_net(sk) until the reply skb is freed.
+		 */
+		.vsk = vsock_sk(skb->sk),
 	};
 	struct sk_buff *reply;
 

-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox