Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* Re: [PATCH v2 0/4] Convert remaining buses to generic driver_override handling
From: Danilo Krummrich @ 2026-06-08 18:09 UTC (permalink / raw)
  To: Runyu Xiao
  Cc: gregkh, rafael, driver-core, linux, andersson, mathieu.poirier,
	kys, haiyangz, wei.liu, decui, longli, nipun.gupta,
	nikhil.agarwal, linux-remoteproc, linux-arm-msm, linux-hyperv,
	linux-kernel, jianhao.xu
In-Reply-To: <20260604035239.1711889-1-runyu.xiao@seu.edu.cn>

On Thu Jun 4, 2026 at 5:52 AM CEST, Runyu Xiao wrote:
> Runyu Xiao (4):
>   amba: use generic driver_override infrastructure
>   rpmsg: core: use generic driver_override infrastructure
>   vmbus: use generic driver_override infrastructure
>   cdx: use generic driver_override infrastructure

Given that you changed the approach to use the new driver_override
infrastructure, I assume you read the message in [1]?

In that message I also explained that all of this has already been addressed
and merged into driver-core-next.

[1] https://lore.kernel.org/all/DIYPR0K2CZW7.254R8K7ONBX5D@kernel.org/

^ permalink raw reply

* Re: [PATCH v2 2/4] rpmsg: core: use generic driver_override infrastructure
From: Mathieu Poirier @ 2026-06-08 17:47 UTC (permalink / raw)
  To: Runyu Xiao
  Cc: gregkh, rafael, dakr, driver-core, linux, andersson, kys,
	haiyangz, wei.liu, decui, longli, nipun.gupta, nikhil.agarwal,
	linux-remoteproc, linux-arm-msm, linux-hyperv, linux-kernel,
	jianhao.xu, stable
In-Reply-To: <20260604035239.1711889-3-runyu.xiao@seu.edu.cn>

On Wed, 3 Jun 2026 at 21:52, Runyu Xiao <runyu.xiao@seu.edu.cn> wrote:
>
> RPMSG still keeps driver_override in bus-private storage.
>
> That private pointer can be updated from the sysfs driver_override
> attribute, and also from rpmsg_register_device_override(). Both paths
> replace the pointer and can free the old value.
>
> However, driver_match_device() can call rpmsg_dev_match() from
> __driver_attach() without holding the device lock, and rpmsg_dev_match()
> still dereferences that private pointer directly.
>
> This leaves the match path racing with concurrent driver_override
> updates, with the usual risk of comparing against freed memory.
>
> Switch rpmsg to the driver-core driver_override infrastructure. This
> removes the private storage, uses device_match_driver_override() for the
> locked read in rpmsg_dev_match(), and converts
> rpmsg_register_device_override() to device_set_driver_override() so the
> in-kernel override path uses the same core-managed storage. With that
> storage now owned by struct device, drop the remaining rpmsg transport
> release-path frees of rpdev->driver_override as well.
>
> Link: https://lore.kernel.org/driver-core/DGRGTIRHA62X.3RY09D9SOK77P@kernel.org/
> Fixes: 39e47767ec9b ("rpmsg: Add driver_override device attribute for rpmsg_device")
> Cc: stable@vger.kernel.org
> Signed-off-by: Runyu Xiao <runyu.xiao@seu.edu.cn>
> ---
> drivers/rpmsg/qcom_glink_native.c |  2 --
> drivers/rpmsg/rpmsg_core.c        | 41 ++++++--------------------------------
> drivers/rpmsg/virtio_rpmsg_bus.c  |  1 -
> include/linux/rpmsg.h             |  4 ----

For the bottom 3:

Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>

>  4 files changed, 6 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c
> index e7f7831d37f8..11d3007db5cd 100644
> --- a/drivers/rpmsg/rpmsg_core.c
> +++ b/drivers/rpmsg/rpmsg_core.c
> @@ -358,33 +358,6 @@ rpmsg_show_attr(src, src, "0x%x\n");
>  rpmsg_show_attr(dst, dst, "0x%x\n");
>  rpmsg_show_attr(announce, announce ? "true" : "false", "%s\n");
>
> -static ssize_t driver_override_store(struct device *dev,
> -                                    struct device_attribute *attr,
> -                                    const char *buf, size_t count)
> -{
> -       struct rpmsg_device *rpdev = to_rpmsg_device(dev);
> -       int ret;
> -
> -       ret = driver_set_override(dev, &rpdev->driver_override, buf, count);
> -       if (ret)
> -               return ret;
> -
> -       return count;
> -}
> -
> -static ssize_t driver_override_show(struct device *dev,
> -                                   struct device_attribute *attr, char *buf)
> -{
> -       struct rpmsg_device *rpdev = to_rpmsg_device(dev);
> -       ssize_t len;
> -
> -       device_lock(dev);
> -       len = sysfs_emit(buf, "%s\n", rpdev->driver_override);
> -       device_unlock(dev);
> -       return len;
> -}
> -static DEVICE_ATTR_RW(driver_override);
> -
>  static ssize_t modalias_show(struct device *dev,
>                              struct device_attribute *attr, char *buf)
>  {
> @@ -405,7 +378,6 @@ static struct attribute *rpmsg_dev_attrs[] = {
>         &dev_attr_dst.attr,
>         &dev_attr_src.attr,
>         &dev_attr_announce.attr,
> -       &dev_attr_driver_override.attr,
>         NULL,
>  };
>  ATTRIBUTE_GROUPS(rpmsg_dev);
> @@ -424,9 +396,11 @@ static int rpmsg_dev_match(struct device *dev, const struct device_driver *drv)
>         const struct rpmsg_driver *rpdrv = to_rpmsg_driver(drv);
>         const struct rpmsg_device_id *ids = rpdrv->id_table;
>         unsigned int i;
> +       int ret;
>
> -       if (rpdev->driver_override)
> -               return !strcmp(rpdev->driver_override, drv->name);
> +       ret = device_match_driver_override(dev, drv);
> +       if (ret >= 0)
> +               return ret;
>
>         if (ids)
>                 for (i = 0; ids[i].name[0]; i++)
> @@ -533,6 +507,7 @@ static void rpmsg_dev_remove(struct device *dev)
>
>  static const struct bus_type rpmsg_bus = {
>         .name           = "rpmsg",
> +       .driver_override = true,
>         .match          = rpmsg_dev_match,
>         .dev_groups     = rpmsg_dev_groups,
>         .uevent         = rpmsg_uevent,
> @@ -560,9 +535,7 @@ int rpmsg_register_device_override(struct rpmsg_device *rpdev,
>
>         device_initialize(dev);
>         if (driver_override) {
> -               ret = driver_set_override(dev, &rpdev->driver_override,
> -                                         driver_override,
> -                                         strlen(driver_override));
> +               ret = device_set_driver_override(dev, driver_override);
>                 if (ret) {
>                         dev_err(dev, "device_set_override failed: %d\n", ret);
>                         put_device(dev);
> @@ -573,8 +546,6 @@ int rpmsg_register_device_override(struct rpmsg_device *rpdev,
>         ret = device_add(dev);
>         if (ret) {
>                 dev_err(dev, "device_add failed: %d\n", ret);
> -               kfree(rpdev->driver_override);
> -               rpdev->driver_override = NULL;
>                 put_device(dev);
>         }
>
> diff --git a/drivers/rpmsg/qcom_glink_native.c b/drivers/rpmsg/qcom_glink_native.c
> index 401a4ece0c97..d9d4468e4cbd 100644
> --- a/drivers/rpmsg/qcom_glink_native.c
> +++ b/drivers/rpmsg/qcom_glink_native.c
> @@ -1626,7 +1626,6 @@ static void qcom_glink_rpdev_release(struct device *dev)
>  {
>         struct rpmsg_device *rpdev = to_rpmsg_device(dev);
>
> -       kfree(rpdev->driver_override);
>         kfree(rpdev);
>  }
>
> @@ -1862,7 +1861,6 @@ static void qcom_glink_device_release(struct device *dev)
>
>         /* Release qcom_glink_alloc_channel() reference */
>         kref_put(&channel->refcount, qcom_glink_channel_release);
> -       kfree(rpdev->driver_override);
>         kfree(rpdev);
>  }
>
> diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
> index 5ae15111fb4f..1b8bb05924af 100644
> --- a/drivers/rpmsg/virtio_rpmsg_bus.c
> +++ b/drivers/rpmsg/virtio_rpmsg_bus.c
> @@ -374,7 +374,6 @@ static void virtio_rpmsg_release_device(struct device *dev)
>         struct rpmsg_device *rpdev = to_rpmsg_device(dev);
>         struct virtio_rpmsg_channel *vch = to_virtio_rpmsg_channel(rpdev);
>
> -       kfree(rpdev->driver_override);
>         kfree(vch);
>  }
>
> diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h
> index 83266ce14642..2e40eb54155e 100644
> --- a/include/linux/rpmsg.h
> +++ b/include/linux/rpmsg.h
> @@ -41,9 +41,6 @@ struct rpmsg_channel_info {
>   * rpmsg_device - device that belong to the rpmsg bus
>   * @dev: the device struct
>   * @id: device id (used to match between rpmsg drivers and devices)
> - * @driver_override: driver name to force a match; do not set directly,
> - *                   because core frees it; use driver_set_override() to
> - *                   set or clear it.
>   * @src: local address
>   * @dst: destination address
>   * @ept: the rpmsg endpoint of this channel
> @@ -53,7 +50,6 @@ struct rpmsg_channel_info {
>  struct rpmsg_device {
>         struct device dev;
>         struct rpmsg_device_id id;
> -       const char *driver_override;
>         u32 src;
>         u32 dst;
>         struct rpmsg_endpoint *ept;
> --
> 2.34.1

^ permalink raw reply

* Re: [GIT PULL] Hyper-V fixes for v7.1-rc8
From: pr-tracker-bot @ 2026-06-08 15:03 UTC (permalink / raw)
  To: Wei Liu
  Cc: Linus Torvalds, Wei Liu, Linux on Hyper-V List, Linux Kernel List,
	kys, haiyangz, decui, longli
In-Reply-To: <20260608053408.GA1541576@liuwe-devbox-debian-v2.local>

The pull request you sent on Sun, 7 Jun 2026 22:34:08 -0700:

> ssh://git@gitolite.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git tags/hyperv-fixes-signed-20260607

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/e92a7628772ba49f3cdc1d141cd2b0b5d607bda2

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html

^ permalink raw reply

* [PATCH net v2 2/2] net: mana: guard TX wq object destroy with INVALID_MANA_HANDLE check
From: Aditya Garg @ 2026-06-08 10:13 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, horms, shradhagupta, dipayanroy, ernis,
	kees, shacharr, stephen, gargaditya, gargaditya, ssengar,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260608101345.2267320-1-gargaditya@linux.microsoft.com>

mana_create_txq() has several error paths (after mana_alloc_queues() or
mana_create_wq_obj() failure) where tx_qp[i].tx_object stays as the
INVALID_MANA_HANDLE sentinel set at allocation. mana_destroy_txq() then
unconditionally calls mana_destroy_wq_obj() with (u64)-1, which firmware
rejects and logs an error.

Mirror the RX-side pattern in mana_destroy_rxq() and skip the destroy
when the handle is still INVALID_MANA_HANDLE.

Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index c9b1df1ed109..d7de4c4d25bb 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2334,7 +2334,8 @@ static void mana_destroy_txq(struct mana_port_context *apc)
 			netif_napi_del_locked(napi);
 			apc->tx_qp[i].txq.napi_initialized = false;
 		}
-		mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object);
+		if (apc->tx_qp[i].tx_object != INVALID_MANA_HANDLE)
+			mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object);
 
 		mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq);
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net v2 1/2] net: mana: initialize gdma queue id to INVALID_QUEUE_ID
From: Aditya Garg @ 2026-06-08 10:13 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, horms, shradhagupta, dipayanroy, ernis,
	kees, shacharr, stephen, gargaditya, gargaditya, ssengar,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260608101345.2267320-1-gargaditya@linux.microsoft.com>

mana_gd_create_mana_wq_cq() leaves queue->id as 0 (from kzalloc_obj())
until mana_create_wq_obj() assigns the firmware-returned id. If creation
fails before that, cleanup calls mana_gd_destroy_cq() with id 0, NULLing
gc->cq_table[0] and silently breaking whichever real CQ owns that slot.

Initialize queue->id to INVALID_QUEUE_ID right after allocation, matching
mana_gd_create_eq(). The existing (id >= max_num_cqs) guard then
short-circuits cleanly.

Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Signed-off-by: Aditya Garg <gargaditya@linux.microsoft.com>
Reviewed-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index d8e816882f02..ac71ca8450bf 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1192,6 +1192,8 @@ int mana_gd_create_mana_wq_cq(struct gdma_dev *gd,
 	if (!queue)
 		return -ENOMEM;
 
+	queue->id = INVALID_QUEUE_ID;
+
 	gmi = &queue->mem_info;
 	err = mana_gd_alloc_memory(gc, spec->queue_size, gmi);
 	if (err) {
-- 
2.43.0


^ permalink raw reply related

* [PATCH net v2 0/2] net: mana: fix error-path issues in queue setup
From: Aditya Garg @ 2026-06-08 10:13 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, horms, shradhagupta, dipayanroy, ernis,
	kees, shacharr, stephen, gargaditya, gargaditya, ssengar,
	linux-hyperv, netdev, linux-kernel

Two error-path fixes in MANA queue setup, both surfaced during Sashiko
AI review of a recently upstreamed patch series.

Patch 1 initializes queue->id to INVALID_QUEUE_ID in
mana_gd_create_mana_wq_cq() so that a CQ creation failure before the
firmware id is assigned does not NULL gc->cq_table[0] and silently
break whichever real CQ owns that slot. This mirrors the existing
pattern in mana_gd_create_eq().

Patch 2 guards mana_destroy_txq()'s call to mana_destroy_wq_obj() with
an INVALID_MANA_HANDLE check, mirroring mana_destroy_rxq(). Without
it, TX setup failures lead to a firmware-rejected destroy of (u64)-1
and a spurious error in dmesg.

Changes in v2:
  - Rebased onto net.

Aditya Garg (2):
  net: mana: initialize gdma queue id to INVALID_QUEUE_ID
  net: mana: guard TX wq object destroy with INVALID_MANA_HANDLE check

 drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 ++
 drivers/net/ethernet/microsoft/mana/mana_en.c   | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

-- 
2.43.0

^ permalink raw reply

* [PATCH 3/4] x86/msr: Switch wrmsrl() users to wrmsrq()
From: Juergen Gross @ 2026-06-08  8:28 UTC (permalink / raw)
  To: linux-kernel, x86, linux-perf-users, kvm, linux-coco,
	linux-hyperv, linux-pm
  Cc: Juergen Gross, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	James Clark, Thomas Gleixner, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Tony Luck, Reinette Chatre, Dave Martin,
	James Morse, Babu Moger, Sean Christopherson, Paolo Bonzini,
	Kiryl Shutsemau, Rick Edgecombe, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Rafael J. Wysocki, Artem Bityutskiy,
	Artem Bityutskiy, Len Brown
In-Reply-To: <20260608082809.3492719-1-jgross@suse.com>

wrmsrl() is a deprecated synonym for wrmsrq(). Switch its users to
wrmsrq().

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/events/amd/uncore.c          | 2 +-
 arch/x86/events/intel/core.c          | 4 ++--
 arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
 arch/x86/kernel/process_64.c          | 2 +-
 arch/x86/kvm/pmu.c                    | 6 +++---
 arch/x86/kvm/vmx/tdx.c                | 6 +++---
 drivers/hv/mshv_vtl_main.c            | 2 +-
 drivers/idle/intel_idle.c             | 2 +-
 8 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 98ef4bf9911a..7dc6af4231cc 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -975,7 +975,7 @@ static void amd_uncore_umc_read(struct perf_event *event)
 	 * that the counter never gets a chance to saturate.
 	 */
 	if (new & BIT_ULL(63 - COUNTER_SHIFT)) {
-		wrmsrl(hwc->event_base, 0);
+		wrmsrq(hwc->event_base, 0);
 		local64_set(&hwc->prev_count, 0);
 	} else {
 		local64_set(&hwc->prev_count, new);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index dd1e3aa75ee9..e9baa64dc962 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3166,12 +3166,12 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload)
 	}
 
 	if (cpuc->acr_cfg_b[idx] != mask) {
-		wrmsrl(msr_b + msr_offset, mask);
+		wrmsrq(msr_b + msr_offset, mask);
 		cpuc->acr_cfg_b[idx] = mask;
 	}
 	/* Only need to update the reload value when there is a valid config value. */
 	if (mask && cpuc->acr_cfg_c[idx] != reload) {
-		wrmsrl(msr_c + msr_offset, reload);
+		wrmsrq(msr_c + msr_offset, reload);
 		cpuc->acr_cfg_c[idx] = reload;
 	}
 }
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index c5ed0bc1f831..e4918c32a822 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -532,7 +532,7 @@ static void resctrl_abmc_config_one_amd(void *info)
 {
 	union l3_qos_abmc_cfg *abmc_cfg = info;
 
-	wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
+	wrmsrq(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full);
 }
 
 /*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b85e715ebb30..d44afbe005bb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -708,7 +708,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/* Reset hw history on AMD CPUs */
 	if (cpu_feature_enabled(X86_FEATURE_AMD_WORKLOAD_CLASS))
-		wrmsrl(MSR_AMD_WORKLOAD_HRST, 0x1);
+		wrmsrq(MSR_AMD_WORKLOAD_HRST, 0x1);
 
 	return prev_p;
 }
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index e218352e3423..aee70e5dc15d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1313,14 +1313,14 @@ static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu)
 		pmc = &pmu->gp_counters[i];
 
 		if (pmc->counter != rdpmc(i))
-			wrmsrl(gp_counter_msr(i), pmc->counter);
-		wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw);
+			wrmsrq(gp_counter_msr(i), pmc->counter);
+		wrmsrq(gp_eventsel_msr(i), pmc->eventsel_hw);
 	}
 	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
 		pmc = &pmu->fixed_counters[i];
 
 		if (pmc->counter != rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i))
-			wrmsrl(fixed_counter_msr(i), pmc->counter);
+			wrmsrq(fixed_counter_msr(i), pmc->counter);
 	}
 }
 
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 04ce321ebdf3..cb50e23c39ca 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -823,7 +823,7 @@ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
 		return;
 
 	++vcpu->stat.host_state_reload;
-	wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
+	wrmsrq(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
 
 	vt->guest_state_loaded = false;
 }
@@ -1048,10 +1048,10 @@ static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
 
 	/*
 	 * Likewise, even if a TDX hosts didn't support XSS both arms of
-	 * the comparison would be 0 and the wrmsrl would be skipped.
+	 * the comparison would be 0 and the wrmsrq would be skipped.
 	 */
 	if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
-		wrmsrl(MSR_IA32_XSS, kvm_host.xss);
+		wrmsrq(MSR_IA32_XSS, kvm_host.xss);
 }
 
 #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c
index f5d27f28d6ad..0d3d4161974f 100644
--- a/drivers/hv/mshv_vtl_main.c
+++ b/drivers/hv/mshv_vtl_main.c
@@ -596,7 +596,7 @@ static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set)
 		} else {
 			/* Handle MSRs */
 			if (set)
-				wrmsrl(reg_table[i].msr_addr, *reg64);
+				wrmsrq(reg_table[i].msr_addr, *reg64);
 			else
 				rdmsrq(reg_table[i].msr_addr, *reg64);
 		}
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 15c698291b32..67d5993c7387 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -2379,7 +2379,7 @@ static void intel_c1_demotion_toggle(void *enable)
 		msr_val |= NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE;
 	else
 		msr_val &= ~(NHM_C1_AUTO_DEMOTE | SNB_C1_AUTO_UNDEMOTE);
-	wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
+	wrmsrq(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
 }
 
 static ssize_t intel_c1_demotion_store(struct device *dev,
-- 
2.54.0


^ permalink raw reply related

* [PATCH 1/4] x86/msr: Switch rdmsrl() users to rdmsrq()
From: Juergen Gross @ 2026-06-08  8:28 UTC (permalink / raw)
  To: linux-kernel, x86, linux-perf-users, linux-hyperv, linux-pm
  Cc: Juergen Gross, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	James Clark, Thomas Gleixner, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Tony Luck, Reinette Chatre, Dave Martin,
	James Morse, Babu Moger, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Rafael J. Wysocki, Artem Bityutskiy,
	Artem Bityutskiy, Len Brown
In-Reply-To: <20260608082809.3492719-1-jgross@suse.com>

rdmsrl() is a deprecated synonym for rdmsrq(). Switch its users to
rdmsrq().

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 arch/x86/events/amd/uncore.c          | 2 +-
 arch/x86/kernel/cpu/resctrl/monitor.c | 2 +-
 drivers/hv/mshv_vtl_main.c            | 2 +-
 drivers/idle/intel_idle.c             | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index dd956cfcadef..98ef4bf9911a 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -966,7 +966,7 @@ static void amd_uncore_umc_read(struct perf_event *event)
 	 * UMC counters do not have RDPMC assignments. Read counts directly
 	 * from the corresponding PERF_CTR.
 	 */
-	rdmsrl(hwc->event_base, new);
+	rdmsrq(hwc->event_base, new);
 
 	/*
 	 * Unlike the other uncore counters, UMC counters saturate and set the
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 59215fef3924..c5ed0bc1f831 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -301,7 +301,7 @@ static int __cntr_id_read(u32 cntr_id, u64 *val)
 	 * is set if the counter data is unavailable.
 	 */
 	wrmsr(MSR_IA32_QM_EVTSEL, ABMC_EXTENDED_EVT_ID | ABMC_EVT_ID, cntr_id);
-	rdmsrl(MSR_IA32_QM_CTR, msr_val);
+	rdmsrq(MSR_IA32_QM_CTR, msr_val);
 
 	if (msr_val & RMID_VAL_ERROR)
 		return -EIO;
diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c
index c19400701467..f5d27f28d6ad 100644
--- a/drivers/hv/mshv_vtl_main.c
+++ b/drivers/hv/mshv_vtl_main.c
@@ -598,7 +598,7 @@ static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set)
 			if (set)
 				wrmsrl(reg_table[i].msr_addr, *reg64);
 			else
-				rdmsrl(reg_table[i].msr_addr, *reg64);
+				rdmsrq(reg_table[i].msr_addr, *reg64);
 		}
 		return 0;
 	}
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index f49354e37777..15c698291b32 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -2370,7 +2370,7 @@ static void intel_c1_demotion_toggle(void *enable)
 {
 	unsigned long long msr_val;
 
-	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
+	rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
 	/*
 	 * Enable/disable C1 undemotion along with C1 demotion, as this is the
 	 * most sensible configuration in general.
@@ -2410,7 +2410,7 @@ static ssize_t intel_c1_demotion_show(struct device *dev,
 	 * Read the MSR value for a CPU and assume it is the same for all CPUs. Any other
 	 * configuration would be a BIOS bug.
 	 */
-	rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
+	rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, msr_val);
 	return sysfs_emit(buf, "%d\n", !!(msr_val & NHM_C1_AUTO_DEMOTE));
 }
 static DEVICE_ATTR_RW(intel_c1_demotion);
-- 
2.54.0


^ permalink raw reply related

* [PATCH 0/4] x86/msr: Get rid of rdmsrl() and wrmsrl()
From: Juergen Gross @ 2026-06-08  8:28 UTC (permalink / raw)
  To: linux-kernel, x86, linux-perf-users, linux-hyperv, linux-pm, kvm,
	linux-coco
  Cc: Juergen Gross, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Mark Rutland,
	Alexander Shishkin, Jiri Olsa, Ian Rogers, Adrian Hunter,
	James Clark, Thomas Gleixner, Borislav Petkov, Dave Hansen,
	H. Peter Anvin, Tony Luck, Reinette Chatre, Dave Martin,
	James Morse, Babu Moger, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Rafael J. Wysocki, Artem Bityutskiy,
	Artem Bityutskiy, Len Brown, Sean Christopherson, Paolo Bonzini,
	Kiryl Shutsemau, Rick Edgecombe

rdmsrl() and wrmsrl() are deprecated aliases of rdmsrq() and wrmsrq().

Switch all users and remove the deprecated variants.

Juergen Gross (4):
  x86/msr: Switch rdmsrl() users to rdmsrq()
  x86/msr: Remove rdmsrl()
  x86/msr: Switch wrmsrl() users to wrmsrq()
  x86/msr: Remove wrmsrl()

 arch/x86/events/amd/uncore.c          | 4 ++--
 arch/x86/events/intel/core.c          | 4 ++--
 arch/x86/include/asm/msr.h            | 5 -----
 arch/x86/kernel/cpu/resctrl/monitor.c | 4 ++--
 arch/x86/kernel/process_64.c          | 2 +-
 arch/x86/kvm/pmu.c                    | 6 +++---
 arch/x86/kvm/vmx/tdx.c                | 6 +++---
 drivers/hv/mshv_vtl_main.c            | 4 ++--
 drivers/idle/intel_idle.c             | 6 +++---
 9 files changed, 18 insertions(+), 23 deletions(-)

-- 
2.54.0


^ permalink raw reply

* Re: [PATCH] Drivers: hv: mshv: add bounds check on vp_index in mshv_intercept_isr()
From: Wei Liu @ 2026-06-08  6:23 UTC (permalink / raw)
  To: Junrui Luo
  Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Jinank Jain, Praveen K Paladugu, Mukesh Rathor, Nuno Das Neves,
	Anirudh Rayabharam, Roman Kisel, Muminul Islam, linux-hyperv,
	linux-kernel, Stanislav Kinsburskii, Yuhao Jiang
In-Reply-To: <SYBPR01MB7881B8B5D35E02A0E8404E4FAF232@SYBPR01MB7881.ausprd01.prod.outlook.com>

On Thu, Apr 16, 2026 at 10:18:05PM +0800, Junrui Luo wrote:
> mshv_intercept_isr() extracts vp_index from the hypervisor message
> payload and uses it directly to index into pt_vp_array without
> validation. handle_bitset_message() and handle_pair_message() already
> validate vp_index against MSHV_MAX_VPS before array access.
> 
> A vp_index exceeding MSHV_MAX_VPS leads to an out-of-bounds read from
> pt_vp_array.
> 
> Add the same MSHV_MAX_VPS bounds check for consistency with the other
> message handlers.
> 
> Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
> Reported-by: Yuhao Jiang <danisjiang@gmail.com>
> Signed-off-by: Junrui Luo <moonafterrain@outlook.com>

As noted elsewhere, the hypervisor shouldn't give us an out-of-bounds
index. It has many different ways to screw with the root kernel, so I'm
not overly concerned about this.

That said, having a bit more consistency and defensive programming
doesn't hurt. I have applied this patch. Thanks.

Wei

> ---
>  drivers/hv/mshv_synic.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
> index 43f1bcbbf2d3..5bceb8122981 100644
> --- a/drivers/hv/mshv_synic.c
> +++ b/drivers/hv/mshv_synic.c
> @@ -384,6 +384,10 @@ mshv_intercept_isr(struct hv_message *msg)
>  	 */
>  	vp_index =
>  	       ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index;
> +	if (unlikely(vp_index >= MSHV_MAX_VPS)) {
> +		pr_debug("VP index %u out of bounds\n", vp_index);
> +		goto unlock_out;
> +	}
>  	vp = partition->pt_vp_array[vp_index];
>  	if (unlikely(!vp)) {
>  		pr_debug("failed to find VP %u\n", vp_index);
> 
> ---
> base-commit: 7aaa8047eafd0bd628065b15757d9b48c5f9c07d
> change-id: 20260416-fixes-693196e52f93
> 
> Best regards,
> -- 
> Junrui Luo <moonafterrain@outlook.com>
> 

^ permalink raw reply

* Re: [PATCH] x86/hyperv: Cosmetic changes in irqdomain.c for readability
From: Wei Liu @ 2026-06-08  6:00 UTC (permalink / raw)
  To: Mukesh R; +Cc: linux-hyperv, linux-kernel, wei.liu
In-Reply-To: <20260601225116.956392-1-mrathor@linux.microsoft.com>

On Mon, Jun 01, 2026 at 03:51:16PM -0700, Mukesh R wrote:
> Make cosmetic changes:
>  o Rename struct pci_dev *dev to *pdev since there are cases of
>    struct device *dev in the file and all over the kernel
>  o Rename hv_build_pci_dev_id to hv_build_devid_type_pci in anticipation
>    of building different types of device ids
>  o Fix checkpatch.pl issues with return and extraneous printk
>  o Replace spaces with tabs
>  o Rename struct hv_devid *xxx to struct hv_devid *hv_devid given code
>    paths involve many types of device ids
>  o Fix indentation in a large if block by using goto.
> 
> There are no functional changes.
> 
> Reviewed-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>

Applied to hyperv-next.

Wei

^ permalink raw reply

* [GIT PULL] Hyper-V fixes for v7.1-rc8
From: Wei Liu @ 2026-06-08  5:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Wei Liu, Linux on Hyper-V List, Linux Kernel List, kys, haiyangz,
	decui, longli

Hi Linus,

The following changes since commit 254f49634ee16a731174d2ae34bc50bd5f45e731:

  Linux 7.1-rc1 (2026-04-26 14:19:00 -0700)

are available in the Git repository at:

  ssh://git@gitolite.kernel.org/pub/scm/linux/kernel/git/hyperv/linux.git tags/hyperv-fixes-signed-20260607

for you to fetch changes up to 98e0fc32e53dd62cd38a0d67eaf5846ae20078cc:

  mshv: support 1G hugepages by passing them as 2M-aligned chunks (2026-05-27 15:30:15 -0700)

----------------------------------------------------------------
hyperv-fixes for v7.1-rc8
 - MSHV driver fixes from various people (Anirudh Rayabharam, Can Peng,
   Dexuan Cui, Michael Kelley, Jork Loeser, Wei Liu)
 - Hyper-V user space tools fixes (Thorsten Blum)
 - Allow VMBus to be unloaded after frame buffer is flushed (Michael
   Kelley)
----------------------------------------------------------------
Anirudh Rayabharam (Microsoft) (1):
      mshv: support 1G hugepages by passing them as 2M-aligned chunks

Can Peng (1):
      mshv: use kmalloc_array in mshv_root_scheduler_init

Dexuan Cui (2):
      hyperv: Clean up and fix the guest ID comment in hvgdk.h
      Drivers: hv: vmbus: Improve the logic of reserving fb_mmio on Gen2 VMs

Jork Loeser (3):
      mshv: limit SynIC management to MSHV-owned resources
      mshv: clean up SynIC state on kexec for L1VH
      mshv: unmap debugfs stats pages on kexec

Michael Kelley (3):
      Drivers: hv: vmbus: Provide option to skip VMBus unload on panic
      drm/hyperv: During panic do VMBus unload after frame buffer is flushed
      mshv: Add conditional VMBus dependency

Thorsten Blum (2):
      hv: utils: handle and propagate errors in kvp_register
      hv: utils: replace deprecated strcpy with strscpy in kvp_register

Wei Liu (1):
      mshv: add a missing padding field

 drivers/gpu/drm/hyperv/hyperv_drm_drv.c     |   5 +
 drivers/gpu/drm/hyperv/hyperv_drm_modeset.c |  15 +--
 drivers/hv/Kconfig                          |   1 +
 drivers/hv/channel_mgmt.c                   |   1 +
 drivers/hv/hv.c                             |   3 +
 drivers/hv/hv_kvp.c                         |  27 +++--
 drivers/hv/hyperv_vmbus.h                   |   1 -
 drivers/hv/mshv_debugfs.c                   |   7 +-
 drivers/hv/mshv_regions.c                   |  29 +++---
 drivers/hv/mshv_root_main.c                 |   2 +-
 drivers/hv/mshv_synic.c                     | 156 ++++++++++++++++++----------
 drivers/hv/vmbus_drv.c                      |  54 ++++++++--
 include/hyperv/hvgdk.h                      |  10 +-
 include/hyperv/hvhdk.h                      |   1 +
 include/linux/hyperv.h                      |   7 ++
 15 files changed, 207 insertions(+), 112 deletions(-)

^ permalink raw reply

* Re: [PATCH net 0/2] net: mana: fix error-path issues in queue setup
From: Aditya Garg @ 2026-06-08  4:43 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, pabeni, horms, shradhagupta, dipayanroy, ernis, kees,
	shacharr, stephen, gargaditya, ssengar, linux-hyperv, netdev,
	linux-kernel
In-Reply-To: <20260605182748.5f106575@kernel.org>

On 06-06-2026 06:57, Jakub Kicinski wrote:
> On Thu,  4 Jun 2026 01:01:24 -0700 Aditya Garg wrote:
>> Two error-path fixes in MANA queue setup, both surfaced during Sashiko
>> AI review of a recently upstreamed patch series.
>>
>> Patch 1 initializes queue->id to INVALID_QUEUE_ID in
>> mana_gd_create_mana_wq_cq() so that a CQ creation failure before the
>> firmware id is assigned does not NULL gc->cq_table[0] and silently
>> break whichever real CQ owns that slot. This mirrors the existing
>> pattern in mana_gd_create_eq().
>>
>> Patch 2 guards mana_destroy_txq()'s call to mana_destroy_wq_obj() with
>> an INVALID_MANA_HANDLE check, mirroring mana_destroy_rxq(). Without
>> it, TX setup failures lead to a firmware-rejected destroy of (u64)-1
>> and a spurious error in dmesg.
> 
> Looks like these patches were generated against net-next, please rebase:
> 
> Applying: net: mana: initialize gdma queue id to INVALID_QUEUE_ID
> Applying: net: mana: guard TX wq object destroy with INVALID_MANA_HANDLE check
> error: patch failed: drivers/net/ethernet/microsoft/mana/mana_en.c:2351
> error: drivers/net/ethernet/microsoft/mana/mana_en.c: patch does not apply
> Patch failed at 0002 net: mana: guard TX wq object destroy with INVALID_MANA_HANDLE check

Thanks Jakub for pointing it out.
I'll rebase against net and post a v2

Regards,
Aditya

^ permalink raw reply

* Re: [PATCH 1/1] Drivers: hv: vmbus: Set DMA coherent mask for VMBus devices
From: sashiko-bot @ 2026-06-08  2:19 UTC (permalink / raw)
  To: Michael Kelley; +Cc: linux-hyperv
In-Reply-To: <20260608020616.52852-1-mhklkml@zohomail.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:

Pre-existing issues:
- [High] If device_register() fails in vmbus_device_register(), the error handling path panics the kernel due to a BUG_ON, and would result in a double free if the panic were bypassed.
--

commit 575a138c8a50043a8fa9993f9a0f7b0f34865b8a
Author: Michael Kelley <mhklinux@outlook.com>
Drivers: hv: vmbus: Set DMA coherent mask for VMBus devices

This commit fixes DMA allocation failures for VMBus devices in CoCo VMs.
It ensures the coherent_dma_mask is set to 64 bits at the same time the
dma_mask is configured during vmbus_device_register().

> diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
> index 23206640c6139..a6b9a33db6578 100644
> --- a/drivers/hv/vmbus_drv.c
> +++ b/drivers/hv/vmbus_drv.c
> @@ -2169,6 +2169,7 @@ int vmbus_device_register(struct hv_device *child_device_obj)
>  	child_device_obj->device.dma_parms = &child_device_obj->dma_parms;
>  	child_device_obj->device.dma_mask = &child_device_obj->dma_mask;
>  	dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64));
> +	dma_set_coherent_mask(&child_device_obj->device, DMA_BIT_MASK(64));
>  
>  	/*
>  	 * Register with the LDM. This will kick off the driver/device

[Severity: High]
This is a pre-existing issue, but does the error handling path for
device_register() further down in this function cause a kernel panic?

If device_register() fails, the code calls put_device():

vmbus_device_register() {
    ...
    ret = device_register(&child_device_obj->device);
    if (ret) {
        pr_err("Unable to register child device\n");
        put_device(&child_device_obj->device);
        return ret;
    }
    ...
}

This triggers the release callback vmbus_device_release(), which
unconditionally invokes hv_process_channel_removal(). However,
hv_process_channel_removal() begins with:

    BUG_ON(!channel->rescind);

Since channel->rescind is false during initial channel offer processing,
would this BUG_ON fire and crash the kernel?

Additionally, if the BUG_ON were bypassed, hv_process_channel_removal()
calls free_channel(channel). The error then propagates up to
vmbus_add_channel_work(), which takes the err_deq_chan label and calls
free_channel(newchannel) again. Does this sequence result in a double free?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260608020616.52852-1-mhklkml@zohomail.com?part=1

^ permalink raw reply

* [PATCH 1/1] Drivers: hv: vmbus: Set DMA coherent mask for VMBus devices
From: Michael Kelley @ 2026-06-08  2:06 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv; +Cc: linux-kernel

From: Michael Kelley <mhklinux@outlook.com>

In current code, the coherent_dma_mask for VMBus devices is not set, so
it has the default value of 0, which essentially means "invalid". Because
drivers for VMBus devices do not use dma_alloc_*() functions, the usual
use of the coherent mask does not occur, and no errors result.

However, a valid coherent_dma_mask may be needed even though the drivers
don't use dma_alloc_*() functions. In a CoCo VM, the VMBus storvsc and
netvsc drivers must bounce buffer DMA operations through the swiotlb
because the Hyper-V host can't DMA into encrypted guest memory. If the
kernel is built with CONFIG_SWIOTLB_DYNAMIC and the initial swiotlb size
is small, swiotlb code may need to grow the swiotlb in response to a DMA
mapping request. That growth first allocates a transient pool while the
swiotlb is expanded in the background. The transient pool memory is
allocated from the DMA atomic pools, and the allocation code checks for
a valid coherent_dma_mask. With current code, this check fails, then the
DMA mapping request from the storvsc or netvsc driver fails, and finally
an I/O error occurs.

Fix this problem by setting coherent_dma_mask for VMBus devices at the
same time that dma_mask is set. Being a synthetic bus, VMBus does not
have any restrictions on coherent DMA, so the coherent mask is set to
the full 64 bits for all VMBus devices, just like with dma_mask.

Signed-off-by: Michael Kelley <mhklinux@outlook.com>
---
I have not provided a Fixes: tag because the scenario under which
the error occurs is an artificial test case that I came across
while stressing an unrelated patch set. The fix is valid for general
goodness, but the likelihood of the problem occurring in the real
world is extremely small. So I see little value in adding this
patch to the stable kernel maintainers' workload. If someone wants
to argue otherwise, I have no fundamental objection to adding the
Fixes: tag.

 drivers/hv/vmbus_drv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index c9eeb2ec365d..26e8273bbddd 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2192,6 +2192,7 @@ int vmbus_device_register(struct hv_device *child_device_obj)
 	child_device_obj->device.dma_parms = &child_device_obj->dma_parms;
 	child_device_obj->device.dma_mask = &child_device_obj->dma_mask;
 	dma_set_mask(&child_device_obj->device, DMA_BIT_MASK(64));
+	dma_set_coherent_mask(&child_device_obj->device, DMA_BIT_MASK(64));

 	/*
 	 * Register with the LDM. This will kick off the driver/device
-- 
2.25.1

^ permalink raw reply related

* Re: [PATCH net v3] net: mana: Optimize irq affinity for low vcpu configs
From: Shradha Gupta @ 2026-06-07 12:16 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Dexuan Cui, Wei Liu, Haiyang Zhang, K. Y. Srinivasan, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Konstantin Taranov,
	Simon Horman, Erni Sri Satya Vennela, Dipayaan Roy, Shiraz Saleem,
	Michael Kelley, Long Li, Yury Norov, linux-hyperv, linux-kernel,
	netdev, Paul Rosswurm, Shradha Gupta, Saurabh Singh Sengar,
	stable
In-Reply-To: <6d1fa9d9-73c2-48b5-95a1-51710d81b3ed@redhat.com>

On Thu, Jun 04, 2026 at 12:45:03PM +0200, Paolo Abeni wrote:
> On 6/1/26 12:27 PM, Shradha Gupta wrote:
> > In mana driver, the number of IRQs allocated is capped by the
> > min(num_cpu + 1, queue count). In cases, where the IRQ count is greater
> > than the vcpu count, we want to utilize all the vCPUs, irrespective of
> > their NUMA/core bindings.
> > 
> > This is important, especially in the envs where number of vCPUs are so
> > few that the softIRQ handling overhead on two IRQs on the same vCPU is
> > much more than their overheads if they were spread across sibling vCPUs.
> > 
> > This behaviour is more evident with dynamic IRQ allocation. Since MANA
> > IRQs are assigned at a later stage compared to static allocation, other
> > device IRQs may already be affinitized to the vCPUs. As a result, IRQ
> > weights become imbalanced, causing multiple MANA IRQs to land on the
> > same vCPU, while some vCPUs have none.
> > 
> > In such cases when many parallel TCP connections are tested, the
> > throughput drops significantly.
> > 
> > Test envs:
> > =======================================================
> > Case 1: without this patch
> > =======================================================
> > 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
> > 
> > 	TYPE		effective vCPU aff
> > =======================================================
> > IRQ0:	HWC		0
> > IRQ1:	mana_q1		0
> > IRQ2:	mana_q2		2
> > IRQ3:	mana_q3		0
> > IRQ4:	mana_q4		3
> > 
> > %soft on each vCPU(mpstat -P ALL 1) on receiver
> > vCPU		0	1	2	3
> > =======================================================
> > pass 1:		38.85	0.03	24.89	24.65
> > pass 2:		39.15	0.03	24.57	25.28
> > pass 3:		40.36	0.03	23.20	23.17
> > 
> > =======================================================
> > Case 2: with this patch
> > =======================================================
> > 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
> > 
> >         TYPE            effective vCPU aff
> > =======================================================
> > IRQ0:   HWC             0
> > IRQ1:   mana_q1         0
> > IRQ2:   mana_q2         1
> > IRQ3:   mana_q3         2
> > IRQ4:   mana_q4         3
> > 
> > %soft on each vCPU(mpstat -P ALL 1) on receiver
> > vCPU            0       1       2       3
> > =======================================================
> > pass 1:         15.42	15.85	14.99	14.51
> > pass 2:         15.53	15.94	15.81	15.93
> > pass 3:         16.41	16.35	16.40	16.36
> > 
> > =======================================================
> > Throughput Impact(in Gbps, same env)
> > =======================================================
> > TCP conn	with patch	w/o patch
> > 20480		15.65		7.73
> > 10240		15.63		8.93
> > 8192		15.64		9.69
> > 6144		15.64		13.16
> > 4096		15.69		15.75
> > 2048		15.69		15.83
> > 1024		15.71		15.28
> > 
> > Fixes: 755391121038 ("net: mana: Allocate MSI-X vectors dynamically")
> > Cc: stable@vger.kernel.org
> > Co-developed-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> > Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> > Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> > Reviewed-by: Simon Horman <horms@kernel.org>
> 
> Why do you consider this patch a fix? To me is a configuration
> improvement and should land on net-next.

Hi Paolo,

This is a fix for commit 755391121038 ("net: mana: Allocate MSI-X
vectors dynamically"). Before that commit, IRQs were statically
allocated and clustering of MANA IRQs happened less often on low vCPU
configs. With dynamic allocation, MANA IRQs are assigned at a later
stage when other device IRQs have already occupied vCPUs. The NUMA-aware
affinity logic in that commit increased the probability of IRQ
clustering, causing a 2x throughput regression (15.65 vs 7.73 Gbps) on
low vCPU Azure SKUs at high connection counts.

> 
> > @@ -1717,11 +1719,24 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node,
> >  	return 0;
> >  }
> >  
> > +/* should be called with cpus_read_lock() held */
> 
> Minor nit: s/should/must/ or just drop the comment, as
> `for_each_online_cpu()` usage implies that.
> 

Thanks, will change this in next version.

> > +static void irq_setup_linear(unsigned int *irqs, unsigned int len)
> > +{
> > +	int cpu;
> > +
> > +	for_each_online_cpu(cpu) {
> > +		if (len == 0)
> > +			break;
> > +
> > +		irq_set_affinity_and_hint(*irqs++, cpumask_of(cpu));
> > +		len--;
> > +	}
> 
> As this is another heuristic regarding irq spreading, why don't you
> implement that inside irq_setup()?
> 

irq_setup() already handles multiple cases - dynamic, static, and HWC
affinity logic with NUMA-aware sibling group spreading. Adding the
linear case there would make it more complex and harder to follow.
Keeping it as a separate function makes both the NUMA-aware and linear
paths easier to understand and maintain.

Happy to reconsider if you feel strongly about it.

> > @@ -1767,13 +1784,42 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> >  	 * first CPU sibling group since they are already affinitized to HWC IRQ
> >  	 */
> >  	cpus_read_lock();
> > -	if (gc->num_msix_usable <= num_online_cpus())
> > -		skip_first_cpu = true;
> > +	if (gc->num_msix_usable <= num_online_cpus()) {
> > +		err = irq_setup(irqs, nvec, gc->numa_node, true);
> > +		if (err) {
> > +			cpus_read_unlock();
> > +			goto free_irq;
> > +		}
> > +	} else {
> > +		/*
> > +		 * When num_msix_usable are more than num_online_cpus, our
> > +		 * queue IRQs should be equal to num of online vCPUs.
> > +		 * We try to make sure queue IRQs spread across all vCPUs.
> > +		 * In such a case NUMA or CPU core affinity does not matter.
> > +		 * Note: in this case the total mana IRQ should always be
> > +		 * num_online_cpus + 1. The first HWC IRQ is already handled
> > +		 * in HWC setup calls
> > +		 * However, if CPUs went offline since num_msix_usable was
> > +		 * computed, queue IRQs will be more than num_online_cpus().
> > +		 * In such cases remaining extra IRQs will retain their default
> > +		 * affinity.
> > +		 */
> > +		int first_unassigned = num_online_cpus();
> > +		if (nvec > first_unassigned) {
> 
> An empty line is needed between the variable declaration and the code.

noted, Thanks.

> 
> /P

^ permalink raw reply

* [PATCH net-next v2] net: mana: Cache MANA_QUERY_LINK_CONFIG result to avoid repeated HWC queries
From: Erni Sri Satya Vennela @ 2026-06-06 13:32 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, kuba, pabeni, kotaranov, horms, ernis, dipayanroy, kees,
	linux-hyperv, netdev, linux-kernel, linux-rdma

mana_query_link_cfg() sends an HWC command to firmware on every call,
but the link speed and QoS values it returns only change when the
driver explicitly calls mana_set_bw_clamp(). This function is called
not only by userspace via ethtool get_link_ksettings, but also
periodically by hv_netvsc through netvsc_get_link_ksettings and by
the sysfs speed_show attribute via dev_attr_show, resulting in
unnecessary HWC traffic every few minutes.

Add a link_cfg_error field to mana_port_context to cache the query
result. The field uses three states: 1 (not yet queried, initial
value set during mana_probe_port), 0 (success, speed/max_speed are
valid), or a negative errno for permanent errors like -EOPNOTSUPP
when the hardware does not support the command. Transient errors and
qos_unconfigured responses are not cached so that subsequent calls
will retry.

MANA is ops-locked because it implements net_shaper_ops, so the core
already takes netdev_lock() around all ethtool_ops and net_shaper_ops
entry points. Reuse that lock to serialize mana_query_link_cfg() and
mana_set_bw_clamp(). This prevents a concurrent mana_set_bw_clamp()
from racing with an in-flight query and publishing stale pre-clamp
speed/max_speed.

Invalidate the cache inside mana_set_bw_clamp() on success, so all
current and future callers that change the link configuration
automatically trigger a fresh query on the next mana_query_link_cfg()
call. Also reset link_cfg_error during resume in mana_probe() under
netdev_lock(), so that any query already in flight cannot later
store 0 and silently overwrite the post-resume invalidation.

Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v2:
* Use netdev_lock() instead of introducing new per-port mutex.
* Update commit message.
---
 drivers/net/ethernet/microsoft/mana/mana_en.c | 23 +++++++++++++++----
 include/net/mana/mana.h                       |  4 ++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index db14357d3732..af2517a27aad 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1456,6 +1456,12 @@ int mana_query_link_cfg(struct mana_port_context *apc)
 	struct mana_query_link_config_req req = {};
 	int err;

+	netdev_assert_locked(ndev);
+
+	err = apc->link_cfg_error;
+	if (err <= 0)
+		return err;
+
 	mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_LINK_CONFIG,
 			     sizeof(req), sizeof(resp));

@@ -1468,6 +1474,7 @@ int mana_query_link_cfg(struct mana_port_context *apc)
 	if (err) {
 		if (err == -EOPNOTSUPP) {
 			netdev_info_once(ndev, "MANA_QUERY_LINK_CONFIG not supported\n");
+			apc->link_cfg_error = err;
 			return err;
 		}
 		netdev_err(ndev, "Failed to query link config: %d\n", err);
@@ -1485,12 +1492,12 @@ int mana_query_link_cfg(struct mana_port_context *apc)
 		return err;
 	}

-	if (resp.qos_unconfigured) {
-		err = -EINVAL;
-		return err;
-	}
+	if (resp.qos_unconfigured)
+		return -EINVAL;
+
 	apc->speed = resp.link_speed_mbps;
 	apc->max_speed = resp.qos_speed_mbps;
+	apc->link_cfg_error = 0;
 	return 0;
 }

@@ -1502,6 +1509,8 @@ int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
 	struct net_device *ndev = apc->ndev;
 	int err;

+	netdev_assert_locked(ndev);
+
 	mana_gd_init_req_hdr(&req.hdr, MANA_SET_BW_CLAMP,
 			     sizeof(req), sizeof(resp));
 	req.vport = apc->port_handle;
@@ -1535,6 +1544,8 @@ int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
 	if (resp.qos_unconfigured)
 		netdev_info(ndev, "QoS is unconfigured\n");

+	/* Invalidate the cache; next query will re-fetch from firmware. */
+	apc->link_cfg_error = 1;
 	return 0;
 }

@@ -3448,6 +3459,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	apc->port_handle = INVALID_MANA_HANDLE;
 	apc->pf_filter_handle = INVALID_MANA_HANDLE;
 	apc->port_idx = port_idx;
+	apc->link_cfg_error = 1;
 	apc->cqe_coalescing_enable = 0;

 	mutex_init(&apc->vport_mutex);
@@ -3768,6 +3780,9 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 			rtnl_lock();
 			apc = netdev_priv(ac->ports[i]);
 			enable_work(&apc->queue_reset_work);
+			netdev_lock(ac->ports[i]);
+			apc->link_cfg_error = 1;
+			netdev_unlock(ac->ports[i]);
 			err = mana_attach(ac->ports[i]);
 			rtnl_unlock();
 			/* Log the port for which the attach failed, stop
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index d9c27310fd04..2a45ff7211ef 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -555,6 +555,10 @@ struct mana_port_context {
 	u32 speed;
 	/* Maximum speed supported by the SKU (mbps) */
 	u32 max_speed;
+	/* 1 = not queried, 0 = cached success, negative = permanent error.
+	 * Protected by the netdev instance lock.
+	 */
+	int link_cfg_error;

 	bool port_is_up;
 	bool port_st_save; /* Saved port state */
-- 
2.34.1

^ permalink raw reply related

* Re: [PATCH net-next] net: mana: Cache MANA_QUERY_LINK_CONFIG result to avoid repeated HWC queries
From: Erni Sri Satya Vennela @ 2026-06-06 11:17 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, pabeni, kotaranov, horms, dipayanroy, kees,
	linux-hyperv, netdev, linux-kernel, linux-rdma
In-Reply-To: <20260605161315.26784677@kernel.org>

On Fri, Jun 05, 2026 at 04:13:15PM -0700, Jakub Kicinski wrote:
> On Thu, 4 Jun 2026 22:29:29 -0700 Erni Sri Satya Vennela wrote:
> > I tried two netdev_lock-based variants. 
> > 
> > mana_query_link_cfg() has four callers:
> > 
> > 1 ethtool ioctl/netlink			- has RTNL	- has netdev->lock
> > 2 sysfs speed_show/duplex_show		- has RTNL	- no netdev->lock
> > 3 netvsc_get_link_ksettings VF forward	- has RTNL	- no netdev->lock
> > 4 mana_shaper_set			- no RTNL	- has netdev->lock
> > 
> > No existing lock covers all four.
> 
> How fresh is your tree? The just-minted commit 9f275c2e9020 should
> address the gap, I believe?

Hi Jakub,

Thanks for pointing out the commit 9f275c2e9020. It does close the gap.
My analysis was against a tree that predated it but the commit landed
on net-next on Jun 4 21:30 UTC and my reply went out about an hour later,
so the rebase that picked it up hadn't happened on my end yet.

I've now rebased to current net-next and re-walked the four callers of
mana_query_link_cfg(). All of them hold netdev->lock by the time they 
reach mana_query_link_cfg(), and the race scenarios I described no longer
apply.

Thanks,
Vennela

^ permalink raw reply

* Re: [PATCH v4 10/47] x86/tsc: Consolidate forcing of X86_FEATURE_TSC_KNOWN_FREQ for PV code
From: David Woodhouse @ 2026-06-06 10:52 UTC (permalink / raw)
  To: Thomas Gleixner, Sean Christopherson, Paolo Bonzini, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, Kiryl Shutsemau,
	K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H. Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, Tom Lendacky, Nikunj A Dadhania,
	Michael Kelley
In-Reply-To: <877boc554l.ffs@fw13>

[-- Attachment #1: Type: text/plain, Size: 3487 bytes --]

On Sat, 2026-06-06 at 12:34 +0200, Thomas Gleixner wrote:
> On Fri, May 29 2026 at 07:43, Sean Christopherson wrote:
> 
> > Now that all paravirt code that explicitly specifies the TSC frequency
> > also sets X86_FEATURE_TSC_KNOWN_FREQ, replace all of the one-off code
> > and simply set X86_FEATURE_TSC_KNOWN_FREQ if the TSC frequency is known.
> > 
> > Do NOT force set TSC_KNOWN_FREQ if the "known" TSC frequency was provided
> > by the user.  Per commit bd35c77e32e4 ("x86/tsc: Add tsc_early_khz command
> > line parameter"), one of the goals of the param is to allow the refined
> > calibration work "to do meaningful error checking".
> > 
> > Note, preferring the user-provided TSC frequency over the frequency from
> > the hypervisor or trusted firmware, while simultaneously not treating the
> > user-provided frequency as gospel, is obviously incongruous.  Sweep the
> > problem under the rug for now to avoid opening a big can of worms that
> > likely doesn't have a great answer.
> 
> There is a good answer I think.
> 
> tsc_early_khz exists to cater for the overclocking crowd. On their
> modded systems the firmware supplied TSC frequency (CPUID/MSR) is not
> matching reality anymore. So they work around that by supplying a close
> enough tsc_early_khz and then they let the refined calibration work
> figure it out.
> 
> Arguably that's only relevant for bare metal systems and what's worse is
> that in virtual environments the refined calibration work can fail,
> which renders the TSC unstable.
> 
> So I'd rather say we change this logic to:
> 
>    if (!hypervisor_is_type(X86_HYPER_NATIVE)) {
>       tsc_khz = x86_init.....();
>       force(X86_FEATURE_TSC_KNOWN_FREQ);
>    } else if (tsc_khz_early) {
>       ....
>    } else {
>       ...
>    }
> 
> Along with:
> 
>    if (!hypervisor_is_type(X86_HYPER_NATIVE)) {
>       if (tsc_khz_early)
>          pr_warn("Ignoring non-sensical tsc_early_khz command line argument\n");
> 
> or something daft like that.
> 
> The kernel has for various reasons always tried to cater for the needs
> of users who are plagued by bonkers firmware, but we have to stop
> prioritizing, or treating as equal, ancient and modded out-of-spec hardware.
> 
> TBH, I consider that whole KVM clock nonsense to fall into the modded
> out of spec hardware realm. Do a reality check:
> 
>    How many production systems are out there still which run VMs on CPUs
>    with a broken TSC and the lack of VM TSC scaling?
> 
> I'm not saying that we should not support the few remaining systems
> anymore, but our tendency to pretend that we can keep all of this
> nonsense working and at the same time making progress is just a fallacy.

I don't know that we can take the KVM (and Xen) clock away from guests,
but all of the *horrid* part about it is the way it attempts to cope
with the possibility that the *host* timekeeping might flip away from
TSC-based mode at any point in time. By the end of my outstanding
cleanup series, that is the *only* thing the gtod_notifier remains for.

If we can trust the hardware *and* the host kernel, then KVM could
theoretically hardwire the kvmclock into 'master clock mode' where it
basically just advertises the TSC→kvmclock relationship *once* to all
CPUs and it never changes.

All the nonsense about updating it every time we enter a CPU could just
go away completely.


[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [PATCH v4 10/47] x86/tsc: Consolidate forcing of X86_FEATURE_TSC_KNOWN_FREQ for PV code
From: Thomas Gleixner @ 2026-06-06 10:34 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, Kiryl Shutsemau, Sean Christopherson,
	K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Andy Lutomirski,
	Peter Zijlstra, Juergen Gross, Daniel Lezcano, John Stultz
  Cc: H. Peter Anvin, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, kvm, linux-kernel, linux-coco, linux-hyperv,
	virtualization, xen-devel, David Woodhouse, Tom Lendacky,
	Nikunj A Dadhania, David Woodhouse, Michael Kelley
In-Reply-To: <20260529144435.704127-11-seanjc@google.com>

On Fri, May 29 2026 at 07:43, Sean Christopherson wrote:

> Now that all paravirt code that explicitly specifies the TSC frequency
> also sets X86_FEATURE_TSC_KNOWN_FREQ, replace all of the one-off code
> and simply set X86_FEATURE_TSC_KNOWN_FREQ if the TSC frequency is known.
>
> Do NOT force set TSC_KNOWN_FREQ if the "known" TSC frequency was provided
> by the user.  Per commit bd35c77e32e4 ("x86/tsc: Add tsc_early_khz command
> line parameter"), one of the goals of the param is to allow the refined
> calibration work "to do meaningful error checking".
>
> Note, preferring the user-provided TSC frequency over the frequency from
> the hypervisor or trusted firmware, while simultaneously not treating the
> user-provided frequency as gospel, is obviously incongruous.  Sweep the
> problem under the rug for now to avoid opening a big can of worms that
> likely doesn't have a great answer.

There is a good answer I think.

tsc_early_khz exists to cater for the overclocking crowd. On their
modded systems the firmware supplied TSC frequency (CPUID/MSR) is not
matching reality anymore. So they work around that by supplying a close
enough tsc_early_khz and then they let the refined calibration work
figure it out.

Arguably that's only relevant for bare metal systems and what's worse is
that in virtual environments the refined calibration work can fail,
which renders the TSC unstable.

So I'd rather say we change this logic to:

   if (!hypervisor_is_type(X86_HYPER_NATIVE)) {
      tsc_khz = x86_init.....();
      force(X86_FEATURE_TSC_KNOWN_FREQ);
   } else if (tsc_khz_early) {
      ....
   } else {
      ...
   }

Along with:

   if (!hypervisor_is_type(X86_HYPER_NATIVE)) {
      if (tsc_khz_early)
         pr_warn("Ignoring non-sensical tsc_early_khz command line argument\n");

or something daft like that.

The kernel has for various reasons always tried to cater for the needs
of users who are plagued by bonkers firmware, but we have to stop
prioritizing, or treating as equal, ancient and modded out-of-spec hardware.

TBH, I consider that whole KVM clock nonsense to fall into the modded
out of spec hardware realm. Do a reality check:

   How many production systems are out there still which run VMs on CPUs
   with a broken TSC and the lack of VM TSC scaling?

I'm not saying that we should not support the few remaining systems
anymore, but our tendency to pretend that we can keep all of this
nonsense working and at the same time making progress is just a fallacy.

I rather want to have a more fine grained differentiation and
prioritization of:

  1) The actual real world relevant use cases which run on contemporary
     hardware.

  2) Still relevant use cases on slightly older hardware with less
     capabilities

  3) Broken firmware

  4) Modded out of spec nonsense

  5) Support for ancient museum pieces

Thanks,

        tglx

^ permalink raw reply

* Re: [PATCH net 0/2] net: mana: fix error-path issues in queue setup
From: Jakub Kicinski @ 2026-06-06  1:27 UTC (permalink / raw)
  To: Aditya Garg
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, pabeni, horms, shradhagupta, dipayanroy, ernis, kees,
	shacharr, stephen, gargaditya, ssengar, linux-hyperv, netdev,
	linux-kernel
In-Reply-To: <20260604080137.1995269-1-gargaditya@linux.microsoft.com>

On Thu,  4 Jun 2026 01:01:24 -0700 Aditya Garg wrote:
> Two error-path fixes in MANA queue setup, both surfaced during Sashiko
> AI review of a recently upstreamed patch series.
> 
> Patch 1 initializes queue->id to INVALID_QUEUE_ID in
> mana_gd_create_mana_wq_cq() so that a CQ creation failure before the
> firmware id is assigned does not NULL gc->cq_table[0] and silently
> break whichever real CQ owns that slot. This mirrors the existing
> pattern in mana_gd_create_eq().
> 
> Patch 2 guards mana_destroy_txq()'s call to mana_destroy_wq_obj() with
> an INVALID_MANA_HANDLE check, mirroring mana_destroy_rxq(). Without
> it, TX setup failures lead to a firmware-rejected destroy of (u64)-1
> and a spurious error in dmesg.

Looks like these patches were generated against net-next, please rebase:

Applying: net: mana: initialize gdma queue id to INVALID_QUEUE_ID
Applying: net: mana: guard TX wq object destroy with INVALID_MANA_HANDLE check
error: patch failed: drivers/net/ethernet/microsoft/mana/mana_en.c:2351
error: drivers/net/ethernet/microsoft/mana/mana_en.c: patch does not apply
Patch failed at 0002 net: mana: guard TX wq object destroy with INVALID_MANA_HANDLE check

^ permalink raw reply

* Re: [PATCH net-next v12 2/6] net: mana: Query device capabilities and configure MSI-X sharing for EQs
From: sashiko-bot @ 2026-06-06  0:57 UTC (permalink / raw)
  To: Long Li; +Cc: linux-hyperv
In-Reply-To: <20260605005717.2059954-3-longli@microsoft.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] Permanent downward clamping of maximum queues (`apc->max_queues`) across suspend/resume cycles.
--

commit ae1648776d10be5e3fd6640d34f239a82e77a213
Author: Long Li <longli@microsoft.com>
Subject: net: mana: Query device capabilities and configure MSI-X sharing for EQs

This commit adjusts the max number of queues to allow dedicated MSI-X
vectors for each vPort and configures MSI-X sharing when resources are
scarce. It also updates port initialization to clamp the queue counts
to the recomputed per-vPort maximums.

> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index ed60cc15fe78d..3ec8e94e7c17e 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
[ ... ]
> @@ -3208,6 +3213,8 @@ static int mana_init_port(struct net_device *ndev)
>  	max_queues = min_t(u32, max_txq, max_rxq);
>  	if (apc->max_queues > max_queues)
>  		apc->max_queues = max_queues;
> +	if (apc->max_queues > gc->max_num_queues_vport)
> +		apc->max_queues = gc->max_num_queues_vport;

[Severity: Medium]
Will this result in a permanent downward clamping of the maximum queues across
suspend and resume cycles?

Because gc->max_num_queues_vport is dynamically recalculated on every
suspend/resume cycle based on current hardware state (such as the number of
online CPUs and available MSI-X vectors), it might temporarily shrink.

If a system resumes with fewer vectors, this logic reduces apc->max_queues.

However, if the system subsequently resumes again with the original number of
vectors restored, apc->max_queues cannot recover because this clamping only
ever reduces the value and the original allocation limit is permanently
overwritten. 

Should we be clamping a temporary runtime variable instead of permanently
modifying the device's capability maximums here?

>  
>  	if (apc->num_queues > apc->max_queues)
>  		apc->num_queues = apc->max_queues;

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260605005717.2059954-1-longli@microsoft.com?part=2

^ permalink raw reply

* Re: [PATCH net-next v2] net: mana: Add Interrupt Moderation support
From: sashiko-bot @ 2026-06-05 23:43 UTC (permalink / raw)
  To: Haiyang Zhang; +Cc: linux-hyperv
In-Reply-To: <20260604234211.2056341-1-haiyangz@linux.microsoft.com>

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
- [High] Use-after-free and workqueue corruption due to a race between NAPI polling and interface teardown.
- [High] Unconditional use of GDMA_MESSAGE_V3 breaks backward compatibility with older firmware.
- [Medium] Static interrupt moderation is silently disabled if only one parameter is configured.
--

commit ec13708996b121633f56552fac3f0ceba004bf71
Author: Haiyang Zhang <haiyangz@microsoft.com>

net: mana: Add Interrupt Moderation support

This commit adds Static and Dynamic Interrupt Moderation (DIM) support for Rx
and Tx. It updates the queue creation procedure with new data structures and
settings, adds statistics collection functions for DIM, and introduces workers
to update DIM settings. It also hooks into ethtool to allow configuration.

> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index db14357d3732e..b1e0c444f4144 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -1551,6 +1551,9 @@ int mana_create_wq_obj(struct mana_port_context *apc,
>  
>  	mana_gd_init_req_hdr(&req.hdr, MANA_CREATE_WQ_OBJ,
>  			     sizeof(req), sizeof(resp));
> +
> +	req.hdr.req.msg_version = GDMA_MESSAGE_V3;
> +	req.hdr.resp.msg_version = GDMA_MESSAGE_V2;
>  	req.vport = vport;

[Severity: High]
Will this break initialization on older firmware? We unconditionally set
msg_version to GDMA_MESSAGE_V3 here without checking if the hardware actually
supports it.

If an older Azure host without the GDMA_PF_CAP_FLAG_1_DYN_INTERRUPT_MODERATION
capability rejects the V3 request, could this cause queue creation to fail
and result in a complete loss of network connectivity?

[ ... ]
> @@ -2274,7 +2344,13 @@ static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
>  	if (w < cq->budget) {
>  		mana_gd_ring_cq(gdma_queue, SET_ARM_BIT);
>  		cq->work_done_since_doorbell = 0;
> -		napi_complete_done(&cq->napi, w);
> +
> +		if (napi_complete_done(&cq->napi, w)) {
> +			if (cq->type == MANA_CQ_TYPE_RX)
> +				mana_update_rx_dim(cq);
> +			else
> +				mana_update_tx_dim(cq);
> +		}

[Severity: High]
Is there a race condition here during teardown? If napi_complete_done()
clears the scheduling state, a concurrent teardown thread in
mana_destroy_rxq() can return from napi_disable_locked() and proceed to call
cancel_work_sync().

Because mana_update_rx_dim() hasn't been called yet to schedule the work,
cancel_work_sync() would return immediately and the queue memory would be
freed. When the NAPI thread resumes and calls mana_update_rx_dim(), could
this lead to a use-after-free or corruption of the workqueue linked list
by scheduling work on freed memory?

[ ... ]
> @@ -2783,6 +2869,10 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
>  	cq_spec.queue_size = cq->gdma_cq->queue_size;
>  	cq_spec.modr_ctx_id = 0;
>  	cq_spec.attached_eq = cq->gdma_cq->cq.parent->id;
> +	cq_spec.req_cq_moderation = apc->rx_dim_enabled ||
> +		(apc->intr_modr_rx_usec && apc->intr_modr_rx_comp);
> +	cq_spec.cq_moderation_usec = apc->intr_modr_rx_usec;
> +	cq_spec.cq_moderation_comp = apc->intr_modr_rx_comp;

[Severity: Medium]
Does this logic unintentionally ignore user configurations where only one
parameter is non-zero? 

In mana_set_coalesce(), ethtool allows setting rx-usecs without setting
rx-frames. If a user sets only time-based moderation (e.g., rx-usecs 50,
rx-frames 0), the condition here evaluates to false and silently disables
static moderation without returning an error via ethtool.

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260604234211.2056341-1-haiyangz@linux.microsoft.com?part=1

^ permalink raw reply

* Re: [PATCH net-next] net: mana: Cache MANA_QUERY_LINK_CONFIG result to avoid repeated HWC queries
From: Jakub Kicinski @ 2026-06-05 23:13 UTC (permalink / raw)
  To: Erni Sri Satya Vennela
  Cc: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
	edumazet, pabeni, kotaranov, horms, dipayanroy, kees,
	linux-hyperv, netdev, linux-kernel, linux-rdma
In-Reply-To: <aiJeuU3DLKL7JcPN@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>

On Thu, 4 Jun 2026 22:29:29 -0700 Erni Sri Satya Vennela wrote:
> I tried two netdev_lock-based variants. 
> 
> mana_query_link_cfg() has four callers:
> 
> 1 ethtool ioctl/netlink			- has RTNL	- has netdev->lock
> 2 sysfs speed_show/duplex_show		- has RTNL	- no netdev->lock
> 3 netvsc_get_link_ksettings VF forward	- has RTNL	- no netdev->lock
> 4 mana_shaper_set			- no RTNL	- has netdev->lock
> 
> No existing lock covers all four.

How fresh is your tree? The just-minted commit 9f275c2e9020 should
address the gap, I believe?

^ permalink raw reply

* [PATCH net-next] net: mana: Add support for PF device 0x00C1
From: Haiyang Zhang @ 2026-06-05 21:22 UTC (permalink / raw)
  To: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Konstantin Taranov, Simon Horman,
	Shradha Gupta, Erni Sri Satya Vennela, Dipayaan Roy, linux-kernel,
	linux-rdma
  Cc: paulros

From: Haiyang Zhang <haiyangz@microsoft.com>

Update the device ID table to include the new PF device ID 0x00C1.
Because this device's BAR layout is similar to the VF's, update
mana_gd_init_registers() so that it uses the VF register
initialization path for this device.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/gdma_main.c | 7 +++++--
 include/net/mana/gdma.h                         | 2 ++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 712a0881d720..5bc91ee8a543 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -150,7 +150,7 @@ static int mana_gd_init_registers(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 
-	if (gc->is_pf)
+	if (gc->is_pf && !gc->is_pf2)
 		return mana_gd_init_pf_regs(pdev);
 	else
 		return mana_gd_init_vf_regs(pdev);
@@ -2070,7 +2070,7 @@ static void mana_gd_cleanup_device(struct pci_dev *pdev)
 
 static bool mana_is_pf(unsigned short dev_id)
 {
-	return dev_id == MANA_PF_DEVICE_ID;
+	return dev_id == MANA_PF_DEVICE_ID || dev_id == MANA_PF2_DEVICE_ID;
 }
 
 static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -2118,6 +2118,8 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	gc->numa_node = dev_to_node(&pdev->dev);
 	gc->is_pf = mana_is_pf(pdev->device);
+	gc->is_pf2 = (pdev->device == MANA_PF2_DEVICE_ID);
+
 	gc->bar0_va = bar0_va;
 	gc->dev = &pdev->dev;
 	xa_init(&gc->irq_contexts);
@@ -2269,6 +2271,7 @@ static void mana_gd_shutdown(struct pci_dev *pdev)
 
 static const struct pci_device_id mana_id_table[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, MANA_PF_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, MANA_PF2_DEVICE_ID) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, MANA_VF_DEVICE_ID) },
 	{ }
 };
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 70d62bc32837..7361e98d94ce 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -418,6 +418,7 @@ struct gdma_context {
 	u32			test_event_eq_id;
 
 	bool			is_pf;
+	bool			is_pf2;
 
 	phys_addr_t		bar0_pa;
 	void __iomem		*bar0_va;
@@ -571,6 +572,7 @@ struct gdma_eqe {
 #define GDMA_SRIOV_REG_CFG_BASE_OFF	0x108
 
 #define MANA_PF_DEVICE_ID 0x00B9
+#define MANA_PF2_DEVICE_ID 0x00C1
 #define MANA_VF_DEVICE_ID 0x00BA
 
 struct gdma_posted_wqe_info {
-- 
2.34.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox