* [PATCH V2 01/11] iommu/hyperv: rename hyperv-iommu.c to hyperv-irq.c
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
This file actually implements irq remapping, so rename it to the more
appropriate hyperv-irq.c. A new file to implement hyperv iommu will be introduced
later. Also, it should not be tied to HYPERV_IOMMU, but to CONFIG_HYPERV
and IRQ_REMAP. The file already has #ifdef CONFIG_IRQ_REMAP.
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
MAINTAINERS | 2 +-
drivers/iommu/Makefile | 2 +-
drivers/iommu/{hyperv-iommu.c => hyperv-irq.c} | 6 +++---
drivers/iommu/irq_remapping.c | 2 +-
4 files changed, 6 insertions(+), 6 deletions(-)
rename drivers/iommu/{hyperv-iommu.c => hyperv-irq.c} (99%)
diff --git a/MAINTAINERS b/MAINTAINERS
index d1cc0e12fe1f..f803a6a38fee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11914,7 +11914,7 @@ F: drivers/clocksource/hyperv_timer.c
F: drivers/hid/hid-hyperv.c
F: drivers/hv/
F: drivers/input/serio/hyperv-keyboard.c
-F: drivers/iommu/hyperv-iommu.c
+F: drivers/iommu/hyperv-irq.c
F: drivers/net/ethernet/microsoft/
F: drivers/net/hyperv/
F: drivers/pci/controller/pci-hyperv-intf.c
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 0275821f4ef9..335ea77cced6 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -30,7 +30,7 @@ obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
-obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
+obj-$(CONFIG_HYPERV) += hyperv-irq.o
obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-irq.c
similarity index 99%
rename from drivers/iommu/hyperv-iommu.c
rename to drivers/iommu/hyperv-irq.c
index 479103261ae6..d11076f906fb 100644
--- a/drivers/iommu/hyperv-iommu.c
+++ b/drivers/iommu/hyperv-irq.c
@@ -8,6 +8,8 @@
* Author : Lan Tianyu <Tianyu.Lan@microsoft.com>
*/
+#ifdef CONFIG_IRQ_REMAP
+
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
@@ -24,8 +26,6 @@
#include "irq_remapping.h"
-#ifdef CONFIG_IRQ_REMAP
-
/*
* According 82093AA IO-APIC spec , IO APIC has a 24-entry Interrupt
* Redirection Table. Hyper-V exposes one single IO-APIC and so define
@@ -331,4 +331,4 @@ static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
.free = hyperv_root_irq_remapping_free,
};
-#endif
+#endif /* CONFIG_IRQ_REMAP */
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index c2443659812a..41bf65e4ea88 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -108,7 +108,7 @@ int __init irq_remapping_prepare(void)
else if (IS_ENABLED(CONFIG_AMD_IOMMU) &&
amd_iommu_irq_ops.prepare() == 0)
remap_ops = &amd_iommu_irq_ops;
- else if (IS_ENABLED(CONFIG_HYPERV_IOMMU) &&
+ else if (IS_ENABLED(CONFIG_HYPERV) &&
hyperv_irq_remap_ops.prepare() == 0)
remap_ops = &hyperv_irq_remap_ops;
else
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 00/11] PCI passthru on Hyper-V (Part I)
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
Implement passthru of PCI devices to unprivileged virtual machines
(VMs) when Linux is running as a privileged VM on Microsoft Hyper-V
hypervisor. This support is made to fit within the workings of VFIO
framework, and any VMM needing to use it must use the VFIO subsystem.
This supports both full device passthru and SR-IOV based VFs.
At a high level, the hypervisor supports traditional mapped iommu domains
that use explicit map and unmap hypercalls for mapping and unmapping guest
RAM into the iommu subsystem. Hyper-V also has a concept of direct attach
devices whereby the iommu subsystem simply uses the guest HW page table
(ept/npt/..). This series adds support for both, and both are made to
work with the VFIO subsystem.
While this Part I focuses on memory mappings, Part II focuses on irq
remapping and irq migrations.
This series rebased to: 5170a82e8921 (origin/hyperv-next)
Testing:
o Most testing done on hyperv-next:e733a9e28180 using Cloud Hypervisor (51).
o Limited testing on : 5170a82e8921
o Tested with impending Part II irq patches.
o All tests involved PF passthru of devices using MSIx.
o Following combinations were tested:
- L1VH(1): test 1: Mellanox ConnectX-6 Lx passthru
test 2: NVIDIA Tesla T4 GPU.
test 3: Both of above simultaneous passthru
- Baremetal dom0/root: All of above.
(1) L1VH: this is a semi privileged VM that runs on Windows root on
Hyper-V, and allows users to create more child VMs.
This series strives to establish a base line. Some pending work items:
o arm64 : some delta to make this work on arm64 (in progress).
o Qemu and OpenVMM support (in progress).
o VF testing
o device sleep/wakeup.
o More stress testing with high end GPUs
Changes in V2:
o rebase to 5170a82e8921
o minor fixes for arm64 build
o drop patch 03: "x86/hyperv: add insufficient memory support in irqdomain.c"
as that path is no longer used
o drop patch 08: "PCI: hv: rename hv_compose_msi_msg .. " and do it separately
outside this series.
o minor updates to commit messages
Changes in V1:
o patch 1: Don't tie hyperv-irq.c to CONFIG_HYPERV_IOMMU.
o patch 4: Redesigned to address security vulnerability found by copilot
with passing tgid as a parameter. Also, do tgid setting right
after setting pt_id.
o patch 5: Remove unused type parameter from mshv_device_ops.device_create
o patch 7: mshv_partition_ioctl_create_device cleanup on copy_to_user.
o patch 10: Add export of hv_build_devid_type_pci here to get rid of
patch 11.
o patch 12: Move functions to build device ids from patch 11 here for
the benefit of arm64. Rename file to: hyperv-iommu-root.c.
o patch 13: removed to be made part of interrupt part II of this support.
o patch 14: get rid of fast path to reduce review noise.
o New (last) patch to pin ram regions if device passthru to a VM.
Thanks,
-Mukesh
Mukesh R (11):
iommu/hyperv: rename hyperv-iommu.c to hyperv-irq.c
x86/hyperv: cosmetic changes in irqdomain.c for readability
mshv: Provide a way to get partition id if running in a VMM process
mshv: Declarations and definitions for VFIO-MSHV bridge device
mshv: Implement mshv bridge device for VFIO
mshv: Add ioctl support for MSHV-VFIO bridge device
mshv: Import data structs around device passthru from hyperv headers
PCI: hv: Build device id for a VMBus device, export PCI devid function
x86/hyperv: Implement hyperv virtual IOMMU
mshv: Populate mmio mappings for PCI passthru
mshv: Mark mem regions as non-movable upfront if device passthru
MAINTAINERS | 3 +-
arch/x86/hyperv/irqdomain.c | 199 ++--
arch/x86/include/asm/mshyperv.h | 6 +
arch/x86/kernel/pci-dma.c | 2 +
drivers/hv/Makefile | 3 +-
drivers/hv/mshv_root.h | 21 +
drivers/hv/mshv_root_main.c | 266 ++++-
drivers/hv/mshv_vfio.c | 211 ++++
drivers/iommu/Kconfig | 5 +-
drivers/iommu/Makefile | 3 +-
drivers/iommu/hyperv-iommu-root.c | 908 ++++++++++++++++++
.../iommu/{hyperv-iommu.c => hyperv-irq.c} | 6 +-
drivers/iommu/irq_remapping.c | 2 +-
drivers/pci/controller/pci-hyperv.c | 24 +
include/asm-generic/mshyperv.h | 30 +
include/hyperv/hvgdk_mini.h | 11 +
include/hyperv/hvhdk_mini.h | 112 +++
include/linux/hyperv.h | 6 +
include/uapi/linux/mshv.h | 31 +
19 files changed, 1727 insertions(+), 122 deletions(-)
create mode 100644 drivers/hv/mshv_vfio.c
create mode 100644 drivers/iommu/hyperv-iommu-root.c
rename drivers/iommu/{hyperv-iommu.c => hyperv-irq.c} (99%)
--
2.51.2.vfs.0.1
^ permalink raw reply
* RE: [EXTERNAL] Re: [PATCH] Drivers: hv: vmbus: Improve the logc of reserving fb_mmio on Gen2 VMs
From: Dexuan Cui @ 2026-04-30 22:42 UTC (permalink / raw)
To: kernel test robot, KY Srinivasan, Haiyang Zhang,
wei.liu@kernel.org, Long Li, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org, mhklinux@outlook.com,
matthew.ruffell@canonical.com, johansen@templeofstupid.com
Cc: llvm@lists.linux.dev, oe-kbuild-all@lists.linux.dev,
stable@vger.kernel.org
In-Reply-To: <202605010002.dnnxVZFF-lkp@intel.com>
> From: kernel test robot <lkp@intel.com>
> Sent: Thursday, April 30, 2026 9:33 AM
> ...
> config: i386-buildonly-randconfig-002-20260430
> ...
> All warnings (new ones prefixed by >>):
>
> >> drivers/hv/vmbus_drv.c:2403:40: warning: result of comparison of constant
> 4294967296 with expression of type 'resource_size_t' (aka 'unsigned int') is
> always false [-Wtautological-constant-out-of-range-compare]
> 2403 | if (!low_mmio_base || low_mmio_base >= SZ_4G ||
> | ~~~~~~~~~~~~~ ^ ~~~~~
> 1 warning generated.
Thanks for reporting the warning with the i386 kernel config.
I don't know if there are any x86-32 users nowadays, but this warning can be
fixed by:
- if (!low_mmio_base || low_mmio_base >= SZ_4G ||
+ if (!low_mmio_base || upper_32_bits(low_mmio_base) ||
(start && start < low_mmio_base)) {
pr_warn("Unexpected low mmio base 0x%pa\n", &low_mmio_base);
}
^ permalink raw reply
* RE: [PATCH] Drivers: hv: vmbus: Improve the logc of reserving fb_mmio on Gen2 VMs
From: Dexuan Cui @ 2026-04-30 22:16 UTC (permalink / raw)
To: Michael Kelley, KY Srinivasan, Haiyang Zhang, wei.liu@kernel.org,
Long Li, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org, matthew.ruffell@canonical.com,
johansen@templeofstupid.com
Cc: stable@vger.kernel.org
In-Reply-To: <SN6PR02MB415726B17D5A6027CD1717E8D4342@SN6PR02MB4157.namprd02.prod.outlook.com>
> From: Michael Kelley <mhklinux@outlook.com>
> Sent: Wednesday, April 29, 2026 11:01 AM
>
> From: Dexuan Cui <DECUI@microsoft.com> Sent: Tuesday, April 28, 2026 8:13
> PM
> ...
> >
> > A CVM on Hyper-V won't start without the command line
> > Disable-VMConsoleSupport -VMName $vmName
This is not true. It turns out I can start a VBS/SNP/TDX VM without the
command line... Sorry! Not sure why I had the wrong impression -- I
guess I was told to always run the command since day 1, so I subconsciously
thought a VM would not start without it. Or, maybe the host behavior
changed? But that seems unlikely to me.
> Unfortunately, on my laptop Hyper-V, a VM with VBS Isolation appears
> to *not* require Disable-VMConsoleSupport. I can start the VM, and the
> VM is offered the VMBus synthvid, mouse, and keyboard devices.
Actually I can also start a VBS VM without Disable-VMConsoleSupport.
> But what's weird in this case is that vmbus_reserved_fb() sees lfb_base
> and lfb_start as 0.
I see the same.
> Furthermore, as a test, I changed the "allowed_in_isolated"
> flag to true for the synthvid device, and the Hyper-V DRM driver loads and
> initializes.
I also changed the flag .allowed_in_isolated to true for HV_SYNTHVID_GUID,
HV_KBD, and HV_MOUSE, but I can't see the devices in "lsvmbus".
In vmbus_onoffer(), I printed the offer->offer.if_type and
offer->offer.if_instance just after the message " Invalid offer %d from the host
supporting isolation", and I indeed don't see the fb/mouse/keyboard devices.
I'm on a recent Hyper-V dev build. Maybe this is why my observation is
not exactly the same.
> In doing so, the vmconnect.exe window is resized larger, as is
> done in a normal VM. /proc/iomem shows that the DRM driver claimed
> the expected MMIO range at the start of low MMIO space. I can run a user
> space program that mmaps /dev/fb0 and writes pixels to the mmap'ed
> memory, and that succeeds as it would in a normal VM, but the
> vmconnect.exe window doesn't show anything. It appears that the Hyper-V
> host has allocated memory for the frame buffer, but is ignoring anything
> that is written to it.
>
> Running Disable-VMConsoleSupport works as expected -- the synthvid,
> mouse, and keyboard devices are no longer offered to the VM.
I even ran "Enable-VMConsoleSupport", which finished without any error,
but I still didn't see the keyboard/mouse/framebuffer devices.
> So instead of not reserving any MMIO space for the framebuffer on
> CVMs, the code you already have limits the reservation to half of the
> MMIO space below 4 GB.
Correct.
> Won't that work to avoid exhausting the low
> MMIO space in a CVM that's running on a local Hyper-V with only 128
> MiB of low MMIO space?
Correct. I'll drop the CVM check in vmbus_reserve_fb() in v2.
^ permalink raw reply
* Re: [PATCH] Drivers: hv: vmbus: Improve the logc of reserving fb_mmio on Gen2 VMs
From: kernel test robot @ 2026-04-30 16:33 UTC (permalink / raw)
To: Dexuan Cui, kys, haiyangz, wei.liu, longli, linux-hyperv,
linux-kernel, mhklinux, matthew.ruffell, johansen
Cc: llvm, oe-kbuild-all, stable
In-Reply-To: <20260416183529.838321-1-decui@microsoft.com>
Hi Dexuan,
kernel test robot noticed the following build warnings:
[auto build test WARNING on linus/master]
[also build test WARNING on v7.1-rc1 next-20260429]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Dexuan-Cui/Drivers-hv-vmbus-Improve-the-logc-of-reserving-fb_mmio-on-Gen2-VMs/20260424-033622
base: linus/master
patch link: https://lore.kernel.org/r/20260416183529.838321-1-decui%40microsoft.com
patch subject: [PATCH] Drivers: hv: vmbus: Improve the logc of reserving fb_mmio on Gen2 VMs
config: i386-buildonly-randconfig-002-20260430 (https://download.01.org/0day-ci/archive/20260501/202605010002.dnnxVZFF-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260501/202605010002.dnnxVZFF-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605010002.dnnxVZFF-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> drivers/hv/vmbus_drv.c:2403:40: warning: result of comparison of constant 4294967296 with expression of type 'resource_size_t' (aka 'unsigned int') is always false [-Wtautological-constant-out-of-range-compare]
2403 | if (!low_mmio_base || low_mmio_base >= SZ_4G ||
| ~~~~~~~~~~~~~ ^ ~~~~~
1 warning generated.
vim +2403 drivers/hv/vmbus_drv.c
2385
2386 static void __maybe_unused vmbus_reserve_fb(void)
2387 {
2388 resource_size_t start = 0, size;
2389 resource_size_t low_mmio_base;
2390 struct pci_dev *pdev;
2391
2392 /* Hyper-V CoCo guests do not have a framebuffer device. */
2393 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
2394 return;
2395
2396 if (efi_enabled(EFI_BOOT)) {
2397 /* Gen2 VM: get FB base from EFI framebuffer */
2398 if (IS_ENABLED(CONFIG_SYSFB)) {
2399 start = sysfb_primary_display.screen.lfb_base;
2400 size = max_t(__u32, sysfb_primary_display.screen.lfb_size, 0x800000);
2401
2402 low_mmio_base = hyperv_mmio->start;
> 2403 if (!low_mmio_base || low_mmio_base >= SZ_4G ||
2404 (start && start < low_mmio_base)) {
2405 pr_warn("Unexpected low mmio base 0x%pa\n", &low_mmio_base);
2406 } else {
2407 /*
2408 * If the kdump kernel's lfb_base is 0,
2409 * fall back to the low mmio base.
2410 */
2411 if (!start)
2412 start = low_mmio_base;
2413 /*
2414 * Reserve half of the space below 4GB for high
2415 * resolutions, but cap the reservation to 128MB.
2416 */
2417 size = min((SZ_4G - start) / 2, SZ_128M);
2418 }
2419 }
2420 } else {
2421 /* Gen1 VM: get FB base from PCI */
2422 pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT,
2423 PCI_DEVICE_ID_HYPERV_VIDEO, NULL);
2424 if (!pdev)
2425 return;
2426
2427 if (pdev->resource[0].flags & IORESOURCE_MEM) {
2428 start = pci_resource_start(pdev, 0);
2429 size = pci_resource_len(pdev, 0);
2430 }
2431
2432 /*
2433 * Release the PCI device so hyperv_drm driver can grab it
2434 * later.
2435 */
2436 pci_dev_put(pdev);
2437 }
2438
2439 if (!start)
2440 return;
2441
2442 /*
2443 * Make a claim for the frame buffer in the resource tree under the
2444 * first node, which will be the one below 4GB. The length seems to
2445 * be underreported, particularly in a Generation 1 VM. So start out
2446 * reserving a larger area and make it smaller until it succeeds.
2447 */
2448 for (; !fb_mmio && (size >= 0x100000); size >>= 1)
2449 fb_mmio = __request_region(hyperv_mmio, start, size, fb_mmio_name, 0);
2450
2451 pr_info("hv_mmio=%pR,%pR fb=%pR\n", hyperv_mmio, hyperv_mmio->sibling, fb_mmio);
2452 }
2453
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* [PATCH v3] mshv: Simplify GPA map/unmap hypercall helpers
From: Stanislav Kinsburskii @ 2026-04-30 14:52 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
preceding bug-fix patches:
Move "done += completed" before the status checks so that pages mapped
by a partially-successful batch are included in the error cleanup unmap.
Previously these mappings were leaked on failure.
While here, improve type safety and readability:
- Change "int done" to "u64 done" to match the u64 page_count it is
compared against, avoiding signed/unsigned comparison hazards.
- Use u64 for loop iteration and batch size variables consistently.
- Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
- Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
- Simplify the error-path unmap to use "done << large_shift" directly
instead of mutating done in place.
v3: aligned changes to 80 columns
v2: replaced min with min_t
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_root_hv_call.c | 56 +++++++++++++++-------------------------
1 file changed, 21 insertions(+), 35 deletions(-)
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index e5992c324904a..e1f9e28d5a19b 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
struct hv_input_map_gpa_pages *input_page;
u64 status, *pfnlist;
unsigned long irq_flags, large_shift = 0;
- int ret = 0, done = 0;
- u64 page_count = page_struct_count;
+ u64 done = 0, page_count = page_struct_count;
+ int ret = 0;
if (page_count == 0 || (pages && mmio_spa))
return -EINVAL;
@@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
}
while (done < page_count) {
- ulong i, completed, remain = page_count - done;
- int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
+ u64 i, completed, remain = page_count - done;
+ u64 rep_count = min_t(u64, remain, HV_MAP_GPA_BATCH_SIZE);
local_irq_save(irq_flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -224,23 +224,14 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
input_page->map_flags = flags;
pfnlist = input_page->source_gpa_page_list;
- for (i = 0; i < rep_count; i++)
- if (flags & HV_MAP_GPA_NO_ACCESS) {
+ for (i = 0; i < rep_count; i++) {
+ if (flags & HV_MAP_GPA_NO_ACCESS)
pfnlist[i] = 0;
- } else if (pages) {
- u64 index = (done + i) << large_shift;
-
- if (index >= page_struct_count) {
- ret = -EINVAL;
- break;
- }
- pfnlist[i] = page_to_pfn(pages[index]);
- } else {
+ else if (pages)
+ pfnlist[i] = page_to_pfn(pages[(done + i) <<
+ large_shift]);
+ else
pfnlist[i] = mmio_spa + done + i;
- }
- if (ret) {
- local_irq_restore(irq_flags);
- break;
}
status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
@@ -248,29 +239,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
local_irq_restore(irq_flags);
completed = hv_repcomp(status);
+ done += completed;
if (hv_result_needs_memory(status)) {
ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
HV_MAP_GPA_DEPOSIT_PAGES);
if (ret)
break;
-
} else if (!hv_result_success(status)) {
ret = hv_result_to_errno(status);
break;
}
-
- done += completed;
}
if (ret && done) {
u32 unmap_flags = 0;
- if (flags & HV_MAP_GPA_LARGE_PAGE) {
+ if (flags & HV_MAP_GPA_LARGE_PAGE)
unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
- done <<= large_shift;
- }
- hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
+ hv_call_unmap_gpa_pages(partition_id, gfn,
+ done << large_shift, unmap_flags);
}
return ret;
@@ -305,7 +293,7 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
struct hv_input_unmap_gpa_pages *input_page;
u64 status, page_count = page_count_4k;
unsigned long irq_flags, large_shift = 0;
- int ret = 0, done = 0;
+ u64 done = 0;
if (page_count == 0)
return -EINVAL;
@@ -319,8 +307,8 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
}
while (done < page_count) {
- ulong completed, remain = page_count - done;
- int rep_count = min(remain, HV_UMAP_GPA_PAGES);
+ u64 completed, remain = page_count - done;
+ u64 rep_count = min_t(u64, remain, HV_UMAP_GPA_PAGES);
local_irq_save(irq_flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -333,15 +321,13 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
local_irq_restore(irq_flags);
completed = hv_repcomp(status);
- if (!hv_result_success(status)) {
- ret = hv_result_to_errno(status);
- break;
- }
-
done += completed;
+
+ if (!hv_result_success(status))
+ return hv_result_to_errno(status);
}
- return ret;
+ return 0;
}
int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
^ permalink raw reply related
* Re: [PATCH v2] mshv: Simplify GPA map/unmap hypercall helpers
From: Stanislav Kinsburskii @ 2026-04-30 14:43 UTC (permalink / raw)
To: Mukesh R
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <61e5d806-b5d5-ab2c-0e09-6def449d5582@linux.microsoft.com>
On Wed, Apr 29, 2026 at 07:06:08PM -0700, Mukesh R wrote:
>
> On 4/29/26 09:48, Stanislav Kinsburskii wrote:
> > Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
> > preceding bug-fix patches:
> >
> > Move "done += completed" before the status checks so that pages mapped
> > by a partially-successful batch are included in the error cleanup unmap.
> > Previously these mappings were leaked on failure.
> >
> > While here, improve type safety and readability:
> > - Change "int done" to "u64 done" to match the u64 page_count it is
> > compared against, avoiding signed/unsigned comparison hazards.
> > - Use u64 for loop iteration and batch size variables consistently.
> > - Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
> > - Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
> > - Simplify the error-path unmap to use "done << large_shift" directly
> > instead of mutating done in place.
> >
>
> what changed in V2?
>
No functional changes: "min" was replaced with "min_t" (reported by
checkpatch.pl).
> > Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
> > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > ---
> > drivers/hv/mshv_root_hv_call.c | 55 +++++++++++++++-------------------------
> > 1 file changed, 20 insertions(+), 35 deletions(-)
> >
> > diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> > index e5992c324904a..1f19a4ca824f0 100644
> > --- a/drivers/hv/mshv_root_hv_call.c
> > +++ b/drivers/hv/mshv_root_hv_call.c
> > @@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > struct hv_input_map_gpa_pages *input_page;
> > u64 status, *pfnlist;
> > unsigned long irq_flags, large_shift = 0;
> > - int ret = 0, done = 0;
> > - u64 page_count = page_struct_count;
> > + u64 done = 0, page_count = page_struct_count;
> > + int ret = 0;
> > if (page_count == 0 || (pages && mmio_spa))
> > return -EINVAL;
> > @@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > }
> > while (done < page_count) {
> > - ulong i, completed, remain = page_count - done;
> > - int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
> > + u64 i, completed, remain = page_count - done;
> > + u64 rep_count = min_t(u64, remain, HV_MAP_GPA_BATCH_SIZE);
> > local_irq_save(irq_flags);
> > input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > @@ -224,23 +224,13 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > input_page->map_flags = flags;
> > pfnlist = input_page->source_gpa_page_list;
> > - for (i = 0; i < rep_count; i++)
> > - if (flags & HV_MAP_GPA_NO_ACCESS) {
> > + for (i = 0; i < rep_count; i++) {
> > + if (flags & HV_MAP_GPA_NO_ACCESS)
> > pfnlist[i] = 0;
> > - } else if (pages) {
> > - u64 index = (done + i) << large_shift;
> > -
> > - if (index >= page_struct_count) {
> > - ret = -EINVAL;
> > - break;
> > - }
> > - pfnlist[i] = page_to_pfn(pages[index]);
> > - } else {
> > + else if (pages)
> > + pfnlist[i] = page_to_pfn(pages[(done + i) << large_shift]);
>
> Entire file is 80 cols, please don't cause this one overflow.
>
Sure. I'll update.
Thanks,
Stanislav
> Thanks,
> -Mukesh
>
>
> > + else
> > pfnlist[i] = mmio_spa + done + i;
> > - }
> > - if (ret) {
> > - local_irq_restore(irq_flags);
> > - break;
> > }
> > status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
> > @@ -248,29 +238,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > local_irq_restore(irq_flags);
> > completed = hv_repcomp(status);
> > + done += completed;
> > if (hv_result_needs_memory(status)) {
> > ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> > HV_MAP_GPA_DEPOSIT_PAGES);
> > if (ret)
> > break;
> > -
> > } else if (!hv_result_success(status)) {
> > ret = hv_result_to_errno(status);
> > break;
> > }
> > -
> > - done += completed;
> > }
> > if (ret && done) {
> > u32 unmap_flags = 0;
> > - if (flags & HV_MAP_GPA_LARGE_PAGE) {
> > + if (flags & HV_MAP_GPA_LARGE_PAGE)
> > unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
> > - done <<= large_shift;
> > - }
> > - hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
> > + hv_call_unmap_gpa_pages(partition_id, gfn,
> > + done << large_shift, unmap_flags);
> > }
> > return ret;
> > @@ -305,7 +292,7 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> > struct hv_input_unmap_gpa_pages *input_page;
> > u64 status, page_count = page_count_4k;
> > unsigned long irq_flags, large_shift = 0;
> > - int ret = 0, done = 0;
> > + u64 done = 0;
> > if (page_count == 0)
> > return -EINVAL;
> > @@ -319,8 +306,8 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> > }
> > while (done < page_count) {
> > - ulong completed, remain = page_count - done;
> > - int rep_count = min(remain, HV_UMAP_GPA_PAGES);
> > + u64 completed, remain = page_count - done;
> > + u64 rep_count = min_t(u64, remain, HV_UMAP_GPA_PAGES);
> > local_irq_save(irq_flags);
> > input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > @@ -333,15 +320,13 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> > local_irq_restore(irq_flags);
> > completed = hv_repcomp(status);
> > - if (!hv_result_success(status)) {
> > - ret = hv_result_to_errno(status);
> > - break;
> > - }
> > -
> > done += completed;
> > +
> > + if (!hv_result_success(status))
> > + return hv_result_to_errno(status);
> > }
> > - return ret;
> > + return 0;
> > }
> > int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
> >
> >
>
^ permalink raw reply
* Re: [PATCH 00/10] mshv: Bug fixes across the mshv_root module
From: Stanislav Kinsburskii @ 2026-04-30 14:40 UTC (permalink / raw)
To: Mukesh R
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <daacfbcc-e725-65f2-4b20-b4501e45e651@linux.microsoft.com>
On Wed, Apr 29, 2026 at 07:18:44PM -0700, Mukesh R wrote:
> On 4/29/26 11:17, Stanislav Kinsburskii wrote:
> > This series addresses bugs found during a review of the mshv_root module
> > introduced by commit 621191d709b14 ("Drivers: hv: Introduce mshv_root
> > module to expose /dev/mshv to VMMs").
> >
> > The fixes range from data corruption and use-after-free to silent
> > functional failures:
> >
> > - IRQ state leak and type truncation in hypercall helpers
> > (hv_call_modify_spa_host_access)
> > - Integer overflow on userspace-controlled allocation size
> > (mshv_region_create)
> > - Missing locking, broken seqcount read protection, and a check on
> > uninitialized data in the irqfd path -- the latter makes
> > level-triggered interrupt resampling completely non-functional
> > - Duplicate GSI 0 detection using the wrong predicate
> > - Use-after-RCU in port ID lookup
> > - Missing VP index bounds check in intercept ISR (OOB in interrupt
> > context)
> > - Missing error code on VP allocation failure (silent success to
> > userspace)
>
> Lot of changes here, curious, how were all these discovered
> suddenly? Stress testing, internal/external? Or reported by
> copilot/sashiko/etc..
>
These are suggested by Claude Opus 4.6.
> How were the fixes tested?
>
I ran the Cloud Hypervisor integration test suite against these changes,
which covers a wide range of scenarios including interrupt handling,
memory management, and VP lifecycle.
Thanks,
Stanislav
> Thanks,
> -Mukesh
>
>
> > ---
> >
> > Stanislav Kinsburskii (10):
> > mshv: Fix IRQ leak and type hazards in hv_call_modify_spa_host_access
> > mshv: Fix potential integer overflow in mshv_region_create
> > mshv: Fix missing lock in mshv_irqfd_deassign
> > mshv: Fix broken seqcount read protection
> > mshv: Fix level-triggered check on uninitialized data
> > mshv: Fix duplicate GSI detection for GSI 0
> > mshv: Fix use-after-RCU in mshv_portid_lookup
> > mshv: Use kfree_rcu in mshv_portid_free
> > mshv: Add missing vp_index bounds check in intercept ISR
> > mshv: Fix missing error code on VP allocation failure
> >
> >
> > drivers/hv/mshv_eventfd.c | 75 ++++++++++++++++++++++------------------
> > drivers/hv/mshv_irq.c | 2 +
> > drivers/hv/mshv_portid_table.c | 6 +--
> > drivers/hv/mshv_regions.c | 2 +
> > drivers/hv/mshv_root_hv_call.c | 18 +++-------
> > drivers/hv/mshv_root_main.c | 4 ++
> > drivers/hv/mshv_synic.c | 4 ++
> > 7 files changed, 59 insertions(+), 52 deletions(-)
> >
>
^ permalink raw reply
* Re: [PATCH] mshv: Simplify GPA map/unmap hypercall helpers
From: Anirudh Rayabharam @ 2026-04-30 9:57 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <afIgeaLSiCG4f8lW@skinsburskii.localdomain>
On Wed, Apr 29, 2026 at 08:15:05AM -0700, Stanislav Kinsburskii wrote:
> On Wed, Apr 29, 2026 at 11:02:37AM +0000, Anirudh Rayabharam wrote:
> > On Tue, Apr 28, 2026 at 11:21:12PM +0000, Stanislav Kinsburskii wrote:
> > > Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
> > > preceding bug-fix patches:
> > >
> > > Move "done += completed" before the status checks so that pages mapped
> > > by a partially-successful batch are included in the error cleanup unmap.
> > > Previously these mappings were leaked on failure.
> > >
> > > While here, improve type safety and readability:
> > > - Change "int done" to "u64 done" to match the u64 page_count it is
> > > compared against, avoiding signed/unsigned comparison hazards.
> > > - Use u64 for loop iteration and batch size variables consistently.
> > > - Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
> > > - Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
> > > - Simplify the error-path unmap to use "done << large_shift" directly
> > > instead of mutating done in place.
> > >
> > > Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
> > > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > > ---
> > > drivers/hv/mshv_root_hv_call.c | 55 +++++++++++++++-------------------------
> > > 1 file changed, 20 insertions(+), 35 deletions(-)
> > >
> > > diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> > > index e5992c324904a..f5f205a397834 100644
> > > --- a/drivers/hv/mshv_root_hv_call.c
> > > +++ b/drivers/hv/mshv_root_hv_call.c
> > > @@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > > struct hv_input_map_gpa_pages *input_page;
> > > u64 status, *pfnlist;
> > > unsigned long irq_flags, large_shift = 0;
> > > - int ret = 0, done = 0;
> > > - u64 page_count = page_struct_count;
> > > + u64 done = 0, page_count = page_struct_count;
> > > + int ret = 0;
> > >
> > > if (page_count == 0 || (pages && mmio_spa))
> > > return -EINVAL;
> > > @@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > > }
> > >
> > > while (done < page_count) {
> > > - ulong i, completed, remain = page_count - done;
> > > - int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
> > > + u64 i, completed, remain = page_count - done;
> > > + u64 rep_count = min(remain, (u64)HV_MAP_GPA_BATCH_SIZE);
> > >
> > > local_irq_save(irq_flags);
> > > input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > > @@ -224,23 +224,13 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > > input_page->map_flags = flags;
> > > pfnlist = input_page->source_gpa_page_list;
> > >
> > > - for (i = 0; i < rep_count; i++)
> > > - if (flags & HV_MAP_GPA_NO_ACCESS) {
> > > + for (i = 0; i < rep_count; i++) {
> > > + if (flags & HV_MAP_GPA_NO_ACCESS)
> > > pfnlist[i] = 0;
> > > - } else if (pages) {
> > > - u64 index = (done + i) << large_shift;
> > > -
> > > - if (index >= page_struct_count) {
> > > - ret = -EINVAL;
> > > - break;
> > > - }
> > > - pfnlist[i] = page_to_pfn(pages[index]);
> > > - } else {
> > > + else if (pages)
> > > + pfnlist[i] = page_to_pfn(pages[(done + i) << large_shift]);
> > > + else
> > > pfnlist[i] = mmio_spa + done + i;
> > > - }
> > > - if (ret) {
> > > - local_irq_restore(irq_flags);
> > > - break;
> > > }
> > >
> > > status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
> > > @@ -248,29 +238,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > > local_irq_restore(irq_flags);
> > >
> > > completed = hv_repcomp(status);
> > > + done += completed;
> > >
> > > if (hv_result_needs_memory(status)) {
> > > ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> > > HV_MAP_GPA_DEPOSIT_PAGES);
> > > if (ret)
> > > break;
> > > -
> > > } else if (!hv_result_success(status)) {
> > > ret = hv_result_to_errno(status);
> > > break;
> > > }
> > > -
> > > - done += completed;
> > > }
> > >
> > > if (ret && done) {
> > > u32 unmap_flags = 0;
> > >
> > > - if (flags & HV_MAP_GPA_LARGE_PAGE) {
> > > + if (flags & HV_MAP_GPA_LARGE_PAGE)
> > > unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
> > > - done <<= large_shift;
> > > - }
> > > - hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
> > > + hv_call_unmap_gpa_pages(partition_id, gfn,
> > > + done << large_shift, unmap_flags);
> >
> > How does this work? Earlier we were doing "done << large_shift" only if
> > HV_MAP_GPA_LARGE_PAGE is set but now we always do it.
> >
>
> It works because large_shift is initialized to 0 when
> HV_MAP_GPA_LARGE_PAGE is not set.
Oh I see.
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
^ permalink raw reply
* [PATCH net-next v2] net: mana: hardening: Reject zero max_num_queues from MANA_QUERY_VPORT_CONFIG
From: Erni Sri Satya Vennela @ 2026-04-30 8:56 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, ernis, dipayanroy, shirazsaleem, kees,
linux-hyperv, netdev, linux-kernel
As a part of MANA hardening for CVM, validate that max_num_sq and
max_num_rq returned by MANA_QUERY_VPORT_CONFIG are not zero. These
values flow into apc->num_queues, which is used as an allocation count
and loop bound. A zero value would result in zero-size allocations and
incorrect driver behavior.
Return -EPROTO if either value is zero.
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v2:
* Rebase to latest main.
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a654b3699c4c..7c83e010a1e6 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1262,6 +1262,12 @@ static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
*max_sq = resp.max_num_sq;
*max_rq = resp.max_num_rq;
+
+ if (*max_sq == 0 || *max_rq == 0) {
+ netdev_err(apc->ndev, "Invalid max queues from vPort config\n");
+ return -EPROTO;
+ }
+
if (resp.num_indirection_ent > 0 &&
resp.num_indirection_ent <= MANA_INDIRECT_TABLE_MAX_SIZE &&
is_power_of_2(resp.num_indirection_ent)) {
--
2.34.1
^ permalink raw reply related
* [PATCH net-next v2] net: mana: hardening: Reject zero max_num_queues from GDMA_QUERY_MAX_RESOURCES
From: Erni Sri Satya Vennela @ 2026-04-30 8:36 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, horms, shradhagupta, dipayanroy, ernis,
yury.norov, linux-hyperv, netdev, linux-kernel
In a CVM environment, hardware responses cannot be trusted. The
GDMA_QUERY_MAX_RESOURCES command returns resource limits used to
determine the maximum number of queues.
In mana_gd_query_max_resources(), gc->max_num_queues is initialized
from num_online_cpus() and successively clamped by the hardware-reported
max_eq, max_cq, max_sq, max_rq, and num_msix_usable values. If any of
these hardware values is zero, gc->max_num_queues becomes zero and the
function returns success. This leads to a confusing failure later when
alloc_etherdev_mq() is called with zero queues, returning NULL and
producing a misleading -ENOMEM error.
Add an explicit zero check for gc->max_num_queues after all clamping
steps and return -ENOSPC for a clear early failure, consistent with the
existing gc->num_msix_usable <= 1 guard.
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v2:
* Rebase to latest main.
---
drivers/net/ethernet/microsoft/mana/gdma_main.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 098fbda0d128..f3316e929175 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -194,6 +194,9 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
if (gc->max_num_queues > gc->num_msix_usable - 1)
gc->max_num_queues = gc->num_msix_usable - 1;
+ if (gc->max_num_queues == 0)
+ return -ENOSPC;
+
return 0;
}
--
2.34.1
^ permalink raw reply related
* [PATCH net-next v7] net: mana: Expose hardware diagnostic info via debugfs
From: Erni Sri Satya Vennela @ 2026-04-30 7:53 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, kotaranov, horms, shradhagupta, ernis,
dipayanroy, yury.norov, shirazsaleem, kees, linux-hyperv, netdev,
linux-kernel, linux-rdma
Add debugfs entries to expose hardware configuration and diagnostic
information that aids in debugging driver initialization and runtime
operations without adding noise to dmesg.
The debugfs directory for each PCI device is named using pci_name()
(the unique BDF address), and its creation and removal is integrated
into mana_gd_setup() and mana_gd_cleanup_device() respectively, so
that all callers (probe, remove, suspend, resume, shutdown) share a
single code path.
Device-level entries (under /sys/kernel/debug/mana/<BDF>/):
- num_msix_usable, max_num_queues: Max resources from hardware
- gdma_protocol_ver, pf_cap_flags1: VF version negotiation results
- num_vports, bm_hostmode: Device configuration
Per-vPort entries (under /sys/kernel/debug/mana/<BDF>/vportN/):
- port_handle: Hardware vPort handle
- max_sq, max_rq: Max queues from vPort config
- indir_table_sz: Indirection table size
- steer_rx, steer_rss, steer_update_tab, steer_cqe_coalescing:
Last applied steering configuration parameters
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v7:
* Rebase to latest main.
Changes in v6:
* Move out of patchset and create a separate patch.
Changes in v5:
* Update commit message.
* Fix conflicts to align with the new patches.
* Make it part of patchset.
Changes in v4:
* Rebase and fix conflicts.
Changes in v3:
* Rename mana_gd_cleanup to mana_gd_cleanup_device.
* Add creation of debugfs entries in mana_gd_setup.
* Add removal of debugfs entries in mana_gd_cleanup_device.
* Remove bm_hostmode and num_vports from debugfs in mana_remove itself,
because "ac" gets freed before debugfs_remove_recursive, to avoid
Use-After-Free error.
* Add "goto out:" in mana_cfg_vport_steering to avoid populating apc
values when resp.hdr.status is non-zero.
Changes in v2:
* Add debugfs_remove_recursive for gc->mana_pci_debugfs in
mana_gd_suspend to handle multiple duplicates creation in
mana_gd_setup and mana_gd_resume path.
* Move debugfs creation for num_vports and bm_hostmode out of
if(!resuming) condition since we have to create it again even for
resume.
* Recreate mana_pci_debugfs in mana_gd_resume.
---
.../net/ethernet/microsoft/mana/gdma_main.c | 68 +++++++++++--------
drivers/net/ethernet/microsoft/mana/mana_en.c | 33 +++++++++
include/net/mana/gdma.h | 1 +
include/net/mana/mana.h | 8 +++
4 files changed, 81 insertions(+), 29 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 098fbda0d128..33fd7d9259c9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -194,6 +194,11 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
if (gc->max_num_queues > gc->num_msix_usable - 1)
gc->max_num_queues = gc->num_msix_usable - 1;
+ debugfs_create_u32("num_msix_usable", 0400, gc->mana_pci_debugfs,
+ &gc->num_msix_usable);
+ debugfs_create_u32("max_num_queues", 0400, gc->mana_pci_debugfs,
+ &gc->max_num_queues);
+
return 0;
}
@@ -1264,6 +1269,13 @@ int mana_gd_verify_vf_version(struct pci_dev *pdev)
return err ? err : -EPROTO;
}
gc->pf_cap_flags1 = resp.pf_cap_flags1;
+ gc->gdma_protocol_ver = resp.gdma_protocol_ver;
+
+ debugfs_create_x64("gdma_protocol_ver", 0400, gc->mana_pci_debugfs,
+ &gc->gdma_protocol_ver);
+ debugfs_create_x64("pf_cap_flags1", 0400, gc->mana_pci_debugfs,
+ &gc->pf_cap_flags1);
+
if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) {
err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout);
if (err) {
@@ -1943,15 +1955,20 @@ static int mana_gd_setup(struct pci_dev *pdev)
struct gdma_context *gc = pci_get_drvdata(pdev);
int err;
+ gc->mana_pci_debugfs = debugfs_create_dir(pci_name(pdev),
+ mana_debugfs_root);
+
err = mana_gd_init_registers(pdev);
if (err)
- return err;
+ goto remove_debugfs;
mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base);
gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0);
- if (!gc->service_wq)
- return -ENOMEM;
+ if (!gc->service_wq) {
+ err = -ENOMEM;
+ goto remove_debugfs;
+ }
err = mana_gd_setup_hwc_irqs(pdev);
if (err) {
@@ -1992,11 +2009,14 @@ static int mana_gd_setup(struct pci_dev *pdev)
free_workqueue:
destroy_workqueue(gc->service_wq);
gc->service_wq = NULL;
+remove_debugfs:
+ debugfs_remove_recursive(gc->mana_pci_debugfs);
+ gc->mana_pci_debugfs = NULL;
dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err);
return err;
}
-static void mana_gd_cleanup(struct pci_dev *pdev)
+static void mana_gd_cleanup_device(struct pci_dev *pdev)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
@@ -2008,6 +2028,10 @@ static void mana_gd_cleanup(struct pci_dev *pdev)
destroy_workqueue(gc->service_wq);
gc->service_wq = NULL;
}
+
+ debugfs_remove_recursive(gc->mana_pci_debugfs);
+ gc->mana_pci_debugfs = NULL;
+
dev_dbg(&pdev->dev, "mana gdma cleanup successful\n");
}
@@ -2065,9 +2089,6 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
gc->dev = &pdev->dev;
xa_init(&gc->irq_contexts);
- gc->mana_pci_debugfs = debugfs_create_dir(pci_name(pdev),
- mana_debugfs_root);
-
err = mana_gd_setup(pdev);
if (err)
goto unmap_bar;
@@ -2096,16 +2117,8 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
cleanup_mana:
mana_remove(&gc->mana, false);
cleanup_gd:
- mana_gd_cleanup(pdev);
+ mana_gd_cleanup_device(pdev);
unmap_bar:
- /*
- * at this point we know that the other debugfs child dir/files
- * are either not yet created or are already cleaned up.
- * The pci debugfs folder clean-up now, will only be cleaning up
- * adapter-MTU file and apc->mana_pci_debugfs folder.
- */
- debugfs_remove_recursive(gc->mana_pci_debugfs);
- gc->mana_pci_debugfs = NULL;
xa_destroy(&gc->irq_contexts);
pci_iounmap(pdev, bar0_va);
free_gc:
@@ -2155,11 +2168,7 @@ static void mana_gd_remove(struct pci_dev *pdev)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, false);
- mana_gd_cleanup(pdev);
-
- debugfs_remove_recursive(gc->mana_pci_debugfs);
-
- gc->mana_pci_debugfs = NULL;
+ mana_gd_cleanup_device(pdev);
xa_destroy(&gc->irq_contexts);
@@ -2181,7 +2190,7 @@ int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
- mana_gd_cleanup(pdev);
+ mana_gd_cleanup_device(pdev);
return 0;
}
@@ -2201,13 +2210,18 @@ int mana_gd_resume(struct pci_dev *pdev)
err = mana_probe(&gc->mana, true);
if (err)
- return err;
+ goto cleanup_gd;
err = mana_rdma_probe(&gc->mana_ib);
if (err)
- return err;
+ goto cleanup_mana;
return 0;
+cleanup_mana:
+ mana_remove(&gc->mana, true);
+cleanup_gd:
+ mana_gd_cleanup_device(pdev);
+ return err;
}
/* Quiesce the device for kexec. This is also called upon reboot/shutdown. */
@@ -2220,11 +2234,7 @@ static void mana_gd_shutdown(struct pci_dev *pdev)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
- mana_gd_cleanup(pdev);
-
- debugfs_remove_recursive(gc->mana_pci_debugfs);
-
- gc->mana_pci_debugfs = NULL;
+ mana_gd_cleanup_device(pdev);
pci_disable_device(pdev);
}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a654b3699c4c..077d3a1ff6bf 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1276,6 +1276,9 @@ static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
apc->port_handle = resp.vport;
ether_addr_copy(apc->mac_addr, resp.mac_addr);
+ apc->vport_max_sq = *max_sq;
+ apc->vport_max_rq = *max_rq;
+
return 0;
}
@@ -1430,6 +1433,11 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
netdev_info(ndev, "Configured steering vPort %llu entries %u\n",
apc->port_handle, apc->indir_table_sz);
+
+ apc->steer_rx = rx;
+ apc->steer_rss = apc->rss_state;
+ apc->steer_update_tab = update_tab;
+ apc->steer_cqe_coalescing = req->cqe_coalescing_enable;
out:
kfree(req);
return err;
@@ -3161,6 +3169,23 @@ static int mana_init_port(struct net_device *ndev)
eth_hw_addr_set(ndev, apc->mac_addr);
sprintf(vport, "vport%d", port_idx);
apc->mana_port_debugfs = debugfs_create_dir(vport, gc->mana_pci_debugfs);
+
+ debugfs_create_u64("port_handle", 0400, apc->mana_port_debugfs,
+ &apc->port_handle);
+ debugfs_create_u32("max_sq", 0400, apc->mana_port_debugfs,
+ &apc->vport_max_sq);
+ debugfs_create_u32("max_rq", 0400, apc->mana_port_debugfs,
+ &apc->vport_max_rq);
+ debugfs_create_u32("indir_table_sz", 0400, apc->mana_port_debugfs,
+ &apc->indir_table_sz);
+ debugfs_create_u32("steer_rx", 0400, apc->mana_port_debugfs,
+ &apc->steer_rx);
+ debugfs_create_u32("steer_rss", 0400, apc->mana_port_debugfs,
+ &apc->steer_rss);
+ debugfs_create_u32("steer_update_tab", 0400, apc->mana_port_debugfs,
+ &apc->steer_update_tab);
+ debugfs_create_u32("steer_cqe_coalescing", 0400, apc->mana_port_debugfs,
+ &apc->steer_cqe_coalescing);
debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs,
&apc->speed);
return 0;
@@ -3659,6 +3684,11 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
ac->bm_hostmode = bm_hostmode;
+ debugfs_create_u16("num_vports", 0400, gc->mana_pci_debugfs,
+ &ac->num_ports);
+ debugfs_create_u8("bm_hostmode", 0400, gc->mana_pci_debugfs,
+ &ac->bm_hostmode);
+
if (!resuming) {
ac->num_ports = num_ports;
} else {
@@ -3800,6 +3830,9 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
mana_gd_deregister_device(gd);
+ debugfs_lookup_and_remove("bm_hostmode", gc->mana_pci_debugfs);
+ debugfs_lookup_and_remove("num_vports", gc->mana_pci_debugfs);
+
if (suspending)
return;
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6d836060976a..70d62bc32837 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -442,6 +442,7 @@ struct gdma_context {
struct gdma_dev mana_ib;
u64 pf_cap_flags1;
+ u64 gdma_protocol_ver;
struct workqueue_struct *service_wq;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 8f721cd4e4a7..18215388d2c7 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -568,6 +568,14 @@ struct mana_port_context {
/* Debugfs */
struct dentry *mana_port_debugfs;
+
+ /* Cached vport/steering config for debugfs */
+ u32 vport_max_sq;
+ u32 vport_max_rq;
+ u32 steer_rx;
+ u32 steer_rss;
+ u32 steer_update_tab;
+ u32 steer_cqe_coalescing;
};
netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev);
--
2.43.0
^ permalink raw reply related
* Re: [PATCH 0/8] firmware: sysfb: Consolidate config/code wrt. sysfb_primary_screen
From: Thomas Zimmermann @ 2026-04-30 6:35 UTC (permalink / raw)
To: patchwork-bot+linux-riscv
Cc: linux-riscv, javierm, arnd, ardb, ilias.apalodimas, chenhuacai,
kernel, maarten.lankhorst, mripard, airlied, simona, kys,
haiyangz, wei.liu, decui, longli, deller, linux-arm-kernel,
loongarch, linux-efi, dri-devel, linux-hyperv, linux-fbdev
In-Reply-To: <177751955329.2274119.12779807302343885295.git-patchwork-notify@kernel.org>
Hi
Am 30.04.26 um 05:25 schrieb patchwork-bot+linux-riscv@kernel.org:
> Hello:
>
> This series was applied to riscv/linux.git (fixes)
> by Ard Biesheuvel <ardb@kernel.org>:
Patch 3 was fairly controversial.
Best regards
Thomas
>
> On Thu, 2 Apr 2026 11:09:14 +0200 you wrote:
>> The global state sysfb_primary_screen holds information about the
>> framebuffer provided by EFI/BIOS systems. It is part of the sysfb
>> module, but used in several places without direct connection to
>> sysfb. Fix this by making users of sysfb_primary_screen depend on
>> CONFIG_SYSFB. Fix a few issues in the process.
>>
>> Patches 1 and 2 fix general errors in the Kconfig rules. In any case,
>> these patches should be considered even without the rest of the series.
>>
>> [...]
> Here is the summary with links:
> - [1/8] hv: Select CONFIG_SYSFB only for CONFIG_HYPERV_VMBUS
> https://git.kernel.org/riscv/c/d33db956c961
> - [2/8] firmware: efi: Never declare sysfb_primary_display on x86
> https://git.kernel.org/riscv/c/5241c2ca33bb
> - [3/8] firmware: sysfb: Make CONFIG_SYSFB a user-selectable option
> (no matching commit)
> - [4/8] firmware: sysfb: Split sysfb.c into sysfb_primary.c and sysfb_pci.c
> (no matching commit)
> - [5/8] firmware: sysfb: Implement screen_info relocation for primary display
> (no matching commit)
> - [6/8] firmware: sysfb: Avoid forward-declaring sysfb_parent_dev()
> (no matching commit)
> - [7/8] firmware: efi: Make CONFIG_EFI_EARLYCON depend on CONFIG_SYSFB; clean up
> (no matching commit)
> - [8/8] firmware: sysfb: Move CONFIG_FIRMWARE_EDID to firmware options
> (no matching commit)
>
> You are awesome, thank you!
--
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstr. 146, 90461 Nürnberg, Germany, www.suse.com
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich, (HRB 36809, AG Nürnberg)
^ permalink raw reply
* Re: [PATCH 3/3] net: mana: remove double CQ cleanup in mana_create_rxq error path
From: Aditya Garg @ 2026-04-30 4:14 UTC (permalink / raw)
To: Dipayaan Roy, kys, haiyangz, wei.liu, decui, andrew+netdev, davem,
edumazet, kuba, pabeni, leon, longli, kotaranov, horms,
shradhagupta, ssengar, ernis, shirazsaleem, linux-hyperv, netdev,
linux-kernel, linux-rdma, stephen, jacob.e.keller, dipayanroy,
leitao, kees, john.fastabend, hawk, bpf, daniel, ast, sdf,
yury.norov
In-Reply-To: <20260430035935.1859220-4-dipayanroy@linux.microsoft.com>
On 30-04-2026 09:27, Dipayaan Roy wrote:
> In mana_create_rxq(), the error cleanup path calls mana_destroy_rxq()
> followed by mana_deinit_cq(). This is incorrect for two reasons:
>
> 1. mana_destroy_rxq() already calls mana_deinit_cq() internally,
> so the CQ's GDMA queue is destroyed twice.
>
> 2. mana_destroy_rxq() frees the rxq via kfree(rxq) before returning.
> The subsequent mana_deinit_cq(apc, cq) then operates on freed memory
> since cq points to &rxq->rx_cq, which is embedded in the
> already-freed rxq structure — a use-after-free.
>
> Remove the redundant mana_deinit_cq() call from the error path since
> mana_destroy_rxq() already handles CQ cleanup. mana_deinit_cq() is
> itself safe for an uninitialized CQ as it checks for a NULL gdma_cq
> before proceeding.
>
> Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
> Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
> ---
> drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ---
> 1 file changed, 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index f2a6ea162dc3..9afc786b297a 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -2799,9 +2799,6 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
>
> mana_destroy_rxq(apc, rxq, false);
>
> - if (cq)
> - mana_deinit_cq(apc, cq);
> -
> return NULL;
> }
>
Reviewed-by: Aditya Garg <gargaditya@linux.microsoft.com>
^ permalink raw reply
* [PATCH 3/3] net: mana: remove double CQ cleanup in mana_create_rxq error path
From: Dipayaan Roy @ 2026-04-30 3:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260430035935.1859220-1-dipayanroy@linux.microsoft.com>
In mana_create_rxq(), the error cleanup path calls mana_destroy_rxq()
followed by mana_deinit_cq(). This is incorrect for two reasons:
1. mana_destroy_rxq() already calls mana_deinit_cq() internally,
so the CQ's GDMA queue is destroyed twice.
2. mana_destroy_rxq() frees the rxq via kfree(rxq) before returning.
The subsequent mana_deinit_cq(apc, cq) then operates on freed memory
since cq points to &rxq->rx_cq, which is embedded in the
already-freed rxq structure — a use-after-free.
Remove the redundant mana_deinit_cq() call from the error path since
mana_destroy_rxq() already handles CQ cleanup. mana_deinit_cq() is
itself safe for an uninitialized CQ as it checks for a NULL gdma_cq
before proceeding.
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index f2a6ea162dc3..9afc786b297a 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2799,9 +2799,6 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
mana_destroy_rxq(apc, rxq, false);
- if (cq)
- mana_deinit_cq(apc, cq);
-
return NULL;
}
--
2.43.0
^ permalink raw reply related
* [PATCH 2/3] net: mana: Skip WQ object destruction for uninitialized RXQ
From: Dipayaan Roy @ 2026-04-30 3:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260430035935.1859220-1-dipayanroy@linux.microsoft.com>
In mana_destroy_rxq(), mana_destroy_wq_obj() is called unconditionally
even when the WQ object was never created (rxobj is still
INVALID_MANA_HANDLE). When mana_create_rxq() fails before
mana_create_wq_obj() succeeds, the error path calls mana_destroy_rxq()
which sends a bogus destroy command to the hardware:
mana 7870:00:00.0: HWC: Failed hw_channel req: 0x1d
mana 7870:00:00.0: Failed to send mana message: -71, 0x1d
mana 7870:00:00.0 eth7: Failed to destroy WQ object: -71
Guard mana_destroy_wq_obj() with an INVALID_MANA_HANDLE check so that
mana_destroy_rxq() is safe to call at any stage of RXQ initialization.
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index dfb4ba9f7664..f2a6ea162dc3 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2524,7 +2524,8 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
if (xdp_rxq_info_is_reg(&rxq->xdp_rxq))
xdp_rxq_info_unreg(&rxq->xdp_rxq);
- mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj);
+ if (rxq->rxobj != INVALID_MANA_HANDLE)
+ mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj);
mana_deinit_cq(apc, &rxq->rx_cq);
--
2.43.0
^ permalink raw reply related
* [PATCH 1/3] net: mana: check xdp_rxq registration before unreg in mana_destroy_rxq()
From: Dipayaan Roy @ 2026-04-30 3:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260430035935.1859220-1-dipayanroy@linux.microsoft.com>
When mana_create_rxq() fails at mana_create_wq_obj() or any step before
xdp_rxq_info_reg() is called, the error path jumps to `out:` which calls
mana_destroy_rxq(). mana_destroy_rxq() unconditionally calls
xdp_rxq_info_unreg() on an xdp_rxq that was never registered,
triggering a WARN_ON in net/core/xdp.c:
mana 7870:00:00.0: HWC: Failed hw_channel req: 0xc000009a
mana 7870:00:00.0 eth7: Failed to create RXQ: err = -71
Driver BUG
WARNING: CPU: 442 PID: 491615 at ../net/core/xdp.c:150 xdp_rxq_info_unreg+0x44/0x70
Modules linked in: tcp_bbr xsk_diag udp_diag raw_diag unix_diag af_packet_diag netlink_diag nf_tables nfnetlink tcp_diag inet_diag binfmt_misc rpcsec_gss_krb5 nfsv3 nfs_acl auth_rpcgss nfsv4 dns_resolver nfs lockd ext4 grace crc16 iscsi_tcp mbcache fscache libiscsi_tcp jbd2 netfs rpcrdma af_packet sunrpc rdma_ucm ib_iser rdma_cm iw_cm iscsi_ibft ib_cm iscsi_boot_sysfs libiscsi rfkill scsi_transport_iscsi mana_ib ib_uverbs ib_core mana hyperv_drm(X) drm_shmem_helper intel_rapl_msr drm_kms_helper intel_rapl_common syscopyarea nls_iso8859_1 sysfillrect intel_uncore_frequency_common nls_cp437 vfat fat nfit sysimgblt libnvdimm hv_netvsc(X) hv_utils(X) fb_sys_fops hv_balloon(X) joydev fuse drm dm_mod configfs ip_tables x_tables xfs libcrc32c sd_mod nvme nvme_core nvme_common t10_pi crc64_rocksoft_generic crc64_rocksoft crc64 hid_generic serio_raw pci_hyperv(X) hv_storvsc(X) scsi_transport_fc hyperv_keyboard(X) hid_hyperv(X) pci_hyperv_intf(X) crc32_pclmul
crc32c_intel ghash_clmulni_intel aesni_intel crypto_simd cryptd hv_vmbus(X) softdog sg scsi_mod efivarfs
Supported: Yes, External
CPU: 442 PID: 491615 Comm: ethtool Kdump: loaded Tainted: G X 5.14.21-150500.55.136-default #1 SLE15-SP5 a627be1b53abbfd64ad16b2685e4308c52847f42
Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 07/25/2025
RIP: 0010:xdp_rxq_info_unreg+0x44/0x70
Code: e8 91 fe ff ff c7 43 0c 02 00 00 00 48 c7 03 00 00 00 00 5b c3 cc cc cc cc e9 58 3a 1c 00 48 c7 c7 f6 5f 19 97 e8 5c a4 7e ff <0f> 0b 83 7b 0c 01 74 ca 48 c7 c7 d9 5f 19 97 e8 48 a4 7e ff 0f 0b
RSP: 0018:ff3df6c8f7207818 EFLAGS: 00010286
RAX: 0000000000000000 RBX: ff30d89f94808a80 RCX: 0000000000000027
RDX: 0000000000000000 RSI: 0000000000000002 RDI: ff30d94bdcca2908
RBP: 0000000000080000 R08: ffffffff98ed11a0 R09: ff3df6c8f72077a0
R10: dead000000000100 R11: 000000000000000a R12: 0000000000000000
R13: 0000000000002000 R14: 0000000000040000 R15: ff30d89f94800000
FS: 00007fe6d8432b80(0000) GS:ff30d94bdcc80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fe6d81a89b1 CR3: 00000b3b6d578001 CR4: 0000000000371ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
Call Trace:
<TASK>
mana_destroy_rxq+0x5b/0x2f0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
mana_create_rxq.isra.55+0x3db/0x720 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
? simple_lookup+0x36/0x50
? current_time+0x42/0x80
? __d_free_external+0x30/0x30
mana_alloc_queues+0x32a/0x470 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
? _raw_spin_unlock+0xa/0x30
? d_instantiate.part.29+0x2e/0x40
? _raw_spin_unlock+0xa/0x30
? debugfs_create_dir+0xe4/0x140
mana_attach+0x5c/0xf0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
mana_set_ringparam+0xd5/0x1a0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
ethnl_set_rings+0x292/0x320
genl_family_rcv_msg_doit.isra.15+0x11b/0x150
genl_rcv_msg+0xe3/0x1e0
? rings_prepare_data+0x80/0x80
? genl_family_rcv_msg_doit.isra.15+0x150/0x150
netlink_rcv_skb+0x50/0x100
genl_rcv+0x24/0x40
netlink_unicast+0x1b6/0x280
netlink_sendmsg+0x365/0x4d0
sock_sendmsg+0x5f/0x70
__sys_sendto+0x112/0x140
__x64_sys_sendto+0x24/0x30
do_syscall_64+0x5b/0x80
? handle_mm_fault+0xd7/0x290
? do_user_addr_fault+0x2d8/0x740
? exc_page_fault+0x67/0x150
entry_SYSCALL_64_after_hwframe+0x6b/0xd5
RIP: 0033:0x7fe6d8122f06
Code: 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 11 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 72 f3 c3 41 57 41 56 4d 89 c7 41 55 41 54 41
RSP: 002b:00007fff2b66b068 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 000055771123d2a0 RCX: 00007fe6d8122f06
RDX: 0000000000000034 RSI: 000055771123d3b0 RDI: 0000000000000003
RBP: 00007fff2b66b100 R08: 00007fe6d8203360 R09: 000000000000000c
R10: 0000000000000000 R11: 0000000000000246 R12: 000055771123d350
R13: 000055771123d340 R14: 0000000000000000 R15: 00007fff2b66b2b0
</TASK>
Guard the xdp_rxq_info_unreg() call with xdp_rxq_info_is_reg() so that
mana_destroy_rxq() is safe to call regardless of how far initialization
progressed.
Fixes: ed5356b53f07 ("net: mana: Add XDP support")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a654b3699c4c..dfb4ba9f7664 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2520,7 +2520,9 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
napi_disable_locked(napi);
netif_napi_del_locked(napi);
}
- xdp_rxq_info_unreg(&rxq->xdp_rxq);
+
+ if (xdp_rxq_info_is_reg(&rxq->xdp_rxq))
+ xdp_rxq_info_unreg(&rxq->xdp_rxq);
mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj);
--
2.43.0
^ permalink raw reply related
* [PATCH 0/3] net: mana: Fix mana_destroy_rxq() cleanup for partial RXQ init
From: Dipayaan Roy @ 2026-04-30 3:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
When mana_create_rxq() fails partway through initialization (e.g. the
hardware rejects the WQ object creation), the error path calls
mana_destroy_rxq() to tear down a partially-initialized RXQ.
This exposed several problems in the mana_destroy_rxq() path, which
assumed the RXQ was always fully initialized:
1. xdp_rxq_info_unreg() was called on an unregistered xdp_rxq,
triggering a WARN_ON ("Driver BUG") in net/core/xdp.c.
2. mana_destroy_wq_obj() was called with INVALID_MANA_HANDLE,
sending a bogus destroy command to the hardware.
3. mana_deinit_cq() was called twice — once inside mana_destroy_rxq()
and again in mana_create_rxq()'s error path — causing a
use-after-free since mana_destroy_rxq() frees the rxq first.
This was observed during ethtool ring parameter changes when the
hardware returned an error creating the RXQ. This series makes
mana_destroy_rxq() safe to call at any stage of RXQ initialization
by guarding each teardown step, and removes the redundant cleanup
in mana_create_rxq().
Dipayaan Roy (3):
net: mana: check xdp_rxq registration before unreg in
mana_destroy_rxq()
net: mana: Skip WQ object destruction for uninitialized RXQ
net: mana: remove double CQ cleanup in mana_create_rxq error path
drivers/net/ethernet/microsoft/mana/mana_en.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
--
2.43.0
^ permalink raw reply
* Re: [PATCH 0/8] firmware: sysfb: Consolidate config/code wrt. sysfb_primary_screen
From: patchwork-bot+linux-riscv @ 2026-04-30 3:25 UTC (permalink / raw)
To: Thomas Zimmermann
Cc: linux-riscv, javierm, arnd, ardb, ilias.apalodimas, chenhuacai,
kernel, maarten.lankhorst, mripard, airlied, simona, kys,
haiyangz, wei.liu, decui, longli, deller, linux-arm-kernel,
loongarch, linux-efi, dri-devel, linux-hyperv, linux-fbdev
In-Reply-To: <20260402092305.208728-1-tzimmermann@suse.de>
Hello:
This series was applied to riscv/linux.git (fixes)
by Ard Biesheuvel <ardb@kernel.org>:
On Thu, 2 Apr 2026 11:09:14 +0200 you wrote:
> The global state sysfb_primary_screen holds information about the
> framebuffer provided by EFI/BIOS systems. It is part of the sysfb
> module, but used in several places without direct connection to
> sysfb. Fix this by making users of sysfb_primary_screen depend on
> CONFIG_SYSFB. Fix a few issues in the process.
>
> Patches 1 and 2 fix general errors in the Kconfig rules. In any case,
> these patches should be considered even without the rest of the series.
>
> [...]
Here is the summary with links:
- [1/8] hv: Select CONFIG_SYSFB only for CONFIG_HYPERV_VMBUS
https://git.kernel.org/riscv/c/d33db956c961
- [2/8] firmware: efi: Never declare sysfb_primary_display on x86
https://git.kernel.org/riscv/c/5241c2ca33bb
- [3/8] firmware: sysfb: Make CONFIG_SYSFB a user-selectable option
(no matching commit)
- [4/8] firmware: sysfb: Split sysfb.c into sysfb_primary.c and sysfb_pci.c
(no matching commit)
- [5/8] firmware: sysfb: Implement screen_info relocation for primary display
(no matching commit)
- [6/8] firmware: sysfb: Avoid forward-declaring sysfb_parent_dev()
(no matching commit)
- [7/8] firmware: efi: Make CONFIG_EFI_EARLYCON depend on CONFIG_SYSFB; clean up
(no matching commit)
- [8/8] firmware: sysfb: Move CONFIG_FIRMWARE_EDID to firmware options
(no matching commit)
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH 00/10] mshv: Bug fixes across the mshv_root module
From: Mukesh R @ 2026-04-30 2:18 UTC (permalink / raw)
To: Stanislav Kinsburskii, kys, haiyangz, wei.liu, decui, longli
Cc: linux-hyperv, linux-kernel
In-Reply-To: <177748522635.144491.1565666089881726479.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On 4/29/26 11:17, Stanislav Kinsburskii wrote:
> This series addresses bugs found during a review of the mshv_root module
> introduced by commit 621191d709b14 ("Drivers: hv: Introduce mshv_root
> module to expose /dev/mshv to VMMs").
>
> The fixes range from data corruption and use-after-free to silent
> functional failures:
>
> - IRQ state leak and type truncation in hypercall helpers
> (hv_call_modify_spa_host_access)
> - Integer overflow on userspace-controlled allocation size
> (mshv_region_create)
> - Missing locking, broken seqcount read protection, and a check on
> uninitialized data in the irqfd path — the latter makes
> level-triggered interrupt resampling completely non-functional
> - Duplicate GSI 0 detection using the wrong predicate
> - Use-after-RCU in port ID lookup
> - Missing VP index bounds check in intercept ISR (OOB in interrupt
> context)
> - Missing error code on VP allocation failure (silent success to
> userspace)
Lot of changes here, curious, how were all these discovered
suddenly? Stress testing, internal/external? Or reported by
copilot/sashiko/etc..
How were the fixes tested?
Thanks,
-Mukesh
> ---
>
> Stanislav Kinsburskii (10):
> mshv: Fix IRQ leak and type hazards in hv_call_modify_spa_host_access
> mshv: Fix potential integer overflow in mshv_region_create
> mshv: Fix missing lock in mshv_irqfd_deassign
> mshv: Fix broken seqcount read protection
> mshv: Fix level-triggered check on uninitialized data
> mshv: Fix duplicate GSI detection for GSI 0
> mshv: Fix use-after-RCU in mshv_portid_lookup
> mshv: Use kfree_rcu in mshv_portid_free
> mshv: Add missing vp_index bounds check in intercept ISR
> mshv: Fix missing error code on VP allocation failure
>
>
> drivers/hv/mshv_eventfd.c | 75 ++++++++++++++++++++++------------------
> drivers/hv/mshv_irq.c | 2 +
> drivers/hv/mshv_portid_table.c | 6 +--
> drivers/hv/mshv_regions.c | 2 +
> drivers/hv/mshv_root_hv_call.c | 18 +++-------
> drivers/hv/mshv_root_main.c | 4 ++
> drivers/hv/mshv_synic.c | 4 ++
> 7 files changed, 59 insertions(+), 52 deletions(-)
>
^ permalink raw reply
* Re: [PATCH v2] mshv: Simplify GPA map/unmap hypercall helpers
From: Mukesh R @ 2026-04-30 2:06 UTC (permalink / raw)
To: Stanislav Kinsburskii, kys, haiyangz, wei.liu, decui, longli
Cc: linux-hyperv, linux-kernel
In-Reply-To: <177748126383.33250.14844440376241852870.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On 4/29/26 09:48, Stanislav Kinsburskii wrote:
> Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
> preceding bug-fix patches:
>
> Move "done += completed" before the status checks so that pages mapped
> by a partially-successful batch are included in the error cleanup unmap.
> Previously these mappings were leaked on failure.
>
> While here, improve type safety and readability:
> - Change "int done" to "u64 done" to match the u64 page_count it is
> compared against, avoiding signed/unsigned comparison hazards.
> - Use u64 for loop iteration and batch size variables consistently.
> - Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
> - Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
> - Simplify the error-path unmap to use "done << large_shift" directly
> instead of mutating done in place.
>
what changed in V2?
> Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
> drivers/hv/mshv_root_hv_call.c | 55 +++++++++++++++-------------------------
> 1 file changed, 20 insertions(+), 35 deletions(-)
>
> diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> index e5992c324904a..1f19a4ca824f0 100644
> --- a/drivers/hv/mshv_root_hv_call.c
> +++ b/drivers/hv/mshv_root_hv_call.c
> @@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> struct hv_input_map_gpa_pages *input_page;
> u64 status, *pfnlist;
> unsigned long irq_flags, large_shift = 0;
> - int ret = 0, done = 0;
> - u64 page_count = page_struct_count;
> + u64 done = 0, page_count = page_struct_count;
> + int ret = 0;
>
> if (page_count == 0 || (pages && mmio_spa))
> return -EINVAL;
> @@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> }
>
> while (done < page_count) {
> - ulong i, completed, remain = page_count - done;
> - int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
> + u64 i, completed, remain = page_count - done;
> + u64 rep_count = min_t(u64, remain, HV_MAP_GPA_BATCH_SIZE);
>
> local_irq_save(irq_flags);
> input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> @@ -224,23 +224,13 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> input_page->map_flags = flags;
> pfnlist = input_page->source_gpa_page_list;
>
> - for (i = 0; i < rep_count; i++)
> - if (flags & HV_MAP_GPA_NO_ACCESS) {
> + for (i = 0; i < rep_count; i++) {
> + if (flags & HV_MAP_GPA_NO_ACCESS)
> pfnlist[i] = 0;
> - } else if (pages) {
> - u64 index = (done + i) << large_shift;
> -
> - if (index >= page_struct_count) {
> - ret = -EINVAL;
> - break;
> - }
> - pfnlist[i] = page_to_pfn(pages[index]);
> - } else {
> + else if (pages)
> + pfnlist[i] = page_to_pfn(pages[(done + i) << large_shift]);
The entire file stays within 80 columns; please don't let this line overflow.
Thanks,
-Mukesh
> + else
> pfnlist[i] = mmio_spa + done + i;
> - }
> - if (ret) {
> - local_irq_restore(irq_flags);
> - break;
> }
>
> status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
> @@ -248,29 +238,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> local_irq_restore(irq_flags);
>
> completed = hv_repcomp(status);
> + done += completed;
>
> if (hv_result_needs_memory(status)) {
> ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> HV_MAP_GPA_DEPOSIT_PAGES);
> if (ret)
> break;
> -
> } else if (!hv_result_success(status)) {
> ret = hv_result_to_errno(status);
> break;
> }
> -
> - done += completed;
> }
>
> if (ret && done) {
> u32 unmap_flags = 0;
>
> - if (flags & HV_MAP_GPA_LARGE_PAGE) {
> + if (flags & HV_MAP_GPA_LARGE_PAGE)
> unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
> - done <<= large_shift;
> - }
> - hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
> + hv_call_unmap_gpa_pages(partition_id, gfn,
> + done << large_shift, unmap_flags);
> }
>
> return ret;
> @@ -305,7 +292,7 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> struct hv_input_unmap_gpa_pages *input_page;
> u64 status, page_count = page_count_4k;
> unsigned long irq_flags, large_shift = 0;
> - int ret = 0, done = 0;
> + u64 done = 0;
>
> if (page_count == 0)
> return -EINVAL;
> @@ -319,8 +306,8 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> }
>
> while (done < page_count) {
> - ulong completed, remain = page_count - done;
> - int rep_count = min(remain, HV_UMAP_GPA_PAGES);
> + u64 completed, remain = page_count - done;
> + u64 rep_count = min_t(u64, remain, HV_UMAP_GPA_PAGES);
>
> local_irq_save(irq_flags);
> input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> @@ -333,15 +320,13 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> local_irq_restore(irq_flags);
>
> completed = hv_repcomp(status);
> - if (!hv_result_success(status)) {
> - ret = hv_result_to_errno(status);
> - break;
> - }
> -
> done += completed;
> +
> + if (!hv_result_success(status))
> + return hv_result_to_errno(status);
> }
>
> - return ret;
> + return 0;
> }
>
> int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
>
>
^ permalink raw reply
* Re: [PATCH] hv_sock: fix ARM64 support
From: patchwork-bot+netdevbpf @ 2026-04-30 0:50 UTC (permalink / raw)
To: Hamza Mahfooz
Cc: netdev, kys, haiyangz, wei.liu, decui, longli, sgarzare, davem,
edumazet, kuba, pabeni, horms, mhklinux, himadrispandya,
linux-hyperv, virtualization, linux-kernel
In-Reply-To: <20260428125339.13963-1-hamzamahfooz@linux.microsoft.com>
Hello:
This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Tue, 28 Apr 2026 08:53:39 -0400 you wrote:
> VMBUS ring buffers must be page aligned. Therefore, the current value of
> 24K presents a challenge on ARM64 kernels (with 64K pages). So, use
> VMBUS_RING_SIZE() to ensure they are always aligned and large enough to
> hold all of the relevant data.
>
> Cc: stable@vger.kernel.org
> Fixes: 77ffe33363c0 ("hv_sock: use HV_HYP_PAGE_SIZE for Hyper-V communication")
> Tested-by: Dexuan Cui <decui@microsoft.com>
> Reviewed-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
>
> [...]
Here is the summary with links:
- hv_sock: fix ARM64 support
https://git.kernel.org/netdev/net/c/b31681206e3f
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* [PATCH v0 3/3] mshv: Implement guest irq migration for passthru'd devices
From: Mukesh R @ 2026-04-29 23:15 UTC (permalink / raw)
To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
linux-pci, linux-arch
In-Reply-To: <20260429231519.2569088-1-mrathor@linux.microsoft.com>
Ask the hypervisor to retarget interrupts to new guest cpu or vector
upon guest irq migration. This happens in the irqfd update path.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
drivers/hv/mshv_eventfd.c | 78 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 76 insertions(+), 2 deletions(-)
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 666e28f4a4b5..0d0f1229f500 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -192,6 +192,77 @@ static int mshv_map_device_interrupt(u64 ptid, union hv_device_id hv_devid,
}
+/* NOTE: caller does spin_lock_irq on pt_irqfds_lock, hence no disable here */
+static void mshv_do_guest_irq_retarget(u64 partid, struct mshv_irqfd *irqfd)
+{
+ int rc, var_size;
+ u64 status;
+ union hv_device_id hv_devid;
+ struct hv_input_get_vp_set_from_mda *mda_input;
+ union hv_output_get_vp_set_from_mda *mda_output;
+ struct hv_retarget_device_interrupt *remap_inp;
+ struct pci_dev *pdev;
+ struct irq_data *irqdata;
+ struct mshv_lapic_irq *lapic_irq = &irqfd->irqfd_lapic_irq;
+ struct hv_interrupt_entry *inte = NULL;
+
+ if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+ irqfd->irqfd_bypass_prod == NULL)
+ return;
+
+ rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+ if (rc)
+ return;
+
+ inte = irqdata->chip_data;
+ if (inte == NULL)
+ return;
+
+ hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+
+
+ mda_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ mda_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ rc = hv_vpset_from_hyp_disabled(mda_input, mda_output, lapic_irq,
+ partid);
+ if (rc)
+ return;
+
+ remap_inp = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(remap_inp, 0, sizeof(*remap_inp));
+
+ rc = hv_copy_vpset(&remap_inp->int_target.vp_set,
+ &mda_output->target_vpset);
+ if (rc <= 0) {
+ pr_err("Hyper-V: ptid %lld - vpset copy failed (%d)\n",
+ partid, rc);
+ return;
+ }
+
+ /*
+ * var-sized hcall: var-size starts after vp_mask (thus vp_set.format
+ * does not count, but vp_set.valid_bank_mask does).
+ */
+ var_size = rc + 1;
+
+ remap_inp->partition_id = partid;
+ remap_inp->device_id = hv_devid.as_uint64;
+ remap_inp->int_target.vector = lapic_irq->lapic_vector;
+ remap_inp->int_target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+
+ remap_inp->int_entry.source = inte->source;
+ remap_inp->int_entry.msi_entry.as_uint64 = inte->msi_entry.as_uint64;
+
+ status = hv_do_rep_hypercall(HVCALL_RETARGET_INTERRUPT, 0, var_size,
+ remap_inp, NULL);
+
+ if (!hv_result_success(status))
+ hv_status_err(status, "pt:%lld vec:%d lapic-id:%lld\n",
+ partid, lapic_irq->lapic_vector,
+ lapic_irq->lapic_apic_id);
+}
+
static int mshv_unmap_device_interrupt(union hv_device_id hv_devid,
struct hv_interrupt_entry *irq_entry)
{
@@ -728,9 +799,12 @@ static void mshv_irqfd_update(struct mshv_partition *pt,
struct mshv_irqfd *irqfd)
{
write_seqcount_begin(&irqfd->irqfd_irqe_sc);
- irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
- irqfd->irqfd_irqnum);
+ irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt, irqfd->irqfd_irqnum);
mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
+
+#if IS_ENABLED(CONFIG_X86_64)
+ mshv_do_guest_irq_retarget(pt->pt_id, irqfd);
+#endif
write_seqcount_end(&irqfd->irqfd_irqe_sc);
}
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH v0 2/3] hyperv: Implement irq remap for passthru devices
From: Mukesh R @ 2026-04-29 23:15 UTC (permalink / raw)
To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
linux-pci, linux-arch
In-Reply-To: <20260429231519.2569088-1-mrathor@linux.microsoft.com>
Implement interrupt remapping for direct attached and domain attached
devices on Hyper-V.
Please note there are a few constraints when it comes to mapping device
interrupts on Hyper-V. For example, the hypervisor will not allow mapping
device interrupts to root if the device is a direct attached device. Since
the target guest cpu and vector info is not available during the initial
VFIO irq setup, we work around this by skipping the initial map. Then later,
during irqbypass trigger, when both the guest target cpu and vector are
available, we do the map in the hypervisor, update the device, and enable
the interrupt vector on the device. Rather than special-casing direct
attached devices, we do the same for domain attached devices also. This
implies irqbypass is required for MSHV pci device passthru. Also noteworthy
is that the hypervisor will automatically set up any direct hw injection
like posted interrupts.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
arch/x86/hyperv/irqdomain.c | 18 +-
drivers/hv/mshv_eventfd.c | 422 +++++++++++++++++++++++++++-
drivers/iommu/hyperv-iommu-root.c | 14 +
drivers/pci/controller/pci-hyperv.c | 10 +
include/asm-generic/mshyperv.h | 4 +
5 files changed, 464 insertions(+), 4 deletions(-)
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 527835b99a70..d32e912ad4a9 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -222,7 +222,7 @@ int hv_map_msi_interrupt(struct irq_data *data,
msidesc = irq_data_get_msi_desc(data);
pdev = msi_desc_to_pci_dev(msidesc);
- hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
+ hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
return hv_map_interrupt(hv_current_partition_id, hv_devid, false, cpu,
@@ -258,6 +258,20 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return;
}
+ /*
+ * For direct attached devices, we cannot map interrupts in the
+ * hypervisor because it will not allow it until we have guest target
+ * vcpu and vector. So defer it until irqbypass. Also, do the same
+ * for domain attached devices for simplicity.
+ */
+ if (hv_pcidev_is_pthru_dev(pdev)) {
+ if (data->chip_data)
+ entry_to_msi_msg(data->chip_data, msg);
+ else
+ memset(msg, 0, sizeof(struct msi_msg));
+ return;
+ }
+
if (data->chip_data) {
/*
* This interrupt is already mapped. Let's unmap first.
@@ -297,7 +311,7 @@ static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
{
union hv_device_id hv_devid;
- hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
+ hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
}
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 90959f639dc3..666e28f4a4b5 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -7,7 +7,6 @@
*
* All credits to kvm developers.
*/
-
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
@@ -15,7 +14,8 @@
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/eventfd.h>
-
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
#if IS_ENABLED(CONFIG_X86_64)
#include <asm/apic.h>
#endif
@@ -27,6 +27,376 @@
static struct workqueue_struct *irqfd_cleanup_wq;
+#if IS_ENABLED(CONFIG_X86_64)
+
+static int mshv_parse_mshv_irqfd(struct mshv_irqfd *irqfd,
+ struct pci_dev **out_pdev,
+ struct irq_data **out_irqdata)
+{
+ struct irq_bypass_producer *prod;
+ struct msi_desc *msidesc;
+ struct irq_data *irqdata;
+
+ if (irqfd == NULL || irqfd->irqfd_bypass_prod == NULL)
+ return -ENODEV;
+
+ prod = irqfd->irqfd_bypass_prod;
+
+ irqdata = irq_get_irq_data(prod->irq);
+ if (irqdata == NULL) {
+ pr_err("Hyper-V: irqbypass fail, no irqdata. irq:0x%x\n",
+ prod->irq);
+ return -EINVAL;
+ }
+ *out_irqdata = irqdata;
+
+ msidesc = irq_data_get_msi_desc(irqdata);
+ if (msidesc == NULL) {
+ pr_err("Hyper-V: irqbypass msi fail. irq:0x%x\n", prod->irq);
+ return -EINVAL;
+ }
+
+ *out_pdev = msi_desc_to_pci_dev(msidesc);
+ if (*out_pdev == NULL) {
+ pr_err("Hyper-V: mshv_irqfd parse fail. irq:0x%x\n", prod->irq);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Must be called with interrupts disabled */
+static int hv_vpset_from_hyp_disabled(
+ struct hv_input_get_vp_set_from_mda *input,
+ union hv_output_get_vp_set_from_mda *output,
+ struct mshv_lapic_irq *lapic_irq, u64 partid)
+{
+ u64 status;
+
+ memset(input, 0, sizeof(*input));
+ input->target_partid = partid;
+ input->dest_address = lapic_irq->lapic_apic_id;
+ input->input_vtl = 0;
+ input->destmode_logical = lapic_irq->lapic_control.logical_dest_mode;
+
+ status = hv_do_hypercall(HVCALL_GET_VPSET_FROM_MDA, input, output);
+ if (!hv_result_success(status)) {
+ hv_status_err(status, "apicid:0x%llx dest:0x%x\n",
+ lapic_irq->lapic_apic_id,
+ lapic_irq->lapic_control.logical_dest_mode);
+ }
+
+ return hv_result_to_errno(status);
+}
+
+/* Returns number of banks copied, -errno in case of error */
+static int hv_copy_vpset(struct hv_vpset *dest, struct hv_vpset *src)
+{
+ u64 bank_mask;
+ int banks, tot_banks = hv_max_vp_index / HV_VCPUS_PER_SPARSE_BANK;
+
+ if (tot_banks >= HV_MAX_SPARSE_VCPU_BANKS)
+ return -EINVAL;
+
+ dest->format = src->format;
+ dest->valid_bank_mask = src->valid_bank_mask;
+ bank_mask = src->valid_bank_mask;
+ for (banks = 0; banks <= tot_banks; banks++) {
+ if (bank_mask == 0)
+ break;
+
+ if (bank_mask & 1)
+ dest->bank_contents[banks] = src->bank_contents[banks];
+ bank_mask = bank_mask >> 1;
+ }
+
+ return banks;
+}
+
+static int mshv_map_device_interrupt(u64 ptid, union hv_device_id hv_devid,
+ struct mshv_lapic_irq *ginfo,
+ struct hv_interrupt_entry *ret_entry,
+ u64 *ret_status)
+{
+ struct hv_input_map_device_interrupt *irq_input;
+ struct hv_output_map_device_interrupt *irq_output;
+ struct hv_device_interrupt_descriptor *intdesc;
+ struct hv_input_get_vp_set_from_mda *mda_input;
+ union hv_output_get_vp_set_from_mda *mda_output;
+ ulong flags;
+ u64 status;
+ int rc, var_size;
+
+ *ret_status = U64_MAX;
+ local_irq_save(flags);
+
+ mda_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ mda_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ /*
+ * Map Device Interrupt hcall needs vp set based on vp indexes used
+ * during vp creation. Here we have lapic-id of the vp only. Easiest
+ * is to just ask the hypervisor for the vp set matching the lapic-id.
+ */
+ rc = hv_vpset_from_hyp_disabled(mda_input, mda_output, ginfo, ptid);
+ if (rc)
+ goto out; /* error already printed */
+
+ irq_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ irq_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+ memset(irq_input, 0, sizeof(*irq_input));
+
+ irq_input->partition_id = ptid;
+ irq_input->device_id = hv_devid.as_uint64;
+
+ intdesc = &irq_input->interrupt_descriptor;
+ intdesc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
+ intdesc->vector_count = 1;
+ intdesc->target.vector = ginfo->lapic_vector;
+ intdesc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
+
+ intdesc->target.vp_set.valid_bank_mask = 0;
+ intdesc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+ intdesc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+ rc = hv_copy_vpset(&intdesc->target.vp_set, &mda_output->target_vpset);
+ if (rc <= 0) {
+ pr_err("Hyper-V: ptid %lld - (irq)vpset copy failed (%d)\n",
+ ptid, rc);
+ goto out;
+ }
+
+ /*
+ * var-sized hcall: var-size starts after vp_mask (thus vp_set.format
+ * does not count, but vp_set.valid_bank_mask does).
+ */
+ var_size = rc + 1;
+ status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
+ irq_input, irq_output);
+ *ret_entry = irq_output->interrupt_entry;
+ local_irq_restore(flags);
+
+ rc = 0;
+ if (!hv_result_success(status)) {
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+ hv_status_err(status, "pt:%lld vec:%d lapic-id:%lld\n",
+ ptid, ginfo->lapic_vector, ginfo->lapic_apic_id);
+ *ret_status = status;
+ rc = hv_result_to_errno(status);
+ }
+
+ return rc;
+
+out:
+ local_irq_restore(flags);
+ return rc;
+
+}
+
+static int mshv_unmap_device_interrupt(union hv_device_id hv_devid,
+ struct hv_interrupt_entry *irq_entry)
+{
+ unsigned long flags;
+ struct hv_input_unmap_device_interrupt *input;
+ u64 status;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+
+ if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
+ input->partition_id = hv_get_current_partid();
+ else
+ input->partition_id = hv_current_partition_id;
+
+ input->device_id = hv_devid.as_uint64;
+ input->interrupt_entry = *irq_entry;
+
+ status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status))
+ hv_status_err(status, "\n");
+
+ return hv_result_to_errno(status);
+}
+
+static int mshv_chk_unmap_irq(union hv_device_id hv_devid,
+ struct irq_data *irqdata)
+{
+ int rc;
+
+ if (irqdata->chip_data == NULL)
+ return 0;
+
+ rc = mshv_unmap_device_interrupt(hv_devid, irqdata->chip_data);
+ if (rc)
+ return rc;
+
+ kfree(irqdata->chip_data);
+ irqdata->chip_data = NULL;
+
+ return 0;
+}
+
+/*
+ * Synchronize device update with VFIO.
+ * See: vfio_pci_memory_lock_and_enable()
+ */
+static u16 mshv_pci_memory_lock_and_enable(struct vfio_pci_core_device *cdev)
+{
+ u16 cmd;
+
+ down_write(&cdev->memory_lock);
+ pci_read_config_word(cdev->pdev, PCI_COMMAND, &cmd);
+ if (!(cmd & PCI_COMMAND_MEMORY))
+ pci_write_config_word(cdev->pdev, PCI_COMMAND,
+ cmd | PCI_COMMAND_MEMORY);
+ return cmd;
+}
+
+static void mshv_pci_memory_unlock_and_restore(
+ struct vfio_pci_core_device *cdev,
+ u16 cmd)
+{
+ pci_write_config_word(cdev->pdev, PCI_COMMAND, cmd);
+ up_write(&cdev->memory_lock);
+}
+
+static void mshv_make_device_usable(struct pci_dev *pdev, int vector,
+ struct hv_interrupt_entry *hv_entry)
+{
+ int lirq;
+ struct msi_msg msimsg;
+ struct irq_data *irqdata;
+ u16 pcicmd;
+ struct vfio_pci_core_device *coredev = dev_get_drvdata(&pdev->dev);
+
+ if (pdev->dev.driver == NULL ||
+ strcmp(pdev->dev.driver->name, "vfio-pci") != 0) {
+ pr_err("Hyper-V: irqbypass: non vfio device %s\n",
+ pci_name(pdev));
+ return;
+ }
+ if (coredev == NULL) {
+ pr_err("Hyper-V: irqbypass: null vfio device for %s\n",
+ pci_name(pdev));
+ return;
+ }
+
+ if (hv_entry->source != HV_INTERRUPT_SOURCE_MSI) {
+ pr_err("Hyper-V: %s irq source not msi\n", pci_name(pdev));
+ return;
+ }
+
+ lirq = pci_irq_vector(pdev, vector);
+ irqdata = irq_get_irq_data(lirq);
+ if (irqdata == NULL) {
+ pr_err("Hyper-V: null irq_data for write msimsg. lirq:0x%x\n",
+ lirq);
+ return;
+ }
+
+ msimsg.address_hi = 0;
+ msimsg.address_lo = hv_entry->msi_entry.address.as_uint32;
+ msimsg.data = hv_entry->msi_entry.data.as_uint32;
+
+ pcicmd = mshv_pci_memory_lock_and_enable(coredev);
+ pci_write_msi_msg(lirq, &msimsg);
+ mshv_pci_memory_unlock_and_restore(coredev, pcicmd);
+
+ pci_msi_unmask_irq(irqdata);
+
+ if (irqdata->parent_data)
+ irq_chip_unmask_parent(irqdata);
+}
+
+/*
+ * This guest has a device passthru'd to it. VFIO did the initial setup of
+ * the device interrupts, but we left them unmapped in the hypervisor
+ * because we didn't have the guest target cpu and vector (required by
+ * hypervisor). We have them now, so do the map hypercall.
+ * Also, when here, it is expected that the device global mask is unset
+ * but individual MSI/x masks are set. Goal here is to map the interrupt in
+ * the hypervisor, update the corresponding device MSI/x entry, and enable it.
+ */
+static void mshv_pthru_dev_irq_remap(struct mshv_irqfd *irqfd)
+{
+ u64 ptid, status;
+ struct pci_dev *pdev;
+ int rc, deposit_pgs = 16;
+ struct mshv_lapic_irq *ginfo = &irqfd->irqfd_lapic_irq;
+ union hv_device_id hv_devid;
+ struct hv_interrupt_entry *new_entry;
+ struct irq_data *irqdata;
+
+ if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+ irqfd->irqfd_bypass_prod == NULL)
+ return;
+
+ rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+ if (rc)
+ return;
+
+ hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+
+ rc = mshv_chk_unmap_irq(hv_devid, irqdata);
+ if (rc)
+ return;
+
+ new_entry = kmalloc(sizeof(*new_entry), GFP_ATOMIC);
+ if (new_entry == NULL)
+ return;
+
+ ptid = irqfd->irqfd_partn->pt_id;
+
+ while (deposit_pgs--) {
+ rc = mshv_map_device_interrupt(ptid, hv_devid, ginfo, new_entry,
+ &status);
+ if (rc == 0)
+ break;
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+ break;
+
+ rc = hv_call_deposit_pages(NUMA_NO_NODE, ptid, 1);
+ if (rc)
+ break;
+ }
+ if (rc) {
+ kfree(new_entry);
+ return;
+ }
+
+ irqdata->chip_data = new_entry;
+
+ mshv_make_device_usable(pdev, irqdata->hwirq, new_entry);
+}
+
+static void mshv_pthru_dev_irq_undo(struct mshv_irqfd *irqfd)
+{
+ struct pci_dev *pdev;
+ union hv_device_id hv_devid;
+ struct irq_data *irqdata;
+ int rc;
+
+ if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+ irqfd->irqfd_bypass_prod == NULL)
+ return;
+
+ rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+ if (rc)
+ return;
+
+ hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+ mshv_chk_unmap_irq(hv_devid, irqdata);
+}
+
+#else /* IS_ENABLED(CONFIG_X86_64) */
+
+static void mshv_pthru_dev_irq_remap(struct mshv_irqfd *irqfd) { }
+static void mshv_pthru_dev_irq_undo(struct mshv_irqfd *irqfd) { }
+
+#endif /* IS_ENABLED(CONFIG_X86_64) */
+
void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
struct mshv_irq_ack_notifier *mian)
{
@@ -264,6 +634,7 @@ static void mshv_irqfd_shutdown(struct work_struct *work)
/*
* It is now safe to release the object's resources
*/
+ irq_bypass_unregister_consumer(&irqfd->irqfd_bypass_cons);
eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
kfree(irqfd);
}
@@ -286,6 +657,12 @@ static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
hlist_del(&irqfd->irqfd_hnode);
+ /*
+ * Cleanup interrupt map (kfree chip_data) while in a VMM thread as
+ * unmap needs partition id. mshv_irqfd_shutdown() runs in a kthread.
+ */
+ mshv_pthru_dev_irq_undo(irqfd);
+
queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
}
@@ -383,6 +760,45 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
}
+static int mshv_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct mshv_irqfd *irqfd;
+
+ irqfd = container_of(cons, struct mshv_irqfd, irqfd_bypass_cons);
+ irqfd->irqfd_bypass_prod = prod;
+
+ mshv_pthru_dev_irq_remap(irqfd);
+
+ return 0;
+}
+
+static void mshv_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct mshv_irqfd *irqfd;
+
+ irqfd = container_of(cons, struct mshv_irqfd, irqfd_bypass_cons);
+
+ WARN_ON(irqfd->irqfd_bypass_prod != prod);
+ irqfd->irqfd_bypass_prod = NULL;
+
+}
+
+static void mshv_setup_irq_bypass(struct mshv_irqfd *irqfd,
+ struct eventfd_ctx *eventfd)
+{
+ struct irq_bypass_consumer *consumer = &irqfd->irqfd_bypass_cons;
+ int rc;
+
+ consumer->add_producer = mshv_irq_bypass_add_producer;
+ consumer->del_producer = mshv_irq_bypass_del_producer;
+ rc = irq_bypass_register_consumer(&irqfd->irqfd_bypass_cons, eventfd);
+ if (rc)
+ pr_err("Hyper-V: irq bypass consumer registration failed: %d\n",
+ rc);
+}
+
static int mshv_irqfd_assign(struct mshv_partition *pt,
struct mshv_user_irqfd *args)
{
@@ -509,6 +925,8 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
if (events & EPOLLIN)
mshv_assert_irq_slow(irqfd);
+ mshv_setup_irq_bypass(irqfd, eventfd);
+
srcu_read_unlock(&pt->pt_irq_srcu, idx);
return 0;
diff --git a/drivers/iommu/hyperv-iommu-root.c b/drivers/iommu/hyperv-iommu-root.c
index 739bbf39dea2..3e078e9213f9 100644
--- a/drivers/iommu/hyperv-iommu-root.c
+++ b/drivers/iommu/hyperv-iommu-root.c
@@ -219,6 +219,20 @@ u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type)
}
EXPORT_SYMBOL_GPL(hv_build_devid_oftype);
+/*
+ * Build device id for the interrupt path.
+ *
+ * The id is produced by hv_build_devid_oftype(), using
+ * HV_DEVICE_TYPE_LOGICAL when hv_pcidev_is_attached_dev() reports true
+ * for @pdev and HV_DEVICE_TYPE_PCI otherwise.
+ */
+u64 hv_devid_from_pdev(struct pci_dev *pdev)
+{
+ enum hv_device_type dev_type;
+
+ if (hv_pcidev_is_attached_dev(pdev))
+ dev_type = HV_DEVICE_TYPE_LOGICAL;
+ else
+ dev_type = HV_DEVICE_TYPE_PCI;
+
+ return hv_build_devid_oftype(pdev, dev_type);
+}
+EXPORT_SYMBOL_GPL(hv_devid_from_pdev);
+
/* Create a new device domain in the hypervisor */
static int hv_iommu_create_hyp_devdom(struct hv_domain *hvdom)
{
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 8f6b818ee09b..8ecc909c3415 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -1745,6 +1745,16 @@ static void hv_irq_mask(struct irq_data *data)
static void hv_irq_unmask(struct irq_data *data)
{
+ struct pci_dev *pdev;
+ struct msi_desc *msi_desc;
+
+ msi_desc = irq_data_get_msi_desc(data);
+ pdev = msi_desc_to_pci_dev(msi_desc);
+
+ /* Done during bypass setup in mshv_eventfd.c: mshv_irqfd_assign() */
+ if (hv_pcidev_is_pthru_dev(pdev))
+ return;
+
hv_arch_irq_unmask(data);
if (data->parent_data->chip->irq_unmask)
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index edbcfc2a9b60..887605aa9c95 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -341,6 +341,7 @@ u64 hv_get_current_partid(void);
bool hv_pcidev_is_attached_dev(struct pci_dev *pdev);
bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev);
u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type);
+u64 hv_devid_from_pdev(struct pci_dev *pdev);
#else /* Remove following after arm64 implementation is done */
@@ -354,6 +355,9 @@ static inline u64 hv_build_devid_oftype(struct pci_dev *pdev,
enum hv_device_type type)
{ return 0; }
+static inline u64 hv_devid_from_pdev(struct pci_dev *pdev)
+{ return 0; } /* stub in the #else branch: ignores @pdev and always yields 0 */
+
static inline u64 hv_get_current_partid(void)
{ return HV_PARTITION_ID_INVALID; }
#endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH v0 1/3] mshv: Import declarations for irq remap and add irqbypass support
From: Mukesh R @ 2026-04-29 23:15 UTC (permalink / raw)
To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
linux-pci, linux-arch
In-Reply-To: <20260429231519.2569088-1-mrathor@linux.microsoft.com>
For the irq map/remap hypercalls, copy relevant data structures from
hypervisor public headers into Linux equivalents. Also, update Kconfig and
mshv_irqfd for irqbypass. Please note, irqbypass is required for doing
passthru on MSHV. This is because there is really no way of knowing the Linux
irq in the mshv_irqfd_assign and mshv_irqfd_update paths without it. The
Linux irq is set up upfront by VFIO before irqfd assign/update happens.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
drivers/hv/Kconfig | 1 +
drivers/hv/mshv_eventfd.h | 3 +++
include/hyperv/hvgdk_mini.h | 3 +++
include/hyperv/hvhdk.h | 17 +++++++++++++++++
4 files changed, 24 insertions(+)
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 7937ac0cbd0f..c831fe25ca2b 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -75,6 +75,7 @@ config MSHV_ROOT
# no particular order, making it impossible to reassemble larger pages
depends on PAGE_SIZE_4KB
select EVENTFD
+ select IRQ_BYPASS_MANAGER
select VIRT_XFER_TO_GUEST_WORK
select HMM_MIRROR
select MMU_NOTIFIER
diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h
index 464c6b81ab33..ff4dd24b8ad4 100644
--- a/drivers/hv/mshv_eventfd.h
+++ b/drivers/hv/mshv_eventfd.h
@@ -9,6 +9,7 @@
#define __LINUX_MSHV_EVENTFD_H
#include <linux/poll.h>
+#include <linux/irqbypass.h>
#include "mshv.h"
#include "mshv_root.h"
@@ -37,6 +38,8 @@ struct mshv_irqfd {
struct mshv_irqfd_resampler *irqfd_resampler;
struct eventfd_ctx *irqfd_resamplefd;
struct hlist_node irqfd_resampler_hnode;
+ struct irq_bypass_consumer irqfd_bypass_cons;
+ struct irq_bypass_producer *irqfd_bypass_prod;
};
void mshv_eventfd_init(struct mshv_partition *partition);
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index da622fb06440..1ef480825705 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -59,6 +59,8 @@ struct hv_u128 {
#define HV_PARTITION_ID_INVALID ((u64)0)
#define HV_PARTITION_ID_SELF ((u64)-1)
+#define HV_MAX_VPS 256 /* HV_MAXIMUM_PROCESSORS */
+
/* Hyper-V specific model specific registers (MSRs) */
#if defined(CONFIG_X86)
@@ -508,6 +510,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
#define HVCALL_UNMAP_VP_STATE_PAGE 0x00e2
#define HVCALL_GET_VP_STATE 0x00e3
#define HVCALL_SET_VP_STATE 0x00e4
+#define HVCALL_GET_VPSET_FROM_MDA 0x00e5
#define HVCALL_GET_VP_CPUID_VALUES 0x00f4
#define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101
#define HVCALL_MMIO_READ 0x0106
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index 5e83d3714966..d0a892347ab1 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -952,4 +952,21 @@ struct hv_input_modify_sparse_spa_page_host_access {
#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE 0x4
#define HV_MODIFY_SPA_PAGE_HOST_ACCESS_HUGE_PAGE 0x8
+#ifdef CONFIG_X86
+
+struct hv_input_get_vp_set_from_mda { /* HV_INPUT_GET_VP_SET_FROM_MDA */
+ u64 target_partid;
+ u64 dest_address;
+ u8 input_vtl;
+ u8 destmode_logical; /* true => mode is logical */
+ u16 reserved0; /* mbz (must be zero) */
+ u32 reserved1; /* mbz (must be zero) */
+} __packed;
+
+union hv_output_get_vp_set_from_mda { /* HV_OUTPUT_GET_VP_SET_FROM_MDA */
+ struct hv_vpset target_vpset;
+ u64 bitset_buffer[HV_GENERIC_SET_QWORD_COUNT(HV_MAX_VPS)]; /* overlays target_vpset; length derived from HV_MAX_VPS */
+} __packed;
+
+#endif /* CONFIG_X86 */
#endif /* _HV_HVHDK_H */
--
2.51.2.vfs.0.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox