* [PATCH V2 08/11] PCI: hv: Build device id for a VMBus device, export PCI devid function
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
On Hyper-V, most hypercalls related to PCI passthru to map/unmap regions,
interrupts, etc need a device ID as a parameter. This device ID refers
to that specific device during the lifetime of passthru.
An L1VH VM only contains VMBus based devices. A device ID for a VMBus
device is slightly different in that it uses the hv_pcibus_device info
for building it to make sure it matches exactly what the hypervisor
expects. This VMBus based device ID is needed when attaching devices in
an L1VH based guest VM. Before building it, a check is done to make sure
the device is a valid VMBus device.
In the remaining cases the PCI device ID is used, so also make the PCI
device ID build function, hv_build_devid_type_pci(), public.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
arch/x86/hyperv/irqdomain.c | 9 +++++----
arch/x86/include/asm/mshyperv.h | 6 ++++++
drivers/pci/controller/pci-hyperv.c | 24 ++++++++++++++++++++++++
include/asm-generic/mshyperv.h | 8 ++++++++
4 files changed, 43 insertions(+), 4 deletions(-)
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index b3ad50a874dc..8780573a4332 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -112,7 +112,7 @@ static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
return 0;
}
-static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
+u64 hv_build_devid_type_pci(struct pci_dev *pdev)
{
int pos;
union hv_device_id hv_devid;
@@ -172,8 +172,9 @@ static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
}
out:
- return hv_devid;
+ return hv_devid.as_uint64;
}
+EXPORT_SYMBOL_GPL(hv_build_devid_type_pci);
/*
* hv_map_msi_interrupt() - Map the MSI IRQ in the hypervisor.
@@ -196,7 +197,7 @@ int hv_map_msi_interrupt(struct irq_data *data,
msidesc = irq_data_get_msi_desc(data);
pdev = msi_desc_to_pci_dev(msidesc);
- hv_devid = hv_build_devid_type_pci(pdev);
+ hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
return hv_map_interrupt(hv_devid, false, cpu, cfg->vector,
@@ -271,7 +272,7 @@ static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
{
union hv_device_id hv_devid;
- hv_devid = hv_build_devid_type_pci(pdev);
+ hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
}
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index f64393e853ee..9d24cafed657 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -271,6 +271,12 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
#endif /* CONFIG_HYPERV */
+/*
+ * Build the hypervisor device ID for a PCI device. Without
+ * CONFIG_HYPERV_IOMMU a stub returning 0 is provided; the stub must be
+ * static inline, otherwise every file including this header would emit
+ * its own external definition of the symbol.
+ *
+ * NOTE(review): the real definition lives in arch/x86/hyperv/irqdomain.c;
+ * confirm that file is only built when CONFIG_HYPERV_IOMMU is enabled,
+ * otherwise the stub and the real definition can both be visible.
+ */
+#if IS_ENABLED(CONFIG_HYPERV_IOMMU)
+u64 hv_build_devid_type_pci(struct pci_dev *pdev);
+#else
+static inline u64 hv_build_devid_type_pci(struct pci_dev *pdev)
+{
+	return 0;
+}
+#endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
+
struct mshv_vtl_cpu_context {
union {
struct {
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index cfc8fa403dad..50d793ca8f31 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -573,6 +573,7 @@ struct hv_pci_compl {
};
static void hv_pci_onchannelcallback(void *context);
+static bool hv_vmbus_pci_device(struct pci_bus *pbus);
#ifdef CONFIG_X86
#define DELIVERY_MODE APIC_DELIVERY_MODE_FIXED
@@ -1005,6 +1006,24 @@ static struct irq_domain *hv_pci_get_root_domain(void)
static void hv_arch_irq_unmask(struct irq_data *data) { }
#endif /* CONFIG_ARM64 */
+/**
+ * hv_pci_vmbus_device_id() - Build the device ID of a VMBus based PCI device
+ * @pdev: PCI device to build the ID for
+ *
+ * The ID is assembled from bytes 4..7 of the VMBus device instance GUID of
+ * the hv_pcibus_device that owns the bus of @pdev, combined with the PCI
+ * function number of @pdev.
+ *
+ * Return: the device ID, or 0 if the bus of @pdev does not use
+ * hv_pcifront_ops (i.e. hv_vmbus_pci_device() rejects it).
+ */
+u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
+{
+	struct pci_bus *pbus = pdev->bus;
+	struct hv_pcibus_device *hbus;
+	const u8 *guid;
+
+	if (!hv_vmbus_pci_device(pbus))
+		return 0;
+
+	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
+	guid = hbus->hdev->dev_instance.b;
+
+	/*
+	 * Widen each byte before shifting: a u8 promotes to int, so
+	 * "b[5] << 24" would set the sign bit for values >= 0x80 and then
+	 * sign-extend into the upper 32 bits of the returned u64.
+	 */
+	return ((u64)guid[5] << 24) |
+	       ((u64)guid[4] << 16) |
+	       ((u64)guid[7] << 8) |
+	       (guid[6] & 0xf8) |
+	       PCI_FUNC(pdev->devfn);
+}
+EXPORT_SYMBOL_GPL(hv_pci_vmbus_device_id);
+
/**
* hv_pci_generic_compl() - Invoked for a completion packet
* @context: Set up by the sender of the packet.
@@ -1403,6 +1422,11 @@ static struct pci_ops hv_pcifront_ops = {
.write = hv_pcifront_write_config,
};
+/* Tell whether @pbus uses hv_pcifront_ops as its pci_ops. */
+static bool hv_vmbus_pci_device(struct pci_bus *pbus)
+{
+	const struct pci_ops *bus_ops = pbus->ops;
+
+	return bus_ops == &hv_pcifront_ops;
+}
+
/*
* Paravirtual backchannel
*
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index e8cbc4e3f7ad..a6878ab685e7 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -23,6 +23,7 @@
#include <acpi/acpi_numa.h>
#include <linux/cpumask.h>
#include <linux/nmi.h>
+#include <linux/pci.h>
#include <asm/ptrace.h>
#include <hyperv/hvhdk.h>
@@ -329,6 +330,13 @@ static inline enum hv_isolation_type hv_get_isolation_type(void)
}
#endif /* CONFIG_HYPERV */
+/*
+ * hv_pci_vmbus_device_id() is declared when CONFIG_PCI_HYPERV is enabled;
+ * otherwise callers get an inline stub that always returns 0.
+ */
+#if IS_ENABLED(CONFIG_PCI_HYPERV)
+u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
+#else
+static inline u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
+{ return 0; }
+#endif /* IS_ENABLED(CONFIG_PCI_HYPERV) */
+
#if IS_ENABLED(CONFIG_MSHV_ROOT)
static inline bool hv_root_partition(void)
{
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 07/11] mshv: Import data structs around device passthru from hyperv headers
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
Copy/import from Hyper-V public headers, definitions and declarations that
are related to attaching and detaching of device domains, and building
device ids for those purposes.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
include/hyperv/hvgdk_mini.h | 11 ++++
include/hyperv/hvhdk_mini.h | 112 ++++++++++++++++++++++++++++++++++++
2 files changed, 123 insertions(+)
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index 6a4e8b9d570f..da622fb06440 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -326,6 +326,9 @@ union hv_hypervisor_version_info {
/* stimer Direct Mode is available */
#define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19)
+#define HV_DEVICE_DOMAIN_AVAILABLE BIT(24)
+#define HV_S1_DEVICE_DOMAIN_AVAILABLE BIT(25)
+
/*
* Implementation recommendations. Indicates which behaviors the hypervisor
* recommends the OS implement for optimal performance.
@@ -475,6 +478,8 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
#define HVCALL_MAP_DEVICE_INTERRUPT 0x007c
#define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d
#define HVCALL_RETARGET_INTERRUPT 0x007e
+#define HVCALL_ATTACH_DEVICE 0x0082
+#define HVCALL_DETACH_DEVICE 0x0083
#define HVCALL_NOTIFY_PARTITION_EVENT 0x0087
#define HVCALL_ENTER_SLEEP_STATE 0x0084
#define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b
@@ -486,9 +491,15 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */
#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0
+#define HVCALL_CREATE_DEVICE_DOMAIN 0x00b1
+#define HVCALL_ATTACH_DEVICE_DOMAIN 0x00b2
+#define HVCALL_MAP_DEVICE_GPA_PAGES 0x00b3
+#define HVCALL_UNMAP_DEVICE_GPA_PAGES 0x00b4
#define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0
#define HVCALL_POST_MESSAGE_DIRECT 0x00c1
#define HVCALL_DISPATCH_VP 0x00c2
+#define HVCALL_DETACH_DEVICE_DOMAIN 0x00c4
+#define HVCALL_DELETE_DEVICE_DOMAIN 0x00c5
#define HVCALL_GET_GPA_PAGES_ACCESS_STATES 0x00c9
#define HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d7
#define HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS 0x00d8
diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
index b4cb2fa26e9b..60425052a799 100644
--- a/include/hyperv/hvhdk_mini.h
+++ b/include/hyperv/hvhdk_mini.h
@@ -468,6 +468,32 @@ struct hv_send_ipi_ex { /* HV_INPUT_SEND_SYNTHETIC_CLUSTER_IPI_EX */
struct hv_vpset vp_set;
} __packed;
+union hv_attdev_flags { /* HV_ATTACH_DEVICE_FLAGS */
+ struct {
+ u32 logical_id : 1;
+ u32 resvd0 : 1;
+ u32 ats_enabled : 1;
+ u32 virt_func : 1;
+ u32 shared_irq_child : 1;
+ u32 virt_dev : 1;
+ u32 ats_supported : 1;
+ u32 small_irt : 1;
+ u32 resvd : 24;
+ } __packed;
+ u32 as_uint32;
+};
+
+union hv_dev_pci_caps { /* HV_DEVICE_PCI_CAPABILITIES */
+ struct {
+ u32 max_pasid_width : 5;
+ u32 invalidate_qdepth : 5;
+ u32 global_inval : 1;
+ u32 prg_response_req : 1;
+ u32 resvd : 20;
+ } __packed;
+ u32 as_uint32;
+};
+
typedef u16 hv_pci_rid; /* HV_PCI_RID */
typedef u16 hv_pci_segment; /* HV_PCI_SEGMENT */
typedef u64 hv_logical_device_id;
@@ -547,4 +573,90 @@ union hv_device_id { /* HV_DEVICE_ID */
} acpi;
} __packed;
+struct hv_input_attach_device { /* HV_INPUT_ATTACH_DEVICE */
+ u64 partition_id;
+ union hv_device_id device_id;
+ union hv_attdev_flags attdev_flags;
+ u8 attdev_vtl;
+ u8 rsvd0;
+ u16 rsvd1;
+ u64 logical_devid;
+ union hv_dev_pci_caps dev_pcicaps;
+ u16 pf_pci_rid;
+ u16 resvd2;
+} __packed;
+
+struct hv_input_detach_device { /* HV_INPUT_DETACH_DEVICE */
+ u64 partition_id;
+ u64 logical_devid;
+} __packed;
+
+
+/* 3 domain types: stage 1, stage 2, and SOC */
+#define HV_DEVICE_DOMAIN_TYPE_S2 0 /* HV_DEVICE_DOMAIN_ID_TYPE_S2 */
+#define HV_DEVICE_DOMAIN_TYPE_S1 1 /* HV_DEVICE_DOMAIN_ID_TYPE_S1 */
+#define HV_DEVICE_DOMAIN_TYPE_SOC 2 /* HV_DEVICE_DOMAIN_ID_TYPE_SOC */
+
+/* ID for stage 2 default domain and NULL domain */
+#define HV_DEVICE_DOMAIN_ID_S2_DEFAULT 0
+#define HV_DEVICE_DOMAIN_ID_S2_NULL 0xFFFFFFFFULL
+
+union hv_device_domain_id {
+ u64 as_uint64;
+ struct {
+ u32 type : 4;
+ u32 reserved : 28;
+ u32 id;
+ };
+} __packed;
+
+struct hv_input_device_domain { /* HV_INPUT_DEVICE_DOMAIN */
+ u64 partition_id;
+ union hv_input_vtl owner_vtl;
+ u8 padding[7];
+ union hv_device_domain_id domain_id;
+} __packed;
+
+union hv_create_device_domain_flags { /* HV_CREATE_DEVICE_DOMAIN_FLAGS */
+ u32 as_uint32;
+ struct {
+ u32 forward_progress_required : 1;
+ u32 inherit_owning_vtl : 1;
+ u32 reserved : 30;
+ } __packed;
+} __packed;
+
+struct hv_input_create_device_domain { /* HV_INPUT_CREATE_DEVICE_DOMAIN */
+ struct hv_input_device_domain device_domain;
+ union hv_create_device_domain_flags create_device_domain_flags;
+} __packed;
+
+struct hv_input_delete_device_domain { /* HV_INPUT_DELETE_DEVICE_DOMAIN */
+ struct hv_input_device_domain device_domain;
+} __packed;
+
+struct hv_input_attach_device_domain { /* HV_INPUT_ATTACH_DEVICE_DOMAIN */
+ struct hv_input_device_domain device_domain;
+ union hv_device_id device_id;
+} __packed;
+
+struct hv_input_detach_device_domain { /* HV_INPUT_DETACH_DEVICE_DOMAIN */
+ u64 partition_id;
+ union hv_device_id device_id;
+} __packed;
+
+struct hv_input_map_device_gpa_pages { /* HV_INPUT_MAP_DEVICE_GPA_PAGES */
+ struct hv_input_device_domain device_domain;
+ union hv_input_vtl target_vtl;
+ u8 padding[3];
+ u32 map_flags;
+ u64 target_device_va_base;
+ u64 gpa_page_list[];
+} __packed;
+
+struct hv_input_unmap_device_gpa_pages { /* HV_INPUT_UNMAP_DEVICE_GPA_PAGES */
+ struct hv_input_device_domain device_domain;
+ u64 target_device_va_base;
+} __packed;
+
#endif /* _HV_HVHDK_MINI_H */
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 06/11] mshv: Add ioctl support for MSHV-VFIO bridge device
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
Add ioctl support for creating MSHV devices for a partition. At
present only VFIO device types are supported, but more could be
added. At a high level, a partition ioctl to create device verifies
it is of type VFIO and does some setup for bridge code in mshv_vfio.c.
Adapted from KVM device ioctls.
Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
drivers/hv/mshv_root_main.c | 116 ++++++++++++++++++++++++++++++++++++
1 file changed, 116 insertions(+)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 02c107458be9..6ceb5f608589 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1625,6 +1625,119 @@ mshv_partition_ioctl_initialize(struct mshv_partition *partition)
return ret;
}
+/*
+ * Handle MSHV_SET_DEVICE_ATTR / MSHV_HAS_DEVICE_ATTR on a device fd: copy the
+ * attribute descriptor in from userspace and pass it to the matching device
+ * op. Returns -EFAULT if the copy fails and -EPERM if the device provides no
+ * handler for @cmd.
+ */
+static long mshv_device_attr_ioctl(struct mshv_device *mshv_dev, int cmd,
+				   ulong uarg)
+{
+	const struct mshv_device_ops *ops = mshv_dev->device_ops;
+	long (*handler)(struct mshv_device *dev, struct mshv_device_attr *attr);
+	struct mshv_device_attr kattr;
+
+	if (copy_from_user(&kattr, (void __user *)uarg, sizeof(kattr)))
+		return -EFAULT;
+
+	if (cmd == MSHV_SET_DEVICE_ATTR)
+		handler = ops->device_set_attr;
+	else if (cmd == MSHV_HAS_DEVICE_ATTR)
+		handler = ops->device_has_attr;
+	else
+		handler = NULL;
+
+	return handler ? handler(mshv_dev, &kattr) : -EPERM;
+}
+
+/*
+ * ioctl entry point for device fds. Only the two attribute ioctls are
+ * understood; anything else yields -ENOTTY.
+ */
+static long mshv_device_fop_ioctl(struct file *filp, unsigned int cmd,
+				  ulong uarg)
+{
+	if (cmd != MSHV_SET_DEVICE_ATTR && cmd != MSHV_HAS_DEVICE_ATTR)
+		return -ENOTTY;
+
+	return mshv_device_attr_ioctl(filp->private_data, cmd, uarg);
+}
+
+/*
+ * Release handler for a device fd: unlink the device from its partition's
+ * device list, let the device op tear it down, then drop the partition
+ * reference. Always returns 0.
+ */
+static int mshv_device_fop_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_device *mshv_dev = filp->private_data;
+ /* Cached up front: device_release() may free mshv_dev (the VFIO op does). */
+ struct mshv_partition *partition = mshv_dev->device_pt;
+
+ /*
+  * NOTE(review): when the device has no device_release op, it is neither
+  * unlinked from pt_devices nor freed here - confirm this is intended.
+  */
+ if (mshv_dev->device_ops->device_release) {
+ mutex_lock(&partition->pt_mutex);
+ hlist_del(&mshv_dev->device_ptnode);
+ mshv_dev->device_ops->device_release(mshv_dev);
+ mutex_unlock(&partition->pt_mutex);
+ }
+
+ mshv_partition_put(partition);
+ return 0;
+}
+
+static const struct file_operations mshv_device_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = mshv_device_fop_ioctl,
+ .release = mshv_device_fop_release,
+};
+
+/*
+ * MSHV_CREATE_DEVICE: create a device for @partition and return an fd for it
+ * in the fd field of the userspace struct mshv_create_device.
+ *
+ * Returns 0 on success (or when only MSHV_CREATE_DEVICE_TEST was requested
+ * for a supported type), -EFAULT on a failed user copy, -ENODEV for an
+ * unsupported device type, -ENOMEM on allocation failure, or the error from
+ * the device_create op / anon_inode_getfd().
+ *
+ * NOTE(review): pt_devices is modified here without this function taking
+ * pt_mutex, while mshv_device_fop_release() takes it for hlist_del();
+ * presumably the caller already holds pt_mutex - confirm.
+ */
+static long mshv_partition_ioctl_create_device(struct mshv_partition *partition,
+ void __user *uarg)
+{
+ long rc;
+ struct mshv_create_device devargk;
+ struct mshv_device *mshv_dev;
+ const struct mshv_device_ops *vfio_ops;
+
+ if (copy_from_user(&devargk, uarg, sizeof(devargk)))
+ return -EFAULT;
+
+ /* At present, only VFIO is supported */
+ if (devargk.type != MSHV_DEV_TYPE_VFIO)
+ return -ENODEV;
+
+ /* Test mode only probes whether the type is supported; nothing is created. */
+ if (devargk.flags & MSHV_CREATE_DEVICE_TEST)
+ return 0;
+
+ /* This is freed later by mshv_vfio_release_device() */
+ mshv_dev = kzalloc(sizeof(*mshv_dev), GFP_KERNEL_ACCOUNT);
+ if (mshv_dev == NULL)
+ return -ENOMEM;
+
+ vfio_ops = &mshv_vfio_device_ops;
+ mshv_dev->device_ops = vfio_ops;
+ mshv_dev->device_pt = partition;
+
+ rc = vfio_ops->device_create(mshv_dev);
+ if (rc < 0) {
+ kfree(mshv_dev);
+ return rc;
+ }
+
+ hlist_add_head(&mshv_dev->device_ptnode, &partition->pt_devices);
+
+ /* Reference held on behalf of the device fd; dropped in mshv_device_fop_release(). */
+ mshv_partition_get(partition);
+ rc = anon_inode_getfd(vfio_ops->device_name, &mshv_device_fops,
+ mshv_dev, O_RDWR | O_CLOEXEC);
+ if (rc < 0)
+ goto undo_out;
+
+ /* The fd is already installed here, so a failed copy leaves it open in the caller. */
+ devargk.fd = rc;
+ if (copy_to_user(uarg, &devargk, sizeof(devargk)))
+ return -EFAULT; /* cleanup in mshv_device_fop_release() */
+
+ return 0;
+
+undo_out:
+ hlist_del(&mshv_dev->device_ptnode);
+ vfio_ops->device_release(mshv_dev); /* will kfree(mshv_dev) */
+ mshv_partition_put(partition);
+ return rc;
+}
+
static long
mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
@@ -1661,6 +1774,9 @@ mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
case MSHV_ROOT_HVCALL:
ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
break;
+ case MSHV_CREATE_DEVICE:
+ ret = mshv_partition_ioctl_create_device(partition, uarg);
+ break;
default:
ret = -ENOTTY;
}
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 05/11] mshv: Implement mshv bridge device for VFIO
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
Add a new file to implement VFIO-MSHV bridge pseudo device. These
functions are called in the VFIO framework, and credits to kvm/vfio.c
as this file was adapted from it.
Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
drivers/hv/Makefile | 3 +-
drivers/hv/mshv_vfio.c | 211 ++++++++++++++++++++++++++++++++++++++
include/uapi/linux/mshv.h | 1 +
3 files changed, 214 insertions(+), 1 deletion(-)
create mode 100644 drivers/hv/mshv_vfio.c
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 888a748cc7cb..9ab6fc254c38 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -14,7 +14,8 @@ hv_vmbus-y := vmbus_drv.o \
hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
- mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
+ mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o \
+ mshv_vfio.o
mshv_root-$(CONFIG_DEBUG_FS) += mshv_debugfs.o
mshv_root-$(CONFIG_TRACEPOINTS) += mshv_trace.o
mshv_vtl-y := mshv_vtl_main.o
diff --git a/drivers/hv/mshv_vfio.c b/drivers/hv/mshv_vfio.c
new file mode 100644
index 000000000000..00a97920e25b
--- /dev/null
+++ b/drivers/hv/mshv_vfio.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO-MSHV bridge pseudo device
+ *
+ * Heavily inspired by the VFIO-KVM bridge pseudo device.
+ */
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <asm/mshyperv.h>
+
+#include "mshv.h"
+#include "mshv_root.h"
+
+/* One VFIO file tracked by the bridge device. */
+struct mshv_vfio_file {
+ struct list_head node; /* entry in mshv_vfio.file_list */
+ struct file *file; /* tracked file; a reference is held while the entry exists */
+};
+
+/* Private state of the VFIO-MSHV bridge device. */
+struct mshv_vfio {
+ struct list_head file_list; /* list of struct mshv_vfio_file */
+ struct mutex lock; /* held while file_list is searched or modified in file add/del */
+};
+
+/*
+ * Ask vfio_file_is_valid() about @file, resolving the symbol at run time.
+ * Returns false when the symbol cannot be obtained.
+ */
+static bool mshv_vfio_file_is_valid(struct file *file)
+{
+	bool (*is_valid)(struct file *file) = symbol_get(vfio_file_is_valid);
+	bool valid = false;
+
+	if (is_valid) {
+		valid = is_valid(file);
+		symbol_put(vfio_file_is_valid);
+	}
+
+	return valid;
+}
+
+/*
+ * Start tracking the file behind @fd on this bridge device.
+ *
+ * Returns 0 on success, -EBADF if @fd does not refer to an open file,
+ * -EINVAL if mshv_vfio_file_is_valid() rejects the file, -EEXIST if the file
+ * is already on the list, or -ENOMEM if the list entry cannot be allocated.
+ */
+static long mshv_vfio_file_add(struct mshv_device *mshvdev, unsigned int fd)
+{
+ struct mshv_vfio *mshv_vfio = mshvdev->device_private;
+ struct mshv_vfio_file *mvf;
+ struct file *filp;
+ long ret = 0;
+
+ filp = fget(fd);
+ if (!filp)
+ return -EBADF;
+
+ /* Ensure the FD is a vfio FD. */
+ if (!mshv_vfio_file_is_valid(filp)) {
+ ret = -EINVAL;
+ goto out_fput;
+ }
+
+ mutex_lock(&mshv_vfio->lock);
+
+ /* Refuse to track the same file twice. */
+ list_for_each_entry(mvf, &mshv_vfio->file_list, node) {
+ if (mvf->file == filp) {
+ ret = -EEXIST;
+ goto out_unlock;
+ }
+ }
+
+ mvf = kzalloc(sizeof(*mvf), GFP_KERNEL_ACCOUNT);
+ if (!mvf) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /* The list entry takes its own reference; the fget() one is dropped below. */
+ mvf->file = get_file(filp);
+ list_add_tail(&mvf->node, &mshv_vfio->file_list);
+
+out_unlock:
+ mutex_unlock(&mshv_vfio->lock);
+out_fput:
+ fput(filp);
+ return ret;
+}
+
+/*
+ * Stop tracking the file behind @fd on this bridge device.
+ *
+ * Returns 0 if a matching entry was removed, -EBADF if @fd does not refer to
+ * an open file, or -ENOENT if the file is not on the list.
+ */
+static long mshv_vfio_file_del(struct mshv_device *mshvdev, unsigned int fd)
+{
+ struct mshv_vfio *mshv_vfio = mshvdev->device_private;
+ struct mshv_vfio_file *mvf;
+ long ret;
+
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+
+ ret = -ENOENT;
+ mutex_lock(&mshv_vfio->lock);
+
+ list_for_each_entry(mvf, &mshv_vfio->file_list, node) {
+ if (mvf->file != fd_file(f))
+ continue;
+
+ list_del(&mvf->node);
+ /* Drop the reference taken by mshv_vfio_file_add(). */
+ fput(mvf->file);
+ kfree(mvf);
+ ret = 0;
+ break;
+ }
+
+ mutex_unlock(&mshv_vfio->lock);
+ return ret;
+}
+
+/*
+ * Handle the MSHV_DEV_VFIO_FILE group: read an fd from userspace and add or
+ * remove the corresponding file. Unknown attributes yield -ENXIO without
+ * touching userspace memory; a failed read yields -EFAULT.
+ */
+static long mshv_vfio_set_file(struct mshv_device *mshvdev, long attr,
+			       void __user *arg)
+{
+	int32_t __user *ufd = arg;
+	int32_t fd;
+
+	if (attr != MSHV_DEV_VFIO_FILE_ADD && attr != MSHV_DEV_VFIO_FILE_DEL)
+		return -ENXIO;
+
+	if (get_user(fd, ufd))
+		return -EFAULT;
+
+	if (attr == MSHV_DEV_VFIO_FILE_ADD)
+		return mshv_vfio_file_add(mshvdev, fd);
+
+	return mshv_vfio_file_del(mshvdev, fd);
+}
+
+/* device_set_attr op: only the MSHV_DEV_VFIO_FILE group is handled, else -ENXIO. */
+static long mshv_vfio_set_attr(struct mshv_device *mshvdev,
+			       struct mshv_device_attr *attr)
+{
+	if (attr->group != MSHV_DEV_VFIO_FILE)
+		return -ENXIO;
+
+	return mshv_vfio_set_file(mshvdev, attr->attr,
+				  u64_to_user_ptr(attr->addr));
+}
+
+/*
+ * device_has_attr op: returns 0 for the file add/del attributes of the
+ * MSHV_DEV_VFIO_FILE group and -ENXIO for everything else.
+ */
+static long mshv_vfio_has_attr(struct mshv_device *mshvdev,
+			       struct mshv_device_attr *attr)
+{
+	if (attr->group == MSHV_DEV_VFIO_FILE &&
+	    (attr->attr == MSHV_DEV_VFIO_FILE_ADD ||
+	     attr->attr == MSHV_DEV_VFIO_FILE_DEL))
+		return 0;
+
+	return -ENXIO;
+}
+
+/*
+ * device_create op: allocate and initialise the private state of a new VFIO
+ * bridge device. Returns 0 on success, -EBUSY if the partition already has a
+ * device using mshv_vfio_device_ops, or -ENOMEM on allocation failure.
+ *
+ * NOTE(review): pt_devices is walked without taking a lock in this function;
+ * presumably the caller serialises access - confirm.
+ */
+static long mshv_vfio_create_device(struct mshv_device *mshvdev)
+{
+ struct mshv_device *tmp;
+ struct mshv_vfio *mshv_vfio;
+
+ /* Only one VFIO "device" per VM */
+ hlist_for_each_entry(tmp, &mshvdev->device_pt->pt_devices,
+ device_ptnode)
+ if (tmp->device_ops == &mshv_vfio_device_ops)
+ return -EBUSY;
+
+ mshv_vfio = kzalloc(sizeof(*mshv_vfio), GFP_KERNEL_ACCOUNT);
+ if (mshv_vfio == NULL)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&mshv_vfio->file_list);
+ mutex_init(&mshv_vfio->lock);
+
+ mshvdev->device_private = mshv_vfio;
+
+ return 0;
+}
+
+/*
+ * device_release op, called from mshv_device_fop_release() and from the error
+ * path of mshv_partition_ioctl_create_device(). Drops the reference held on
+ * every tracked file, then frees the private state and @mshvdev itself, so
+ * the caller must not touch @mshvdev afterwards.
+ */
+static void mshv_vfio_release_device(struct mshv_device *mshvdev)
+{
+ struct mshv_vfio *mv = mshvdev->device_private;
+ struct mshv_vfio_file *mvf, *tmp;
+
+ list_for_each_entry_safe(mvf, tmp, &mv->file_list, node) {
+ fput(mvf->file);
+ list_del(&mvf->node);
+ kfree(mvf);
+ }
+
+ kfree(mv);
+ kfree(mshvdev);
+}
+
+struct mshv_device_ops mshv_vfio_device_ops = {
+ .device_name = "mshv-vfio",
+ .device_create = mshv_vfio_create_device,
+ .device_release = mshv_vfio_release_device,
+ .device_set_attr = mshv_vfio_set_attr,
+ .device_has_attr = mshv_vfio_has_attr,
+};
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 4373a8243951..6404e8a98237 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -254,6 +254,7 @@ struct mshv_root_hvcall {
#define MSHV_GET_GPAP_ACCESS_BITMAP _IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap)
/* Generic hypercall */
#define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
+#define MSHV_CREATE_DEVICE _IOWR(MSHV_IOCTL, 0x08, struct mshv_create_device)
/*
********************************
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 04/11] mshv: Declarations and definitions for VFIO-MSHV bridge device
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
Add data structs needed by the subsequent patch that introduces a new
module to implement VFIO-MSHV pseudo device.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
drivers/hv/mshv_root.h | 19 +++++++++++++++++++
include/uapi/linux/mshv.h | 30 ++++++++++++++++++++++++++++++
2 files changed, 49 insertions(+)
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index a85c24dcc701..b9880d0bdc4d 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -227,6 +227,25 @@ struct port_table_info {
};
};
+struct mshv_device {
+ const struct mshv_device_ops *device_ops;
+ struct mshv_partition *device_pt;
+ void *device_private;
+ struct hlist_node device_ptnode;
+};
+
+struct mshv_device_ops {
+ const char *device_name;
+ long (*device_create)(struct mshv_device *dev);
+ void (*device_release)(struct mshv_device *dev);
+ long (*device_set_attr)(struct mshv_device *dev,
+ struct mshv_device_attr *attr);
+ long (*device_has_attr)(struct mshv_device *dev,
+ struct mshv_device_attr *attr);
+};
+
+extern struct mshv_device_ops mshv_vfio_device_ops;
+
int mshv_update_routing_table(struct mshv_partition *partition,
const struct mshv_user_irq_entry *entries,
unsigned int numents);
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 32ff92b6342b..4373a8243951 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -404,4 +404,34 @@ struct mshv_sint_mask {
/* hv_hvcall device */
#define MSHV_HVCALL_SETUP _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup)
#define MSHV_HVCALL _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall)
+
+/* device passthrough */
+#define MSHV_CREATE_DEVICE_TEST 1
+
+enum {
+ MSHV_DEV_TYPE_VFIO,
+ MSHV_DEV_TYPE_MAX,
+};
+
+struct mshv_create_device {
+ __u32 type; /* in: MSHV_DEV_TYPE_xxx */
+ __u32 fd; /* out: device handle */
+ __u32 flags; /* in: MSHV_CREATE_DEVICE_xxx */
+};
+
+#define MSHV_DEV_VFIO_FILE 1
+#define MSHV_DEV_VFIO_FILE_ADD 1
+#define MSHV_DEV_VFIO_FILE_DEL 2
+
+struct mshv_device_attr {
+ __u32 flags; /* no flags currently defined */
+ __u32 group; /* device-defined */
+ __u64 attr; /* group-defined */
+ __u64 addr; /* userspace address of attr data */
+};
+
+/* Device fds created with MSHV_CREATE_DEVICE */
+#define MSHV_SET_DEVICE_ATTR _IOW(MSHV_IOCTL, 0x00, struct mshv_device_attr)
+#define MSHV_HAS_DEVICE_ATTR _IOW(MSHV_IOCTL, 0x01, struct mshv_device_attr)
+
#endif
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 03/11] mshv: Provide a way to get partition id if running in a VMM process
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
Many PCI passthru related hypercalls require partition id of the target
guest. Guests are actually managed by MSHV driver and the partition id
is only maintained there. Add a field in the partition struct in MSHV
driver to save the tgid of the VMM process creating the partition,
and add a function there to retrieve partition id if current process
is a VMM process.
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
drivers/hv/mshv_root.h | 1 +
drivers/hv/mshv_root_main.c | 22 ++++++++++++++++++++++
include/asm-generic/mshyperv.h | 5 +++++
3 files changed, 28 insertions(+)
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 1f086dcb7aa1..a85c24dcc701 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -138,6 +138,7 @@ struct mshv_partition {
struct mshv_girq_routing_table __rcu *pt_girq_tbl;
u64 isolation_type;
+ pid_t pt_vmm_tgid;
bool import_completed;
bool pt_initialized;
#if IS_ENABLED(CONFIG_DEBUG_FS)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index bd1359eb58dd..02c107458be9 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1908,6 +1908,27 @@ mshv_partition_release(struct inode *inode, struct file *filp)
return 0;
}
+/*
+ * Return the id of a partition whose pt_vmm_tgid matches the tgid of the
+ * current task, or HV_PARTITION_ID_INVALID if no partition matches. The
+ * first match found while walking the partition hash table is returned.
+ */
+u64 mshv_current_partid(void)
+{
+ struct mshv_partition *pt;
+ int i;
+ u64 ret_ptid = HV_PARTITION_ID_INVALID;
+
+ rcu_read_lock();
+
+ hash_for_each_rcu(mshv_root.pt_htable, i, pt, pt_hnode) {
+ if (pt->pt_vmm_tgid == current->tgid) {
+ ret_ptid = pt->pt_id;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret_ptid;
+}
+EXPORT_SYMBOL_GPL(mshv_current_partid);
+
static int
add_partition(struct mshv_partition *partition)
{
@@ -2073,6 +2094,7 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
goto cleanup_irq_srcu;
partition->pt_id = pt_id;
+ partition->pt_vmm_tgid = current->tgid;
ret = add_partition(partition);
if (ret)
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index bf601d67cecb..e8cbc4e3f7ad 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -350,6 +350,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_notify_all_processors_started(void);
bool hv_lp_exists(u32 lp_index);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
+u64 mshv_current_partid(void);
#else /* CONFIG_MSHV_ROOT */
static inline bool hv_root_partition(void) { return false; }
@@ -380,6 +381,10 @@ static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u3
{
return -EOPNOTSUPP;
}
+static inline u64 mshv_current_partid(void)
+{
+ return HV_PARTITION_ID_INVALID;
+}
#endif /* CONFIG_MSHV_ROOT */
static inline int hv_deposit_memory(u64 partition_id, u64 status)
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 02/11] x86/hyperv: cosmetic changes in irqdomain.c for readability
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
Make cosmetic changes:
o Rename struct pci_dev *dev to *pdev since there are cases of
struct device *dev in the file and all over the kernel
o Rename hv_build_pci_dev_id to hv_build_devid_type_pci in anticipation
of building different types of device ids
o Fix checkpatch.pl issues with return and extraneous printk
o Replace spaces with tabs
o Rename struct hv_devid *xxx to struct hv_devid *hv_devid given code
paths involve many types of device ids
o Fix indentation in a large if block by using goto.
There are no functional changes.
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
arch/x86/hyperv/irqdomain.c | 198 +++++++++++++++++++-----------------
1 file changed, 104 insertions(+), 94 deletions(-)
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 365e364268d9..b3ad50a874dc 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -1,5 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-
/*
* Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
*
@@ -14,8 +13,8 @@
#include <linux/irqchip/irq-msi-lib.h>
#include <asm/mshyperv.h>
-static int hv_map_interrupt(union hv_device_id device_id, bool level,
- int cpu, int vector, struct hv_interrupt_entry *entry)
+static int hv_map_interrupt(union hv_device_id hv_devid, bool level,
+ int cpu, int vector, struct hv_interrupt_entry *ret_entry)
{
struct hv_input_map_device_interrupt *input;
struct hv_output_map_device_interrupt *output;
@@ -32,7 +31,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
intr_desc = &input->interrupt_descriptor;
memset(input, 0, sizeof(*input));
input->partition_id = hv_current_partition_id;
- input->device_id = device_id.as_uint64;
+ input->device_id = hv_devid.as_uint64;
intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
intr_desc->vector_count = 1;
intr_desc->target.vector = vector;
@@ -44,7 +43,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
intr_desc->target.vp_set.valid_bank_mask = 0;
intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
- nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
+ nr_bank = cpumask_to_vpset(&intr_desc->target.vp_set, cpumask_of(cpu));
if (nr_bank < 0) {
local_irq_restore(flags);
pr_err("%s: unable to generate VP set\n", __func__);
@@ -61,7 +60,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
input, output);
- *entry = output->interrupt_entry;
+ *ret_entry = output->interrupt_entry;
local_irq_restore(flags);
@@ -71,21 +70,19 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
return hv_result_to_errno(status);
}
-static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
+static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry)
{
unsigned long flags;
struct hv_input_unmap_device_interrupt *input;
- struct hv_interrupt_entry *intr_entry;
u64 status;
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
memset(input, 0, sizeof(*input));
- intr_entry = &input->interrupt_entry;
input->partition_id = hv_current_partition_id;
input->device_id = id;
- *intr_entry = *old_entry;
+ input->interrupt_entry = *irq_entry;
status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
local_irq_restore(flags);
@@ -115,67 +112,71 @@ static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
return 0;
}
-static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
+static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
{
- union hv_device_id dev_id;
+ int pos;
+ union hv_device_id hv_devid;
struct rid_data data = {
.bridge = NULL,
- .rid = PCI_DEVID(dev->bus->number, dev->devfn)
+ .rid = PCI_DEVID(pdev->bus->number, pdev->devfn)
};
- pci_for_each_dma_alias(dev, get_rid_cb, &data);
+ pci_for_each_dma_alias(pdev, get_rid_cb, &data);
- dev_id.as_uint64 = 0;
- dev_id.device_type = HV_DEVICE_TYPE_PCI;
- dev_id.pci.segment = pci_domain_nr(dev->bus);
+ hv_devid.as_uint64 = 0;
+ hv_devid.device_type = HV_DEVICE_TYPE_PCI;
+ hv_devid.pci.segment = pci_domain_nr(pdev->bus);
- dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
- dev_id.pci.bdf.device = PCI_SLOT(data.rid);
- dev_id.pci.bdf.function = PCI_FUNC(data.rid);
- dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
+ hv_devid.pci.bdf.bus = PCI_BUS_NUM(data.rid);
+ hv_devid.pci.bdf.device = PCI_SLOT(data.rid);
+ hv_devid.pci.bdf.function = PCI_FUNC(data.rid);
+ hv_devid.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
- if (data.bridge) {
- int pos;
+ if (data.bridge == NULL)
+ goto out;
- /*
- * Microsoft Hypervisor requires a bus range when the bridge is
- * running in PCI-X mode.
- *
- * To distinguish conventional vs PCI-X bridge, we can check
- * the bridge's PCI-X Secondary Status Register, Secondary Bus
- * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
- * Specification Revision 1.0 5.2.2.1.3.
- *
- * Value zero means it is in conventional mode, otherwise it is
- * in PCI-X mode.
- */
+ /*
+ * Microsoft Hypervisor requires a bus range when the bridge is
+ * running in PCI-X mode.
+ *
+ * To distinguish conventional vs PCI-X bridge, we can check
+ * the bridge's PCI-X Secondary Status Register, Secondary Bus
+ * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
+ * Specification Revision 1.0 5.2.2.1.3.
+ *
+ * Value zero means it is in conventional mode, otherwise it is
+ * in PCI-X mode.
+ */
- pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
- if (pos) {
- u16 status;
+ pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
+ if (pos) {
+ u16 status;
- pci_read_config_word(data.bridge, pos +
- PCI_X_BRIDGE_SSTATUS, &status);
+ pci_read_config_word(data.bridge, pos + PCI_X_BRIDGE_SSTATUS,
+ &status);
- if (status & PCI_X_SSTATUS_FREQ) {
- /* Non-zero, PCI-X mode */
- u8 sec_bus, sub_bus;
+ if (status & PCI_X_SSTATUS_FREQ) {
+ /* Non-zero, PCI-X mode */
+ u8 sec_bus, sub_bus;
- dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
+ hv_devid.pci.source_shadow =
+ HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
- pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
- dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
- pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
- dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
- }
+ pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS,
+ &sec_bus);
+ hv_devid.pci.shadow_bus_range.secondary_bus = sec_bus;
+ pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS,
+ &sub_bus);
+ hv_devid.pci.shadow_bus_range.subordinate_bus = sub_bus;
}
}
- return dev_id;
+out:
+ return hv_devid;
}
-/**
- * hv_map_msi_interrupt() - "Map" the MSI IRQ in the hypervisor.
+/*
+ * hv_map_msi_interrupt() - Map the MSI IRQ in the hypervisor.
* @data: Describes the IRQ
* @out_entry: Hypervisor (MSI) interrupt entry (can be NULL)
*
@@ -188,22 +189,23 @@ int hv_map_msi_interrupt(struct irq_data *data,
{
struct irq_cfg *cfg = irqd_cfg(data);
struct hv_interrupt_entry dummy;
- union hv_device_id device_id;
+ union hv_device_id hv_devid;
struct msi_desc *msidesc;
- struct pci_dev *dev;
+ struct pci_dev *pdev;
int cpu;
msidesc = irq_data_get_msi_desc(data);
- dev = msi_desc_to_pci_dev(msidesc);
- device_id = hv_build_pci_dev_id(dev);
+ pdev = msi_desc_to_pci_dev(msidesc);
+ hv_devid = hv_build_devid_type_pci(pdev);
cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
- return hv_map_interrupt(device_id, false, cpu, cfg->vector,
+ return hv_map_interrupt(hv_devid, false, cpu, cfg->vector,
out_entry ? out_entry : &dummy);
}
EXPORT_SYMBOL_GPL(hv_map_msi_interrupt);
-static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
+static void entry_to_msi_msg(struct hv_interrupt_entry *entry,
+ struct msi_msg *msg)
{
/* High address is always 0 */
msg->address_hi = 0;
@@ -211,17 +213,19 @@ static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi
msg->data = entry->msi_entry.data.as_uint32;
}
-static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
+static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
+ struct hv_interrupt_entry *irq_entry);
+
static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
struct hv_interrupt_entry *stored_entry;
struct irq_cfg *cfg = irqd_cfg(data);
struct msi_desc *msidesc;
- struct pci_dev *dev;
+ struct pci_dev *pdev;
int ret;
msidesc = irq_data_get_msi_desc(data);
- dev = msi_desc_to_pci_dev(msidesc);
+ pdev = msi_desc_to_pci_dev(msidesc);
if (!cfg) {
pr_debug("%s: cfg is NULL", __func__);
@@ -240,7 +244,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
stored_entry = data->chip_data;
data->chip_data = NULL;
- ret = hv_unmap_msi_interrupt(dev, stored_entry);
+ ret = hv_unmap_msi_interrupt(pdev, stored_entry);
kfree(stored_entry);
@@ -249,10 +253,8 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
}
stored_entry = kzalloc_obj(*stored_entry, GFP_ATOMIC);
- if (!stored_entry) {
- pr_debug("%s: failed to allocate chip data\n", __func__);
+ if (!stored_entry)
return;
- }
ret = hv_map_msi_interrupt(data, stored_entry);
if (ret) {
@@ -262,18 +264,21 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
data->chip_data = stored_entry;
entry_to_msi_msg(data->chip_data, msg);
-
- return;
}
-static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
+static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
+ struct hv_interrupt_entry *irq_entry)
{
- return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
+ union hv_device_id hv_devid;
+
+ hv_devid = hv_build_devid_type_pci(pdev);
+ return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
}
-static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
+/* NB: during map, hv_interrupt_entry is saved via data->chip_data */
+static void hv_teardown_msi_irq(struct pci_dev *pdev, struct irq_data *irqd)
{
- struct hv_interrupt_entry old_entry;
+ struct hv_interrupt_entry irq_entry;
struct msi_msg msg;
if (!irqd->chip_data) {
@@ -281,13 +286,13 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
return;
}
- old_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
- entry_to_msi_msg(&old_entry, &msg);
+ irq_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
+ entry_to_msi_msg(&irq_entry, &msg);
kfree(irqd->chip_data);
irqd->chip_data = NULL;
- (void)hv_unmap_msi_interrupt(dev, &old_entry);
+ (void)hv_unmap_msi_interrupt(pdev, &irq_entry);
}
/*
@@ -302,7 +307,8 @@ static struct irq_chip hv_pci_msi_controller = {
};
static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
- struct irq_domain *real_parent, struct msi_domain_info *info)
+ struct irq_domain *real_parent,
+ struct msi_domain_info *info)
{
struct irq_chip *chip = info->chip;
@@ -317,7 +323,8 @@ static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
}
#define HV_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX)
-#define HV_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+#define HV_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+ MSI_FLAG_USE_DEF_CHIP_OPS)
static struct msi_parent_ops hv_msi_parent_ops = {
.supported_flags = HV_MSI_FLAGS_SUPPORTED,
@@ -329,14 +336,14 @@ static struct msi_parent_ops hv_msi_parent_ops = {
.init_dev_msi_info = hv_init_dev_msi_info,
};
-static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs,
- void *arg)
+/* Allocate nr_irqs IRQs for the given irq domain */
+static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
{
/*
- * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. everything except
- * entry_to_msi_msg() should be in here.
+ * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e.
+ * everything except entry_to_msi_msg() should be in here.
*/
-
int ret;
ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg);
@@ -344,13 +351,15 @@ static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned
return ret;
for (int i = 0; i < nr_irqs; ++i) {
- irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, NULL,
- handle_edge_irq, NULL, "edge");
+ irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller,
+ NULL, handle_edge_irq, NULL, "edge");
}
+
return 0;
}
-static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq,
+ unsigned int nr_irqs)
{
for (int i = 0; i < nr_irqs; ++i) {
struct irq_data *irqd = irq_domain_get_irq_data(d, virq);
@@ -362,6 +371,7 @@ static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned
hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
}
+
irq_domain_free_irqs_top(d, virq, nr_irqs);
}
@@ -394,25 +404,25 @@ struct irq_domain * __init hv_create_pci_msi_domain(void)
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
{
- union hv_device_id device_id;
+ union hv_device_id hv_devid;
- device_id.as_uint64 = 0;
- device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
- device_id.ioapic.ioapic_id = (u8)ioapic_id;
+ hv_devid.as_uint64 = 0;
+ hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC;
+ hv_devid.ioapic.ioapic_id = (u8)ioapic_id;
- return hv_unmap_interrupt(device_id.as_uint64, entry);
+ return hv_unmap_interrupt(hv_devid.as_uint64, entry);
}
EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector,
struct hv_interrupt_entry *entry)
{
- union hv_device_id device_id;
+ union hv_device_id hv_devid;
- device_id.as_uint64 = 0;
- device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
- device_id.ioapic.ioapic_id = (u8)ioapic_id;
+ hv_devid.as_uint64 = 0;
+ hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC;
+ hv_devid.ioapic.ioapic_id = (u8)ioapic_id;
- return hv_map_interrupt(device_id, level, cpu, vector, entry);
+ return hv_map_interrupt(hv_devid, level, cpu, vector, entry);
}
EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt);
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 01/11] iommu/hyperv: rename hyperv-iommu.c to hyperv-irq.c
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
In-Reply-To: <20260501004157.3108202-1-mrathor@linux.microsoft.com>
This file actually implements irq remapping, so rename it to the more
appropriate hyperv-irq.c. A new file to implement hyperv iommu will be introduced
later. Also, it should not be tied to HYPERV_IOMMU, but to CONFIG_HYPERV
and IRQ_REMAP. The file already has #ifdef CONFIG_IRQ_REMAP.
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
MAINTAINERS | 2 +-
drivers/iommu/Makefile | 2 +-
drivers/iommu/{hyperv-iommu.c => hyperv-irq.c} | 6 +++---
drivers/iommu/irq_remapping.c | 2 +-
4 files changed, 6 insertions(+), 6 deletions(-)
rename drivers/iommu/{hyperv-iommu.c => hyperv-irq.c} (99%)
diff --git a/MAINTAINERS b/MAINTAINERS
index d1cc0e12fe1f..f803a6a38fee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11914,7 +11914,7 @@ F: drivers/clocksource/hyperv_timer.c
F: drivers/hid/hid-hyperv.c
F: drivers/hv/
F: drivers/input/serio/hyperv-keyboard.c
-F: drivers/iommu/hyperv-iommu.c
+F: drivers/iommu/hyperv-irq.c
F: drivers/net/ethernet/microsoft/
F: drivers/net/hyperv/
F: drivers/pci/controller/pci-hyperv-intf.c
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 0275821f4ef9..335ea77cced6 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -30,7 +30,7 @@ obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
-obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
+obj-$(CONFIG_HYPERV) += hyperv-irq.o
obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-irq.c
similarity index 99%
rename from drivers/iommu/hyperv-iommu.c
rename to drivers/iommu/hyperv-irq.c
index 479103261ae6..d11076f906fb 100644
--- a/drivers/iommu/hyperv-iommu.c
+++ b/drivers/iommu/hyperv-irq.c
@@ -8,6 +8,8 @@
* Author : Lan Tianyu <Tianyu.Lan@microsoft.com>
*/
+#ifdef CONFIG_IRQ_REMAP
+
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
@@ -24,8 +26,6 @@
#include "irq_remapping.h"
-#ifdef CONFIG_IRQ_REMAP
-
/*
* According 82093AA IO-APIC spec , IO APIC has a 24-entry Interrupt
* Redirection Table. Hyper-V exposes one single IO-APIC and so define
@@ -331,4 +331,4 @@ static const struct irq_domain_ops hyperv_root_ir_domain_ops = {
.free = hyperv_root_irq_remapping_free,
};
-#endif
+#endif /* CONFIG_IRQ_REMAP */
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index c2443659812a..41bf65e4ea88 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -108,7 +108,7 @@ int __init irq_remapping_prepare(void)
else if (IS_ENABLED(CONFIG_AMD_IOMMU) &&
amd_iommu_irq_ops.prepare() == 0)
remap_ops = &amd_iommu_irq_ops;
- else if (IS_ENABLED(CONFIG_HYPERV_IOMMU) &&
+ else if (IS_ENABLED(CONFIG_HYPERV) &&
hyperv_irq_remap_ops.prepare() == 0)
remap_ops = &hyperv_irq_remap_ops;
else
--
2.51.2.vfs.0.1
^ permalink raw reply related
* [PATCH V2 00/11] PCI passthru on Hyper-V (Part I)
From: Mukesh R @ 2026-05-01 0:41 UTC (permalink / raw)
To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
linux-pci, linux-arch
Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
joro, will, lpieralisi, kwilczynski, bhelgaas, arnd
Implement passthru of PCI devices to unprivileged virtual machines
(VMs) when Linux is running as a privileged VM on Microsoft Hyper-V
hypervisor. This support is made to fit within the workings of VFIO
framework, and any VMM needing to use it must use the VFIO subsystem.
This supports both full device passthru and SR-IOV based VFs.
At a high level, the hypervisor supports traditional mapped iommu domains
that use explicit map and unmap hypercalls for mapping and unmapping guest
RAM into the iommu subsystem. Hyper-V also has a concept of direct attach
devices whereby the iommu subsystem simply uses the guest HW page table
(ept/npt/..). This series adds support for both, and both are made to
work with the VFIO subsystem.
While this Part I focuses on memory mappings, Part II focuses on irq
remapping and irq migrations.
This series rebased to: 5170a82e8921 (origin/hyperv-next)
Testing:
o Most testing done on hyperv-next:e733a9e28180 using Cloud Hypervisor (51).
o Limited testing on : 5170a82e8921
o Tested with impending Part II irq patches.
o All tests involved PF passthru of devices using MSIx.
o Following combinations were tested:
- L1VH(1): test 1: Mellanox ConnectX-6 Lx passthru
test 2: NVIDIA Tesla T4 GPU.
test 3: Both of above simultaneous passthru
- Baremetal dom0/root: All of above.
(1) L1VH: this is a semi privileged VM that runs on Windows root on
Hyper-V, and allows users to create more child VMs.
This series strives to establish a base line. Some pending work items:
o arm64 : some delta to make this work on arm64 (in progress).
o Qemu and OpenVMM support (in progress).
o VF testing
o device sleep/wakeup.
o More stress testing with high end GPUs
Changes in V2:
o rebase to 5170a82e8921
o minor fixes for arm64 build
o drop patch 03: "x86/hyperv: add insufficient memory support in irqdomain.c"
as that path is no longer used
o drop patch 08: "PCI: hv: rename hv_compose_msi_msg .. " and do it separately
outside this series.
o minor updates to commit messages
Changes in V1:
o patch 1: Don't tie hyperv-irq.c to CONFIG_HYPERV_IOMMU.
o patch 4: Redesigned to address security vulnerability found by copilot
with passing tgid as a parameter. Also, do tgid setting right
after setting pt_id.
o patch 5: Remove unused type parameter from mshv_device_ops.device_create
o patch 7: mshv_partition_ioctl_create_device cleanup on copy_to_user.
o patch 10: Add export of hv_build_devid_type_pci here to get rid of
patch 11.
o patch 12: Move functions to build device ids from patch 11 here for
the benefit of arm64. Rename file to: hyperv-iommu-root.c.
o patch 13: removed to be made part of interrupt part II of this support.
o patch 14: get rid of fast path to reduce review noise.
o New (last) patch to pin RAM regions if a device is passed through to a VM.
Thanks,
-Mukesh
Mukesh R (11):
iommu/hyperv: rename hyperv-iommu.c to hyperv-irq.c
x86/hyperv: cosmetic changes in irqdomain.c for readability
mshv: Provide a way to get partition id if running in a VMM process
mshv: Declarations and definitions for VFIO-MSHV bridge device
mshv: Implement mshv bridge device for VFIO
mshv: Add ioctl support for MSHV-VFIO bridge device
mshv: Import data structs around device passthru from hyperv headers
PCI: hv: Build device id for a VMBus device, export PCI devid function
x86/hyperv: Implement hyperv virtual IOMMU
mshv: Populate mmio mappings for PCI passthru
mshv: Mark mem regions as non-movable upfront if device passthru
MAINTAINERS | 3 +-
arch/x86/hyperv/irqdomain.c | 199 ++--
arch/x86/include/asm/mshyperv.h | 6 +
arch/x86/kernel/pci-dma.c | 2 +
drivers/hv/Makefile | 3 +-
drivers/hv/mshv_root.h | 21 +
drivers/hv/mshv_root_main.c | 266 ++++-
drivers/hv/mshv_vfio.c | 211 ++++
drivers/iommu/Kconfig | 5 +-
drivers/iommu/Makefile | 3 +-
drivers/iommu/hyperv-iommu-root.c | 908 ++++++++++++++++++
.../iommu/{hyperv-iommu.c => hyperv-irq.c} | 6 +-
drivers/iommu/irq_remapping.c | 2 +-
drivers/pci/controller/pci-hyperv.c | 24 +
include/asm-generic/mshyperv.h | 30 +
include/hyperv/hvgdk_mini.h | 11 +
include/hyperv/hvhdk_mini.h | 112 +++
include/linux/hyperv.h | 6 +
include/uapi/linux/mshv.h | 31 +
19 files changed, 1727 insertions(+), 122 deletions(-)
create mode 100644 drivers/hv/mshv_vfio.c
create mode 100644 drivers/iommu/hyperv-iommu-root.c
rename drivers/iommu/{hyperv-iommu.c => hyperv-irq.c} (99%)
--
2.51.2.vfs.0.1
^ permalink raw reply
* RE: [EXTERNAL] Re: [PATCH] Drivers: hv: vmbus: Improve the logc of reserving fb_mmio on Gen2 VMs
From: Dexuan Cui @ 2026-04-30 22:42 UTC (permalink / raw)
To: kernel test robot, KY Srinivasan, Haiyang Zhang,
wei.liu@kernel.org, Long Li, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org, mhklinux@outlook.com,
matthew.ruffell@canonical.com, johansen@templeofstupid.com
Cc: llvm@lists.linux.dev, oe-kbuild-all@lists.linux.dev,
stable@vger.kernel.org
In-Reply-To: <202605010002.dnnxVZFF-lkp@intel.com>
> From: kernel test robot <lkp@intel.com>
> Sent: Thursday, April 30, 2026 9:33 AM
> ...
> config: i386-buildonly-randconfig-002-20260430
> ...
> All warnings (new ones prefixed by >>):
>
> >> drivers/hv/vmbus_drv.c:2403:40: warning: result of comparison of constant
> 4294967296 with expression of type 'resource_size_t' (aka 'unsigned int') is
> always false [-Wtautological-constant-out-of-range-compare]
> 2403 | if (!low_mmio_base || low_mmio_base >= SZ_4G ||
> | ~~~~~~~~~~~~~ ^ ~~~~~
> 1 warning generated.
Thanks for reporting the warning with the i386 kernel config.
I don't know if there are any x86-32 users nowadays, but this warning can be
fixed by:
- if (!low_mmio_base || low_mmio_base >= SZ_4G ||
+ if (!low_mmio_base || upper_32_bits(low_mmio_base) ||
(start && start < low_mmio_base)) {
pr_warn("Unexpected low mmio base 0x%pa\n", &low_mmio_base);
}
^ permalink raw reply
* RE: [PATCH] Drivers: hv: vmbus: Improve the logc of reserving fb_mmio on Gen2 VMs
From: Dexuan Cui @ 2026-04-30 22:16 UTC (permalink / raw)
To: Michael Kelley, KY Srinivasan, Haiyang Zhang, wei.liu@kernel.org,
Long Li, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org, matthew.ruffell@canonical.com,
johansen@templeofstupid.com
Cc: stable@vger.kernel.org
In-Reply-To: <SN6PR02MB415726B17D5A6027CD1717E8D4342@SN6PR02MB4157.namprd02.prod.outlook.com>
> From: Michael Kelley <mhklinux@outlook.com>
> Sent: Wednesday, April 29, 2026 11:01 AM
>
> From: Dexuan Cui <DECUI@microsoft.com> Sent: Tuesday, April 28, 2026 8:13
> PM
> ...
> >
> > A CVM on Hyper-V won't start without the command line
> > Disable-VMConsoleSupport -VMName $vmName
This is not true. It turns out I can start a VBS/SNP/TDX VM without the
command line.... Sorry! Not sure why I had the wrong impression -- I
guess I was told to always run the command since day 1, so I subconsciously
thought a VM would not start without it. Or, maybe the host behavior
changed? but that seems unlikely to me.
> Unfortunately, on my laptop Hyper-V, a VM with VBS Isolation appears
> to *not* require Disable-VMConsoleSupport. I can start the VM, and the
> VM is offered the VMBus synthvid, mouse, and keyboard devices.
Actually I can also start a VBS VM without Disable-VMConsoleSupport.
> But what's weird in this case is that vmbus_reserved_fb() sees lfb_base
> and lfb_start as 0.
I see the same.
> Furthermore, as a test, I changed the "allowed_in_isolated"
> flag to true for the synthvid device, and the Hyper-V DRM driver loads and
> initializes.
I also changed the flag .allowed_in_isolated to true for HV_SYNTHVID_GUID,
HV_KBD, and HV_MOUSE, but I can't see the devices in "lsvmbus".
In vmbus_onoffer(), I printed the offer->offer.if_type and
offer->offer.if_instance just after the message " Invalid offer %d from the host
supporting isolation", and I indeed don't see the fb/mouse/keyboard devices.
I'm on a recent Hyper-V dev build. Maybe this is why my observation is
not exactly the same.
>In doing so, the vmconnect.exe window is resized larger, as is
> done in a normal VM. /proc/iomem shows that the DRM driver claimed
> the expected MMIO range at the start of low MMIO space. I can run a user
> space program that mmaps /dev/fb0 and writes pixels to the mmap'ed
> memory, and that succeeds as it would in a normal VM, but the
> vmconnect.exe window doesn't show anything. It appears that the Hyper-V
> host has allocated memory for the frame buffer, but is ignoring anything
> that is written to it.
>
> Running Disable-VMConsoleSupport works as expected -- the synthvid,
> mouse, and keyboard devices are no longer offered to the VM.
I even ran "Enable-VMConsoleSupport", which finished without any error,
but I still didn't see the keyboard/mouse/framebuffer devices.
> So instead of not reserving any MMIO space for the framebuffer on
> CVMs, the code you already have limits the reservation to half of the
> MMIO space below 4 GB.
Correct.
> Won't that work to avoid exhausting the low
> MMIO space in a CVM that's running on a local Hyper-V with only 128
> MiB of low MMIO space?
Correct. I'll drop the CVM check in vmbus_reserve_fb() in v2.
^ permalink raw reply
* Re: [PATCH] Drivers: hv: vmbus: Improve the logc of reserving fb_mmio on Gen2 VMs
From: kernel test robot @ 2026-04-30 16:33 UTC (permalink / raw)
To: Dexuan Cui, kys, haiyangz, wei.liu, longli, linux-hyperv,
linux-kernel, mhklinux, matthew.ruffell, johansen
Cc: llvm, oe-kbuild-all, stable
In-Reply-To: <20260416183529.838321-1-decui@microsoft.com>
Hi Dexuan,
kernel test robot noticed the following build warnings:
[auto build test WARNING on linus/master]
[also build test WARNING on v7.1-rc1 next-20260429]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Dexuan-Cui/Drivers-hv-vmbus-Improve-the-logc-of-reserving-fb_mmio-on-Gen2-VMs/20260424-033622
base: linus/master
patch link: https://lore.kernel.org/r/20260416183529.838321-1-decui%40microsoft.com
patch subject: [PATCH] Drivers: hv: vmbus: Improve the logc of reserving fb_mmio on Gen2 VMs
config: i386-buildonly-randconfig-002-20260430 (https://download.01.org/0day-ci/archive/20260501/202605010002.dnnxVZFF-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260501/202605010002.dnnxVZFF-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605010002.dnnxVZFF-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> drivers/hv/vmbus_drv.c:2403:40: warning: result of comparison of constant 4294967296 with expression of type 'resource_size_t' (aka 'unsigned int') is always false [-Wtautological-constant-out-of-range-compare]
2403 | if (!low_mmio_base || low_mmio_base >= SZ_4G ||
| ~~~~~~~~~~~~~ ^ ~~~~~
1 warning generated.
vim +2403 drivers/hv/vmbus_drv.c
2385
2386 static void __maybe_unused vmbus_reserve_fb(void)
2387 {
2388 resource_size_t start = 0, size;
2389 resource_size_t low_mmio_base;
2390 struct pci_dev *pdev;
2391
2392 /* Hyper-V CoCo guests do not have a framebuffer device. */
2393 if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
2394 return;
2395
2396 if (efi_enabled(EFI_BOOT)) {
2397 /* Gen2 VM: get FB base from EFI framebuffer */
2398 if (IS_ENABLED(CONFIG_SYSFB)) {
2399 start = sysfb_primary_display.screen.lfb_base;
2400 size = max_t(__u32, sysfb_primary_display.screen.lfb_size, 0x800000);
2401
2402 low_mmio_base = hyperv_mmio->start;
> 2403 if (!low_mmio_base || low_mmio_base >= SZ_4G ||
2404 (start && start < low_mmio_base)) {
2405 pr_warn("Unexpected low mmio base 0x%pa\n", &low_mmio_base);
2406 } else {
2407 /*
2408 * If the kdump kernel's lfb_base is 0,
2409 * fall back to the low mmio base.
2410 */
2411 if (!start)
2412 start = low_mmio_base;
2413 /*
2414 * Reserve half of the space below 4GB for high
2415 * resolutions, but cap the reservation to 128MB.
2416 */
2417 size = min((SZ_4G - start) / 2, SZ_128M);
2418 }
2419 }
2420 } else {
2421 /* Gen1 VM: get FB base from PCI */
2422 pdev = pci_get_device(PCI_VENDOR_ID_MICROSOFT,
2423 PCI_DEVICE_ID_HYPERV_VIDEO, NULL);
2424 if (!pdev)
2425 return;
2426
2427 if (pdev->resource[0].flags & IORESOURCE_MEM) {
2428 start = pci_resource_start(pdev, 0);
2429 size = pci_resource_len(pdev, 0);
2430 }
2431
2432 /*
2433 * Release the PCI device so hyperv_drm driver can grab it
2434 * later.
2435 */
2436 pci_dev_put(pdev);
2437 }
2438
2439 if (!start)
2440 return;
2441
2442 /*
2443 * Make a claim for the frame buffer in the resource tree under the
2444 * first node, which will be the one below 4GB. The length seems to
2445 * be underreported, particularly in a Generation 1 VM. So start out
2446 * reserving a larger area and make it smaller until it succeeds.
2447 */
2448 for (; !fb_mmio && (size >= 0x100000); size >>= 1)
2449 fb_mmio = __request_region(hyperv_mmio, start, size, fb_mmio_name, 0);
2450
2451 pr_info("hv_mmio=%pR,%pR fb=%pR\n", hyperv_mmio, hyperv_mmio->sibling, fb_mmio);
2452 }
2453
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* [PATCH v3] mshv: Simplify GPA map/unmap hypercall helpers
From: Stanislav Kinsburskii @ 2026-04-30 14:52 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
preceding bug-fix patches:
Move "done += completed" before the status checks so that pages mapped
by a partially-successful batch are included in the error cleanup unmap.
Previously these mappings were leaked on failure.
While here, improve type safety and readability:
- Change "int done" to "u64 done" to match the u64 page_count it is
compared against, avoiding signed/unsigned comparison hazards.
- Use u64 for loop iteration and batch size variables consistently.
- Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
- Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
- Simplify the error-path unmap to use "done << large_shift" directly
instead of mutating done in place.
v3: aligned changes to 80 columns
v2: replaced min with min_t
Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_root_hv_call.c | 56 +++++++++++++++-------------------------
1 file changed, 21 insertions(+), 35 deletions(-)
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index e5992c324904a..e1f9e28d5a19b 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
struct hv_input_map_gpa_pages *input_page;
u64 status, *pfnlist;
unsigned long irq_flags, large_shift = 0;
- int ret = 0, done = 0;
- u64 page_count = page_struct_count;
+ u64 done = 0, page_count = page_struct_count;
+ int ret = 0;
if (page_count == 0 || (pages && mmio_spa))
return -EINVAL;
@@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
}
while (done < page_count) {
- ulong i, completed, remain = page_count - done;
- int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
+ u64 i, completed, remain = page_count - done;
+ u64 rep_count = min_t(u64, remain, HV_MAP_GPA_BATCH_SIZE);
local_irq_save(irq_flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -224,23 +224,14 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
input_page->map_flags = flags;
pfnlist = input_page->source_gpa_page_list;
- for (i = 0; i < rep_count; i++)
- if (flags & HV_MAP_GPA_NO_ACCESS) {
+ for (i = 0; i < rep_count; i++) {
+ if (flags & HV_MAP_GPA_NO_ACCESS)
pfnlist[i] = 0;
- } else if (pages) {
- u64 index = (done + i) << large_shift;
-
- if (index >= page_struct_count) {
- ret = -EINVAL;
- break;
- }
- pfnlist[i] = page_to_pfn(pages[index]);
- } else {
+ else if (pages)
+ pfnlist[i] = page_to_pfn(pages[(done + i) <<
+ large_shift]);
+ else
pfnlist[i] = mmio_spa + done + i;
- }
- if (ret) {
- local_irq_restore(irq_flags);
- break;
}
status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
@@ -248,29 +239,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
local_irq_restore(irq_flags);
completed = hv_repcomp(status);
+ done += completed;
if (hv_result_needs_memory(status)) {
ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
HV_MAP_GPA_DEPOSIT_PAGES);
if (ret)
break;
-
} else if (!hv_result_success(status)) {
ret = hv_result_to_errno(status);
break;
}
-
- done += completed;
}
if (ret && done) {
u32 unmap_flags = 0;
- if (flags & HV_MAP_GPA_LARGE_PAGE) {
+ if (flags & HV_MAP_GPA_LARGE_PAGE)
unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
- done <<= large_shift;
- }
- hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
+ hv_call_unmap_gpa_pages(partition_id, gfn,
+ done << large_shift, unmap_flags);
}
return ret;
@@ -305,7 +293,7 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
struct hv_input_unmap_gpa_pages *input_page;
u64 status, page_count = page_count_4k;
unsigned long irq_flags, large_shift = 0;
- int ret = 0, done = 0;
+ u64 done = 0;
if (page_count == 0)
return -EINVAL;
@@ -319,8 +307,8 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
}
while (done < page_count) {
- ulong completed, remain = page_count - done;
- int rep_count = min(remain, HV_UMAP_GPA_PAGES);
+ u64 completed, remain = page_count - done;
+ u64 rep_count = min_t(u64, remain, HV_UMAP_GPA_PAGES);
local_irq_save(irq_flags);
input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -333,15 +321,13 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
local_irq_restore(irq_flags);
completed = hv_repcomp(status);
- if (!hv_result_success(status)) {
- ret = hv_result_to_errno(status);
- break;
- }
-
done += completed;
+
+ if (!hv_result_success(status))
+ return hv_result_to_errno(status);
}
- return ret;
+ return 0;
}
int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
^ permalink raw reply related
* Re: [PATCH v2] mshv: Simplify GPA map/unmap hypercall helpers
From: Stanislav Kinsburskii @ 2026-04-30 14:43 UTC (permalink / raw)
To: Mukesh R
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <61e5d806-b5d5-ab2c-0e09-6def449d5582@linux.microsoft.com>
On Wed, Apr 29, 2026 at 07:06:08PM -0700, Mukesh R wrote:
>
> On 4/29/26 09:48, Stanislav Kinsburskii wrote:
> > Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
> > preceding bug-fix patches:
> >
> > Move "done += completed" before the status checks so that pages mapped
> > by a partially-successful batch are included in the error cleanup unmap.
> > Previously these mappings were leaked on failure.
> >
> > While here, improve type safety and readability:
> > - Change "int done" to "u64 done" to match the u64 page_count it is
> > compared against, avoiding signed/unsigned comparison hazards.
> > - Use u64 for loop iteration and batch size variables consistently.
> > - Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
> > - Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
> > - Simplify the error-path unmap to use "done << large_shift" directly
> > instead of mutating done in place.
> >
>
> what changed in V2?
>
No functional changes: "min" was replaced with "min_t" (reported by
checkpatch.pl).
> > Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
> > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > ---
> > drivers/hv/mshv_root_hv_call.c | 55 +++++++++++++++-------------------------
> > 1 file changed, 20 insertions(+), 35 deletions(-)
> >
> > diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> > index e5992c324904a..1f19a4ca824f0 100644
> > --- a/drivers/hv/mshv_root_hv_call.c
> > +++ b/drivers/hv/mshv_root_hv_call.c
> > @@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > struct hv_input_map_gpa_pages *input_page;
> > u64 status, *pfnlist;
> > unsigned long irq_flags, large_shift = 0;
> > - int ret = 0, done = 0;
> > - u64 page_count = page_struct_count;
> > + u64 done = 0, page_count = page_struct_count;
> > + int ret = 0;
> > if (page_count == 0 || (pages && mmio_spa))
> > return -EINVAL;
> > @@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > }
> > while (done < page_count) {
> > - ulong i, completed, remain = page_count - done;
> > - int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
> > + u64 i, completed, remain = page_count - done;
> > + u64 rep_count = min_t(u64, remain, HV_MAP_GPA_BATCH_SIZE);
> > local_irq_save(irq_flags);
> > input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > @@ -224,23 +224,13 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > input_page->map_flags = flags;
> > pfnlist = input_page->source_gpa_page_list;
> > - for (i = 0; i < rep_count; i++)
> > - if (flags & HV_MAP_GPA_NO_ACCESS) {
> > + for (i = 0; i < rep_count; i++) {
> > + if (flags & HV_MAP_GPA_NO_ACCESS)
> > pfnlist[i] = 0;
> > - } else if (pages) {
> > - u64 index = (done + i) << large_shift;
> > -
> > - if (index >= page_struct_count) {
> > - ret = -EINVAL;
> > - break;
> > - }
> > - pfnlist[i] = page_to_pfn(pages[index]);
> > - } else {
> > + else if (pages)
> > + pfnlist[i] = page_to_pfn(pages[(done + i) << large_shift]);
>
> Entire file is 80 cols, please don't cause this one overflow.
>
Sure. I'll update.
Thanks,
Stanislav
> Thanks,
> -Mukesh
>
>
> > + else
> > pfnlist[i] = mmio_spa + done + i;
> > - }
> > - if (ret) {
> > - local_irq_restore(irq_flags);
> > - break;
> > }
> > status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
> > @@ -248,29 +238,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > local_irq_restore(irq_flags);
> > completed = hv_repcomp(status);
> > + done += completed;
> > if (hv_result_needs_memory(status)) {
> > ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> > HV_MAP_GPA_DEPOSIT_PAGES);
> > if (ret)
> > break;
> > -
> > } else if (!hv_result_success(status)) {
> > ret = hv_result_to_errno(status);
> > break;
> > }
> > -
> > - done += completed;
> > }
> > if (ret && done) {
> > u32 unmap_flags = 0;
> > - if (flags & HV_MAP_GPA_LARGE_PAGE) {
> > + if (flags & HV_MAP_GPA_LARGE_PAGE)
> > unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
> > - done <<= large_shift;
> > - }
> > - hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
> > + hv_call_unmap_gpa_pages(partition_id, gfn,
> > + done << large_shift, unmap_flags);
> > }
> > return ret;
> > @@ -305,7 +292,7 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> > struct hv_input_unmap_gpa_pages *input_page;
> > u64 status, page_count = page_count_4k;
> > unsigned long irq_flags, large_shift = 0;
> > - int ret = 0, done = 0;
> > + u64 done = 0;
> > if (page_count == 0)
> > return -EINVAL;
> > @@ -319,8 +306,8 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> > }
> > while (done < page_count) {
> > - ulong completed, remain = page_count - done;
> > - int rep_count = min(remain, HV_UMAP_GPA_PAGES);
> > + u64 completed, remain = page_count - done;
> > + u64 rep_count = min_t(u64, remain, HV_UMAP_GPA_PAGES);
> > local_irq_save(irq_flags);
> > input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > @@ -333,15 +320,13 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
> > local_irq_restore(irq_flags);
> > completed = hv_repcomp(status);
> > - if (!hv_result_success(status)) {
> > - ret = hv_result_to_errno(status);
> > - break;
> > - }
> > -
> > done += completed;
> > +
> > + if (!hv_result_success(status))
> > + return hv_result_to_errno(status);
> > }
> > - return ret;
> > + return 0;
> > }
> > int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
> >
> >
>
^ permalink raw reply
* Re: [PATCH 00/10] mshv: Bug fixes across the mshv_root module
From: Stanislav Kinsburskii @ 2026-04-30 14:40 UTC (permalink / raw)
To: Mukesh R
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <daacfbcc-e725-65f2-4b20-b4501e45e651@linux.microsoft.com>
On Wed, Apr 29, 2026 at 07:18:44PM -0700, Mukesh R wrote:
> On 4/29/26 11:17, Stanislav Kinsburskii wrote:
> > This series addresses bugs found during a review of the mshv_root module
> > introduced by commit 621191d709b14 ("Drivers: hv: Introduce mshv_root
> > module to expose /dev/mshv to VMMs").
> >
> > The fixes range from data corruption and use-after-free to silent
> > functional failures:
> >
> > - IRQ state leak and type truncation in hypercall helpers
> > (hv_call_modify_spa_host_access)
> > - Integer overflow on userspace-controlled allocation size
> > (mshv_region_create)
> > - Missing locking, broken seqcount read protection, and a check on
> > uninitialized data in the irqfd path — the latter makes
> > level-triggered interrupt resampling completely non-functional
> > - Duplicate GSI 0 detection using the wrong predicate
> > - Use-after-RCU in port ID lookup
> > - Missing VP index bounds check in intercept ISR (OOB in interrupt
> > context)
> > - Missing error code on VP allocation failure (silent success to
> > userspace)
>
> Lot of changes here, curious, how were all these discovered
> suddenly? Stress testing, internal/external? Or reported by
> copilot/sashiko/etc..
>
These are suggested by Claude Opus 4.6.
> How were the fixes tested?
>
I ran the cloud hypervisor integration test suite against these changes,
which covers a wide range of scenarios including interrupt handling,
memory management, and VP lifecycle.
Thanks,
Stanislav
> Thanks,
> -Mukesh
>
>
> > ---
> >
> > Stanislav Kinsburskii (10):
> > mshv: Fix IRQ leak and type hazards in hv_call_modify_spa_host_access
> > mshv: Fix potential integer overflow in mshv_region_create
> > mshv: Fix missing lock in mshv_irqfd_deassign
> > mshv: Fix broken seqcount read protection
> > mshv: Fix level-triggered check on uninitialized data
> > mshv: Fix duplicate GSI detection for GSI 0
> > mshv: Fix use-after-RCU in mshv_portid_lookup
> > mshv: Use kfree_rcu in mshv_portid_free
> > mshv: Add missing vp_index bounds check in intercept ISR
> > mshv: Fix missing error code on VP allocation failure
> >
> >
> > drivers/hv/mshv_eventfd.c | 75 ++++++++++++++++++++++------------------
> > drivers/hv/mshv_irq.c | 2 +
> > drivers/hv/mshv_portid_table.c | 6 +--
> > drivers/hv/mshv_regions.c | 2 +
> > drivers/hv/mshv_root_hv_call.c | 18 +++-------
> > drivers/hv/mshv_root_main.c | 4 ++
> > drivers/hv/mshv_synic.c | 4 ++
> > 7 files changed, 59 insertions(+), 52 deletions(-)
> >
>
^ permalink raw reply
* Re: [PATCH] mshv: Simplify GPA map/unmap hypercall helpers
From: Anirudh Rayabharam @ 2026-04-30 9:57 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <afIgeaLSiCG4f8lW@skinsburskii.localdomain>
On Wed, Apr 29, 2026 at 08:15:05AM -0700, Stanislav Kinsburskii wrote:
> On Wed, Apr 29, 2026 at 11:02:37AM +0000, Anirudh Rayabharam wrote:
> > On Tue, Apr 28, 2026 at 11:21:12PM +0000, Stanislav Kinsburskii wrote:
> > > Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
> > > preceding bug-fix patches:
> > >
> > > Move "done += completed" before the status checks so that pages mapped
> > > by a partially-successful batch are included in the error cleanup unmap.
> > > Previously these mappings were leaked on failure.
> > >
> > > While here, improve type safety and readability:
> > > - Change "int done" to "u64 done" to match the u64 page_count it is
> > > compared against, avoiding signed/unsigned comparison hazards.
> > > - Use u64 for loop iteration and batch size variables consistently.
> > > - Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
> > > - Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
> > > - Simplify the error-path unmap to use "done << large_shift" directly
> > > instead of mutating done in place.
> > >
> > > Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
> > > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > > ---
> > > drivers/hv/mshv_root_hv_call.c | 55 +++++++++++++++-------------------------
> > > 1 file changed, 20 insertions(+), 35 deletions(-)
> > >
> > > diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> > > index e5992c324904a..f5f205a397834 100644
> > > --- a/drivers/hv/mshv_root_hv_call.c
> > > +++ b/drivers/hv/mshv_root_hv_call.c
> > > @@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > > struct hv_input_map_gpa_pages *input_page;
> > > u64 status, *pfnlist;
> > > unsigned long irq_flags, large_shift = 0;
> > > - int ret = 0, done = 0;
> > > - u64 page_count = page_struct_count;
> > > + u64 done = 0, page_count = page_struct_count;
> > > + int ret = 0;
> > >
> > > if (page_count == 0 || (pages && mmio_spa))
> > > return -EINVAL;
> > > @@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > > }
> > >
> > > while (done < page_count) {
> > > - ulong i, completed, remain = page_count - done;
> > > - int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
> > > + u64 i, completed, remain = page_count - done;
> > > + u64 rep_count = min(remain, (u64)HV_MAP_GPA_BATCH_SIZE);
> > >
> > > local_irq_save(irq_flags);
> > > input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > > @@ -224,23 +224,13 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > > input_page->map_flags = flags;
> > > pfnlist = input_page->source_gpa_page_list;
> > >
> > > - for (i = 0; i < rep_count; i++)
> > > - if (flags & HV_MAP_GPA_NO_ACCESS) {
> > > + for (i = 0; i < rep_count; i++) {
> > > + if (flags & HV_MAP_GPA_NO_ACCESS)
> > > pfnlist[i] = 0;
> > > - } else if (pages) {
> > > - u64 index = (done + i) << large_shift;
> > > -
> > > - if (index >= page_struct_count) {
> > > - ret = -EINVAL;
> > > - break;
> > > - }
> > > - pfnlist[i] = page_to_pfn(pages[index]);
> > > - } else {
> > > + else if (pages)
> > > + pfnlist[i] = page_to_pfn(pages[(done + i) << large_shift]);
> > > + else
> > > pfnlist[i] = mmio_spa + done + i;
> > > - }
> > > - if (ret) {
> > > - local_irq_restore(irq_flags);
> > > - break;
> > > }
> > >
> > > status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
> > > @@ -248,29 +238,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
> > > local_irq_restore(irq_flags);
> > >
> > > completed = hv_repcomp(status);
> > > + done += completed;
> > >
> > > if (hv_result_needs_memory(status)) {
> > > ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> > > HV_MAP_GPA_DEPOSIT_PAGES);
> > > if (ret)
> > > break;
> > > -
> > > } else if (!hv_result_success(status)) {
> > > ret = hv_result_to_errno(status);
> > > break;
> > > }
> > > -
> > > - done += completed;
> > > }
> > >
> > > if (ret && done) {
> > > u32 unmap_flags = 0;
> > >
> > > - if (flags & HV_MAP_GPA_LARGE_PAGE) {
> > > + if (flags & HV_MAP_GPA_LARGE_PAGE)
> > > unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
> > > - done <<= large_shift;
> > > - }
> > > - hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
> > > + hv_call_unmap_gpa_pages(partition_id, gfn,
> > > + done << large_shift, unmap_flags);
> >
> > How does this work? Earlier we were doing "done << large_shift" only if
> > HV_MAP_GPA_LARGE_PAGE is set but now we always do it.
> >
>
> It works because large_shift is initialized to 0 when
> HV_MAP_GPA_LARGE_PAGE is not set.
Oh I see.
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
^ permalink raw reply
* [PATCH net-next v2] net: mana: hardening: Reject zero max_num_queues from MANA_QUERY_VPORT_CONFIG
From: Erni Sri Satya Vennela @ 2026-04-30 8:56 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, ernis, dipayanroy, shirazsaleem, kees,
linux-hyperv, netdev, linux-kernel
As a part of MANA hardening for CVM, validate that max_num_sq and
max_num_rq returned by MANA_QUERY_VPORT_CONFIG are not zero. These
values flow into apc->num_queues, which is used as an allocation count
and loop bound. A zero value would result in zero-size allocations and
incorrect driver behavior.
Return -EPROTO if either value is zero.
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v2:
* Rebase to latest main.
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a654b3699c4c..7c83e010a1e6 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1262,6 +1262,12 @@ static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
*max_sq = resp.max_num_sq;
*max_rq = resp.max_num_rq;
+
+ if (*max_sq == 0 || *max_rq == 0) {
+ netdev_err(apc->ndev, "Invalid max queues from vPort config\n");
+ return -EPROTO;
+ }
+
if (resp.num_indirection_ent > 0 &&
resp.num_indirection_ent <= MANA_INDIRECT_TABLE_MAX_SIZE &&
is_power_of_2(resp.num_indirection_ent)) {
--
2.34.1
^ permalink raw reply related
* [PATCH net-next v2] net: mana: hardening: Reject zero max_num_queues from GDMA_QUERY_MAX_RESOURCES
From: Erni Sri Satya Vennela @ 2026-04-30 8:36 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, horms, shradhagupta, dipayanroy, ernis,
yury.norov, linux-hyperv, netdev, linux-kernel
In a CVM environment, hardware responses cannot be trusted. The
GDMA_QUERY_MAX_RESOURCES command returns resource limits used to
determine the maximum number of queues.
In mana_gd_query_max_resources(), gc->max_num_queues is initialized
from num_online_cpus() and successively clamped by the hardware-reported
max_eq, max_cq, max_sq, max_rq, and num_msix_usable values. If any of
these hardware values is zero, gc->max_num_queues becomes zero and the
function returns success. This leads to a confusing failure later when
alloc_etherdev_mq() is called with zero queues, returning NULL and
producing a misleading -ENOMEM error.
Add an explicit zero check for gc->max_num_queues after all clamping
steps and return -ENOSPC for a clear early failure, consistent with the
existing gc->num_msix_usable <= 1 guard.
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v2:
* Rebase to latest main.
---
drivers/net/ethernet/microsoft/mana/gdma_main.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 098fbda0d128..f3316e929175 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -194,6 +194,9 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
if (gc->max_num_queues > gc->num_msix_usable - 1)
gc->max_num_queues = gc->num_msix_usable - 1;
+ if (gc->max_num_queues == 0)
+ return -ENOSPC;
+
return 0;
}
--
2.34.1
^ permalink raw reply related
* [PATCH net-next v7] net: mana: Expose hardware diagnostic info via debugfs
From: Erni Sri Satya Vennela @ 2026-04-30 7:53 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, andrew+netdev, davem,
edumazet, kuba, pabeni, kotaranov, horms, shradhagupta, ernis,
dipayanroy, yury.norov, shirazsaleem, kees, linux-hyperv, netdev,
linux-kernel, linux-rdma
Add debugfs entries to expose hardware configuration and diagnostic
information that aids in debugging driver initialization and runtime
operations without adding noise to dmesg.
The debugfs directory for each PCI device is named using pci_name()
(the unique BDF address), and its creation and removal is integrated
into mana_gd_setup() and mana_gd_cleanup_device() respectively, so
that all callers (probe, remove, suspend, resume, shutdown) share a
single code path.
Device-level entries (under /sys/kernel/debug/mana/<BDF>/):
- num_msix_usable, max_num_queues: Max resources from hardware
- gdma_protocol_ver, pf_cap_flags1: VF version negotiation results
- num_vports, bm_hostmode: Device configuration
Per-vPort entries (under /sys/kernel/debug/mana/<BDF>/vportN/):
- port_handle: Hardware vPort handle
- max_sq, max_rq: Max queues from vPort config
- indir_table_sz: Indirection table size
- steer_rx, steer_rss, steer_update_tab, steer_cqe_coalescing:
Last applied steering configuration parameters
Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
---
Changes in v7:
* Rebase to latest main.
Changes in v6:
* Move out of patchset and create a separate patch.
Changes in v5:
* Update commit message.
* Fix conflicts to align with the new patches.
* Make it part of patchset.
Changes in v4:
* Rebase and fix conflicts.
Changes in v3:
* Rename mana_gd_cleanup to mana_gd_cleanup_device.
* Add creation of debugfs entries in mana_gd_setup.
* Add removal of debugfs entries in mana_gd_cleanup_device.
* Remove bm_hostmode and num_vports from debugfs in mana_remove itself,
because "ac" gets freed before debugfs_remove_recursive, to avoid
Use-After-Free error.
* Add "goto out:" in mana_cfg_vport_steering to avoid populating apc
values when resp.hdr.status is non-zero.
Changes in v2:
* Add debugfs_remove_recursive for gc->mana_pci_debugfs in
mana_gd_suspend to handle multiple duplicates creation in
mana_gd_setup and mana_gd_resume path.
* Move debugfs creation for num_vports and bm_hostmode out of
if(!resuming) condition since we have to create it again even for
resume.
* Recreate mana_pci_debugfs in mana_gd_resume.
---
.../net/ethernet/microsoft/mana/gdma_main.c | 68 +++++++++++--------
drivers/net/ethernet/microsoft/mana/mana_en.c | 33 +++++++++
include/net/mana/gdma.h | 1 +
include/net/mana/mana.h | 8 +++
4 files changed, 81 insertions(+), 29 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 098fbda0d128..33fd7d9259c9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -194,6 +194,11 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
if (gc->max_num_queues > gc->num_msix_usable - 1)
gc->max_num_queues = gc->num_msix_usable - 1;
+ debugfs_create_u32("num_msix_usable", 0400, gc->mana_pci_debugfs,
+ &gc->num_msix_usable);
+ debugfs_create_u32("max_num_queues", 0400, gc->mana_pci_debugfs,
+ &gc->max_num_queues);
+
return 0;
}
@@ -1264,6 +1269,13 @@ int mana_gd_verify_vf_version(struct pci_dev *pdev)
return err ? err : -EPROTO;
}
gc->pf_cap_flags1 = resp.pf_cap_flags1;
+ gc->gdma_protocol_ver = resp.gdma_protocol_ver;
+
+ debugfs_create_x64("gdma_protocol_ver", 0400, gc->mana_pci_debugfs,
+ &gc->gdma_protocol_ver);
+ debugfs_create_x64("pf_cap_flags1", 0400, gc->mana_pci_debugfs,
+ &gc->pf_cap_flags1);
+
if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) {
err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout);
if (err) {
@@ -1943,15 +1955,20 @@ static int mana_gd_setup(struct pci_dev *pdev)
struct gdma_context *gc = pci_get_drvdata(pdev);
int err;
+ gc->mana_pci_debugfs = debugfs_create_dir(pci_name(pdev),
+ mana_debugfs_root);
+
err = mana_gd_init_registers(pdev);
if (err)
- return err;
+ goto remove_debugfs;
mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base);
gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0);
- if (!gc->service_wq)
- return -ENOMEM;
+ if (!gc->service_wq) {
+ err = -ENOMEM;
+ goto remove_debugfs;
+ }
err = mana_gd_setup_hwc_irqs(pdev);
if (err) {
@@ -1992,11 +2009,14 @@ static int mana_gd_setup(struct pci_dev *pdev)
free_workqueue:
destroy_workqueue(gc->service_wq);
gc->service_wq = NULL;
+remove_debugfs:
+ debugfs_remove_recursive(gc->mana_pci_debugfs);
+ gc->mana_pci_debugfs = NULL;
dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err);
return err;
}
-static void mana_gd_cleanup(struct pci_dev *pdev)
+static void mana_gd_cleanup_device(struct pci_dev *pdev)
{
struct gdma_context *gc = pci_get_drvdata(pdev);
@@ -2008,6 +2028,10 @@ static void mana_gd_cleanup(struct pci_dev *pdev)
destroy_workqueue(gc->service_wq);
gc->service_wq = NULL;
}
+
+ debugfs_remove_recursive(gc->mana_pci_debugfs);
+ gc->mana_pci_debugfs = NULL;
+
dev_dbg(&pdev->dev, "mana gdma cleanup successful\n");
}
@@ -2065,9 +2089,6 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
gc->dev = &pdev->dev;
xa_init(&gc->irq_contexts);
- gc->mana_pci_debugfs = debugfs_create_dir(pci_name(pdev),
- mana_debugfs_root);
-
err = mana_gd_setup(pdev);
if (err)
goto unmap_bar;
@@ -2096,16 +2117,8 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
cleanup_mana:
mana_remove(&gc->mana, false);
cleanup_gd:
- mana_gd_cleanup(pdev);
+ mana_gd_cleanup_device(pdev);
unmap_bar:
- /*
- * at this point we know that the other debugfs child dir/files
- * are either not yet created or are already cleaned up.
- * The pci debugfs folder clean-up now, will only be cleaning up
- * adapter-MTU file and apc->mana_pci_debugfs folder.
- */
- debugfs_remove_recursive(gc->mana_pci_debugfs);
- gc->mana_pci_debugfs = NULL;
xa_destroy(&gc->irq_contexts);
pci_iounmap(pdev, bar0_va);
free_gc:
@@ -2155,11 +2168,7 @@ static void mana_gd_remove(struct pci_dev *pdev)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, false);
- mana_gd_cleanup(pdev);
-
- debugfs_remove_recursive(gc->mana_pci_debugfs);
-
- gc->mana_pci_debugfs = NULL;
+ mana_gd_cleanup_device(pdev);
xa_destroy(&gc->irq_contexts);
@@ -2181,7 +2190,7 @@ int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
- mana_gd_cleanup(pdev);
+ mana_gd_cleanup_device(pdev);
return 0;
}
@@ -2201,13 +2210,18 @@ int mana_gd_resume(struct pci_dev *pdev)
err = mana_probe(&gc->mana, true);
if (err)
- return err;
+ goto cleanup_gd;
err = mana_rdma_probe(&gc->mana_ib);
if (err)
- return err;
+ goto cleanup_mana;
return 0;
+cleanup_mana:
+ mana_remove(&gc->mana, true);
+cleanup_gd:
+ mana_gd_cleanup_device(pdev);
+ return err;
}
/* Quiesce the device for kexec. This is also called upon reboot/shutdown. */
@@ -2220,11 +2234,7 @@ static void mana_gd_shutdown(struct pci_dev *pdev)
mana_rdma_remove(&gc->mana_ib);
mana_remove(&gc->mana, true);
- mana_gd_cleanup(pdev);
-
- debugfs_remove_recursive(gc->mana_pci_debugfs);
-
- gc->mana_pci_debugfs = NULL;
+ mana_gd_cleanup_device(pdev);
pci_disable_device(pdev);
}
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a654b3699c4c..077d3a1ff6bf 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1276,6 +1276,9 @@ static int mana_query_vport_cfg(struct mana_port_context *apc, u32 vport_index,
apc->port_handle = resp.vport;
ether_addr_copy(apc->mac_addr, resp.mac_addr);
+ apc->vport_max_sq = *max_sq;
+ apc->vport_max_rq = *max_rq;
+
return 0;
}
@@ -1430,6 +1433,11 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc,
netdev_info(ndev, "Configured steering vPort %llu entries %u\n",
apc->port_handle, apc->indir_table_sz);
+
+ apc->steer_rx = rx;
+ apc->steer_rss = apc->rss_state;
+ apc->steer_update_tab = update_tab;
+ apc->steer_cqe_coalescing = req->cqe_coalescing_enable;
out:
kfree(req);
return err;
@@ -3161,6 +3169,23 @@ static int mana_init_port(struct net_device *ndev)
eth_hw_addr_set(ndev, apc->mac_addr);
sprintf(vport, "vport%d", port_idx);
apc->mana_port_debugfs = debugfs_create_dir(vport, gc->mana_pci_debugfs);
+
+ debugfs_create_u64("port_handle", 0400, apc->mana_port_debugfs,
+ &apc->port_handle);
+ debugfs_create_u32("max_sq", 0400, apc->mana_port_debugfs,
+ &apc->vport_max_sq);
+ debugfs_create_u32("max_rq", 0400, apc->mana_port_debugfs,
+ &apc->vport_max_rq);
+ debugfs_create_u32("indir_table_sz", 0400, apc->mana_port_debugfs,
+ &apc->indir_table_sz);
+ debugfs_create_u32("steer_rx", 0400, apc->mana_port_debugfs,
+ &apc->steer_rx);
+ debugfs_create_u32("steer_rss", 0400, apc->mana_port_debugfs,
+ &apc->steer_rss);
+ debugfs_create_u32("steer_update_tab", 0400, apc->mana_port_debugfs,
+ &apc->steer_update_tab);
+ debugfs_create_u32("steer_cqe_coalescing", 0400, apc->mana_port_debugfs,
+ &apc->steer_cqe_coalescing);
debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs,
&apc->speed);
return 0;
@@ -3659,6 +3684,11 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
ac->bm_hostmode = bm_hostmode;
+ debugfs_create_u16("num_vports", 0400, gc->mana_pci_debugfs,
+ &ac->num_ports);
+ debugfs_create_u8("bm_hostmode", 0400, gc->mana_pci_debugfs,
+ &ac->bm_hostmode);
+
if (!resuming) {
ac->num_ports = num_ports;
} else {
@@ -3800,6 +3830,9 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
mana_gd_deregister_device(gd);
+ debugfs_lookup_and_remove("bm_hostmode", gc->mana_pci_debugfs);
+ debugfs_lookup_and_remove("num_vports", gc->mana_pci_debugfs);
+
if (suspending)
return;
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6d836060976a..70d62bc32837 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -442,6 +442,7 @@ struct gdma_context {
struct gdma_dev mana_ib;
u64 pf_cap_flags1;
+ u64 gdma_protocol_ver;
struct workqueue_struct *service_wq;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 8f721cd4e4a7..18215388d2c7 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -568,6 +568,14 @@ struct mana_port_context {
/* Debugfs */
struct dentry *mana_port_debugfs;
+
+ /* Cached vport/steering config for debugfs */
+ u32 vport_max_sq;
+ u32 vport_max_rq;
+ u32 steer_rx;
+ u32 steer_rss;
+ u32 steer_update_tab;
+ u32 steer_cqe_coalescing;
};
netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev);
--
2.43.0
^ permalink raw reply related
* Re: [PATCH 0/8] firmware: sysfb: Consolidate config/code wrt. sysfb_primary_screen
From: Thomas Zimmermann @ 2026-04-30 6:35 UTC (permalink / raw)
To: patchwork-bot+linux-riscv
Cc: linux-riscv, javierm, arnd, ardb, ilias.apalodimas, chenhuacai,
kernel, maarten.lankhorst, mripard, airlied, simona, kys,
haiyangz, wei.liu, decui, longli, deller, linux-arm-kernel,
loongarch, linux-efi, dri-devel, linux-hyperv, linux-fbdev
In-Reply-To: <177751955329.2274119.12779807302343885295.git-patchwork-notify@kernel.org>
Hi
Am 30.04.26 um 05:25 schrieb patchwork-bot+linux-riscv@kernel.org:
> Hello:
>
> This series was applied to riscv/linux.git (fixes)
> by Ard Biesheuvel <ardb@kernel.org>:
Patch 3 was fairly controversial.
Best regards
Thomas
>
> On Thu, 2 Apr 2026 11:09:14 +0200 you wrote:
>> The global state sysfb_primary_screen holds information about the
>> framebuffer provided by EFI/BIOS systems. It is part of the sysfb
>> module, but used in several places without direct connection to
>> sysfb. Fix this by making users of sysfb_primary_screen depend on
>> CONFIG_SYSFB. Fix a few issues in the process.
>>
>> Patches 1 and 2 fix general errors in the Kconfig rules. In any case,
>> these patches should be considered even without the rest of the series.
>>
>> [...]
> Here is the summary with links:
> - [1/8] hv: Select CONFIG_SYSFB only for CONFIG_HYPERV_VMBUS
> https://git.kernel.org/riscv/c/d33db956c961
> - [2/8] firmware: efi: Never declare sysfb_primary_display on x86
> https://git.kernel.org/riscv/c/5241c2ca33bb
> - [3/8] firmware: sysfb: Make CONFIG_SYSFB a user-selectable option
> (no matching commit)
> - [4/8] firmware: sysfb: Split sysfb.c into sysfb_primary.c and sysfb_pci.c
> (no matching commit)
> - [5/8] firmware: sysfb: Implement screen_info relocation for primary display
> (no matching commit)
> - [6/8] firmware: sysfb: Avoid forward-declaring sysfb_parent_dev()
> (no matching commit)
> - [7/8] firmware: efi: Make CONFIG_EFI_EARLYCON depend on CONFIG_SYSFB; clean up
> (no matching commit)
> - [8/8] firmware: sysfb: Move CONFIG_FIRMWARE_EDID to firmware options
> (no matching commit)
>
> You are awesome, thank you!
--
--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstr. 146, 90461 Nürnberg, Germany, www.suse.com
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich, (HRB 36809, AG Nürnberg)
^ permalink raw reply
* Re: [PATCH 3/3] net: mana: remove double CQ cleanup in mana_create_rxq error path
From: Aditya Garg @ 2026-04-30 4:14 UTC (permalink / raw)
To: Dipayaan Roy, kys, haiyangz, wei.liu, decui, andrew+netdev, davem,
edumazet, kuba, pabeni, leon, longli, kotaranov, horms,
shradhagupta, ssengar, ernis, shirazsaleem, linux-hyperv, netdev,
linux-kernel, linux-rdma, stephen, jacob.e.keller, dipayanroy,
leitao, kees, john.fastabend, hawk, bpf, daniel, ast, sdf,
yury.norov
In-Reply-To: <20260430035935.1859220-4-dipayanroy@linux.microsoft.com>
On 30-04-2026 09:27, Dipayaan Roy wrote:
> In mana_create_rxq(), the error cleanup path calls mana_destroy_rxq()
> followed by mana_deinit_cq(). This is incorrect for two reasons:
>
> 1. mana_destroy_rxq() already calls mana_deinit_cq() internally,
> so the CQ's GDMA queue is destroyed twice.
>
> 2. mana_destroy_rxq() frees the rxq via kfree(rxq) before returning.
> The subsequent mana_deinit_cq(apc, cq) then operates on freed memory
> since cq points to &rxq->rx_cq, which is embedded in the
> already-freed rxq structure — a use-after-free.
>
> Remove the redundant mana_deinit_cq() call from the error path since
> mana_destroy_rxq() already handles CQ cleanup. mana_deinit_cq() is
> itself safe for an uninitialized CQ as it checks for a NULL gdma_cq
> before proceeding.
>
> Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
> Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
> ---
> drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ---
> 1 file changed, 3 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index f2a6ea162dc3..9afc786b297a 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -2799,9 +2799,6 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
>
> mana_destroy_rxq(apc, rxq, false);
>
> - if (cq)
> - mana_deinit_cq(apc, cq);
> -
> return NULL;
> }
>
Reviewed-by: Aditya Garg <gargaditya@linux.microsoft.com>
^ permalink raw reply
* [PATCH 3/3] net: mana: remove double CQ cleanup in mana_create_rxq error path
From: Dipayaan Roy @ 2026-04-30 3:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260430035935.1859220-1-dipayanroy@linux.microsoft.com>
In mana_create_rxq(), the error cleanup path calls mana_destroy_rxq()
followed by mana_deinit_cq(). This is incorrect for two reasons:
1. mana_destroy_rxq() already calls mana_deinit_cq() internally,
so the CQ's GDMA queue is destroyed twice.
2. mana_destroy_rxq() frees the rxq via kfree(rxq) before returning.
The subsequent mana_deinit_cq(apc, cq) then operates on freed memory
since cq points to &rxq->rx_cq, which is embedded in the
already-freed rxq structure — a use-after-free.
Remove the redundant mana_deinit_cq() call from the error path since
mana_destroy_rxq() already handles CQ cleanup. mana_deinit_cq() is
itself safe for an uninitialized CQ as it checks for a NULL gdma_cq
before proceeding.
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index f2a6ea162dc3..9afc786b297a 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2799,9 +2799,6 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc,
mana_destroy_rxq(apc, rxq, false);
- if (cq)
- mana_deinit_cq(apc, cq);
-
return NULL;
}
--
2.43.0
^ permalink raw reply related
* [PATCH 2/3] net: mana: Skip WQ object destruction for uninitialized RXQ
From: Dipayaan Roy @ 2026-04-30 3:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260430035935.1859220-1-dipayanroy@linux.microsoft.com>
In mana_destroy_rxq(), mana_destroy_wq_obj() is called unconditionally
even when the WQ object was never created (rxobj is still
INVALID_MANA_HANDLE). When mana_create_rxq() fails before
mana_create_wq_obj() succeeds, the error path calls mana_destroy_rxq()
which sends a bogus destroy command to the hardware:
mana 7870:00:00.0: HWC: Failed hw_channel req: 0x1d
mana 7870:00:00.0: Failed to send mana message: -71, 0x1d
mana 7870:00:00.0 eth7: Failed to destroy WQ object: -71
Guard mana_destroy_wq_obj() with an INVALID_MANA_HANDLE check so that
mana_destroy_rxq() is safe to call at any stage of RXQ initialization.
Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index dfb4ba9f7664..f2a6ea162dc3 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2524,7 +2524,8 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
if (xdp_rxq_info_is_reg(&rxq->xdp_rxq))
xdp_rxq_info_unreg(&rxq->xdp_rxq);
- mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj);
+ if (rxq->rxobj != INVALID_MANA_HANDLE)
+ mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj);
mana_deinit_cq(apc, &rxq->rx_cq);
--
2.43.0
^ permalink raw reply related
* [PATCH 1/3] net: mana: check xdp_rxq registration before unreg in mana_destroy_rxq()
From: Dipayaan Roy @ 2026-04-30 3:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260430035935.1859220-1-dipayanroy@linux.microsoft.com>
When mana_create_rxq() fails at mana_create_wq_obj() or any step before
xdp_rxq_info_reg() is called, the error path jumps to `out:` which calls
mana_destroy_rxq(). mana_destroy_rxq() unconditionally calls
xdp_rxq_info_unreg() on an xdp_rxq that was never registered,
triggering a WARN_ON in net/core/xdp.c:
mana 7870:00:00.0: HWC: Failed hw_channel req: 0xc000009a
mana 7870:00:00.0 eth7: Failed to create RXQ: err = -71
Driver BUG
WARNING: CPU: 442 PID: 491615 at ../net/core/xdp.c:150 xdp_rxq_info_unreg+0x44/0x70
Modules linked in: tcp_bbr xsk_diag udp_diag raw_diag unix_diag af_packet_diag netlink_diag nf_tables nfnetlink tcp_diag inet_diag binfmt_misc rpcsec_gss_krb5 nfsv3 nfs_acl auth_rpcgss nfsv4 dns_resolver nfs lockd ext4 grace crc16 iscsi_tcp mbcache fscache libiscsi_tcp jbd2 netfs rpcrdma af_packet sunrpc rdma_ucm ib_iser rdma_cm iw_cm iscsi_ibft ib_cm iscsi_boot_sysfs libiscsi rfkill scsi_transport_iscsi mana_ib ib_uverbs ib_core mana hyperv_drm(X) drm_shmem_helper intel_rapl_msr drm_kms_helper intel_rapl_common syscopyarea nls_iso8859_1 sysfillrect intel_uncore_frequency_common nls_cp437 vfat fat nfit sysimgblt libnvdimm hv_netvsc(X) hv_utils(X) fb_sys_fops hv_balloon(X) joydev fuse drm dm_mod configfs ip_tables x_tables xfs libcrc32c sd_mod nvme nvme_core nvme_common t10_pi crc64_rocksoft_generic crc64_rocksoft crc64 hid_generic serio_raw pci_hyperv(X) hv_storvsc(X) scsi_transport_fc hyperv_keyboard(X) hid_hyperv(X) pci_hyperv_intf(X) crc32_pclmul
crc32c_intel ghash_clmulni_intel aesni_intel crypto_simd cryptd hv_vmbus(X) softdog sg scsi_mod efivarfs
Supported: Yes, External
CPU: 442 PID: 491615 Comm: ethtool Kdump: loaded Tainted: G X 5.14.21-150500.55.136-default #1 SLE15-SP5 a627be1b53abbfd64ad16b2685e4308c52847f42
Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 07/25/2025
RIP: 0010:xdp_rxq_info_unreg+0x44/0x70
Code: e8 91 fe ff ff c7 43 0c 02 00 00 00 48 c7 03 00 00 00 00 5b c3 cc cc cc cc e9 58 3a 1c 00 48 c7 c7 f6 5f 19 97 e8 5c a4 7e ff <0f> 0b 83 7b 0c 01 74 ca 48 c7 c7 d9 5f 19 97 e8 48 a4 7e ff 0f 0b
RSP: 0018:ff3df6c8f7207818 EFLAGS: 00010286
RAX: 0000000000000000 RBX: ff30d89f94808a80 RCX: 0000000000000027
RDX: 0000000000000000 RSI: 0000000000000002 RDI: ff30d94bdcca2908
RBP: 0000000000080000 R08: ffffffff98ed11a0 R09: ff3df6c8f72077a0
R10: dead000000000100 R11: 000000000000000a R12: 0000000000000000
R13: 0000000000002000 R14: 0000000000040000 R15: ff30d89f94800000
FS: 00007fe6d8432b80(0000) GS:ff30d94bdcc80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fe6d81a89b1 CR3: 00000b3b6d578001 CR4: 0000000000371ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
Call Trace:
<TASK>
mana_destroy_rxq+0x5b/0x2f0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
mana_create_rxq.isra.55+0x3db/0x720 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
? simple_lookup+0x36/0x50
? current_time+0x42/0x80
? __d_free_external+0x30/0x30
mana_alloc_queues+0x32a/0x470 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
? _raw_spin_unlock+0xa/0x30
? d_instantiate.part.29+0x2e/0x40
? _raw_spin_unlock+0xa/0x30
? debugfs_create_dir+0xe4/0x140
mana_attach+0x5c/0xf0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
mana_set_ringparam+0xd5/0x1a0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94]
ethnl_set_rings+0x292/0x320
genl_family_rcv_msg_doit.isra.15+0x11b/0x150
genl_rcv_msg+0xe3/0x1e0
? rings_prepare_data+0x80/0x80
? genl_family_rcv_msg_doit.isra.15+0x150/0x150
netlink_rcv_skb+0x50/0x100
genl_rcv+0x24/0x40
netlink_unicast+0x1b6/0x280
netlink_sendmsg+0x365/0x4d0
sock_sendmsg+0x5f/0x70
__sys_sendto+0x112/0x140
__x64_sys_sendto+0x24/0x30
do_syscall_64+0x5b/0x80
? handle_mm_fault+0xd7/0x290
? do_user_addr_fault+0x2d8/0x740
? exc_page_fault+0x67/0x150
entry_SYSCALL_64_after_hwframe+0x6b/0xd5
RIP: 0033:0x7fe6d8122f06
Code: 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 11 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 72 f3 c3 41 57 41 56 4d 89 c7 41 55 41 54 41
RSP: 002b:00007fff2b66b068 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 000055771123d2a0 RCX: 00007fe6d8122f06
RDX: 0000000000000034 RSI: 000055771123d3b0 RDI: 0000000000000003
RBP: 00007fff2b66b100 R08: 00007fe6d8203360 R09: 000000000000000c
R10: 0000000000000000 R11: 0000000000000246 R12: 000055771123d350
R13: 000055771123d340 R14: 0000000000000000 R15: 00007fff2b66b2b0
</TASK>
Guard the xdp_rxq_info_unreg() call with xdp_rxq_info_is_reg() so that
mana_destroy_rxq() is safe to call regardless of how far initialization
progressed.
Fixes: ed5356b53f07 ("net: mana: Add XDP support")
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
drivers/net/ethernet/microsoft/mana/mana_en.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a654b3699c4c..dfb4ba9f7664 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -2520,7 +2520,9 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
napi_disable_locked(napi);
netif_napi_del_locked(napi);
}
- xdp_rxq_info_unreg(&rxq->xdp_rxq);
+
+ if (xdp_rxq_info_is_reg(&rxq->xdp_rxq))
+ xdp_rxq_info_unreg(&rxq->xdp_rxq);
mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj);
--
2.43.0
^ permalink raw reply related
* [PATCH 0/3] net: mana: Fix mana_destroy_rxq() cleanup for partial RXQ init
From: Dipayaan Roy @ 2026-04-30 3:57 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
When mana_create_rxq() fails partway through initialization (e.g. the
hardware rejects the WQ object creation), the error path calls
mana_destroy_rxq() to tear down a partially-initialized RXQ.
This exposed several problems in the mana_destroy_rxq() path, which
assumed the RXQ was always fully initialized:
1. xdp_rxq_info_unreg() was called on an unregistered xdp_rxq,
triggering a WARN_ON ("Driver BUG") in net/core/xdp.c.
2. mana_destroy_wq_obj() was called with INVALID_MANA_HANDLE,
sending a bogus destroy command to the hardware.
3. mana_deinit_cq() was called twice — once inside mana_destroy_rxq()
and again in mana_create_rxq()'s error path — causing a
use-after-free since mana_destroy_rxq() frees the rxq first.
This was observed during ethtool ring parameter changes when the
hardware returned an error creating the RXQ. This series makes
mana_destroy_rxq() safe to call at any stage of RXQ initialization
by guarding each teardown step, and removes the redundant cleanup
in mana_create_rxq().
Dipayaan Roy (3):
net: mana: check xdp_rxq registration before unreg in
mana_destroy_rxq()
net: mana: Skip WQ object destruction for uninitialized RXQ
net: mana: remove double CQ cleanup in mana_create_rxq error path
drivers/net/ethernet/microsoft/mana/mana_en.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
--
2.43.0
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox