Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH V3 02/11] x86/hyperv: Cosmetic changes in irqdomain.c for readability
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

Make cosmetic changes:
 o Rename struct pci_dev *dev to *pdev since there are cases of
   struct device *dev in the file and all over the kernel
 o Rename hv_build_pci_dev_id to hv_build_devid_type_pci in anticipation
   of building different types of device IDs
 o Fix checkpatch.pl issues with return and extraneous printk
 o Replace spaces with tabs
 o Rename struct hv_devid *xxx to struct hv_devid *hv_devid given code
   paths involve many types of device IDs
 o Fix indentation in a large if block by using goto.

There are no functional changes.

Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 arch/x86/hyperv/irqdomain.c | 198 +++++++++++++++++++-----------------
 1 file changed, 104 insertions(+), 94 deletions(-)

diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 365e364268d9..b3ad50a874dc 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -1,5 +1,4 @@
 // SPDX-License-Identifier: GPL-2.0
-
 /*
  * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor.
  *
@@ -14,8 +13,8 @@
 #include <linux/irqchip/irq-msi-lib.h>
 #include <asm/mshyperv.h>
 
-static int hv_map_interrupt(union hv_device_id device_id, bool level,
-		int cpu, int vector, struct hv_interrupt_entry *entry)
+static int hv_map_interrupt(union hv_device_id hv_devid, bool level,
+		int cpu, int vector, struct hv_interrupt_entry *ret_entry)
 {
 	struct hv_input_map_device_interrupt *input;
 	struct hv_output_map_device_interrupt *output;
@@ -32,7 +31,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
 	intr_desc = &input->interrupt_descriptor;
 	memset(input, 0, sizeof(*input));
 	input->partition_id = hv_current_partition_id;
-	input->device_id = device_id.as_uint64;
+	input->device_id = hv_devid.as_uint64;
 	intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
 	intr_desc->vector_count = 1;
 	intr_desc->target.vector = vector;
@@ -44,7 +43,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
 
 	intr_desc->target.vp_set.valid_bank_mask = 0;
 	intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
-	nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu));
+	nr_bank = cpumask_to_vpset(&intr_desc->target.vp_set, cpumask_of(cpu));
 	if (nr_bank < 0) {
 		local_irq_restore(flags);
 		pr_err("%s: unable to generate VP set\n", __func__);
@@ -61,7 +60,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
 
 	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
 			input, output);
-	*entry = output->interrupt_entry;
+	*ret_entry = output->interrupt_entry;
 
 	local_irq_restore(flags);
 
@@ -71,21 +70,19 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
 	return hv_result_to_errno(status);
 }
 
-static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
+static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry)
 {
 	unsigned long flags;
 	struct hv_input_unmap_device_interrupt *input;
-	struct hv_interrupt_entry *intr_entry;
 	u64 status;
 
 	local_irq_save(flags);
 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
 
 	memset(input, 0, sizeof(*input));
-	intr_entry = &input->interrupt_entry;
 	input->partition_id = hv_current_partition_id;
 	input->device_id = id;
-	*intr_entry = *old_entry;
+	input->interrupt_entry = *irq_entry;
 
 	status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
 	local_irq_restore(flags);
@@ -115,67 +112,71 @@ static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
 	return 0;
 }
 
-static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev)
+static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
 {
-	union hv_device_id dev_id;
+	int pos;
+	union hv_device_id hv_devid;
 	struct rid_data data = {
 		.bridge = NULL,
-		.rid = PCI_DEVID(dev->bus->number, dev->devfn)
+		.rid = PCI_DEVID(pdev->bus->number, pdev->devfn)
 	};
 
-	pci_for_each_dma_alias(dev, get_rid_cb, &data);
+	pci_for_each_dma_alias(pdev, get_rid_cb, &data);
 
-	dev_id.as_uint64 = 0;
-	dev_id.device_type = HV_DEVICE_TYPE_PCI;
-	dev_id.pci.segment = pci_domain_nr(dev->bus);
+	hv_devid.as_uint64 = 0;
+	hv_devid.device_type = HV_DEVICE_TYPE_PCI;
+	hv_devid.pci.segment = pci_domain_nr(pdev->bus);
 
-	dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid);
-	dev_id.pci.bdf.device = PCI_SLOT(data.rid);
-	dev_id.pci.bdf.function = PCI_FUNC(data.rid);
-	dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
+	hv_devid.pci.bdf.bus = PCI_BUS_NUM(data.rid);
+	hv_devid.pci.bdf.device = PCI_SLOT(data.rid);
+	hv_devid.pci.bdf.function = PCI_FUNC(data.rid);
+	hv_devid.pci.source_shadow = HV_SOURCE_SHADOW_NONE;
 
-	if (data.bridge) {
-		int pos;
+	if (data.bridge == NULL)
+		goto out;
 
-		/*
-		 * Microsoft Hypervisor requires a bus range when the bridge is
-		 * running in PCI-X mode.
-		 *
-		 * To distinguish conventional vs PCI-X bridge, we can check
-		 * the bridge's PCI-X Secondary Status Register, Secondary Bus
-		 * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
-		 * Specification Revision 1.0 5.2.2.1.3.
-		 *
-		 * Value zero means it is in conventional mode, otherwise it is
-		 * in PCI-X mode.
-		 */
+	/*
+	 * Microsoft Hypervisor requires a bus range when the bridge is
+	 * running in PCI-X mode.
+	 *
+	 * To distinguish conventional vs PCI-X bridge, we can check
+	 * the bridge's PCI-X Secondary Status Register, Secondary Bus
+	 * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge
+	 * Specification Revision 1.0 5.2.2.1.3.
+	 *
+	 * Value zero means it is in conventional mode, otherwise it is
+	 * in PCI-X mode.
+	 */
 
-		pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
-		if (pos) {
-			u16 status;
+	pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX);
+	if (pos) {
+		u16 status;
 
-			pci_read_config_word(data.bridge, pos +
-					PCI_X_BRIDGE_SSTATUS, &status);
+		pci_read_config_word(data.bridge, pos + PCI_X_BRIDGE_SSTATUS,
+				     &status);
 
-			if (status & PCI_X_SSTATUS_FREQ) {
-				/* Non-zero, PCI-X mode */
-				u8 sec_bus, sub_bus;
+		if (status & PCI_X_SSTATUS_FREQ) {
+			/* Non-zero, PCI-X mode */
+			u8 sec_bus, sub_bus;
 
-				dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
+			hv_devid.pci.source_shadow =
+					     HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE;
 
-				pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus);
-				dev_id.pci.shadow_bus_range.secondary_bus = sec_bus;
-				pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus);
-				dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus;
-			}
+			pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS,
+					     &sec_bus);
+			hv_devid.pci.shadow_bus_range.secondary_bus = sec_bus;
+			pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS,
+					     &sub_bus);
+			hv_devid.pci.shadow_bus_range.subordinate_bus = sub_bus;
 		}
 	}
 
-	return dev_id;
+out:
+	return hv_devid;
 }
 
-/**
- * hv_map_msi_interrupt() - "Map" the MSI IRQ in the hypervisor.
+/*
+ * hv_map_msi_interrupt() - Map the MSI IRQ in the hypervisor.
  * @data:      Describes the IRQ
  * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL)
  *
@@ -188,22 +189,23 @@ int hv_map_msi_interrupt(struct irq_data *data,
 {
 	struct irq_cfg *cfg = irqd_cfg(data);
 	struct hv_interrupt_entry dummy;
-	union hv_device_id device_id;
+	union hv_device_id hv_devid;
 	struct msi_desc *msidesc;
-	struct pci_dev *dev;
+	struct pci_dev *pdev;
 	int cpu;
 
 	msidesc = irq_data_get_msi_desc(data);
-	dev = msi_desc_to_pci_dev(msidesc);
-	device_id = hv_build_pci_dev_id(dev);
+	pdev = msi_desc_to_pci_dev(msidesc);
+	hv_devid = hv_build_devid_type_pci(pdev);
 	cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
 
-	return hv_map_interrupt(device_id, false, cpu, cfg->vector,
+	return hv_map_interrupt(hv_devid, false, cpu, cfg->vector,
 				out_entry ? out_entry : &dummy);
 }
 EXPORT_SYMBOL_GPL(hv_map_msi_interrupt);
 
-static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg)
+static void entry_to_msi_msg(struct hv_interrupt_entry *entry,
+			     struct msi_msg *msg)
 {
 	/* High address is always 0 */
 	msg->address_hi = 0;
@@ -211,17 +213,19 @@ static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi
 	msg->data = entry->msi_entry.data.as_uint32;
 }
 
-static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry);
+static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
+				  struct hv_interrupt_entry *irq_entry);
+
 static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 {
 	struct hv_interrupt_entry *stored_entry;
 	struct irq_cfg *cfg = irqd_cfg(data);
 	struct msi_desc *msidesc;
-	struct pci_dev *dev;
+	struct pci_dev *pdev;
 	int ret;
 
 	msidesc = irq_data_get_msi_desc(data);
-	dev = msi_desc_to_pci_dev(msidesc);
+	pdev = msi_desc_to_pci_dev(msidesc);
 
 	if (!cfg) {
 		pr_debug("%s: cfg is NULL", __func__);
@@ -240,7 +244,7 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 		stored_entry = data->chip_data;
 		data->chip_data = NULL;
 
-		ret = hv_unmap_msi_interrupt(dev, stored_entry);
+		ret = hv_unmap_msi_interrupt(pdev, stored_entry);
 
 		kfree(stored_entry);
 
@@ -249,10 +253,8 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 	}
 
 	stored_entry = kzalloc_obj(*stored_entry, GFP_ATOMIC);
-	if (!stored_entry) {
-		pr_debug("%s: failed to allocate chip data\n", __func__);
+	if (!stored_entry)
 		return;
-	}
 
 	ret = hv_map_msi_interrupt(data, stored_entry);
 	if (ret) {
@@ -262,18 +264,21 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 
 	data->chip_data = stored_entry;
 	entry_to_msi_msg(data->chip_data, msg);
-
-	return;
 }
 
-static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry)
+static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
+				  struct hv_interrupt_entry *irq_entry)
 {
-	return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry);
+	union hv_device_id hv_devid;
+
+	hv_devid = hv_build_devid_type_pci(pdev);
+	return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
 }
 
-static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
+/* NB: during map, hv_interrupt_entry is saved via data->chip_data */
+static void hv_teardown_msi_irq(struct pci_dev *pdev, struct irq_data *irqd)
 {
-	struct hv_interrupt_entry old_entry;
+	struct hv_interrupt_entry irq_entry;
 	struct msi_msg msg;
 
 	if (!irqd->chip_data) {
@@ -281,13 +286,13 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
 		return;
 	}
 
-	old_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
-	entry_to_msi_msg(&old_entry, &msg);
+	irq_entry = *(struct hv_interrupt_entry *)irqd->chip_data;
+	entry_to_msi_msg(&irq_entry, &msg);
 
 	kfree(irqd->chip_data);
 	irqd->chip_data = NULL;
 
-	(void)hv_unmap_msi_interrupt(dev, &old_entry);
+	(void)hv_unmap_msi_interrupt(pdev, &irq_entry);
 }
 
 /*
@@ -302,7 +307,8 @@ static struct irq_chip hv_pci_msi_controller = {
 };
 
 static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
-				 struct irq_domain *real_parent, struct msi_domain_info *info)
+				 struct irq_domain *real_parent,
+				 struct msi_domain_info *info)
 {
 	struct irq_chip *chip = info->chip;
 
@@ -317,7 +323,8 @@ static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
 }
 
 #define HV_MSI_FLAGS_SUPPORTED	(MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX)
-#define HV_MSI_FLAGS_REQUIRED	(MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+#define HV_MSI_FLAGS_REQUIRED	(MSI_FLAG_USE_DEF_DOM_OPS |	\
+				 MSI_FLAG_USE_DEF_CHIP_OPS)
 
 static struct msi_parent_ops hv_msi_parent_ops = {
 	.supported_flags	= HV_MSI_FLAGS_SUPPORTED,
@@ -329,14 +336,14 @@ static struct msi_parent_ops hv_msi_parent_ops = {
 	.init_dev_msi_info	= hv_init_dev_msi_info,
 };
 
-static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs,
-			       void *arg)
+/* Allocate nr_irqs IRQs for the given irq domain */
+static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq,
+			       unsigned int nr_irqs, void *arg)
 {
 	/*
-	 * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. everything except
-	 * entry_to_msi_msg() should be in here.
+	 * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e.
+	 *	 everything except entry_to_msi_msg() should be in here.
 	 */
-
 	int ret;
 
 	ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg);
@@ -344,13 +351,15 @@ static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned
 		return ret;
 
 	for (int i = 0; i < nr_irqs; ++i) {
-		irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, NULL,
-				    handle_edge_irq, NULL, "edge");
+		irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller,
+				    NULL, handle_edge_irq, NULL, "edge");
 	}
+
 	return 0;
 }
 
-static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq,
+			       unsigned int nr_irqs)
 {
 	for (int i = 0; i < nr_irqs; ++i) {
 		struct irq_data *irqd = irq_domain_get_irq_data(d, virq);
@@ -362,6 +371,7 @@ static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned
 
 		hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
 	}
+
 	irq_domain_free_irqs_top(d, virq, nr_irqs);
 }
 
@@ -394,25 +404,25 @@ struct irq_domain * __init hv_create_pci_msi_domain(void)
 
 int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
 {
-	union hv_device_id device_id;
+	union hv_device_id hv_devid;
 
-	device_id.as_uint64 = 0;
-	device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
-	device_id.ioapic.ioapic_id = (u8)ioapic_id;
+	hv_devid.as_uint64 = 0;
+	hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC;
+	hv_devid.ioapic.ioapic_id = (u8)ioapic_id;
 
-	return hv_unmap_interrupt(device_id.as_uint64, entry);
+	return hv_unmap_interrupt(hv_devid.as_uint64, entry);
 }
 EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
 
 int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector,
 		struct hv_interrupt_entry *entry)
 {
-	union hv_device_id device_id;
+	union hv_device_id hv_devid;
 
-	device_id.as_uint64 = 0;
-	device_id.device_type = HV_DEVICE_TYPE_IOAPIC;
-	device_id.ioapic.ioapic_id = (u8)ioapic_id;
+	hv_devid.as_uint64 = 0;
+	hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC;
+	hv_devid.ioapic.ioapic_id = (u8)ioapic_id;
 
-	return hv_map_interrupt(device_id, level, cpu, vector, entry);
+	return hv_map_interrupt(hv_devid, level, cpu, vector, entry);
 }
 EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt);
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 03/11] mshv: Provide a way to get partition ID if running in a VMM process
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

Many PCI passthru related hypercalls require the partition ID of the
target guest. Guests are managed by the MSHV driver, and the partition ID
is maintained only there. Add a field to the partition struct in the MSHV
driver to save the tgid of the VMM process that creates the partition,
and add a function there to retrieve the partition ID if the current
process is a VMM process.

Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/mshv_root.h         |  1 +
 drivers/hv/mshv_root_main.c    | 22 ++++++++++++++++++++++
 include/asm-generic/mshyperv.h |  5 +++++
 3 files changed, 28 insertions(+)

diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 1f086dcb7aa1..a85c24dcc701 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -138,6 +138,7 @@ struct mshv_partition {
 
 	struct mshv_girq_routing_table __rcu *pt_girq_tbl;
 	u64 isolation_type;
+	pid_t pt_vmm_tgid;
 	bool import_completed;
 	bool pt_initialized;
 #if IS_ENABLED(CONFIG_DEBUG_FS)
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index bd1359eb58dd..02c107458be9 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1908,6 +1908,27 @@ mshv_partition_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+/* Return the partition id if the current process is a VMM process */
+u64 mshv_current_partid(void)
+{
+	struct mshv_partition *pt;
+	int i;
+	u64 ret_ptid = HV_PARTITION_ID_INVALID;
+
+	rcu_read_lock();
+
+	hash_for_each_rcu(mshv_root.pt_htable, i, pt, pt_hnode) {
+		if (pt->pt_vmm_tgid == current->tgid) {
+			ret_ptid = pt->pt_id;
+			break;
+		}
+	}
+
+	rcu_read_unlock();
+	return ret_ptid;
+}
+EXPORT_SYMBOL_GPL(mshv_current_partid);
+
 static int
 add_partition(struct mshv_partition *partition)
 {
@@ -2073,6 +2094,7 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
 		goto cleanup_irq_srcu;
 
 	partition->pt_id = pt_id;
+	partition->pt_vmm_tgid = current->tgid;
 
 	ret = add_partition(partition);
 	if (ret)
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index bf601d67cecb..e8cbc4e3f7ad 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -350,6 +350,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
 int hv_call_notify_all_processors_started(void);
 bool hv_lp_exists(u32 lp_index);
 int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
+u64 mshv_current_partid(void);
 
 #else /* CONFIG_MSHV_ROOT */
 static inline bool hv_root_partition(void) { return false; }
@@ -380,6 +381,10 @@ static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u3
 {
 	return -EOPNOTSUPP;
 }
+static inline u64 mshv_current_partid(void)
+{
+	return HV_PARTITION_ID_INVALID;
+}
 #endif /* CONFIG_MSHV_ROOT */
 
 static inline int hv_deposit_memory(u64 partition_id, u64 status)
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 04/11] mshv: Declarations and definitions for VFIO-MSHV bridge device
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

Add data structs needed by the subsequent patch, which introduces a new
file to implement the VFIO-MSHV bridge pseudo device.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/mshv_root.h    | 19 +++++++++++++++++++
 include/uapi/linux/mshv.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index a85c24dcc701..b9880d0bdc4d 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -227,6 +227,25 @@ struct port_table_info {
 	};
 };
 
+struct mshv_device {
+	const struct mshv_device_ops *device_ops;
+	struct mshv_partition *device_pt;
+	void *device_private;
+	struct hlist_node device_ptnode;
+};
+
+struct mshv_device_ops {
+	const char *device_name;
+	long (*device_create)(struct mshv_device *dev);
+	void (*device_release)(struct mshv_device *dev);
+	long (*device_set_attr)(struct mshv_device *dev,
+				struct mshv_device_attr *attr);
+	long (*device_has_attr)(struct mshv_device *dev,
+				struct mshv_device_attr *attr);
+};
+
+extern struct mshv_device_ops mshv_vfio_device_ops;
+
 int mshv_update_routing_table(struct mshv_partition *partition,
 			      const struct mshv_user_irq_entry *entries,
 			      unsigned int numents);
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index 32ff92b6342b..be6fe3ee8707 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -404,4 +404,34 @@ struct mshv_sint_mask {
 /* hv_hvcall device */
 #define MSHV_HVCALL_SETUP        _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup)
 #define MSHV_HVCALL              _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall)
+
+/* Device passthru */
+#define MSHV_CREATE_DEVICE_TEST		1
+
+enum {
+	MSHV_DEV_TYPE_VFIO,
+	MSHV_DEV_TYPE_MAX,
+};
+
+struct mshv_create_device {
+	__u32	type;	     /* in: MSHV_DEV_TYPE_xxx */
+	__u32	fd;	     /* out: device handle */
+	__u32	flags;	     /* in: MSHV_CREATE_DEVICE_xxx */
+};
+
+#define MSHV_DEV_VFIO_FILE      1
+#define MSHV_DEV_VFIO_FILE_ADD	1
+#define MSHV_DEV_VFIO_FILE_DEL	2
+
+struct mshv_device_attr {
+	__u32	flags;		/* no flags currently defined */
+	__u32	group;		/* device-defined */
+	__u64	attr;		/* group-defined */
+	__u64	addr;		/* userspace address of attr data */
+};
+
+/* Device fds created with MSHV_CREATE_DEVICE */
+#define MSHV_SET_DEVICE_ATTR	_IOW(MSHV_IOCTL, 0x00, struct mshv_device_attr)
+#define MSHV_HAS_DEVICE_ATTR	_IOW(MSHV_IOCTL, 0x01, struct mshv_device_attr)
+
 #endif
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 05/11] mshv: Implement mshv bridge device for VFIO
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

Add a new file to implement the VFIO-MSHV bridge pseudo device. These
functions are called in the VFIO framework. Credit goes to kvm/vfio.c,
from which this file was adapted.

Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/Makefile       |   3 +-
 drivers/hv/mshv_vfio.c    | 211 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/mshv.h |   1 +
 3 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 drivers/hv/mshv_vfio.c

diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index 888a748cc7cb..9ab6fc254c38 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -14,7 +14,8 @@ hv_vmbus-y := vmbus_drv.o \
 hv_vmbus-$(CONFIG_HYPERV_TESTING)	+= hv_debugfs.o
 hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
 mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
-	       mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
+	       mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o  \
+               mshv_vfio.o
 mshv_root-$(CONFIG_DEBUG_FS) += mshv_debugfs.o
 mshv_root-$(CONFIG_TRACEPOINTS) += mshv_trace.o
 mshv_vtl-y := mshv_vtl_main.o
diff --git a/drivers/hv/mshv_vfio.c b/drivers/hv/mshv_vfio.c
new file mode 100644
index 000000000000..00a97920e25b
--- /dev/null
+++ b/drivers/hv/mshv_vfio.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO-MSHV bridge pseudo device
+ *
+ * Heavily inspired by the VFIO-KVM bridge pseudo device.
+ */
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <asm/mshyperv.h>
+
+#include "mshv.h"
+#include "mshv_root.h"
+
+struct mshv_vfio_file {
+	struct list_head node;
+	struct file *file;	/* VFIO file; reference held until fput() */
+};
+
+struct mshv_vfio {
+	struct list_head file_list;
+	struct mutex lock;
+};
+
+static bool mshv_vfio_file_is_valid(struct file *file)
+{
+	bool (*fn)(struct file *file);
+	bool ret;
+
+	fn = symbol_get(vfio_file_is_valid);
+	if (!fn)
+		return false;
+
+	ret = fn(file);
+
+	symbol_put(vfio_file_is_valid);
+
+	return ret;
+}
+
+static long mshv_vfio_file_add(struct mshv_device *mshvdev, unsigned int fd)
+{
+	struct mshv_vfio *mshv_vfio = mshvdev->device_private;
+	struct mshv_vfio_file *mvf;
+	struct file *filp;
+	long ret = 0;
+
+	filp = fget(fd);
+	if (!filp)
+		return -EBADF;
+
+	/* Ensure the FD is a vfio FD. */
+	if (!mshv_vfio_file_is_valid(filp)) {
+		ret = -EINVAL;
+		goto out_fput;
+	}
+
+	mutex_lock(&mshv_vfio->lock);
+
+	list_for_each_entry(mvf, &mshv_vfio->file_list, node) {
+		if (mvf->file == filp) {
+			ret = -EEXIST;
+			goto out_unlock;
+		}
+	}
+
+	mvf = kzalloc(sizeof(*mvf), GFP_KERNEL_ACCOUNT);
+	if (!mvf) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
+
+	mvf->file = get_file(filp);
+	list_add_tail(&mvf->node, &mshv_vfio->file_list);
+
+out_unlock:
+	mutex_unlock(&mshv_vfio->lock);
+out_fput:
+	fput(filp);
+	return ret;
+}
+
+static long mshv_vfio_file_del(struct mshv_device *mshvdev, unsigned int fd)
+{
+	struct mshv_vfio *mshv_vfio = mshvdev->device_private;
+	struct mshv_vfio_file *mvf;
+	long ret;
+
+	CLASS(fd, f)(fd);
+
+	if (fd_empty(f))
+		return -EBADF;
+
+	ret = -ENOENT;
+	mutex_lock(&mshv_vfio->lock);
+
+	list_for_each_entry(mvf, &mshv_vfio->file_list, node) {
+		if (mvf->file != fd_file(f))
+			continue;
+
+		list_del(&mvf->node);
+		fput(mvf->file);
+		kfree(mvf);
+		ret = 0;
+		break;
+	}
+
+	mutex_unlock(&mshv_vfio->lock);
+	return ret;
+}
+
+static long mshv_vfio_set_file(struct mshv_device *mshvdev, long attr,
+			      void __user *arg)
+{
+	int32_t __user *argp = arg;
+	int32_t fd;
+
+	switch (attr) {
+	case MSHV_DEV_VFIO_FILE_ADD:
+		if (get_user(fd, argp))
+			return -EFAULT;
+		return mshv_vfio_file_add(mshvdev, fd);
+
+	case MSHV_DEV_VFIO_FILE_DEL:
+		if (get_user(fd, argp))
+			return -EFAULT;
+		return mshv_vfio_file_del(mshvdev, fd);
+	}
+
+	return -ENXIO;
+}
+
+static long mshv_vfio_set_attr(struct mshv_device *mshvdev,
+			      struct mshv_device_attr *attr)
+{
+	switch (attr->group) {
+	case MSHV_DEV_VFIO_FILE:
+		return mshv_vfio_set_file(mshvdev, attr->attr,
+					  u64_to_user_ptr(attr->addr));
+	}
+
+	return -ENXIO;
+}
+
+static long mshv_vfio_has_attr(struct mshv_device *mshvdev,
+			      struct mshv_device_attr *attr)
+{
+	switch (attr->group) {
+	case MSHV_DEV_VFIO_FILE:
+		switch (attr->attr) {
+		case MSHV_DEV_VFIO_FILE_ADD:
+		case MSHV_DEV_VFIO_FILE_DEL:
+			return 0;
+		}
+
+		break;
+	}
+
+	return -ENXIO;
+}
+
+static long mshv_vfio_create_device(struct mshv_device *mshvdev)
+{
+	struct mshv_device *tmp;
+	struct mshv_vfio *mshv_vfio;
+
+	/* Only one VFIO "device" per VM */
+	hlist_for_each_entry(tmp, &mshvdev->device_pt->pt_devices,
+			     device_ptnode)
+		if (tmp->device_ops == &mshv_vfio_device_ops)
+			return -EBUSY;
+
+	mshv_vfio = kzalloc(sizeof(*mshv_vfio), GFP_KERNEL_ACCOUNT);
+	if (mshv_vfio == NULL)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&mshv_vfio->file_list);
+	mutex_init(&mshv_vfio->lock);
+
+	mshvdev->device_private = mshv_vfio;
+
+	return 0;
+}
+
+/* Called from mshv_device_fop_release(); also frees mshvdev itself */
+static void mshv_vfio_release_device(struct mshv_device *mshvdev)
+{
+	struct mshv_vfio *mv = mshvdev->device_private;
+	struct mshv_vfio_file *mvf, *tmp;
+
+	list_for_each_entry_safe(mvf, tmp, &mv->file_list, node) {
+		fput(mvf->file);
+		list_del(&mvf->node);
+		kfree(mvf);
+	}
+
+	kfree(mv);
+	kfree(mshvdev);
+}
+
+struct mshv_device_ops mshv_vfio_device_ops = {
+	.device_name = "mshv-vfio",
+	.device_create = mshv_vfio_create_device,
+	.device_release = mshv_vfio_release_device,
+	.device_set_attr = mshv_vfio_set_attr,
+	.device_has_attr = mshv_vfio_has_attr,
+};
diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
index be6fe3ee8707..b038a79786d2 100644
--- a/include/uapi/linux/mshv.h
+++ b/include/uapi/linux/mshv.h
@@ -254,6 +254,7 @@ struct mshv_root_hvcall {
 #define MSHV_GET_GPAP_ACCESS_BITMAP	_IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap)
 /* Generic hypercall */
 #define MSHV_ROOT_HVCALL		_IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
+#define MSHV_CREATE_DEVICE             _IOWR(MSHV_IOCTL, 0x08, struct mshv_create_device)
 
 /*
  ********************************
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 06/11] mshv: Add ioctl support for MSHV-VFIO bridge device
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

Add ioctl support for creating MSHV devices for a partition. At
present only VFIO device types are supported, but more could be
added. At a high level, a partition ioctl to create device verifies
it is of type VFIO and does some setup for bridge code in mshv_vfio.c.
Adapted from KVM device ioctls.

Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c | 116 ++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 02c107458be9..6ceb5f608589 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1625,6 +1625,119 @@ mshv_partition_ioctl_initialize(struct mshv_partition *partition)
 	return ret;
 }
 
+static long mshv_device_attr_ioctl(struct mshv_device *mshv_dev, int cmd,
+				   ulong uarg)
+{
+	struct mshv_device_attr attr;
+	const struct mshv_device_ops *devops = mshv_dev->device_ops;
+
+	if (copy_from_user(&attr, (void __user *)uarg, sizeof(attr)))
+		return -EFAULT;
+
+	switch (cmd) {
+	case MSHV_SET_DEVICE_ATTR:
+		if (devops->device_set_attr)
+			return devops->device_set_attr(mshv_dev, &attr);
+		break;
+	case MSHV_HAS_DEVICE_ATTR:
+		if (devops->device_has_attr)
+			return devops->device_has_attr(mshv_dev, &attr);
+		break;
+	}
+
+	return -EPERM;
+}
+
+static long mshv_device_fop_ioctl(struct file *filp, unsigned int cmd,
+				  ulong uarg)
+{
+	struct mshv_device *mshv_dev = filp->private_data;
+
+	switch (cmd) {
+	case MSHV_SET_DEVICE_ATTR:
+	case MSHV_HAS_DEVICE_ATTR:
+		return mshv_device_attr_ioctl(mshv_dev, cmd, uarg);
+	}
+
+	return -ENOTTY;
+}
+
+static int mshv_device_fop_release(struct inode *inode, struct file *filp)
+{
+	struct mshv_device *mshv_dev = filp->private_data;
+	struct mshv_partition *partition = mshv_dev->device_pt;
+
+	if (mshv_dev->device_ops->device_release) {
+		mutex_lock(&partition->pt_mutex);
+		hlist_del(&mshv_dev->device_ptnode);
+		mshv_dev->device_ops->device_release(mshv_dev);
+		mutex_unlock(&partition->pt_mutex);
+	}
+
+	mshv_partition_put(partition);
+	return 0;
+}
+
+static const struct file_operations mshv_device_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = mshv_device_fop_ioctl,
+	.release = mshv_device_fop_release,
+};
+
+static long mshv_partition_ioctl_create_device(struct mshv_partition *partition,
+					       void __user *uarg)
+{
+	long rc;
+	struct mshv_create_device devargk;
+	struct mshv_device *mshv_dev;
+	const struct mshv_device_ops *vfio_ops;
+
+	if (copy_from_user(&devargk, uarg, sizeof(devargk)))
+		return -EFAULT;
+
+	/* At present, only VFIO is supported */
+	if (devargk.type != MSHV_DEV_TYPE_VFIO)
+		return -ENODEV;
+
+	if (devargk.flags & MSHV_CREATE_DEVICE_TEST)
+		return 0;
+
+	/* This is freed later by mshv_vfio_release_device() */
+	mshv_dev = kzalloc(sizeof(*mshv_dev), GFP_KERNEL_ACCOUNT);
+	if (mshv_dev == NULL)
+		return -ENOMEM;
+
+	vfio_ops = &mshv_vfio_device_ops;
+	mshv_dev->device_ops = vfio_ops;
+	mshv_dev->device_pt = partition;
+
+	rc = vfio_ops->device_create(mshv_dev);
+	if (rc < 0) {
+		kfree(mshv_dev);
+		return rc;
+	}
+
+	hlist_add_head(&mshv_dev->device_ptnode, &partition->pt_devices);
+
+	mshv_partition_get(partition);
+	rc = anon_inode_getfd(vfio_ops->device_name, &mshv_device_fops,
+			      mshv_dev, O_RDWR | O_CLOEXEC);
+	if (rc < 0)
+		goto undo_out;
+
+	devargk.fd = rc;
+	if (copy_to_user(uarg, &devargk, sizeof(devargk)))
+		return -EFAULT;    /* cleanup in mshv_device_fop_release() */
+
+	return 0;
+
+undo_out:
+	hlist_del(&mshv_dev->device_ptnode);
+	vfio_ops->device_release(mshv_dev);    /* will kfree(mshv_dev) */
+	mshv_partition_put(partition);
+	return rc;
+}
+
 static long
 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
@@ -1661,6 +1774,9 @@ mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 	case MSHV_ROOT_HVCALL:
 		ret = mshv_ioctl_passthru_hvcall(partition, true, uarg);
 		break;
+	case MSHV_CREATE_DEVICE:
+		ret = mshv_partition_ioctl_create_device(partition, uarg);
+		break;
 	default:
 		ret = -ENOTTY;
 	}
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 07/11] mshv: Import data structs around device passthru from hyperv headers
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

Copy/import from Hyper-V public headers, definitions and declarations that
are related to attaching and detaching of device domains, and building
device ids for those purposes.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 include/hyperv/hvgdk_mini.h |  11 ++++
 include/hyperv/hvhdk_mini.h | 112 ++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+)

diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index 6a4e8b9d570f..da622fb06440 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -326,6 +326,9 @@ union hv_hypervisor_version_info {
 /* stimer Direct Mode is available */
 #define HV_STIMER_DIRECT_MODE_AVAILABLE			BIT(19)
 
+#define HV_DEVICE_DOMAIN_AVAILABLE			BIT(24)
+#define HV_S1_DEVICE_DOMAIN_AVAILABLE			BIT(25)
+
 /*
  * Implementation recommendations. Indicates which behaviors the hypervisor
  * recommends the OS implement for optimal performance.
@@ -475,6 +478,8 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_MAP_DEVICE_INTERRUPT			0x007c
 #define HVCALL_UNMAP_DEVICE_INTERRUPT			0x007d
 #define HVCALL_RETARGET_INTERRUPT			0x007e
+#define HVCALL_ATTACH_DEVICE                            0x0082
+#define HVCALL_DETACH_DEVICE                            0x0083
 #define HVCALL_NOTIFY_PARTITION_EVENT                   0x0087
 #define HVCALL_ENTER_SLEEP_STATE			0x0084
 #define HVCALL_NOTIFY_PORT_RING_EMPTY			0x008b
@@ -486,9 +491,15 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_GET_VP_INDEX_FROM_APIC_ID		0x009a
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE	0x00af
 #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST	0x00b0
+#define HVCALL_CREATE_DEVICE_DOMAIN                     0x00b1
+#define HVCALL_ATTACH_DEVICE_DOMAIN                     0x00b2
+#define HVCALL_MAP_DEVICE_GPA_PAGES                     0x00b3
+#define HVCALL_UNMAP_DEVICE_GPA_PAGES                   0x00b4
 #define HVCALL_SIGNAL_EVENT_DIRECT			0x00c0
 #define HVCALL_POST_MESSAGE_DIRECT			0x00c1
 #define HVCALL_DISPATCH_VP				0x00c2
+#define HVCALL_DETACH_DEVICE_DOMAIN                     0x00c4
+#define HVCALL_DELETE_DEVICE_DOMAIN                     0x00c5
 #define HVCALL_GET_GPA_PAGES_ACCESS_STATES		0x00c9
 #define HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS	0x00d7
 #define HVCALL_RELEASE_SPARSE_SPA_PAGE_HOST_ACCESS	0x00d8
diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
index b4cb2fa26e9b..60425052a799 100644
--- a/include/hyperv/hvhdk_mini.h
+++ b/include/hyperv/hvhdk_mini.h
@@ -468,6 +468,32 @@ struct hv_send_ipi_ex { /* HV_INPUT_SEND_SYNTHETIC_CLUSTER_IPI_EX */
 	struct hv_vpset vp_set;
 } __packed;
 
+union hv_attdev_flags {		/* HV_ATTACH_DEVICE_FLAGS */
+	struct {
+		u32 logical_id : 1;
+		u32 resvd0 : 1;
+		u32 ats_enabled : 1;
+		u32 virt_func : 1;
+		u32 shared_irq_child : 1;
+		u32 virt_dev : 1;
+		u32 ats_supported : 1;
+		u32 small_irt : 1;
+		u32 resvd : 24;
+	} __packed;
+	u32 as_uint32;
+};
+
+union hv_dev_pci_caps {		/* HV_DEVICE_PCI_CAPABILITIES */
+	struct {
+		u32 max_pasid_width : 5;
+		u32 invalidate_qdepth : 5;
+		u32 global_inval : 1;
+		u32 prg_response_req : 1;
+		u32 resvd : 20;
+	} __packed;
+	u32 as_uint32;
+};
+
 typedef u16 hv_pci_rid;		/* HV_PCI_RID */
 typedef u16 hv_pci_segment;	/* HV_PCI_SEGMENT */
 typedef u64 hv_logical_device_id;
@@ -547,4 +573,90 @@ union hv_device_id {		/* HV_DEVICE_ID */
 	} acpi;
 } __packed;
 
+struct hv_input_attach_device {         /* HV_INPUT_ATTACH_DEVICE */
+	u64 partition_id;
+	union hv_device_id device_id;
+	union hv_attdev_flags attdev_flags;
+	u8  attdev_vtl;
+	u8  rsvd0;
+	u16 rsvd1;
+	u64 logical_devid;
+	union hv_dev_pci_caps dev_pcicaps;
+	u16 pf_pci_rid;
+	u16 resvd2;
+} __packed;
+
+struct hv_input_detach_device {		/* HV_INPUT_DETACH_DEVICE */
+	u64 partition_id;
+	u64 logical_devid;
+} __packed;
+
+
+/* 3 domain types: stage 1, stage 2, and SOC */
+#define HV_DEVICE_DOMAIN_TYPE_S2  0 /* HV_DEVICE_DOMAIN_ID_TYPE_S2 */
+#define HV_DEVICE_DOMAIN_TYPE_S1  1 /* HV_DEVICE_DOMAIN_ID_TYPE_S1 */
+#define HV_DEVICE_DOMAIN_TYPE_SOC 2 /* HV_DEVICE_DOMAIN_ID_TYPE_SOC */
+
+/* ID for stage 2 default domain and NULL domain */
+#define HV_DEVICE_DOMAIN_ID_S2_DEFAULT 0
+#define HV_DEVICE_DOMAIN_ID_S2_NULL    0xFFFFFFFFULL
+
+union hv_device_domain_id {
+	u64 as_uint64;
+	struct {
+		u32 type : 4;
+		u32 reserved : 28;
+		u32 id;
+	};
+} __packed;
+
+struct hv_input_device_domain { /* HV_INPUT_DEVICE_DOMAIN */
+	u64 partition_id;
+	union hv_input_vtl owner_vtl;
+	u8 padding[7];
+	union hv_device_domain_id domain_id;
+} __packed;
+
+union hv_create_device_domain_flags {	/* HV_CREATE_DEVICE_DOMAIN_FLAGS */
+	u32 as_uint32;
+	struct {
+		u32 forward_progress_required : 1;
+		u32 inherit_owning_vtl : 1;
+		u32 reserved : 30;
+	} __packed;
+} __packed;
+
+struct hv_input_create_device_domain {	/* HV_INPUT_CREATE_DEVICE_DOMAIN */
+	struct hv_input_device_domain device_domain;
+	union hv_create_device_domain_flags create_device_domain_flags;
+} __packed;
+
+struct hv_input_delete_device_domain {	/* HV_INPUT_DELETE_DEVICE_DOMAIN */
+	struct hv_input_device_domain device_domain;
+} __packed;
+
+struct hv_input_attach_device_domain {	/* HV_INPUT_ATTACH_DEVICE_DOMAIN */
+	struct hv_input_device_domain device_domain;
+	union hv_device_id device_id;
+} __packed;
+
+struct hv_input_detach_device_domain {	/* HV_INPUT_DETACH_DEVICE_DOMAIN */
+	u64 partition_id;
+	union hv_device_id device_id;
+} __packed;
+
+struct hv_input_map_device_gpa_pages {	/* HV_INPUT_MAP_DEVICE_GPA_PAGES */
+	struct hv_input_device_domain device_domain;
+	union hv_input_vtl target_vtl;
+	u8 padding[3];
+	u32 map_flags;
+	u64 target_device_va_base;
+	u64 gpa_page_list[];
+} __packed;
+
+struct hv_input_unmap_device_gpa_pages {  /* HV_INPUT_UNMAP_DEVICE_GPA_PAGES */
+	struct hv_input_device_domain device_domain;
+	u64 target_device_va_base;
+} __packed;
+
 #endif /* _HV_HVHDK_MINI_H */
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 08/11] PCI: hv: VMBus and PCI device IDs for PCI passthru
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

On Hyper-V, most hypercalls related to PCI passthru to map/unmap regions,
interrupts, etc need a device ID as a parameter. This device ID refers
to that specific device during the lifetime of passthru.

An L1VH VM only contains VMBus based devices. A device ID for a VMBus
device is slightly different in that it uses the hv_pcibus_device info
for building it to make sure it matches exactly what the hypervisor
expects. This VMBus based device ID is needed when attaching devices in
an L1VH based guest VM. Add a function to build and export it. Before
building it, a check is done to make sure the device is a valid VMBus
device.

In remaining cases, PCI device ID is used. So, also make PCI device ID
build function hv_build_devid_type_pci() public.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 arch/x86/hyperv/irqdomain.c         |  9 +++++----
 arch/x86/include/asm/mshyperv.h     |  6 ++++++
 drivers/pci/controller/pci-hyperv.c | 24 ++++++++++++++++++++++++
 include/asm-generic/mshyperv.h      | 11 +++++++++++
 4 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index b3ad50a874dc..8780573a4332 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -112,7 +112,7 @@ static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
 	return 0;
 }
 
-static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
+u64 hv_build_devid_type_pci(struct pci_dev *pdev)
 {
 	int pos;
 	union hv_device_id hv_devid;
@@ -172,8 +172,9 @@ static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
 	}
 
 out:
-	return hv_devid;
+	return hv_devid.as_uint64;
 }
+EXPORT_SYMBOL_GPL(hv_build_devid_type_pci);
 
 /*
  * hv_map_msi_interrupt() - Map the MSI IRQ in the hypervisor.
@@ -196,7 +197,7 @@ int hv_map_msi_interrupt(struct irq_data *data,
 
 	msidesc = irq_data_get_msi_desc(data);
 	pdev = msi_desc_to_pci_dev(msidesc);
-	hv_devid = hv_build_devid_type_pci(pdev);
+	hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
 	cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
 
 	return hv_map_interrupt(hv_devid, false, cpu, cfg->vector,
@@ -271,7 +272,7 @@ static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
 {
 	union hv_device_id hv_devid;
 
-	hv_devid = hv_build_devid_type_pci(pdev);
+	hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
 	return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
 }
 
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index f64393e853ee..2ef34001f8d3 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -248,6 +248,12 @@ void hv_crash_asm_end(void);
 static inline void hv_root_crash_init(void) {}
 #endif  /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
 
+#if IS_ENABLED(CONFIG_HYPERV_IOMMU)
+u64 hv_build_devid_type_pci(struct pci_dev *pdev);
+#else
+static inline u64 hv_build_devid_type_pci(struct pci_dev *pdev) { return 0; }
+#endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
+
 #else /* CONFIG_HYPERV */
 static inline void hyperv_init(void) {}
 static inline void hyperv_setup_mmu_ops(void) {}
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index cfc8fa403dad..50d793ca8f31 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -573,6 +573,7 @@ struct hv_pci_compl {
 };
 
 static void hv_pci_onchannelcallback(void *context);
+static bool hv_vmbus_pci_device(struct pci_bus *pbus);
 
 #ifdef CONFIG_X86
 #define DELIVERY_MODE		APIC_DELIVERY_MODE_FIXED
@@ -1005,6 +1006,24 @@ static struct irq_domain *hv_pci_get_root_domain(void)
 static void hv_arch_irq_unmask(struct irq_data *data) { }
 #endif /* CONFIG_ARM64 */
 
+u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
+{
+	struct hv_pcibus_device *hbus;
+	struct pci_bus *pbus = pdev->bus;
+
+	if (!hv_vmbus_pci_device(pbus))
+		return 0;	/* not a bus owned by this driver: no VMBus id */
+
+	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
+	/* Widen before shifting: an int "b[5] << 24" would sign-extend to u64 */
+	return	((u64)hbus->hdev->dev_instance.b[5] << 24) |
+		((u64)hbus->hdev->dev_instance.b[4] << 16) |
+		((u64)hbus->hdev->dev_instance.b[7] << 8) |
+		(hbus->hdev->dev_instance.b[6] & 0xf8) |
+		PCI_FUNC(pdev->devfn);
+}
+EXPORT_SYMBOL_GPL(hv_pci_vmbus_device_id);
+
 /**
  * hv_pci_generic_compl() - Invoked for a completion packet
  * @context:		Set up by the sender of the packet.
@@ -1403,6 +1422,11 @@ static struct pci_ops hv_pcifront_ops = {
 	.write = hv_pcifront_write_config,
 };
 
+static bool hv_vmbus_pci_device(struct pci_bus *pbus)
+{
+	return pbus->ops == &hv_pcifront_ops;
+}
+
 /*
  * Paravirtual backchannel
  *
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index e8cbc4e3f7ad..25ac7ca0fd8b 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -204,6 +204,9 @@ extern u64 (*hv_read_reference_counter)(void);
 /* Sentinel value for an uninitialized entry in hv_vp_index array */
 #define VP_INVAL	U32_MAX
 
+/* Forward declarations */
+struct pci_dev;
+
 int __init hv_common_init(void);
 void __init hv_get_partition_id(void);
 void __init hv_common_free(void);
@@ -316,6 +319,14 @@ void hv_para_set_synic_register(unsigned int reg, u64 val);
 void hyperv_cleanup(void);
 bool hv_query_ext_cap(u64 cap_query);
 void hv_setup_dma_ops(struct device *dev, bool coherent);
+
+#if IS_ENABLED(CONFIG_PCI_HYPERV)
+u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
+#else
+static inline u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
+{ return 0; }
+#endif /* IS_ENABLED(CONFIG_PCI_HYPERV) */
+
 #else /* CONFIG_HYPERV */
 static inline void hv_identify_partition_type(void) {}
 static inline bool hv_is_hyperv_initialized(void) { return false; }
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 09/11] x86/hyperv: Implement Hyper-V virtual IOMMU
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

Add a new file to implement management of device domains, mapping and
unmapping of IOMMU memory, and other iommu_ops to fit within the VFIO
framework for PCI passthru on Hyper-V running Linux as baremetal root
or L1VH root. This also implements direct attach mechanism (see below),
a special feature of Hyper-V for PCI passthru, and it is also made to
work within the VFIO framework.

At a high level, during boot the hypervisor creates a default identity
domain and attaches all devices to it. This nicely maps to Linux IOMMU
subsystem IOMMU_DOMAIN_IDENTITY domain. As a result, Linux does not
need to explicitly ask Hyper-V to attach devices and do maps/unmaps
during boot. As mentioned previously, Hyper-V supports two ways to do
PCI passthru:

  1. Device Domain (aka Domain Attach): root must create a device domain
     in the hypervisor, and do map/unmap hypercalls for mapping and
     unmapping guest RAM for DMA. All hypervisor communications use
     device ID of type PCI for identifying and referencing the device.

  2. Direct Attach: the hypervisor will simply use the guest's HW
     page table for mappings, thus the root need not map/unmap guest
     memory for DMA. As such, direct attach passthru setup during guest
     boot is extremely fast. A direct attached device must always be
     referenced via logical device ID and not via the PCI device ID.

At present, an L1VH root supports only direct attach. Direct attach is also
the default in non-L1VH cases, because the current domain attach
implementation has significant performance issues for guests with larger
RAM (say more than 8GB), and those unfortunately cannot be addressed in
the short term.

Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 MAINTAINERS                       |   1 +
 arch/x86/kernel/pci-dma.c         |   2 +
 drivers/iommu/Kconfig             |   5 +-
 drivers/iommu/Makefile            |   1 +
 drivers/iommu/hyperv-iommu-root.c | 918 ++++++++++++++++++++++++++++++
 include/asm-generic/mshyperv.h    |  17 +
 include/linux/hyperv.h            |   6 +
 7 files changed, 947 insertions(+), 3 deletions(-)
 create mode 100644 drivers/iommu/hyperv-iommu-root.c

diff --git a/MAINTAINERS b/MAINTAINERS
index f803a6a38fee..8ae040b89a56 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11914,6 +11914,7 @@ F:	drivers/clocksource/hyperv_timer.c
 F:	drivers/hid/hid-hyperv.c
 F:	drivers/hv/
 F:	drivers/input/serio/hyperv-keyboard.c
+F:	drivers/iommu/hyperv-iommu-root.c
 F:	drivers/iommu/hyperv-irq.c
 F:	drivers/net/ethernet/microsoft/
 F:	drivers/net/hyperv/
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 6267363e0189..cfeee6505e17 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -8,6 +8,7 @@
 #include <linux/gfp.h>
 #include <linux/pci.h>
 #include <linux/amd-iommu.h>
+#include <linux/hyperv.h>
 
 #include <asm/proto.h>
 #include <asm/dma.h>
@@ -105,6 +106,7 @@ void __init pci_iommu_alloc(void)
 	gart_iommu_hole_init();
 	amd_iommu_detect();
 	detect_intel_iommu();
+	hv_iommu_detect();
 	swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
 }
 
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f86262b11416..7909cf4373a6 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -352,13 +352,12 @@ config MTK_IOMMU_V1
 	  if unsure, say N here.
 
 config HYPERV_IOMMU
-	bool "Hyper-V IRQ Handling"
+	bool "Hyper-V IOMMU Unit"
 	depends on HYPERV && X86
 	select IOMMU_API
 	default HYPERV
 	help
-	  Stub IOMMU driver to handle IRQs to support Hyper-V Linux
-	  guest and root partitions.
+	  Hyper-V pseudo IOMMU unit.
 
 config VIRTIO_IOMMU
 	tristate "Virtio IOMMU driver"
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 335ea77cced6..296fbc6ca829 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
 obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
 obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
 obj-$(CONFIG_HYPERV) += hyperv-irq.o
+obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu-root.o
 obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
 obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
 obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
diff --git a/drivers/iommu/hyperv-iommu-root.c b/drivers/iommu/hyperv-iommu-root.c
new file mode 100644
index 000000000000..a2e0f6cc78e6
--- /dev/null
+++ b/drivers/iommu/hyperv-iommu-root.c
@@ -0,0 +1,918 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Hyper-V root vIOMMU driver.
+ * Copyright (C) 2026, Microsoft, Inc.
+ */
+
+#include <linux/pci.h>
+#include <linux/dma-map-ops.h>
+#include <linux/interval_tree.h>
+#include <linux/hyperv.h>
+#include "dma-iommu.h"
+#include <asm/iommu.h>
+#include <asm/mshyperv.h>
+
+/* PCI devices we must not claim, e.g. one the hypervisor needs for debugging */
+static char *pci_devs_to_skip;
+static int __init hv_iommu_setup_skip(char *str)
+{
+	pci_devs_to_skip = str;
+	/* A __setup() handler returns 1 once it has consumed the option */
+	return 1;
+}
+/* hv_iommu_skip=(SSSS:BB:DD.F)(SSSS:BB:DD.F) */
+__setup("hv_iommu_skip=", hv_iommu_setup_skip);
+
+bool hv_no_attdev;	 /* disable direct device attach for passthru */
+EXPORT_SYMBOL_GPL(hv_no_attdev);
+static int __init setup_hv_no_attdev(char *str)
+{
+	hv_no_attdev = true;
+	return 1;	/* option consumed; 0 would flag it as unhandled */
+}
+__setup("hv_no_attdev", setup_hv_no_attdev);
+
+/* IOMMU device that we export to the world. Hyper-V supports at most one */
+static struct iommu_device hv_virt_iommu;
+
+struct hv_domain {
+	struct iommu_domain iommu_dom;
+	u32 domid_num;			      /* as opposed to domain_id.type */
+	bool attached_dom;		      /* is this direct attached dom? */
+	u64 partid;			      /* partition id */
+	spinlock_t mappings_lock;	      /* protects mappings_tree */
+	struct rb_root_cached mappings_tree;  /* iova to pa lookup tree */
+};
+
+#define to_hv_domain(d) container_of(d, struct hv_domain, iommu_dom)
+
+struct hv_iommu_mapping {
+	phys_addr_t paddr;
+	struct interval_tree_node iova;
+	u32 flags;
+};
+
+/*
+ * By default, during boot the hypervisor creates one Stage 2 (S2) default
+ * domain. Stage 2 means that the page table is controlled by the hypervisor.
+ * It has two types:
+ *   S2 default: access to entire root partition memory. This maps naturally
+ *		 to IOMMU_DOMAIN_IDENTITY in the iommu subsystem, and is
+ *		 called HV_DEVICE_DOMAIN_ID_S2_DEFAULT in the hypervisor.
+ *   S2 NULL: Blocks everything except RMRR
+ *
+ * Device Management:
+ *   There are two ways to manage device attaches to domains:
+ *     1. Domain Attach: A device domain is created in the hypervisor, the
+ *			 device is attached to this domain, and then memory
+ *			 ranges are mapped in the map callbacks.
+ *     2. Direct Attach: No need to create a domain in the hypervisor for direct
+ *			 attached devices. A hypercall is made to tell the
+ *			 hypervisor to attach the device to a guest. There is
+ *			 no need for explicit memory mappings because the
+ *			 hypervisor will just use the guest HW page table.
+ *
+ * Since a direct attach is much faster, it is the default. This can be
+ * changed via hv_no_attdev.
+ *
+ * L1VH: hypervisor only supports direct attach. Also, there is no S2 default
+ *	 in the hypervisor, so no explicit attach to S2 needed.
+ */
+
+/*
+ * Create dummy domains to correspond to hypervisor prebuilt default identity
+ * and null domains (dummy because we do not make hypercalls to create them).
+ */
+static struct hv_domain hv_def_identity_dom;
+static struct hv_domain hv_null_dom;
+
+static bool hv_special_domain(struct hv_domain *hvdom)
+{
+	return hvdom == &hv_def_identity_dom || hvdom == &hv_null_dom;
+}
+
+static struct iommu_domain_geometry default_geometry = {
+	.aperture_start = 0,
+	.aperture_end = -1UL,		/* all ones: highest address */
+	.force_aperture = true,
+};
+
+#define HV_IOMMU_PGSIZES SZ_4K  /* for now, to be enhanced */
+
+static u32 unique_id;	      /* unique numeric id of a new domain */
+
+static void hv_iommu_detach_dev(struct hv_domain *hvdom, struct device *dev);
+static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom, ulong iova,
+				   size_t pgsize, size_t pgcount,
+				   struct iommu_iotlb_gather *gather);
+
+/*
+ * Report the partition id that mshv_current_partid() gives for the calling
+ * thread (a VMM thread gets the id of the VM it manages). If that symbol
+ * cannot be resolved, HV_PARTITION_ID_INVALID is returned instead.
+ */
+u64 hv_get_current_partid(void)
+{
+	u64 partid = HV_PARTITION_ID_INVALID;
+	u64 (*lookup)(void) = symbol_get(mshv_current_partid);
+
+	if (lookup) {
+		/* Keep the symbol reference only for the call itself */
+		partid = lookup();
+		symbol_put(mshv_current_partid);
+	}
+
+	return partid;
+}
+EXPORT_SYMBOL_GPL(hv_get_current_partid);
+
+/* If this is a VMM thread, then this domain is for a guest vm */
+static bool hv_curr_thread_is_vmm(void)
+{
+	return hv_get_current_partid() != HV_PARTITION_ID_INVALID;
+}
+
+/* As opposed to some host app like SPDK etc... */
+static bool hv_dom_owner_is_vmm(struct hv_domain *hvdom)
+{
+	return hvdom && hvdom->partid != HV_PARTITION_ID_INVALID;
+}
+
+static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap)
+{
+	/*
+	 * IOMMU_CAP_CACHE_COHERENCY is the only capability reported as
+	 * supported; every other query is answered with false. The
+	 * device argument is not consulted.
+	 */
+	return cap == IOMMU_CAP_CACHE_COHERENCY;
+}
+
+/*
+ * Tell whether @pdev currently sits in a direct attached domain. The caller
+ * must already know @pdev is a valid pci device; no domain means false.
+ */
+bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
+{
+	struct iommu_domain *cur = iommu_get_domain_for_dev(&pdev->dev);
+
+	/* No iommu domain is associated with the device */
+	if (!cur)
+		return false;
+
+	/*
+	 * to_hv_domain() assumes the domain is one of ours, i.e. that it is
+	 * embedded in a struct hv_domain.
+	 */
+	return to_hv_domain(cur)->attached_dom;
+}
+EXPORT_SYMBOL_GPL(hv_pcidev_is_attached_dev);
+
+bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev)
+{
+	struct hv_domain *owner = dev_iommu_priv_get(&pdev->dev);
+
+	/*
+	 * Passthru means the device's iommu private data points at a domain
+	 * other than the prebuilt identity and null domains.
+	 */
+	return owner && !hv_special_domain(owner);
+}
+EXPORT_SYMBOL_GPL(hv_pcidev_is_pthru_dev);
+
+/* Build the HV_DEVICE_TYPE_LOGICAL id used with direct attached devices */
+static u64 hv_build_devid_type_logical(struct pci_dev *pdev)
+{
+	u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn);
+	hv_pci_segment seg = pci_domain_nr(pdev->bus);
+	union hv_device_id devid = { .as_uint64 = 0 };
+	union hv_pci_bdf bdf = { .as_uint16 = 0 };
+
+	/* Split the requester id back into bus/device/function fields */
+	bdf.bus = PCI_BUS_NUM(rid);
+	bdf.device = PCI_SLOT(rid);
+	bdf.function = PCI_FUNC(rid);
+
+	/* Logical id: PCI segment shifted up by 16, BDF in the low 16 bits */
+	devid.device_type = HV_DEVICE_TYPE_LOGICAL;
+	devid.logical.id = ((u64)seg << 16) | bdf.as_uint16;
+
+	return devid.as_uint64;
+}
+
+u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type)
+{
+	/* On an L1VH root the logical id is the VMBus based device id */
+	if (type == HV_DEVICE_TYPE_LOGICAL)
+		return hv_l1vh_partition() ? hv_pci_vmbus_device_id(pdev) :
+					     hv_build_devid_type_logical(pdev);
+
+#ifdef CONFIG_X86
+	if (type == HV_DEVICE_TYPE_PCI)
+		return hv_build_devid_type_pci(pdev);
+#endif
+
+	/* Any other type, or a PCI id requested on a non-x86 build */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(hv_build_devid_oftype);
+
+/* Create a stage 2 device domain with id hvdom->domid_num in the hypervisor */
+static int hv_iommu_create_hyp_devdom(struct hv_domain *hvdom)
+{
+	u64 status;
+	struct hv_input_device_domain *ddp;
+	struct hv_input_create_device_domain *input;
+	unsigned long flags;
+
+	local_irq_save(flags);	/* per-cpu input buffer is ours until restore */
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	ddp = &input->device_domain;
+	ddp->partition_id = HV_PARTITION_ID_SELF;
+	ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	ddp->domain_id.id = hvdom->domid_num;
+
+	input->create_device_domain_flags.forward_progress_required = 1;
+	input->create_device_domain_flags.inherit_owning_vtl = 0;
+
+	status = hv_do_hypercall(HVCALL_CREATE_DEVICE_DOMAIN, input, NULL);
+
+	local_irq_restore(flags);
+	/* A failure is logged here and also handed back to the caller */
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+
+	return hv_result_to_errno(status);
+}
+
+static struct iommu_domain *hv_iommu_domain_alloc_paging(struct device *dev)
+{
+	struct hv_domain *hvdom;
+	int rc;
+
+	if (hv_l1vh_partition() && !hv_curr_thread_is_vmm()) {
+		pr_err("Hyper-V: l1vh iommu does not support host devices\n");
+		return NULL;
+	}
+	/* Allocate the wrapper domain; every failure below returns NULL */
+	hvdom = kzalloc(sizeof(struct hv_domain), GFP_KERNEL);
+	if (hvdom == NULL)
+		return NULL;
+
+	spin_lock_init(&hvdom->mappings_lock);
+	hvdom->mappings_tree = RB_ROOT_CACHED;
+
+	/* NOTE(review): relies on callers serializing us (iommu group mutex) */
+	if (++unique_id == HV_DEVICE_DOMAIN_ID_S2_NULL)   /* i.e. U32_MAX */
+		goto out_err;
+
+	hvdom->domid_num = unique_id;
+	hvdom->partid = hv_get_current_partid();
+	hvdom->iommu_dom.geometry = default_geometry;
+	hvdom->iommu_dom.pgsize_bitmap = HV_IOMMU_PGSIZES;
+
+	/* For guests, by default we do direct attaches, so no domain in hyp */
+	if (hv_dom_owner_is_vmm(hvdom) && !hv_no_attdev)
+		hvdom->attached_dom = true;
+	else {
+		rc = hv_iommu_create_hyp_devdom(hvdom);
+		if (rc)
+			goto out_err;
+	}
+
+	return &hvdom->iommu_dom;
+
+out_err:
+	unique_id--;		/* hand the unused id back */
+	kfree(hvdom);
+	return NULL;
+}
+
+static void hv_iommu_domain_free(struct iommu_domain *immdom)
+{
+	struct hv_domain *hvdom = to_hv_domain(immdom);
+	unsigned long flags;
+	u64 status;
+	struct hv_input_delete_device_domain *input;
+
+	if (hv_special_domain(hvdom))
+		return;		/* the prebuilt dummy domains are static */
+	/* Inverse of the alloc-time test: this domain was created in hyp */
+	if (!hv_dom_owner_is_vmm(hvdom) || hv_no_attdev) {
+		struct hv_input_device_domain *ddp;
+
+		local_irq_save(flags);
+		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+		ddp = &input->device_domain;
+		memset(input, 0, sizeof(*input));
+
+		ddp->partition_id = HV_PARTITION_ID_SELF;
+		ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+		ddp->domain_id.id = hvdom->domid_num;
+
+		status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input,
+					 NULL);
+		local_irq_restore(flags);
+
+		if (!hv_result_success(status))
+			hv_status_err(status, "\n");
+	}
+	/* The wrapper is freed even when the delete hypercall failed */
+	kfree(hvdom);
+}
+
+/*
+ * Attach @pdev to the default domain, the null domain, or a domain previously
+ * created in the hypervisor. The hypercall status is returned as an errno.
+ */
+static int hv_iommu_att_dev2dom(struct hv_domain *hvdom, struct pci_dev *pdev)
+{
+	unsigned long flags;
+	u64 status;
+	enum hv_device_type dev_type;
+	struct hv_input_attach_device_domain *input;
+
+	local_irq_save(flags);	/* per-cpu input buffer is ours until restore */
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	/* For null domain, hvdom->domid_num == HV_DEVICE_DOMAIN_ID_S2_NULL */
+	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
+	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	input->device_domain.domain_id.id = hvdom->domid_num;
+
+	/* NB: Upon guest shutdown, device is re-attached to the default domain
+	 *     without explicit detach.
+	 */
+	if (hv_l1vh_partition())
+		dev_type = HV_DEVICE_TYPE_LOGICAL;
+	else
+		dev_type = HV_DEVICE_TYPE_PCI;
+
+	input->device_id.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
+
+	status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+
+	return hv_result_to_errno(status);
+}
+
+/* Direct attach @pdev to partition @ptid. Caller validated it is a pci dev */
+static int hv_iommu_direct_attach_device(struct pci_dev *pdev, u64 ptid)
+{
+	struct hv_input_attach_device *input;
+	u64 status;
+	int rc;
+	unsigned long flags;
+	union hv_device_id host_devid;
+	enum hv_device_type dev_type;
+
+	if (ptid == HV_PARTITION_ID_INVALID) {
+		pr_err("Hyper-V: Invalid partition id in direct attach\n");
+		return -EINVAL;
+	}
+
+	if (hv_l1vh_partition())
+		dev_type = HV_DEVICE_TYPE_LOGICAL;
+	else
+		dev_type = HV_DEVICE_TYPE_PCI;
+
+	host_devid.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
+
+	do {
+		local_irq_save(flags);
+		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+		memset(input, 0, sizeof(*input));
+		input->partition_id = ptid;
+		input->device_id = host_devid;
+
+		/* Hypervisor associates logical_id with this device, and in
+		 * some hypercalls like retarget interrupts, logical_id must be
+		 * used instead of the BDF. It is a required parameter.
+		 */
+		input->attdev_flags.logical_id = 1;
+		input->logical_devid =
+			   hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_LOGICAL);
+
+		status = hv_do_hypercall(HVCALL_ATTACH_DEVICE, input, NULL);
+		local_irq_restore(flags);
+		/* Hypervisor is short of memory: deposit more, then retry */
+		if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
+			rc = hv_call_deposit_pages(NUMA_NO_NODE, ptid, 1);
+			if (rc)
+				break;	/* status stays "insufficient memory" */
+		}
+	} while (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+
+	return hv_result_to_errno(status);
+}
+
+/* Attach a device for passthru to guest VMs, host apps like SPDK, etc */
+static int hv_iommu_attach_dev(struct iommu_domain *immdom, struct device *dev,
+			       struct iommu_domain *old)
+{
+	struct pci_dev *pdev;
+	int rc;
+	struct hv_domain *hvdom_new = to_hv_domain(immdom);
+	struct hv_domain *hvdom_prev = old ? to_hv_domain(old) : NULL;
+
+	/* Only allow PCI devices for now */
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	pdev = to_pci_dev(dev);
+	/* An L1VH root can only direct attach, or use the dummy domains */
+	if (hv_l1vh_partition() && !hv_special_domain(hvdom_new) &&
+	    !hvdom_new->attached_dom)
+		return -EINVAL;
+
+	/* VFIO does not do explicit detach calls, hence check first if we
+	 * need to detach. On guest shutdown it is the VMM thread that attaches
+	 * the device back to hv_def_identity_dom, so hvdom_prev is set then.
+	 * During boot @old is NULL, which is mapped to a NULL hvdom_prev above.
+	 */
+	if (hvdom_prev)
+		if (!hv_l1vh_partition() || !hv_special_domain(hvdom_prev))
+			hv_iommu_detach_dev(hvdom_prev, dev);
+
+	/* l1vh does not have a default S2 domain in the hypervisor */
+	if (hv_l1vh_partition() && hv_special_domain(hvdom_new)) {
+		dev_iommu_priv_set(dev, hvdom_new);  /* sets "private" field */
+		return 0;
+	}
+
+	if (hvdom_new->attached_dom)
+		rc = hv_iommu_direct_attach_device(pdev, hvdom_new->partid);
+	else
+		rc = hv_iommu_att_dev2dom(hvdom_new, pdev);
+
+	if (rc == 0)
+		dev_iommu_priv_set(dev, hvdom_new);  /* sets "private" field */
+	else
+		dev_iommu_priv_set(dev, NULL);	/* attach failed: no owner */
+
+	return rc;
+}
+
+static void hv_iommu_det_dev_from_guest(struct pci_dev *pdev, u64 ptid)
+{
+	struct hv_input_detach_device *input;
+	u64 status, log_devid;
+	unsigned long flags;
+
+	log_devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_LOGICAL);
+	/* Undo a direct attach; the device is named by its logical id */
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->partition_id = ptid;
+	input->logical_devid = log_devid;
+	status = hv_do_hypercall(HVCALL_DETACH_DEVICE, input, NULL);
+	local_irq_restore(flags);
+	/* A failure is only logged; nothing is returned to the caller */
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+}
+
+static void hv_iommu_det_dev_from_dom(struct pci_dev *pdev)
+{
+	u64 status, devid;
+	unsigned long flags;
+	struct hv_input_detach_device_domain *input;
+
+	devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_PCI);
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->partition_id = HV_PARTITION_ID_SELF;
+	input->device_id.as_uint64 = devid;
+	status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+}
+
+static void hv_iommu_detach_dev(struct hv_domain *hvdom, struct device *dev)
+{
+	struct pci_dev *pdev;
+
+	/* See the attach function, only PCI devices for now */
+	if (!dev_is_pci(dev))
+		return;
+
+	pdev = to_pci_dev(dev);
+
+	if (hvdom->attached_dom)
+		hv_iommu_det_dev_from_guest(pdev, hvdom->partid);
+
+		/* Do not clear attached_dom, hv_iommu_unmap_pages happens
+		 * next.
+		 */
+	else
+		hv_iommu_det_dev_from_dom(pdev);
+}
+
+static int hv_iommu_add_tree_mapping(struct hv_domain *hvdom,
+				     unsigned long iova, phys_addr_t paddr,
+				     size_t size, u32 flags)
+{
+	unsigned long irqflags;
+	struct hv_iommu_mapping *mapping;
+
+	mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
+	if (!mapping)
+		return -ENOMEM;
+
+	mapping->paddr = paddr;
+	mapping->iova.start = iova;
+	mapping->iova.last = iova + size - 1;
+	mapping->flags = flags;
+
+	spin_lock_irqsave(&hvdom->mappings_lock, irqflags);
+	interval_tree_insert(&mapping->iova, &hvdom->mappings_tree);
+	spin_unlock_irqrestore(&hvdom->mappings_lock, irqflags);
+
+	return 0;
+}
+
+static size_t hv_iommu_del_tree_mappings(struct hv_domain *hvdom,
+					unsigned long iova, size_t size)
+{
+	unsigned long flags;
+	size_t unmapped = 0;
+	unsigned long last = iova + size - 1;
+	struct hv_iommu_mapping *mapping = NULL;
+	struct interval_tree_node *node, *next;
+
+	spin_lock_irqsave(&hvdom->mappings_lock, flags);
+	next = interval_tree_iter_first(&hvdom->mappings_tree, iova, last);
+	while (next) {
+		node = next;
+		mapping = container_of(node, struct hv_iommu_mapping, iova);
+		next = interval_tree_iter_next(node, iova, last);
+
+		/* Trying to split a mapping? Not supported for now. */
+		if (mapping->iova.start < iova)
+			break;
+
+		unmapped += mapping->iova.last - mapping->iova.start + 1;
+
+		interval_tree_remove(node, &hvdom->mappings_tree);
+		kfree(mapping);
+	}
+	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
+
+	return unmapped;
+}
+
+/* Return: must return exact status from the hypercall without changes */
+static u64 hv_iommu_map_pgs(struct hv_domain *hvdom,
+			    unsigned long iova, phys_addr_t paddr,
+			    unsigned long npages, u32 map_flags)
+{
+	u64 status;
+	int i;
+	struct hv_input_map_device_gpa_pages *input;
+	unsigned long flags, pfn;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
+	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	input->device_domain.domain_id.id = hvdom->domid_num;
+	input->map_flags = map_flags;
+	input->target_device_va_base = iova;
+
+	pfn = paddr >> HV_HYP_PAGE_SHIFT;
+	for (i = 0; i < npages; i++, pfn++)
+		input->gpa_page_list[i] = pfn;
+
+	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_GPA_PAGES, npages, 0,
+				     input, NULL);
+
+	local_irq_restore(flags);
+	return status;
+}
+
+#define HV_MAP_DEVICE_GPA_BATCH_SIZE   \
+	((HV_HYP_PAGE_SIZE - sizeof(struct hv_input_map_device_gpa_pages)) \
+			/ sizeof(u64))
+
+/*
+ * The core VFIO code loops over memory ranges calling this function with the
+ * largest pgsize from HV_IOMMU_PGSIZES. cond_resched() is in vfio_iommu_map.
+ */
+static int hv_iommu_map_pages(struct iommu_domain *immdom, ulong iova,
+			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
+			      int prot, gfp_t gfp, size_t *mapped)
+{
+	u32 map_flags;
+	int ret;
+	u64 status;
+	unsigned long npages, done = 0;
+	struct hv_domain *hvdom = to_hv_domain(immdom);
+	size_t size = pgsize * pgcount;
+
+	map_flags = HV_MAP_GPA_READABLE;	/* required */
+	map_flags |= prot & IOMMU_WRITE ? HV_MAP_GPA_WRITABLE : 0;
+
+	ret = hv_iommu_add_tree_mapping(hvdom, iova, paddr, size, map_flags);
+	if (ret)
+		return ret;
+
+	if (hvdom->attached_dom) {
+		*mapped = size;
+		return 0;
+	}
+
+	npages = size >> HV_HYP_PAGE_SHIFT;
+	while (done < npages) {
+		ulong completed, remain = npages - done;
+
+		remain = min(remain, HV_MAP_DEVICE_GPA_BATCH_SIZE);
+
+		status = hv_iommu_map_pgs(hvdom, iova, paddr, remain,
+					  map_flags);
+
+		completed = hv_repcomp(status);
+		done = done + completed;
+		iova = iova + (completed << HV_HYP_PAGE_SHIFT);
+		paddr = paddr + (completed << HV_HYP_PAGE_SHIFT);
+
+		if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
+			ret = hv_call_deposit_pages(NUMA_NO_NODE,
+						    hv_current_partition_id,
+						    256);
+			if (ret)
+				break;
+			continue;
+		}
+		if (!hv_result_success(status))
+			break;
+	}
+
+	if (!hv_result_success(status)) {
+		size_t done_size = done << HV_HYP_PAGE_SHIFT;
+
+		hv_status_err(status, "pgs:%lx/%lx iova:%lx\n",
+			      done, npages, iova);
+		/*
+		 * lookup tree has all mappings [0 - size-1]. Below unmap will
+		 * only remove from [0 - done], we need to remove second chunk
+		 * [done+1 - size-1].
+		 */
+		hv_iommu_del_tree_mappings(hvdom, iova, size - done_size);
+		hv_iommu_unmap_pages(immdom, iova - done_size, HV_HYP_PAGE_SIZE,
+				     done, NULL);
+		if (mapped)
+			*mapped = 0;
+	} else
+		if (mapped)
+			*mapped = size;
+
+	return hv_result_to_errno(status);
+}
+
+static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom, ulong iova,
+				   size_t pgsize, size_t pgcount,
+				   struct iommu_iotlb_gather *gather)
+{
+	unsigned long flags, npages;
+	struct hv_input_unmap_device_gpa_pages *input;
+	u64 status;
+	struct hv_domain *hvdom = to_hv_domain(immdom);
+	size_t unmapped, size = pgsize * pgcount;
+
+	unmapped = hv_iommu_del_tree_mappings(hvdom, iova, size);
+	if (unmapped < size)
+		pr_err("%s: could not delete all mappings (%lx:%lx/%lx)\n",
+		       __func__, iova, unmapped, size);
+
+	if (hvdom->attached_dom)
+		return size;
+
+	npages = size >> HV_HYP_PAGE_SHIFT;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
+	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
+	input->device_domain.domain_id.id = hvdom->domid_num;
+	input->target_device_va_base = iova;
+
+	status = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_GPA_PAGES, npages,
+				     0, input, NULL);
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+
+	return unmapped;
+}
+
+static phys_addr_t hv_iommu_iova_to_phys(struct iommu_domain *immdom,
+					 dma_addr_t iova)
+{
+	unsigned long flags;
+	struct hv_iommu_mapping *mapping;
+	struct interval_tree_node *node;
+	u64 paddr = 0;
+	struct hv_domain *hvdom = to_hv_domain(immdom);
+
+	spin_lock_irqsave(&hvdom->mappings_lock, flags);
+	node = interval_tree_iter_first(&hvdom->mappings_tree, iova, iova);
+	if (node) {
+		mapping = container_of(node, struct hv_iommu_mapping, iova);
+		paddr = mapping->paddr + (iova - mapping->iova.start);
+	}
+	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
+
+	return paddr;
+}
+
+/*
+ * The hypervisor does not currently report which devices it is using itself
+ * (e.g. a network device used by the hypervisor debugger), so let the user
+ * list devices to skip in pci_devs_to_skip as hex "(seg:bus:slot.func)".
+ */
+static struct iommu_device *hv_iommu_probe_device(struct device *dev)
+{
+	if (!dev_is_pci(dev))
+		return ERR_PTR(-ENODEV);
+
+	if (pci_devs_to_skip && *pci_devs_to_skip) {
+		int rc, pos = 0;
+		int parsed;
+		int segment, bus, slot, func;
+		struct pci_dev *pdev = to_pci_dev(dev);
+
+		do {
+			parsed = 0;
+			rc = sscanf(pci_devs_to_skip + pos, " (%x:%x:%x.%x) %n",
+				    &segment, &bus, &slot, &func, &parsed);
+			/* All four fields must parse; "parsed" stays 0 unless
+			 * the closing ')' was matched as well.
+			 */
+			if (rc != 4 || parsed <= 0)
+				break;
+
+			if (pci_domain_nr(pdev->bus) == segment &&
+			    pdev->bus->number == bus &&
+			    PCI_SLOT(pdev->devfn) == slot &&
+			    PCI_FUNC(pdev->devfn) == func) {
+
+				dev_info(dev, "skipped by Hyper-V IOMMU\n");
+				return ERR_PTR(-ENODEV);
+			}
+			pos += parsed;
+
+		} while (pci_devs_to_skip[pos]);
+	}
+
+	/* Device will be explicitly attached to the default domain, so no need
+	 * to do dev_iommu_priv_set() here.
+	 */
+
+	return &hv_virt_iommu;
+}
+
+static void hv_iommu_probe_finalize(struct device *dev)
+{
+	struct iommu_domain *immdom = iommu_get_domain_for_dev(dev);
+
+	if (immdom && immdom->type == IOMMU_DOMAIN_DMA)
+		iommu_setup_dma_ops(dev, immdom);
+	else
+		set_dma_ops(dev, NULL);
+}
+
+static void hv_iommu_release_device(struct device *dev)
+{
+	struct hv_domain *hvdom = dev_iommu_priv_get(dev);
+
+	/* Need to detach device from device domain if necessary. */
+	if (hvdom)
+		hv_iommu_detach_dev(hvdom, dev);
+
+	dev_iommu_priv_set(dev, NULL);
+	set_dma_ops(dev, NULL);
+}
+
+static struct iommu_group *hv_iommu_device_group(struct device *dev)
+{
+	if (dev_is_pci(dev))
+		return pci_device_group(dev);
+	else
+		return generic_device_group(dev);
+}
+
+static int hv_iommu_def_domain_type(struct device *dev)
+{
+	/* The hypervisor always creates this by default during boot */
+	return IOMMU_DOMAIN_IDENTITY;
+}
+
+static struct iommu_ops hv_iommu_ops = {
+	.capable	    = hv_iommu_capable,
+	.domain_alloc_paging	= hv_iommu_domain_alloc_paging,
+	.probe_device	    = hv_iommu_probe_device,
+	.probe_finalize     = hv_iommu_probe_finalize,
+	.release_device     = hv_iommu_release_device,
+	.def_domain_type    = hv_iommu_def_domain_type,
+	.device_group	    = hv_iommu_device_group,
+	.default_domain_ops = &(const struct iommu_domain_ops) {
+		.attach_dev   = hv_iommu_attach_dev,
+		.map_pages    = hv_iommu_map_pages,
+		.unmap_pages  = hv_iommu_unmap_pages,
+		.iova_to_phys = hv_iommu_iova_to_phys,
+		.free	      = hv_iommu_domain_free,
+	},
+	.owner		    = THIS_MODULE,
+	.identity_domain = &hv_def_identity_dom.iommu_dom,
+	.blocked_domain  = &hv_null_dom.iommu_dom,
+};
+
+static const struct iommu_domain_ops hv_special_domain_ops = {
+	.attach_dev = hv_iommu_attach_dev,
+};
+
+static void __init hv_initialize_special_domains(void)
+{
+	hv_def_identity_dom.iommu_dom.type = IOMMU_DOMAIN_IDENTITY;
+	hv_def_identity_dom.iommu_dom.ops = &hv_special_domain_ops;
+	hv_def_identity_dom.iommu_dom.owner = &hv_iommu_ops;
+	hv_def_identity_dom.iommu_dom.geometry = default_geometry;
+	hv_def_identity_dom.domid_num = HV_DEVICE_DOMAIN_ID_S2_DEFAULT; /* 0 */
+
+	hv_null_dom.iommu_dom.type = IOMMU_DOMAIN_BLOCKED;
+	hv_null_dom.iommu_dom.ops = &hv_special_domain_ops;
+	hv_null_dom.iommu_dom.owner = &hv_iommu_ops;
+	hv_null_dom.iommu_dom.geometry = default_geometry;
+	hv_null_dom.domid_num = HV_DEVICE_DOMAIN_ID_S2_NULL;  /* INTMAX */
+}
+
+static int __init hv_iommu_init(void)
+{
+	int ret;
+	struct iommu_device *iommup = &hv_virt_iommu;
+
+	if (!hv_is_hyperv_initialized())
+		return -ENODEV;
+
+	ret = iommu_device_sysfs_add(iommup, NULL, NULL, "%s", "hyperv-iommu");
+	if (ret) {
+		pr_err("Hyper-V: iommu_device_sysfs_add failed: %d\n", ret);
+		return ret;
+	}
+
+	/* This must come before iommu_device_register because the latter calls
+	 * into the hooks.
+	 */
+	hv_initialize_special_domains();
+
+	ret = iommu_device_register(iommup, &hv_iommu_ops, NULL);
+	if (ret) {
+		pr_err("Hyper-V: iommu_device_register failed: %d\n", ret);
+		goto err_sysfs_remove;
+	}
+
+	pr_info("Hyper-V IOMMU initialized\n");
+
+	return 0;
+
+err_sysfs_remove:
+	iommu_device_sysfs_remove(iommup);
+	return ret;
+}
+
+void __init hv_iommu_detect(void)
+{
+	if (no_iommu || iommu_detected)
+		return;
+
+	/* For l1vh, always expose an iommu unit */
+	if (!hv_l1vh_partition())
+		if (!(ms_hyperv.misc_features & HV_DEVICE_DOMAIN_AVAILABLE))
+			return;
+
+	iommu_detected = 1;
+	x86_init.iommu.iommu_init = hv_iommu_init;
+
+	pci_request_acs();
+}
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 25ac7ca0fd8b..8d5c610da99a 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -327,6 +327,23 @@ static inline u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
 { return 0; }
 #endif /* IS_ENABLED(CONFIG_PCI_HYPERV) */
 
+#if IS_ENABLED(CONFIG_HYPERV_IOMMU)
+u64 hv_get_current_partid(void);
+bool hv_pcidev_is_attached_dev(struct pci_dev *pdev);
+bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev);
+u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type);
+#else
+static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
+{ return false; }
+static inline bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev)
+{ return false; }
+static inline u64 hv_build_devid_oftype(struct pci_dev *pdev,
+					enum hv_device_type type)
+{ return 0; }
+static inline u64 hv_get_current_partid(void)
+{ return HV_PARTITION_ID_INVALID; }
+#endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
+
 #else /* CONFIG_HYPERV */
 static inline void hv_identify_partition_type(void) {}
 static inline bool hv_is_hyperv_initialized(void) { return false; }
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 5459e776ec17..6eee1cbf6f23 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1769,4 +1769,10 @@ static inline unsigned long virt_to_hvpfn(void *addr)
 #define HVPFN_DOWN(x)	((x) >> HV_HYP_PAGE_SHIFT)
 #define page_to_hvpfn(page)	(page_to_pfn(page) * NR_HV_HYP_PAGES_IN_PAGE)
 
+#ifdef CONFIG_HYPERV_IOMMU
+void __init hv_iommu_detect(void);
+#else
+static inline void hv_iommu_detect(void) { }
+#endif /* CONFIG_HYPERV_IOMMU */
+
 #endif /* _HYPERV_H */
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 11/11] mshv: Mark mem regions as non-movable upfront if device passthru
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

If a VM is started with a device attached, the mem regions must be marked
non-movable, as the device attach hypercall immediately allows the use of
SLAT for IOMMU. Marking them non-movable forces the entire guest RAM to be
mapped in the SLAT, and the region to be pinned, at the time of region
creation. Also, because a device could be dynamically attached much later
in a VM, create a boot parameter to disable movable pages that users can
set if they anticipate such an action.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/mshv_root.h      |  1 +
 drivers/hv/mshv_root_main.c | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index b9880d0bdc4d..d57c26950203 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -141,6 +141,7 @@ struct mshv_partition {
 	pid_t pt_vmm_tgid;
 	bool import_completed;
 	bool pt_initialized;
+	bool pt_regions_pinned;
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 	struct dentry *pt_stats_dentry;
 	struct dentry *pt_vp_dentry;
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index a7864463961b..ac71534733bd 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -49,6 +49,10 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
 static bool hv_nofull_mmio;	/* don't map entire mmio region upon fault */
 module_param(hv_nofull_mmio, bool, 0644);
 
+static bool hv_no_movbl_pgs;	/* disable movable pages completely */
+module_param(hv_no_movbl_pgs, bool, 0644);
+MODULE_PARM_DESC(hv_no_movbl_pgs, "If set, don't do movable pages for VMs");
+
 struct mshv_root mshv_root;
 
 enum hv_scheduler_type hv_scheduler_type;
@@ -1303,6 +1307,12 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
 	*status = partition->async_hypercall_status;
 }
 
+static bool mshv_do_pt_regions_pinned(struct mshv_partition *pt)
+{
+	return pt->pt_regions_pinned || mshv_partition_encrypted(pt) ||
+	       hv_no_movbl_pgs;
+}
+
 /*
  * NB: caller checks and makes sure mem->size is page aligned
  * Returns: 0 with regionpp updated on success, or -errno
@@ -1333,7 +1343,7 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
 
 	if (is_mmio)
 		rg->mreg_type = MSHV_REGION_TYPE_MMIO;
-	else if (mshv_partition_encrypted(partition) ||
+	else if (mshv_do_pt_regions_pinned(partition) ||
 		 !mshv_region_movable_init(rg))
 		rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
 	else
@@ -1808,6 +1818,9 @@ static long mshv_partition_ioctl_create_device(struct mshv_partition *partition,
 	if (copy_to_user(uarg, &devargk, sizeof(devargk)))
 		return -EFAULT;    /* cleanup in mshv_device_fop_release() */
 
+	/* For now, all regions must be pinned if there is device passthru. */
+	partition->pt_regions_pinned = true;
+
 	return 0;
 
 undo_out:
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V3 10/11] mshv: Populate mmio mappings for PCI passthru
From: Mukesh R @ 2026-05-12  2:02 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, wei.liu, mrathor, mhklinux, muislam,
	namjain, magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch
  Cc: kys, haiyangz, decui, longli, tglx, mingo, bp, dave.hansen, x86,
	joro, will, lpieralisi, kwilczynski, bhelgaas, arnd, jacob.pan
In-Reply-To: <20260512020259.1678627-1-mrathor@linux.microsoft.com>

Upon guest access, in case of a missing mmio mapping, the hypervisor
generates an unmapped gpa intercept. In this path, look up the PCI
resource pfn for the guest gpa, and ask the hypervisor to map it
via a hypercall. The PCI resource pfn is maintained by the VFIO driver,
and obtained via a fixup_user_fault call (similar to KVM).

Also, VFIO no longer puts the mmio pfn in vma->vm_pgoff. So, remove
the code that is using it to map mmio space. It is broken and will cause
a panic.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c | 113 ++++++++++++++++++++++++++++++------
 1 file changed, 96 insertions(+), 17 deletions(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 6ceb5f608589..a7864463961b 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -46,6 +46,9 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
 #define HV_VP_COUNTER_ROOT_DISPATCH_THREAD_BLOCKED 95
 #endif
 
+static bool hv_nofull_mmio;	/* don't map entire mmio region upon fault */
+module_param(hv_nofull_mmio, bool, 0644);
+
 struct mshv_root mshv_root;
 
 enum hv_scheduler_type hv_scheduler_type;
@@ -641,6 +644,94 @@ mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
 	return region;
 }
 
+/*
+ * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
+ * else just return -errno.
+ */
+static int mshv_chk_get_mmio_start_pfn(u64 uaddr, u64 *mmio_pfnp)
+{
+	struct vm_area_struct *vma;
+	bool is_mmio;
+	struct follow_pfnmap_args pfnmap_args;
+	int rc = -EINVAL;
+
+	mmap_read_lock(current->mm);
+	vma = vma_lookup(current->mm, uaddr);
+	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
+	if (!is_mmio)
+		goto unlock_mmap_out;
+
+	pfnmap_args.vma = vma;
+	pfnmap_args.address = uaddr;
+
+	rc = follow_pfnmap_start(&pfnmap_args);
+	if (rc) {
+		rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
+				      NULL);
+		if (rc)
+			goto unlock_mmap_out;
+
+		rc = follow_pfnmap_start(&pfnmap_args);
+		if (rc)
+			goto unlock_mmap_out;
+	}
+
+	*mmio_pfnp = pfnmap_args.pfn;
+	follow_pfnmap_end(&pfnmap_args);
+
+unlock_mmap_out:
+	mmap_read_unlock(current->mm);
+	return rc;
+}
+
+/*
+ * Check if the unmapped gpa belongs to mmio space. If yes, resolve it.
+ *
+ * Returns: True if valid mmio intercept and handled, else false.
+ */
+static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
+{
+	struct hv_message *hvmsg = vp->vp_intercept_msg_page;
+	u64 gfn, uaddr, mmio_spa, numpgs;
+	struct mshv_mem_region *rg;
+	int rc = -EINVAL;
+	struct mshv_partition *pt = vp->vp_partition;
+#if defined(CONFIG_X86_64)
+	struct hv_x64_memory_intercept_message *msg =
+		(struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
+#elif defined(CONFIG_ARM64)
+	struct hv_arm64_memory_intercept_message *msg =
+		(struct hv_arm64_memory_intercept_message *)hvmsg->u.payload;
+#endif
+
+	gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
+
+	rg = mshv_partition_region_by_gfn_get(pt, gfn);
+	if (rg == NULL)
+		return false;
+	if (rg->mreg_type != MSHV_REGION_TYPE_MMIO)
+		goto put_rg_out;
+
+	uaddr = rg->start_uaddr + ((gfn - rg->start_gfn) << HV_HYP_PAGE_SHIFT);
+
+	rc = mshv_chk_get_mmio_start_pfn(uaddr, &mmio_spa);
+	if (rc)
+		goto put_rg_out;
+
+	if (!hv_nofull_mmio) {		/* default case */
+		mmio_spa = mmio_spa - (gfn - rg->start_gfn);
+		gfn = rg->start_gfn;
+		numpgs = rg->nr_pages;
+	} else
+		numpgs = 1;
+
+	rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
+
+put_rg_out:
+	mshv_region_put(rg);
+	return rc == 0;
+}
+
 /**
  * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
  * @vp: Pointer to the virtual processor structure.
@@ -699,6 +790,8 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
 static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
 {
 	switch (vp->vp_intercept_msg_page->header.message_type) {
+	case HVMSG_UNMAPPED_GPA:
+		return mshv_handle_unmapped_gpa(vp);
 	case HVMSG_GPA_INTERCEPT:
 		return mshv_handle_gpa_intercept(vp);
 	}
@@ -1322,16 +1415,8 @@ static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
 }
 
 /*
- * This maps two things: guest RAM and for pci passthru mmio space.
- *
- * mmio:
- *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
- *  - Two things need to happen for mapping mmio range:
- *	1. mapped in the uaddr so VMM can access it.
- *	2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
- *
- *   This function takes care of the second. The first one is managed by vfio,
- *   and hence is taken care of via vfio_pci_mmap_fault().
+ * This is called for both user ram and mmio space. The mmio space is not
+ * mapped here, but later during intercept on demand.
  */
 static long
 mshv_map_user_memory(struct mshv_partition *partition,
@@ -1340,7 +1425,6 @@ mshv_map_user_memory(struct mshv_partition *partition,
 	struct mshv_mem_region *region;
 	struct vm_area_struct *vma;
 	bool is_mmio;
-	ulong mmio_pfn;
 	long ret;
 
 	if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
@@ -1350,7 +1434,6 @@ mshv_map_user_memory(struct mshv_partition *partition,
 	mmap_read_lock(current->mm);
 	vma = vma_lookup(current->mm, mem->userspace_addr);
 	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
-	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
 	mmap_read_unlock(current->mm);
 
 	if (!vma)
@@ -1376,11 +1459,7 @@ mshv_map_user_memory(struct mshv_partition *partition,
 					    region->nr_pages,
 					    HV_MAP_GPA_NO_ACCESS, NULL);
 		break;
-	case MSHV_REGION_TYPE_MMIO:
-		ret = hv_call_map_mmio_pages(partition->pt_id,
-					     region->start_gfn,
-					     mmio_pfn,
-					     region->nr_pages);
+	default:
 		break;
 	}
 
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V1 0/3] PCI passthru on Hyper-V (Part II)
From: Mukesh R @ 2026-05-12  2:12 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch

This patch series implements interrupt remapping part of the PCI
passthru feature on Hyper-V when Linux is running as a privileged VM.
These patches complement Part I of the feature at:

https://lore.kernel.org/linux-hyperv/20260512020259.1678627-1-mrathor@linux.microsoft.com/T/#t

Testing and other details are listed there.

Changes in V1:
 o rebase to above V3 of Part I
 o check for NULL irqdata->parent_data->chip before calling 
   irq_chip_unmask_parent().

Thanks,
-Mukesh

Mukesh R (3):
  mshv: Import declarations for irq remap and add irqbypass support
  hyperv: Implement irq remap for passthru devices
  mshv: Implement guest irq migration for passthru devices

 arch/x86/hyperv/irqdomain.c         |  18 +-
 drivers/hv/Kconfig                  |   1 +
 drivers/hv/mshv_eventfd.c           | 501 +++++++++++++++++++++++++++-
 drivers/hv/mshv_eventfd.h           |   3 +
 drivers/iommu/hyperv-iommu-root.c   |  14 +
 drivers/pci/controller/pci-hyperv.c |  10 +
 include/asm-generic/mshyperv.h      |   3 +
 include/hyperv/hvgdk_mini.h         |   3 +
 include/hyperv/hvhdk.h              |  17 +
 9 files changed, 564 insertions(+), 6 deletions(-)

-- 
2.51.2.vfs.0.1


^ permalink raw reply

* [PATCH V1 1/3] mshv: Import declarations for irq remap and add irqbypass support
From: Mukesh R @ 2026-05-12  2:12 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch
In-Reply-To: <20260512021242.1679786-1-mrathor@linux.microsoft.com>

For the irq map/remap hypercalls, copy relevant data structures from
hypervisor public headers into Linux equivalents. Also, update Kconfig and
mshv_irqfd for irqbypass. Please note, irqbypass is required for doing
passthru on MSHV. This is because there is really no way of knowing the linux
irq in the mshv_irqfd_assign and mshv_irqfd_update paths without it. The
linux irq is set up upfront by VFIO before irqfd assign/update happens.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/Kconfig          |  1 +
 drivers/hv/mshv_eventfd.h   |  3 +++
 include/hyperv/hvgdk_mini.h |  3 +++
 include/hyperv/hvhdk.h      | 17 +++++++++++++++++
 4 files changed, 24 insertions(+)

diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 7937ac0cbd0f..c831fe25ca2b 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -75,6 +75,7 @@ config MSHV_ROOT
 	# no particular order, making it impossible to reassemble larger pages
 	depends on PAGE_SIZE_4KB
 	select EVENTFD
+	select IRQ_BYPASS_MANAGER
 	select VIRT_XFER_TO_GUEST_WORK
 	select HMM_MIRROR
 	select MMU_NOTIFIER
diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h
index 464c6b81ab33..ff4dd24b8ad4 100644
--- a/drivers/hv/mshv_eventfd.h
+++ b/drivers/hv/mshv_eventfd.h
@@ -9,6 +9,7 @@
 #define __LINUX_MSHV_EVENTFD_H
 
 #include <linux/poll.h>
+#include <linux/irqbypass.h>
 
 #include "mshv.h"
 #include "mshv_root.h"
@@ -37,6 +38,8 @@ struct mshv_irqfd {
 	struct mshv_irqfd_resampler	    *irqfd_resampler;
 	struct eventfd_ctx		    *irqfd_resamplefd;
 	struct hlist_node		     irqfd_resampler_hnode;
+	struct irq_bypass_consumer	     irqfd_bypass_cons;
+	struct irq_bypass_producer	    *irqfd_bypass_prod;
 };
 
 void mshv_eventfd_init(struct mshv_partition *partition);
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index da622fb06440..1ef480825705 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -59,6 +59,8 @@ struct hv_u128 {
 #define HV_PARTITION_ID_INVALID		((u64)0)
 #define HV_PARTITION_ID_SELF		((u64)-1)
 
+#define HV_MAX_VPS    256               /* HV_MAXIMUM_PROCESSORS */
+
 /* Hyper-V specific model specific registers (MSRs) */
 
 #if defined(CONFIG_X86)
@@ -508,6 +510,7 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_UNMAP_VP_STATE_PAGE			0x00e2
 #define HVCALL_GET_VP_STATE				0x00e3
 #define HVCALL_SET_VP_STATE				0x00e4
+#define HVCALL_GET_VPSET_FROM_MDA                       0x00e5
 #define HVCALL_GET_VP_CPUID_VALUES			0x00f4
 #define HVCALL_GET_PARTITION_PROPERTY_EX		0x0101
 #define HVCALL_MMIO_READ				0x0106
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index 5e83d3714966..d0a892347ab1 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -952,4 +952,21 @@ struct hv_input_modify_sparse_spa_page_host_access {
 #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE      0x4
 #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_HUGE_PAGE       0x8
 
+#ifdef CONFIG_X86
+
+struct hv_input_get_vp_set_from_mda {   /* HV_INPUT_GET_VP_SET_FROM_MDA */
+	u64 target_partid;
+	u64 dest_address;
+	u8  input_vtl;
+	u8  destmode_logical;         /* true => mode is logical */
+	u16 reserved0;                /* mbz */
+	u32 reserved1;                /* mbz */
+} __packed;
+
+union hv_output_get_vp_set_from_mda {  /* HV_OUTPUT_GET_VP_SET_FROM_MDA */
+	struct hv_vpset target_vpset;
+	u64 bitset_buffer[HV_GENERIC_SET_QWORD_COUNT(HV_MAX_VPS)];
+} __packed;
+
+#endif /* CONFIG_X86 */
 #endif /* _HV_HVHDK_H */
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V1 2/3] hyperv: Implement irq remap for passthru devices
From: Mukesh R @ 2026-05-12  2:12 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch
In-Reply-To: <20260512021242.1679786-1-mrathor@linux.microsoft.com>

Implement interrupt remapping for direct attached and domain attached
devices on Hyper-V.

Please note there are a few constraints when it comes to mapping device
interrupts on Hyper-V. For example, the hypervisor will not allow mapping
device interrupts to root if the device is a direct attached device. Since
the target guest cpu and vector info is not available during the initial
VFIO irq setup, we work around by skipping this initial map. Then later
during irqbypass trigger, when both guest target cpu and vector are
available, we do the map in the hypervisor, update the device, and enable
the interrupt vector on the device. Rather than special case direct attached,
we do the same for domain attached also. This implies irqbypass is required
for MSHV pci device passthru. Also noteworthy is that the hypervisor
will automatically set up any direct hw injection like posted interrupts.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 arch/x86/hyperv/irqdomain.c         |  18 +-
 drivers/hv/mshv_eventfd.c           | 423 +++++++++++++++++++++++++++-
 drivers/iommu/hyperv-iommu-root.c   |  14 +
 drivers/pci/controller/pci-hyperv.c |  10 +
 include/asm-generic/mshyperv.h      |   3 +
 5 files changed, 464 insertions(+), 4 deletions(-)

diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 8780573a4332..02f9a889c014 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -197,7 +197,7 @@ int hv_map_msi_interrupt(struct irq_data *data,
 
 	msidesc = irq_data_get_msi_desc(data);
 	pdev = msi_desc_to_pci_dev(msidesc);
-	hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
 	cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
 
 	return hv_map_interrupt(hv_devid, false, cpu, cfg->vector,
@@ -233,6 +233,20 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 		return;
 	}
 
+	/*
+	 * For direct attached devices, we cannot map interrupts in the
+	 * hypervisor because it will not allow it until we have guest target
+	 * vcpu and vector. So defer it until irqbypass. Also, do the same
+	 * for domain attached devices for simplicity.
+	 */
+	if (hv_pcidev_is_pthru_dev(pdev)) {
+		if (data->chip_data)
+			entry_to_msi_msg(data->chip_data, msg);
+		else
+			memset(msg, 0, sizeof(struct msi_msg));
+		return;
+	}
+
 	if (data->chip_data) {
 		/*
 		 * This interrupt is already mapped. Let's unmap first.
@@ -272,7 +286,7 @@ static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
 {
 	union hv_device_id hv_devid;
 
-	hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
 	return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
 }
 
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 90959f639dc3..1f5c1e9ee9b7 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -7,7 +7,6 @@
  *
  * All credits to kvm developers.
  */
-
 #include <linux/syscalls.h>
 #include <linux/wait.h>
 #include <linux/poll.h>
@@ -15,7 +14,8 @@
 #include <linux/list.h>
 #include <linux/workqueue.h>
 #include <linux/eventfd.h>
-
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
 #if IS_ENABLED(CONFIG_X86_64)
 #include <asm/apic.h>
 #endif
@@ -27,6 +27,377 @@
 
 static struct workqueue_struct *irqfd_cleanup_wq;
 
+#if IS_ENABLED(CONFIG_X86_64)
+
+static int mshv_parse_mshv_irqfd(struct mshv_irqfd *irqfd,
+				 struct pci_dev **out_pdev,
+				 struct irq_data **out_irqdata)
+{
+	struct irq_bypass_producer *prod;
+	struct msi_desc *msidesc;
+	struct irq_data *irqdata;
+
+	if (irqfd == NULL || irqfd->irqfd_bypass_prod == NULL)
+		return -ENODEV;
+
+	prod = irqfd->irqfd_bypass_prod;
+
+	irqdata = irq_get_irq_data(prod->irq);
+	if (irqdata == NULL) {
+		pr_err("Hyper-V: irqbypass fail, no irqdata. irq:0x%x\n",
+		       prod->irq);
+		return -EINVAL;
+	}
+	*out_irqdata = irqdata;
+
+	msidesc = irq_data_get_msi_desc(irqdata);
+	if (msidesc == NULL) {
+		pr_err("Hyper-V: irqbypass msi fail. irq:0x%x\n", prod->irq);
+		return -EINVAL;
+	}
+
+	*out_pdev = msi_desc_to_pci_dev(msidesc);
+	if (*out_pdev == NULL) {
+		pr_err("Hyper-V: mshv_irqfd parse fail. irq:0x%x\n", prod->irq);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Must be called with interrupts disabled */
+static int hv_vpset_from_hyp_disabled(
+			struct hv_input_get_vp_set_from_mda *input,
+			union hv_output_get_vp_set_from_mda *output,
+			struct mshv_lapic_irq *lapic_irq, u64 partid)
+{
+	u64 status;
+
+	memset(input, 0, sizeof(*input));
+	input->target_partid = partid;
+	input->dest_address = lapic_irq->lapic_apic_id;
+	input->input_vtl = 0;
+	input->destmode_logical = lapic_irq->lapic_control.logical_dest_mode;
+
+	status = hv_do_hypercall(HVCALL_GET_VPSET_FROM_MDA, input, output);
+	if (!hv_result_success(status)) {
+		hv_status_err(status, "apicid:0x%llx dest:0x%x\n",
+			      lapic_irq->lapic_apic_id,
+			      lapic_irq->lapic_control.logical_dest_mode);
+	}
+
+	return hv_result_to_errno(status);
+}
+
+/* Returns number of banks copied, -errno in case of error */
+static int hv_copy_vpset(struct hv_vpset *dest, struct hv_vpset *src)
+{
+	u64 bank_mask;
+	int banks, tot_banks = hv_max_vp_index / HV_VCPUS_PER_SPARSE_BANK;
+
+	if (tot_banks >= HV_MAX_SPARSE_VCPU_BANKS)
+		return -EINVAL;
+
+	dest->format = src->format;
+	dest->valid_bank_mask = src->valid_bank_mask;
+	bank_mask = src->valid_bank_mask;
+	for (banks = 0; banks <= tot_banks; banks++) {
+		if (bank_mask == 0)
+			break;
+
+		if (bank_mask & 1)
+			dest->bank_contents[banks] = src->bank_contents[banks];
+		bank_mask = bank_mask >> 1;
+	}
+
+	return banks;
+}
+
+static int mshv_map_device_interrupt(u64 ptid, union hv_device_id hv_devid,
+				     struct mshv_lapic_irq *ginfo,
+				     struct hv_interrupt_entry *ret_entry,
+				     u64 *ret_status)
+{
+	struct hv_input_map_device_interrupt *irq_input;
+	struct hv_output_map_device_interrupt *irq_output;
+	struct hv_device_interrupt_descriptor *intdesc;
+	struct hv_input_get_vp_set_from_mda *mda_input;
+	union hv_output_get_vp_set_from_mda *mda_output;
+	ulong flags;
+	u64 status;
+	int rc, var_size;
+
+	*ret_status = U64_MAX;
+	local_irq_save(flags);
+
+	mda_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	mda_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+	/*
+	 * Map Device Interrupt hcall needs vp set based on vp indexes used
+	 * during vp creation. Here we have lapic-id of the vp only. Easiest
+	 * is to just ask the hypervisor for the vp set matching the lapic-id.
+	 */
+	rc = hv_vpset_from_hyp_disabled(mda_input, mda_output, ginfo, ptid);
+	if (rc)
+		goto out;	/* error already printed */
+
+	irq_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	irq_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+	memset(irq_input, 0, sizeof(*irq_input));
+
+	irq_input->partition_id = ptid;
+	irq_input->device_id = hv_devid.as_uint64;
+
+	intdesc = &irq_input->interrupt_descriptor;
+	intdesc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
+	intdesc->vector_count = 1;
+	intdesc->target.vector = ginfo->lapic_vector;
+	intdesc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
+
+	intdesc->target.vp_set.valid_bank_mask = 0;
+	intdesc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+	intdesc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+	rc = hv_copy_vpset(&intdesc->target.vp_set, &mda_output->target_vpset);
+	if (rc <= 0) {
+		pr_err("Hyper-V: ptid %lld - (irq)vpset copy failed (%d)\n",
+		       ptid, rc);
+		goto out;
+	}
+
+	/*
+	 * var-sized hcall: var-size starts after vp_mask (thus vp_set.format
+	 * does not count, but vp_set.valid_bank_mask does).
+	 */
+	var_size = rc + 1;
+	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
+				     irq_input, irq_output);
+	*ret_entry = irq_output->interrupt_entry;
+	local_irq_restore(flags);
+
+	rc = 0;
+	if (!hv_result_success(status)) {
+		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+			hv_status_err(status, "pt:%lld vec:%d lapic-id:%lld\n",
+			      ptid, ginfo->lapic_vector, ginfo->lapic_apic_id);
+		*ret_status = status;
+		rc = hv_result_to_errno(status);
+	}
+
+	return rc;
+
+out:
+	local_irq_restore(flags);
+	return rc;
+
+}
+
+static int mshv_unmap_device_interrupt(union hv_device_id hv_devid,
+				       struct hv_interrupt_entry *irq_entry)
+{
+	unsigned long flags;
+	struct hv_input_unmap_device_interrupt *input;
+	u64 status;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
+		input->partition_id = hv_get_current_partid();
+	else
+		input->partition_id = hv_current_partition_id;
+
+	input->device_id = hv_devid.as_uint64;
+	input->interrupt_entry = *irq_entry;
+
+	status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+
+	return hv_result_to_errno(status);
+}
+
+static int mshv_chk_unmap_irq(union hv_device_id hv_devid,
+			      struct irq_data *irqdata)
+{
+	int rc;
+
+	if (irqdata->chip_data == NULL)
+		return 0;
+
+	rc = mshv_unmap_device_interrupt(hv_devid, irqdata->chip_data);
+	if (rc)
+		return rc;
+
+	kfree(irqdata->chip_data);
+	irqdata->chip_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Synchronize device update with VFIO.
+ *    See: vfio_pci_memory_lock_and_enable()
+ */
+static u16 mshv_pci_memory_lock_and_enable(struct vfio_pci_core_device *cdev)
+{
+	u16 cmd;
+
+	down_write(&cdev->memory_lock);
+	pci_read_config_word(cdev->pdev, PCI_COMMAND, &cmd);
+	if (!(cmd & PCI_COMMAND_MEMORY))
+		pci_write_config_word(cdev->pdev, PCI_COMMAND,
+				      cmd | PCI_COMMAND_MEMORY);
+	return cmd;
+}
+
+static void mshv_pci_memory_unlock_and_restore(
+					struct vfio_pci_core_device *cdev,
+					u16 cmd)
+{
+	pci_write_config_word(cdev->pdev, PCI_COMMAND, cmd);
+	up_write(&cdev->memory_lock);
+}
+
+static void mshv_make_device_usable(struct pci_dev *pdev, int vector,
+				    struct hv_interrupt_entry *hv_entry)
+{
+	int lirq;
+	struct msi_msg msimsg;
+	struct irq_data *irqdata, *parent;
+	u16 pcicmd;
+	struct vfio_pci_core_device *coredev = dev_get_drvdata(&pdev->dev);
+
+	if (pdev->dev.driver == NULL ||
+	    strcmp(pdev->dev.driver->name, "vfio-pci") != 0) {
+		pr_err("Hyper-V: irqbypass: non vfio device %s\n",
+		       pci_name(pdev));
+		return;
+	}
+	if (coredev == NULL) {
+		pr_err("Hyper-V: irqbypass: null vfio device for %s\n",
+		       pci_name(pdev));
+		return;
+	}
+
+	if (hv_entry->source != HV_INTERRUPT_SOURCE_MSI) {
+		pr_err("Hyper-V: %s irq source not msi\n", pci_name(pdev));
+		return;
+	}
+
+	lirq = pci_irq_vector(pdev, vector);
+	irqdata = irq_get_irq_data(lirq);
+	if (irqdata == NULL) {
+		pr_err("Hyper-V: null irq_data for write msimsg. lirq:0x%x\n",
+		       lirq);
+		return;
+	}
+
+	msimsg.address_hi = 0;
+	msimsg.address_lo = hv_entry->msi_entry.address.as_uint32;
+	msimsg.data =  hv_entry->msi_entry.data.as_uint32;
+
+	pcicmd = mshv_pci_memory_lock_and_enable(coredev);
+	pci_write_msi_msg(lirq, &msimsg);
+	mshv_pci_memory_unlock_and_restore(coredev, pcicmd);
+
+	pci_msi_unmask_irq(irqdata);
+
+	parent = irqdata->parent_data;
+	if (parent && parent->chip && parent->chip->irq_unmask)
+		irq_chip_unmask_parent(irqdata);
+}
+
+/*
+ * This guest has a device passthru'd to it. VFIO did the initial setup of
+ * the device interrupts, but we left them unmapped in the hypervisor
+ * because we didn't have the guest target cpu and vector (required by
+ * hypervisor). We have them now, so do the map hypercall.
+ * Also, when here, it is expected that the device global mask is unset
+ * but individual MSI/x masks are set. Goal here is to map the interrupt in
+ * the hypervisor, update the corresponding device MSI/x entry, and enable it.
+ */
+static void mshv_pthru_dev_irq_remap(struct mshv_irqfd *irqfd)
+{
+	u64 ptid, status;
+	struct pci_dev *pdev;
+	int rc, deposit_pgs = 16;
+	struct mshv_lapic_irq *ginfo = &irqfd->irqfd_lapic_irq;
+	union hv_device_id hv_devid;
+	struct hv_interrupt_entry *new_entry;
+	struct irq_data *irqdata;
+
+	if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+	    irqfd->irqfd_bypass_prod == NULL)
+		return;
+
+	rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+	if (rc)
+		return;
+
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+
+	rc = mshv_chk_unmap_irq(hv_devid, irqdata);
+	if (rc)
+		return;
+
+	new_entry = kmalloc(sizeof(*new_entry), GFP_ATOMIC);
+	if (new_entry == NULL)
+		return;
+
+	ptid = irqfd->irqfd_partn->pt_id;
+
+	while (deposit_pgs--) {
+		rc = mshv_map_device_interrupt(ptid, hv_devid, ginfo, new_entry,
+					       &status);
+		if (rc == 0)
+			break;
+		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+			break;
+
+		rc = hv_call_deposit_pages(NUMA_NO_NODE, ptid, 1);
+		if (rc)
+			break;
+	}
+	if (rc) {
+		kfree(new_entry);
+		return;
+	}
+
+	irqdata->chip_data = new_entry;
+
+	mshv_make_device_usable(pdev, irqdata->hwirq, new_entry);
+}
+
+static void mshv_pthru_dev_irq_undo(struct mshv_irqfd *irqfd)
+{
+	struct pci_dev *pdev;
+	union hv_device_id hv_devid;
+	struct irq_data *irqdata;
+	int rc;
+
+	if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+	    irqfd->irqfd_bypass_prod == NULL)
+		return;
+
+	rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+	if (rc)
+		return;
+
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+	mshv_chk_unmap_irq(hv_devid, irqdata);
+}
+
+#else /* IS_ENABLED(CONFIG_X86_64) */
+
+static void mshv_pthru_dev_irq_remap(struct mshv_irqfd *irqfd) { }
+static void mshv_pthru_dev_irq_undo(struct mshv_irqfd *irqfd) { }
+
+#endif /* IS_ENABLED(CONFIG_X86_64) */
+
 void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
 				    struct mshv_irq_ack_notifier *mian)
 {
@@ -264,6 +635,7 @@ static void mshv_irqfd_shutdown(struct work_struct *work)
 	/*
 	 * It is now safe to release the object's resources
 	 */
+	irq_bypass_unregister_consumer(&irqfd->irqfd_bypass_cons);
 	eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
 	kfree(irqfd);
 }
@@ -286,6 +658,12 @@ static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
 
 	hlist_del(&irqfd->irqfd_hnode);
 
+	/*
+	 * Cleanup interrupt map (kfree chip_data) while in a VMM thread as
+	 * unmap needs partition id. mshv_irqfd_shutdown() runs in a kthread.
+	 */
+	mshv_pthru_dev_irq_undo(irqfd);
+
 	queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
 }
 
@@ -383,6 +761,45 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
 	add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
 }
 
+static int mshv_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+					struct irq_bypass_producer *prod)
+{
+	struct mshv_irqfd *irqfd;
+
+	irqfd = container_of(cons, struct mshv_irqfd, irqfd_bypass_cons);
+	irqfd->irqfd_bypass_prod = prod;
+
+	mshv_pthru_dev_irq_remap(irqfd);
+
+	return 0;
+}
+
+static void mshv_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+					 struct irq_bypass_producer *prod)
+{
+	struct mshv_irqfd *irqfd;
+
+	irqfd = container_of(cons, struct mshv_irqfd, irqfd_bypass_cons);
+
+	WARN_ON(irqfd->irqfd_bypass_prod != prod);
+	irqfd->irqfd_bypass_prod = NULL;
+
+}
+
+static void mshv_setup_irq_bypass(struct mshv_irqfd *irqfd,
+				  struct eventfd_ctx *eventfd)
+{
+	struct irq_bypass_consumer *consumer = &irqfd->irqfd_bypass_cons;
+	int rc;
+
+	consumer->add_producer = mshv_irq_bypass_add_producer;
+	consumer->del_producer = mshv_irq_bypass_del_producer;
+	rc = irq_bypass_register_consumer(&irqfd->irqfd_bypass_cons, eventfd);
+	if (rc)
+		pr_err("Hyper-V: irq bypass consumer registration failed: %d\n",
+		       rc);
+}
+
 static int mshv_irqfd_assign(struct mshv_partition *pt,
 			     struct mshv_user_irqfd *args)
 {
@@ -509,6 +926,8 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 	if (events & EPOLLIN)
 		mshv_assert_irq_slow(irqfd);
 
+	mshv_setup_irq_bypass(irqfd, eventfd);
+
 	srcu_read_unlock(&pt->pt_irq_srcu, idx);
 	return 0;
 
diff --git a/drivers/iommu/hyperv-iommu-root.c b/drivers/iommu/hyperv-iommu-root.c
index a2e0f6cc78e6..dc270b0a80d9 100644
--- a/drivers/iommu/hyperv-iommu-root.c
+++ b/drivers/iommu/hyperv-iommu-root.c
@@ -217,6 +217,20 @@ u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type)
 }
 EXPORT_SYMBOL_GPL(hv_build_devid_oftype);
 
+/* Build device id for the interrupt path */
+u64 hv_devid_from_pdev(struct pci_dev *pdev)
+{
+	enum hv_device_type dev_type;
+
+	if (hv_pcidev_is_attached_dev(pdev))
+		dev_type = HV_DEVICE_TYPE_LOGICAL;
+	else
+		dev_type = HV_DEVICE_TYPE_PCI;
+
+	return hv_build_devid_oftype(pdev, dev_type);
+}
+EXPORT_SYMBOL_GPL(hv_devid_from_pdev);
+
 /* Create a new device domain in the hypervisor */
 static int hv_iommu_create_hyp_devdom(struct hv_domain *hvdom)
 {
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 50d793ca8f31..702a8005651b 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -1744,6 +1744,16 @@ static void hv_irq_mask(struct irq_data *data)
 
 static void hv_irq_unmask(struct irq_data *data)
 {
+	struct pci_dev *pdev;
+	struct msi_desc *msi_desc;
+
+	msi_desc = irq_data_get_msi_desc(data);
+	pdev = msi_desc_to_pci_dev(msi_desc);
+
+	/* Done during bypass setup in mshv_eventfd.c: mshv_irqfd_assign() */
+	if (hv_pcidev_is_pthru_dev(pdev))
+		return;
+
 	hv_arch_irq_unmask(data);
 
 	if (data->parent_data->chip->irq_unmask)
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 8d5c610da99a..88b3aba6691c 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -332,6 +332,7 @@ u64 hv_get_current_partid(void);
 bool hv_pcidev_is_attached_dev(struct pci_dev *pdev);
 bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev);
 u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type);
+u64 hv_devid_from_pdev(struct pci_dev *pdev);
 #else
 static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
 { return false; }
@@ -340,6 +341,8 @@ static inline bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev)
 static inline u64 hv_build_devid_oftype(struct pci_dev *pdev,
 					enum hv_device_type type)
 { return 0; }
+static inline u64 hv_devid_from_pdev(struct pci_dev *pdev)
+{ return 0; }
 static inline u64 hv_get_current_partid(void)
 { return HV_PARTITION_ID_INVALID; }
 #endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* [PATCH V1 3/3] mshv: Implement guest irq migration for passthru devices
From: Mukesh R @ 2026-05-12  2:12 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch
In-Reply-To: <20260512021242.1679786-1-mrathor@linux.microsoft.com>

Ask the hypervisor to retarget interrupts to the new guest CPU or vector
upon guest IRQ migration. This happens in the irqfd update path.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c | 78 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 1f5c1e9ee9b7..c05201d857fd 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -192,6 +192,77 @@ static int mshv_map_device_interrupt(u64 ptid, union hv_device_id hv_devid,
 
 }
 
+/* NOTE: caller does spin_lock_irq on pt_irqfds_lock, hence no disable here */
+static void mshv_do_guest_irq_retarget(u64 partid, struct mshv_irqfd *irqfd)
+{
+	int rc, var_size;
+	u64 status;
+	union hv_device_id hv_devid;
+	struct hv_input_get_vp_set_from_mda *mda_input;
+	union hv_output_get_vp_set_from_mda *mda_output;
+	struct hv_retarget_device_interrupt *remap_inp;
+	struct pci_dev *pdev;
+	struct irq_data *irqdata;
+	struct mshv_lapic_irq *lapic_irq = &irqfd->irqfd_lapic_irq;
+	struct hv_interrupt_entry *inte = NULL;
+
+	if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+	    irqfd->irqfd_bypass_prod == NULL)
+		return;
+
+	rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+	if (rc)
+		return;
+
+	inte = irqdata->chip_data;
+	if (inte == NULL)
+		return;
+
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+
+
+	mda_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	mda_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+	rc = hv_vpset_from_hyp_disabled(mda_input, mda_output, lapic_irq,
+					partid);
+	if (rc)
+		return;
+
+	remap_inp = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(remap_inp, 0, sizeof(*remap_inp));
+
+	rc = hv_copy_vpset(&remap_inp->int_target.vp_set,
+			   &mda_output->target_vpset);
+	if (rc <= 0) {
+		pr_err("Hyper-V: ptid %lld - vpset copy failed (%d)\n",
+		       partid, rc);
+		return;
+	}
+
+	/*
+	 * var-sized hcall: var-size starts after vp_mask (thus vp_set.format
+	 * does not count, but vp_set.valid_bank_mask does).
+	 */
+	var_size = rc + 1;
+
+	remap_inp->partition_id = partid;
+	remap_inp->device_id = hv_devid.as_uint64;
+	remap_inp->int_target.vector = lapic_irq->lapic_vector;
+	remap_inp->int_target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+
+	remap_inp->int_entry.source = inte->source;
+	remap_inp->int_entry.msi_entry.as_uint64 = inte->msi_entry.as_uint64;
+
+	status = hv_do_rep_hypercall(HVCALL_RETARGET_INTERRUPT, 0, var_size,
+				     remap_inp, NULL);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "pt:%lld vec:%d lapic-id:%lld\n",
+			      partid, lapic_irq->lapic_vector,
+			      lapic_irq->lapic_apic_id);
+}
+
 static int mshv_unmap_device_interrupt(union hv_device_id hv_devid,
 				       struct hv_interrupt_entry *irq_entry)
 {
@@ -729,9 +800,12 @@ static void mshv_irqfd_update(struct mshv_partition *pt,
 			      struct mshv_irqfd *irqfd)
 {
 	write_seqcount_begin(&irqfd->irqfd_irqe_sc);
-	irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
-						    irqfd->irqfd_irqnum);
+	irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt, irqfd->irqfd_irqnum);
 	mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
+
+#if IS_ENABLED(CONFIG_X86_64)
+	mshv_do_guest_irq_retarget(pt->pt_id, irqfd);
+#endif
 	write_seqcount_end(&irqfd->irqfd_irqe_sc);
 }
 
-- 
2.51.2.vfs.0.1


^ permalink raw reply related

* Re: [PATCH net-next v7 1/6] net: mana: Create separate EQs for each vPort
From: Jakub Kicinski @ 2026-05-12  2:13 UTC (permalink / raw)
  To: longli
  Cc: Jakub Kicinski, kotaranov, davem, pabeni, edumazet, andrew+netdev,
	jgg, leon, haiyangz, kys, wei.liu, decui, shradhagupta, horms,
	netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-2-longli@microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
net: mana: Create separate EQs for each vPort

This change removes EQ sharing across vPorts and moves EQ ownership
from struct mana_context to struct mana_port_context. It exports
mana_create_eq() and mana_destroy_eq() so the MANA RDMA driver can
create EQs when a raw QP configures the vport, and also rearranges
mana_alloc_queues()/mana_dealloc_queues() to own the netdev-side
EQ lifecycle.

A few questions on the resulting ownership model below.

> diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
> index ac5e75dd3494..8000ab6e8beb 100644
> --- a/drivers/infiniband/hw/mana/main.c
> +++ b/drivers/infiniband/hw/mana/main.c
> @@ -20,8 +20,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
>  	pd->vport_use_count--;
>  	WARN_ON(pd->vport_use_count < 0);
>  
> -	if (!pd->vport_use_count)
> +	if (!pd->vport_use_count) {
> +		mana_destroy_eq(mpc);
>  		mana_uncfg_vport(mpc);
> +	}
>  
>  	mutex_unlock(&pd->vport_mutex);
>  }

[High]
After this change, mpc->eqs has two distinct owners: the RDMA path
frees it here under pd->vport_mutex, and the netdev path frees it under
rtnl_lock via mana_dealloc_queues() -> mana_destroy_eq(). Neither lock
is held by the readers in qp.c. See the comment on the RSS QP path
below.

> diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
> index 645581359cee..6f1043383e8c 100644
> --- a/drivers/infiniband/hw/mana/qp.c
> +++ b/drivers/infiniband/hw/mana/qp.c
> @@ -168,7 +168,15 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
>  		cq_spec.gdma_region = cq->queue.gdma_region;
>  		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
>  		cq_spec.modr_ctx_id = 0;
> -		eq = &mpc->ac->eqs[cq->comp_vector];
> +		/* EQs are created when a raw QP configures the vport.
> +		 * A raw QP must be created before creating rwq_ind_tbl.
> +		 */
> +		if (!mpc->eqs) {
> +			ret = -EINVAL;
> +			i--;
> +			goto fail;
> +		}
> +		eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];
>  		cq_spec.attached_eq = eq->eq->id;

[High]
Is the NULL check on mpc->eqs safe here without any synchronization?

Before this patch, ac->eqs was allocated in mana_probe() and freed in
mana_remove(), so its lifetime spanned the whole driver. With this
patch, apc->eqs can be freed concurrently by either owner:

    mana_ib_uncfg_vport()                 (pd->vport_mutex)
      mana_destroy_eq(mpc)
        kfree(apc->eqs);
        apc->eqs = NULL;

    mana_dealloc_queues()                 (rtnl_lock, ndo_close)
      mana_destroy_eq(apc)
        kfree(apc->eqs);
        apc->eqs = NULL;

This path holds neither lock. Can the 'if (!mpc->eqs)' observation
survive through the subsequent '&mpc->eqs[...]' and 'eq->eq->id'
dereference, or can a concurrent destroyer free apc->eqs between the
check and the deref and turn this into a read of freed memory?

There is also an initialization window in the writer:

    apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
    ...
    for (i = 0; i < apc->num_queues; i++) {
        ...
        err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);

apc->eqs is published before apc->eqs[i].eq is filled in. Can a
concurrent reader here see apc->eqs non-NULL but apc->eqs[i].eq still
NULL, and fault on eq->eq->id?

[Medium]
mana_ib advertises ib_dev.num_comp_vectors as gc->max_num_queues in
mana_ib_probe(), but this patch sizes the per-port pool as
apc->num_queues, which defaults to min(gc->max_num_queues,
MANA_DEF_NUM_QUEUES) and can be reduced further via ethtool -L.

    eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];

When gc->max_num_queues > apc->num_queues, does this modulo silently
collapse distinct advertised completion vectors onto shared EQs, so
CQs created by userspace with different comp_vector values end up on
the same notification stream? Would it be better to cap
ib_dev.num_comp_vectors to apc->num_queues, or reject comp_vector >=
apc->num_queues, instead of wrapping?

[Medium]
The added comment codifies a new ordering constraint:

    /* EQs are created when a raw QP configures the vport.
     * A raw QP must be created before creating rwq_ind_tbl.
     */

Before this patch, ac->eqs existed from mana_probe() onward, so
mana_ib_create_qp_rss() succeeded whether or not the netdev was up and
whether or not any raw QP had been created on the port. After this
patch, if the netdev is administratively down and no raw QP has
configured the vport, mpc->eqs is NULL and this path returns -EINVAL.

Is that ABI-visible change to RSS QP creation intentional, and worth
calling out in the commit message?

The comment is also slightly incomplete: EQs are created on netdev open
via mana_alloc_queues() -> mana_create_eq() as well, not only when a
raw QP configures the vport. Could the wording be adjusted?

>  
>  		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,

> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
[ ... ]
> @@ -3343,6 +3367,9 @@ static int mana_dealloc_queues(struct net_device *ndev)
>  	mana_fence_rqs(apc);
>  
>  	/* Even in err case, still need to cleanup the vPort */
> +	mana_destroy_rxqs(apc);
> +	mana_destroy_txq(apc);
> +	mana_destroy_eq(apc);
>  	mana_destroy_vport(apc);
>  
>  	return 0;
-- 
pw-bot: cr

^ permalink raw reply

* Re: [PATCH net-next v7 2/6] net: mana: Query device capabilities and configure MSI-X sharing for EQs
From: Jakub Kicinski @ 2026-05-12  2:14 UTC (permalink / raw)
  To: longli
  Cc: Jakub Kicinski, kotaranov, davem, pabeni, edumazet, andrew+netdev,
	jgg, leon, haiyangz, kys, wei.liu, decui, shradhagupta, horms,
	netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-3-longli@microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
net: mana: Query device capabilities and configure MSI-X sharing for EQs

This change queries device capabilities earlier during GDMA setup,
renames mana_query_device_cfg() to mana_gd_query_device_cfg(), and adds
a new gc->msi_sharing flag plus a gc->msi_bitmap to manage dedicated
vs. shared MSI-X vectors for EQs across vPorts.

A few questions below, mostly about msi_sharing state across resume and
about some wording in the commit message and in the code.

> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index f3316e929175..3aa96329f359 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -149,6 +149,9 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
>  	struct gdma_context *gc = pci_get_drvdata(pdev);
>  	struct gdma_query_max_resources_resp resp = {};
>  	struct gdma_general_req req = {};
> +	unsigned int max_num_queues;
> +	u8 bm_hostmode;
> +	u16 num_ports;
>  	int err;
>  
>  	mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
> @@ -197,6 +200,40 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
>  	if (gc->max_num_queues == 0)
>  		return -ENOSPC;
>  
> +	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
> +				       MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
> +	if (err)
> +		return err;
> +
> +	if (!num_ports)
> +		return -EINVAL;

[Low]
This is the first of two back-to-back mana_gd_query_device_cfg() calls
per probe cycle. The second one happens in mana_probe() (further down
in this same patch) and also fetches num_ports and bm_hostmode from
firmware. The first caller here discards bm_hostmode; the second one
uses both.

Would it be reasonable to cache num_ports and bm_hostmode on
struct gdma_context during the first call and have mana_probe() read
them from there? That would avoid a second HWC round-trip, which has to
return the same values anyway for the per-vPort math in
mana_gd_query_max_resources() to match ac->num_ports in mana_probe().

> +
> +	/*
> +	 * Adjust gc->max_num_queues returned from the SOC to allow dedicated
> +	 * MSIx for each vPort. Clamp to no less than MANA_DEF_NUM_QUEUES.
> +	 */
> +	max_num_queues = (gc->num_msix_usable - 1) / num_ports;
> +	max_num_queues = rounddown_pow_of_two(max(max_num_queues, 1U));
> +	if (max_num_queues < MANA_DEF_NUM_QUEUES)
> +		max_num_queues = MANA_DEF_NUM_QUEUES;
> +
> +	/*
> +	 * Use dedicated MSIx for EQs whenever possible, use MSIx sharing for
> +	 * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1)
> +	 */
> +	max_num_queues = min(gc->max_num_queues, max_num_queues);

[Low]
The commit message says "The number of queues per vPort is clamped to
no less than MANA_DEF_NUM_QUEUES". After the clamp-up above, this
min() with gc->max_num_queues can bring the result back below
MANA_DEF_NUM_QUEUES whenever gc->max_num_queues is below
MANA_DEF_NUM_QUEUES (which is 16).

gc->max_num_queues was previously clamped by num_online_cpus(),
resp.max_eq/cq/sq/rq, and gc->num_msix_usable - 1, so a VM with
fewer than 16 online CPUs will end up with gc->max_num_queues_vport
below MANA_DEF_NUM_QUEUES in the non-sharing branch.

Could the commit message be reworded to describe the actual behaviour
(the per-vPort count is clamped towards MANA_DEF_NUM_QUEUES but never
exceeds the hardware maximum)?

> +	if (max_num_queues * num_ports > gc->num_msix_usable - 1)
> +		gc->msi_sharing = true;

[Medium]
Is gc->msi_sharing ever cleared?

Walking the write sites for this flag in the series: it is set to true
unconditionally in the non-dyn branch of mana_gd_setup_hwc_irqs() (see
the hunk below), and conditionally true here. I could not find any
path that writes false, and neither mana_gd_remove_irqs() nor
mana_gd_cleanup() reset it.

The same gdma_context allocated in mana_gd_probe() survives a suspend
/ resume cycle via mana_gd_suspend() -> mana_gd_cleanup() ->
mana_gd_resume() -> mana_gd_setup(). In the dynamic MSI-X path
(pci_msix_can_alloc_dyn() true, i.e. the PF), gc->num_msix_usable is
recomputed as min(resp.max_msix, num_online_cpus() + 1), so it can
legitimately grow across suspend/resume if CPUs came online while the
guest was suspended.

If the first probe set msi_sharing=true because MSI-X was tight, but
on resume there are enough vectors for dedicated allocation, does the
flag stay stuck at true? If so:

  - gc->max_num_queues_vport is computed via the sharing branch
    min(gc->num_msix_usable - 1, gc->max_num_queues) rather than the
    dedicated max_num_queues value;
  - the if (!gc->msi_sharing) branch at the end of mana_gd_setup()
    is skipped and gc->msi_bitmap stays NULL;
  - later patches in this series that consult !gc->msi_sharing as the
    "use bitmap" predicate keep taking the shared path.

Would it be appropriate to reset gc->msi_sharing = false at the top of
mana_gd_query_max_resources() (and/or in mana_gd_setup_hwc_irqs()'s
other branch) before the conditions that may set it to true are
evaluated? This would also line up with the commit message's claim
that "MSI-X sharing among vPorts is disabled by default and is only
enabled when there are not enough MSI-X vectors for dedicated
allocation."

> +
> +	/* If MSI is shared, use max allowed value */
> +	if (gc->msi_sharing)
> +		gc->max_num_queues_vport = min(gc->num_msix_usable - 1, gc->max_num_queues);
> +	else
> +		gc->max_num_queues_vport = max_num_queues;
> +
> +	dev_info(gc->dev, "MSI sharing mode %d max queues %d\n",
> +		 gc->msi_sharing, gc->max_num_queues);

[Low, Low]
Two small wording questions on this block:

The comment above reads "Adjust gc->max_num_queues returned from the
SOC to allow dedicated MSIx for each vPort", but the code never
modifies gc->max_num_queues; it updates a local and assigns to
gc->max_num_queues_vport. Should the comment say
gc->max_num_queues_vport instead?

For the dev_info, should the printed field be gc->max_num_queues_vport
rather than gc->max_num_queues? As written, the log always shows the
hardware maximum that was already decided earlier in the function,
not the per-vPort value that the preceding logic just chose, so the
number never reflects the effect of toggling msi_sharing.

> +
>  	return 0;
>  }
>  
> @@ -1859,6 +1896,7 @@ static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev)
>  		/* Need 1 interrupt for HWC */
>  		max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1;
>  		min_irqs = 2;
> +		gc->msi_sharing = true;
>  	}
>  

[ ... ]

^ permalink raw reply

* Re: [PATCH net-next v7 4/6] net: mana: Use GIC functions to allocate global EQs
From: Jakub Kicinski @ 2026-05-12  2:14 UTC (permalink / raw)
  To: longli
  Cc: Jakub Kicinski, kotaranov, davem, pabeni, edumazet, andrew+netdev,
	jgg, leon, haiyangz, kys, wei.liu, decui, shradhagupta, horms,
	netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-5-longli@microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
net: mana: Use GIC functions to allocate global EQs

This patch replaces the inline GDMA global interrupt setup in
mana_gd_setup_dyn_irqs(), mana_gd_setup_irqs(), and mana_gd_remove_irqs()
with calls to the new mana_gd_get_gic()/mana_gd_put_gic() helpers
introduced in the preceding patch.

> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 5930ab817056..850afdbd9c4e 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -1888,30 +1888,13 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
>  	 * further used in irq_setup()
>  	 */
>  	for (i = 1; i <= nvec; i++) {
> -		gic = kzalloc_obj(*gic);
> +		gic = mana_gd_get_gic(gc, false, &i);
>  		if (!gic) {
>  			err = -ENOMEM;
>  			goto free_irq;
>  		}
> -		gic->handler = mana_gd_process_eq_events;
> -		INIT_LIST_HEAD(&gic->eq_list);
> -		spin_lock_init(&gic->lock);
> -
> -		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
> -			 i - 1, pci_name(pdev));

[Low]
Before this patch the per-queue interrupts in mana_gd_setup_dyn_irqs()
and mana_gd_setup_irqs() were named as:

    "mana_q%d@pci:%s", i - 1, pci_name(pdev)

so they appeared as mana_q0, mana_q1, ... in /proc/interrupts.

After this patch, allocation goes through mana_gd_get_gic(), which
names them using gic->msi (equal to i, not i - 1):

    snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_msi%d@pci:%s",
             gic->msi, pci_name(dev));

so the same vectors now appear as mana_msi1, mana_msi2, ...  The
mana_hwc@pci:%s name is preserved, but every per-queue vector is both
renamed (mana_q -> mana_msi) and renumbered (0-based -> 1-based).

Is this rename intentional?  The commit message describes the change
as replacing "the GDMA global interrupt setup code with the new GIC
allocation and release functions", which reads as a pure refactor.  If
the rename is intended, could it be mentioned in the changelog so that
operators relying on the mana_q prefix in /proc/interrupts are not
surprised?

> -
> -		/* one pci vector is already allocated for HWC */
> -		irqs[i - 1] = pci_irq_vector(pdev, i);
> -		if (irqs[i - 1] < 0) {
> -			err = irqs[i - 1];
> -			goto free_current_gic;
> -		}
> -
> -		err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic);
> -		if (err)
> -			goto free_current_gic;
>  
> -		xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL);
> +		irqs[i - 1] = gic->irq;
>  	}
>  
>  	/*
> @@ -1933,19 +1916,11 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
>  	kfree(irqs);
>  	return 0;
>  
> -free_current_gic:
> -	kfree(gic);
>  free_irq:
>  	for (i -= 1; i > 0; i--) {
>  		irq = pci_irq_vector(pdev, i);
> -		gic = xa_load(&gc->irq_contexts, i);
> -		if (WARN_ON(!gic))
> -			continue;
> -
>  		irq_update_affinity_hint(irq, NULL);
> -		free_irq(irq, gic);
> -		xa_erase(&gc->irq_contexts, i);
> -		kfree(gic);
> +		mana_gd_put_gic(gc, false, i);
>  	}

[Low]
Is the irq_update_affinity_hint(irq, NULL) call here still needed?

Looking at mana_gd_put_gic():

    if (!refcount_dec_and_test(&gic->refcount))
            goto out;

    irq = gic->irq;

    irq_update_affinity_hint(irq, NULL);
    free_irq(irq, gic);

When the last reference is dropped (which is the case for every caller in
this patch, since the refcount is 1 here), the helper already clears the
affinity hint before calling free_irq().  The explicit call in the caller
duplicates that work.

The duplication is harmless today because irq_update_affinity_hint() is
idempotent, but could the clearing of the affinity hint be left entirely
to mana_gd_put_gic()?  Otherwise, if these helpers are ever reused in a
context where the refcount is greater than one, the caller will clear an
affinity hint that the helper would correctly have left intact.

> @@ -1966,34 +1941,13 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
>  	start_irqs = irqs;
>  
>  	for (i = 0; i < nvec; i++) {
> -		gic = kzalloc_obj(*gic);
> +		gic = mana_gd_get_gic(gc, false, &i);
>  		if (!gic) {
>  			err = -ENOMEM;
>  			goto free_irq;
>  		}
>  
> -		gic->handler = mana_gd_process_eq_events;
> -		INIT_LIST_HEAD(&gic->eq_list);
> -		spin_lock_init(&gic->lock);
> -
> -		if (!i)
> -			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
> -				 pci_name(pdev));
> -		else
> -			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
> -				 i - 1, pci_name(pdev));

[Low]
The same rename applies here for the non-HWC vectors (mana_q%d with
i - 1 becomes mana_msi%d with i inside mana_gd_get_gic()).  Could the
commit message note this user-visible change in /proc/interrupts?

[ ... ]

> @@ -2022,19 +1976,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
>  	kfree(start_irqs);
>  	return 0;
>  
> -free_current_gic:
> -	kfree(gic);
>  free_irq:
>  	for (i -= 1; i >= 0; i--) {
>  		irq = pci_irq_vector(pdev, i);
> -		gic = xa_load(&gc->irq_contexts, i);
> -		if (WARN_ON(!gic))
> -			continue;
> -
>  		irq_update_affinity_hint(irq, NULL);
> -		free_irq(irq, gic);
> -		xa_erase(&gc->irq_contexts, i);
> -		kfree(gic);
> +		mana_gd_put_gic(gc, false, i);
>  	}

[Low]
Same observation as in mana_gd_setup_dyn_irqs(): the explicit
irq_update_affinity_hint(irq, NULL) duplicates what
mana_gd_put_gic() already does when it drops the last reference.  Could
this call be dropped from the caller?

> @@ -2109,26 +2055,20 @@ static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev)
>  static void mana_gd_remove_irqs(struct pci_dev *pdev)
>  {
>  	struct gdma_context *gc = pci_get_drvdata(pdev);
> -	struct gdma_irq_context *gic;
>  	int irq, i;
>  
>  	if (gc->max_num_msix < 1)
>  		return;
>  
>  	for (i = 0; i < gc->max_num_msix; i++) {
> -		irq = pci_irq_vector(pdev, i);
> -		if (irq < 0)
> -			continue;
> -
> -		gic = xa_load(&gc->irq_contexts, i);
> -		if (WARN_ON(!gic))
> +		if (!xa_load(&gc->irq_contexts, i))
>  			continue;
>  
>  		/* Need to clear the hint before free_irq */
> +		irq = pci_irq_vector(pdev, i);
>  		irq_update_affinity_hint(irq, NULL);
> -		free_irq(irq, gic);
> -		xa_erase(&gc->irq_contexts, i);
> -		kfree(gic);
> +
> +		mana_gd_put_gic(gc, false, i);
>  	}

[Low]
Same pattern in mana_gd_remove_irqs(): the comment says "Need to clear
the hint before free_irq", but the actual free_irq() now happens inside
mana_gd_put_gic(), which itself calls irq_update_affinity_hint(irq, NULL)
right before free_irq().  Could the explicit call (and comment) be
removed so that the affinity-hint clearing lives in one place?

^ permalink raw reply

* Re: [PATCH net-next v7 5/6] net: mana: Allocate interrupt context for each EQ when creating vPort
From: Jakub Kicinski @ 2026-05-12  2:14 UTC (permalink / raw)
  To: longli
  Cc: Jakub Kicinski, kotaranov, davem, pabeni, edumazet, andrew+netdev,
	jgg, leon, haiyangz, kys, wei.liu, decui, shradhagupta, horms,
	netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-6-longli@microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
Subject: net: mana: Allocate interrupt context for each EQ when creating vPort

This commit switches the vPort EQ creation path to acquire a GIC (either
dedicated or shared) for each EQ via mana_gd_get_gic()/mana_gd_put_gic(),
adds a new irq field to struct gdma_queue's EQ union and a debugfs entry
for it, and moves the msix_index invalidation out of mana_gd_deregister_irq()
into the mana_gd_create_eq() error path.

[Low]
The changelog is a single sentence, but the patch makes several coupled
changes beyond acquiring a GIC per EQ:

  - it adds a new per-EQ irq field on the shared struct gdma_queue,
  - it adds a new debugfs entry for that field,
  - it changes an invariant in gdma_main.c (see the comment on
    mana_gd_deregister_irq() below),
  - it splits the GIC lifecycle across two source files — mana_en.c now
    owns get_gic/put_gic around mana_gd_create_mana_eq(), while
    mana_gd_register_irq() is still called internally from the create path.

Could the commit message describe the resulting ownership contract
(caller holds a GIC reference across create/destroy, register_irq does
not) and justify the scope of the new field?

> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 850afdbd9c4e..54127c385240 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -854,7 +854,6 @@ static void mana_gd_deregister_irq(struct gdma_queue *queue)
>  	}
>  	spin_unlock_irqrestore(&gic->lock, flags);
>  
> -	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
>  	synchronize_rcu();
>  }

[Low]
Removing the msix_index reset from mana_gd_deregister_irq() and relocating
it to the mana_gd_create_eq() error path is the prerequisite that lets the
new mana_destroy_eq() read eq->eq.msix_index before calling
mana_gd_destroy_queue():

    msi = eq->eq.msix_index;
    mana_gd_destroy_queue(gc, eq);
    mana_gd_put_gic(gc, !gc->msi_sharing, msi);

Could the changelog mention this intentional invariant change and the
ownership contract it implies?  Any out-of-tree caller that relied on
mana_gd_deregister_irq() to invalidate msix_index will now silently observe
stale values after a normal destroy.

> @@ -969,6 +968,7 @@ static int mana_gd_create_eq(struct gdma_dev *gd,
>  out:
>  	dev_err(dev, "Failed to create EQ: %d\n", err);
>  	mana_gd_destroy_eq(gc, false, queue);
> +	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
>  	return err;
>  }
>  
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 2f106d6f5be4..b3684fa3eb4a 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c

[ ... ]

> @@ -1664,6 +1668,8 @@ int mana_create_eq(struct mana_port_context *apc)
>  	struct gdma_queue_spec spec = {};
>  	int err;
>  	int i;
> +	int msi;
> +	struct gdma_irq_context *gic;
>  
>  	WARN_ON(apc->eqs);
>  	apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
> @@ -1681,12 +1687,24 @@ int mana_create_eq(struct mana_port_context *apc)
>  						    apc->mana_port_debugfs);
>  
>  	for (i = 0; i < apc->num_queues; i++) {
> -		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
> +		msi = (i + 1) % gc->num_msix_usable;
> +		if (gc->msi_sharing)
> +			spec.eq.msix_index = msi;
> +
> +		gic = mana_gd_get_gic(gc, !gc->msi_sharing, &msi);
> +		if (!gic) {
> +			err = -ENOMEM;
> +			goto out;
> +		}
> +		spec.eq.msix_index = msi;

[Low]
Is the conditional write to spec.eq.msix_index dead?

    if (gc->msi_sharing)
        spec.eq.msix_index = msi;

    gic = mana_gd_get_gic(gc, !gc->msi_sharing, &msi);
    ...
    spec.eq.msix_index = msi;

The unconditional assignment after mana_gd_get_gic() always overwrites the
conditional one.  In the non-sharing case, mana_gd_get_gic() also reassigns
*msi_requested via find_first_zero_bit(), so the pre-computed
(i + 1) % gc->num_msix_usable is only consumed in the sharing branch.

Would splitting the sharing and non-sharing paths, or dropping the
conditional write entirely, make the intent clearer and avoid the
inverted-boolean fallthrough (!gc->msi_sharing)?

> +
>  		err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
>  		if (err) {
>  			dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
> +			mana_gd_put_gic(gc, !gc->msi_sharing, msi);
>  			goto out;
>  		}
> +		apc->eqs[i].eq->eq.irq = gic->irq;
>  		mana_create_eq_debugfs(apc, i);
>  	}

[Low]
The new struct gdma_queue.eq.irq field is only assigned here on the vPort
path:

    apc->eqs[i].eq->eq.irq = gic->irq;

The other in-tree caller of mana_gd_create_mana_eq(), mana_ib_create_eqs()
in drivers/infiniband/hw/mana/main.c, creates EQs using the same struct
but never assigns eq.irq, leaving it zero.  No debugfs is wired up for ib
EQs today, so the zero is not user-visible, but any future consumer that
reads this field on an ib EQ will see 0 rather than the real IRQ number.

Should mana_gd_create_mana_eq() (or its callers) populate eq.irq
uniformly so the field has one well-defined meaning across all EQs?

> diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
> index fbe3c1427b45..6c138cc77407 100644
> --- a/include/net/mana/gdma.h
> +++ b/include/net/mana/gdma.h
> @@ -342,6 +342,7 @@ struct gdma_queue {
>  			void *context;
>  
>  			unsigned int msix_index;
> +			unsigned int irq;
>  
>  			u32 log2_throttle_limit;
>  		} eq;

^ permalink raw reply

* Re: [PATCH net-next v7 6/6] RDMA/mana_ib: Allocate interrupt contexts on EQs
From: Jakub Kicinski @ 2026-05-12  2:14 UTC (permalink / raw)
  To: longli
  Cc: Jakub Kicinski, kotaranov, davem, pabeni, edumazet, andrew+netdev,
	jgg, leon, haiyangz, kys, wei.liu, decui, shradhagupta, horms,
	netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-7-longli@microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
Subject: RDMA/mana_ib: Allocate interrupt contexts on EQs

This patch converts RDMA/mana_ib EQ creation to use the shared GIC
allocator (mana_gd_get_gic/mana_gd_put_gic), allowing the RDMA EQs
to share interrupt contexts with the Ethernet driver's EQs. It also
adds a new driver capability bit and sets it in the advertised flags.

> diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
> index 8000ab6e8beb..0dc47540fb6f 100644
> --- a/drivers/infiniband/hw/mana/main.c
> +++ b/drivers/infiniband/hw/mana/main.c
> @@ -749,7 +749,8 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
>  {
>  	struct gdma_context *gc = mdev_to_gc(mdev);
>  	struct gdma_queue_spec spec = {};
> -	int err, i;
> +	struct gdma_irq_context *gic;
> +	int err, i, msi;
>  
>  	spec.type = GDMA_EQ;
>  	spec.monitor_avl_buf = false;
> @@ -757,11 +758,18 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
>  	spec.eq.callback = mana_ib_event_handler;
>  	spec.eq.context = mdev;
>  	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
> -	spec.eq.msix_index = 0;
> +
> +	msi = 0;
> +	gic = mana_gd_get_gic(gc, false, &msi);

[Low]
The commit message states "The driver now supports allocating dedicated
MSI-X for each EQ", and the new capability bit added below is named
GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT. Does that description
match what this code does?

The calls here pass use_msi_bitmap=false, which appears to share an
existing gic slot (incrementing its refcount) rather than allocating a
fresh dedicated MSI-X from the bitmap:

	msi = 0;
	gic = mana_gd_get_gic(gc, false, &msi);
	...
	msi = (i + 1) % gc->num_msix_usable;
	gic = mana_gd_get_gic(gc, false, &msi);

The commit body also says "These interrupt contexts may be shared with
Ethernet EQs when MSI-X vectors are limited", which describes
sharing-with-fallback, while the flag name says "UNSHARE". Could the
changelog and flag naming be reconciled so the semantics are clear (for
example, whether the bit advertises "driver can handle unshared EQs" vs
"driver will always be unshared")?

Should the flag also be set only when the driver actually ends up
dedicated rather than advertised unconditionally?

[ ... ]

> diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
> index 6c138cc77407..d84e474309a3 100644
> --- a/include/net/mana/gdma.h
> +++ b/include/net/mana/gdma.h
> @@ -615,6 +615,7 @@ enum {
>  #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3)
>  #define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4)
>  #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5)
> +#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
>  
>  /* Driver can handle holes (zeros) in the device list */
>  #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
> @@ -631,7 +632,8 @@ enum {
>  /* Driver detects stalled send queues and recovers them */
>  #define GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY BIT(18)
>  
> -#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
> +/* Driver supports separate EQ/MSIs for each vPort */
> +#define GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT BIT(19)
>  
>  /* Driver supports linearizing the skb when num_sge exceeds hardware limit */
>  #define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20)

[Low]
The subject line scopes this commit to RDMA/mana_ib, but this hunk
changes the shared header include/net/mana/gdma.h in two ways that
affect the Ethernet driver and firmware interface:

- it introduces the new GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT
  BIT(19) advertised to firmware, and
- it silently moves GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6) from
  after BIT(18) back to its numeric slot after BIT(5).

Neither change is mentioned in the changelog. Would it be clearer to
split the capability-bit addition into its own patch (or use a subject
not scoped to mana_ib), and to note the BIT(6) relocation as a separate
cleanup so bisecting firmware/Ethernet behavior changes is easier?

> @@ -659,7 +661,8 @@ enum {
>  	 GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \
>  	 GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \
>  	 GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \
> -	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY)
> +	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY | \
> +	 GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT)
>  
>  #define GDMA_DRV_CAP_FLAGS2 0

^ permalink raw reply

* Re: [PATCH net-next v7 1/6] net: mana: Create separate EQs for each vPort
From: Jakub Kicinski @ 2026-05-12  2:14 UTC (permalink / raw)
  To: longli
  Cc: Jakub Kicinski, kotaranov, davem, pabeni, edumazet, andrew+netdev,
	jgg, leon, haiyangz, kys, wei.liu, decui, shradhagupta, horms,
	netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-2-longli@microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
Subject: net: mana: Create separate EQs for each vPort

This commit prepares for assigning vPorts to dedicated MSI-X vectors by
removing EQ sharing among the vPorts and creating dedicated EQs for each
vPort. It moves the EQ definition from struct mana_context to struct
mana_port_context and exports related functions for the RDMA driver.

> diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
> index ac5e75dd3494d..8000ab6e8beb9 100644
> --- a/drivers/infiniband/hw/mana/main.c
> +++ b/drivers/infiniband/hw/mana/main.c
> @@ -55,15 +57,22 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
>  		return err;
>  	}
>  
> -	mutex_unlock(&pd->vport_mutex);
>  
>  	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
>  	pd->tx_vp_offset = mpc->tx_vp_offset;
> +	err = mana_create_eq(mpc);
> +	if (err) {
> +		mana_uncfg_vport(mpc);
> +		pd->vport_use_count--;
> +	}
>  
> -	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
> -		  mpc->port_handle, pd->pdn, doorbell_id);
> +	mutex_unlock(&pd->vport_mutex);

How does this handle a Protection Domain shared across multiple physical
ports?

Looking at the earlier part of mana_ib_cfg_vport(), if a PD is already
in use, the code increments pd->vport_use_count and returns early:

	pd->vport_use_count++;
	if (pd->vport_use_count > 1) {
		ibdev_dbg(&dev->ib_dev,
			  "Skip as this PD is already configured vport\n");
		mutex_unlock(&pd->vport_mutex);
		return 0;
	}

If a second port is configured with the same PD, it skips calling
mana_create_eq() for the new port's mpc. Will this cause mpc->eqs to
remain NULL for the second port and fail subsequent queue creations?

[ ... ]

> diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
> index 0fbcf449c134b..6167742df9d29 100644
> --- a/drivers/infiniband/hw/mana/qp.c
> +++ b/drivers/infiniband/hw/mana/qp.c
> @@ -171,7 +171,15 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
>  		cq_spec.gdma_region = cq->queue.gdma_region;
>  		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
>  		cq_spec.modr_ctx_id = 0;
> -		eq = &mpc->ac->eqs[cq->comp_vector];
> +		/* EQs are created when a raw QP configures the vport.
> +		 * A raw QP must be created before creating rwq_ind_tbl.
> +		 */
> +		if (!mpc->eqs) {
> +			ret = -EINVAL;
> +			i--;
> +			goto fail;
> +		}

Does this explicit i-- cause a resource leak in the error path?

The fail block cleans up previously created WQ objects using a post-decrement
while loop:

fail:
	while (i-- > 0) {
		ibwq = ind_tbl->ind_tbl[i];
		ibcq = ibwq->cq;
		...
		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
	}

If we manually decrement i before jumping to the fail block, the while loop
will decrement it again prior to evaluation, completely skipping the cleanup
of the WQ object that was just successfully created at the original index i.

> +		eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];

What guarantees the lifetime of the mpc->eqs array for active RSS QPs?

Since mana_ib_create_qp_rss() doesn't take a reference count or hold a lock
that prevents mana_destroy_eq() from running, if the Ethernet interface is
brought down or the RAW QP is destroyed, could mana_destroy_eq() free the
array while RSS QPs are still actively using it?

Additionally, can this read out of bounds if an ethtool channel update
changes mpc->num_queues while the interface is down?

If a RAW QP is created while the interface is down, mpc->eqs is sized to the
old channel count. If the channel count is then increased, mpc->num_queues
updates but the array isn't reallocated because the interface is down and
queue deallocation was skipped. This could lead to accessing past the end of
the allocation both here and in mana_destroy_eq().

[ ... ]

> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index b2faa7cf398f3..cef86c9310543 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
[ ... ]
> -static void mana_create_eq_debugfs(struct mana_context *ac, int i)
> +static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
>  {
> -	struct mana_eq eq = ac->eqs[i];
> +	struct mana_eq eq = apc->eqs[i];
>  	char eqnum[32];
>  
>  	sprintf(eqnum, "eq%d", i);
> -	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs);
> +	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
>  }

Does this update the actual debugfs pointer in the array?

By using struct mana_eq eq = apc->eqs[i];, it creates a local by-value copy
on the stack. The debugfs directory is created, but the pointer in the actual
apc->eqs array remains NULL. Would it be better to use a pointer reference
like struct mana_eq *eq = &apc->eqs[i]; here?

[ ... ]

> -static int mana_create_eq(struct mana_context *ac)
> +int mana_create_eq(struct mana_port_context *apc)
>  {
> -	struct gdma_dev *gd = ac->gdma_dev;
> +	struct gdma_dev *gd = apc->ac->gdma_dev;
>  	struct gdma_context *gc = gd->gdma_context;
>  	struct gdma_queue_spec spec = {};
>  	int err;
>  	int i;
>  
> -	ac->eqs = kzalloc_objs(struct mana_eq, gc->max_num_queues);
> -	if (!ac->eqs)
> +	WARN_ON(apc->eqs);
> +	apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
> +	if (!apc->eqs)
>  		return -ENOMEM;
>  
>  	spec.type = GDMA_EQ;
>  	spec.monitor_avl_buf = false;
>  	spec.queue_size = EQ_SIZE;
>  	spec.eq.callback = NULL;
> -	spec.eq.context = ac->eqs;
> +	spec.eq.context = apc->eqs;
>  	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
>  
> -	ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs);
> +	apc->mana_eqs_debugfs = debugfs_create_dir("EQs",
> +						    apc->mana_port_debugfs);
>  
> -	for (i = 0; i < gc->max_num_queues; i++) {
> +	for (i = 0; i < apc->num_queues; i++) {
>  		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
> -		err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq);
> +		err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);

Could there be a race condition between mana_create_eq() and
mana_ib_create_qp_rss() here?

apc->eqs is assigned immediately after allocation, but before the individual
eq structures are populated by mana_gd_create_mana_eq() in this loop.

If an RDMA application concurrently calls mana_ib_create_qp_rss(), it checks
if (!mpc->eqs). That check will pass because the array is allocated, but
accessing eq->eq->id could result in a NULL pointer dereference because the
element has not been fully initialized yet.

^ permalink raw reply

* Re: [PATCH net-next v7 4/6] net: mana: Use GIC functions to allocate global EQs
From: Jakub Kicinski @ 2026-05-12  2:14 UTC (permalink / raw)
  To: longli
  Cc: Jakub Kicinski, kotaranov, davem, pabeni, edumazet, andrew+netdev,
	jgg, leon, haiyangz, kys, wei.liu, decui, shradhagupta, horms,
	netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-5-longli@microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
net: mana: Use GIC functions to allocate global EQs

This commit replaces the GDMA global interrupt setup code with the new GIC
allocation and release functions to manage interrupt contexts via refcounting.

> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 81bf379283663..72a67b0b2b5d4 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -1918,30 +1918,13 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
>  	 * further used in irq_setup()
>  	 */
>  	for (i = 1; i <= nvec; i++) {
> -		gic = kzalloc_obj(*gic);
> +		gic = mana_gd_get_gic(gc, false, &i);
>  		if (!gic) {
>  			err = -ENOMEM;
>  			goto free_irq;
>  		}
> -		gic->handler = mana_gd_process_eq_events;
> -		INIT_LIST_HEAD(&gic->eq_list);
> -		spin_lock_init(&gic->lock);
> -
> -		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
> -			 i - 1, pci_name(pdev));

Does moving the name generation into mana_gd_get_gic() unintentionally change
the IRQ names exposed in /proc/interrupts?

In mana_gd_get_gic(), the names are now generated as "mana_msi%d" using the
raw MSI index:

	if (!gic->msi)
		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
			 pci_name(dev));
	else
		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_msi%d@pci:%s",
			 gic->msi, pci_name(dev));

This could break userspace tooling (like irqbalance or monitoring scripts) that
might rely on parsing the "mana_q*" pattern to bind network queues.

[ ... ]

> @@ -1963,19 +1946,11 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
>  	kfree(irqs);
>  	return 0;
>  
> -free_current_gic:
> -	kfree(gic);
>  free_irq:
>  	for (i -= 1; i > 0; i--) {
>  		irq = pci_irq_vector(pdev, i);
> -		gic = xa_load(&gc->irq_contexts, i);
> -		if (WARN_ON(!gic))
> -			continue;
> -
>  		irq_update_affinity_hint(irq, NULL);
> -		free_irq(irq, gic);
> -		xa_erase(&gc->irq_contexts, i);
> -		kfree(gic);
> +		mana_gd_put_gic(gc, false, i);
>  	}
>  	kfree(irqs);
>  	return err;

[ ... ]

> @@ -2139,26 +2085,20 @@ static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev)
>  static void mana_gd_remove_irqs(struct pci_dev *pdev)
>  {
>  	struct gdma_context *gc = pci_get_drvdata(pdev);
> -	struct gdma_irq_context *gic;
>  	int irq, i;
>  
>  	if (gc->max_num_msix < 1)
>  		return;
>  
>  	for (i = 0; i < gc->max_num_msix; i++) {
> -		irq = pci_irq_vector(pdev, i);
> -		if (irq < 0)
> -			continue;
> -
> -		gic = xa_load(&gc->irq_contexts, i);
> -		if (WARN_ON(!gic))
> +		if (!xa_load(&gc->irq_contexts, i))
>  			continue;
>  
>  		/* Need to clear the hint before free_irq */
> +		irq = pci_irq_vector(pdev, i);
>  		irq_update_affinity_hint(irq, NULL);
> -		free_irq(irq, gic);
> -		xa_erase(&gc->irq_contexts, i);
> -		kfree(gic);
> +
> +		mana_gd_put_gic(gc, false, i);
>  	}

Is it safe to unconditionally call irq_update_affinity_hint() in these cleanup
paths?

Now that IRQs can be shared and are reference counted via mana_gd_put_gic(),
unconditionally clearing the affinity hint here could disrupt the routing for
any remaining active users sharing this IRQ. mana_gd_put_gic() already clears
the affinity hint internally when the refcount reaches zero.

Also, since the "if (irq < 0)" check was removed in mana_gd_remove_irqs(), if
pci_irq_vector() returns a negative error like -EINVAL, we would pass a
negative value to irq_update_affinity_hint(), which expects an unsigned int.

^ permalink raw reply

* Re: [PATCH net-next v7 0/6] net: mana: Per-vPort EQ and MSI-X interrupt management
From: Jakub Kicinski @ 2026-05-12  2:15 UTC (permalink / raw)
  To: Long Li
  Cc: Konstantin Taranov, David S . Miller, Paolo Abeni, Eric Dumazet,
	Andrew Lunn, Jason Gunthorpe, Leon Romanovsky, Haiyang Zhang,
	K . Y . Srinivasan, Wei Liu, Dexuan Cui, shradhagupta,
	Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-1-longli@microsoft.com>

On Thu,  7 May 2026 12:12:31 -0700 Long Li wrote:
> This series adds per-vPort Event Queue (EQ) allocation and MSI-X interrupt
> management for the MANA driver. Previously, all vPorts shared a single set
> of EQs. This change enables dedicated EQs per vPort with support for both
> dedicated and shared MSI-X vector allocation modes.

Once all the AI review comments are addressed / only false positives
remain - could you pop these patches on a branch and add PR info
to the cover letter so that both RDMA and netdev can pull this?

^ permalink raw reply

* Re: [PATCH v8 2/2] net: mana: force full-page RX buffers via ethtool private flag
From: Jakub Kicinski @ 2026-05-12  2:21 UTC (permalink / raw)
  To: dipayanroy
  Cc: Jakub Kicinski, kys, haiyangz, wei.liu, decui, andrew+netdev,
	davem, edumazet, pabeni, leon, longli, kotaranov, horms,
	shradhagupta, ssengar, ernis, shirazsaleem, linux-hyperv, netdev,
	linux-kernel, linux-rdma, stephen, jacob.e.keller, dipayanroy,
	leitao, kees, john.fastabend, hawk, bpf, daniel, ast, sdf,
	yury.norov
In-Reply-To: <20260508115100.488506-3-dipayanroy@linux.microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
net: mana: force full-page RX buffers via ethtool private flag

This patch adds an ethtool private flag "full-page-rx" that forces one
RX buffer per page, bypassing the page_pool fragment path, to address
a 15-20% throughput regression seen on some 4K PAGE_SIZE ARM64
platforms under high connection counts.  Toggling the flag triggers
mana_detach() + mana_attach() to apply the new RX buffer layout.

> diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> index a28ca461c135..dfd222de11d0 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c

[ ... ]

> @@ -590,6 +609,88 @@ static int mana_get_link_ksettings(struct net_device *ndev,
>  	return 0;
>  }
>  

[ ... ]

> +static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
> +{
> +	struct mana_port_context *apc = netdev_priv(ndev);
> +	u32 changed = apc->priv_flags ^ priv_flags;
> +	u32 old_priv_flags = apc->priv_flags;
> +	bool schedule_port_reset = false;
> +	int err = 0;
> +
> +	if (!changed)
> +		return 0;
> +
> +	/* Reject unknown bits */
> +	if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
> +		return -EINVAL;
> +
> +	if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
> +		apc->priv_flags = priv_flags;
> +
> +		if (!apc->port_is_up) {
> +			/* Port is down, flag updated to apply on next up
> +			 * so just return.
> +			 */
> +			return 0;
> +		}
> +
> +		/* Pre-allocate buffers to prevent failure in mana_attach
> +		 * later
> +		 */
> +		err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
> +		if (err) {
> +			netdev_err(ndev,
> +				   "Insufficient memory for new allocations\n");
> +			apc->priv_flags = old_priv_flags;
> +			return err;
> +		}
> +
> +		err = mana_detach(ndev, false);
> +		if (err) {
> +			netdev_err(ndev, "mana_detach failed: %d\n", err);
> +			apc->priv_flags = old_priv_flags;
> +
> +			/* Port is in an inconsistent state. Restore
> +			 * 'port_is_up' so that queue reset work handler
> +			 * can properly detach and re-attach.
> +			 */
> +			apc->port_is_up = true;
> +			schedule_port_reset = true;
> +			goto out;
> +		}
> +
> +		err = mana_attach(ndev);
> +		if (err) {
> +			netdev_err(ndev, "mana_attach failed: %d\n", err);
> +			apc->priv_flags = old_priv_flags;
> +
> +			/* Restore 'port_is_up' so the reset work handler
> +			 * can properly detach/attach. Without this,
> +			 * the handler sees port_is_up=false and skips
> +			 * queue allocation, leaving the port dead.
> +			 */
> +			apc->port_is_up = true;
> +			schedule_port_reset = true;
> +		}
> +	}
> +
> +out:
> +	mana_pre_dealloc_rxbufs(apc);
> +
> +	if (schedule_port_reset)
> +		queue_work(apc->ac->per_port_queue_reset_wq,
> +			   &apc->queue_reset_work);
> +
> +	return err;
> +}

[High]
Can this failure path oops inside the reset work handler?

When mana_attach() fails after mana_detach() has already torn the vport
down, apc->tx_qp has been set to NULL by mana_destroy_txq() and apc->rxqs
has been cleared by mana_cleanup_port_context().  This code then forces
apc->port_is_up back to true and queues apc->queue_reset_work:

    apc->port_is_up = true;
    schedule_port_reset = true;

When mana_per_port_queue_reset_work_handler runs it calls
mana_detach(ndev, false), which latches port_st_save = port_is_up = true
and, because port_st_save is true, invokes mana_dealloc_queues().  The TX
drain loop there does:

    txq = &apc->tx_qp[i].txq;
    atomic_read(&txq->pending_sends);

with no NULL check on apc->tx_qp, and mana_destroy_vport further down
iterates apc->rxqs[rxq_idx] while rxqs is NULL.  Would that produce a
NULL pointer dereference in the workqueue worker whenever mana_attach()
fails here (for example on an allocation failure in
mana_init_port_context or mana_create_txq, or a transient HW command
failure in mana_query_vport_cfg / mana_cfg_vport)?

The mana_detach() failure path above (goto out) reaches
mana_dealloc_queues() through the same chain and looks to have the same
exposure.

For comparison, mana_change_mtu() handles a mana_attach() failure by
returning the error without scheduling a reset.  Would a similar
treatment here avoid the asynchronous oops, or is there a reason the
reset must be scheduled in this specific failure case?
-- 
pw-bot: cr

^ permalink raw reply

* Re: [PATCH V1 1/3] mshv: Import declarations for irq remap and add irqbypass support
From: Souradeep Chakrabarti @ 2026-05-12  4:59 UTC (permalink / raw)
  To: Mukesh R
  Cc: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch
In-Reply-To: <20260512021242.1679786-2-mrathor@linux.microsoft.com>

On Mon, May 11, 2026 at 07:12:40PM -0700, Mukesh R wrote:
> For the irq map/remap hypercalls, copy relevant data structures from
> hypervisor public headers into Linux equivalents. Also, update Kconfig and
> mshv_irqfd for irqbypass. Please note, irqbypass is required for doing
> passthru on MSHV. This is because there is really no way of knowing the linux
> irq in the mshv_irqfd_assign and mshv_irqfd_update paths without it. The
> linux irq is setup upfront by VFIO before irqfd assign/update happens.
> 
Reviewed-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
> Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
> ---
>  drivers/hv/Kconfig          |  1 +
>  drivers/hv/mshv_eventfd.h   |  3 +++
>  include/hyperv/hvgdk_mini.h |  3 +++
>  include/hyperv/hvhdk.h      | 17 +++++++++++++++++
>  4 files changed, 24 insertions(+)
> 
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 7937ac0cbd0f..c831fe25ca2b 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -75,6 +75,7 @@ config MSHV_ROOT
>  	# no particular order, making it impossible to reassemble larger pages
>  	depends on PAGE_SIZE_4KB
>  	select EVENTFD
> +	select IRQ_BYPASS_MANAGER
>  	select VIRT_XFER_TO_GUEST_WORK
>  	select HMM_MIRROR
>  	select MMU_NOTIFIER
> diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h
> index 464c6b81ab33..ff4dd24b8ad4 100644
> --- a/drivers/hv/mshv_eventfd.h
> +++ b/drivers/hv/mshv_eventfd.h
> @@ -9,6 +9,7 @@
>  #define __LINUX_MSHV_EVENTFD_H
>  
>  #include <linux/poll.h>
> +#include <linux/irqbypass.h>
>  
>  #include "mshv.h"
>  #include "mshv_root.h"
> @@ -37,6 +38,8 @@ struct mshv_irqfd {
>  	struct mshv_irqfd_resampler	    *irqfd_resampler;
>  	struct eventfd_ctx		    *irqfd_resamplefd;
>  	struct hlist_node		     irqfd_resampler_hnode;
> +	struct irq_bypass_consumer	     irqfd_bypass_cons;
> +	struct irq_bypass_producer	    *irqfd_bypass_prod;
>  };
>  
>  void mshv_eventfd_init(struct mshv_partition *partition);
> diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> index da622fb06440..1ef480825705 100644
> --- a/include/hyperv/hvgdk_mini.h
> +++ b/include/hyperv/hvgdk_mini.h
> @@ -59,6 +59,8 @@ struct hv_u128 {
>  #define HV_PARTITION_ID_INVALID		((u64)0)
>  #define HV_PARTITION_ID_SELF		((u64)-1)
>  
> +#define HV_MAX_VPS    256               /* HV_MAXIMUM_PROCESSORS */
> +
>  /* Hyper-V specific model specific registers (MSRs) */
>  
>  #if defined(CONFIG_X86)
> @@ -508,6 +510,7 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
>  #define HVCALL_UNMAP_VP_STATE_PAGE			0x00e2
>  #define HVCALL_GET_VP_STATE				0x00e3
>  #define HVCALL_SET_VP_STATE				0x00e4
> +#define HVCALL_GET_VPSET_FROM_MDA                       0x00e5
>  #define HVCALL_GET_VP_CPUID_VALUES			0x00f4
>  #define HVCALL_GET_PARTITION_PROPERTY_EX		0x0101
>  #define HVCALL_MMIO_READ				0x0106
> diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
> index 5e83d3714966..d0a892347ab1 100644
> --- a/include/hyperv/hvhdk.h
> +++ b/include/hyperv/hvhdk.h
> @@ -952,4 +952,21 @@ struct hv_input_modify_sparse_spa_page_host_access {
>  #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE      0x4
>  #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_HUGE_PAGE       0x8
>  
> +#ifdef CONFIG_X86
> +
> +struct hv_input_get_vp_set_from_mda {   /* HV_OUTPUT_GET_VP_SET_FROM_MDA */
> +	u64 target_partid;
> +	u64 dest_address;
> +	u8  input_vtl;
> +	u8  destmode_logical;         /* true => mode is logical */
> +	u16 reserved0;                /* mbz */
> +	u32 reserved1;                /* mbz */
> +} __packed;
> +
> +union hv_output_get_vp_set_from_mda {  /* HV_OUTPUT_GET_VP_SET_FROM_MDA */
> +	struct hv_vpset target_vpset;
> +	u64 bitset_buffer[HV_GENERIC_SET_QWORD_COUNT(HV_MAX_VPS)];
> +} __packed;
> +
> +#endif /* CONFIG_X86 */
>  #endif /* _HV_HVHDK_H */
> -- 
> 2.51.2.vfs.0.1
> 

^ permalink raw reply

* Re: [PATCH 1/3] mm/hmm: Add hmm_range_fault_unlockable() for mmap lock-drop support
From: David Hildenbrand (Arm) @ 2026-05-12  8:42 UTC (permalink / raw)
  To: Stanislav Kinsburskii, kys, Liam.Howlett, akpm, decui, haiyangz,
	jgg, corbet, leon, longli, ljs, mhocko, rppt, shuah, skhan,
	surenb, vbabka, wei.liu
  Cc: linux-doc, linux-hyperv, linux-kernel, linux-kselftest, linux-mm
In-Reply-To: <177759840859.221039.13065406062747296947.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>


> +	for (; addr < end; addr += PAGE_SIZE) {
> +		vm_fault_t ret;
> +
> +		ret = handle_mm_fault(vma, addr, fault_flags, NULL);
> +
> +		if (ret & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)) {
> +			/*
> +			 * The mmap lock has been dropped by the fault handler.
> +			 * Record the failing address and signal lock-drop to
> +			 * the caller.
> +			 */
> +			*hmm_vma_walk->locked = 0;
> +			hmm_vma_walk->last = addr;
> +			return -EAGAIN;


Okay, so we'll return straight from hmm_vma_fault() to
hmm_vma_handle_pte()/hmm_vma_walk_pmd() -> walk_page_range() machinery.

Hopefully we don't refer to the MM/VMA on any path there? It would be nicer if
the hmm_vma_fault() could be called by the caller of walk_page_range(), but
that's tricky I guess, as hmm_vma_fault() consumes the walk structure and
requires the vma in there.


Note: am I wrong, or is hmm_vma_fault() really always called with
required_fault=true?

> +		}
> +
> +		if (ret & VM_FAULT_ERROR)
>  			return -EFAULT;
> +	}
>  	return -EBUSY;
>  }
>  
> @@ -566,6 +585,17 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
>  	if (required_fault) {
>  		int ret;
>  
> +		/*
> +		 * Faulting hugetlb pages on the unlockable path is not
> +		 * supported. The walk framework holds hugetlb_vma_lock_read
> +		 * which must be dropped before handle_mm_fault, but if the
> +		 * mmap lock is also dropped (VM_FAULT_RETRY), the vma may
> +		 * be freed and the walk framework's unconditional unlock
> +		 * becomes a use-after-free.
> +		 */
> +		if (hmm_vma_walk->locked)
> +			return -EFAULT;

Just because it's unlockable doesn't mean that you must unlock. Can't this be
kept working as is, just simulating here as if it would not be unlockable?


-- 
Cheers,

David

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox