public inbox for linux-arch@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v0 0/3] PCI passthru on Hyper-V (Part II)
@ 2026-04-29 23:15 Mukesh R
  2026-04-29 23:15 ` [PATCH v0 1/3] mshv: Import declarations for irq remap and add irqbypass support Mukesh R
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Mukesh R @ 2026-04-29 23:15 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch

This patch series implements interrupt remapping part of the PCI
passthru feature on Hyper-V when Linux is running as a privileged VM.
These patches complement the Part I of the feature at:

https://lore.kernel.org/linux-hyperv/20260422023239.1171963-1-mrathor@linux.microsoft.com/T/#t

Testing and other details are listed there.

Thanks,
-Mukesh

Mukesh R (3):
  mshv: Import declarations for irq remap and add irqbypass support
  hyperv: Implement irq remap for passthru devices
  mshv: Implement guest irq migration for passthru'd devices

 arch/x86/hyperv/irqdomain.c         |  18 +-
 drivers/hv/Kconfig                  |   1 +
 drivers/hv/mshv_eventfd.c           | 500 +++++++++++++++++++++++++++-
 drivers/hv/mshv_eventfd.h           |   3 +
 drivers/iommu/hyperv-iommu-root.c   |  14 +
 drivers/pci/controller/pci-hyperv.c |  10 +
 include/asm-generic/mshyperv.h      |   4 +
 include/hyperv/hvgdk_mini.h         |   3 +
 include/hyperv/hvhdk.h              |  17 +
 9 files changed, 564 insertions(+), 6 deletions(-)

-- 
2.51.2.vfs.0.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH v0 1/3] mshv: Import declarations for irq remap and add irqbypass support
  2026-04-29 23:15 [PATCH v0 0/3] PCI passthru on Hyper-V (Part II) Mukesh R
@ 2026-04-29 23:15 ` Mukesh R
  2026-04-29 23:15 ` [PATCH v0 2/3] hyperv: Implement irq remap for passthru devices Mukesh R
  2026-04-29 23:15 ` [PATCH v0 3/3] mshv: Implement guest irq migration for passthru'd devices Mukesh R
  2 siblings, 0 replies; 4+ messages in thread
From: Mukesh R @ 2026-04-29 23:15 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch

For the irq map/remap hypercalls, copy relevant data structures from
hypervisor public headers into Linux equivalents. Also, update Kconfig and
mshv_irqfd for irqbypass. Please note, irqbypass is required for doing
passthru on MSHV. This is because there is really no way of knowing the Linux
irq in the mshv_irqfd_assign and mshv_irqfd_update paths without it. The
Linux irq is set up upfront by VFIO before irqfd assign/update happens.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/Kconfig          |  1 +
 drivers/hv/mshv_eventfd.h   |  3 +++
 include/hyperv/hvgdk_mini.h |  3 +++
 include/hyperv/hvhdk.h      | 17 +++++++++++++++++
 4 files changed, 24 insertions(+)

diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 7937ac0cbd0f..c831fe25ca2b 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -75,6 +75,7 @@ config MSHV_ROOT
 	# no particular order, making it impossible to reassemble larger pages
 	depends on PAGE_SIZE_4KB
 	select EVENTFD
+	select IRQ_BYPASS_MANAGER
 	select VIRT_XFER_TO_GUEST_WORK
 	select HMM_MIRROR
 	select MMU_NOTIFIER
diff --git a/drivers/hv/mshv_eventfd.h b/drivers/hv/mshv_eventfd.h
index 464c6b81ab33..ff4dd24b8ad4 100644
--- a/drivers/hv/mshv_eventfd.h
+++ b/drivers/hv/mshv_eventfd.h
@@ -9,6 +9,7 @@
 #define __LINUX_MSHV_EVENTFD_H
 
 #include <linux/poll.h>
+#include <linux/irqbypass.h>
 
 #include "mshv.h"
 #include "mshv_root.h"
@@ -37,6 +38,8 @@ struct mshv_irqfd {
 	struct mshv_irqfd_resampler	    *irqfd_resampler;
 	struct eventfd_ctx		    *irqfd_resamplefd;
 	struct hlist_node		     irqfd_resampler_hnode;
+	struct irq_bypass_consumer	     irqfd_bypass_cons;
+	struct irq_bypass_producer	    *irqfd_bypass_prod;
 };
 
 void mshv_eventfd_init(struct mshv_partition *partition);
diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
index da622fb06440..1ef480825705 100644
--- a/include/hyperv/hvgdk_mini.h
+++ b/include/hyperv/hvgdk_mini.h
@@ -59,6 +59,8 @@ struct hv_u128 {
 #define HV_PARTITION_ID_INVALID		((u64)0)
 #define HV_PARTITION_ID_SELF		((u64)-1)
 
+#define HV_MAX_VPS    256               /* HV_MAXIMUM_PROCESSORS */
+
 /* Hyper-V specific model specific registers (MSRs) */
 
 #if defined(CONFIG_X86)
@@ -508,6 +510,7 @@ union hv_vp_assist_msr_contents {	 /* HV_REGISTER_VP_ASSIST_PAGE */
 #define HVCALL_UNMAP_VP_STATE_PAGE			0x00e2
 #define HVCALL_GET_VP_STATE				0x00e3
 #define HVCALL_SET_VP_STATE				0x00e4
+#define HVCALL_GET_VPSET_FROM_MDA                       0x00e5
 #define HVCALL_GET_VP_CPUID_VALUES			0x00f4
 #define HVCALL_GET_PARTITION_PROPERTY_EX		0x0101
 #define HVCALL_MMIO_READ				0x0106
diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index 5e83d3714966..d0a892347ab1 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -952,4 +952,21 @@ struct hv_input_modify_sparse_spa_page_host_access {
 #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE      0x4
 #define HV_MODIFY_SPA_PAGE_HOST_ACCESS_HUGE_PAGE       0x8
 
+#ifdef CONFIG_X86
+
+struct hv_input_get_vp_set_from_mda {   /* HV_INPUT_GET_VP_SET_FROM_MDA */
+	u64 target_partid;
+	u64 dest_address;
+	u8  input_vtl;
+	u8  destmode_logical;         /* true => mode is logical */
+	u16 reserved0;                /* mbz */
+	u32 reserved1;                /* mbz */
+} __packed;
+
+union hv_output_get_vp_set_from_mda {  /* HV_OUTPUT_GET_VP_SET_FROM_MDA */
+	struct hv_vpset target_vpset;
+	u64 bitset_buffer[HV_GENERIC_SET_QWORD_COUNT(HV_MAX_VPS)];
+} __packed;
+
+#endif /* CONFIG_X86 */
 #endif /* _HV_HVHDK_H */
-- 
2.51.2.vfs.0.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH v0 2/3] hyperv: Implement irq remap for passthru devices
  2026-04-29 23:15 [PATCH v0 0/3] PCI passthru on Hyper-V (Part II) Mukesh R
  2026-04-29 23:15 ` [PATCH v0 1/3] mshv: Import declarations for irq remap and add irqbypass support Mukesh R
@ 2026-04-29 23:15 ` Mukesh R
  2026-04-29 23:15 ` [PATCH v0 3/3] mshv: Implement guest irq migration for passthru'd devices Mukesh R
  2 siblings, 0 replies; 4+ messages in thread
From: Mukesh R @ 2026-04-29 23:15 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch

Implement interrupt remapping for direct attached and domain attached
devices on Hyper-V.

Please note there are a few constraints when it comes to mapping device
interrupts on Hyper-V. For example, the hypervisor will not allow mapping
device interrupts to root if the device is a direct attached device. Since
the target guest cpu and vector info is not available during the initial
VFIO irq setup, we work around by skipping this initial map. Then later
during irqbypass trigger, when both guest target cpu and vector are available,
we do the map in the hypervisor, update the device, and enable the
interrupt vector on the device. Rather than special-case direct attached,
we do the same for domain attached also. This implies irqbypass is required
for MSHV pci device passthru. Also noteworthy is that the hypervisor
will automatically set up any direct hw injection like posted interrupts.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 arch/x86/hyperv/irqdomain.c         |  18 +-
 drivers/hv/mshv_eventfd.c           | 422 +++++++++++++++++++++++++++-
 drivers/iommu/hyperv-iommu-root.c   |  14 +
 drivers/pci/controller/pci-hyperv.c |  10 +
 include/asm-generic/mshyperv.h      |   4 +
 5 files changed, 464 insertions(+), 4 deletions(-)

diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 527835b99a70..d32e912ad4a9 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -222,7 +222,7 @@ int hv_map_msi_interrupt(struct irq_data *data,
 
 	msidesc = irq_data_get_msi_desc(data);
 	pdev = msi_desc_to_pci_dev(msidesc);
-	hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
 	cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
 
 	return hv_map_interrupt(hv_current_partition_id, hv_devid, false, cpu,
@@ -258,6 +258,20 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 		return;
 	}
 
+	/*
+	 * For direct attached devices, we cannot map interrupts in the
+	 * hypervisor because it will not allow it until we have guest target
+	 * vcpu and vector. So defer it until irqbypass. Also, do the same
+	 * for domain attached devices for simplicity.
+	 */
+	if (hv_pcidev_is_pthru_dev(pdev)) {
+		if (data->chip_data)
+			entry_to_msi_msg(data->chip_data, msg);
+		else
+			memset(msg, 0, sizeof(struct msi_msg));
+		return;
+	}
+
 	if (data->chip_data) {
 		/*
 		 * This interrupt is already mapped. Let's unmap first.
@@ -297,7 +311,7 @@ static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
 {
 	union hv_device_id hv_devid;
 
-	hv_devid.as_uint64 = hv_build_devid_type_pci(pdev);
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
 	return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
 }
 
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 90959f639dc3..666e28f4a4b5 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -7,7 +7,6 @@
  *
  * All credits to kvm developers.
  */
-
 #include <linux/syscalls.h>
 #include <linux/wait.h>
 #include <linux/poll.h>
@@ -15,7 +14,8 @@
 #include <linux/list.h>
 #include <linux/workqueue.h>
 #include <linux/eventfd.h>
-
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
 #if IS_ENABLED(CONFIG_X86_64)
 #include <asm/apic.h>
 #endif
@@ -27,6 +27,376 @@
 
 static struct workqueue_struct *irqfd_cleanup_wq;
 
+#if IS_ENABLED(CONFIG_X86_64)
+
+static int mshv_parse_mshv_irqfd(struct mshv_irqfd *irqfd,
+				 struct pci_dev **out_pdev,
+				 struct irq_data **out_irqdata)
+{
+	struct irq_bypass_producer *prod;
+	struct msi_desc *msidesc;
+	struct irq_data *irqdata;
+
+	if (irqfd == NULL || irqfd->irqfd_bypass_prod == NULL)
+		return -ENODEV;
+
+	prod = irqfd->irqfd_bypass_prod;
+
+	irqdata = irq_get_irq_data(prod->irq);
+	if (irqdata == NULL) {
+		pr_err("Hyper-V: irqbypass fail, no irqdata. irq:0x%x\n",
+		       prod->irq);
+		return -EINVAL;
+	}
+	*out_irqdata = irqdata;
+
+	msidesc = irq_data_get_msi_desc(irqdata);
+	if (msidesc == NULL) {
+		pr_err("Hyper-V: irqbypass msi fail. irq:0x%x\n", prod->irq);
+		return -EINVAL;
+	}
+
+	*out_pdev = msi_desc_to_pci_dev(msidesc);
+	if (*out_pdev == NULL) {
+		pr_err("Hyper-V: mshv_irqfd parse fail. irq:0x%x\n", prod->irq);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Must be called with interrupts disabled */
+static int hv_vpset_from_hyp_disabled(
+			struct hv_input_get_vp_set_from_mda *input,
+			union hv_output_get_vp_set_from_mda *output,
+			struct mshv_lapic_irq *lapic_irq, u64 partid)
+{
+	u64 status;
+
+	memset(input, 0, sizeof(*input));
+	input->target_partid = partid;
+	input->dest_address = lapic_irq->lapic_apic_id;
+	input->input_vtl = 0;
+	input->destmode_logical = lapic_irq->lapic_control.logical_dest_mode;
+
+	status = hv_do_hypercall(HVCALL_GET_VPSET_FROM_MDA, input, output);
+	if (!hv_result_success(status)) {
+		hv_status_err(status, "apicid:0x%llx dest:0x%x\n",
+			      lapic_irq->lapic_apic_id,
+			      lapic_irq->lapic_control.logical_dest_mode);
+	}
+
+	return hv_result_to_errno(status);
+}
+
+/* Returns number of banks copied, -errno in case of error */
+static int hv_copy_vpset(struct hv_vpset *dest, struct hv_vpset *src)
+{
+	u64 bank_mask;
+	int banks, tot_banks = hv_max_vp_index / HV_VCPUS_PER_SPARSE_BANK;
+
+	if (tot_banks >= HV_MAX_SPARSE_VCPU_BANKS)
+		return -EINVAL;
+
+	dest->format = src->format;
+	dest->valid_bank_mask = src->valid_bank_mask;
+	bank_mask = src->valid_bank_mask;
+	for (banks = 0; banks <= tot_banks; banks++) {
+		if (bank_mask == 0)
+			break;
+
+		if (bank_mask & 1)
+			dest->bank_contents[banks] = src->bank_contents[banks];
+		bank_mask = bank_mask >> 1;
+	}
+
+	return banks;
+}
+
+static int mshv_map_device_interrupt(u64 ptid, union hv_device_id hv_devid,
+				     struct mshv_lapic_irq *ginfo,
+				     struct hv_interrupt_entry *ret_entry,
+				     u64 *ret_status)
+{
+	struct hv_input_map_device_interrupt *irq_input;
+	struct hv_output_map_device_interrupt *irq_output;
+	struct hv_device_interrupt_descriptor *intdesc;
+	struct hv_input_get_vp_set_from_mda *mda_input;
+	union hv_output_get_vp_set_from_mda *mda_output;
+	ulong flags;
+	u64 status;
+	int rc, var_size;
+
+	*ret_status = U64_MAX;
+	local_irq_save(flags);
+
+	mda_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	mda_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+	/*
+	 * Map Device Interrupt hcall needs vp set based on vp indexes used
+	 * during vp creation. Here we have lapic-id of the vp only. Easiest
+	 * is to just ask the hypervisor for the vp set matching the lapic-id.
+	 */
+	rc = hv_vpset_from_hyp_disabled(mda_input, mda_output, ginfo, ptid);
+	if (rc)
+		goto out;	/* error already printed */
+
+	irq_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	irq_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+	memset(irq_input, 0, sizeof(*irq_input));
+
+	irq_input->partition_id = ptid;
+	irq_input->device_id = hv_devid.as_uint64;
+
+	intdesc = &irq_input->interrupt_descriptor;
+	intdesc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED;
+	intdesc->vector_count = 1;
+	intdesc->target.vector = ginfo->lapic_vector;
+	intdesc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE;
+
+	intdesc->target.vp_set.valid_bank_mask = 0;
+	intdesc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K;
+	intdesc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+	rc = hv_copy_vpset(&intdesc->target.vp_set, &mda_output->target_vpset);
+	if (rc <= 0) {
+		pr_err("Hyper-V: ptid %lld - (irq)vpset copy failed (%d)\n",
+		       ptid, rc);
+		goto out;
+	}
+
+	/*
+	 * var-sized hcall: var-size starts after vp_mask (thus vp_set.format
+	 * does not count, but vp_set.valid_bank_mask does).
+	 */
+	var_size = rc + 1;
+	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size,
+				     irq_input, irq_output);
+	*ret_entry = irq_output->interrupt_entry;
+	local_irq_restore(flags);
+
+	rc = 0;
+	if (!hv_result_success(status)) {
+		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+			hv_status_err(status, "pt:%lld vec:%d lapic-id:%lld\n",
+			      ptid, ginfo->lapic_vector, ginfo->lapic_apic_id);
+		*ret_status = status;
+		rc = hv_result_to_errno(status);
+	}
+
+	return rc;
+
+out:
+	local_irq_restore(flags);
+	return rc;
+
+}
+
+static int mshv_unmap_device_interrupt(union hv_device_id hv_devid,
+				       struct hv_interrupt_entry *irq_entry)
+{
+	unsigned long flags;
+	struct hv_input_unmap_device_interrupt *input;
+	u64 status;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
+		input->partition_id = hv_get_current_partid();
+	else
+		input->partition_id = hv_current_partition_id;
+
+	input->device_id = hv_devid.as_uint64;
+	input->interrupt_entry = *irq_entry;
+
+	status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "\n");
+
+	return hv_result_to_errno(status);
+}
+
+static int mshv_chk_unmap_irq(union hv_device_id hv_devid,
+			      struct irq_data *irqdata)
+{
+	int rc;
+
+	if (irqdata->chip_data == NULL)
+		return 0;
+
+	rc = mshv_unmap_device_interrupt(hv_devid, irqdata->chip_data);
+	if (rc)
+		return rc;
+
+	kfree(irqdata->chip_data);
+	irqdata->chip_data = NULL;
+
+	return 0;
+}
+
+/*
+ * Synchronize device update with VFIO.
+ *    See: vfio_pci_memory_lock_and_enable()
+ */
+static u16 mshv_pci_memory_lock_and_enable(struct vfio_pci_core_device *cdev)
+{
+	u16 cmd;
+
+	down_write(&cdev->memory_lock);
+	pci_read_config_word(cdev->pdev, PCI_COMMAND, &cmd);
+	if (!(cmd & PCI_COMMAND_MEMORY))
+		pci_write_config_word(cdev->pdev, PCI_COMMAND,
+				      cmd | PCI_COMMAND_MEMORY);
+	return cmd;
+}
+
+static void mshv_pci_memory_unlock_and_restore(
+					struct vfio_pci_core_device *cdev,
+					u16 cmd)
+{
+	pci_write_config_word(cdev->pdev, PCI_COMMAND, cmd);
+	up_write(&cdev->memory_lock);
+}
+
+static void mshv_make_device_usable(struct pci_dev *pdev, int vector,
+				    struct hv_interrupt_entry *hv_entry)
+{
+	int lirq;
+	struct msi_msg msimsg;
+	struct irq_data *irqdata;
+	u16 pcicmd;
+	struct vfio_pci_core_device *coredev = dev_get_drvdata(&pdev->dev);
+
+	if (pdev->dev.driver == NULL ||
+	    strcmp(pdev->dev.driver->name, "vfio-pci") != 0) {
+		pr_err("Hyper-V: irqbypass: non vfio device %s\n",
+		       pci_name(pdev));
+		return;
+	}
+	if (coredev == NULL) {
+		pr_err("Hyper-V: irqbypass: null vfio device for %s\n",
+		       pci_name(pdev));
+		return;
+	}
+
+	if (hv_entry->source != HV_INTERRUPT_SOURCE_MSI) {
+		pr_err("Hyper-V: %s irq source not msi\n", pci_name(pdev));
+		return;
+	}
+
+	lirq = pci_irq_vector(pdev, vector);
+	irqdata = irq_get_irq_data(lirq);
+	if (irqdata == NULL) {
+		pr_err("Hyper-V: null irq_data for write msimsg. lirq:0x%x\n",
+		       lirq);
+		return;
+	}
+
+	msimsg.address_hi = 0;
+	msimsg.address_lo = hv_entry->msi_entry.address.as_uint32;
+	msimsg.data =  hv_entry->msi_entry.data.as_uint32;
+
+	pcicmd = mshv_pci_memory_lock_and_enable(coredev);
+	pci_write_msi_msg(lirq, &msimsg);
+	mshv_pci_memory_unlock_and_restore(coredev, pcicmd);
+
+	pci_msi_unmask_irq(irqdata);
+
+	if (irqdata->parent_data)
+		irq_chip_unmask_parent(irqdata);
+}
+
+/*
+ * This guest has a device passthru'd to it. VFIO did the initial setup of
+ * the device interrupts, but we left them unmapped in the hypervisor
+ * because we didn't have the guest target cpu and vector (required by
+ * hypervisor). We have them now, so do the map hypercall.
+ * Also, when here, it is expected that the device global mask is unset
+ * but individual MSI/x masks are set. Goal here is to map the interrupt in
+ * the hypervisor, update the corresponding device MSI/x entry, and enable it.
+ */
+static void mshv_pthru_dev_irq_remap(struct mshv_irqfd *irqfd)
+{
+	u64 ptid, status;
+	struct pci_dev *pdev;
+	int rc, deposit_pgs = 16;
+	struct mshv_lapic_irq *ginfo = &irqfd->irqfd_lapic_irq;
+	union hv_device_id hv_devid;
+	struct hv_interrupt_entry *new_entry;
+	struct irq_data *irqdata;
+
+	if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+	    irqfd->irqfd_bypass_prod == NULL)
+		return;
+
+	rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+	if (rc)
+		return;
+
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+
+	rc = mshv_chk_unmap_irq(hv_devid, irqdata);
+	if (rc)
+		return;
+
+	new_entry = kmalloc(sizeof(*new_entry), GFP_ATOMIC);
+	if (new_entry == NULL)
+		return;
+
+	ptid = irqfd->irqfd_partn->pt_id;
+
+	while (deposit_pgs--) {
+		rc = mshv_map_device_interrupt(ptid, hv_devid, ginfo, new_entry,
+					       &status);
+		if (rc == 0)
+			break;
+		if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+			break;
+
+		rc = hv_call_deposit_pages(NUMA_NO_NODE, ptid, 1);
+		if (rc)
+			break;
+	}
+	if (rc) {
+		kfree(new_entry);
+		return;
+	}
+
+	irqdata->chip_data = new_entry;
+
+	mshv_make_device_usable(pdev, irqdata->hwirq, new_entry);
+}
+
+static void mshv_pthru_dev_irq_undo(struct mshv_irqfd *irqfd)
+{
+	struct pci_dev *pdev;
+	union hv_device_id hv_devid;
+	struct irq_data *irqdata;
+	int rc;
+
+	if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+	    irqfd->irqfd_bypass_prod == NULL)
+		return;
+
+	rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+	if (rc)
+		return;
+
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+	mshv_chk_unmap_irq(hv_devid, irqdata);
+}
+
+#else /* IS_ENABLED(CONFIG_X86_64) */
+
+static void mshv_pthru_dev_irq_remap(struct mshv_irqfd *irqfd) { }
+static void mshv_pthru_dev_irq_undo(struct mshv_irqfd *irqfd) { }
+
+#endif /* IS_ENABLED(CONFIG_X86_64) */
+
 void mshv_register_irq_ack_notifier(struct mshv_partition *partition,
 				    struct mshv_irq_ack_notifier *mian)
 {
@@ -264,6 +634,7 @@ static void mshv_irqfd_shutdown(struct work_struct *work)
 	/*
 	 * It is now safe to release the object's resources
 	 */
+	irq_bypass_unregister_consumer(&irqfd->irqfd_bypass_cons);
 	eventfd_ctx_put(irqfd->irqfd_eventfd_ctx);
 	kfree(irqfd);
 }
@@ -286,6 +657,12 @@ static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
 
 	hlist_del(&irqfd->irqfd_hnode);
 
+	/*
+	 * Cleanup interrupt map (kfree chip_data) while in a VMM thread as
+	 * unmap needs partition id. mshv_irqfd_shutdown() runs in a kthread.
+	 */
+	mshv_pthru_dev_irq_undo(irqfd);
+
 	queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
 }
 
@@ -383,6 +760,45 @@ static void mshv_irqfd_queue_proc(struct file *file, wait_queue_head_t *wqh,
 	add_wait_queue_priority(wqh, &irqfd->irqfd_wait);
 }
 
+static int mshv_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+					struct irq_bypass_producer *prod)
+{
+	struct mshv_irqfd *irqfd;
+
+	irqfd = container_of(cons, struct mshv_irqfd, irqfd_bypass_cons);
+	irqfd->irqfd_bypass_prod = prod;
+
+	mshv_pthru_dev_irq_remap(irqfd);
+
+	return 0;
+}
+
+static void mshv_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+					 struct irq_bypass_producer *prod)
+{
+	struct mshv_irqfd *irqfd;
+
+	irqfd = container_of(cons, struct mshv_irqfd, irqfd_bypass_cons);
+
+	WARN_ON(irqfd->irqfd_bypass_prod != prod);
+	irqfd->irqfd_bypass_prod = NULL;
+
+}
+
+static void mshv_setup_irq_bypass(struct mshv_irqfd *irqfd,
+				  struct eventfd_ctx *eventfd)
+{
+	struct irq_bypass_consumer *consumer = &irqfd->irqfd_bypass_cons;
+	int rc;
+
+	consumer->add_producer = mshv_irq_bypass_add_producer;
+	consumer->del_producer = mshv_irq_bypass_del_producer;
+	rc = irq_bypass_register_consumer(&irqfd->irqfd_bypass_cons, eventfd);
+	if (rc)
+		pr_err("Hyper-V: irq bypass consumer registration failed: %d\n",
+		       rc);
+}
+
 static int mshv_irqfd_assign(struct mshv_partition *pt,
 			     struct mshv_user_irqfd *args)
 {
@@ -509,6 +925,8 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 	if (events & EPOLLIN)
 		mshv_assert_irq_slow(irqfd);
 
+	mshv_setup_irq_bypass(irqfd, eventfd);
+
 	srcu_read_unlock(&pt->pt_irq_srcu, idx);
 	return 0;
 
diff --git a/drivers/iommu/hyperv-iommu-root.c b/drivers/iommu/hyperv-iommu-root.c
index 739bbf39dea2..3e078e9213f9 100644
--- a/drivers/iommu/hyperv-iommu-root.c
+++ b/drivers/iommu/hyperv-iommu-root.c
@@ -219,6 +219,20 @@ u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type)
 }
 EXPORT_SYMBOL_GPL(hv_build_devid_oftype);
 
+/* Build device id for the interrupt path */
+u64 hv_devid_from_pdev(struct pci_dev *pdev)
+{
+	enum hv_device_type dev_type;
+
+	if (hv_pcidev_is_attached_dev(pdev))
+		dev_type = HV_DEVICE_TYPE_LOGICAL;
+	else
+		dev_type = HV_DEVICE_TYPE_PCI;
+
+	return hv_build_devid_oftype(pdev, dev_type);
+}
+EXPORT_SYMBOL_GPL(hv_devid_from_pdev);
+
 /* Create a new device domain in the hypervisor */
 static int hv_iommu_create_hyp_devdom(struct hv_domain *hvdom)
 {
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 8f6b818ee09b..8ecc909c3415 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -1745,6 +1745,16 @@ static void hv_irq_mask(struct irq_data *data)
 
 static void hv_irq_unmask(struct irq_data *data)
 {
+	struct pci_dev *pdev;
+	struct msi_desc *msi_desc;
+
+	msi_desc = irq_data_get_msi_desc(data);
+	pdev = msi_desc_to_pci_dev(msi_desc);
+
+	/* Done during bypass setup in mshv_eventfd.c: mshv_irqfd_assign() */
+	if (hv_pcidev_is_pthru_dev(pdev))
+		return;
+
 	hv_arch_irq_unmask(data);
 
 	if (data->parent_data->chip->irq_unmask)
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index edbcfc2a9b60..887605aa9c95 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -341,6 +341,7 @@ u64 hv_get_current_partid(void);
 bool hv_pcidev_is_attached_dev(struct pci_dev *pdev);
 bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev);
 u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type);
+u64 hv_devid_from_pdev(struct pci_dev *pdev);
 
 #else /* Remove following after arm64 implementation is done */
 
@@ -354,6 +355,9 @@ static inline u64 hv_build_devid_oftype(struct pci_dev *pdev,
 					enum hv_device_type type)
 { return 0; }
 
+static inline u64 hv_devid_from_pdev(struct pci_dev *pdev)
+{ return 0; }
+
 static inline u64 hv_get_current_partid(void)
 { return HV_PARTITION_ID_INVALID; }
 #endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
-- 
2.51.2.vfs.0.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH v0 3/3] mshv: Implement guest irq migration for passthru'd devices
  2026-04-29 23:15 [PATCH v0 0/3] PCI passthru on Hyper-V (Part II) Mukesh R
  2026-04-29 23:15 ` [PATCH v0 1/3] mshv: Import declarations for irq remap and add irqbypass support Mukesh R
  2026-04-29 23:15 ` [PATCH v0 2/3] hyperv: Implement irq remap for passthru devices Mukesh R
@ 2026-04-29 23:15 ` Mukesh R
  2 siblings, 0 replies; 4+ messages in thread
From: Mukesh R @ 2026-04-29 23:15 UTC (permalink / raw)
  To: hpa, robin.murphy, robh, linux-hyperv, linux-kernel, iommu,
	linux-pci, linux-arch

Ask the hypervisor to retarget interrupts to new guest cpu or vector
upon guest irq migration. This happens in the irqfd update path.

Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c | 78 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 666e28f4a4b5..0d0f1229f500 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -192,6 +192,77 @@ static int mshv_map_device_interrupt(u64 ptid, union hv_device_id hv_devid,
 
 }
 
+/* NOTE: caller does spin_lock_irq on pt_irqfds_lock, hence no disable here */
+static void mshv_do_guest_irq_retarget(u64 partid, struct mshv_irqfd *irqfd)
+{
+	int rc, var_size;
+	u64 status;
+	union hv_device_id hv_devid;
+	struct hv_input_get_vp_set_from_mda *mda_input;
+	union hv_output_get_vp_set_from_mda *mda_output;
+	struct hv_retarget_device_interrupt *remap_inp;
+	struct pci_dev *pdev;
+	struct irq_data *irqdata;
+	struct mshv_lapic_irq *lapic_irq = &irqfd->irqfd_lapic_irq;
+	struct hv_interrupt_entry *inte = NULL;
+
+	if (!irqfd->irqfd_girq_ent.girq_entry_valid ||
+	    irqfd->irqfd_bypass_prod == NULL)
+		return;
+
+	rc = mshv_parse_mshv_irqfd(irqfd, &pdev, &irqdata);
+	if (rc)
+		return;
+
+	inte = irqdata->chip_data;
+	if (inte == NULL)
+		return;
+
+	hv_devid.as_uint64 = hv_devid_from_pdev(pdev);
+
+
+	mda_input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	mda_output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+	rc = hv_vpset_from_hyp_disabled(mda_input, mda_output, lapic_irq,
+					partid);
+	if (rc)
+		return;
+
+	remap_inp = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(remap_inp, 0, sizeof(*remap_inp));
+
+	rc = hv_copy_vpset(&remap_inp->int_target.vp_set,
+			   &mda_output->target_vpset);
+	if (rc <= 0) {
+		pr_err("Hyper-V: ptid %lld - vpset copy failed (%d)\n",
+		       partid, rc);
+		return;
+	}
+
+	/*
+	 * var-sized hcall: var-size starts after vp_mask (thus vp_set.format
+	 * does not count, but vp_set.valid_bank_mask does).
+	 */
+	var_size = rc + 1;
+
+	remap_inp->partition_id = partid;
+	remap_inp->device_id = hv_devid.as_uint64;
+	remap_inp->int_target.vector = lapic_irq->lapic_vector;
+	remap_inp->int_target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
+
+	remap_inp->int_entry.source = inte->source;
+	remap_inp->int_entry.msi_entry.as_uint64 = inte->msi_entry.as_uint64;
+
+	status = hv_do_rep_hypercall(HVCALL_RETARGET_INTERRUPT, 0, var_size,
+				     remap_inp, NULL);
+
+	if (!hv_result_success(status))
+		hv_status_err(status, "pt:%lld vec:%d lapic-id:%lld\n",
+			      partid, lapic_irq->lapic_vector,
+			      lapic_irq->lapic_apic_id);
+}
+
 static int mshv_unmap_device_interrupt(union hv_device_id hv_devid,
 				       struct hv_interrupt_entry *irq_entry)
 {
@@ -728,9 +799,12 @@ static void mshv_irqfd_update(struct mshv_partition *pt,
 			      struct mshv_irqfd *irqfd)
 {
 	write_seqcount_begin(&irqfd->irqfd_irqe_sc);
-	irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt,
-						    irqfd->irqfd_irqnum);
+	irqfd->irqfd_girq_ent = mshv_ret_girq_entry(pt, irqfd->irqfd_irqnum);
 	mshv_copy_girq_info(&irqfd->irqfd_girq_ent, &irqfd->irqfd_lapic_irq);
+
+#if IS_ENABLED(CONFIG_X86_64)
+	mshv_do_guest_irq_retarget(pt->pt_id, irqfd);
+#endif
 	write_seqcount_end(&irqfd->irqfd_irqe_sc);
 }
 
-- 
2.51.2.vfs.0.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2026-04-29 23:15 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-29 23:15 [PATCH v0 0/3] PCI passthru on Hyper-V (Part II) Mukesh R
2026-04-29 23:15 ` [PATCH v0 1/3] mshv: Import declarations for irq remap and add irqbypass support Mukesh R
2026-04-29 23:15 ` [PATCH v0 2/3] hyperv: Implement irq remap for passthru devices Mukesh R
2026-04-29 23:15 ` [PATCH v0 3/3] mshv: Implement guest irq migration for passthru'd devices Mukesh R

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox