Linux-HyperV List
 help / color / mirror / Atom feed
* Re: [PATCH 2/4] mshv: Introduce hv_deposit_memory helper functions
From: Mukesh R @ 2026-01-24  0:33 UTC (permalink / raw)
  To: Stanislav Kinsburskii, kys, haiyangz, wei.liu, decui, longli
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <176913212322.89165.12915292926444353627.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

On 1/22/26 17:35, Stanislav Kinsburskii wrote:
> Introduce hv_deposit_memory_node() and hv_deposit_memory() helper
> functions to handle memory deposition with proper error handling.
> 
> The new hv_deposit_memory_node() function takes the hypervisor status
> as a parameter and validates it before depositing pages. It checks for
> HV_STATUS_INSUFFICIENT_MEMORY specifically and returns an error for
> unexpected status codes.
> 
> This is a precursor patch to new out-of-memory error codes support.
> No functional changes intended.
> 
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
>   drivers/hv/hv_proc.c           |   22 ++++++++++++++++++++--
>   drivers/hv/mshv_root_hv_call.c |   25 +++++++++----------------
>   drivers/hv/mshv_root_main.c    |    3 +--
>   include/asm-generic/mshyperv.h |   10 ++++++++++
>   4 files changed, 40 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
> index 80c66d1c74d5..c0c2bfc80d77 100644
> --- a/drivers/hv/hv_proc.c
> +++ b/drivers/hv/hv_proc.c
> @@ -110,6 +110,23 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
>   }
>   EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
>   
> +int hv_deposit_memory_node(int node, u64 partition_id,
> +			   u64 hv_status)
> +{
> +	u32 num_pages;
> +
> +	switch (hv_result(hv_status)) {
> +	case HV_STATUS_INSUFFICIENT_MEMORY:
> +		num_pages = 1;
> +		break;
> +	default:
> +		hv_status_err(hv_status, "Unexpected!\n");
> +		return -ENOMEM;
> +	}
> +	return hv_call_deposit_pages(node, partition_id, num_pages);
> +}
> +EXPORT_SYMBOL_GPL(hv_deposit_memory_node);
> +

Different hypercalls may want to deposit different number of pages in one
shot. As feature evolves, page sizes get mixed, we'd almost need that
flexibility. So, imo, either we just don't do this for now, or add num pages
parameter to be passed down.

Thanks,
-Mukesh



>   bool hv_result_oom(u64 status)
>   {
>   	switch (hv_result(status)) {
> @@ -155,7 +172,8 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
>   			}
>   			break;
>   		}
> -		ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
> +		ret = hv_deposit_memory_node(node, hv_current_partition_id,
> +					     status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -197,7 +215,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
>   			}
>   			break;
>   		}
> -		ret = hv_call_deposit_pages(node, partition_id, 1);
> +		ret = hv_deposit_memory_node(node, partition_id, status);
>   
>   	} while (!ret);
>   
> diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> index 58c5cbf2e567..06f2bac8039d 100644
> --- a/drivers/hv/mshv_root_hv_call.c
> +++ b/drivers/hv/mshv_root_hv_call.c
> @@ -123,8 +123,7 @@ int hv_call_create_partition(u64 flags,
>   			break;
>   		}
>   		local_irq_restore(irq_flags);
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> -					    hv_current_partition_id, 1);
> +		ret = hv_deposit_memory(hv_current_partition_id, status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -151,7 +150,7 @@ int hv_call_initialize_partition(u64 partition_id)
>   			ret = hv_result_to_errno(status);
>   			break;
>   		}
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
> +		ret = hv_deposit_memory(partition_id, status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -465,8 +464,7 @@ int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
>   		}
>   		local_irq_restore(flags);
>   
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> -					    partition_id, 1);
> +		ret = hv_deposit_memory(partition_id, status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -525,8 +523,7 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
>   		}
>   		local_irq_restore(flags);
>   
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> -					    partition_id, 1);
> +		ret = hv_deposit_memory(partition_id, status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -573,7 +570,7 @@ static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
>   
>   		local_irq_restore(flags);
>   
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
> +		ret = hv_deposit_memory(partition_id, status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -722,8 +719,7 @@ hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
>   			ret = hv_result_to_errno(status);
>   			break;
>   		}
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1);
> -
> +		ret = hv_deposit_memory(port_partition_id, status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -776,8 +772,7 @@ hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
>   			ret = hv_result_to_errno(status);
>   			break;
>   		}
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> -					    connection_partition_id, 1);
> +		ret = hv_deposit_memory(connection_partition_id, status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -848,8 +843,7 @@ static int hv_call_map_stats_page2(enum hv_stats_object_type type,
>   			break;
>   		}
>   
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> -					    hv_current_partition_id, 1);
> +		ret = hv_deposit_memory(hv_current_partition_id, status);
>   	} while (!ret);
>   
>   	return ret;
> @@ -885,8 +879,7 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type,
>   			return ret;
>   		}
>   
> -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> -					    hv_current_partition_id, 1);
> +		ret = hv_deposit_memory(hv_current_partition_id, status);
>   		if (ret)
>   			return ret;
>   	} while (!ret);
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index f4697497f83e..5fc572e31cd7 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -264,8 +264,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
>   		if (!hv_result_oom(status))
>   			ret = hv_result_to_errno(status);
>   		else
> -			ret = hv_call_deposit_pages(NUMA_NO_NODE,
> -						    pt_id, 1);
> +			ret = hv_deposit_memory(pt_id, status);
>   	} while (!ret);
>   
>   	args.status = hv_result(status);
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index b73352a7fc9e..c8e8976839f8 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -344,6 +344,7 @@ static inline bool hv_parent_partition(void)
>   }
>   
>   bool hv_result_oom(u64 status);
> +int hv_deposit_memory_node(int node, u64 partition_id, u64 status);
>   int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
>   int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
>   int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
> @@ -353,6 +354,10 @@ static inline bool hv_root_partition(void) { return false; }
>   static inline bool hv_l1vh_partition(void) { return false; }
>   static inline bool hv_parent_partition(void) { return false; }
>   static inline bool hv_result_oom(u64 status) { return false; }
> +static inline int hv_deposit_memory_node(int node, u64 partition_id, u64 status)
> +{
> +	return -EOPNOTSUPP;
> +}
>   static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
>   {
>   	return -EOPNOTSUPP;
> @@ -367,6 +372,11 @@ static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u3
>   }
>   #endif /* CONFIG_MSHV_ROOT */
>   
> +static inline int hv_deposit_memory(u64 partition_id, u64 status)
> +{
> +	return hv_deposit_memory_node(NUMA_NO_NODE, partition_id, status);
> +}
> +
>   #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
>   u8 __init get_vtl(void);
>   #else
> 
> 


^ permalink raw reply

* Re: [PATCH v0 05/15] mshv: Declarations and definitions for VFIO-MSHV bridge device
From: Mukesh R @ 2026-01-24  0:36 UTC (permalink / raw)
  To: Nuno Das Neves, linux-kernel, linux-hyperv, linux-arm-kernel,
	iommu, linux-pci, linux-arch
  Cc: kys, haiyangz, wei.liu, decui, longli, catalin.marinas, will,
	tglx, mingo, bp, dave.hansen, hpa, joro, lpieralisi, kwilczynski,
	mani, robh, bhelgaas, arnd, mhklinux, romank
In-Reply-To: <eeb79431-47e6-4334-b97a-4dd64474e539@linux.microsoft.com>

On 1/23/26 10:25, Nuno Das Neves wrote:
> On 1/19/2026 10:42 PM, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> Add data structs needed by the subsequent patch that introduces a new
>> module to implement VFIO-MSHV pseudo device.
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>>   drivers/hv/mshv_root.h    | 23 +++++++++++++++++++++++
>>   include/uapi/linux/mshv.h | 31 +++++++++++++++++++++++++++++++
>>   2 files changed, 54 insertions(+)
>>
>> diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
>> index c3753b009fd8..42e1da1d545b 100644
>> --- a/drivers/hv/mshv_root.h
>> +++ b/drivers/hv/mshv_root.h
>> @@ -220,6 +220,29 @@ struct port_table_info {
>>   	};
>>   };
>>   
>> +struct mshv_device {
>> +	const struct mshv_device_ops *device_ops;
>> +	struct mshv_partition *device_pt;
>> +	void *device_private;
>> +	struct hlist_node device_ptnode;
>> +};
>> +
>> +struct mshv_device_ops {
>> +	const char *device_name;
>> +	long (*device_create)(struct mshv_device *dev, u32 type);
>> +	void (*device_release)(struct mshv_device *dev);
>> +	long (*device_set_attr)(struct mshv_device *dev,
>> +				struct mshv_device_attr *attr);
>> +	long (*device_has_attr)(struct mshv_device *dev,
>> +				struct mshv_device_attr *attr);
>> +};
>> +
>> +extern struct mshv_device_ops mshv_vfio_device_ops;
>> +int mshv_vfio_ops_init(void);
>> +void mshv_vfio_ops_exit(void);
>> +long mshv_partition_ioctl_create_device(struct mshv_partition *partition,
>> +					void __user *user_args);
>> +
>>   int mshv_update_routing_table(struct mshv_partition *partition,
>>   			      const struct mshv_user_irq_entry *entries,
>>   			      unsigned int numents);
>> diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h
>> index dee3ece28ce5..b7b10f9e2896 100644
>> --- a/include/uapi/linux/mshv.h
>> +++ b/include/uapi/linux/mshv.h
>> @@ -252,6 +252,7 @@ struct mshv_root_hvcall {
>>   #define MSHV_GET_GPAP_ACCESS_BITMAP	_IOWR(MSHV_IOCTL, 0x06, struct mshv_gpap_access_bitmap)
>>   /* Generic hypercall */
>>   #define MSHV_ROOT_HVCALL		_IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall)
>> +#define MSHV_CREATE_DEVICE		_IOWR(MSHV_IOCTL, 0x08, struct mshv_create_device)
>>   
> 
> With this commit, the IOCTL number is exposed to userspace but it doesn't work.
> Ideally the IOCTL number should be added in the commit where it becomes usable.
> 


Correct, I switched it because the next patch won't compile without it as
it needs the declarations here. It could be combined into one big patch,
but I think normally one would not expect full functionality until the
release is certified to be that feature compliant anyways. Hope that
makes sense.

Thanks,
-Mukesh




>>   /*
>>    ********************************
>> @@ -402,4 +403,34 @@ struct mshv_sint_mask {
>>   /* hv_hvcall device */
>>   #define MSHV_HVCALL_SETUP        _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup)
>>   #define MSHV_HVCALL              _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall)
>> +
>> +/* device passhthru */
>> +#define MSHV_CREATE_DEVICE_TEST		1
>> +
>> +enum {
>> +	MSHV_DEV_TYPE_VFIO,
>> +	MSHV_DEV_TYPE_MAX,
>> +};
>> +
>> +struct mshv_create_device {
>> +	__u32	type;	     /* in: MSHV_DEV_TYPE_xxx */
>> +	__u32	fd;	     /* out: device handle */
>> +	__u32	flags;	     /* in: MSHV_CREATE_DEVICE_xxx */
>> +};
>> +
>> +#define MSHV_DEV_VFIO_FILE      1
>> +#define MSHV_DEV_VFIO_FILE_ADD	1
>> +#define MSHV_DEV_VFIO_FILE_DEL	2
>> +
>> +struct mshv_device_attr {
>> +	__u32	flags;		/* no flags currently defined */
>> +	__u32	group;		/* device-defined */
>> +	__u64	attr;		/* group-defined */
>> +	__u64	addr;		/* userspace address of attr data */
>> +};
>> +
>> +/* Device fds created with MSHV_CREATE_DEVICE */
>> +#define MSHV_SET_DEVICE_ATTR	_IOW(MSHV_IOCTL, 0x00, struct mshv_device_attr)
>> +#define MSHV_HAS_DEVICE_ATTR	_IOW(MSHV_IOCTL, 0x01, struct mshv_device_attr)
>> +
>>   #endif


^ permalink raw reply

* Re: [PATCH v0 06/15] mshv: Implement mshv bridge device for VFIO
From: Mukesh R @ 2026-01-24  0:37 UTC (permalink / raw)
  To: Nuno Das Neves, linux-kernel, linux-hyperv, linux-arm-kernel,
	iommu, linux-pci, linux-arch
  Cc: kys, haiyangz, wei.liu, decui, longli, catalin.marinas, will,
	tglx, mingo, bp, dave.hansen, hpa, joro, lpieralisi, kwilczynski,
	mani, robh, bhelgaas, arnd, mhklinux, romank
In-Reply-To: <82082744-2e2f-4ea2-8ab4-cdd3e69ffd8f@linux.microsoft.com>

On 1/23/26 10:32, Nuno Das Neves wrote:
> On 1/19/2026 10:42 PM, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> Add a new file to implement VFIO-MSHV bridge pseudo device. These
>> functions are called in the VFIO framework, and credits to kvm/vfio.c
>> as this file was adapted from it.
>>
>> Original author: Wei Liu <wei.liu@kernel.org>
>> (Slightly modified from the original version).
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> 
> Since the code is very similar to Wei's original commit, the way I'd
> recommend to do it is:
> 1. Change the commit author to Wei, using git commit --amend --author=
> and
> 2. Put his signed-off line before yours:
> 
> Signed-off-by: Wei Liu <wei.liu@kernel.org>
> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> 
> This shows he is the author of the commit but you ported it.
> 
> If you feel you changed it enough that it should be considered
> co-authored, you can instead keep your authorship of the commit and
> put:
> 
> Co-developed-by: Wei Liu <wei.liu@kernel.org>
> Signed-off-by: Wei Liu <wei.liu@kernel.org>
> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>

Perfect! Thank you, that is exactly the information I was trying to
seek... makes sense.

Thanks,
-Mukesh



>> ---
>>   drivers/hv/Makefile    |   3 +-
>>   drivers/hv/mshv_vfio.c | 210 +++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 212 insertions(+), 1 deletion(-)
>>   create mode 100644 drivers/hv/mshv_vfio.c
>>
>> diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
>> index a49f93c2d245..eae003c4cb8f 100644
>> --- a/drivers/hv/Makefile
>> +++ b/drivers/hv/Makefile
>> @@ -14,7 +14,8 @@ hv_vmbus-y := vmbus_drv.o \
>>   hv_vmbus-$(CONFIG_HYPERV_TESTING)	+= hv_debugfs.o
>>   hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
>>   mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
>> -	       mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
>> +	       mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o \
>> +               mshv_vfio.o
>>   mshv_vtl-y := mshv_vtl_main.o
>>   
>>   # Code that must be built-in
>> diff --git a/drivers/hv/mshv_vfio.c b/drivers/hv/mshv_vfio.c
>> new file mode 100644
>> index 000000000000..6ea4d99a3bd2
>> --- /dev/null
>> +++ b/drivers/hv/mshv_vfio.c
>> @@ -0,0 +1,210 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/*
>> + * VFIO-MSHV bridge pseudo device
>> + *
>> + * Heavily inspired by the VFIO-KVM bridge pseudo device.
>> + */
>> +#include <linux/errno.h>
>> +#include <linux/file.h>
>> +#include <linux/list.h>
>> +#include <linux/module.h>
>> +#include <linux/mutex.h>
>> +#include <linux/slab.h>
>> +#include <linux/vfio.h>
>> +
>> +#include "mshv.h"
>> +#include "mshv_root.h"
>> +
>> +struct mshv_vfio_file {
>> +	struct list_head node;
>> +	struct file *file;	/* list of struct mshv_vfio_file */
>> +};
>> +
>> +struct mshv_vfio {
>> +	struct list_head file_list;
>> +	struct mutex lock;
>> +};
>> +
>> +static bool mshv_vfio_file_is_valid(struct file *file)
>> +{
>> +	bool (*fn)(struct file *file);
>> +	bool ret;
>> +
>> +	fn = symbol_get(vfio_file_is_valid);
>> +	if (!fn)
>> +		return false;
>> +
>> +	ret = fn(file);
>> +
>> +	symbol_put(vfio_file_is_valid);
>> +
>> +	return ret;
>> +}
>> +
>> +static long mshv_vfio_file_add(struct mshv_device *mshvdev, unsigned int fd)
>> +{
>> +	struct mshv_vfio *mshv_vfio = mshvdev->device_private;
>> +	struct mshv_vfio_file *mvf;
>> +	struct file *filp;
>> +	long ret = 0;
>> +
>> +	filp = fget(fd);
>> +	if (!filp)
>> +		return -EBADF;
>> +
>> +	/* Ensure the FD is a vfio FD. */
>> +	if (!mshv_vfio_file_is_valid(filp)) {
>> +		ret = -EINVAL;
>> +		goto out_fput;
>> +	}
>> +
>> +	mutex_lock(&mshv_vfio->lock);
>> +
>> +	list_for_each_entry(mvf, &mshv_vfio->file_list, node) {
>> +		if (mvf->file == filp) {
>> +			ret = -EEXIST;
>> +			goto out_unlock;
>> +		}
>> +	}
>> +
>> +	mvf = kzalloc(sizeof(*mvf), GFP_KERNEL_ACCOUNT);
>> +	if (!mvf) {
>> +		ret = -ENOMEM;
>> +		goto out_unlock;
>> +	}
>> +
>> +	mvf->file = get_file(filp);
>> +	list_add_tail(&mvf->node, &mshv_vfio->file_list);
>> +
>> +out_unlock:
>> +	mutex_unlock(&mshv_vfio->lock);
>> +out_fput:
>> +	fput(filp);
>> +	return ret;
>> +}
>> +
>> +static long mshv_vfio_file_del(struct mshv_device *mshvdev, unsigned int fd)
>> +{
>> +	struct mshv_vfio *mshv_vfio = mshvdev->device_private;
>> +	struct mshv_vfio_file *mvf;
>> +	long ret;
>> +
>> +	CLASS(fd, f)(fd);
>> +
>> +	if (fd_empty(f))
>> +		return -EBADF;
>> +
>> +	ret = -ENOENT;
>> +	mutex_lock(&mshv_vfio->lock);
>> +
>> +	list_for_each_entry(mvf, &mshv_vfio->file_list, node) {
>> +		if (mvf->file != fd_file(f))
>> +			continue;
>> +
>> +		list_del(&mvf->node);
>> +		fput(mvf->file);
>> +		kfree(mvf);
>> +		ret = 0;
>> +		break;
>> +	}
>> +
>> +	mutex_unlock(&mshv_vfio->lock);
>> +	return ret;
>> +}
>> +
>> +static long mshv_vfio_set_file(struct mshv_device *mshvdev, long attr,
>> +			      void __user *arg)
>> +{
>> +	int32_t __user *argp = arg;
>> +	int32_t fd;
>> +
>> +	switch (attr) {
>> +	case MSHV_DEV_VFIO_FILE_ADD:
>> +		if (get_user(fd, argp))
>> +			return -EFAULT;
>> +		return mshv_vfio_file_add(mshvdev, fd);
>> +
>> +	case MSHV_DEV_VFIO_FILE_DEL:
>> +		if (get_user(fd, argp))
>> +			return -EFAULT;
>> +		return mshv_vfio_file_del(mshvdev, fd);
>> +	}
>> +
>> +	return -ENXIO;
>> +}
>> +
>> +static long mshv_vfio_set_attr(struct mshv_device *mshvdev,
>> +			      struct mshv_device_attr *attr)
>> +{
>> +	switch (attr->group) {
>> +	case MSHV_DEV_VFIO_FILE:
>> +		return mshv_vfio_set_file(mshvdev, attr->attr,
>> +					  u64_to_user_ptr(attr->addr));
>> +	}
>> +
>> +	return -ENXIO;
>> +}
>> +
>> +static long mshv_vfio_has_attr(struct mshv_device *mshvdev,
>> +			      struct mshv_device_attr *attr)
>> +{
>> +	switch (attr->group) {
>> +	case MSHV_DEV_VFIO_FILE:
>> +		switch (attr->attr) {
>> +		case MSHV_DEV_VFIO_FILE_ADD:
>> +		case MSHV_DEV_VFIO_FILE_DEL:
>> +			return 0;
>> +		}
>> +
>> +		break;
>> +	}
>> +
>> +	return -ENXIO;
>> +}
>> +
>> +static long mshv_vfio_create_device(struct mshv_device *mshvdev, u32 type)
>> +{
>> +	struct mshv_device *tmp;
>> +	struct mshv_vfio *mshv_vfio;
>> +
>> +	/* Only one VFIO "device" per VM */
>> +	hlist_for_each_entry(tmp, &mshvdev->device_pt->pt_devices,
>> +			     device_ptnode)
>> +		if (tmp->device_ops == &mshv_vfio_device_ops)
>> +			return -EBUSY;
>> +
>> +	mshv_vfio = kzalloc(sizeof(*mshv_vfio), GFP_KERNEL_ACCOUNT);
>> +	if (mshv_vfio == NULL)
>> +		return -ENOMEM;
>> +
>> +	INIT_LIST_HEAD(&mshv_vfio->file_list);
>> +	mutex_init(&mshv_vfio->lock);
>> +
>> +	mshvdev->device_private = mshv_vfio;
>> +
>> +	return 0;
>> +}
>> +
>> +/* This is called from mshv_device_fop_release() */
>> +static void mshv_vfio_release_device(struct mshv_device *mshvdev)
>> +{
>> +	struct mshv_vfio *mv = mshvdev->device_private;
>> +	struct mshv_vfio_file *mvf, *tmp;
>> +
>> +	list_for_each_entry_safe(mvf, tmp, &mv->file_list, node) {
>> +		fput(mvf->file);
>> +		list_del(&mvf->node);
>> +		kfree(mvf);
>> +	}
>> +
>> +	kfree(mv);
>> +	kfree(mshvdev);
>> +}
>> +
>> +struct mshv_device_ops mshv_vfio_device_ops = {
>> +	.device_name = "mshv-vfio",
>> +	.device_create = mshv_vfio_create_device,
>> +	.device_release = mshv_vfio_release_device,
>> +	.device_set_attr = mshv_vfio_set_attr,
>> +	.device_has_attr = mshv_vfio_has_attr,
>> +};
> 


^ permalink raw reply

* Re: [PATCH v0 09/15] mshv: Import data structs around device domains and irq remapping
From: Mukesh R @ 2026-01-24  0:38 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux, romank
In-Reply-To: <aW_-9KO2DrVvmvSs@skinsburskii.localdomain>

On 1/20/26 14:17, Stanislav Kinsburskii wrote:
> On Mon, Jan 19, 2026 at 10:42:24PM -0800, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> Import/copy from Hyper-V public headers, definitions and declarations that
>> are related to attaching and detaching of device domains and interrupt
>> remapping, and building device ids for those purposes.
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>>   include/hyperv/hvgdk_mini.h |  11 ++++
>>   include/hyperv/hvhdk_mini.h | 112 ++++++++++++++++++++++++++++++++++++
>>   2 files changed, 123 insertions(+)
>>
> 
> <snip>
> 
>> +/* ID for stage 2 default domain and NULL domain */
>> +#define HV_DEVICE_DOMAIN_ID_S2_DEFAULT 0
>> +#define HV_DEVICE_DOMAIN_ID_S2_NULL    0xFFFFFFFFULL
>> +
>> +union hv_device_domain_id {
>> +	u64 as_uint64;
>> +	struct {
>> +		u32 type : 4;
>> +		u32 reserved : 28;
>> +		u32 id;
>> +	};
>> +} __packed;
> 
> Shouldn't the inner struct be packed instead?
> 
>> +
>> +struct hv_input_device_domain { /* HV_INPUT_DEVICE_DOMAIN */
>> +	u64 partition_id;
>> +	union hv_input_vtl owner_vtl;
>> +	u8 padding[7];
>> +	union hv_device_domain_id domain_id;
>> +} __packed;
>> +
>> +union hv_create_device_domain_flags {	/* HV_CREATE_DEVICE_DOMAIN_FLAGS */
>> +	u32 as_uint32;
>> +	struct {
>> +		u32 forward_progress_required : 1;
>> +		u32 inherit_owning_vtl : 1;
>> +		u32 reserved : 30;
>> +	} __packed;
>> +} __packed;
> 
> Why should the union be packed?

 From GCC docs:

Specifying this attribute for struct and union types is equivalent to
specifying the packed attribute on each of the structure or union members.

Thanks,
-Mukesh



> Thanks,
> Stanislav
> 
>> +
>> +struct hv_input_create_device_domain {	/* HV_INPUT_CREATE_DEVICE_DOMAIN */
>> +	struct hv_input_device_domain device_domain;
>> +	union hv_create_device_domain_flags create_device_domain_flags;
>> +} __packed;
>> +
>> +struct hv_input_delete_device_domain {	/* HV_INPUT_DELETE_DEVICE_DOMAIN */
>> +	struct hv_input_device_domain device_domain;
>> +} __packed;
>> +
>> +struct hv_input_attach_device_domain {	/* HV_INPUT_ATTACH_DEVICE_DOMAIN */
>> +	struct hv_input_device_domain device_domain;
>> +	union hv_device_id device_id;
>> +} __packed;
>> +
>> +struct hv_input_detach_device_domain {	/* HV_INPUT_DETACH_DEVICE_DOMAIN */
>> +	u64 partition_id;
>> +	union hv_device_id device_id;
>> +} __packed;
>> +
>> +struct hv_input_map_device_gpa_pages {	/* HV_INPUT_MAP_DEVICE_GPA_PAGES */
>> +	struct hv_input_device_domain device_domain;
>> +	union hv_input_vtl target_vtl;
>> +	u8 padding[3];
>> +	u32 map_flags;
>> +	u64 target_device_va_base;
>> +	u64 gpa_page_list[];
>> +} __packed;
>> +
>> +struct hv_input_unmap_device_gpa_pages {  /* HV_INPUT_UNMAP_DEVICE_GPA_PAGES */
>> +	struct hv_input_device_domain device_domain;
>> +	u64 target_device_va_base;
>> +} __packed;
>> +
>>   #endif /* _HV_HVHDK_MINI_H */
>> -- 
>> 2.51.2.vfs.0.1
>>


^ permalink raw reply

* Re: [PATCH v0 10/15] PCI: hv: Build device id for a VMBus device
From: Mukesh R @ 2026-01-24  0:42 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux, romank
In-Reply-To: <aXAAH4G9ztAGDWuy@skinsburskii.localdomain>

On 1/20/26 14:22, Stanislav Kinsburskii wrote:
> On Mon, Jan 19, 2026 at 10:42:25PM -0800, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> On Hyper-V, most hypercalls related to PCI passthru to map/unmap regions,
>> interrupts, etc need a device id as a parameter. This device id refers
>> to that specific device during the lifetime of passthru.
>>
>> An L1VH VM only contains VMBus based devices. A device id for a VMBus
>> device is slightly different in that it uses the hv_pcibus_device info
>> for building it to make sure it matches exactly what the hypervisor
>> expects. This VMBus based device id is needed when attaching devices in
>> an L1VH based guest VM. Before building it, a check is done to make sure
>> the device is a valid VMBus device.
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>>   arch/x86/include/asm/mshyperv.h     |  2 ++
>>   drivers/pci/controller/pci-hyperv.c | 29 +++++++++++++++++++++++++++++
>>   2 files changed, 31 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
>> index eef4c3a5ba28..0d7fdfb25e76 100644
>> --- a/arch/x86/include/asm/mshyperv.h
>> +++ b/arch/x86/include/asm/mshyperv.h
>> @@ -188,6 +188,8 @@ bool hv_vcpu_is_preempted(int vcpu);
>>   static inline void hv_apic_init(void) {}
>>   #endif
>>   
>> +u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
>> +
>>   struct irq_domain *hv_create_pci_msi_domain(void);
>>   
>>   int hv_map_msi_interrupt(struct irq_data *data,
>> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
>> index 8bc6a38c9b5a..40f0b06bb966 100644
>> --- a/drivers/pci/controller/pci-hyperv.c
>> +++ b/drivers/pci/controller/pci-hyperv.c
>> @@ -579,6 +579,8 @@ static void hv_pci_onchannelcallback(void *context);
>>   #define DELIVERY_MODE		APIC_DELIVERY_MODE_FIXED
>>   #define HV_MSI_CHIP_FLAGS	MSI_CHIP_FLAG_SET_ACK
>>   
>> +static bool hv_vmbus_pci_device(struct pci_bus *pbus);
>> +
> 
> Why not moving this static function definition above the called instead of
> defining the prototype?

Did you see the function implementation? It has other dependencies that
are later, it would need code reorg.

Thanks,
-Mukesh


>>   static int hv_pci_irqchip_init(void)
>>   {
>>   	return 0;
>> @@ -598,6 +600,26 @@ static unsigned int hv_msi_get_int_vector(struct irq_data *data)
>>   
>>   #define hv_msi_prepare		pci_msi_prepare
>>   
>> +u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
>> +{
>> +	u64 u64val;
> 
> This variable is redundant.

Not really. It helps with debug by putting a quick print, and is
harmless.

>> +	struct hv_pcibus_device *hbus;
>> +	struct pci_bus *pbus = pdev->bus;
>> +
>> +	if (!hv_vmbus_pci_device(pbus))
>> +		return 0;
>> +
>> +	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
>> +	u64val = (hbus->hdev->dev_instance.b[5] << 24) |
>> +		 (hbus->hdev->dev_instance.b[4] << 16) |
>> +		 (hbus->hdev->dev_instance.b[7] << 8) |
>> +		 (hbus->hdev->dev_instance.b[6] & 0xf8) |
>> +		 PCI_FUNC(pdev->devfn);
>> +
> 
> It looks like this value always fits into 32 bit, so what is the value
> in returning 64 bit?

The ABI has device id defined as 64bits where this is assigned.

Thanks,
-Mukesh




> Thanks,
> Stanislav
> 
>> +	return u64val;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_pci_vmbus_device_id);
>> +
>>   /**
>>    * hv_irq_retarget_interrupt() - "Unmask" the IRQ by setting its current
>>    * affinity.
>> @@ -1404,6 +1426,13 @@ static struct pci_ops hv_pcifront_ops = {
>>   	.write = hv_pcifront_write_config,
>>   };
>>   
>> +#ifdef CONFIG_X86
>> +static bool hv_vmbus_pci_device(struct pci_bus *pbus)
>> +{
>> +	return pbus->ops == &hv_pcifront_ops;
>> +}
>> +#endif /* CONFIG_X86 */
>> +
>>   /*
>>    * Paravirtual backchannel
>>    *
>> -- 
>> 2.51.2.vfs.0.1
>>


^ permalink raw reply

* Re: [PATCH v0 11/15] x86/hyperv: Build logical device ids for PCI passthru hcalls
From: Mukesh R @ 2026-01-24  0:44 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux, romank
In-Reply-To: <aXABVTS6xDb2GB2s@skinsburskii.localdomain>

On 1/20/26 14:27, Stanislav Kinsburskii wrote:
> On Mon, Jan 19, 2026 at 10:42:26PM -0800, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> On Hyper-V, most hypercalls related to PCI passthru to map/unmap regions,
>> interrupts, etc need a device id as a parameter. A device id refers
>> to a specific device. A device id is of two types:
>>     o Logical: used for direct attach (see below) hypercalls. A logical
>>                device id is a unique 62bit value that is created and
>>                sent during the initial device attach. Then all further
>>                communications (for interrupt remaps etc) must use this
>>                logical id.
>>     o PCI: used for device domain hypercalls such as map, unmap, etc.
>>            This is built using actual device BDF info.
>>
>>     PS: Since an L1VH only supports direct attaches, a logical device id
>>         on an L1VH VM is always a VMBus device id. For non-L1VH cases,
>>         we just use PCI BDF info, altho not strictly needed, to build the
>>         logical device id.
>>
>> At a high level, Hyper-V supports two ways to do PCI passthru:
>>    1. Device Domain: root must create a device domain in the hypervisor,
>>       and do map/unmap hypercalls for mapping and unmapping guest RAM.
>>       All hypervisor communications use device id of type PCI for
>>       identifying and referencing the device.
>>
>>    2. Direct Attach: the hypervisor will simply use the guest's HW
>>       page table for mappings, thus the host need not do map/unmap
>>       hypercalls. A direct attached device must be referenced
>>       via logical device id and never via the PCI device id. For an
>>       L1VH root/parent, Hyper-V only supports direct attaches.
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>>   arch/x86/hyperv/irqdomain.c     | 60 ++++++++++++++++++++++++++++++---
>>   arch/x86/include/asm/mshyperv.h | 14 ++++++++
>>   2 files changed, 70 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
>> index ccbe5848a28f..33017aa0caa4 100644
>> --- a/arch/x86/hyperv/irqdomain.c
>> +++ b/arch/x86/hyperv/irqdomain.c
>> @@ -137,7 +137,7 @@ static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data)
>>   	return 0;
>>   }
>>   
>> -static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
>> +static u64 hv_build_devid_type_pci(struct pci_dev *pdev)
>>   {
>>   	int pos;
>>   	union hv_device_id hv_devid;
>> @@ -197,7 +197,58 @@ static union hv_device_id hv_build_devid_type_pci(struct pci_dev *pdev)
>>   	}
>>   
>>   out:
>> -	return hv_devid;
>> +	return hv_devid.as_uint64;
>> +}
>> +
>> +/* Build device id for direct attached devices */
>> +static u64 hv_build_devid_type_logical(struct pci_dev *pdev)
>> +{
>> +	hv_pci_segment segment;
>> +	union hv_device_id hv_devid;
>> +	union hv_pci_bdf bdf = {.as_uint16 = 0};
>> +	struct rid_data data = {
>> +		.bridge = NULL,
>> +		.rid = PCI_DEVID(pdev->bus->number, pdev->devfn)
>> +	};
>> +
>> +	segment = pci_domain_nr(pdev->bus);
>> +	bdf.bus = PCI_BUS_NUM(data.rid);
>> +	bdf.device = PCI_SLOT(data.rid);
>> +	bdf.function = PCI_FUNC(data.rid);
>> +
>> +	hv_devid.as_uint64 = 0;
>> +	hv_devid.device_type = HV_DEVICE_TYPE_LOGICAL;
>> +	hv_devid.logical.id = (u64)segment << 16 | bdf.as_uint16;
>> +
>> +	return hv_devid.as_uint64;
>> +}
>> +
>> +/* Build device id after the device has been attached */
>> +u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type)
>> +{
>> +	if (type == HV_DEVICE_TYPE_LOGICAL) {
>> +		if (hv_l1vh_partition())
>> +			return hv_pci_vmbus_device_id(pdev);
> 
> Should this one be renamed into hv_build_devid_type_vmbus() to align
> with the other two function names?

No, because hyperv only defines two types of device ids, and it would
unnecessary at to confusion. vmbus uses one the two types of device
ids.


> Thanks,
> Stanislav
> 
>> +		else
>> +			return hv_build_devid_type_logical(pdev);
>> +	} else if (type == HV_DEVICE_TYPE_PCI)
>> +		return hv_build_devid_type_pci(pdev);
>> +
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_build_devid_oftype);
>> +
>> +/* Build device id for the interrupt path */
>> +static u64 hv_build_irq_devid(struct pci_dev *pdev)
>> +{
>> +	enum hv_device_type dev_type;
>> +
>> +	if (hv_pcidev_is_attached_dev(pdev) || hv_l1vh_partition())
>> +		dev_type = HV_DEVICE_TYPE_LOGICAL;
>> +	else
>> +		dev_type = HV_DEVICE_TYPE_PCI;
>> +
>> +	return hv_build_devid_oftype(pdev, dev_type);
>>   }
>>   
>>   /*
>> @@ -221,7 +272,7 @@ int hv_map_msi_interrupt(struct irq_data *data,
>>   
>>   	msidesc = irq_data_get_msi_desc(data);
>>   	pdev = msi_desc_to_pci_dev(msidesc);
>> -	hv_devid = hv_build_devid_type_pci(pdev);
>> +	hv_devid.as_uint64 = hv_build_irq_devid(pdev);
>>   	cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
>>   
>>   	return hv_map_interrupt(hv_current_partition_id, hv_devid, false, cpu,
>> @@ -296,7 +347,8 @@ static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
>>   {
>>   	union hv_device_id hv_devid;
>>   
>> -	hv_devid = hv_build_devid_type_pci(pdev);
>> +	hv_devid.as_uint64 = hv_build_irq_devid(pdev);
>> +
>>   	return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
>>   }
>>   
>> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
>> index 0d7fdfb25e76..97477c5a8487 100644
>> --- a/arch/x86/include/asm/mshyperv.h
>> +++ b/arch/x86/include/asm/mshyperv.h
>> @@ -188,6 +188,20 @@ bool hv_vcpu_is_preempted(int vcpu);
>>   static inline void hv_apic_init(void) {}
>>   #endif
>>   
>> +#if IS_ENABLED(CONFIG_HYPERV_IOMMU)
>> +static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>> +{ return false; }       /* temporary */
>> +u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type);
>> +#else	/* CONFIG_HYPERV_IOMMU */
>> +static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>> +{ return false; }
>> +
>> +static inline u64 hv_build_devid_oftype(struct pci_dev *pdev,
>> +				       enum hv_device_type type)
>> +{ return 0; }
>> +
>> +#endif	/* CONFIG_HYPERV_IOMMU */
>> +
>>   u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
>>   
>>   struct irq_domain *hv_create_pci_msi_domain(void);
>> -- 
>> 2.51.2.vfs.0.1
>>
> 
> 
> 
> 
> 
> 
> 
> 


^ permalink raw reply

* RE: [PATCH v4 6/7] mshv: Add data for printing stats page counters
From: Michael Kelley @ 2026-01-24  0:44 UTC (permalink / raw)
  To: Nuno Das Neves, Stanislav Kinsburskii
  Cc: linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org,
	kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	prapal@linux.microsoft.com, mrathor@linux.microsoft.com,
	paekkaladevi@linux.microsoft.com
In-Reply-To: <dbe3960d-c765-4394-87ce-e11c051cde44@linux.microsoft.com>

From: Nuno Das Neves <nunodasneves@linux.microsoft.com> Sent: Friday, January 23, 2026 4:13 PM
> 
> On 1/23/2026 2:31 PM, Stanislav Kinsburskii wrote:
> > On Fri, Jan 23, 2026 at 11:04:52AM -0800, Nuno Das Neves wrote:
> >> On 1/23/2026 9:09 AM, Michael Kelley wrote:
> >>> From: Nuno Das Neves <nunodasneves@linux.microsoft.com> Sent: Wednesday, January 21, 2026 1:46 PM
> >>>>
> >>>> Introduce hv_counters.c, containing static data corresponding to
> >>>> HV_*_COUNTER enums in the hypervisor source. Defining the enum
> >>>> members as an array instead makes more sense, since it will be
> >>>> iterated over to print counter information to debugfs.
> >>>
> >>> I would have expected the filename to be mshv_counters.c, so that the association
> >>> with the MS hypervisor is clear. And the file is inextricably linked to mshv_debugfs.c,
> >>> which of course has the "mshv_" prefix. Or is there some thinking I'm not aware of
> >>> for using the "hv_" prefix?
> >>>
> >> Good question - I originally thought of using hv_ because the definitions inside are
> >> part of the hypervisor ABI, and hence also have the hv_ prefix.
> >>
> >> However you have a good point, and I'm not opposed to changing it.
> >>
> >> Maybe to just be super explicit: "mshv_debugfs_counters.c" ?
> >>
> >
> > This is reudnant from my POV.
> > If these counters are only used by mshv_debugfs.c, then should rather be
> > a part of this file.
> > What was the reason to move them elsewhere?
> >
> 
> Just a matter of taste - so there isn't ~450 lines of definitions at the beginning of
> mshv_debugfs.c. But I'm not fussed. If you think it's better to just prepend the
> definitions to mshv_debugfs.c, then that's an easy change.
> 
> Nuno

FWIW, I preferred the separate file so that the main debugfs code
isn't burdened with 450 lines of definitions that aren't going to be
edited/revised/improved via the typical processes. The current
mshv_debugfs.c is a reasonable 700 lines of code without all the
definitions.

But it's not a big deal for me either way.

Michael

> 
> > Thanks,
> > Stanislav
> >
> >>> Also, I see in Patch 7 of this series that hv_counters.c is #included as a .c file
> >>> in mshv_debugfs.c. Is there a reason for doing the #include instead of adding
> >>> hv_counters.c to the Makefile and building it on its own? You would need to
> >>> add a handful of extern statements to mshv_root.h so that the tables are
> >>> referenceable from mshv_debugfs.c. But that would seem to be the more
> >>> normal way of doing things.  #including a .c file is unusual.
> >>>
> >>
> >> Yes...I thought I could avoid noise in mshv_root.h and the Makefile, since it's
> >> only relevant for mshv_debugfs.c. However I could see this file (whether as .c or
> >> .h) being misused and included elsewhere inadvertantly, which would duplicate the
> >> tables, so maybe doing it the normal way is a better idea, even if mshv_debugfs.c
> >> is likely the only user.
> >>
> >>> See one more comment on the last line of this patch ...
> >>>
> 
> <snip>


^ permalink raw reply

* Re: [PATCH v0 12/15] x86/hyperv: Implement hyperv virtual iommu
From: Mukesh R @ 2026-01-24  1:26 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux, romank
In-Reply-To: <aXAZ-r1PeUBAHwaK@skinsburskii.localdomain>

On 1/20/26 16:12, Stanislav Kinsburskii wrote:
> On Mon, Jan 19, 2026 at 10:42:27PM -0800, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> Add a new file to implement management of device domains, mapping and
>> unmapping of iommu memory, and other iommu_ops to fit within the VFIO
>> framework for PCI passthru on Hyper-V running Linux as root or L1VH
>> parent. This also implements direct attach mechanism for PCI passthru,
>> and it is also made to work within the VFIO framework.
>>
>> At a high level, during boot the hypervisor creates a default identity
>> domain and attaches all devices to it. This nicely maps to Linux iommu
>> subsystem IOMMU_DOMAIN_IDENTITY domain. As a result, Linux does not
>> need to explicitly ask Hyper-V to attach devices and do maps/unmaps
>> during boot. As mentioned previously, Hyper-V supports two ways to do
>> PCI passthru:
>>
>>    1. Device Domain: root must create a device domain in the hypervisor,
>>       and do map/unmap hypercalls for mapping and unmapping guest RAM.
>>       All hypervisor communications use device id of type PCI for
>>       identifying and referencing the device.
>>
>>    2. Direct Attach: the hypervisor will simply use the guest's HW
>>       page table for mappings, thus the host need not do map/unmap
>>       device memory hypercalls. As such, direct attach passthru setup
>>       during guest boot is extremely fast. A direct attached device
>>       must be referenced via logical device id and not via the PCI
>>       device id.
>>
>> At present, L1VH root/parent only supports direct attaches. Also direct
>> attach is default in non-L1VH cases because there are some significant
>> performance issues with device domain implementation currently for guests
>> with higher RAM (say more than 8GB), and that unfortunately cannot be
>> addressed in the short term.
>>
> 
> <snip>
> 
>> +/*
>> + * If the current thread is a VMM thread, return the partition id of the VM it
>> + * is managing, else return HV_PARTITION_ID_INVALID.
>> + */
>> +u64 hv_iommu_get_curr_partid(void)
>> +{
>> +	u64 (*fn)(pid_t pid);
>> +	u64 partid;
>> +
>> +	fn = symbol_get(mshv_pid_to_partid);
>> +	if (!fn)
>> +		return HV_PARTITION_ID_INVALID;
>> +
>> +	partid = fn(current->tgid);
>> +	symbol_put(mshv_pid_to_partid);
>> +
>> +	return partid;
>> +}
>> +
>> +/* If this is a VMM thread, then this domain is for a guest VM */
>> +static bool hv_curr_thread_is_vmm(void)
>> +{
>> +	return hv_iommu_get_curr_partid() != HV_PARTITION_ID_INVALID;
>> +}
>> +
>> +static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap)
>> +{
>> +	switch (cap) {
>> +	case IOMMU_CAP_CACHE_COHERENCY:
>> +		return true;
>> +	default:
>> +		return false;
>> +	}
>> +	return false;
> 
> The return above is never reached.
> 
>> +}
>> +
>> +/*
>> + * Check if given pci device is a direct attached device. Caller must have
>> + * verified pdev is a valid pci device.
>> + */
>> +bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>> +{
>> +	struct iommu_domain *iommu_domain;
>> +	struct hv_domain *hvdom;
>> +	struct device *dev = &pdev->dev;
>> +
>> +	iommu_domain = iommu_get_domain_for_dev(dev);
>> +	if (iommu_domain) {
>> +		hvdom = to_hv_domain(iommu_domain);
> 
> hvdom varaible is redundant.
> 
>> +		return hvdom->attached_dom;
>> +	}
>> +
>> +	return false;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_pcidev_is_attached_dev);
>> +
>> +/* Create a new device domain in the hypervisor */
>> +static int hv_iommu_create_hyp_devdom(struct hv_domain *hvdom)
>> +{
>> +	u64 status;
>> +	unsigned long flags;
>> +	struct hv_input_device_domain *ddp;
>> +	struct hv_input_create_device_domain *input;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	ddp = &input->device_domain;
>> +	ddp->partition_id = HV_PARTITION_ID_SELF;
>> +	ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	ddp->domain_id.id = hvdom->domid_num;
>> +
>> +	input->create_device_domain_flags.forward_progress_required = 1;
>> +	input->create_device_domain_flags.inherit_owning_vtl = 0;
>> +
>> +	status = hv_do_hypercall(HVCALL_CREATE_DEVICE_DOMAIN, input, NULL);
>> +
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +/* During boot, all devices are attached to this */
>> +static struct iommu_domain *hv_iommu_domain_alloc_identity(struct device *dev)
>> +{
>> +	return &hv_def_identity_dom.iommu_dom;
>> +}
>> +
>> +static struct iommu_domain *hv_iommu_domain_alloc_paging(struct device *dev)
>> +{
>> +	struct hv_domain *hvdom;
>> +	int rc;
>> +
>> +	if (hv_l1vh_partition() && !hv_curr_thread_is_vmm() && !hv_no_attdev) {
>> +		pr_err("Hyper-V: l1vh iommu does not support host devices\n");
>> +		return NULL;
>> +	}
>> +
>> +	hvdom = kzalloc(sizeof(struct hv_domain), GFP_KERNEL);
>> +	if (hvdom == NULL)
>> +		goto out;
> 
> Why goto here and not return NULL like above?

Some debug code there got removed. Will fix in next version.

>> +
>> +	spin_lock_init(&hvdom->mappings_lock);
>> +	hvdom->mappings_tree = RB_ROOT_CACHED;
>> +
>> +	if (++unique_id == HV_DEVICE_DOMAIN_ID_S2_DEFAULT)   /* ie, 0 */
>> +		goto out_free;
>> +
>> +	hvdom->domid_num = unique_id;
>> +	hvdom->iommu_dom.geometry = default_geometry;
>> +	hvdom->iommu_dom.pgsize_bitmap = HV_IOMMU_PGSIZES;
>> +
>> +	/* For guests, by default we do direct attaches, so no domain in hyp */
>> +	if (hv_curr_thread_is_vmm() && !hv_no_attdev)
>> +		hvdom->attached_dom = true;
>> +	else {
>> +		rc = hv_iommu_create_hyp_devdom(hvdom);
>> +		if (rc)
>> +			goto out_free_id;
>> +	}
>> +
>> +	return &hvdom->iommu_dom;
>> +
>> +out_free_id:
>> +	unique_id--;
>> +out_free:
>> +	kfree(hvdom);
>> +out:
>> +	return NULL;
>> +}
>> +
>> +static void hv_iommu_domain_free(struct iommu_domain *immdom)
>> +{
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	unsigned long flags;
>> +	u64 status;
>> +	struct hv_input_delete_device_domain *input;
>> +
>> +	if (hv_special_domain(hvdom))
>> +		return;
>> +
>> +	if (hvdom->num_attchd) {
>> +		pr_err("Hyper-V: can't free busy iommu domain (%p)\n", immdom);
>> +		return;
>> +	}
>> +
>> +	if (!hv_curr_thread_is_vmm() || hv_no_attdev) {
>> +		struct hv_input_device_domain *ddp;
>> +
>> +		local_irq_save(flags);
>> +		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +		ddp = &input->device_domain;
>> +		memset(input, 0, sizeof(*input));
>> +
>> +		ddp->partition_id = HV_PARTITION_ID_SELF;
>> +		ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +		ddp->domain_id.id = hvdom->domid_num;
>> +
>> +		status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input,
>> +					 NULL);
>> +		local_irq_restore(flags);
>> +
>> +		if (!hv_result_success(status))
>> +			hv_status_err(status, "\n");
>> +	}
>> +
>> +	kfree(hvdom);
>> +}
>> +
>> +/* Attach a device to a domain previously created in the hypervisor */
>> +static int hv_iommu_att_dev2dom(struct hv_domain *hvdom, struct pci_dev *pdev)
>> +{
>> +	unsigned long flags;
>> +	u64 status;
>> +	enum hv_device_type dev_type;
>> +	struct hv_input_attach_device_domain *input;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +
>> +	/* NB: Upon guest shutdown, device is re-attached to the default domain
>> +	 * without explicit detach.
>> +	 */
>> +	if (hv_l1vh_partition())
>> +		dev_type = HV_DEVICE_TYPE_LOGICAL;
>> +	else
>> +		dev_type = HV_DEVICE_TYPE_PCI;
>> +
>> +	input->device_id.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
>> +
>> +	status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +/* Caller must have validated that dev is a valid pci dev */
>> +static int hv_iommu_direct_attach_device(struct pci_dev *pdev)
>> +{
>> +	struct hv_input_attach_device *input;
>> +	u64 status;
>> +	int rc;
>> +	unsigned long flags;
>> +	union hv_device_id host_devid;
>> +	enum hv_device_type dev_type;
>> +	u64 ptid = hv_iommu_get_curr_partid();
>> +
>> +	if (ptid == HV_PARTITION_ID_INVALID) {
>> +		pr_err("Hyper-V: Invalid partition id in direct attach\n");
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (hv_l1vh_partition())
>> +		dev_type = HV_DEVICE_TYPE_LOGICAL;
>> +	else
>> +		dev_type = HV_DEVICE_TYPE_PCI;
>> +
>> +	host_devid.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
>> +
>> +	do {
>> +		local_irq_save(flags);
>> +		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +		memset(input, 0, sizeof(*input));
>> +		input->partition_id = ptid;
>> +		input->device_id = host_devid;
>> +
>> +		/* Hypervisor associates logical_id with this device, and in
>> +		 * some hypercalls like retarget interrupts, logical_id must be
>> +		 * used instead of the BDF. It is a required parameter.
>> +		 */
>> +		input->attdev_flags.logical_id = 1;
>> +		input->logical_devid =
>> +			   hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_LOGICAL);
>> +
>> +		status = hv_do_hypercall(HVCALL_ATTACH_DEVICE, input, NULL);
>> +		local_irq_restore(flags);
>> +
>> +		if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
>> +			rc = hv_call_deposit_pages(NUMA_NO_NODE, ptid, 1);
>> +			if (rc)
>> +				break;
>> +		}
>> +	} while (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +/* This to attach a device to both host app (like DPDK) and a guest VM */
>> +static int hv_iommu_attach_dev(struct iommu_domain *immdom, struct device *dev,
>> +			       struct iommu_domain *old)
>> +{
>> +	struct pci_dev *pdev;
>> +	int rc;
>> +	struct hv_domain *hvdom_new = to_hv_domain(immdom);
>> +	struct hv_domain *hvdom_prev = dev_iommu_priv_get(dev);
>> +
>> +	/* Only allow PCI devices for now */
>> +	if (!dev_is_pci(dev))
>> +		return -EINVAL;
>> +
>> +	pdev = to_pci_dev(dev);
>> +
>> +	/* l1vh does not support host device (eg DPDK) passthru */
>> +	if (hv_l1vh_partition() && !hv_special_domain(hvdom_new) &&
>> +	    !hvdom_new->attached_dom)
>> +		return -EINVAL;
>> +
>> +	/*
>> +	 * VFIO does not do explicit detach calls, hence check first if we need
>> +	 * to detach first. Also, in case of guest shutdown, it's the VMM
>> +	 * thread that attaches it back to the hv_def_identity_dom, and
>> +	 * hvdom_prev will not be null then. It is null during boot.
>> +	 */
>> +	if (hvdom_prev)
>> +		if (!hv_l1vh_partition() || !hv_special_domain(hvdom_prev))
>> +			hv_iommu_detach_dev(&hvdom_prev->iommu_dom, dev);
>> +
>> +	if (hv_l1vh_partition() && hv_special_domain(hvdom_new)) {
>> +		dev_iommu_priv_set(dev, hvdom_new);  /* sets "private" field */
>> +		return 0;
>> +	}
>> +
>> +	if (hvdom_new->attached_dom)
>> +		rc = hv_iommu_direct_attach_device(pdev);
>> +	else
>> +		rc = hv_iommu_att_dev2dom(hvdom_new, pdev);
>> +
>> +	if (rc && hvdom_prev) {
>> +		int rc1;
>> +
>> +		if (hvdom_prev->attached_dom)
>> +			rc1 = hv_iommu_direct_attach_device(pdev);
>> +		else
>> +			rc1 = hv_iommu_att_dev2dom(hvdom_prev, pdev);
>> +
>> +		if (rc1)
>> +			pr_err("Hyper-V: iommu could not restore orig device state.. dev:%s\n",
>> +			       dev_name(dev));
>> +	}
>> +
>> +	if (rc == 0) {
>> +		dev_iommu_priv_set(dev, hvdom_new);  /* sets "private" field */
>> +		hvdom_new->num_attchd++;
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +static void hv_iommu_det_dev_from_guest(struct hv_domain *hvdom,
>> +					struct pci_dev *pdev)
>> +{
>> +	struct hv_input_detach_device *input;
>> +	u64 status, log_devid;
>> +	unsigned long flags;
>> +
>> +	log_devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_LOGICAL);
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->partition_id = hv_iommu_get_curr_partid();
>> +	input->logical_devid = log_devid;
>> +	status = hv_do_hypercall(HVCALL_DETACH_DEVICE, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +}
>> +
>> +static void hv_iommu_det_dev_from_dom(struct hv_domain *hvdom,
>> +				      struct pci_dev *pdev)
>> +{
>> +	u64 status, devid;
>> +	unsigned long flags;
>> +	struct hv_input_detach_device_domain *input;
>> +
>> +	devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_PCI);
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_id.as_uint64 = devid;
>> +	status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +}
>> +
>> +static void hv_iommu_detach_dev(struct iommu_domain *immdom, struct device *dev)
>> +{
>> +	struct pci_dev *pdev;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +
>> +	/* See the attach function, only PCI devices for now */
>> +	if (!dev_is_pci(dev))
>> +		return;
>> +
>> +	if (hvdom->num_attchd == 0)
>> +		pr_warn("Hyper-V: num_attchd is zero (%s)\n", dev_name(dev));
>> +
>> +	pdev = to_pci_dev(dev);
>> +
>> +	if (hvdom->attached_dom) {
>> +		hv_iommu_det_dev_from_guest(hvdom, pdev);
>> +
>> +		/* Do not reset attached_dom, hv_iommu_unmap_pages happens
>> +		 * next.
>> +		 */
>> +	} else {
>> +		hv_iommu_det_dev_from_dom(hvdom, pdev);
>> +	}
>> +
>> +	hvdom->num_attchd--;
> 
> Shouldn't this be modified iff the detach succeeded?

We want to still free the domain and not let it get stuck. The purpose
is more to make sure detach was called before domain free.

>> +}
>> +
>> +static int hv_iommu_add_tree_mapping(struct hv_domain *hvdom,
>> +				     unsigned long iova, phys_addr_t paddr,
>> +				     size_t size, u32 flags)
>> +{
>> +	unsigned long irqflags;
>> +	struct hv_iommu_mapping *mapping;
>> +
>> +	mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
>> +	if (!mapping)
>> +		return -ENOMEM;
>> +
>> +	mapping->paddr = paddr;
>> +	mapping->iova.start = iova;
>> +	mapping->iova.last = iova + size - 1;
>> +	mapping->flags = flags;
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, irqflags);
>> +	interval_tree_insert(&mapping->iova, &hvdom->mappings_tree);
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, irqflags);
>> +
>> +	return 0;
>> +}
>> +
>> +static size_t hv_iommu_del_tree_mappings(struct hv_domain *hvdom,
>> +					unsigned long iova, size_t size)
>> +{
>> +	unsigned long flags;
>> +	size_t unmapped = 0;
>> +	unsigned long last = iova + size - 1;
>> +	struct hv_iommu_mapping *mapping = NULL;
>> +	struct interval_tree_node *node, *next;
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, flags);
>> +	next = interval_tree_iter_first(&hvdom->mappings_tree, iova, last);
>> +	while (next) {
>> +		node = next;
>> +		mapping = container_of(node, struct hv_iommu_mapping, iova);
>> +		next = interval_tree_iter_next(node, iova, last);
>> +
>> +		/* Trying to split a mapping? Not supported for now. */
>> +		if (mapping->iova.start < iova)
>> +			break;
>> +
>> +		unmapped += mapping->iova.last - mapping->iova.start + 1;
>> +
>> +		interval_tree_remove(node, &hvdom->mappings_tree);
>> +		kfree(mapping);
>> +	}
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
>> +
>> +	return unmapped;
>> +}
>> +
>> +/* Return: must return exact status from the hypercall without changes */
>> +static u64 hv_iommu_map_pgs(struct hv_domain *hvdom,
>> +			    unsigned long iova, phys_addr_t paddr,
>> +			    unsigned long npages, u32 map_flags)
>> +{
>> +	u64 status;
>> +	int i;
>> +	struct hv_input_map_device_gpa_pages *input;
>> +	unsigned long flags, pfn = paddr >> HV_HYP_PAGE_SHIFT;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +	input->map_flags = map_flags;
>> +	input->target_device_va_base = iova;
>> +
>> +	pfn = paddr >> HV_HYP_PAGE_SHIFT;
>> +	for (i = 0; i < npages; i++, pfn++)
>> +		input->gpa_page_list[i] = pfn;
>> +
>> +	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_GPA_PAGES, npages, 0,
>> +				     input, NULL);
>> +
>> +	local_irq_restore(flags);
>> +	return status;
>> +}
>> +
>> +/*
>> + * The core VFIO code loops over memory ranges calling this function with
>> + * the largest size from HV_IOMMU_PGSIZES. cond_resched() is in vfio_iommu_map.
>> + */
>> +static int hv_iommu_map_pages(struct iommu_domain *immdom, ulong iova,
>> +			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
>> +			      int prot, gfp_t gfp, size_t *mapped)
>> +{
>> +	u32 map_flags;
>> +	int ret;
>> +	u64 status;
>> +	unsigned long npages, done = 0;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	size_t size = pgsize * pgcount;
>> +
>> +	map_flags = HV_MAP_GPA_READABLE;	/* required */
>> +	map_flags |= prot & IOMMU_WRITE ? HV_MAP_GPA_WRITABLE : 0;
>> +
>> +	ret = hv_iommu_add_tree_mapping(hvdom, iova, paddr, size, map_flags);
>> +	if (ret)
>> +		return ret;
>> +
>> +	if (hvdom->attached_dom) {
>> +		*mapped = size;
>> +		return 0;
>> +	}
>> +
>> +	npages = size >> HV_HYP_PAGE_SHIFT;
>> +	while (done < npages) {
>> +		ulong completed, remain = npages - done;
>> +
>> +		status = hv_iommu_map_pgs(hvdom, iova, paddr, remain,
>> +					  map_flags);
>> +
>> +		completed = hv_repcomp(status);
>> +		done = done + completed;
>> +		iova = iova + (completed << HV_HYP_PAGE_SHIFT);
>> +		paddr = paddr + (completed << HV_HYP_PAGE_SHIFT);
>> +
>> +		if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
>> +			ret = hv_call_deposit_pages(NUMA_NO_NODE,
>> +						    hv_current_partition_id,
>> +						    256);
>> +			if (ret)
>> +				break;
>> +		}
>> +		if (!hv_result_success(status))
>> +			break;
>> +	}
>> +
>> +	if (!hv_result_success(status)) {
>> +		size_t done_size = done << HV_HYP_PAGE_SHIFT;
>> +
>> +		hv_status_err(status, "pgs:%lx/%lx iova:%lx\n",
>> +			      done, npages, iova);
>> +		/*
>> +		 * lookup tree has all mappings [0 - size-1]. Below unmap will
>> +		 * only remove from [0 - done], we need to remove second chunk
>> +		 * [done+1 - size-1].
>> +		 */
>> +		hv_iommu_del_tree_mappings(hvdom, iova, size - done_size);
>> +		hv_iommu_unmap_pages(immdom, iova - done_size, pgsize,
>> +				     done, NULL);
>> +		if (mapped)
>> +			*mapped = 0;
>> +	} else
>> +		if (mapped)
>> +			*mapped = size;
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom, ulong iova,
>> +				   size_t pgsize, size_t pgcount,
>> +				   struct iommu_iotlb_gather *gather)
>> +{
>> +	unsigned long flags, npages;
>> +	struct hv_input_unmap_device_gpa_pages *input;
>> +	u64 status;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	size_t unmapped, size = pgsize * pgcount;
>> +
>> +	unmapped = hv_iommu_del_tree_mappings(hvdom, iova, size);
>> +	if (unmapped < size)
>> +		pr_err("%s: could not delete all mappings (%lx:%lx/%lx)\n",
>> +		       __func__, iova, unmapped, size);
>> +
>> +	if (hvdom->attached_dom)
>> +		return size;
>> +
>> +	npages = size >> HV_HYP_PAGE_SHIFT;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +	input->target_device_va_base = iova;
>> +
>> +	status = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_GPA_PAGES, npages,
>> +				     0, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
> 
> There is some inconsistency in namings and behaviour of paired
> functions:
> 1. The pair of hv_iommu_unmap_pages is called hv_iommu_map_pgs

The pair of hv_iommu_unmap_pages is hv_iommu_map_pages right above.
hv_iommu_map_pgs could be renamed to hv_iommu_map_pgs_hcall I suppose.

> 2. hv_iommu_map_pgs doesn't print status in case of error.

it does:
             hv_status_err(status, "\n");  <==============


> It would be much better to keep this code consistent.
> 
>> +	return unmapped;
>> +}
>> +
>> +static phys_addr_t hv_iommu_iova_to_phys(struct iommu_domain *immdom,
>> +					 dma_addr_t iova)
>> +{
>> +	u64 paddr = 0;
>> +	unsigned long flags;
>> +	struct hv_iommu_mapping *mapping;
>> +	struct interval_tree_node *node;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, flags);
>> +	node = interval_tree_iter_first(&hvdom->mappings_tree, iova, iova);
>> +	if (node) {
>> +		mapping = container_of(node, struct hv_iommu_mapping, iova);
>> +		paddr = mapping->paddr + (iova - mapping->iova.start);
>> +	}
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
>> +
>> +	return paddr;
>> +}
>> +
>> +/*
>> + * Currently, hypervisor does not provide list of devices it is using
>> + * dynamically. So use this to allow users to manually specify devices that
>> + * should be skipped. (eg. hypervisor debugger using some network device).
>> + */
>> +static struct iommu_device *hv_iommu_probe_device(struct device *dev)
>> +{
>> +	if (!dev_is_pci(dev))
>> +		return ERR_PTR(-ENODEV);
>> +
>> +	if (pci_devs_to_skip && *pci_devs_to_skip) {
>> +		int rc, pos = 0;
>> +		int parsed;
>> +		int segment, bus, slot, func;
>> +		struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +		do {
>> +			parsed = 0;
>> +
>> +			rc = sscanf(pci_devs_to_skip + pos, " (%x:%x:%x.%x) %n",
>> +				    &segment, &bus, &slot, &func, &parsed);
>> +			if (rc)
>> +				break;
>> +			if (parsed <= 0)
>> +				break;
>> +
>> +			if (pci_domain_nr(pdev->bus) == segment &&
>> +			    pdev->bus->number == bus &&
>> +			    PCI_SLOT(pdev->devfn) == slot &&
>> +			    PCI_FUNC(pdev->devfn) == func) {
>> +
>> +				dev_info(dev, "skipped by Hyper-V IOMMU\n");
>> +				return ERR_PTR(-ENODEV);
>> +			}
>> +			pos += parsed;
>> +
>> +		} while (pci_devs_to_skip[pos]);
>> +	}
>> +
>> +	/* Device will be explicitly attached to the default domain, so no need
>> +	 * to do dev_iommu_priv_set() here.
>> +	 */
>> +
>> +	return &hv_virt_iommu;
>> +}
>> +
>> +static void hv_iommu_probe_finalize(struct device *dev)
>> +{
>> +	struct iommu_domain *immdom = iommu_get_domain_for_dev(dev);
>> +
>> +	if (immdom && immdom->type == IOMMU_DOMAIN_DMA)
>> +		iommu_setup_dma_ops(dev);
>> +	else
>> +		set_dma_ops(dev, NULL);
>> +}
>> +
>> +static void hv_iommu_release_device(struct device *dev)
>> +{
>> +	struct hv_domain *hvdom = dev_iommu_priv_get(dev);
>> +
>> +	/* Need to detach device from device domain if necessary. */
>> +	if (hvdom)
>> +		hv_iommu_detach_dev(&hvdom->iommu_dom, dev);
>> +
>> +	dev_iommu_priv_set(dev, NULL);
>> +	set_dma_ops(dev, NULL);
>> +}
>> +
>> +static struct iommu_group *hv_iommu_device_group(struct device *dev)
>> +{
>> +	if (dev_is_pci(dev))
>> +		return pci_device_group(dev);
>> +	else
>> +		return generic_device_group(dev);
>> +}
>> +
>> +static int hv_iommu_def_domain_type(struct device *dev)
>> +{
>> +	/* The hypervisor always creates this by default during boot */
>> +	return IOMMU_DOMAIN_IDENTITY;
>> +}
>> +
>> +static struct iommu_ops hv_iommu_ops = {
>> +	.capable	    = hv_iommu_capable,
>> +	.domain_alloc_identity	= hv_iommu_domain_alloc_identity,
>> +	.domain_alloc_paging	= hv_iommu_domain_alloc_paging,
>> +	.probe_device	    = hv_iommu_probe_device,
>> +	.probe_finalize     = hv_iommu_probe_finalize,
>> +	.release_device     = hv_iommu_release_device,
>> +	.def_domain_type    = hv_iommu_def_domain_type,
>> +	.device_group	    = hv_iommu_device_group,
>> +	.default_domain_ops = &(const struct iommu_domain_ops) {
>> +		.attach_dev   = hv_iommu_attach_dev,
>> +		.map_pages    = hv_iommu_map_pages,
>> +		.unmap_pages  = hv_iommu_unmap_pages,
>> +		.iova_to_phys = hv_iommu_iova_to_phys,
>> +		.free	      = hv_iommu_domain_free,
>> +	},
>> +	.owner		    = THIS_MODULE,
>> +};
>> +
>> +static void __init hv_initialize_special_domains(void)
>> +{
>> +	hv_def_identity_dom.iommu_dom.geometry = default_geometry;
>> +	hv_def_identity_dom.domid_num = HV_DEVICE_DOMAIN_ID_S2_DEFAULT; /* 0 */
> 
> hv_def_identity_dom is a static global variable.
> Why not initialize hv_def_identity_dom upon definition instead of
> introducing a new function?

Originally, it was function. I changed it static, but during 6.6
review I changed it back to function.  I can't remember why, but is
pretty harmless. We may add more domains, for example null domain to the
initilization in future.

>> +}
>> +
>> +static int __init hv_iommu_init(void)
>> +{
>> +	int ret;
>> +	struct iommu_device *iommup = &hv_virt_iommu;
>> +
>> +	if (!hv_is_hyperv_initialized())
>> +		return -ENODEV;
>> +
>> +	ret = iommu_device_sysfs_add(iommup, NULL, NULL, "%s", "hyperv-iommu");
>> +	if (ret) {
>> +		pr_err("Hyper-V: iommu_device_sysfs_add failed: %d\n", ret);
>> +		return ret;
>> +	}
>> +
>> +	/* This must come before iommu_device_register because the latter calls
>> +	 * into the hooks.
>> +	 */
>> +	hv_initialize_special_domains();
>> +
>> +	ret = iommu_device_register(iommup, &hv_iommu_ops, NULL);
> 
> It looks weird to initialize an object after creating sysfs entries for
> it.
> It should be the other way around.

Not sure if it should be, much easier to remove sysfs entry than other
cleanup, even tho iommu_device_unregister is there. I am sure we'll add
more code here, probably why it was originally done this way.

Thanks,
-Mukesh


... snip........


^ permalink raw reply

* Re: [PATCH v0 12/15] x86/hyperv: Implement hyperv virtual iommu
From: Mukesh R @ 2026-01-24  2:01 UTC (permalink / raw)
  To: Jacob Pan
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux
In-Reply-To: <20260121211806.000006aa@linux.microsoft.com>

On 1/21/26 21:18, Jacob Pan wrote:
> Hi Mukesh,
> 
> On Mon, 19 Jan 2026 22:42:27 -0800
> Mukesh R <mrathor@linux.microsoft.com> wrote:
> 
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> Add a new file to implement management of device domains, mapping and
>> unmapping of iommu memory, and other iommu_ops to fit within the VFIO
>> framework for PCI passthru on Hyper-V running Linux as root or L1VH
>> parent. This also implements direct attach mechanism for PCI passthru,
>> and it is also made to work within the VFIO framework.
>>
>> At a high level, during boot the hypervisor creates a default identity
>> domain and attaches all devices to it. This nicely maps to Linux iommu
>> subsystem IOMMU_DOMAIN_IDENTITY domain. As a result, Linux does not
>> need to explicitly ask Hyper-V to attach devices and do maps/unmaps
>> during boot. As mentioned previously, Hyper-V supports two ways to do
>> PCI passthru:
>>
>>    1. Device Domain: root must create a device domain in the
>> hypervisor, and do map/unmap hypercalls for mapping and unmapping
>> guest RAM. All hypervisor communications use device id of type PCI for
>>       identifying and referencing the device.
>>
>>    2. Direct Attach: the hypervisor will simply use the guest's HW
>>       page table for mappings, thus the host need not do map/unmap
>>       device memory hypercalls. As such, direct attach passthru setup
>>       during guest boot is extremely fast. A direct attached device
>>       must be referenced via logical device id and not via the PCI
>>       device id.
>>
>> At present, L1VH root/parent only supports direct attaches. Also
>> direct attach is default in non-L1VH cases because there are some
>> significant performance issues with device domain implementation
>> currently for guests with higher RAM (say more than 8GB), and that
>> unfortunately cannot be addressed in the short term.
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>>   MAINTAINERS                     |   1 +
>>   arch/x86/include/asm/mshyperv.h |   7 +-
>>   arch/x86/kernel/pci-dma.c       |   2 +
>>   drivers/iommu/Makefile          |   2 +-
>>   drivers/iommu/hyperv-iommu.c    | 876
>> ++++++++++++++++++++++++++++++++ include/linux/hyperv.h          |
>> 6 + 6 files changed, 890 insertions(+), 4 deletions(-)
>>   create mode 100644 drivers/iommu/hyperv-iommu.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 381a0e086382..63160cee942c 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -11741,6 +11741,7 @@ F:	drivers/hid/hid-hyperv.c
>>   F:	drivers/hv/
>>   F:	drivers/infiniband/hw/mana/
>>   F:	drivers/input/serio/hyperv-keyboard.c
>> +F:	drivers/iommu/hyperv-iommu.c
> Given we are also developing a guest iommu driver on hyperv, I think it
> is more clear to name them accordingly. Perhaps, hyperv-iommu-root.c?

well, l1vh is not quite root, more like a parent. But we've been using
l1vh root loosely to mean l1vh parent. so probably ok to rename it
to hyperv-iommu-root.c. I prefer not calling it parent or something
like that.

>>   F:	drivers/iommu/hyperv-irq.c
>>   F:	drivers/net/ethernet/microsoft/
>>   F:	drivers/net/hyperv/
>> diff --git a/arch/x86/include/asm/mshyperv.h
>> b/arch/x86/include/asm/mshyperv.h index 97477c5a8487..e4ccdbbf1d12
>> 100644 --- a/arch/x86/include/asm/mshyperv.h
>> +++ b/arch/x86/include/asm/mshyperv.h
>> @@ -189,16 +189,17 @@ static inline void hv_apic_init(void) {}
>>   #endif
>>   
>>   #if IS_ENABLED(CONFIG_HYPERV_IOMMU)
>> -static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>> -{ return false; }       /* temporary */
>> +bool hv_pcidev_is_attached_dev(struct pci_dev *pdev);
>>   u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type
>> type); +u64 hv_iommu_get_curr_partid(void);
>>   #else	/* CONFIG_HYPERV_IOMMU */
>>   static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>>   { return false; }
>> -
>>   static inline u64 hv_build_devid_oftype(struct pci_dev *pdev,
>>   				       enum hv_device_type type)
>>   { return 0; }
>> +static inline u64 hv_iommu_get_curr_partid(void)
>> +{ return HV_PARTITION_ID_INVALID; }
>>   
>>   #endif	/* CONFIG_HYPERV_IOMMU */
>>   
>> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
>> index 6267363e0189..cfeee6505e17 100644
>> --- a/arch/x86/kernel/pci-dma.c
>> +++ b/arch/x86/kernel/pci-dma.c
>> @@ -8,6 +8,7 @@
>>   #include <linux/gfp.h>
>>   #include <linux/pci.h>
>>   #include <linux/amd-iommu.h>
>> +#include <linux/hyperv.h>
>>   
>>   #include <asm/proto.h>
>>   #include <asm/dma.h>
>> @@ -105,6 +106,7 @@ void __init pci_iommu_alloc(void)
>>   	gart_iommu_hole_init();
>>   	amd_iommu_detect();
>>   	detect_intel_iommu();
>> +	hv_iommu_detect();
j
> Will this driver be x86 only?
Yes for now.

>>   	swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
>>   }
>>   
>> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
>> index 598c39558e7d..cc9774864b00 100644
>> --- a/drivers/iommu/Makefile
>> +++ b/drivers/iommu/Makefile
>> @@ -30,7 +30,7 @@ obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
>>   obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
>>   obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
>>   obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
>> -obj-$(CONFIG_HYPERV_IOMMU) += hyperv-irq.o
>> +obj-$(CONFIG_HYPERV_IOMMU) += hyperv-irq.o hyperv-iommu.o
> DMA and IRQ remapping should be separate

not sure i follow.


>>   obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
>>   obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
>>   obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
>> diff --git a/drivers/iommu/hyperv-iommu.c
>> b/drivers/iommu/hyperv-iommu.c new file mode 100644
>> index 000000000000..548483fec6b1
>> --- /dev/null
>> +++ b/drivers/iommu/hyperv-iommu.c
>> @@ -0,0 +1,876 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Hyper-V root vIOMMU driver.
>> + * Copyright (C) 2026, Microsoft, Inc.
>> + */
>> +
>> +#include <linux/module.h>
> I don't think this is needed since this driver cannot be a module
> 
>> +#include <linux/pci.h>
>> +#include <linux/dmar.h>
> should not depend on Intel's DMAR
> 
>> +#include <linux/dma-map-ops.h>
>> +#include <linux/interval_tree.h>
>> +#include <linux/hyperv.h>
>> +#include "dma-iommu.h"
>> +#include <asm/iommu.h>
>> +#include <asm/mshyperv.h>
>> +
>> +/* We will not claim these PCI devices, eg hypervisor needs it for
>> debugger */ +static char *pci_devs_to_skip;
>> +static int __init hv_iommu_setup_skip(char *str)
>> +{
>> +	pci_devs_to_skip = str;
>> +
>> +	return 0;
>> +}
>> +/* hv_iommu_skip=(SSSS:BB:DD.F)(SSSS:BB:DD.F) */
>> +__setup("hv_iommu_skip=", hv_iommu_setup_skip);
>> +
>> +bool hv_no_attdev;	 /* disable direct device attach for
>> passthru */ +EXPORT_SYMBOL_GPL(hv_no_attdev);
>> +static int __init setup_hv_no_attdev(char *str)
>> +{
>> +	hv_no_attdev = true;
>> +	return 0;
>> +}
>> +__setup("hv_no_attdev", setup_hv_no_attdev);
>> +
>> +/* Iommu device that we export to the world. HyperV supports max of
>> one */ +static struct iommu_device hv_virt_iommu;
>> +
>> +struct hv_domain {
>> +	struct iommu_domain iommu_dom;
>> +	u32 domid_num;			      /* as opposed to
>> domain_id.type */
>> +	u32 num_attchd;		      /* number of currently
>> attached devices */
> rename to num_dev_attached?
> 
>> +	bool attached_dom;		      /* is this direct
>> attached dom? */
>> +	spinlock_t mappings_lock;	      /* protects
>> mappings_tree */
>> +	struct rb_root_cached mappings_tree;  /* iova to pa lookup
>> tree */ +};
>> +
>> +#define to_hv_domain(d) container_of(d, struct hv_domain, iommu_dom)
>> +
>> +struct hv_iommu_mapping {
>> +	phys_addr_t paddr;
>> +	struct interval_tree_node iova;
>> +	u32 flags;
>> +};
>> +
>> +/*
>> + * By default, during boot the hypervisor creates one Stage 2 (S2)
>> default
>> + * domain. Stage 2 means that the page table is controlled by the
>> hypervisor.
>> + *   S2 default: access to entire root partition memory. This for us
>> easily
>> + *		 maps to IOMMU_DOMAIN_IDENTITY in the iommu
>> subsystem, and
>> + *		 is called HV_DEVICE_DOMAIN_ID_S2_DEFAULT in the
>> hypervisor.
>> + *
>> + * Device Management:
>> + *   There are two ways to manage device attaches to domains:
>> + *     1. Domain Attach: A device domain is created in the
>> hypervisor, the
>> + *			 device is attached to this domain, and
>> then memory
>> + *			 ranges are mapped in the map callbacks.
>> + *     2. Direct Attach: No need to create a domain in the
>> hypervisor for direct
>> + *			 attached devices. A hypercall is made to
>> tell the
>> + *			 hypervisor to attach the device to a
>> guest. There is
>> + *			 no need for explicit memory mappings
>> because the
>> + *			 hypervisor will just use the guest HW
>> page table.
>> + *
>> + * Since a direct attach is much faster, it is the default. This can
>> be
>> + * changed via hv_no_attdev.
>> + *
>> + * L1VH: hypervisor only supports direct attach.
>> + */
>> +
>> +/*
>> + * Create dummy domain to correspond to hypervisor prebuilt default
>> identity
>> + * domain (dummy because we do not make hypercall to create them).
>> + */
>> +static struct hv_domain hv_def_identity_dom;
>> +
>> +static bool hv_special_domain(struct hv_domain *hvdom)
>> +{
>> +	return hvdom == &hv_def_identity_dom;
>> +}
>> +
>> +struct iommu_domain_geometry default_geometry = (struct
>> iommu_domain_geometry) {
>> +	.aperture_start = 0,
>> +	.aperture_end = -1UL,
>> +	.force_aperture = true,
>> +};
>> +
>> +/*
>> + * Since the relevant hypercalls can only fit less than 512 PFNs in
>> the pfn
>> + * array, report 1M max.
>> + */
>> +#define HV_IOMMU_PGSIZES (SZ_4K | SZ_1M)
>> +
>> +static u32 unique_id;	      /* unique numeric id of a new
>> domain */ +
>> +static void hv_iommu_detach_dev(struct iommu_domain *immdom,
>> +				struct device *dev);
>> +static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom,
>> ulong iova,
>> +				   size_t pgsize, size_t pgcount,
>> +				   struct iommu_iotlb_gather
>> *gather); +
>> +/*
>> + * If the current thread is a VMM thread, return the partition id of
>> the VM it
>> + * is managing, else return HV_PARTITION_ID_INVALID.
>> + */
>> +u64 hv_iommu_get_curr_partid(void)
>> +{
>> +	u64 (*fn)(pid_t pid);
>> +	u64 partid;
>> +
>> +	fn = symbol_get(mshv_pid_to_partid);
>> +	if (!fn)
>> +		return HV_PARTITION_ID_INVALID;
>> +
>> +	partid = fn(current->tgid);
>> +	symbol_put(mshv_pid_to_partid);
>> +
>> +	return partid;
>> +}
> This function is not iommu specific. Maybe move it to mshv code?

Well, it is getting the information from mshv by calling a function
there for iommu, and is not needed if no HYPER_IOMMU. So this is probably
the best place for it.

>> +
>> +/* If this is a VMM thread, then this domain is for a guest VM */
>> +static bool hv_curr_thread_is_vmm(void)
>> +{
>> +	return hv_iommu_get_curr_partid() != HV_PARTITION_ID_INVALID;
>> +}
>> +
>> +static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap)
>> +{
>> +	switch (cap) {
>> +	case IOMMU_CAP_CACHE_COHERENCY:
>> +		return true;
>> +	default:
>> +		return false;
>> +	}
>> +	return false;
>> +}
>> +
>> +/*
>> + * Check if given pci device is a direct attached device. Caller
>> must have
>> + * verified pdev is a valid pci device.
>> + */
>> +bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>> +{
>> +	struct iommu_domain *iommu_domain;
>> +	struct hv_domain *hvdom;
>> +	struct device *dev = &pdev->dev;
>> +
>> +	iommu_domain = iommu_get_domain_for_dev(dev);
>> +	if (iommu_domain) {
>> +		hvdom = to_hv_domain(iommu_domain);
>> +		return hvdom->attached_dom;
>> +	}
>> +
>> +	return false;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_pcidev_is_attached_dev);
> Attached domain can change anytime, what guarantee does the caller have?

Not sure I understand what can change: the device moving from attached
to non-attached? or the domain getting deleted? In any case, this is
called from leaf functions, so that should not happen... and it
will return false if the device did somehow got removed.
  
>> +
>> +/* Create a new device domain in the hypervisor */
>> +static int hv_iommu_create_hyp_devdom(struct hv_domain *hvdom)
>> +{
>> +	u64 status;
>> +	unsigned long flags;
>> +	struct hv_input_device_domain *ddp;
>> +	struct hv_input_create_device_domain *input;
> nit: use consistent coding style, inverse Christmas tree.
> 
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	ddp = &input->device_domain;
>> +	ddp->partition_id = HV_PARTITION_ID_SELF;
>> +	ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	ddp->domain_id.id = hvdom->domid_num;
>> +
>> +	input->create_device_domain_flags.forward_progress_required
>> = 1;
>> +	input->create_device_domain_flags.inherit_owning_vtl = 0;
>> +
>> +	status = hv_do_hypercall(HVCALL_CREATE_DEVICE_DOMAIN, input,
>> NULL); +
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +/* During boot, all devices are attached to this */
>> +static struct iommu_domain *hv_iommu_domain_alloc_identity(struct
>> device *dev) +{
>> +	return &hv_def_identity_dom.iommu_dom;
>> +}
>> +
>> +static struct iommu_domain *hv_iommu_domain_alloc_paging(struct
>> device *dev) +{
>> +	struct hv_domain *hvdom;
>> +	int rc;
>> +
>> +	if (hv_l1vh_partition() && !hv_curr_thread_is_vmm() &&
>> !hv_no_attdev) {
>> +		pr_err("Hyper-V: l1vh iommu does not support host
>> devices\n");
> why is this an error if user input choose not to do direct attach?

Like the error message says: on l1vh, direct attaches of host devices
(eg dpdk) is not supported. and l1vh only does direct attaches. IOW,
no host devices on l1vh.

>> +		return NULL;
>> +	}
>> +
>> +	hvdom = kzalloc(sizeof(struct hv_domain), GFP_KERNEL);
>> +	if (hvdom == NULL)
>> +		goto out;
>> +
>> +	spin_lock_init(&hvdom->mappings_lock);
>> +	hvdom->mappings_tree = RB_ROOT_CACHED;
>> +
>> +	if (++unique_id == HV_DEVICE_DOMAIN_ID_S2_DEFAULT)   /* ie,
>> 0 */
> This is true only when unique_id wraps around, right? Then this driver
> stops working?

Correct. It's a u32, so if my math is right, and a device is attached
every second, it will take 136 years to wrap! Did i get that right?

> can you use an IDR for the unique_id and free it as you detach instead
> of doing this cyclic allocation?
> 
>> +		goto out_free;
>> +
>> +	hvdom->domid_num = unique_id;
>> +	hvdom->iommu_dom.geometry = default_geometry;
>> +	hvdom->iommu_dom.pgsize_bitmap = HV_IOMMU_PGSIZES;
>> +
>> +	/* For guests, by default we do direct attaches, so no
>> domain in hyp */
>> +	if (hv_curr_thread_is_vmm() && !hv_no_attdev)
>> +		hvdom->attached_dom = true;
>> +	else {
>> +		rc = hv_iommu_create_hyp_devdom(hvdom);
>> +		if (rc)
>> +			goto out_free_id;
>> +	}
>> +
>> +	return &hvdom->iommu_dom;
>> +
>> +out_free_id:
>> +	unique_id--;
>> +out_free:
>> +	kfree(hvdom);
>> +out:
>> +	return NULL;
>> +}
>> +
>> +static void hv_iommu_domain_free(struct iommu_domain *immdom)
>> +{
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	unsigned long flags;
>> +	u64 status;
>> +	struct hv_input_delete_device_domain *input;
>> +
>> +	if (hv_special_domain(hvdom))
>> +		return;
>> +
>> +	if (hvdom->num_attchd) {
>> +		pr_err("Hyper-V: can't free busy iommu domain
>> (%p)\n", immdom);
>> +		return;
>> +	}
>> +
>> +	if (!hv_curr_thread_is_vmm() || hv_no_attdev) {
>> +		struct hv_input_device_domain *ddp;
>> +
>> +		local_irq_save(flags);
>> +		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +		ddp = &input->device_domain;
>> +		memset(input, 0, sizeof(*input));
>> +
>> +		ddp->partition_id = HV_PARTITION_ID_SELF;
>> +		ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +		ddp->domain_id.id = hvdom->domid_num;
>> +
>> +		status =
>> hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input,
>> +					 NULL);
>> +		local_irq_restore(flags);
>> +
>> +		if (!hv_result_success(status))
>> +			hv_status_err(status, "\n");
>> +	}

> you could free the domid here, no?
sorry, don't follow what you mean by domid, you mean unique_id?

>> +
>> +	kfree(hvdom);
>> +}
>> +
>> +/* Attach a device to a domain previously created in the hypervisor
>> */ +static int hv_iommu_att_dev2dom(struct hv_domain *hvdom, struct
>> pci_dev *pdev) +{
>> +	unsigned long flags;
>> +	u64 status;
>> +	enum hv_device_type dev_type;
>> +	struct hv_input_attach_device_domain *input;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type =
>> HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +
>> +	/* NB: Upon guest shutdown, device is re-attached to the
>> default domain
>> +	 * without explicit detach.
>> +	 */
>> +	if (hv_l1vh_partition())
>> +		dev_type = HV_DEVICE_TYPE_LOGICAL;
>> +	else
>> +		dev_type = HV_DEVICE_TYPE_PCI;
>> +
>> +	input->device_id.as_uint64 = hv_build_devid_oftype(pdev,
>> dev_type); +
>> +	status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input,
>> NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +/* Caller must have validated that dev is a valid pci dev */
>> +static int hv_iommu_direct_attach_device(struct pci_dev *pdev)
>> +{
>> +	struct hv_input_attach_device *input;
>> +	u64 status;
>> +	int rc;
>> +	unsigned long flags;
>> +	union hv_device_id host_devid;
>> +	enum hv_device_type dev_type;
>> +	u64 ptid = hv_iommu_get_curr_partid();
>> +
>> +	if (ptid == HV_PARTITION_ID_INVALID) {
>> +		pr_err("Hyper-V: Invalid partition id in direct
>> attach\n");
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (hv_l1vh_partition())
>> +		dev_type = HV_DEVICE_TYPE_LOGICAL;
>> +	else
>> +		dev_type = HV_DEVICE_TYPE_PCI;
>> +
>> +	host_devid.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
>> +
>> +	do {
>> +		local_irq_save(flags);
>> +		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +		memset(input, 0, sizeof(*input));
>> +		input->partition_id = ptid;
>> +		input->device_id = host_devid;
>> +
>> +		/* Hypervisor associates logical_id with this
>> device, and in
>> +		 * some hypercalls like retarget interrupts,
>> logical_id must be
>> +		 * used instead of the BDF. It is a required
>> parameter.
>> +		 */
>> +		input->attdev_flags.logical_id = 1;
>> +		input->logical_devid =
>> +			   hv_build_devid_oftype(pdev,
>> HV_DEVICE_TYPE_LOGICAL); +
>> +		status = hv_do_hypercall(HVCALL_ATTACH_DEVICE,
>> input, NULL);
>> +		local_irq_restore(flags);
>> +
>> +		if (hv_result(status) ==
>> HV_STATUS_INSUFFICIENT_MEMORY) {
>> +			rc = hv_call_deposit_pages(NUMA_NO_NODE,
>> ptid, 1);
>> +			if (rc)
>> +				break;
>> +		}
>> +	} while (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +/* This to attach a device to both host app (like DPDK) and a guest
>> VM */
> The IOMMU driver should be agnostic to the type of consumer, whether a
> userspace driver or a VM. This comment is not necessary.
> 
>> +static int hv_iommu_attach_dev(struct iommu_domain *immdom,
>> struct device *dev,
>> +			       struct iommu_domain *old)
> This does not match upstream kernel prototype, which kernel version is
> this based on? I will stop here for now.

As I mentioned in the cover letter:
          Based on: 8f0b4cce4481 (origin/hyperv-next)

which is now 6.19 based.

> struct iommu_domain_ops {
> 	int (*attach_dev)(struct iommu_domain *domain, struct device
> 	*dev);

I think you got it backwards, 6.6 has this. 6.19 has extra paremeter.

Thanks,
-Mukesh


>> +{
>> +	struct pci_dev *pdev;
>> +	int rc;
>> +	struct hv_domain *hvdom_new = to_hv_domain(immdom);
>> +	struct hv_domain *hvdom_prev = dev_iommu_priv_get(dev);
>> +
>> +	/* Only allow PCI devices for now */
>> +	if (!dev_is_pci(dev))
>> +		return -EINVAL;
>> +
>> +	pdev = to_pci_dev(dev);
>> +
>> +	/* l1vh does not support host device (eg DPDK) passthru */
>> +	if (hv_l1vh_partition() && !hv_special_domain(hvdom_new) &&
>> +	    !hvdom_new->attached_dom)
>> +		return -EINVAL;
>> +
>> +	/*
>> +	 * VFIO does not do explicit detach calls, hence check first
>> if we need
>> +	 * to detach first. Also, in case of guest shutdown, it's
>> the VMM
>> +	 * thread that attaches it back to the hv_def_identity_dom,
>> and
>> +	 * hvdom_prev will not be null then. It is null during boot.
>> +	 */
>> +	if (hvdom_prev)
>> +		if (!hv_l1vh_partition() ||
>> !hv_special_domain(hvdom_prev))
>> +			hv_iommu_detach_dev(&hvdom_prev->iommu_dom,
>> dev); +
>> +	if (hv_l1vh_partition() && hv_special_domain(hvdom_new)) {
>> +		dev_iommu_priv_set(dev, hvdom_new);  /* sets
>> "private" field */
>> +		return 0;
>> +	}
>> +
>> +	if (hvdom_new->attached_dom)
>> +		rc = hv_iommu_direct_attach_device(pdev);
>> +	else
>> +		rc = hv_iommu_att_dev2dom(hvdom_new, pdev);
>> +
>> +	if (rc && hvdom_prev) {
>> +		int rc1;
>> +
>> +		if (hvdom_prev->attached_dom)
>> +			rc1 = hv_iommu_direct_attach_device(pdev);
>> +		else
>> +			rc1 = hv_iommu_att_dev2dom(hvdom_prev, pdev);
>> +
>> +		if (rc1)
>> +			pr_err("Hyper-V: iommu could not restore
>> orig device state.. dev:%s\n",
>> +			       dev_name(dev));
>> +	}
>> +
>> +	if (rc == 0) {
>> +		dev_iommu_priv_set(dev, hvdom_new);  /* sets
>> "private" field */
>> +		hvdom_new->num_attchd++;
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +static void hv_iommu_det_dev_from_guest(struct hv_domain *hvdom,
>> +					struct pci_dev *pdev)
>> +{
>> +	struct hv_input_detach_device *input;
>> +	u64 status, log_devid;
>> +	unsigned long flags;
>> +
>> +	log_devid = hv_build_devid_oftype(pdev,
>> HV_DEVICE_TYPE_LOGICAL); +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->partition_id = hv_iommu_get_curr_partid();
>> +	input->logical_devid = log_devid;
>> +	status = hv_do_hypercall(HVCALL_DETACH_DEVICE, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +}
>> +
>> +static void hv_iommu_det_dev_from_dom(struct hv_domain *hvdom,
>> +				      struct pci_dev *pdev)
>> +{
>> +	u64 status, devid;
>> +	unsigned long flags;
>> +	struct hv_input_detach_device_domain *input;
>> +
>> +	devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_PCI);
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_id.as_uint64 = devid;
>> +	status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input,
>> NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +}
>> +
>> +static void hv_iommu_detach_dev(struct iommu_domain *immdom, struct
>> device *dev) +{
>> +	struct pci_dev *pdev;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +
>> +	/* See the attach function, only PCI devices for now */
>> +	if (!dev_is_pci(dev))
>> +		return;
>> +
>> +	if (hvdom->num_attchd == 0)
>> +		pr_warn("Hyper-V: num_attchd is zero (%s)\n",
>> dev_name(dev)); +
>> +	pdev = to_pci_dev(dev);
>> +
>> +	if (hvdom->attached_dom) {
>> +		hv_iommu_det_dev_from_guest(hvdom, pdev);
>> +
>> +		/* Do not reset attached_dom, hv_iommu_unmap_pages
>> happens
>> +		 * next.
>> +		 */
>> +	} else {
>> +		hv_iommu_det_dev_from_dom(hvdom, pdev);
>> +	}
>> +
>> +	hvdom->num_attchd--;
>> +}
>> +
>> +static int hv_iommu_add_tree_mapping(struct hv_domain *hvdom,
>> +				     unsigned long iova, phys_addr_t
>> paddr,
>> +				     size_t size, u32 flags)
>> +{
>> +	unsigned long irqflags;
>> +	struct hv_iommu_mapping *mapping;
>> +
>> +	mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
>> +	if (!mapping)
>> +		return -ENOMEM;
>> +
>> +	mapping->paddr = paddr;
>> +	mapping->iova.start = iova;
>> +	mapping->iova.last = iova + size - 1;
>> +	mapping->flags = flags;
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, irqflags);
>> +	interval_tree_insert(&mapping->iova, &hvdom->mappings_tree);
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, irqflags);
>> +
>> +	return 0;
>> +}
>> +
>> +static size_t hv_iommu_del_tree_mappings(struct hv_domain *hvdom,
>> +					unsigned long iova, size_t
>> size) +{
>> +	unsigned long flags;
>> +	size_t unmapped = 0;
>> +	unsigned long last = iova + size - 1;
>> +	struct hv_iommu_mapping *mapping = NULL;
>> +	struct interval_tree_node *node, *next;
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, flags);
>> +	next = interval_tree_iter_first(&hvdom->mappings_tree, iova,
>> last);
>> +	while (next) {
>> +		node = next;
>> +		mapping = container_of(node, struct
>> hv_iommu_mapping, iova);
>> +		next = interval_tree_iter_next(node, iova, last);
>> +
>> +		/* Trying to split a mapping? Not supported for now.
>> */
>> +		if (mapping->iova.start < iova)
>> +			break;
>> +
>> +		unmapped += mapping->iova.last - mapping->iova.start
>> + 1; +
>> +		interval_tree_remove(node, &hvdom->mappings_tree);
>> +		kfree(mapping);
>> +	}
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
>> +
>> +	return unmapped;
>> +}
>> +
>> +/* Return: must return exact status from the hypercall without
>> changes */ +static u64 hv_iommu_map_pgs(struct hv_domain *hvdom,
>> +			    unsigned long iova, phys_addr_t paddr,
>> +			    unsigned long npages, u32 map_flags)
>> +{
>> +	u64 status;
>> +	int i;
>> +	struct hv_input_map_device_gpa_pages *input;
>> +	unsigned long flags, pfn = paddr >> HV_HYP_PAGE_SHIFT;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type =
>> HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +	input->map_flags = map_flags;
>> +	input->target_device_va_base = iova;
>> +
>> +	pfn = paddr >> HV_HYP_PAGE_SHIFT;
>> +	for (i = 0; i < npages; i++, pfn++)
>> +		input->gpa_page_list[i] = pfn;
>> +
>> +	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_GPA_PAGES,
>> npages, 0,
>> +				     input, NULL);
>> +
>> +	local_irq_restore(flags);
>> +	return status;
>> +}
>> +
>> +/*
>> + * The core VFIO code loops over memory ranges calling this function
>> with
>> + * the largest size from HV_IOMMU_PGSIZES. cond_resched() is in
>> vfio_iommu_map.
>> + */
>> +static int hv_iommu_map_pages(struct iommu_domain *immdom, ulong
>> iova,
>> +			      phys_addr_t paddr, size_t pgsize,
>> size_t pgcount,
>> +			      int prot, gfp_t gfp, size_t *mapped)
>> +{
>> +	u32 map_flags;
>> +	int ret;
>> +	u64 status;
>> +	unsigned long npages, done = 0;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	size_t size = pgsize * pgcount;
>> +
>> +	map_flags = HV_MAP_GPA_READABLE;	/* required */
>> +	map_flags |= prot & IOMMU_WRITE ? HV_MAP_GPA_WRITABLE : 0;
>> +
>> +	ret = hv_iommu_add_tree_mapping(hvdom, iova, paddr, size,
>> map_flags);
>> +	if (ret)
>> +		return ret;
>> +
>> +	if (hvdom->attached_dom) {
>> +		*mapped = size;
>> +		return 0;
>> +	}
>> +
>> +	npages = size >> HV_HYP_PAGE_SHIFT;
>> +	while (done < npages) {
>> +		ulong completed, remain = npages - done;
>> +
>> +		status = hv_iommu_map_pgs(hvdom, iova, paddr, remain,
>> +					  map_flags);
>> +
>> +		completed = hv_repcomp(status);
>> +		done = done + completed;
>> +		iova = iova + (completed << HV_HYP_PAGE_SHIFT);
>> +		paddr = paddr + (completed << HV_HYP_PAGE_SHIFT);
>> +
>> +		if (hv_result(status) ==
>> HV_STATUS_INSUFFICIENT_MEMORY) {
>> +			ret = hv_call_deposit_pages(NUMA_NO_NODE,
>> +
>> hv_current_partition_id,
>> +						    256);
>> +			if (ret)
>> +				break;
>> +		}
>> +		if (!hv_result_success(status))
>> +			break;
>> +	}
>> +
>> +	if (!hv_result_success(status)) {
>> +		size_t done_size = done << HV_HYP_PAGE_SHIFT;
>> +
>> +		hv_status_err(status, "pgs:%lx/%lx iova:%lx\n",
>> +			      done, npages, iova);
>> +		/*
>> +		 * lookup tree has all mappings [0 - size-1]. Below
>> unmap will
>> +		 * only remove from [0 - done], we need to remove
>> second chunk
>> +		 * [done+1 - size-1].
>> +		 */
>> +		hv_iommu_del_tree_mappings(hvdom, iova, size -
>> done_size);
>> +		hv_iommu_unmap_pages(immdom, iova - done_size,
>> pgsize,
>> +				     done, NULL);
>> +		if (mapped)
>> +			*mapped = 0;
>> +	} else
>> +		if (mapped)
>> +			*mapped = size;
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom,
>> ulong iova,
>> +				   size_t pgsize, size_t pgcount,
>> +				   struct iommu_iotlb_gather *gather)
>> +{
>> +	unsigned long flags, npages;
>> +	struct hv_input_unmap_device_gpa_pages *input;
>> +	u64 status;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	size_t unmapped, size = pgsize * pgcount;
>> +
>> +	unmapped = hv_iommu_del_tree_mappings(hvdom, iova, size);
>> +	if (unmapped < size)
>> +		pr_err("%s: could not delete all mappings
>> (%lx:%lx/%lx)\n",
>> +		       __func__, iova, unmapped, size);
>> +
>> +	if (hvdom->attached_dom)
>> +		return size;
>> +
>> +	npages = size >> HV_HYP_PAGE_SHIFT;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type =
>> HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +	input->target_device_va_base = iova;
>> +
>> +	status = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_GPA_PAGES,
>> npages,
>> +				     0, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return unmapped;
>> +}
>> +
>> +static phys_addr_t hv_iommu_iova_to_phys(struct iommu_domain *immdom,
>> +					 dma_addr_t iova)
>> +{
>> +	u64 paddr = 0;
>> +	unsigned long flags;
>> +	struct hv_iommu_mapping *mapping;
>> +	struct interval_tree_node *node;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, flags);
>> +	node = interval_tree_iter_first(&hvdom->mappings_tree, iova,
>> iova);
>> +	if (node) {
>> +		mapping = container_of(node, struct
>> hv_iommu_mapping, iova);
>> +		paddr = mapping->paddr + (iova -
>> mapping->iova.start);
>> +	}
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
>> +
>> +	return paddr;
>> +}
>> +
>> +/*
>> + * Currently, hypervisor does not provide list of devices it is using
>> + * dynamically. So use this to allow users to manually specify
>> devices that
>> + * should be skipped. (eg. hypervisor debugger using some network
>> device).
>> + */
>> +static struct iommu_device *hv_iommu_probe_device(struct device *dev)
>> +{
>> +	if (!dev_is_pci(dev))
>> +		return ERR_PTR(-ENODEV);
>> +
>> +	if (pci_devs_to_skip && *pci_devs_to_skip) {
>> +		int rc, pos = 0;
>> +		int parsed;
>> +		int segment, bus, slot, func;
>> +		struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +		do {
>> +			parsed = 0;
>> +
>> +			rc = sscanf(pci_devs_to_skip + pos, "
>> (%x:%x:%x.%x) %n",
>> +				    &segment, &bus, &slot, &func,
>> &parsed);
>> +			if (rc)
>> +				break;
>> +			if (parsed <= 0)
>> +				break;
>> +
>> +			if (pci_domain_nr(pdev->bus) == segment &&
>> +			    pdev->bus->number == bus &&
>> +			    PCI_SLOT(pdev->devfn) == slot &&
>> +			    PCI_FUNC(pdev->devfn) == func) {
>> +
>> +				dev_info(dev, "skipped by Hyper-V
>> IOMMU\n");
>> +				return ERR_PTR(-ENODEV);
>> +			}
>> +			pos += parsed;
>> +
>> +		} while (pci_devs_to_skip[pos]);
>> +	}
>> +
>> +	/* Device will be explicitly attached to the default domain,
>> so no need
>> +	 * to do dev_iommu_priv_set() here.
>> +	 */
>> +
>> +	return &hv_virt_iommu;
>> +}
>> +
>> +static void hv_iommu_probe_finalize(struct device *dev)
>> +{
>> +	struct iommu_domain *immdom = iommu_get_domain_for_dev(dev);
>> +
>> +	if (immdom && immdom->type == IOMMU_DOMAIN_DMA)
>> +		iommu_setup_dma_ops(dev);
>> +	else
>> +		set_dma_ops(dev, NULL);
>> +}
>> +
>> +static void hv_iommu_release_device(struct device *dev)
>> +{
>> +	struct hv_domain *hvdom = dev_iommu_priv_get(dev);
>> +
>> +	/* Need to detach device from device domain if necessary. */
>> +	if (hvdom)
>> +		hv_iommu_detach_dev(&hvdom->iommu_dom, dev);
>> +
>> +	dev_iommu_priv_set(dev, NULL);
>> +	set_dma_ops(dev, NULL);
>> +}
>> +
>> +static struct iommu_group *hv_iommu_device_group(struct device *dev)
>> +{
>> +	if (dev_is_pci(dev))
>> +		return pci_device_group(dev);
>> +	else
>> +		return generic_device_group(dev);
>> +}
>> +
>> +static int hv_iommu_def_domain_type(struct device *dev)
>> +{
>> +	/* The hypervisor always creates this by default during boot
>> */
>> +	return IOMMU_DOMAIN_IDENTITY;
>> +}
>> +
>> +static struct iommu_ops hv_iommu_ops = {
>> +	.capable	    = hv_iommu_capable,
>> +	.domain_alloc_identity	=
>> hv_iommu_domain_alloc_identity,
>> +	.domain_alloc_paging	= hv_iommu_domain_alloc_paging,
>> +	.probe_device	    = hv_iommu_probe_device,
>> +	.probe_finalize     = hv_iommu_probe_finalize,
>> +	.release_device     = hv_iommu_release_device,
>> +	.def_domain_type    = hv_iommu_def_domain_type,
>> +	.device_group	    = hv_iommu_device_group,
>> +	.default_domain_ops = &(const struct iommu_domain_ops) {
>> +		.attach_dev   = hv_iommu_attach_dev,
>> +		.map_pages    = hv_iommu_map_pages,
>> +		.unmap_pages  = hv_iommu_unmap_pages,
>> +		.iova_to_phys = hv_iommu_iova_to_phys,
>> +		.free	      = hv_iommu_domain_free,
>> +	},
>> +	.owner		    = THIS_MODULE,
>> +};
>> +
>> +static void __init hv_initialize_special_domains(void)
>> +{
>> +	hv_def_identity_dom.iommu_dom.geometry = default_geometry;
>> +	hv_def_identity_dom.domid_num =
>> HV_DEVICE_DOMAIN_ID_S2_DEFAULT; /* 0 */ +}
> This could be initialized statically.
> 
>> +
>> +static int __init hv_iommu_init(void)
>> +{
>> +	int ret;
>> +	struct iommu_device *iommup = &hv_virt_iommu;
>> +
>> +	if (!hv_is_hyperv_initialized())
>> +		return -ENODEV;
>> +
>> +	ret = iommu_device_sysfs_add(iommup, NULL, NULL, "%s",
>> "hyperv-iommu");
>> +	if (ret) {
>> +		pr_err("Hyper-V: iommu_device_sysfs_add failed:
>> %d\n", ret);
>> +		return ret;
>> +	}
>> +
>> +	/* This must come before iommu_device_register because the
>> latter calls
>> +	 * into the hooks.
>> +	 */
>> +	hv_initialize_special_domains();
>> +
>> +	ret = iommu_device_register(iommup, &hv_iommu_ops, NULL);
>> +	if (ret) {
>> +		pr_err("Hyper-V: iommu_device_register failed:
>> %d\n", ret);
>> +		goto err_sysfs_remove;
>> +	}
>> +
>> +	pr_info("Hyper-V IOMMU initialized\n");
>> +
>> +	return 0;
>> +
>> +err_sysfs_remove:
>> +	iommu_device_sysfs_remove(iommup);
>> +	return ret;
>> +}
>> +
>> +void __init hv_iommu_detect(void)
>> +{
>> +	if (no_iommu || iommu_detected)
>> +		return;
>> +
>> +	/* For l1vh, always expose an iommu unit */
>> +	if (!hv_l1vh_partition())
>> +		if (!(ms_hyperv.misc_features &
>> HV_DEVICE_DOMAIN_AVAILABLE))
>> +			return;
>> +
>> +	iommu_detected = 1;
>> +	x86_init.iommu.iommu_init = hv_iommu_init;
>> +
>> +	pci_request_acs();
>> +}
>> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
>> index dfc516c1c719..2ad111727e82 100644
>> --- a/include/linux/hyperv.h
>> +++ b/include/linux/hyperv.h
>> @@ -1767,4 +1767,10 @@ static inline unsigned long virt_to_hvpfn(void
>> *addr) #define HVPFN_DOWN(x)	((x) >> HV_HYP_PAGE_SHIFT)
>>   #define page_to_hvpfn(page)	(page_to_pfn(page) *
>> NR_HV_HYP_PAGES_IN_PAGE)
>> +#ifdef CONFIG_HYPERV_IOMMU
>> +void __init hv_iommu_detect(void);
>> +#else
>> +static inline void hv_iommu_detect(void) { }
>> +#endif /* CONFIG_HYPERV_IOMMU */
>> +
>>   #endif /* _HYPERV_H */


^ permalink raw reply

* Re: [PATCH v0 13/15] x86/hyperv: Basic interrupt support for direct attached devices
From: Mukesh R @ 2026-01-24  2:08 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux, romank
In-Reply-To: <aXAiCw9Dk7GDfagy@skinsburskii.localdomain>

On 1/20/26 16:47, Stanislav Kinsburskii wrote:
> On Mon, Jan 19, 2026 at 10:42:28PM -0800, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> As mentioned previously, a direct attached device must be referenced
>> via logical device id which is formed in the initial attach hypercall.
>> Interrupt mapping paths for direct attached devices are almost same,
>> except we must use logical device ids instead of the PCI device ids.
>>
>> L1VH only supports direct attaches for passing thru devices to its guests,
>> and devices on L1VH are VMBus based. However, the interrupts are mapped
>> via the map interrupt hypercall and not the traditional method of VMBus
>> messages.
>>
>> Partition id for the relevant hypercalls is tricky. This because a device
>> could be moving from root to guest and then back to the root. In case
>> of L1VH, it could be moving from system host to L1VH root to a guest,
>> then back to the L1VH root. So, it is carefully crafted by keeping
>> track of whether the call is on behalf of a VMM process, whether the
>> device is attached device (as opposed to mapped), and whether we are in
>> an L1VH root/parent. If VMM process, we assume it is on behalf of a
>> guest. Otherwise, the device is being attached or detached during boot
>> or shutdown of the privileged partition.
>>
>> Lastly, a dummy cpu and vector is used to map interrupt for a direct
>> attached device. This because, once a device is marked for direct attach,
>> hypervisor will not let any interrupts be mapped to host. So it is mapped
>> to guest dummy cpu and dummy vector. This is then correctly mapped during
>> guest boot via the retarget paths.
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>>   arch/arm64/include/asm/mshyperv.h   | 15 +++++
>>   arch/x86/hyperv/irqdomain.c         | 57 +++++++++++++-----
>>   arch/x86/include/asm/mshyperv.h     |  4 ++
>>   drivers/pci/controller/pci-hyperv.c | 91 +++++++++++++++++++++++++----
>>   4 files changed, 142 insertions(+), 25 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/mshyperv.h b/arch/arm64/include/asm/mshyperv.h
>> index b721d3134ab6..27da480f94f6 100644
>> --- a/arch/arm64/include/asm/mshyperv.h
>> +++ b/arch/arm64/include/asm/mshyperv.h
>> @@ -53,6 +53,21 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg)
>>   	return hv_get_msr(reg);
>>   }
>>   
>> +struct irq_data;
>> +struct msi_msg;
>> +struct pci_dev;
>> +static inline void hv_irq_compose_msi_msg(struct irq_data *data,
>> +					  struct msi_msg *msg) {};
>> +static inline int hv_unmap_msi_interrupt(struct pci_dev *pdev,
>> +					struct hv_interrupt_entry *hvirqe)
>> +{
>> +	return -EOPNOTSUPP;
>> +}
>> +static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>> +{
>> +	return false;
>> +}
>> +
>>   /* SMCCC hypercall parameters */
>>   #define HV_SMCCC_FUNC_NUMBER	1
>>   #define HV_FUNC_ID	ARM_SMCCC_CALL_VAL(			\
>> diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
>> index 33017aa0caa4..e6eb457f791e 100644
>> --- a/arch/x86/hyperv/irqdomain.c
>> +++ b/arch/x86/hyperv/irqdomain.c
>> @@ -13,6 +13,16 @@
>>   #include <linux/irqchip/irq-msi-lib.h>
>>   #include <asm/mshyperv.h>
>>   
>> +/*
>> + * For direct attached devices (which use logical device ids), hypervisor will
>> + * not allow mappings to host. But VFIO needs to bind the interrupt at the very
>> + * start before the guest cpu/vector is known. So we use dummy cpu and vector
>> + * to bind in such case, and later when the guest starts, retarget will move it
>> + * to correct guest cpu and vector.
>> + */
>> +#define HV_DDA_DUMMY_CPU      0
>> +#define HV_DDA_DUMMY_VECTOR  32
>> +
>>   static u64 hv_map_interrupt_hcall(u64 ptid, union hv_device_id hv_devid,
>>   				  bool level, int cpu, int vector,
>>   				  struct hv_interrupt_entry *ret_entry)
>> @@ -24,6 +34,11 @@ static u64 hv_map_interrupt_hcall(u64 ptid, union hv_device_id hv_devid,
>>   	u64 status;
>>   	int nr_bank, var_size;
>>   
>> +	if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL) {
>> +		cpu = HV_DDA_DUMMY_CPU;
>> +		vector = HV_DDA_DUMMY_VECTOR;
>> +	}
>> +
>>   	local_irq_save(flags);
>>   
>>   	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> @@ -95,7 +110,8 @@ static int hv_map_interrupt(u64 ptid, union hv_device_id device_id, bool level,
>>   	return hv_result_to_errno(status);
>>   }
>>   
>> -static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry)
>> +static int hv_unmap_interrupt(union hv_device_id hv_devid,
>> +			      struct hv_interrupt_entry *irq_entry)
>>   {
>>   	unsigned long flags;
>>   	struct hv_input_unmap_device_interrupt *input;
>> @@ -103,10 +119,14 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *irq_entry)
>>   
>>   	local_irq_save(flags);
>>   	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> -
>>   	memset(input, 0, sizeof(*input));
>> -	input->partition_id = hv_current_partition_id;
>> -	input->device_id = id;
>> +
>> +	if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
>> +		input->partition_id = hv_iommu_get_curr_partid();
>> +	else
>> +		input->partition_id = hv_current_partition_id;
>> +
>> +	input->device_id = hv_devid.as_uint64;
>>   	input->interrupt_entry = *irq_entry;
>>   
>>   	status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
>> @@ -263,6 +283,7 @@ static u64 hv_build_irq_devid(struct pci_dev *pdev)
>>   int hv_map_msi_interrupt(struct irq_data *data,
>>   			 struct hv_interrupt_entry *out_entry)
>>   {
>> +	u64 ptid;
>>   	struct irq_cfg *cfg = irqd_cfg(data);
>>   	struct hv_interrupt_entry dummy;
>>   	union hv_device_id hv_devid;
>> @@ -275,8 +296,17 @@ int hv_map_msi_interrupt(struct irq_data *data,
>>   	hv_devid.as_uint64 = hv_build_irq_devid(pdev);
>>   	cpu = cpumask_first(irq_data_get_effective_affinity_mask(data));
>>   
>> -	return hv_map_interrupt(hv_current_partition_id, hv_devid, false, cpu,
>> -				cfg->vector, out_entry ? out_entry : &dummy);
>> +	if (hv_devid.device_type == HV_DEVICE_TYPE_LOGICAL)
>> +		if (hv_pcidev_is_attached_dev(pdev))
>> +			ptid = hv_iommu_get_curr_partid();
>> +		else
>> +			/* Device actually on l1vh root, not passthru'd to vm */
> 
> l1vh and root are mutually exclusive partitions.
> If you wanted to highlight that it's l1vh itself and not its child guest, then
> "l1vh parent" term would do.

We've been loosely using "l1vh root" to mean "privilated l1vh" as opposed
to l1vh guests. I think that is fine. l1vh parent is confusing, as it may
also refer to l1vh parent, which would be the host. so as long as the
context is clear, we are ok.

>> +			ptid = hv_current_partition_id;
>> +	else
>> +		ptid = hv_current_partition_id;
> 
> Looks like the only special case is for attached logical devices,
> otherwise hv_current_partition_id is used.
> Can the logic simplified here?

Could be, but at the cost of clear upfront clarity. this nicely tells
the reader that a logical ID has different cases, where as PCI
does not. End instructions are the same.

Thanks,
-Mukesh



> Thanks,
> Stanislav
> 
>> +
>> +	return hv_map_interrupt(ptid, hv_devid, false, cpu, cfg->vector,
>> +				out_entry ? out_entry : &dummy);
>>   }
>>   EXPORT_SYMBOL_GPL(hv_map_msi_interrupt);
>>   
>> @@ -289,10 +319,7 @@ static void entry_to_msi_msg(struct hv_interrupt_entry *entry,
>>   	msg->data = entry->msi_entry.data.as_uint32;
>>   }
>>   
>> -static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
>> -				  struct hv_interrupt_entry *irq_entry);
>> -
>> -static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>> +void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>>   {
>>   	struct hv_interrupt_entry *stored_entry;
>>   	struct irq_cfg *cfg = irqd_cfg(data);
>> @@ -341,16 +368,18 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>>   	data->chip_data = stored_entry;
>>   	entry_to_msi_msg(data->chip_data, msg);
>>   }
>> +EXPORT_SYMBOL_GPL(hv_irq_compose_msi_msg);
>>   
>> -static int hv_unmap_msi_interrupt(struct pci_dev *pdev,
>> -				  struct hv_interrupt_entry *irq_entry)
>> +int hv_unmap_msi_interrupt(struct pci_dev *pdev,
>> +			   struct hv_interrupt_entry *irq_entry)
>>   {
>>   	union hv_device_id hv_devid;
>>   
>>   	hv_devid.as_uint64 = hv_build_irq_devid(pdev);
>>   
>> -	return hv_unmap_interrupt(hv_devid.as_uint64, irq_entry);
>> +	return hv_unmap_interrupt(hv_devid, irq_entry);
>>   }
>> +EXPORT_SYMBOL_GPL(hv_unmap_msi_interrupt);
>>   
>>   /* NB: during map, hv_interrupt_entry is saved via data->chip_data */
>>   static void hv_teardown_msi_irq(struct pci_dev *pdev, struct irq_data *irqd)
>> @@ -486,7 +515,7 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry)
>>   	hv_devid.device_type = HV_DEVICE_TYPE_IOAPIC;
>>   	hv_devid.ioapic.ioapic_id = (u8)ioapic_id;
>>   
>> -	return hv_unmap_interrupt(hv_devid.as_uint64, entry);
>> +	return hv_unmap_interrupt(hv_devid, entry);
>>   }
>>   EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt);
>>   
>> diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
>> index e4ccdbbf1d12..b6facd3a0f5e 100644
>> --- a/arch/x86/include/asm/mshyperv.h
>> +++ b/arch/x86/include/asm/mshyperv.h
>> @@ -204,11 +204,15 @@ static inline u64 hv_iommu_get_curr_partid(void)
>>   #endif	/* CONFIG_HYPERV_IOMMU */
>>   
>>   u64 hv_pci_vmbus_device_id(struct pci_dev *pdev);
>> +void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg);
>> +extern bool hv_no_attdev;
>>   
>>   struct irq_domain *hv_create_pci_msi_domain(void);
>>   
>>   int hv_map_msi_interrupt(struct irq_data *data,
>>   			 struct hv_interrupt_entry *out_entry);
>> +int hv_unmap_msi_interrupt(struct pci_dev *dev,
>> +			   struct hv_interrupt_entry *hvirqe);
>>   int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
>>   		struct hv_interrupt_entry *entry);
>>   int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
>> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
>> index 40f0b06bb966..71d1599dc4a8 100644
>> --- a/drivers/pci/controller/pci-hyperv.c
>> +++ b/drivers/pci/controller/pci-hyperv.c
>> @@ -660,15 +660,17 @@ static void hv_irq_retarget_interrupt(struct irq_data *data)
>>   
>>   	params = *this_cpu_ptr(hyperv_pcpu_input_arg);
>>   	memset(params, 0, sizeof(*params));
>> -	params->partition_id = HV_PARTITION_ID_SELF;
>> +
>> +	if (hv_pcidev_is_attached_dev(pdev))
>> +		params->partition_id = hv_iommu_get_curr_partid();
>> +	else
>> +		params->partition_id = HV_PARTITION_ID_SELF;
>> +
>>   	params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
>> -	params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
>> +	params->int_entry.msi_entry.address.as_uint32 =
>> +						int_desc->address & 0xffffffff;
>>   	params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
>> -	params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
>> -			   (hbus->hdev->dev_instance.b[4] << 16) |
>> -			   (hbus->hdev->dev_instance.b[7] << 8) |
>> -			   (hbus->hdev->dev_instance.b[6] & 0xf8) |
>> -			   PCI_FUNC(pdev->devfn);
>> +	params->device_id = hv_pci_vmbus_device_id(pdev);
>>   	params->int_target.vector = hv_msi_get_int_vector(data);
>>   
>>   	if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
>> @@ -1263,6 +1265,15 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
>>   			mb();
>>   		}
>>   		spin_unlock_irqrestore(&hbus->config_lock, flags);
>> +		/*
>> +		 * Make sure PCI_INTERRUPT_PIN is hard-wired to 0 since it may
>> +		 * be read using a 32bit read which is skipped by the above
>> +		 * emulation.
>> +		 */
>> +		if (PCI_INTERRUPT_PIN >= where &&
>> +		    PCI_INTERRUPT_PIN <= (where + size)) {
>> +			*((char *)val + PCI_INTERRUPT_PIN - where) = 0;
>> +		}
>>   	} else {
>>   		dev_err(dev, "Attempt to read beyond a function's config space.\n");
>>   	}
>> @@ -1731,14 +1742,22 @@ static void hv_msi_free(struct irq_domain *domain, unsigned int irq)
>>   	if (!int_desc)
>>   		return;
>>   
>> -	irq_data->chip_data = NULL;
>>   	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
>>   	if (!hpdev) {
>> +		irq_data->chip_data = NULL;
>>   		kfree(int_desc);
>>   		return;
>>   	}
>>   
>> -	hv_int_desc_free(hpdev, int_desc);
>> +	if (hv_pcidev_is_attached_dev(pdev)) {
>> +		hv_unmap_msi_interrupt(pdev, irq_data->chip_data);
>> +		kfree(irq_data->chip_data);
>> +		irq_data->chip_data = NULL;
>> +	} else {
>> +		irq_data->chip_data = NULL;
>> +		hv_int_desc_free(hpdev, int_desc);
>> +	}
>> +
>>   	put_pcichild(hpdev);
>>   }
>>   
>> @@ -2139,6 +2158,56 @@ static void hv_vmbus_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>>   	msg->data = 0;
>>   }
>>   
>> +/* Compose an msi message for a directly attached device */
>> +static void hv_dda_compose_msi_msg(struct irq_data *irq_data,
>> +				   struct msi_desc *msi_desc,
>> +				   struct msi_msg *msg)
>> +{
>> +	bool multi_msi;
>> +	struct hv_pcibus_device *hbus;
>> +	struct hv_pci_dev *hpdev;
>> +	struct pci_dev *pdev = msi_desc_to_pci_dev(msi_desc);
>> +
>> +	multi_msi = !msi_desc->pci.msi_attrib.is_msix &&
>> +		    msi_desc->nvec_used > 1;
>> +
>> +	if (multi_msi) {
>> +		dev_err(&hbus->hdev->device,
>> +			"Passthru direct attach does not support multi msi\n");
>> +		goto outerr;
>> +	}
>> +
>> +	hbus = container_of(pdev->bus->sysdata, struct hv_pcibus_device,
>> +			    sysdata);
>> +
>> +	hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
>> +	if (!hpdev)
>> +		goto outerr;
>> +
>> +	/* will unmap if needed and also update irq_data->chip_data */
>> +	hv_irq_compose_msi_msg(irq_data, msg);
>> +
>> +	put_pcichild(hpdev);
>> +	return;
>> +
>> +outerr:
>> +	memset(msg, 0, sizeof(*msg));
>> +}
>> +
>> +static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>> +{
>> +	struct pci_dev *pdev;
>> +	struct msi_desc *msi_desc;
>> +
>> +	msi_desc = irq_data_get_msi_desc(data);
>> +	pdev = msi_desc_to_pci_dev(msi_desc);
>> +
>> +	if (hv_pcidev_is_attached_dev(pdev))
>> +		hv_dda_compose_msi_msg(data, msi_desc, msg);
>> +	else
>> +		hv_vmbus_compose_msi_msg(data, msg);
>> +}
>> +
>>   static bool hv_pcie_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
>>   				      struct irq_domain *real_parent, struct msi_domain_info *info)
>>   {
>> @@ -2177,7 +2246,7 @@ static const struct msi_parent_ops hv_pcie_msi_parent_ops = {
>>   /* HW Interrupt Chip Descriptor */
>>   static struct irq_chip hv_msi_irq_chip = {
>>   	.name			= "Hyper-V PCIe MSI",
>> -	.irq_compose_msi_msg	= hv_vmbus_compose_msi_msg,
>> +	.irq_compose_msi_msg	= hv_compose_msi_msg,
>>   	.irq_set_affinity	= irq_chip_set_affinity_parent,
>>   	.irq_ack		= irq_chip_ack_parent,
>>   	.irq_eoi		= irq_chip_eoi_parent,
>> @@ -4096,7 +4165,7 @@ static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
>>   		irq_data = irq_get_irq_data(entry->irq);
>>   		if (WARN_ON_ONCE(!irq_data))
>>   			return -EINVAL;
>> -		hv_vmbus_compose_msi_msg(irq_data, &entry->msg);
>> +		hv_compose_msi_msg(irq_data, &entry->msg);
>>   	}
>>   	return 0;
>>   }
>> -- 
>> 2.51.2.vfs.0.1
>>


^ permalink raw reply

* Re: [PATCH v0 14/15] mshv: Remove mapping of mmio space during map user ioctl
From: Mukesh R @ 2026-01-24  2:12 UTC (permalink / raw)
  To: Nuno Das Neves, linux-kernel, linux-hyperv, linux-arm-kernel,
	iommu, linux-pci, linux-arch
  Cc: kys, haiyangz, wei.liu, decui, longli, catalin.marinas, will,
	tglx, mingo, bp, dave.hansen, hpa, joro, lpieralisi, kwilczynski,
	mani, robh, bhelgaas, arnd, mhklinux, romank
In-Reply-To: <152d9393-4af3-4c2f-a808-367ee7249d36@linux.microsoft.com>

On 1/23/26 10:34, Nuno Das Neves wrote:
> On 1/19/2026 10:42 PM, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> VFIO no longer puts the mmio pfn in vma->vm_pgoff. So, remove code
>> that is using it to map mmio space. It is broken and will cause
>> panic.
> 
> What is the reason for having this as a separate commit from patch 15?
> It seems like removing this code and adding the mmio intercept
> handling could be done in one patch.

Just ease of review and porting patches from this branch to that
branch to that release to this release... I am sure someone would
have asked for this to be a separate patch :).

Thanks,
-Mukesh


>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>>   drivers/hv/mshv_root_main.c | 20 ++++----------------
>>   1 file changed, 4 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>> index 27313419828d..03f3aa9f5541 100644
>> --- a/drivers/hv/mshv_root_main.c
>> +++ b/drivers/hv/mshv_root_main.c
>> @@ -1258,16 +1258,8 @@ static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
>>   }
>>   
>>   /*
>> - * This maps two things: guest RAM and for pci passthru mmio space.
>> - *
>> - * mmio:
>> - *  - vfio overloads vm_pgoff to store the mmio start pfn/spa.
>> - *  - Two things need to happen for mapping mmio range:
>> - *	1. mapped in the uaddr so VMM can access it.
>> - *	2. mapped in the hwpt (gfn <-> mmio phys addr) so guest can access it.
>> - *
>> - *   This function takes care of the second. The first one is managed by vfio,
>> - *   and hence is taken care of via vfio_pci_mmap_fault().
>> + * This is called for both user ram and mmio space. The mmio space is not
>> + * mapped here, but later during intercept.
>>    */
>>   static long
>>   mshv_map_user_memory(struct mshv_partition *partition,
>> @@ -1276,7 +1268,6 @@ mshv_map_user_memory(struct mshv_partition *partition,
>>   	struct mshv_mem_region *region;
>>   	struct vm_area_struct *vma;
>>   	bool is_mmio;
>> -	ulong mmio_pfn;
>>   	long ret;
>>   
>>   	if (mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
>> @@ -1286,7 +1277,6 @@ mshv_map_user_memory(struct mshv_partition *partition,
>>   	mmap_read_lock(current->mm);
>>   	vma = vma_lookup(current->mm, mem.userspace_addr);
>>   	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
>> -	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
>>   	mmap_read_unlock(current->mm);
>>   
>>   	if (!vma)
>> @@ -1313,10 +1303,8 @@ mshv_map_user_memory(struct mshv_partition *partition,
>>   					    HV_MAP_GPA_NO_ACCESS, NULL);
>>   		break;
>>   	case MSHV_REGION_TYPE_MMIO:
>> -		ret = hv_call_map_mmio_pages(partition->pt_id,
>> -					     region->start_gfn,
>> -					     mmio_pfn,
>> -					     region->nr_pages);
>> +		/* mmio mappings are handled later during intercepts */
>> +		ret = 0;
>>   		break;
>>   	}
>>   
> 


^ permalink raw reply

* Re: [PATCH v0 15/15] mshv: Populate mmio mappings for PCI passthru
From: Mukesh R @ 2026-01-24  2:19 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux
In-Reply-To: <aXAxmYm4zbOzGztz@skinsburskii.localdomain>

On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> Upon guest access, in case of missing mmio mapping, the hypervisor
>> generates an unmapped gpa intercept. In this path, lookup the PCI
>> resource pfn for the guest gpa, and ask the hypervisor to map it
>> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
>> and obtained via fixup_user_fault call (similar to KVM).
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>> ---
>>   drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
>>   1 file changed, 115 insertions(+)
>>
>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>> index 03f3aa9f5541..4c8bc7cd0888 100644
>> --- a/drivers/hv/mshv_root_main.c
>> +++ b/drivers/hv/mshv_root_main.c
>> @@ -56,6 +56,14 @@ struct hv_stats_page {
>>   	};
>>   } __packed;
>>   
>> +bool hv_nofull_mmio;   /* don't map entire mmio region upon fault */
>> +static int __init setup_hv_full_mmio(char *str)
>> +{
>> +	hv_nofull_mmio = true;
>> +	return 0;
>> +}
>> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
>> +
>>   struct mshv_root mshv_root;
>>   
>>   enum hv_scheduler_type hv_scheduler_type;
>> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>>   }
>>   
>>   #ifdef CONFIG_X86_64
>> +
>> +/*
>> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
>> + * else just return -errno.
>> + */
>> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
>> +				       u64 *mmio_pfnp)
>> +{
>> +	struct vm_area_struct *vma;
>> +	bool is_mmio;
>> +	u64 uaddr;
>> +	struct mshv_mem_region *mreg;
>> +	struct follow_pfnmap_args pfnmap_args;
>> +	int rc = -EINVAL;
>> +
>> +	/*
>> +	 * Do not allow mem region to be deleted beneath us. VFIO uses
>> +	 * useraddr vma to lookup pci bar pfn.
>> +	 */
>> +	spin_lock(&pt->pt_mem_regions_lock);
>> +
>> +	/* Get the region again under the lock */
>> +	mreg = mshv_partition_region_by_gfn(pt, gfn);
>> +	if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>> +		goto unlock_pt_out;
>> +
>> +	uaddr = mreg->start_uaddr +
>> +		((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
>> +
>> +	mmap_read_lock(current->mm);
> 
> Semaphore can't be taken under spinlock.
> Get it instead.

Yeah, something didn't feel right here and I meant to recheck, now regret
rushing to submit the patch.

Rethinking, I think the pt_mem_regions_lock is not needed to protect
the uaddr because unmap will properly serialize via the mm lock.


>> +	vma = vma_lookup(current->mm, uaddr);
>> +	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> 
> Why this check is needed again?

To make sure region did not change. This check is under lock.

> The region type is stored on the region itself.
> And the type is checked on the caller side.
> 
>> +	if (!is_mmio)
>> +		goto unlock_mmap_out;
>> +
>> +	pfnmap_args.vma = vma;
>> +	pfnmap_args.address = uaddr;
>> +
>> +	rc = follow_pfnmap_start(&pfnmap_args);
>> +	if (rc) {
>> +		rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
>> +				      NULL);
>> +		if (rc)
>> +			goto unlock_mmap_out;
>> +
>> +		rc = follow_pfnmap_start(&pfnmap_args);
>> +		if (rc)
>> +			goto unlock_mmap_out;
>> +	}
>> +
>> +	*mmio_pfnp = pfnmap_args.pfn;
>> +	follow_pfnmap_end(&pfnmap_args);
>> +
>> +unlock_mmap_out:
>> +	mmap_read_unlock(current->mm);
>> +unlock_pt_out:
>> +	spin_unlock(&pt->pt_mem_regions_lock);
>> +	return rc;
>> +}
>> +
>> +/*
>> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
>> + * and resolve if possible.
>> + * Returns: True if valid mmio intercept and it was handled, else false
>> + */
>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
>> +{
>> +	struct hv_message *hvmsg = vp->vp_intercept_msg_page;
>> +	struct hv_x64_memory_intercept_message *msg;
>> +	union hv_x64_memory_access_info accinfo;
>> +	u64 gfn, mmio_spa, numpgs;
>> +	struct mshv_mem_region *mreg;
>> +	int rc;
>> +	struct mshv_partition *pt = vp->vp_partition;
>> +
>> +	msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
>> +	accinfo = msg->memory_access_info;
>> +
>> +	if (!accinfo.gva_gpa_valid)
>> +		return false;
>> +
>> +	/* Do a fast check and bail if non mmio intercept */
>> +	gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
>> +	mreg = mshv_partition_region_by_gfn(pt, gfn);
> 
> This call needs to be protected by the spinlock.

This is sorta fast path to bail. We recheck under partition lock above.

Thanks,
-Mukesh


> Thanks,
> Stanislav
> 
>> +	if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>> +		return false;
>> +
>> +	rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
>> +	if (rc)
>> +		return false;
>> +
>> +	if (!hv_nofull_mmio) {		/* default case */
>> +		gfn = mreg->start_gfn;
>> +		mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
>> +		numpgs = mreg->nr_pages;
>> +	} else
>> +		numpgs = 1;
>> +
>> +	rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
>> +
>> +	return rc == 0;
>> +}
>> +
>>   static struct mshv_mem_region *
>>   mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>>   {
>> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>   
>>   	return ret;
>>   }
>> +
>>   #else  /* CONFIG_X86_64 */
>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
>>   static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>>   #endif /* CONFIG_X86_64 */
>>   
>>   static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>>   {
>>   	switch (vp->vp_intercept_msg_page->header.message_type) {
>> +	case HVMSG_UNMAPPED_GPA:
>> +		return mshv_handle_unmapped_gpa(vp);
>>   	case HVMSG_GPA_INTERCEPT:
>>   		return mshv_handle_gpa_intercept(vp);
>>   	}
>> -- 
>> 2.51.2.vfs.0.1
>>


^ permalink raw reply

* Re: [PATCH v0 00/15] PCI passthru on Hyper-V (Part I)
From: Mukesh R @ 2026-01-24  2:27 UTC (permalink / raw)
  To: Jacob Pan
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux
In-Reply-To: <20260120134933.00004f2a@linux.microsoft.com>

On 1/20/26 13:50, Jacob Pan wrote:
> Hi Mukesh,
> 
> On Mon, 19 Jan 2026 22:42:15 -0800
> Mukesh R <mrathor@linux.microsoft.com> wrote:
> 
>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>
>> Implement passthru of PCI devices to unprivileged virtual machines
>> (VMs) when Linux is running as a privileged VM on Microsoft Hyper-V
>> hypervisor. This support is made to fit within the workings of VFIO
>> framework, and any VMM needing to use it must use the VFIO subsystem.
>> This supports both full device passthru and SR-IOV based VFs.
>>
>> There are 3 cases where Linux can run as a privileged VM (aka MSHV):
>>    Baremetal root (meaning Hyper-V+Linux), L1VH, and Nested.
>>
> I think some introduction/background to L1VH would help.

Ok, i can add something, but l1vh was very well introduced if you
search the mshv commits for "l1vh".

>> At a high level, the hypervisor supports traditional mapped iommu
>> domains that use explicit map and unmap hypercalls for mapping and
>> unmapping guest RAM into the iommu subsystem.
> It may be clearer to state that the hypervisor supports Linux IOMMU
> paging domains through map/unmap hypercalls, mapping GPAs to HPAs using
> stage?2 I/O page tables.

sure.

>> Hyper-V also has a
>> concept of direct attach devices whereby the iommu subsystem simply
>> uses the guest HW page table (ept/npt/..). This series adds support
>> for both, and both are made to work in VFIO type1 subsystem.
>>
> This may warrant introducing a new IOMMU domain feature flag, as it
> performs mappings but does not support map/unmap semantics in the same
> way as a paging domain.

Yeah, I was hoping we can get by for now without it. At least in case of
the cloud hypervisor, entire guest ram is mapped anyways. We can document
it and work on enhancements which are much easier once we have a baseline.
For now, it's a paging domain will all pages pinned.. :).

>> While this Part I focuses on memory mappings, upcoming Part II
>> will focus on irq bypass along with some minor irq remapping
>> updates.
>>
>> This patch series was tested using Cloud Hypervisor verion 48. Qemu
>> support of MSHV is in the works, and that will be extended to include
>> PCI passthru and SR-IOV support also in near future.
>>
>> Based on: 8f0b4cce4481 (origin/hyperv-next)
>>
>> Thanks,
>> -Mukesh
>>
>> Mukesh Rathor (15):
>>    iommu/hyperv: rename hyperv-iommu.c to hyperv-irq.c
>>    x86/hyperv: cosmetic changes in irqdomain.c for readability
>>    x86/hyperv: add insufficient memory support in irqdomain.c
>>    mshv: Provide a way to get partition id if running in a VMM process
>>    mshv: Declarations and definitions for VFIO-MSHV bridge device
>>    mshv: Implement mshv bridge device for VFIO
>>    mshv: Add ioctl support for MSHV-VFIO bridge device
>>    PCI: hv: rename hv_compose_msi_msg to hv_vmbus_compose_msi_msg
>>    mshv: Import data structs around device domains and irq remapping
>>    PCI: hv: Build device id for a VMBus device
>>    x86/hyperv: Build logical device ids for PCI passthru hcalls
>>    x86/hyperv: Implement hyperv virtual iommu
>>    x86/hyperv: Basic interrupt support for direct attached devices
>>    mshv: Remove mapping of mmio space during map user ioctl
>>    mshv: Populate mmio mappings for PCI passthru
>>
>>   MAINTAINERS                         |    1 +
>>   arch/arm64/include/asm/mshyperv.h   |   15 +
>>   arch/x86/hyperv/irqdomain.c         |  314 ++++++---
>>   arch/x86/include/asm/mshyperv.h     |   21 +
>>   arch/x86/kernel/pci-dma.c           |    2 +
>>   drivers/hv/Makefile                 |    3 +-
>>   drivers/hv/mshv_root.h              |   24 +
>>   drivers/hv/mshv_root_main.c         |  296 +++++++-
>>   drivers/hv/mshv_vfio.c              |  210 ++++++
>>   drivers/iommu/Kconfig               |    1 +
>>   drivers/iommu/Makefile              |    2 +-
>>   drivers/iommu/hyperv-iommu.c        | 1004
>> +++++++++++++++++++++------ drivers/iommu/hyperv-irq.c          |
>> 330 +++++++++ drivers/pci/controller/pci-hyperv.c |  207 ++++--
>>   include/asm-generic/mshyperv.h      |    1 +
>>   include/hyperv/hvgdk_mini.h         |   11 +
>>   include/hyperv/hvhdk_mini.h         |  112 +++
>>   include/linux/hyperv.h              |    6 +
>>   include/uapi/linux/mshv.h           |   31 +
>>   19 files changed, 2182 insertions(+), 409 deletions(-)
>>   create mode 100644 drivers/hv/mshv_vfio.c
>>   create mode 100644 drivers/iommu/hyperv-irq.c
>>


^ permalink raw reply

* RE: [PATCH v4 7/7] mshv: Add debugfs to view hypervisor statistics
From: Michael Kelley @ 2026-01-24  4:14 UTC (permalink / raw)
  To: Nuno Das Neves, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, skinsburskii@linux.microsoft.com
  Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	prapal@linux.microsoft.com, mrathor@linux.microsoft.com,
	paekkaladevi@linux.microsoft.com
In-Reply-To: <467e1b51-e335-4280-befa-adc2cf283ab5@linux.microsoft.com>

From: Nuno Das Neves <nunodasneves@linux.microsoft.com> Sent: Friday, January 23, 2026 1:11 PM
> 
> On 1/23/2026 9:09 AM, Michael Kelley wrote:
> > From: Nuno Das Neves <nunodasneves@linux.microsoft.com> Sent: Wednesday, January 21, 2026 1:46 PM
> >>
> >> Introduce a debugfs interface to expose root and child partition stats
> >> when running with mshv_root.
> >>
> >> Create a debugfs directory "mshv" containing 'stats' files organized by
> >> type and id. A stats file contains a number of counters depending on
> >> its type. e.g. an excerpt from a VP stats file:
> >>
> >> TotalRunTime                  : 1997602722
> >> HypervisorRunTime             : 649671371
> >> RemoteNodeRunTime             : 0
> >> NormalizedRunTime             : 1997602721
> >> IdealCpu                      : 0
> >> HypercallsCount               : 1708169
> >> HypercallsTime                : 111914774
> >> PageInvalidationsCount        : 0
> >> PageInvalidationsTime         : 0
> >>
> >> On a root partition with some active child partitions, the entire
> >> directory structure may look like:
> >>
> >> mshv/
> >>   stats             # hypervisor stats
> >>   lp/               # logical processors
> >>     0/              # LP id
> >>       stats         # LP 0 stats
> >>     1/
> >>     2/
> >>     3/
> >>   partition/        # partition stats
> >>     1/              # root partition id
> >>       stats         # root partition stats
> >>       vp/           # root virtual processors
> >>         0/          # root VP id
> >>           stats     # root VP 0 stats
> >>         1/
> >>         2/
> >>         3/
> >>     42/             # child partition id
> >>       stats         # child partition stats
> >>       vp/           # child VPs
> >>         0/          # child VP id
> >>           stats     # child VP 0 stats
> >>         1/
> >>     43/
> >>     55/
> >>
> >> On L1VH, some stats are not present as it does not own the hardware
> >> like the root partition does:
> >> - The hypervisor and lp stats are not present
> >> - L1VH's partition directory is named "self" because it can't get its
> >>   own id
> >> - Some of L1VH's partition and VP stats fields are not populated, because
> >>   it can't map its own HV_STATS_AREA_PARENT page.
> >>
> >> Co-developed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> >> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> >> Co-developed-by: Praveen K Paladugu <prapal@linux.microsoft.com>
> >> Signed-off-by: Praveen K Paladugu <prapal@linux.microsoft.com>
> >> Co-developed-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> >> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> >> Co-developed-by: Purna Pavan Chandra Aekkaladevi
> >> <paekkaladevi@linux.microsoft.com>
> >> Signed-off-by: Purna Pavan Chandra Aekkaladevi <paekkaladevi@linux.microsoft.com>
> >> Co-developed-by: Jinank Jain <jinankjain@microsoft.com>
> >> Signed-off-by: Jinank Jain <jinankjain@microsoft.com>
> >> Signed-off-by: Nuno Das Neves <nunodasneves@linux.microsoft.com>
> >> Reviewed-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> >> ---
> >>  drivers/hv/Makefile         |   1 +
> >>  drivers/hv/hv_counters.c    |   1 +
> >>  drivers/hv/hv_synic.c       | 177 +++++++++
> >
> > This new file hv_synic.c seems to be spurious.  It looks like you unintentionally
> > picked up this new file from the build tree where you were creating the patches
> > for this series.
> >
> 
> Oh, that's embarrassing! Yes, it's a half-baked, unrelated work-in-progress...
> Please ignore!
> 
> <snip>
> >> diff --git a/drivers/hv/mshv_debugfs.c b/drivers/hv/mshv_debugfs.c
> >> new file mode 100644
> >> index 000000000000..72eb0ae44e4b
> >> --- /dev/null
> >> +++ b/drivers/hv/mshv_debugfs.c
> >> @@ -0,0 +1,703 @@
> >> +// SPDX-License-Identifier: GPL-2.0-only
> >> +/*
> >> + * Copyright (c) 2026, Microsoft Corporation.
> >> + *
> >> + * The /sys/kernel/debug/mshv directory contents.
> >> + * Contains various statistics data, provided by the hypervisor.
> >> + *
> >> + * Authors: Microsoft Linux virtualization team
> >> + */
> >> +
> >> +#include <linux/debugfs.h>
> >> +#include <linux/stringify.h>
> >> +#include <asm/mshyperv.h>
> >> +#include <linux/slab.h>
> >> +
> >> +#include "mshv.h"
> >> +#include "mshv_root.h"
> >> +
> >> +#include "hv_counters.c"
> >> +
> >> +#define U32_BUF_SZ 11
> >> +#define U64_BUF_SZ 21
> >> +#define NUM_STATS_AREAS (HV_STATS_AREA_PARENT + 1)
> >
> > This is sort of weak in that it doesn't really guard against
> > changes in the enum that defines HV_STATS_AREA_PARENT.
> > It would work if it were defined as part of the enum, but then
> > you are changing the code coming from the Windows world,
> > which I know is a different problem.
> >
> > The enum is part of the hypervisor ABI and hence isn't likely to
> > change, but it still feels funny to define NUM_STATS_AREAS like
> > this. I would suggest dropping this and just using
> > HV_STATS_AREA_COUNT for the memory allocations even
> > though doing so will allocate space for a stats area pointer
> > that isn't used by this code. It's only a few bytes.
> >
> 
> That would work, but then I'd want to have a comment explaining
> that the decision is intentional, otherwise I think it's just as
> confusing to have unexplained wasted space.

Yes, that's true.

> 
> Alternatively, the usage of SELF and PARENT (but not INTERNAL)
> could be made explicit by a compile-time check, and a comment to
> clarify:
> 
> /* Only support SELF and PARENT areas */
> #define NUM_STATS_AREAS 2
> static_assert(HV_STATS_AREA_SELF == 0 && HV_STATS_AREA_PARENT == 1,
> 	      "SELF and PARENT areas must be usable as indices into an array of size NUM_STATS_AREAS")

Works for me. 

[snip]

> >> +static int __init mshv_debugfs_hv_stats_create(struct dentry *parent)
> >> +{
> >> +	struct dentry *dentry;
> >> +	u64 *stats;
> >> +	int err;
> >> +
> >> +	stats = mshv_hv_stats_map();
> >> +	if (IS_ERR(stats))
> >> +		return PTR_ERR(stats);
> >> +
> >> +	dentry = debugfs_create_file("stats", 0400, parent,
> >> +				     stats, &hv_stats_fops);
> >> +	if (IS_ERR(dentry)) {
> >> +		err = PTR_ERR(dentry);
> >> +		pr_err("%s: failed to create hypervisor stats dentry: %d\n",
> >> +		       __func__, err);
> >> +		goto unmap_hv_stats;
> >> +	}
> >> +
> >> +	mshv_lps_count = num_present_cpus();
> >
> > This method of setting mshv_lps_count, and the iteration through the lp_index
> > in mshv_debugfs_lp_create() and mshv_debugfs_lp_remove(), seems risky. The
> > lp_index gets passed to the hypervisor, so it must be the hypervisor's concept
> > of the lp_index. Is that always guaranteed to be the same as Linux's numbering
> > of the present CPUs? There may be edge cases where it is not. For example, what
> > if Linux in the root partition were booted with the "nosmt" kernel boot option,
> > such that Linux ignores all the 2nd hyper-threads in a core? Could that create
> > a numbering mismatch?
> >
> 
> Ah, this was using the hypervisor stats page before; HvLogicalProcessors. But
> I removed the enum, so I thought this would be a reasonable way to get the number
> of LPs, but I think I'm mistaken.
> 
> For context, there is a fix to how LP and VP numbers are assigned in
> hv_smp_prepare_cpus(), but it's part of a future patchset. That fix ensures the
> LP indices are dense. The code looks like:
> 
> 	/* create dense LPs from 0-N for all apicids */
>         i = next_smallest_apicid(apicids, 0);
>         for (lpidx = 1; i != INT_MAX; lpidx++) {
>                 node = __apicid_to_node[i];
>                 if (node == NUMA_NO_NODE)
>                         node = 0;
> 
>                 /* params: node num, lp index, apic id */
>                 ret = hv_call_add_logical_proc(node, lpidx, i);
>                 BUG_ON(ret);
> 
>                 i = next_smallest_apicid(apicids, i);
>         }
> 
> 	/* create a VP for each present CPU */
>         lpidx = 1;         /* skip BSP cpu 0 */
>         for_each_present_cpu(i) {
>                 if (i == 0)
>                         continue;
> 
>                 /* params: node num, domid, vp index, lp index */
>                 ret = hv_call_create_vp(numa_cpu_node(i),
>                                         hv_current_partition_id, lpidx, lpidx);
>                 BUG_ON(ret);
>                 lpidx++;
>         }
> 
> For what it's worth, with that fix^ I tested with "nosmt" and things worked as I
> would expect: All LPs were displayed in debugfs, but every second LP was not in
> use by Linux, as evidenced by e.g. the number of timer interrupts not going up:
> LpTimerInterrupts            : 1
> 
> Also, only every second VP was created (0, 2, 4, 6...) since the others aren't
> in the present mask at boot.

OK -- I'm glad to hear that someone has thought about this potential
issue. next_smallest_apicid() looks like it should handle the gaps that can occur
in APIC IDs, as I pointed out in my separate email to you about an issue unrelated
to this patch set.

> 
> > Note that for vp_index, we have the hv_vp_index[] array for translating from
> > Linux's concept of a CPU number to Hyper-V's concept of vp_index. For
> > example, mshv_debugfs_parent_partition_create() correctly goes through
> > this translation. And presumably when the VMM code does the
> > MSHV_CREATE_VP ioctl, it is passing in a hypervisor vp_index.
> >
> > Everything may work fine "as is" for the moment, but the lp functions here
> > are still conflating the hypervisor's LP numbering with Linux's CPU numbering,
> > and that seems like a recipe for trouble somewhere down the road. I'm
> > not sure how the hypervisor interprets the "lp_index" part of the identity
> > argument passed to a hypercall, so I'm not sure what the fix is.
> >
> 
> The simplest thing for now might be to bring back that enum value
> HvLogicalProcessors just for this one usage. I'll admit I'm not familiar with
> all the nuances here so there are still probably edge cases here.

Yes, that seems like it would make sense. At least it would have the
hypervisor reporting how many LPs it thinks there are, instead of
getting Linux's view, which might be different. Guaranteeing that the
LP indices are dense can come later since in all likelihood they are
dense by default. It's the potential for an atypical corner case that
I was worried about.

Michael

> 
> >> +
> >> +	return 0;
> >> +
> >> +unmap_hv_stats:
> >> +	mshv_hv_stats_unmap();
> >> +	return err;
> >> +}
> >> +
> <snip>


^ permalink raw reply

* [RFC PATCH 0/3] mshv: Add kexec safety for deposited pages
From: Stanislav Kinsburskii @ 2026-01-24 18:42 UTC (permalink / raw)
  To: pasha.tatashin, rppt, pratyush, kys, haiyangz, wei.liu, decui,
	longli
  Cc: linux-hyperv, linux-kernel

This is an RFC series. The goal is to discuss an approach; it is not
intended for merging as-is.

Microsoft Hypervisor is active, the kernel cannot access pages that have
been deposited to the hypervisor; doing so can trigger a GPF. The “pages
deposited” state is also lost across kexec, and the next kernel has no way
to know which pages are still owned/managed by the hypervisor.

Until a proper handoff of that state to the next kernel exists, kexec (and
liveupdate-based transitions) must be refused whenever there is shared
state that cannot safely survive the transition (in particular, deposited
pages; also running VMs).

What MSHV needs (until proper state preservation is ready) is simply a callback into the driver at the “freeze” stage
so it can:
 - refuse the transition while VMs are running
 - attempt to withdraw deposited pages (L1VH host case)
 - fail the transition if any pages remain deposited

Current liveupdate/LUO interfaces are primarily userspace-facing and are
structured around file/FD preservation. In order to reuse the existing
freeze sequencing with minimal churn, this RFC adapts a small part of the
LUO/file plumbing to allow a kernel user (mshv driver) to register into
that flow. This results in creating a session file solely as a vehicle to
get the freeze callback invoked, which is acknowledged to be awkward.

The intent of posting this is to get feedback on:
  - whether reusing the existing file-based liveupdate hooks for a pure
    in-kernel “freeze veto” is acceptable, or
  - whether we should instead introduce a dedicated kernel-side
    registration API (e.g. a notifier-style “freeze blockers” interface)
    that does not involve files at all.

---

Stanislav Kinsburskii (3):
      luo: Extract file object logic
      mshv: Account pages deposited to hypervisor
      mshv: Block kexec when hypervisor has pages deposited


 MAINTAINERS                      |    1 
 drivers/hv/Kconfig               |    1 
 drivers/hv/Makefile              |    1 
 drivers/hv/hv_proc.c             |    4 +
 drivers/hv/mshv_luo.c            |  113 ++++++++++++++++++++++++++++++++++++++
 drivers/hv/mshv_root.h           |   14 +++++
 drivers/hv/mshv_root_hv_call.c   |    2 +
 drivers/hv/mshv_root_main.c      |    7 ++
 include/linux/kho/abi/mshv.h     |   14 +++++
 include/linux/liveupdate.h       |    3 +
 kernel/liveupdate/luo_file.c     |   55 +++++++++++++++---
 kernel/liveupdate/luo_internal.h |    2 -
 kernel/liveupdate/luo_session.c  |    2 -
 13 files changed, 208 insertions(+), 11 deletions(-)
 create mode 100644 drivers/hv/mshv_luo.c
 create mode 100644 include/linux/kho/abi/mshv.h


^ permalink raw reply

* [RFC PATCH 1/3] luo: Extract file object logic
From: Stanislav Kinsburskii @ 2026-01-24 18:42 UTC (permalink / raw)
  To: pasha.tatashin, rppt, pratyush, kys, haiyangz, wei.liu, decui,
	longli
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <176927917602.26405.4149319776242398706.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Split file object related logic out of the liveupdate implementation so it
can be reused by other kernel subsystems.

This is a preparatory change for enabling use of the luo infrastructure
outside of liveupdate. No functional change intended.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 include/linux/liveupdate.h       |    3 ++
 kernel/liveupdate/luo_file.c     |   55 ++++++++++++++++++++++++++++++++------
 kernel/liveupdate/luo_internal.h |    2 +
 kernel/liveupdate/luo_session.c  |    2 +
 4 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h
index a7f6ee5b6771..0e6e0a1b1489 100644
--- a/include/linux/liveupdate.h
+++ b/include/linux/liveupdate.h
@@ -112,6 +112,9 @@ int liveupdate_reboot(void);
 int liveupdate_register_file_handler(struct liveupdate_file_handler *fh);
 int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh);
 
+int liveupdate_session_create(char *name, struct file **filep);
+void liveupdate_session_destroy(char *name, struct file *file);
+
 #else /* CONFIG_LIVEUPDATE */
 
 static inline bool liveupdate_enabled(void)
diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c
index a32a777f6df8..e46aeb29ab4d 100644
--- a/kernel/liveupdate/luo_file.c
+++ b/kernel/liveupdate/luo_file.c
@@ -250,12 +250,11 @@ static bool luo_token_is_used(struct luo_file_set *file_set, u64 token)
  *         -ENOMEM on memory allocation failure.
  *         Other erros might be returned by .preserve().
  */
-int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
+static int luo_preserve_file(struct luo_file_set *file_set, u64 token, struct file *file)
 {
 	struct liveupdate_file_op_args args = {0};
 	struct liveupdate_file_handler *fh;
 	struct luo_file *luo_file;
-	struct file *file;
 	int err;
 
 	if (luo_token_is_used(file_set, token))
@@ -264,13 +263,9 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
 	if (file_set->count == LUO_FILE_MAX)
 		return -ENOSPC;
 
-	file = fget(fd);
-	if (!file)
-		return -EBADF;
-
 	err = luo_alloc_files_mem(file_set);
 	if (err)
-		goto  err_fput;
+		return err;
 
 	err = -ENOENT;
 	luo_list_for_each_private(fh, &luo_file_handler_list, list) {
@@ -313,8 +308,22 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
 	kfree(luo_file);
 err_free_files_mem:
 	luo_free_files_mem(file_set);
-err_fput:
-	fput(file);
+
+	return err;
+}
+
+int luo_preserve_fd(struct luo_file_set *file_set, u64 token, int fd)
+{
+	struct file *file;
+	int err;
+
+	file = fget(fd);
+	if (!file)
+		return -EBADF;
+
+	err = luo_preserve_file(file_set, token, file);
+	if (err)
+		fput(file);
 
 	return err;
 }
@@ -861,6 +870,34 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
 	return err;
 }
 
+int liveupdate_session_create(char *name, struct file **filep)
+{
+	struct luo_session *session;
+	int err;
+
+	err = luo_session_create(name, filep);
+	if (err)
+		return err;
+
+	session = (*filep)->private_data;
+
+	/* Not need to guard here, session is not yet visible to others */
+	err = luo_preserve_file(&session->file_set, 0, *filep);
+	if (err)
+		goto err_preserve;
+
+	return 0;
+
+err_preserve:
+	fput(*filep);
+	return err;
+}
+
+void liveupdate_session_destroy(char *name, struct file *file)
+{
+	fput(file);
+}
+
 /**
  * liveupdate_unregister_file_handler - Unregister a liveupdate file handler
  * @fh: The file handler to unregister
diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h
index c8973b543d1d..b14391f1514f 100644
--- a/kernel/liveupdate/luo_internal.h
+++ b/kernel/liveupdate/luo_internal.h
@@ -93,7 +93,7 @@ int luo_session_deserialize(void);
 bool luo_session_quiesce(void);
 void luo_session_resume(void);
 
-int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd);
+int luo_preserve_fd(struct luo_file_set *file_set, u64 token, int fd);
 void luo_file_unpreserve_files(struct luo_file_set *file_set);
 int luo_file_freeze(struct luo_file_set *file_set,
 		    struct luo_file_set_ser *file_set_ser);
diff --git a/kernel/liveupdate/luo_session.c b/kernel/liveupdate/luo_session.c
index dbdbc3bd7929..641186d41a22 100644
--- a/kernel/liveupdate/luo_session.c
+++ b/kernel/liveupdate/luo_session.c
@@ -234,7 +234,7 @@ static int luo_session_preserve_fd(struct luo_session *session,
 	int err;
 
 	guard(mutex)(&session->mutex);
-	err = luo_preserve_file(&session->file_set, argp->token, argp->fd);
+	err = luo_preserve_fd(&session->file_set, argp->token, argp->fd);
 	if (err)
 		return err;
 



^ permalink raw reply related

* [RFC PATCH 2/3] mshv: Account pages deposited to hypervisor
From: Stanislav Kinsburskii @ 2026-01-24 18:42 UTC (permalink / raw)
  To: pasha.tatashin, rppt, pratyush, kys, haiyangz, wei.liu, decui,
	longli
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <176927917602.26405.4149319776242398706.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

This is a preparatory change for blocking kexec is there are any pages
deposited, as this information is lost after kexec and the pages aren't
accessible by kernel.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/hv_proc.c           |    4 ++++
 drivers/hv/mshv_root.h         |    1 +
 drivers/hv/mshv_root_hv_call.c |    2 ++
 3 files changed, 7 insertions(+)

diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
index 89870c1b0087..39bbbedb0340 100644
--- a/drivers/hv/hv_proc.c
+++ b/drivers/hv/hv_proc.c
@@ -15,6 +15,8 @@
  */
 #define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1)
 
+atomic_t hv_pages_deposited;
+
 /* Deposits exact number of pages. Must be called with interrupts enabled.  */
 int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
 {
@@ -93,6 +95,8 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
 		goto err_free_allocations;
 	}
 
+	atomic_add(page_count, &hv_pages_deposited);
+
 	ret = 0;
 	goto free_buf;
 
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 3c1d88b36741..c792afce0839 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -319,6 +319,7 @@ int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 a
 extern struct mshv_root mshv_root;
 extern enum hv_scheduler_type hv_scheduler_type;
 extern u8 * __percpu *hv_synic_eventring_tail;
+extern atomic_t hv_pages_deposited;
 
 struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
 					   u64 uaddr, u32 flags);
diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index 06f2bac8039d..4203af5190ee 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -73,6 +73,8 @@ int hv_call_withdraw_memory(u64 count, int node, u64 partition_id)
 		for (i = 0; i < completed; i++)
 			__free_page(pfn_to_page(output_page->gpa_page_list[i]));
 
+		atomic_sub(completed, &hv_pages_deposited);
+
 		if (!hv_result_success(status)) {
 			if (hv_result(status) == HV_STATUS_NO_RESOURCES)
 				status = HV_STATUS_SUCCESS;



^ permalink raw reply related

* [RFC PATCH 3/3] mshv: Block kexec when hypervisor has pages deposited
From: Stanislav Kinsburskii @ 2026-01-24 18:42 UTC (permalink / raw)
  To: pasha.tatashin, rppt, pratyush, kys, haiyangz, wei.liu, decui,
	longli
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <176927917602.26405.4149319776242398706.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Use the live update infrastructure to make kexec safe when Microsoft
Hypervisor is active.

The kernel cannot access hypervisor-deposited pages; any access triggers a
GPF. Until the deposited-page state can be handed over to the next kernel,
kexec must be blocked is there is any hsared state between kernel and
hypervisor.

During the freeze stage:
- Refuse the transition while VMs are running
- Withdraw all pages from L1VH host (guest pages are withdrawn
  upon guest shutdown)
- Verify no deposited pages remain

Abort kexec if any of the above checks fail.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 MAINTAINERS                  |    1 
 drivers/hv/Kconfig           |    1 
 drivers/hv/Makefile          |    1 
 drivers/hv/mshv_luo.c        |  113 ++++++++++++++++++++++++++++++++++++++++++
 drivers/hv/mshv_root.h       |   13 +++++
 drivers/hv/mshv_root_main.c  |    7 +++
 include/linux/kho/abi/mshv.h |   14 +++++
 7 files changed, 150 insertions(+)
 create mode 100644 drivers/hv/mshv_luo.c
 create mode 100644 include/linux/kho/abi/mshv.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 5b11839cba9d..d625a1c111e2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11756,6 +11756,7 @@ F:	include/hyperv/hvgdk_mini.h
 F:	include/hyperv/hvhdk.h
 F:	include/hyperv/hvhdk_mini.h
 F:	include/linux/hyperv.h
+F:	include/linux/kho/abi/mshv.h
 F:	include/net/mana
 F:	include/uapi/linux/hyperv.h
 F:	include/uapi/rdma/mana-abi.h
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 7937ac0cbd0f..94887b8b92b5 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -78,6 +78,7 @@ config MSHV_ROOT
 	select VIRT_XFER_TO_GUEST_WORK
 	select HMM_MIRROR
 	select MMU_NOTIFIER
+	select LIVEUPDATE
 	default n
 	help
 	  Select this option to enable support for booting and running as root
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index a49f93c2d245..73258fb811eb 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -15,6 +15,7 @@ hv_vmbus-$(CONFIG_HYPERV_TESTING)	+= hv_debugfs.o
 hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o
 mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \
 	       mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o
+mshv_root-$(CONFIG_LIVEUPDATE) += mshv_luo.o
 mshv_vtl-y := mshv_vtl_main.o
 
 # Code that must be built-in
diff --git a/drivers/hv/mshv_luo.c b/drivers/hv/mshv_luo.c
new file mode 100644
index 000000000000..eed7755fc27e
--- /dev/null
+++ b/drivers/hv/mshv_luo.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2026, Microsoft Corporation.
+ *
+ * Live update orchestration management for mshv_root module.
+ *
+ * Author: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/liveupdate.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kho/abi/mshv.h>
+#include <asm/mshyperv.h>
+#include "mshv_root.h"
+
+static struct file *mshv_luo_file;
+
+static void mshv_luo_finish(struct liveupdate_file_op_args *args)
+{
+}
+
+static int mshv_luo_retrieve(struct liveupdate_file_op_args *args)
+{
+	return 0;
+}
+
+static int mshv_luo_freeze(struct liveupdate_file_op_args *args)
+{
+	if (!hash_empty(mshv_root.pt_htable)) {
+		pr_warn("mshv: Cannot perform live update while VMs are active\n");
+		return -EBUSY;
+	}
+
+	if (hv_l1vh_partition()) {
+		int err;
+
+		/* Attempt to withdraw all the deposited pages */
+		err = hv_call_withdraw_memory(U64_MAX, NUMA_NO_NODE,
+					      hv_current_partition_id);
+		if (err) {
+			pr_err("mshv: Failed to withdraw memory from L1 virtualization: %d\n", err);
+			return err;
+		}
+	}
+
+	if (atomic_read(&hv_pages_deposited)) {
+		pr_warn("mshv: Cannot perform live update while pages are deposited\n");
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static void mshv_luo_unpreserve(struct liveupdate_file_op_args *args)
+{
+}
+
+static int mshv_luo_preserve(struct liveupdate_file_op_args *args)
+{
+	return 0;
+}
+
+static bool mshv_luo_can_preserve(struct liveupdate_file_handler *handler,
+				  struct file *file)
+{
+	return file == mshv_luo_file;
+}
+
+static const struct liveupdate_file_ops mshv_luo_file_ops = {
+	.can_preserve = mshv_luo_can_preserve,
+	.preserve = mshv_luo_preserve,
+	.unpreserve = mshv_luo_unpreserve,
+	.retrieve = mshv_luo_retrieve,
+	.freeze = mshv_luo_freeze,
+	.finish = mshv_luo_finish,
+	.owner = THIS_MODULE,
+};
+
+static struct liveupdate_file_handler mshv_luo_handler = {
+	.ops = &mshv_luo_file_ops,
+	.compatible = MSHV_LUO_FH_COMPATIBLE,
+};
+
+int __init mshv_luo_init(void)
+{
+	int err;
+
+	err = liveupdate_register_file_handler(&mshv_luo_handler);
+	if (err && err != -EOPNOTSUPP) {
+		pr_err("Could not register luo filesystem handler: %pe\n",
+		       ERR_PTR(err));
+		return err;
+	}
+
+	err = liveupdate_session_create("mshv_root", &mshv_luo_file);
+	if (err)
+		goto err_session;
+
+	pr_info("mshv_root live update handler registered\n");
+	return 0;
+
+err_session:
+	liveupdate_unregister_file_handler(&mshv_luo_handler);
+	return err;
+}
+
+void __exit mshv_luo_exit(void)
+{
+	if (mshv_luo_file)
+		fput(mshv_luo_file);
+	liveupdate_unregister_file_handler(&mshv_luo_handler);
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index c792afce0839..89d5ece0b538 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -17,6 +17,7 @@
 #include <linux/build_bug.h>
 #include <linux/mmu_notifier.h>
 #include <uapi/linux/mshv.h>
+#include <hyperv/hvhdk.h>
 
 /*
  * Hypervisor must be between these version numbers (inclusive)
@@ -334,4 +335,16 @@ bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
 void mshv_region_movable_fini(struct mshv_mem_region *region);
 bool mshv_region_movable_init(struct mshv_mem_region *region);
 
+#if IS_ENABLED(CONFIG_LIVEUPDATE)
+int __init mshv_luo_init(void);
+void __exit mshv_luo_exit(void);
+#else
+static inline int mshv_luo_init(void)
+{
+	return 0;
+}
+
+static inline void mshv_luo_exit(void) { }
+#endif
+
 #endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 5fc572e31cd7..c0274bbc65ac 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -2330,6 +2330,10 @@ static int __init mshv_parent_partition_init(void)
 	if (ret)
 		goto deinit_root_scheduler;
 
+	ret = mshv_luo_init();
+	if (ret)
+		goto deinit_irqfd_wq;
+
 	spin_lock_init(&mshv_root.pt_ht_lock);
 	hash_init(mshv_root.pt_htable);
 
@@ -2337,6 +2341,8 @@ static int __init mshv_parent_partition_init(void)
 
 	return 0;
 
+deinit_irqfd_wq:
+	mshv_irqfd_wq_cleanup();
 deinit_root_scheduler:
 	root_scheduler_deinit();
 exit_partition:
@@ -2356,6 +2362,7 @@ static void __exit mshv_parent_partition_exit(void)
 	hv_setup_mshv_handler(NULL);
 	mshv_port_table_fini();
 	misc_deregister(&mshv_dev);
+	mshv_luo_exit();
 	mshv_irqfd_wq_cleanup();
 	root_scheduler_deinit();
 	if (hv_root_partition())
diff --git a/include/linux/kho/abi/mshv.h b/include/linux/kho/abi/mshv.h
new file mode 100644
index 000000000000..e6ae5a731802
--- /dev/null
+++ b/include/linux/kho/abi/mshv.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2026, Microsoft Corporation.
+ *
+ * Author: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
+ */
+
+#ifndef _LINUX_KHO_ABI_MSHV_H
+#define _LINUX_KHO_ABI_MSHV_H
+
+/* The compatibility string for mshv file handler */
+#define MSHV_LUO_FH_COMPATIBLE "mshv-v1"
+
+#endif /* _LINUX_KHO_ABI_MSHV_H */



^ permalink raw reply related

* Re: [PATCH] mshv: Make MSHV mutually exclusive with KEXEC
From: Stanislav Kinsburskii @ 2026-01-25 22:39 UTC (permalink / raw)
  To: Mukesh R
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <549041d1-360d-d34c-4e3b-62802346acaa@linux.microsoft.com>

On Fri, Jan 23, 2026 at 04:16:33PM -0800, Mukesh R wrote:
> On 1/23/26 14:20, Stanislav Kinsburskii wrote:
> > The MSHV driver deposits kernel-allocated pages to the hypervisor during
> > runtime and never withdraws them. This creates a fundamental incompatibility
> > with KEXEC, as these deposited pages remain unavailable to the new kernel
> > loaded via KEXEC, leading to potential system crashes upon kernel accessing
> > hypervisor deposited pages.
> > 
> > Make MSHV mutually exclusive with KEXEC until proper page lifecycle
> > management is implemented.
> > 
> > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > ---
> >   drivers/hv/Kconfig |    1 +
> >   1 file changed, 1 insertion(+)
> > 
> > diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> > index 7937ac0cbd0f..cfd4501db0fa 100644
> > --- a/drivers/hv/Kconfig
> > +++ b/drivers/hv/Kconfig
> > @@ -74,6 +74,7 @@ config MSHV_ROOT
> >   	# e.g. When withdrawing memory, the hypervisor gives back 4k pages in
> >   	# no particular order, making it impossible to reassemble larger pages
> >   	depends on PAGE_SIZE_4KB
> > +	depends on !KEXEC
> >   	select EVENTFD
> >   	select VIRT_XFER_TO_GUEST_WORK
> >   	select HMM_MIRROR
> > 
> > 
> 
> Will this affect CRASH kexec? I see few CONFIG_CRASH_DUMP in kexec.c
> implying that crash dump might be involved. Or did you test kdump
> and it was fine?
> 

Yes, it will. Crash kexec depends on normal kexec functionality, so it
will be affected as well.

Thanks,
Stanislav

> Thanks,
> -Mukesh

^ permalink raw reply

* Re: [PATCH 2/4] mshv: Introduce hv_deposit_memory helper functions
From: Stanislav Kinsburskii @ 2026-01-25 22:41 UTC (permalink / raw)
  To: Mukesh R
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <23f52e20-f156-a2aa-ff04-00dc55238c51@linux.microsoft.com>

On Fri, Jan 23, 2026 at 04:33:39PM -0800, Mukesh R wrote:
> On 1/22/26 17:35, Stanislav Kinsburskii wrote:
> > Introduce hv_deposit_memory_node() and hv_deposit_memory() helper
> > functions to handle memory deposition with proper error handling.
> > 
> > The new hv_deposit_memory_node() function takes the hypervisor status
> > as a parameter and validates it before depositing pages. It checks for
> > HV_STATUS_INSUFFICIENT_MEMORY specifically and returns an error for
> > unexpected status codes.
> > 
> > This is a precursor patch to new out-of-memory error codes support.
> > No functional changes intended.
> > 
> > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > ---
> >   drivers/hv/hv_proc.c           |   22 ++++++++++++++++++++--
> >   drivers/hv/mshv_root_hv_call.c |   25 +++++++++----------------
> >   drivers/hv/mshv_root_main.c    |    3 +--
> >   include/asm-generic/mshyperv.h |   10 ++++++++++
> >   4 files changed, 40 insertions(+), 20 deletions(-)
> > 
> > diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
> > index 80c66d1c74d5..c0c2bfc80d77 100644
> > --- a/drivers/hv/hv_proc.c
> > +++ b/drivers/hv/hv_proc.c
> > @@ -110,6 +110,23 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> >   }
> >   EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
> > +int hv_deposit_memory_node(int node, u64 partition_id,
> > +			   u64 hv_status)
> > +{
> > +	u32 num_pages;
> > +
> > +	switch (hv_result(hv_status)) {
> > +	case HV_STATUS_INSUFFICIENT_MEMORY:
> > +		num_pages = 1;
> > +		break;
> > +	default:
> > +		hv_status_err(hv_status, "Unexpected!\n");
> > +		return -ENOMEM;
> > +	}
> > +	return hv_call_deposit_pages(node, partition_id, num_pages);
> > +}
> > +EXPORT_SYMBOL_GPL(hv_deposit_memory_node);
> > +
> 
> Different hypercalls may want to deposit different number of pages in one
> shot. As feature evolves, page sizes get mixed, we'd almost need that
> flexibility. So, imo, either we just don't do this for now, or add num pages
> parameter to be passed down.
> 

What you do mean by "page sizes get mixed"?
A helper to deposit num pages already exists: its
hv_call_deposit_pages().

Thanks,
Stanislav

> Thanks,
> -Mukesh
> 
> 
> 
> >   bool hv_result_oom(u64 status)
> >   {
> >   	switch (hv_result(status)) {
> > @@ -155,7 +172,8 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
> >   			}
> >   			break;
> >   		}
> > -		ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
> > +		ret = hv_deposit_memory_node(node, hv_current_partition_id,
> > +					     status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -197,7 +215,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
> >   			}
> >   			break;
> >   		}
> > -		ret = hv_call_deposit_pages(node, partition_id, 1);
> > +		ret = hv_deposit_memory_node(node, partition_id, status);
> >   	} while (!ret);
> > diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> > index 58c5cbf2e567..06f2bac8039d 100644
> > --- a/drivers/hv/mshv_root_hv_call.c
> > +++ b/drivers/hv/mshv_root_hv_call.c
> > @@ -123,8 +123,7 @@ int hv_call_create_partition(u64 flags,
> >   			break;
> >   		}
> >   		local_irq_restore(irq_flags);
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> > -					    hv_current_partition_id, 1);
> > +		ret = hv_deposit_memory(hv_current_partition_id, status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -151,7 +150,7 @@ int hv_call_initialize_partition(u64 partition_id)
> >   			ret = hv_result_to_errno(status);
> >   			break;
> >   		}
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
> > +		ret = hv_deposit_memory(partition_id, status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -465,8 +464,7 @@ int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
> >   		}
> >   		local_irq_restore(flags);
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> > -					    partition_id, 1);
> > +		ret = hv_deposit_memory(partition_id, status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -525,8 +523,7 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
> >   		}
> >   		local_irq_restore(flags);
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> > -					    partition_id, 1);
> > +		ret = hv_deposit_memory(partition_id, status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -573,7 +570,7 @@ static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
> >   		local_irq_restore(flags);
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
> > +		ret = hv_deposit_memory(partition_id, status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -722,8 +719,7 @@ hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
> >   			ret = hv_result_to_errno(status);
> >   			break;
> >   		}
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1);
> > -
> > +		ret = hv_deposit_memory(port_partition_id, status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -776,8 +772,7 @@ hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
> >   			ret = hv_result_to_errno(status);
> >   			break;
> >   		}
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> > -					    connection_partition_id, 1);
> > +		ret = hv_deposit_memory(connection_partition_id, status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -848,8 +843,7 @@ static int hv_call_map_stats_page2(enum hv_stats_object_type type,
> >   			break;
> >   		}
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> > -					    hv_current_partition_id, 1);
> > +		ret = hv_deposit_memory(hv_current_partition_id, status);
> >   	} while (!ret);
> >   	return ret;
> > @@ -885,8 +879,7 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type,
> >   			return ret;
> >   		}
> > -		ret = hv_call_deposit_pages(NUMA_NO_NODE,
> > -					    hv_current_partition_id, 1);
> > +		ret = hv_deposit_memory(hv_current_partition_id, status);
> >   		if (ret)
> >   			return ret;
> >   	} while (!ret);
> > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > index f4697497f83e..5fc572e31cd7 100644
> > --- a/drivers/hv/mshv_root_main.c
> > +++ b/drivers/hv/mshv_root_main.c
> > @@ -264,8 +264,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
> >   		if (!hv_result_oom(status))
> >   			ret = hv_result_to_errno(status);
> >   		else
> > -			ret = hv_call_deposit_pages(NUMA_NO_NODE,
> > -						    pt_id, 1);
> > +			ret = hv_deposit_memory(pt_id, status);
> >   	} while (!ret);
> >   	args.status = hv_result(status);
> > diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> > index b73352a7fc9e..c8e8976839f8 100644
> > --- a/include/asm-generic/mshyperv.h
> > +++ b/include/asm-generic/mshyperv.h
> > @@ -344,6 +344,7 @@ static inline bool hv_parent_partition(void)
> >   }
> >   bool hv_result_oom(u64 status);
> > +int hv_deposit_memory_node(int node, u64 partition_id, u64 status);
> >   int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
> >   int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
> >   int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
> > @@ -353,6 +354,10 @@ static inline bool hv_root_partition(void) { return false; }
> >   static inline bool hv_l1vh_partition(void) { return false; }
> >   static inline bool hv_parent_partition(void) { return false; }
> >   static inline bool hv_result_oom(u64 status) { return false; }
> > +static inline int hv_deposit_memory_node(int node, u64 partition_id, u64 status)
> > +{
> > +	return -EOPNOTSUPP;
> > +}
> >   static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> >   {
> >   	return -EOPNOTSUPP;
> > @@ -367,6 +372,11 @@ static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u3
> >   }
> >   #endif /* CONFIG_MSHV_ROOT */
> > +static inline int hv_deposit_memory(u64 partition_id, u64 status)
> > +{
> > +	return hv_deposit_memory_node(NUMA_NO_NODE, partition_id, status);
> > +}
> > +
> >   #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
> >   u8 __init get_vtl(void);
> >   #else
> > 
> > 

^ permalink raw reply

* Re: [PATCH v0 12/15] x86/hyperv: Implement hyperv virtual iommu
From: Stanislav Kinsburskii @ 2026-01-26 15:57 UTC (permalink / raw)
  To: Mukesh R
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux, romank
In-Reply-To: <54fd73b9-ade6-f1bb-08fc-17571aeadb20@linux.microsoft.com>

On Fri, Jan 23, 2026 at 05:26:19PM -0800, Mukesh R wrote:
> On 1/20/26 16:12, Stanislav Kinsburskii wrote:
> > On Mon, Jan 19, 2026 at 10:42:27PM -0800, Mukesh R wrote:
> > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > 
> > > Add a new file to implement management of device domains, mapping and
> > > unmapping of iommu memory, and other iommu_ops to fit within the VFIO
> > > framework for PCI passthru on Hyper-V running Linux as root or L1VH
> > > parent. This also implements direct attach mechanism for PCI passthru,
> > > and it is also made to work within the VFIO framework.
> > > 
> > > At a high level, during boot the hypervisor creates a default identity
> > > domain and attaches all devices to it. This nicely maps to Linux iommu
> > > subsystem IOMMU_DOMAIN_IDENTITY domain. As a result, Linux does not
> > > need to explicitly ask Hyper-V to attach devices and do maps/unmaps
> > > during boot. As mentioned previously, Hyper-V supports two ways to do
> > > PCI passthru:
> > > 
> > >    1. Device Domain: root must create a device domain in the hypervisor,
> > >       and do map/unmap hypercalls for mapping and unmapping guest RAM.
> > >       All hypervisor communications use device id of type PCI for
> > >       identifying and referencing the device.
> > > 
> > >    2. Direct Attach: the hypervisor will simply use the guest's HW
> > >       page table for mappings, thus the host need not do map/unmap
> > >       device memory hypercalls. As such, direct attach passthru setup
> > >       during guest boot is extremely fast. A direct attached device
> > >       must be referenced via logical device id and not via the PCI
> > >       device id.
> > > 
> > > At present, L1VH root/parent only supports direct attaches. Also direct
> > > attach is default in non-L1VH cases because there are some significant
> > > performance issues with device domain implementation currently for guests
> > > with higher RAM (say more than 8GB), and that unfortunately cannot be
> > > addressed in the short term.
> > > 
> > 
> > <snip>
> >

<snip>

> > > +static void hv_iommu_detach_dev(struct iommu_domain *immdom, struct device *dev)
> > > +{
> > > +	struct pci_dev *pdev;
> > > +	struct hv_domain *hvdom = to_hv_domain(immdom);
> > > +
> > > +	/* See the attach function, only PCI devices for now */
> > > +	if (!dev_is_pci(dev))
> > > +		return;
> > > +
> > > +	if (hvdom->num_attchd == 0)
> > > +		pr_warn("Hyper-V: num_attchd is zero (%s)\n", dev_name(dev));
> > > +
> > > +	pdev = to_pci_dev(dev);
> > > +
> > > +	if (hvdom->attached_dom) {
> > > +		hv_iommu_det_dev_from_guest(hvdom, pdev);
> > > +
> > > +		/* Do not reset attached_dom, hv_iommu_unmap_pages happens
> > > +		 * next.
> > > +		 */
> > > +	} else {
> > > +		hv_iommu_det_dev_from_dom(hvdom, pdev);
> > > +	}
> > > +
> > > +	hvdom->num_attchd--;
> > 
> > Shouldn't this be modified iff the detach succeeded?
> 
> We want to still free the domain and not let it get stuck. The purpose
> is more to make sure detach was called before domain free.
> 

How can one debug subseqent errors if num_attchd is decremented
unconditionally? In reality the device is left attached, but the related
kernel metadata is gone.

> > > +}
> > > +
> > > +static int hv_iommu_add_tree_mapping(struct hv_domain *hvdom,
> > > +				     unsigned long iova, phys_addr_t paddr,
> > > +				     size_t size, u32 flags)
> > > +{
> > > +	unsigned long irqflags;
> > > +	struct hv_iommu_mapping *mapping;
> > > +
> > > +	mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
> > > +	if (!mapping)
> > > +		return -ENOMEM;
> > > +
> > > +	mapping->paddr = paddr;
> > > +	mapping->iova.start = iova;
> > > +	mapping->iova.last = iova + size - 1;
> > > +	mapping->flags = flags;
> > > +
> > > +	spin_lock_irqsave(&hvdom->mappings_lock, irqflags);
> > > +	interval_tree_insert(&mapping->iova, &hvdom->mappings_tree);
> > > +	spin_unlock_irqrestore(&hvdom->mappings_lock, irqflags);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static size_t hv_iommu_del_tree_mappings(struct hv_domain *hvdom,
> > > +					unsigned long iova, size_t size)
> > > +{
> > > +	unsigned long flags;
> > > +	size_t unmapped = 0;
> > > +	unsigned long last = iova + size - 1;
> > > +	struct hv_iommu_mapping *mapping = NULL;
> > > +	struct interval_tree_node *node, *next;
> > > +
> > > +	spin_lock_irqsave(&hvdom->mappings_lock, flags);
> > > +	next = interval_tree_iter_first(&hvdom->mappings_tree, iova, last);
> > > +	while (next) {
> > > +		node = next;
> > > +		mapping = container_of(node, struct hv_iommu_mapping, iova);
> > > +		next = interval_tree_iter_next(node, iova, last);
> > > +
> > > +		/* Trying to split a mapping? Not supported for now. */
> > > +		if (mapping->iova.start < iova)
> > > +			break;
> > > +
> > > +		unmapped += mapping->iova.last - mapping->iova.start + 1;
> > > +
> > > +		interval_tree_remove(node, &hvdom->mappings_tree);
> > > +		kfree(mapping);
> > > +	}
> > > +	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
> > > +
> > > +	return unmapped;
> > > +}
> > > +
> > > +/* Return: must return exact status from the hypercall without changes */
> > > +static u64 hv_iommu_map_pgs(struct hv_domain *hvdom,
> > > +			    unsigned long iova, phys_addr_t paddr,
> > > +			    unsigned long npages, u32 map_flags)
> > > +{
> > > +	u64 status;
> > > +	int i;
> > > +	struct hv_input_map_device_gpa_pages *input;
> > > +	unsigned long flags, pfn = paddr >> HV_HYP_PAGE_SHIFT;
> > > +
> > > +	local_irq_save(flags);
> > > +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > > +	memset(input, 0, sizeof(*input));
> > > +
> > > +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
> > > +	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
> > > +	input->device_domain.domain_id.id = hvdom->domid_num;
> > > +	input->map_flags = map_flags;
> > > +	input->target_device_va_base = iova;
> > > +
> > > +	pfn = paddr >> HV_HYP_PAGE_SHIFT;
> > > +	for (i = 0; i < npages; i++, pfn++)
> > > +		input->gpa_page_list[i] = pfn;
> > > +
> > > +	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_GPA_PAGES, npages, 0,
> > > +				     input, NULL);
> > > +
> > > +	local_irq_restore(flags);
> > > +	return status;
> > > +}
> > > +
> > > +/*
> > > + * The core VFIO code loops over memory ranges calling this function with
> > > + * the largest size from HV_IOMMU_PGSIZES. cond_resched() is in vfio_iommu_map.
> > > + */
> > > +static int hv_iommu_map_pages(struct iommu_domain *immdom, ulong iova,
> > > +			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
> > > +			      int prot, gfp_t gfp, size_t *mapped)
> > > +{
> > > +	u32 map_flags;
> > > +	int ret;
> > > +	u64 status;
> > > +	unsigned long npages, done = 0;
> > > +	struct hv_domain *hvdom = to_hv_domain(immdom);
> > > +	size_t size = pgsize * pgcount;
> > > +
> > > +	map_flags = HV_MAP_GPA_READABLE;	/* required */
> > > +	map_flags |= prot & IOMMU_WRITE ? HV_MAP_GPA_WRITABLE : 0;
> > > +
> > > +	ret = hv_iommu_add_tree_mapping(hvdom, iova, paddr, size, map_flags);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	if (hvdom->attached_dom) {
> > > +		*mapped = size;
> > > +		return 0;
> > > +	}
> > > +
> > > +	npages = size >> HV_HYP_PAGE_SHIFT;
> > > +	while (done < npages) {
> > > +		ulong completed, remain = npages - done;
> > > +
> > > +		status = hv_iommu_map_pgs(hvdom, iova, paddr, remain,
> > > +					  map_flags);
> > > +
> > > +		completed = hv_repcomp(status);
> > > +		done = done + completed;
> > > +		iova = iova + (completed << HV_HYP_PAGE_SHIFT);
> > > +		paddr = paddr + (completed << HV_HYP_PAGE_SHIFT);
> > > +
> > > +		if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
> > > +			ret = hv_call_deposit_pages(NUMA_NO_NODE,
> > > +						    hv_current_partition_id,
> > > +						    256);
> > > +			if (ret)
> > > +				break;
> > > +		}
> > > +		if (!hv_result_success(status))
> > > +			break;
> > > +	}
> > > +
> > > +	if (!hv_result_success(status)) {
> > > +		size_t done_size = done << HV_HYP_PAGE_SHIFT;
> > > +
> > > +		hv_status_err(status, "pgs:%lx/%lx iova:%lx\n",
> > > +			      done, npages, iova);
> > > +		/*
> > > +		 * lookup tree has all mappings [0 - size-1]. Below unmap will
> > > +		 * only remove from [0 - done], we need to remove second chunk
> > > +		 * [done+1 - size-1].
> > > +		 */
> > > +		hv_iommu_del_tree_mappings(hvdom, iova, size - done_size);
> > > +		hv_iommu_unmap_pages(immdom, iova - done_size, pgsize,
> > > +				     done, NULL);
> > > +		if (mapped)
> > > +			*mapped = 0;
> > > +	} else
> > > +		if (mapped)
> > > +			*mapped = size;
> > > +
> > > +	return hv_result_to_errno(status);
> > > +}
> > > +
> > > +static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom, ulong iova,
> > > +				   size_t pgsize, size_t pgcount,
> > > +				   struct iommu_iotlb_gather *gather)
> > > +{
> > > +	unsigned long flags, npages;
> > > +	struct hv_input_unmap_device_gpa_pages *input;
> > > +	u64 status;
> > > +	struct hv_domain *hvdom = to_hv_domain(immdom);
> > > +	size_t unmapped, size = pgsize * pgcount;
> > > +
> > > +	unmapped = hv_iommu_del_tree_mappings(hvdom, iova, size);
> > > +	if (unmapped < size)
> > > +		pr_err("%s: could not delete all mappings (%lx:%lx/%lx)\n",
> > > +		       __func__, iova, unmapped, size);
> > > +
> > > +	if (hvdom->attached_dom)
> > > +		return size;
> > > +
> > > +	npages = size >> HV_HYP_PAGE_SHIFT;
> > > +
> > > +	local_irq_save(flags);
> > > +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > > +	memset(input, 0, sizeof(*input));
> > > +
> > > +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
> > > +	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
> > > +	input->device_domain.domain_id.id = hvdom->domid_num;
> > > +	input->target_device_va_base = iova;
> > > +
> > > +	status = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_GPA_PAGES, npages,
> > > +				     0, input, NULL);
> > > +	local_irq_restore(flags);
> > > +
> > > +	if (!hv_result_success(status))
> > > +		hv_status_err(status, "\n");
> > > +
> > 
> > There is some inconsistency in namings and behaviour of paired
> > functions:
> > 1. The pair of hv_iommu_unmap_pages is called hv_iommu_map_pgs
> 
> The pair of hv_iommu_unmap_pages is hv_iommu_map_pages right above.
> hv_iommu_map_pgs could be renamed to hv_iommu_map_pgs_hcall I suppose.
> 

Hv_iommu_map_pages is a wrapper around hv_iommu_map_pgs while
hv_iommu_unmap_pages is a wrapper around the correspodning hypercall.
That's the inconsistency I meant.

> > 2. hv_iommu_map_pgs doesn't print status in case of error.
> 
> it does:
>             hv_status_err(status, "\n");  <==============

It does not. I guess you are confusing it with some other function.
Here is the function:

> 
> 
> > It would be much better to keep this code consistent.
> > 
> > > +	return unmapped;
> > > +}
> > > +
> > > +static phys_addr_t hv_iommu_iova_to_phys(struct iommu_domain *immdom,
> > > +					 dma_addr_t iova)
> > > +{
> > > +	u64 paddr = 0;
> > > +	unsigned long flags;
> > > +	struct hv_iommu_mapping *mapping;
> > > +	struct interval_tree_node *node;
> > > +	struct hv_domain *hvdom = to_hv_domain(immdom);
> > > +
> > > +	spin_lock_irqsave(&hvdom->mappings_lock, flags);
> > > +	node = interval_tree_iter_first(&hvdom->mappings_tree, iova, iova);
> > > +	if (node) {
> > > +		mapping = container_of(node, struct hv_iommu_mapping, iova);
> > > +		paddr = mapping->paddr + (iova - mapping->iova.start);
> > > +	}
> > > +	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
> > > +
> > > +	return paddr;
> > > +}
> > > +
> > > +/*
> > > + * Currently, hypervisor does not provide list of devices it is using
> > > + * dynamically. So use this to allow users to manually specify devices that
> > > + * should be skipped. (eg. hypervisor debugger using some network device).
> > > + */
> > > +static struct iommu_device *hv_iommu_probe_device(struct device *dev)
> > > +{
> > > +	if (!dev_is_pci(dev))
> > > +		return ERR_PTR(-ENODEV);
> > > +
> > > +	if (pci_devs_to_skip && *pci_devs_to_skip) {
> > > +		int rc, pos = 0;
> > > +		int parsed;
> > > +		int segment, bus, slot, func;
> > > +		struct pci_dev *pdev = to_pci_dev(dev);
> > > +
> > > +		do {
> > > +			parsed = 0;
> > > +
> > > +			rc = sscanf(pci_devs_to_skip + pos, " (%x:%x:%x.%x) %n",
> > > +				    &segment, &bus, &slot, &func, &parsed);
> > > +			if (rc)
> > > +				break;
> > > +			if (parsed <= 0)
> > > +				break;
> > > +
> > > +			if (pci_domain_nr(pdev->bus) == segment &&
> > > +			    pdev->bus->number == bus &&
> > > +			    PCI_SLOT(pdev->devfn) == slot &&
> > > +			    PCI_FUNC(pdev->devfn) == func) {
> > > +
> > > +				dev_info(dev, "skipped by Hyper-V IOMMU\n");
> > > +				return ERR_PTR(-ENODEV);
> > > +			}
> > > +			pos += parsed;
> > > +
> > > +		} while (pci_devs_to_skip[pos]);
> > > +	}
> > > +
> > > +	/* Device will be explicitly attached to the default domain, so no need
> > > +	 * to do dev_iommu_priv_set() here.
> > > +	 */
> > > +
> > > +	return &hv_virt_iommu;
> > > +}
> > > +
> > > +static void hv_iommu_probe_finalize(struct device *dev)
> > > +{
> > > +	struct iommu_domain *immdom = iommu_get_domain_for_dev(dev);
> > > +
> > > +	if (immdom && immdom->type == IOMMU_DOMAIN_DMA)
> > > +		iommu_setup_dma_ops(dev);
> > > +	else
> > > +		set_dma_ops(dev, NULL);
> > > +}
> > > +
> > > +static void hv_iommu_release_device(struct device *dev)
> > > +{
> > > +	struct hv_domain *hvdom = dev_iommu_priv_get(dev);
> > > +
> > > +	/* Need to detach device from device domain if necessary. */
> > > +	if (hvdom)
> > > +		hv_iommu_detach_dev(&hvdom->iommu_dom, dev);
> > > +
> > > +	dev_iommu_priv_set(dev, NULL);
> > > +	set_dma_ops(dev, NULL);
> > > +}
> > > +
> > > +static struct iommu_group *hv_iommu_device_group(struct device *dev)
> > > +{
> > > +	if (dev_is_pci(dev))
> > > +		return pci_device_group(dev);
> > > +	else
> > > +		return generic_device_group(dev);
> > > +}
> > > +
> > > +static int hv_iommu_def_domain_type(struct device *dev)
> > > +{
> > > +	/* The hypervisor always creates this by default during boot */
> > > +	return IOMMU_DOMAIN_IDENTITY;
> > > +}
> > > +
> > > +static struct iommu_ops hv_iommu_ops = {
> > > +	.capable	    = hv_iommu_capable,
> > > +	.domain_alloc_identity	= hv_iommu_domain_alloc_identity,
> > > +	.domain_alloc_paging	= hv_iommu_domain_alloc_paging,
> > > +	.probe_device	    = hv_iommu_probe_device,
> > > +	.probe_finalize     = hv_iommu_probe_finalize,
> > > +	.release_device     = hv_iommu_release_device,
> > > +	.def_domain_type    = hv_iommu_def_domain_type,
> > > +	.device_group	    = hv_iommu_device_group,
> > > +	.default_domain_ops = &(const struct iommu_domain_ops) {
> > > +		.attach_dev   = hv_iommu_attach_dev,
> > > +		.map_pages    = hv_iommu_map_pages,
> > > +		.unmap_pages  = hv_iommu_unmap_pages,
> > > +		.iova_to_phys = hv_iommu_iova_to_phys,
> > > +		.free	      = hv_iommu_domain_free,
> > > +	},
> > > +	.owner		    = THIS_MODULE,
> > > +};
> > > +
> > > +static void __init hv_initialize_special_domains(void)
> > > +{
> > > +	hv_def_identity_dom.iommu_dom.geometry = default_geometry;
> > > +	hv_def_identity_dom.domid_num = HV_DEVICE_DOMAIN_ID_S2_DEFAULT; /* 0 */
> > 
> > hv_def_identity_dom is a static global variable.
> > Why not initialize hv_def_identity_dom upon definition instead of
> > introducing a new function?
> 
> Originally, it was function. I changed it static, but during 6.6
> review I changed it back to function.  I can't remember why, but is
> pretty harmless. We may add more domains, for example null domain to the
> initilization in future.
> 
> > > +}
> > > +
> > > +static int __init hv_iommu_init(void)
> > > +{
> > > +	int ret;
> > > +	struct iommu_device *iommup = &hv_virt_iommu;
> > > +
> > > +	if (!hv_is_hyperv_initialized())
> > > +		return -ENODEV;
> > > +
> > > +	ret = iommu_device_sysfs_add(iommup, NULL, NULL, "%s", "hyperv-iommu");
> > > +	if (ret) {
> > > +		pr_err("Hyper-V: iommu_device_sysfs_add failed: %d\n", ret);
> > > +		return ret;
> > > +	}
> > > +
> > > +	/* This must come before iommu_device_register because the latter calls
> > > +	 * into the hooks.
> > > +	 */
> > > +	hv_initialize_special_domains();
> > > +
> > > +	ret = iommu_device_register(iommup, &hv_iommu_ops, NULL);
> > 
> > It looks weird to initialize an object after creating sysfs entries for
> > it.
> > It should be the other way around.
> 
> Not sure if it should be, much easier to remove sysfs entry than other
> cleanup, even tho iommu_device_unregister is there. I am sure we'll add
> more code here, probably why it was originally done this way.
> 

Sysfs provides user space access to kernel objects. If the object is not
initialized, it's not only a useless sysfs entry, but also a potential
cause for kernel panic if user space will try to access this entry
before the object is initialized.

Thanks,
Stanislav


> Thanks,
> -Mukesh
> 
> 
> ... snip........

^ permalink raw reply

* Re: [PATCH v0 15/15] mshv: Populate mmio mappings for PCI passthru
From: Stanislav Kinsburskii @ 2026-01-26 18:15 UTC (permalink / raw)
  To: Mukesh R
  Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
	linux-arch, kys, haiyangz, wei.liu, decui, longli,
	catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
	lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
	mhklinux
In-Reply-To: <45e7a4c0-f1d8-b8b4-8c03-56d06845323b@linux.microsoft.com>

On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
> On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> > On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > 
> > > Upon guest access, in case of missing mmio mapping, the hypervisor
> > > generates an unmapped gpa intercept. In this path, lookup the PCI
> > > resource pfn for the guest gpa, and ask the hypervisor to map it
> > > via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> > > and obtained via fixup_user_fault call (similar to KVM).
> > > 
> > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > ---
> > >   drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> > >   1 file changed, 115 insertions(+)
> > > 
> > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > index 03f3aa9f5541..4c8bc7cd0888 100644
> > > --- a/drivers/hv/mshv_root_main.c
> > > +++ b/drivers/hv/mshv_root_main.c
> > > @@ -56,6 +56,14 @@ struct hv_stats_page {
> > >   	};
> > >   } __packed;
> > > +bool hv_nofull_mmio;   /* don't map entire mmio region upon fault */
> > > +static int __init setup_hv_full_mmio(char *str)
> > > +{
> > > +	hv_nofull_mmio = true;
> > > +	return 0;
> > > +}
> > > +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> > > +
> > >   struct mshv_root mshv_root;
> > >   enum hv_scheduler_type hv_scheduler_type;
> > > @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > >   }
> > >   #ifdef CONFIG_X86_64
> > > +
> > > +/*
> > > + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> > > + * else just return -errno.
> > > + */
> > > +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> > > +				       u64 *mmio_pfnp)
> > > +{
> > > +	struct vm_area_struct *vma;
> > > +	bool is_mmio;
> > > +	u64 uaddr;
> > > +	struct mshv_mem_region *mreg;
> > > +	struct follow_pfnmap_args pfnmap_args;
> > > +	int rc = -EINVAL;
> > > +
> > > +	/*
> > > +	 * Do not allow mem region to be deleted beneath us. VFIO uses
> > > +	 * useraddr vma to lookup pci bar pfn.
> > > +	 */
> > > +	spin_lock(&pt->pt_mem_regions_lock);
> > > +
> > > +	/* Get the region again under the lock */
> > > +	mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > +	if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > +		goto unlock_pt_out;
> > > +
> > > +	uaddr = mreg->start_uaddr +
> > > +		((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> > > +
> > > +	mmap_read_lock(current->mm);
> > 
> > Semaphore can't be taken under spinlock.

> 
> Yeah, something didn't feel right here and I meant to recheck, now regret
> rushing to submit the patch.
> 
> Rethinking, I think the pt_mem_regions_lock is not needed to protect
> the uaddr because unmap will properly serialize via the mm lock.
> 
> 
> > > +	vma = vma_lookup(current->mm, uaddr);
> > > +	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> > 
> > Why this check is needed again?
> 
> To make sure region did not change. This check is under lock.
> 

How can this happen? One can't change VMA type without unmapping it
first. And unmapping it leads to a kernel MMIO region state dangling
around without corresponding user space mapping.

This is similar to dangling pinned regions and should likely be
addressed the same way by utilizing MMU notifiers to destpoy memoty
regions is VMA is detached.

> > The region type is stored on the region itself.
> > And the type is checked on the caller side.
> > 
> > > +	if (!is_mmio)
> > > +		goto unlock_mmap_out;
> > > +
> > > +	pfnmap_args.vma = vma;
> > > +	pfnmap_args.address = uaddr;
> > > +
> > > +	rc = follow_pfnmap_start(&pfnmap_args);
> > > +	if (rc) {
> > > +		rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> > > +				      NULL);
> > > +		if (rc)
> > > +			goto unlock_mmap_out;
> > > +
> > > +		rc = follow_pfnmap_start(&pfnmap_args);
> > > +		if (rc)
> > > +			goto unlock_mmap_out;
> > > +	}
> > > +
> > > +	*mmio_pfnp = pfnmap_args.pfn;
> > > +	follow_pfnmap_end(&pfnmap_args);
> > > +d
> > > +unlock_mmap_out:
> > > +	mmap_read_unlock(current->mm);
> > > +unlock_pt_out:
> > > +	spin_unlock(&pt->pt_mem_regions_lock);
> > > +	return rc;
> > > +}
> > > +
> > > +/*
> > > + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> > > + * and resolve if possible.
> > > + * Returns: True if valid mmio intercept and it was handled, else false
> > > + */
> > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> > > +{
> > > +	struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> > > +	struct hv_x64_memory_intercept_message *msg;
> > > +	union hv_x64_memory_access_info accinfo;
> > > +	u64 gfn, mmio_spa, numpgs;
> > > +	struct mshv_mem_region *mreg;
> > > +	int rc;
> > > +	struct mshv_partition *pt = vp->vp_partition;
> > > +
> > > +	msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> > > +	accinfo = msg->memory_access_info;
> > > +
> > > +	if (!accinfo.gva_gpa_valid)
> > > +		return false;
> > > +
> > > +	/* Do a fast check and bail if non mmio intercept */
> > > +	gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> > > +	mreg = mshv_partition_region_by_gfn(pt, gfn);
> > 
> > This call needs to be protected by the spinlock.
> 
> This is sorta fast path to bail. We recheck under partition lock above.
> 

Accessing the list of regions without lock is unsafe.

Thanks,
Stanislav

> Thanks,
> -Mukesh
> 
> 
> > Thanks,
> > Stanislav
> > 
> > > +	if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > +		return false;
> > > +
> > > +	rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> > > +	if (rc)
> > > +		return false;
> > > +
> > > +	if (!hv_nofull_mmio) {		/* default case */
> > > +		gfn = mreg->start_gfn;
> > > +		mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> > > +		numpgs = mreg->nr_pages;
> > > +	} else
> > > +		numpgs = 1;
> > > +
> > > +	rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> > > +
> > > +	return rc == 0;
> > > +}
> > > +
> > >   static struct mshv_mem_region *
> > >   mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > >   {
> > > @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > >   	return ret;
> > >   }
> > > +
> > >   #else  /* CONFIG_X86_64 */
> > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> > >   static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > >   #endif /* CONFIG_X86_64 */
> > >   static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > >   {
> > >   	switch (vp->vp_intercept_msg_page->header.message_type) {
> > > +	case HVMSG_UNMAPPED_GPA:
> > > +		return mshv_handle_unmapped_gpa(vp);
> > >   	case HVMSG_GPA_INTERCEPT:
> > >   		return mshv_handle_gpa_intercept(vp);
> > >   	}
> > > -- 
> > > 2.51.2.vfs.0.1
> > > 

^ permalink raw reply

* Re: [PATCH] mshv: Make MSHV mutually exclusive with KEXEC
From: Anirudh Rayabharam @ 2026-01-26 18:49 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <176920684805.250171.6817228088359793537.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

On Fri, Jan 23, 2026 at 10:20:53PM +0000, Stanislav Kinsburskii wrote:
> The MSHV driver deposits kernel-allocated pages to the hypervisor during
> runtime and never withdraws them. This creates a fundamental incompatibility
> with KEXEC, as these deposited pages remain unavailable to the new kernel
> loaded via KEXEC, leading to potential system crashes upon kernel accessing
> hypervisor deposited pages.
> 
> Make MSHV mutually exclusive with KEXEC until proper page lifecycle
> management is implemented.

Someone might want to stop all guest VMs and do a kexec. Which is valid
and would work without any issue for L1VH.

Also, I don't think it is reasonable at all that someone needs to
disable basic kernel functionality such as kexec in order to use our
driver.

Thanks,
Anirudh.

> 
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
>  drivers/hv/Kconfig |    1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
> index 7937ac0cbd0f..cfd4501db0fa 100644
> --- a/drivers/hv/Kconfig
> +++ b/drivers/hv/Kconfig
> @@ -74,6 +74,7 @@ config MSHV_ROOT
>  	# e.g. When withdrawing memory, the hypervisor gives back 4k pages in
>  	# no particular order, making it impossible to reassemble larger pages
>  	depends on PAGE_SIZE_4KB
> +	depends on !KEXEC
>  	select EVENTFD
>  	select VIRT_XFER_TO_GUEST_WORK
>  	select HMM_MIRROR
> 
> 

^ permalink raw reply

* Re: [PATCH net-next v2] net: mana: Improve diagnostic logging for better debuggability
From: Leon Romanovsky @ 2026-01-26 19:58 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Erni Sri Satya Vennela, kys, haiyangz, wei.liu, decui, longli,
	andrew+netdev, davem, edumazet, pabeni, kotaranov, shradhagupta,
	yury.norov, dipayanroy, shirazsaleem, ssengar, gargaditya,
	linux-hyperv, netdev, linux-kernel
In-Reply-To: <20260122180745.3b5607cf@kernel.org>

On Thu, Jan 22, 2026 at 06:07:45PM -0800, Jakub Kicinski wrote:
> On Thu, 22 Jan 2026 09:43:42 -0800 Erni Sri Satya Vennela wrote:
> > On Wed, Jan 21, 2026 at 08:14:12PM -0800, Jakub Kicinski wrote:
> > > On Tue, 20 Jan 2026 22:56:55 -0800 Erni Sri Satya Vennela wrote:  
> > > > Enhance MANA driver logging to provide better visibility into
> > > > hardware configuration and error states during driver initialization
> > > > and runtime operations.  
> > >   
> > > > +	dev_info(gc->dev, "Max Resources: msix_usable=%u max_queues=%u\n",
> > > > +		 gc->num_msix_usable, gc->max_num_queues);  
> > >   
> > > > +	dev_info(dev, "Device Config: max_vports=%u adapter_mtu=%u bm_hostmode=%u\n",
> > > > +		 *max_num_vports, gc->adapter_mtu, *bm_hostmode);  
> > > 
> > > IIUC in networking we try to follow the mantra that if the system is
> > > functioning correctly there should be no logs. You can expose the debug
> > > info via ethtool, devlink, debugfs etc. Take your pick.  
> > 
> > We discussed this internally and noted that customers often cannot
> > reliably reproduce the VM issue. In such cases, the only evidence
> > available is the dmesg logs captured during the incident. Asking them to
> > re-enable debug options later is not practical, since the problem may
> > not occur again. Similarly, exposing the information via ethtool,
> > devlink, or debugfs is less effective because the data is transient and
> > lost after a reboot. As these messages are printed only once during
> > initialization, and not repeated during runtime or driver load/unload,
> > we decided to keep them at info level to aid troubleshooting without
> > adding noise.
> 
> You will have to build proper support tooling like every single vendor
> before you. Presumably you can also log from the hypervisor side which
> makes your life so much easier than supporting real HW. Yet, real
> NIC don't spew random trash to the logs all the time. SMH. Respectfully,
> next time y'all "discuss things internally" start with the question of
> what makes your case special :|

+100

Interesting. Completely independent of your comment, I provided the same
feedback on their mana_ib driver. They added debug logs to nearly every
command, even though those commands already had existing debug logging.

https://lore.kernel.org/linux-rdma/20260122131442.GL13201@unreal/T/#m51e8a12f4bca4a6c1377c5531c8a6d94a43af1e5

"In order to simplify things for you: unless you can clearly justify why this
print is required and why you cannot proceed without it, I must ask you to stop
adding any new debug or error messages to the mana_ib driver. There is a wide
range of existing tools and well‑established practices for debugging the kernel,
and none of them require spamming dmesg."

Thanks

^ permalink raw reply

* Re: [PATCH] mshv: Make MSHV mutually exclusive with KEXEC
From: Mukesh R @ 2026-01-26 20:20 UTC (permalink / raw)
  To: Stanislav Kinsburskii
  Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aXabnnCV50Thv9tZ@skinsburskii.localdomain>

On 1/25/26 14:39, Stanislav Kinsburskii wrote:
> On Fri, Jan 23, 2026 at 04:16:33PM -0800, Mukesh R wrote:
>> On 1/23/26 14:20, Stanislav Kinsburskii wrote:
>>> The MSHV driver deposits kernel-allocated pages to the hypervisor during
>>> runtime and never withdraws them. This creates a fundamental incompatibility
>>> with KEXEC, as these deposited pages remain unavailable to the new kernel
>>> loaded via KEXEC, leading to potential system crashes upon kernel accessing
>>> hypervisor deposited pages.
>>>
>>> Make MSHV mutually exclusive with KEXEC until proper page lifecycle
>>> management is implemented.
>>>
>>> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
>>> ---
>>>    drivers/hv/Kconfig |    1 +
>>>    1 file changed, 1 insertion(+)
>>>
>>> diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
>>> index 7937ac0cbd0f..cfd4501db0fa 100644
>>> --- a/drivers/hv/Kconfig
>>> +++ b/drivers/hv/Kconfig
>>> @@ -74,6 +74,7 @@ config MSHV_ROOT
>>>    	# e.g. When withdrawing memory, the hypervisor gives back 4k pages in
>>>    	# no particular order, making it impossible to reassemble larger pages
>>>    	depends on PAGE_SIZE_4KB
>>> +	depends on !KEXEC
>>>    	select EVENTFD
>>>    	select VIRT_XFER_TO_GUEST_WORK
>>>    	select HMM_MIRROR
>>>
>>>
>>
>> Will this affect CRASH kexec? I see few CONFIG_CRASH_DUMP in kexec.c
>> implying that crash dump might be involved. Or did you test kdump
>> and it was fine?
>>
> 
> Yes, it will. Crash kexec depends on normal kexec functionality, so it
> will be affected as well.

So not sure I understand the reason for this patch. We can just block
kexec if there are any VMs running, right? Doing this would mean any
further developement would be without a ver important and major feature,
right?

> Thanks,
> Stanislav
> 
>> Thanks,
>> -Mukesh


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox