* Re: [PATCH v2 4/4] mshv: Handle insufficient root memory hypervisor statuses
From: Stanislav Kinsburskii @ 2026-02-05 18:36 UTC (permalink / raw)
To: Anirudh Rayabharam
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <s6orh5waw2djyiv5w6yzwiaxv7rcja6iua6kbzldthsmceelqv@dnf2zr2m74we>
On Thu, Feb 05, 2026 at 11:37:49PM +0530, Anirudh Rayabharam wrote:
> On Mon, Feb 02, 2026 at 05:59:14PM +0000, Stanislav Kinsburskii wrote:
> > When creating guest partition objects, the hypervisor may fail to
> > allocate root partition pages and return an insufficient memory status.
> > In this case, deposit memory using the root partition ID instead.
> >
> > Note: This error should never occur in a guest of L1VH partition context.
> >
> > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > ---
> > drivers/hv/hv_common.c | 2 +
> > drivers/hv/hv_proc.c | 14 ++++++++++
> > include/hyperv/hvgdk_mini.h | 58 ++++++++++++++++++++++---------------------
> > 3 files changed, 46 insertions(+), 28 deletions(-)
> >
> > diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
> > index c7f63c9de503..cab0d1733607 100644
> > --- a/drivers/hv/hv_common.c
> > +++ b/drivers/hv/hv_common.c
> > @@ -792,6 +792,8 @@ static const struct hv_status_info hv_status_infos[] = {
> > _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO),
> > _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM),
> > _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY, -ENOMEM),
> > + _STATUS_INFO(HV_STATUS_INSUFFICIENT_ROOT_MEMORY, -ENOMEM),
> > + _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY, -ENOMEM),
> > _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL),
> > _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL),
> > _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO),
> > diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
> > index dfa27be66ff7..935129e0b39d 100644
> > --- a/drivers/hv/hv_proc.c
> > +++ b/drivers/hv/hv_proc.c
> > @@ -122,6 +122,18 @@ int hv_deposit_memory_node(int node, u64 partition_id,
> > case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY:
> > num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES;
> > break;
> > +
> > + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY:
> > + num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES;
> > + fallthrough;
> > + case HV_STATUS_INSUFFICIENT_ROOT_MEMORY:
>
> Is num_pages uninitialized when we reach this case directly?
>
Indeed, it is left uninitialized in that case. I'll fix it.
Thanks,
Stanislav
> Thanks,
> Anirudh.
>
> > + if (!hv_root_partition()) {
> > + hv_status_err(hv_status, "Unexpected root memory deposit\n");
> > + return -ENOMEM;
> > + }
> > + partition_id = HV_PARTITION_ID_SELF;
> > + break;
> > +
> > default:
> > hv_status_err(hv_status, "Unexpected!\n");
> > return -ENOMEM;
> > @@ -135,6 +147,8 @@ bool hv_result_needs_memory(u64 status)
> > switch (hv_result(status)) {
> > case HV_STATUS_INSUFFICIENT_MEMORY:
> > case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY:
> > + case HV_STATUS_INSUFFICIENT_ROOT_MEMORY:
> > + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY:
> > return true;
> > }
> > return false;
> > diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> > index 70f22ef44948..5b74a857ef43 100644
> > --- a/include/hyperv/hvgdk_mini.h
> > +++ b/include/hyperv/hvgdk_mini.h
> > @@ -14,34 +14,36 @@ struct hv_u128 {
> > } __packed;
> >
> > /* NOTE: when adding below, update hv_result_to_string() */
> > -#define HV_STATUS_SUCCESS 0x0
> > -#define HV_STATUS_INVALID_HYPERCALL_CODE 0x2
> > -#define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3
> > -#define HV_STATUS_INVALID_ALIGNMENT 0x4
> > -#define HV_STATUS_INVALID_PARAMETER 0x5
> > -#define HV_STATUS_ACCESS_DENIED 0x6
> > -#define HV_STATUS_INVALID_PARTITION_STATE 0x7
> > -#define HV_STATUS_OPERATION_DENIED 0x8
> > -#define HV_STATUS_UNKNOWN_PROPERTY 0x9
> > -#define HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE 0xA
> > -#define HV_STATUS_INSUFFICIENT_MEMORY 0xB
> > -#define HV_STATUS_INVALID_PARTITION_ID 0xD
> > -#define HV_STATUS_INVALID_VP_INDEX 0xE
> > -#define HV_STATUS_NOT_FOUND 0x10
> > -#define HV_STATUS_INVALID_PORT_ID 0x11
> > -#define HV_STATUS_INVALID_CONNECTION_ID 0x12
> > -#define HV_STATUS_INSUFFICIENT_BUFFERS 0x13
> > -#define HV_STATUS_NOT_ACKNOWLEDGED 0x14
> > -#define HV_STATUS_INVALID_VP_STATE 0x15
> > -#define HV_STATUS_NO_RESOURCES 0x1D
> > -#define HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED 0x20
> > -#define HV_STATUS_INVALID_LP_INDEX 0x41
> > -#define HV_STATUS_INVALID_REGISTER_VALUE 0x50
> > -#define HV_STATUS_OPERATION_FAILED 0x71
> > -#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY 0x75
> > -#define HV_STATUS_TIME_OUT 0x78
> > -#define HV_STATUS_CALL_PENDING 0x79
> > -#define HV_STATUS_VTL_ALREADY_ENABLED 0x86
> > +#define HV_STATUS_SUCCESS 0x0
> > +#define HV_STATUS_INVALID_HYPERCALL_CODE 0x2
> > +#define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3
> > +#define HV_STATUS_INVALID_ALIGNMENT 0x4
> > +#define HV_STATUS_INVALID_PARAMETER 0x5
> > +#define HV_STATUS_ACCESS_DENIED 0x6
> > +#define HV_STATUS_INVALID_PARTITION_STATE 0x7
> > +#define HV_STATUS_OPERATION_DENIED 0x8
> > +#define HV_STATUS_UNKNOWN_PROPERTY 0x9
> > +#define HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE 0xA
> > +#define HV_STATUS_INSUFFICIENT_MEMORY 0xB
> > +#define HV_STATUS_INVALID_PARTITION_ID 0xD
> > +#define HV_STATUS_INVALID_VP_INDEX 0xE
> > +#define HV_STATUS_NOT_FOUND 0x10
> > +#define HV_STATUS_INVALID_PORT_ID 0x11
> > +#define HV_STATUS_INVALID_CONNECTION_ID 0x12
> > +#define HV_STATUS_INSUFFICIENT_BUFFERS 0x13
> > +#define HV_STATUS_NOT_ACKNOWLEDGED 0x14
> > +#define HV_STATUS_INVALID_VP_STATE 0x15
> > +#define HV_STATUS_NO_RESOURCES 0x1D
> > +#define HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED 0x20
> > +#define HV_STATUS_INVALID_LP_INDEX 0x41
> > +#define HV_STATUS_INVALID_REGISTER_VALUE 0x50
> > +#define HV_STATUS_OPERATION_FAILED 0x71
> > +#define HV_STATUS_INSUFFICIENT_ROOT_MEMORY 0x73
> > +#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY 0x75
> > +#define HV_STATUS_TIME_OUT 0x78
> > +#define HV_STATUS_CALL_PENDING 0x79
> > +#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY 0x83
> > +#define HV_STATUS_VTL_ALREADY_ENABLED 0x86
> >
> > /*
> > * The Hyper-V TimeRefCount register and the TSC
> >
> >
^ permalink raw reply
* Re: [PATCH v0 15/15] mshv: Populate mmio mappings for PCI passthru
From: Stanislav Kinsburskii @ 2026-02-05 18:31 UTC (permalink / raw)
To: Mukesh R
Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
linux-arch, kys, haiyangz, wei.liu, decui, longli,
catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
mhklinux
In-Reply-To: <4a7c63fc-b96a-9841-7745-adbc41190c36@linux.microsoft.com>
On Thu, Feb 05, 2026 at 09:57:20AM -0800, Mukesh R wrote:
> On 2/5/26 08:28, Stanislav Kinsburskii wrote:
> > On Wed, Feb 04, 2026 at 02:52:54PM -0800, Mukesh R wrote:
> > > On 2/2/26 08:30, Stanislav Kinsburskii wrote:
> > > > On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
> > > > > On 1/27/26 10:57, Stanislav Kinsburskii wrote:
> > > > > > On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
> > > > > > > On 1/26/26 10:15, Stanislav Kinsburskii wrote:
> > > > > > > > On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
> > > > > > > > > On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> > > > > > > > > > On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> > > > > > > > > > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > > > > >
> > > > > > > > > > > Upon guest access, in case of missing mmio mapping, the hypervisor
> > > > > > > > > > > generates an unmapped gpa intercept. In this path, lookup the PCI
> > > > > > > > > > > resource pfn for the guest gpa, and ask the hypervisor to map it
> > > > > > > > > > > via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> > > > > > > > > > > and obtained via fixup_user_fault call (similar to KVM).
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > > > > > ---
> > > > > > > > > > > drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> > > > > > > > > > > 1 file changed, 115 insertions(+)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > > > > > > > > > index 03f3aa9f5541..4c8bc7cd0888 100644
> > > > > > > > > > > --- a/drivers/hv/mshv_root_main.c
> > > > > > > > > > > +++ b/drivers/hv/mshv_root_main.c
> > > > > > > > > > > @@ -56,6 +56,14 @@ struct hv_stats_page {
> > > > > > > > > > > };
> > > > > > > > > > > } __packed;
> > > > > > > > > > > +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> > > > > > > > > > > +static int __init setup_hv_full_mmio(char *str)
> > > > > > > > > > > +{
> > > > > > > > > > > + hv_nofull_mmio = true;
> > > > > > > > > > > + return 0;
> > > > > > > > > > > +}
> > > > > > > > > > > +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> > > > > > > > > > > +
> > > > > > > > > > > struct mshv_root mshv_root;
> > > > > > > > > > > enum hv_scheduler_type hv_scheduler_type;
> > > > > > > > > > > @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > > > > > > > > > > }
> > > > > > > > > > > #ifdef CONFIG_X86_64
> > > > > > > > > > > +
> > > > > > > > > > > +/*
> > > > > > > > > > > + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> > > > > > > > > > > + * else just return -errno.
> > > > > > > > > > > + */
> > > > > > > > > > > +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> > > > > > > > > > > + u64 *mmio_pfnp)
> > > > > > > > > > > +{
> > > > > > > > > > > + struct vm_area_struct *vma;
> > > > > > > > > > > + bool is_mmio;
> > > > > > > > > > > + u64 uaddr;
> > > > > > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > > > > > + struct follow_pfnmap_args pfnmap_args;
> > > > > > > > > > > + int rc = -EINVAL;
> > > > > > > > > > > +
> > > > > > > > > > > + /*
> > > > > > > > > > > + * Do not allow mem region to be deleted beneath us. VFIO uses
> > > > > > > > > > > + * useraddr vma to lookup pci bar pfn.
> > > > > > > > > > > + */
> > > > > > > > > > > + spin_lock(&pt->pt_mem_regions_lock);
> > > > > > > > > > > +
> > > > > > > > > > > + /* Get the region again under the lock */
> > > > > > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > > > > > + goto unlock_pt_out;
> > > > > > > > > > > +
> > > > > > > > > > > + uaddr = mreg->start_uaddr +
> > > > > > > > > > > + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> > > > > > > > > > > +
> > > > > > > > > > > + mmap_read_lock(current->mm);
> > > > > > > > > >
> > > > > > > > > > Semaphore can't be taken under spinlock.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Yeah, something didn't feel right here and I meant to recheck, now regret
> > > > > > > > > rushing to submit the patch.
> > > > > > > > >
> > > > > > > > > Rethinking, I think the pt_mem_regions_lock is not needed to protect
> > > > > > > > > the uaddr because unmap will properly serialize via the mm lock.
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > > + vma = vma_lookup(current->mm, uaddr);
> > > > > > > > > > > + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> > > > > > > > > >
> > > > > > > > > > Why is this check needed again?
> > > > > > > > >
> > > > > > > > > To make sure region did not change. This check is under lock.
> > > > > > > > >
> > > > > > > >
> > > > > > > > How can this happen? One can't change VMA type without unmapping it
> > > > > > > > first. And unmapping it leads to a kernel MMIO region state dangling
> > > > > > > > around without corresponding user space mapping.
> > > > > > >
> > > > > > > Right, and vm_flags would then not be what is expected for mmio.
> > > > > > >
> > > > > > > > This is similar to dangling pinned regions and should likely be
> > > > > > > > addressed the same way by utilizing MMU notifiers to destroy memory
> > > > > > > > regions if the VMA is detached.
> > > > > > >
> > > > > > > I don't think we need that. Either it succeeds if the region did not
> > > > > > > change at all, or just fails.
> > > > > > >
> > > > > >
> > > > > > I'm afraid we do, as if the driver mapped a page with the previous
> > > > > > memory region, and then the region is unmapped, the page will stay
> > > > > > mapped in the hypervisor, but will be considered free by the kernel, which
> > > > > > in turn will lead to a GPF upon the next allocation.
> > > > >
> > > > > There are no ram pages for mmio regions. Also, we don't do much with
> > > > > mmio regions other than tell the hyp about it.
> > > > >
> > > >
> > > > So, are you saying that the hypervisor does not use these pages and only
> > > > tracks them? That would make things easier.
> > > > However, if we later try to map a GPA that is already mapped, will the
> > > > hypervisor return an error?
> > >
> > > Hypervisor does not return an error.
> > >
> >
> > So, what happens if we map a GPA that is already mapped? Does it just
> > remap it to the new PFN?
>
> yes, otherwise it would return error, right?
>
I see.
Please summarize and document this behaviour in the commit message.
Thanks,
Stanislav
> > Thanks,
> > Stanislav
> >
> > >
> > >
> > > > Thanks,
> > > > Stanislav
> > > >
> > > > > Thanks,
> > > > > -Mukesh
> > > > >
> > > > >
> > > > > > > With pinned regions the issue is similar but less severe: pages can't
> > > > > > be released by user space unmapping and thus will be simply leaked, but
> > > > > > the system stays intact.
> > > > > >
> > > > > > > MMIO regions are similar to movable regions in this regard: they don't
> > > > > > > reference the user pages, and thus this guest region replacement is a
> > > > > > > straight way to a kernel panic.
> > > > > >
> > > > > > >
> > > > > > > > > > The region type is stored on the region itself.
> > > > > > > > > > And the type is checked on the caller side.
> > > > > > > > > >
> > > > > > > > > > > + if (!is_mmio)
> > > > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > > > +
> > > > > > > > > > > + pfnmap_args.vma = vma;
> > > > > > > > > > > + pfnmap_args.address = uaddr;
> > > > > > > > > > > +
> > > > > > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > > > > > + if (rc) {
> > > > > > > > > > > + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> > > > > > > > > > > + NULL);
> > > > > > > > > > > + if (rc)
> > > > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > > > +
> > > > > > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > > > > > + if (rc)
> > > > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > > > + }
> > > > > > > > > > > +
> > > > > > > > > > > + *mmio_pfnp = pfnmap_args.pfn;
> > > > > > > > > > > + follow_pfnmap_end(&pfnmap_args);
> > > > > > > > > > > +d
> > > > > > > > > > > +unlock_mmap_out:
> > > > > > > > > > > + mmap_read_unlock(current->mm);
> > > > > > > > > > > +unlock_pt_out:
> > > > > > > > > > > + spin_unlock(&pt->pt_mem_regions_lock);
> > > > > > > > > > > + return rc;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +/*
> > > > > > > > > > > + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> > > > > > > > > > > + * and resolve if possible.
> > > > > > > > > > > + * Returns: True if valid mmio intercept and it was handled, else false
> > > > > > > > > > > + */
> > > > > > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> > > > > > > > > > > +{
> > > > > > > > > > > + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> > > > > > > > > > > + struct hv_x64_memory_intercept_message *msg;
> > > > > > > > > > > + union hv_x64_memory_access_info accinfo;
> > > > > > > > > > > + u64 gfn, mmio_spa, numpgs;
> > > > > > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > > > > > + int rc;
> > > > > > > > > > > + struct mshv_partition *pt = vp->vp_partition;
> > > > > > > > > > > +
> > > > > > > > > > > + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> > > > > > > > > > > + accinfo = msg->memory_access_info;
> > > > > > > > > > > +
> > > > > > > > > > > + if (!accinfo.gva_gpa_valid)
> > > > > > > > > > > + return false;
> > > > > > > > > > > +
> > > > > > > > > > > + /* Do a fast check and bail if non mmio intercept */
> > > > > > > > > > > + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> > > > > > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > > > >
> > > > > > > > > > This call needs to be protected by the spinlock.
> > > > > > > > >
> > > > > > > > > This is sorta fast path to bail. We recheck under partition lock above.
> > > > > > > > >
> > > > > > > >
> > > > > > > > Accessing the list of regions without lock is unsafe.
> > > > > > >
> > > > > > > I am not sure why? This check is done by a vcpu thread, so regions
> > > > > > > will not have just gone away.
> > > > > > >
> > > > > >
> > > > > > This is a shared resource. Multiple VP threads get into this function
> > > > > > simultaneously, so there is a race already. But this one we can live
> > > > > > with without locking, as they don't mutate the list of the regions.
> > > > > >
> > > > > > The issue happens when the VMM adds or removes another region, as it mutates
> > > > > > the list and races with VP threads doing this lookup.
> > > > > >
> > > > > > Thanks,
> > > > > > Stanislav
> > > > > >
> > > > > >
> > > > > > > Thanks,
> > > > > > > -Mukesh
> > > > > > >
> > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Stanislav
> > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > > -Mukesh
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > > Thanks,
> > > > > > > > > > Stanislav
> > > > > > > > > >
> > > > > > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > > > > > + return false;
> > > > > > > > > > > +
> > > > > > > > > > > + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> > > > > > > > > > > + if (rc)
> > > > > > > > > > > + return false;
> > > > > > > > > > > +
> > > > > > > > > > > + if (!hv_nofull_mmio) { /* default case */
> > > > > > > > > > > + gfn = mreg->start_gfn;
> > > > > > > > > > > + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> > > > > > > > > > > + numpgs = mreg->nr_pages;
> > > > > > > > > > > + } else
> > > > > > > > > > > + numpgs = 1;
> > > > > > > > > > > +
> > > > > > > > > > > + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> > > > > > > > > > > +
> > > > > > > > > > > + return rc == 0;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > static struct mshv_mem_region *
> > > > > > > > > > > mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > > > > > > > > > > {
> > > > > > > > > > > @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > > > > > > > > > > return ret;
> > > > > > > > > > > }
> > > > > > > > > > > +
> > > > > > > > > > > #else /* CONFIG_X86_64 */
> > > > > > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> > > > > > > > > > > static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > > > > > > > > > > #endif /* CONFIG_X86_64 */
> > > > > > > > > > > static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > > > > > > > > > > {
> > > > > > > > > > > switch (vp->vp_intercept_msg_page->header.message_type) {
> > > > > > > > > > > + case HVMSG_UNMAPPED_GPA:
> > > > > > > > > > > + return mshv_handle_unmapped_gpa(vp);
> > > > > > > > > > > case HVMSG_GPA_INTERCEPT:
> > > > > > > > > > > return mshv_handle_gpa_intercept(vp);
> > > > > > > > > > > }
> > > > > > > > > > > --
> > > > > > > > > > > 2.51.2.vfs.0.1
> > > > > > > > > > >
^ permalink raw reply
* Re: [PATCH v2 4/4] mshv: Handle insufficient root memory hypervisor statuses
From: Anirudh Rayabharam @ 2026-02-05 18:07 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <177005515446.120041.8169777750859263202.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On Mon, Feb 02, 2026 at 05:59:14PM +0000, Stanislav Kinsburskii wrote:
> When creating guest partition objects, the hypervisor may fail to
> allocate root partition pages and return an insufficient memory status.
> In this case, deposit memory using the root partition ID instead.
>
> Note: This error should never occur in a guest of L1VH partition context.
>
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
> drivers/hv/hv_common.c | 2 +
> drivers/hv/hv_proc.c | 14 ++++++++++
> include/hyperv/hvgdk_mini.h | 58 ++++++++++++++++++++++---------------------
> 3 files changed, 46 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
> index c7f63c9de503..cab0d1733607 100644
> --- a/drivers/hv/hv_common.c
> +++ b/drivers/hv/hv_common.c
> @@ -792,6 +792,8 @@ static const struct hv_status_info hv_status_infos[] = {
> _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO),
> _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM),
> _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY, -ENOMEM),
> + _STATUS_INFO(HV_STATUS_INSUFFICIENT_ROOT_MEMORY, -ENOMEM),
> + _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY, -ENOMEM),
> _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL),
> _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL),
> _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO),
> diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
> index dfa27be66ff7..935129e0b39d 100644
> --- a/drivers/hv/hv_proc.c
> +++ b/drivers/hv/hv_proc.c
> @@ -122,6 +122,18 @@ int hv_deposit_memory_node(int node, u64 partition_id,
> case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY:
> num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES;
> break;
> +
> + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY:
> + num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES;
> + fallthrough;
> + case HV_STATUS_INSUFFICIENT_ROOT_MEMORY:
Is num_pages uninitialized when we reach this case directly?
Thanks,
Anirudh.
> + if (!hv_root_partition()) {
> + hv_status_err(hv_status, "Unexpected root memory deposit\n");
> + return -ENOMEM;
> + }
> + partition_id = HV_PARTITION_ID_SELF;
> + break;
> +
> default:
> hv_status_err(hv_status, "Unexpected!\n");
> return -ENOMEM;
> @@ -135,6 +147,8 @@ bool hv_result_needs_memory(u64 status)
> switch (hv_result(status)) {
> case HV_STATUS_INSUFFICIENT_MEMORY:
> case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY:
> + case HV_STATUS_INSUFFICIENT_ROOT_MEMORY:
> + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY:
> return true;
> }
> return false;
> diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> index 70f22ef44948..5b74a857ef43 100644
> --- a/include/hyperv/hvgdk_mini.h
> +++ b/include/hyperv/hvgdk_mini.h
> @@ -14,34 +14,36 @@ struct hv_u128 {
> } __packed;
>
> /* NOTE: when adding below, update hv_result_to_string() */
> -#define HV_STATUS_SUCCESS 0x0
> -#define HV_STATUS_INVALID_HYPERCALL_CODE 0x2
> -#define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3
> -#define HV_STATUS_INVALID_ALIGNMENT 0x4
> -#define HV_STATUS_INVALID_PARAMETER 0x5
> -#define HV_STATUS_ACCESS_DENIED 0x6
> -#define HV_STATUS_INVALID_PARTITION_STATE 0x7
> -#define HV_STATUS_OPERATION_DENIED 0x8
> -#define HV_STATUS_UNKNOWN_PROPERTY 0x9
> -#define HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE 0xA
> -#define HV_STATUS_INSUFFICIENT_MEMORY 0xB
> -#define HV_STATUS_INVALID_PARTITION_ID 0xD
> -#define HV_STATUS_INVALID_VP_INDEX 0xE
> -#define HV_STATUS_NOT_FOUND 0x10
> -#define HV_STATUS_INVALID_PORT_ID 0x11
> -#define HV_STATUS_INVALID_CONNECTION_ID 0x12
> -#define HV_STATUS_INSUFFICIENT_BUFFERS 0x13
> -#define HV_STATUS_NOT_ACKNOWLEDGED 0x14
> -#define HV_STATUS_INVALID_VP_STATE 0x15
> -#define HV_STATUS_NO_RESOURCES 0x1D
> -#define HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED 0x20
> -#define HV_STATUS_INVALID_LP_INDEX 0x41
> -#define HV_STATUS_INVALID_REGISTER_VALUE 0x50
> -#define HV_STATUS_OPERATION_FAILED 0x71
> -#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY 0x75
> -#define HV_STATUS_TIME_OUT 0x78
> -#define HV_STATUS_CALL_PENDING 0x79
> -#define HV_STATUS_VTL_ALREADY_ENABLED 0x86
> +#define HV_STATUS_SUCCESS 0x0
> +#define HV_STATUS_INVALID_HYPERCALL_CODE 0x2
> +#define HV_STATUS_INVALID_HYPERCALL_INPUT 0x3
> +#define HV_STATUS_INVALID_ALIGNMENT 0x4
> +#define HV_STATUS_INVALID_PARAMETER 0x5
> +#define HV_STATUS_ACCESS_DENIED 0x6
> +#define HV_STATUS_INVALID_PARTITION_STATE 0x7
> +#define HV_STATUS_OPERATION_DENIED 0x8
> +#define HV_STATUS_UNKNOWN_PROPERTY 0x9
> +#define HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE 0xA
> +#define HV_STATUS_INSUFFICIENT_MEMORY 0xB
> +#define HV_STATUS_INVALID_PARTITION_ID 0xD
> +#define HV_STATUS_INVALID_VP_INDEX 0xE
> +#define HV_STATUS_NOT_FOUND 0x10
> +#define HV_STATUS_INVALID_PORT_ID 0x11
> +#define HV_STATUS_INVALID_CONNECTION_ID 0x12
> +#define HV_STATUS_INSUFFICIENT_BUFFERS 0x13
> +#define HV_STATUS_NOT_ACKNOWLEDGED 0x14
> +#define HV_STATUS_INVALID_VP_STATE 0x15
> +#define HV_STATUS_NO_RESOURCES 0x1D
> +#define HV_STATUS_PROCESSOR_FEATURE_NOT_SUPPORTED 0x20
> +#define HV_STATUS_INVALID_LP_INDEX 0x41
> +#define HV_STATUS_INVALID_REGISTER_VALUE 0x50
> +#define HV_STATUS_OPERATION_FAILED 0x71
> +#define HV_STATUS_INSUFFICIENT_ROOT_MEMORY 0x73
> +#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY 0x75
> +#define HV_STATUS_TIME_OUT 0x78
> +#define HV_STATUS_CALL_PENDING 0x79
> +#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_ROOT_MEMORY 0x83
> +#define HV_STATUS_VTL_ALREADY_ENABLED 0x86
>
> /*
> * The Hyper-V TimeRefCount register and the TSC
>
>
^ permalink raw reply
* Re: [PATCH v2 3/4] mshv: Handle insufficient contiguous memory hypervisor status
From: Anirudh Rayabharam @ 2026-02-05 18:03 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <177005514902.120041.13078117373390753930.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On Mon, Feb 02, 2026 at 05:59:09PM +0000, Stanislav Kinsburskii wrote:
> The HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY status indicates that the
> hypervisor lacks sufficient contiguous memory for its internal allocations.
>
> When this status is encountered, allocate and deposit
> HV_MAX_CONTIGUOUS_ALLOCATION_PAGES contiguous pages to the hypervisor.
> HV_MAX_CONTIGUOUS_ALLOCATION_PAGES is defined in the hypervisor headers; a
> deposit of this size will always satisfy the hypervisor's requirements.
>
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
> drivers/hv/hv_common.c | 1 +
> drivers/hv/hv_proc.c | 4 ++++
> include/hyperv/hvgdk_mini.h | 1 +
> include/hyperv/hvhdk_mini.h | 2 ++
> 4 files changed, 8 insertions(+)
>
> diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
> index 0a3ab7efed46..c7f63c9de503 100644
> --- a/drivers/hv/hv_common.c
> +++ b/drivers/hv/hv_common.c
> @@ -791,6 +791,7 @@ static const struct hv_status_info hv_status_infos[] = {
> _STATUS_INFO(HV_STATUS_UNKNOWN_PROPERTY, -EIO),
> _STATUS_INFO(HV_STATUS_PROPERTY_VALUE_OUT_OF_RANGE, -EIO),
> _STATUS_INFO(HV_STATUS_INSUFFICIENT_MEMORY, -ENOMEM),
> + _STATUS_INFO(HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY, -ENOMEM),
> _STATUS_INFO(HV_STATUS_INVALID_PARTITION_ID, -EINVAL),
> _STATUS_INFO(HV_STATUS_INVALID_VP_INDEX, -EINVAL),
> _STATUS_INFO(HV_STATUS_NOT_FOUND, -EIO),
> diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
> index ffa25cd6e4e9..dfa27be66ff7 100644
> --- a/drivers/hv/hv_proc.c
> +++ b/drivers/hv/hv_proc.c
> @@ -119,6 +119,9 @@ int hv_deposit_memory_node(int node, u64 partition_id,
> case HV_STATUS_INSUFFICIENT_MEMORY:
> num_pages = 1;
> break;
> + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY:
> + num_pages = HV_MAX_CONTIGUOUS_ALLOCATION_PAGES;
> + break;
> default:
> hv_status_err(hv_status, "Unexpected!\n");
> return -ENOMEM;
> @@ -131,6 +134,7 @@ bool hv_result_needs_memory(u64 status)
> {
> switch (hv_result(status)) {
> case HV_STATUS_INSUFFICIENT_MEMORY:
> + case HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY:
> return true;
> }
> return false;
> diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h
> index 04b18d0e37af..70f22ef44948 100644
> --- a/include/hyperv/hvgdk_mini.h
> +++ b/include/hyperv/hvgdk_mini.h
> @@ -38,6 +38,7 @@ struct hv_u128 {
> #define HV_STATUS_INVALID_LP_INDEX 0x41
> #define HV_STATUS_INVALID_REGISTER_VALUE 0x50
> #define HV_STATUS_OPERATION_FAILED 0x71
> +#define HV_STATUS_INSUFFICIENT_CONTIGUOUS_MEMORY 0x75
> #define HV_STATUS_TIME_OUT 0x78
> #define HV_STATUS_CALL_PENDING 0x79
> #define HV_STATUS_VTL_ALREADY_ENABLED 0x86
> diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h
> index c0300910808b..091c03e26046 100644
> --- a/include/hyperv/hvhdk_mini.h
> +++ b/include/hyperv/hvhdk_mini.h
> @@ -7,6 +7,8 @@
>
> #include "hvgdk_mini.h"
>
> +#define HV_MAX_CONTIGUOUS_ALLOCATION_PAGES 8
> +
> /*
> * Doorbell connection_info flags.
> */
>
>
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
^ permalink raw reply
* Re: [PATCH v2 2/4] mshv: Introduce hv_deposit_memory helper functions
From: Anirudh Rayabharam @ 2026-02-05 18:01 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <177005514346.120041.5702271891856790910.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On Mon, Feb 02, 2026 at 05:59:03PM +0000, Stanislav Kinsburskii wrote:
> Introduce hv_deposit_memory_node() and hv_deposit_memory() helper
> functions to handle memory deposition with proper error handling.
>
> The new hv_deposit_memory_node() function takes the hypervisor status
> as a parameter and validates it before depositing pages. It checks for
> HV_STATUS_INSUFFICIENT_MEMORY specifically and returns an error for
> unexpected status codes.
>
> This is a precursor patch to new out-of-memory error codes support.
> No functional changes intended.
>
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
> drivers/hv/hv_proc.c | 22 ++++++++++++++++++++--
> drivers/hv/mshv_root_hv_call.c | 25 +++++++++----------------
> drivers/hv/mshv_root_main.c | 3 +--
> include/asm-generic/mshyperv.h | 10 ++++++++++
> 4 files changed, 40 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
> index e53204b9e05d..ffa25cd6e4e9 100644
> --- a/drivers/hv/hv_proc.c
> +++ b/drivers/hv/hv_proc.c
> @@ -110,6 +110,23 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> }
> EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
>
> +int hv_deposit_memory_node(int node, u64 partition_id,
> + u64 hv_status)
> +{
> + u32 num_pages;
> +
> + switch (hv_result(hv_status)) {
> + case HV_STATUS_INSUFFICIENT_MEMORY:
> + num_pages = 1;
> + break;
> + default:
> + hv_status_err(hv_status, "Unexpected!\n");
> + return -ENOMEM;
> + }
> + return hv_call_deposit_pages(node, partition_id, num_pages);
> +}
> +EXPORT_SYMBOL_GPL(hv_deposit_memory_node);
> +
> bool hv_result_needs_memory(u64 status)
> {
> switch (hv_result(status)) {
> @@ -155,7 +172,8 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
> }
> break;
> }
> - ret = hv_call_deposit_pages(node, hv_current_partition_id, 1);
> + ret = hv_deposit_memory_node(node, hv_current_partition_id,
> + status);
> } while (!ret);
>
> return ret;
> @@ -197,7 +215,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
> }
> break;
> }
> - ret = hv_call_deposit_pages(node, partition_id, 1);
> + ret = hv_deposit_memory_node(node, partition_id, status);
>
> } while (!ret);
>
> diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> index 89afeeda21dd..174431cb5e0e 100644
> --- a/drivers/hv/mshv_root_hv_call.c
> +++ b/drivers/hv/mshv_root_hv_call.c
> @@ -123,8 +123,7 @@ int hv_call_create_partition(u64 flags,
> break;
> }
> local_irq_restore(irq_flags);
> - ret = hv_call_deposit_pages(NUMA_NO_NODE,
> - hv_current_partition_id, 1);
> + ret = hv_deposit_memory(hv_current_partition_id, status);
> } while (!ret);
>
> return ret;
> @@ -151,7 +150,7 @@ int hv_call_initialize_partition(u64 partition_id)
> ret = hv_result_to_errno(status);
> break;
> }
> - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
> + ret = hv_deposit_memory(partition_id, status);
> } while (!ret);
>
> return ret;
> @@ -465,8 +464,7 @@ int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
> }
> local_irq_restore(flags);
>
> - ret = hv_call_deposit_pages(NUMA_NO_NODE,
> - partition_id, 1);
> + ret = hv_deposit_memory(partition_id, status);
> } while (!ret);
>
> return ret;
> @@ -525,8 +523,7 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
> }
> local_irq_restore(flags);
>
> - ret = hv_call_deposit_pages(NUMA_NO_NODE,
> - partition_id, 1);
> + ret = hv_deposit_memory(partition_id, status);
> } while (!ret);
>
> return ret;
> @@ -573,7 +570,7 @@ static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
>
> local_irq_restore(flags);
>
> - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id, 1);
> + ret = hv_deposit_memory(partition_id, status);
> } while (!ret);
>
> return ret;
> @@ -722,8 +719,7 @@ hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
> ret = hv_result_to_errno(status);
> break;
> }
> - ret = hv_call_deposit_pages(NUMA_NO_NODE, port_partition_id, 1);
> -
> + ret = hv_deposit_memory(port_partition_id, status);
> } while (!ret);
>
> return ret;
> @@ -776,8 +772,7 @@ hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
> ret = hv_result_to_errno(status);
> break;
> }
> - ret = hv_call_deposit_pages(NUMA_NO_NODE,
> - connection_partition_id, 1);
> + ret = hv_deposit_memory(connection_partition_id, status);
> } while (!ret);
>
> return ret;
> @@ -848,8 +843,7 @@ static int hv_call_map_stats_page2(enum hv_stats_object_type type,
> break;
> }
>
> - ret = hv_call_deposit_pages(NUMA_NO_NODE,
> - hv_current_partition_id, 1);
> + ret = hv_deposit_memory(hv_current_partition_id, status);
> } while (!ret);
>
> return ret;
> @@ -885,8 +879,7 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type,
> return ret;
> }
>
> - ret = hv_call_deposit_pages(NUMA_NO_NODE,
> - hv_current_partition_id, 1);
> + ret = hv_deposit_memory(hv_current_partition_id, status);
> if (ret)
> return ret;
> } while (!ret);
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index ee30bfa6bb2e..dce255c94f9e 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -264,8 +264,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
> if (!hv_result_needs_memory(status))
> ret = hv_result_to_errno(status);
> else
> - ret = hv_call_deposit_pages(NUMA_NO_NODE,
> - pt_id, 1);
> + ret = hv_deposit_memory(pt_id, status);
> } while (!ret);
>
> args.status = hv_result(status);
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index 452426d5b2ab..d37b68238c97 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -344,6 +344,7 @@ static inline bool hv_parent_partition(void)
> }
>
> bool hv_result_needs_memory(u64 status);
> +int hv_deposit_memory_node(int node, u64 partition_id, u64 status);
> int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
> int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
> int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
> @@ -353,6 +354,10 @@ static inline bool hv_root_partition(void) { return false; }
> static inline bool hv_l1vh_partition(void) { return false; }
> static inline bool hv_parent_partition(void) { return false; }
> static inline bool hv_result_needs_memory(u64 status) { return false; }
> +static inline int hv_deposit_memory_node(int node, u64 partition_id, u64 status)
> +{
> + return -EOPNOTSUPP;
> +}
> static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> {
> return -EOPNOTSUPP;
> @@ -367,6 +372,11 @@ static inline int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u3
> }
> #endif /* CONFIG_MSHV_ROOT */
>
> +static inline int hv_deposit_memory(u64 partition_id, u64 status)
> +{
> + return hv_deposit_memory_node(NUMA_NO_NODE, partition_id, status);
> +}
> +
> #if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
> u8 __init get_vtl(void);
> #else
>
>
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
^ permalink raw reply
* Re: [PATCH v2 1/4] mshv: Introduce hv_result_needs_memory() helper function
From: Anirudh Rayabharam @ 2026-02-05 17:58 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <177005513775.120041.4894134857240187839.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>
On Mon, Feb 02, 2026 at 05:58:57PM +0000, Stanislav Kinsburskii wrote:
> Replace direct comparisons of hv_result(status) against
> HV_STATUS_INSUFFICIENT_MEMORY with a new hv_result_needs_memory() helper
> function.
> This improves code readability and provides a consistent and extendable
> interface for checking out-of-memory conditions in hypercall results.
>
> No functional changes intended.
>
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
> drivers/hv/hv_proc.c | 14 ++++++++++++--
> drivers/hv/mshv_root_hv_call.c | 20 ++++++++++----------
> drivers/hv/mshv_root_main.c | 2 +-
> include/asm-generic/mshyperv.h | 3 +++
> 4 files changed, 26 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/hv/hv_proc.c b/drivers/hv/hv_proc.c
> index fbb4eb3901bb..e53204b9e05d 100644
> --- a/drivers/hv/hv_proc.c
> +++ b/drivers/hv/hv_proc.c
> @@ -110,6 +110,16 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> }
> EXPORT_SYMBOL_GPL(hv_call_deposit_pages);
>
> +bool hv_result_needs_memory(u64 status)
> +{
> + switch (hv_result(status)) {
> + case HV_STATUS_INSUFFICIENT_MEMORY:
> + return true;
> + }
> + return false;
> +}
> +EXPORT_SYMBOL_GPL(hv_result_needs_memory);
> +
> int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
> {
> struct hv_input_add_logical_processor *input;
> @@ -137,7 +147,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
> input, output);
> local_irq_restore(flags);
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> if (!hv_result_success(status)) {
> hv_status_err(status, "cpu %u apic ID: %u\n",
> lp_index, apic_id);
> @@ -179,7 +189,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
> status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
> local_irq_restore(irq_flags);
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> if (!hv_result_success(status)) {
> hv_status_err(status, "vcpu: %u, lp: %u\n",
> vp_index, flags);
> diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> index 598eaff4ff29..89afeeda21dd 100644
> --- a/drivers/hv/mshv_root_hv_call.c
> +++ b/drivers/hv/mshv_root_hv_call.c
> @@ -115,7 +115,7 @@ int hv_call_create_partition(u64 flags,
> status = hv_do_hypercall(HVCALL_CREATE_PARTITION,
> input, output);
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> if (hv_result_success(status))
> *partition_id = output->partition_id;
> local_irq_restore(irq_flags);
> @@ -147,7 +147,7 @@ int hv_call_initialize_partition(u64 partition_id)
> status = hv_do_fast_hypercall8(HVCALL_INITIALIZE_PARTITION,
> *(u64 *)&input);
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> ret = hv_result_to_errno(status);
> break;
> }
> @@ -239,7 +239,7 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
>
> completed = hv_repcomp(status);
>
> - if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (hv_result_needs_memory(status)) {
> ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> HV_MAP_GPA_DEPOSIT_PAGES);
> if (ret)
> @@ -455,7 +455,7 @@ int hv_call_get_vp_state(u32 vp_index, u64 partition_id,
>
> status = hv_do_hypercall(control, input, output);
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> if (hv_result_success(status) && ret_output)
> memcpy(ret_output, output, sizeof(*output));
>
> @@ -518,7 +518,7 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id,
>
> status = hv_do_hypercall(control, input, NULL);
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> local_irq_restore(flags);
> ret = hv_result_to_errno(status);
> break;
> @@ -563,7 +563,7 @@ static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type,
> status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input,
> output);
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> if (hv_result_success(status))
> *state_page = pfn_to_page(output->map_location);
> local_irq_restore(flags);
> @@ -718,7 +718,7 @@ hv_call_create_port(u64 port_partition_id, union hv_port_id port_id,
> if (hv_result_success(status))
> break;
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> ret = hv_result_to_errno(status);
> break;
> }
> @@ -772,7 +772,7 @@ hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id,
> if (hv_result_success(status))
> break;
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> ret = hv_result_to_errno(status);
> break;
> }
> @@ -843,7 +843,7 @@ static int hv_call_map_stats_page2(enum hv_stats_object_type type,
> if (!ret)
> break;
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> hv_status_debug(status, "\n");
> break;
> }
> @@ -878,7 +878,7 @@ static int hv_call_map_stats_page(enum hv_stats_object_type type,
> pfn = output->map_location;
>
> local_irq_restore(flags);
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
> + if (!hv_result_needs_memory(status)) {
> ret = hv_result_to_errno(status);
> if (hv_result_success(status))
> break;
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index 6a6bf641b352..ee30bfa6bb2e 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -261,7 +261,7 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
> if (hv_result_success(status))
> break;
>
> - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
> + if (!hv_result_needs_memory(status))
> ret = hv_result_to_errno(status);
> else
> ret = hv_call_deposit_pages(NUMA_NO_NODE,
> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
> index ecedab554c80..452426d5b2ab 100644
> --- a/include/asm-generic/mshyperv.h
> +++ b/include/asm-generic/mshyperv.h
> @@ -342,6 +342,8 @@ static inline bool hv_parent_partition(void)
> {
> return hv_root_partition() || hv_l1vh_partition();
> }
> +
> +bool hv_result_needs_memory(u64 status);
> int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
> int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
> int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
> @@ -350,6 +352,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
> static inline bool hv_root_partition(void) { return false; }
> static inline bool hv_l1vh_partition(void) { return false; }
> static inline bool hv_parent_partition(void) { return false; }
> +static inline bool hv_result_needs_memory(u64 status) { return false; }
> static inline int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
> {
> return -EOPNOTSUPP;
>
>
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
^ permalink raw reply
* Re: [PATCH v0 15/15] mshv: Populate mmio mappings for PCI passthru
From: Mukesh R @ 2026-02-05 17:57 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
linux-arch, kys, haiyangz, wei.liu, decui, longli,
catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
mhklinux
In-Reply-To: <aYTFJB4UcRkL2NwG@skinsburskii.localdomain>
On 2/5/26 08:28, Stanislav Kinsburskii wrote:
> On Wed, Feb 04, 2026 at 02:52:54PM -0800, Mukesh R wrote:
>> On 2/2/26 08:30, Stanislav Kinsburskii wrote:
>>> On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
>>>> On 1/27/26 10:57, Stanislav Kinsburskii wrote:
>>>>> On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
>>>>>> On 1/26/26 10:15, Stanislav Kinsburskii wrote:
>>>>>>> On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
>>>>>>>> On 1/20/26 17:53, Stanislav Kinsburskii wrote:
>>>>>>>>> On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
>>>>>>>>>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>>>>>
>>>>>>>>>> Upon guest access, in case of missing mmio mapping, the hypervisor
>>>>>>>>>> generates an unmapped gpa intercept. In this path, lookup the PCI
>>>>>>>>>> resource pfn for the guest gpa, and ask the hypervisor to map it
>>>>>>>>>> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
>>>>>>>>>> and obtained via fixup_user_fault call (similar to KVM).
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>>>>> ---
>>>>>>>>>> drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
>>>>>>>>>> 1 file changed, 115 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>>>>>>>>>> index 03f3aa9f5541..4c8bc7cd0888 100644
>>>>>>>>>> --- a/drivers/hv/mshv_root_main.c
>>>>>>>>>> +++ b/drivers/hv/mshv_root_main.c
>>>>>>>>>> @@ -56,6 +56,14 @@ struct hv_stats_page {
>>>>>>>>>> };
>>>>>>>>>> } __packed;
>>>>>>>>>> +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
>>>>>>>>>> +static int __init setup_hv_full_mmio(char *str)
>>>>>>>>>> +{
>>>>>>>>>> + hv_nofull_mmio = true;
>>>>>>>>>> + return 0;
>>>>>>>>>> +}
>>>>>>>>>> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
>>>>>>>>>> +
>>>>>>>>>> struct mshv_root mshv_root;
>>>>>>>>>> enum hv_scheduler_type hv_scheduler_type;
>>>>>>>>>> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>>>>>>>>>> }
>>>>>>>>>> #ifdef CONFIG_X86_64
>>>>>>>>>> +
>>>>>>>>>> +/*
>>>>>>>>>> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
>>>>>>>>>> + * else just return -errno.
>>>>>>>>>> + */
>>>>>>>>>> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
>>>>>>>>>> + u64 *mmio_pfnp)
>>>>>>>>>> +{
>>>>>>>>>> + struct vm_area_struct *vma;
>>>>>>>>>> + bool is_mmio;
>>>>>>>>>> + u64 uaddr;
>>>>>>>>>> + struct mshv_mem_region *mreg;
>>>>>>>>>> + struct follow_pfnmap_args pfnmap_args;
>>>>>>>>>> + int rc = -EINVAL;
>>>>>>>>>> +
>>>>>>>>>> + /*
>>>>>>>>>> + * Do not allow mem region to be deleted beneath us. VFIO uses
>>>>>>>>>> + * useraddr vma to lookup pci bar pfn.
>>>>>>>>>> + */
>>>>>>>>>> + spin_lock(&pt->pt_mem_regions_lock);
>>>>>>>>>> +
>>>>>>>>>> + /* Get the region again under the lock */
>>>>>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>>>>>> + goto unlock_pt_out;
>>>>>>>>>> +
>>>>>>>>>> + uaddr = mreg->start_uaddr +
>>>>>>>>>> + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
>>>>>>>>>> +
>>>>>>>>>> + mmap_read_lock(current->mm);
>>>>>>>>>
>>>>>>>>> Semaphore can't be taken under spinlock.
>>>>>>>
>>>>>>>>
>>>>>>>> Yeah, something didn't feel right here and I meant to recheck, now regret
>>>>>>>> rushing to submit the patch.
>>>>>>>>
>>>>>>>> Rethinking, I think the pt_mem_regions_lock is not needed to protect
>>>>>>>> the uaddr because unmap will properly serialize via the mm lock.
>>>>>>>>
>>>>>>>>
>>>>>>>>>> + vma = vma_lookup(current->mm, uaddr);
>>>>>>>>>> + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
>>>>>>>>>
>>>>>>>>> Why is this check needed again?
>>>>>>>>
>>>>>>>> To make sure region did not change. This check is under lock.
>>>>>>>>
>>>>>>>
>>>>>>> How can this happen? One can't change VMA type without unmapping it
>>>>>>> first. And unmapping it leads to a kernel MMIO region state dangling
>>>>>>> around without corresponding user space mapping.
>>>>>>
>>>>>> Right, and vm_flags would not be mmio expected then.
>>>>>>
>>>>>>> This is similar to dangling pinned regions and should likely be
>>>>>>> addressed the same way by utilizing MMU notifiers to destroy memory
>>>>>>> regions if the VMA is detached.
>>>>>>
>>>>>> I don't think we need that. Either it succeeds if the region did not
>>>>>> change at all, or just fails.
>>>>>>
>>>>>
>>>>> I'm afraid we do, as if the driver mapped a page with the previous
>>>>> memory region, and then the region is unmapped, the page will stay
>>>>> mapped in the hypervisor, but will be considered free by kernel, which
>>>>> in turn will lead to a GPF upon next allocation.
>>>>
>>>> There are no ram pages for mmio regions. Also, we don't do much with
>>>> mmio regions other than tell the hyp about it.
>>>>
>>>
>>> So, are you saying that the hypervisor does not use these pages and only
>>> tracks them? That would make things easier.
>>> However, if we later try to map a GPA that is already mapped, will the
>>> hypervisor return an error?
>>
>> Hypervisor does not return an error.
>>
>
> So, what happens if we map a GPA that is already mapped? Does it just
> remap it to the new PFN?
yes, otherwise it would return error, right?
> Thanks,
> Stanislav
>
>>
>>
>>> Thanks,
>>> Stanislav
>>>
>>>> Thanks,
>>>> -Mukesh
>>>>
>>>>
>>>>> With pinned regions the issue is similar but less impactful: pages can't
>>>>> be released by user space unmapping and thus will be simply leaked, but
>>>>> the system stays intact.
>>>>>
>>>>> MMIO regions are similar to movable regions in this regard: they don't
>>>>> reference the user pages, and thus this guest region replacement is a
>>>>> straight way to a kernel panic.
>>>>>
>>>>>>
>>>>>>>>> The region type is stored on the region itself.
>>>>>>>>> And the type is checked on the caller side.
>>>>>>>>>
>>>>>>>>>> + if (!is_mmio)
>>>>>>>>>> + goto unlock_mmap_out;
>>>>>>>>>> +
>>>>>>>>>> + pfnmap_args.vma = vma;
>>>>>>>>>> + pfnmap_args.address = uaddr;
>>>>>>>>>> +
>>>>>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>>>>>> + if (rc) {
>>>>>>>>>> + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
>>>>>>>>>> + NULL);
>>>>>>>>>> + if (rc)
>>>>>>>>>> + goto unlock_mmap_out;
>>>>>>>>>> +
>>>>>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>>>>>> + if (rc)
>>>>>>>>>> + goto unlock_mmap_out;
>>>>>>>>>> + }
>>>>>>>>>> +
>>>>>>>>>> + *mmio_pfnp = pfnmap_args.pfn;
>>>>>>>>>> + follow_pfnmap_end(&pfnmap_args);
>>>>>>>>>> +d
>>>>>>>>>> +unlock_mmap_out:
>>>>>>>>>> + mmap_read_unlock(current->mm);
>>>>>>>>>> +unlock_pt_out:
>>>>>>>>>> + spin_unlock(&pt->pt_mem_regions_lock);
>>>>>>>>>> + return rc;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> +/*
>>>>>>>>>> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
>>>>>>>>>> + * and resolve if possible.
>>>>>>>>>> + * Returns: True if valid mmio intercept and it was handled, else false
>>>>>>>>>> + */
>>>>>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
>>>>>>>>>> +{
>>>>>>>>>> + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
>>>>>>>>>> + struct hv_x64_memory_intercept_message *msg;
>>>>>>>>>> + union hv_x64_memory_access_info accinfo;
>>>>>>>>>> + u64 gfn, mmio_spa, numpgs;
>>>>>>>>>> + struct mshv_mem_region *mreg;
>>>>>>>>>> + int rc;
>>>>>>>>>> + struct mshv_partition *pt = vp->vp_partition;
>>>>>>>>>> +
>>>>>>>>>> + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
>>>>>>>>>> + accinfo = msg->memory_access_info;
>>>>>>>>>> +
>>>>>>>>>> + if (!accinfo.gva_gpa_valid)
>>>>>>>>>> + return false;
>>>>>>>>>> +
>>>>>>>>>> + /* Do a fast check and bail if non mmio intercept */
>>>>>>>>>> + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
>>>>>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>>>>>
>>>>>>>>> This call needs to be protected by the spinlock.
>>>>>>>>
>>>>>>>> This is sorta fast path to bail. We recheck under partition lock above.
>>>>>>>>
>>>>>>>
>>>>>>> Accessing the list of regions without lock is unsafe.
>>>>>>
>>>>>> I am not sure why? This check is done by a vcpu thread, so regions
>>>>>> will not have just gone away.
>>>>>>
>>>>>
>>>>> This is a shared resource. Multiple VP threads get into this function
>>>>> simultaneously, so there is a race already. But this one we can live
>>>>> with without locking as they don't mutate the list of the regions.
>>>>>
>>>>> The issue happens when VMM adds or removes another region as it mutates
>>>>> the list and races with VP threads doing this lookup.
>>>>>
>>>>> Thanks,
>>>>> Stanislav
>>>>>
>>>>>
>>>>>> Thanks,
>>>>>> -Mukesh
>>>>>>
>>>>>>
>>>>>>> Thanks,
>>>>>>> Stanislav
>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> -Mukesh
>>>>>>>>
>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> Stanislav
>>>>>>>>>
>>>>>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>>>>>> + return false;
>>>>>>>>>> +
>>>>>>>>>> + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
>>>>>>>>>> + if (rc)
>>>>>>>>>> + return false;
>>>>>>>>>> +
>>>>>>>>>> + if (!hv_nofull_mmio) { /* default case */
>>>>>>>>>> + gfn = mreg->start_gfn;
>>>>>>>>>> + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
>>>>>>>>>> + numpgs = mreg->nr_pages;
>>>>>>>>>> + } else
>>>>>>>>>> + numpgs = 1;
>>>>>>>>>> +
>>>>>>>>>> + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
>>>>>>>>>> +
>>>>>>>>>> + return rc == 0;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>> static struct mshv_mem_region *
>>>>>>>>>> mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>>>>>>>>>> {
>>>>>>>>>> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>>>>>>>>> return ret;
>>>>>>>>>> }
>>>>>>>>>> +
>>>>>>>>>> #else /* CONFIG_X86_64 */
>>>>>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
>>>>>>>>>> static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>>>>>>>>>> #endif /* CONFIG_X86_64 */
>>>>>>>>>> static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>>>>>>>>>> {
>>>>>>>>>> switch (vp->vp_intercept_msg_page->header.message_type) {
>>>>>>>>>> + case HVMSG_UNMAPPED_GPA:
>>>>>>>>>> + return mshv_handle_unmapped_gpa(vp);
>>>>>>>>>> case HVMSG_GPA_INTERCEPT:
>>>>>>>>>> return mshv_handle_gpa_intercept(vp);
>>>>>>>>>> }
>>>>>>>>>> --
>>>>>>>>>> 2.51.2.vfs.0.1
>>>>>>>>>>
^ permalink raw reply
* Re: [PATCH] mshv: fix SRCU protection in irqfd resampler ack handler
From: Anirudh Rayabharam @ 2026-02-05 17:53 UTC (permalink / raw)
To: lirongqing
Cc: K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
linux-hyperv, linux-kernel
In-Reply-To: <20260205094010.4301-1-lirongqing@baidu.com>
On Thu, Feb 05, 2026 at 04:40:10AM -0500, lirongqing wrote:
> From: Li RongQing <lirongqing@baidu.com>
>
> Replace hlist_for_each_entry_rcu() with hlist_for_each_entry_srcu()
> in mshv_irqfd_resampler_ack() to correctly handle SRCU-protected
> linked list traversal.
>
> The function uses SRCU (sleepable RCU) synchronization via
> partition->pt_irq_srcu, but was incorrectly using the RCU variant
> for list iteration. This could lead to race conditions when the
> list is modified concurrently.
>
> Also add srcu_read_lock_held() assertion as required by
> hlist_for_each_entry_srcu() to ensure we're in the proper
> read-side critical section.
>
> Signed-off-by: Li RongQing <lirongqing@baidu.com>
Reviewed-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> ---
> drivers/hv/mshv_eventfd.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
> index 0b75ff1..6d176ed 100644
> --- a/drivers/hv/mshv_eventfd.c
> +++ b/drivers/hv/mshv_eventfd.c
> @@ -87,8 +87,9 @@ static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
>
> idx = srcu_read_lock(&partition->pt_irq_srcu);
>
> - hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
> - irqfd_resampler_hnode) {
> + hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list,
> + irqfd_resampler_hnode,
> + srcu_read_lock_held(&partition->pt_irq_srcu)) {
> if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
> hv_call_clear_virtual_interrupt(partition->pt_id);
>
> --
> 2.9.4
>
^ permalink raw reply
* Re: [PATCH] mshv: fix SRCU protection in irqfd resampler ack handler
From: Stanislav Kinsburskii @ 2026-02-05 17:14 UTC (permalink / raw)
To: lirongqing
Cc: K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
linux-hyperv, linux-kernel
In-Reply-To: <20260205094010.4301-1-lirongqing@baidu.com>
On Thu, Feb 05, 2026 at 04:40:10AM -0500, lirongqing wrote:
> From: Li RongQing <lirongqing@baidu.com>
>
> Replace hlist_for_each_entry_rcu() with hlist_for_each_entry_srcu()
> in mshv_irqfd_resampler_ack() to correctly handle SRCU-protected
> linked list traversal.
>
> The function uses SRCU (sleepable RCU) synchronization via
> partition->pt_irq_srcu, but was incorrectly using the RCU variant
> for list iteration. This could lead to race conditions when the
> list is modified concurrently.
>
> Also add srcu_read_lock_held() assertion as required by
> hlist_for_each_entry_srcu() to ensure we're in the proper
> read-side critical section.
>
Thank you.
Acked-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> Signed-off-by: Li RongQing <lirongqing@baidu.com>
> ---
> drivers/hv/mshv_eventfd.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
> index 0b75ff1..6d176ed 100644
> --- a/drivers/hv/mshv_eventfd.c
> +++ b/drivers/hv/mshv_eventfd.c
> @@ -87,8 +87,9 @@ static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
>
> idx = srcu_read_lock(&partition->pt_irq_srcu);
>
> - hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
> - irqfd_resampler_hnode) {
> + hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list,
> + irqfd_resampler_hnode,
> + srcu_read_lock_held(&partition->pt_irq_srcu)) {
> if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
> hv_call_clear_virtual_interrupt(partition->pt_id);
>
> --
> 2.9.4
>
^ permalink raw reply
* Re: [PATCH] mshv: Make MSHV mutually exclusive with KEXEC
From: Stanislav Kinsburskii @ 2026-02-05 17:12 UTC (permalink / raw)
To: Anirudh Rayabharam
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aYQjt1FF_v-fNZFj@anirudh-surface.localdomain>
On Thu, Feb 05, 2026 at 04:59:35AM +0000, Anirudh Rayabharam wrote:
> On Wed, Feb 04, 2026 at 10:33:11AM -0800, Stanislav Kinsburskii wrote:
> > On Wed, Feb 04, 2026 at 05:33:29AM +0000, Anirudh Rayabharam wrote:
> > > On Tue, Feb 03, 2026 at 11:42:58AM -0800, Stanislav Kinsburskii wrote:
> > > > On Tue, Feb 03, 2026 at 04:46:03PM +0000, Anirudh Rayabharam wrote:
> > > > > On Tue, Feb 03, 2026 at 07:40:36AM -0800, Stanislav Kinsburskii wrote:
> > > > > > On Tue, Feb 03, 2026 at 10:34:28AM +0530, Anirudh Rayabharam wrote:
> > > > > > > On Mon, Feb 02, 2026 at 11:18:27AM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > On Mon, Feb 02, 2026 at 07:01:01PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > On Mon, Feb 02, 2026 at 09:10:00AM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > On Fri, Jan 30, 2026 at 08:32:45PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > > > On Fri, Jan 30, 2026 at 10:46:45AM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > On Fri, Jan 30, 2026 at 05:11:12PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > > > > > On Wed, Jan 28, 2026 at 03:11:14PM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > > > On Wed, Jan 28, 2026 at 04:16:31PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > > > > > > > On Mon, Jan 26, 2026 at 12:46:44PM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > > > > > On Tue, Jan 27, 2026 at 12:19:24AM +0530, Anirudh Rayabharam wrote:
> > > > > > > > > > > > > > > > > On Fri, Jan 23, 2026 at 10:20:53PM +0000, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > > > > > > > The MSHV driver deposits kernel-allocated pages to the hypervisor during
> > > > > > > > > > > > > > > > > > runtime and never withdraws them. This creates a fundamental incompatibility
> > > > > > > > > > > > > > > > > > with KEXEC, as these deposited pages remain unavailable to the new kernel
> > > > > > > > > > > > > > > > > > loaded via KEXEC, leading to potential system crashes upon kernel accessing
> > > > > > > > > > > > > > > > > > hypervisor deposited pages.
> > > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > > Make MSHV mutually exclusive with KEXEC until proper page lifecycle
> > > > > > > > > > > > > > > > > > management is implemented.
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > Someone might want to stop all guest VMs and do a kexec. Which is valid
> > > > > > > > > > > > > > > > > and would work without any issue for L1VH.
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > No, it won't work and hypervisor-deposited pages won't be withdrawn.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > All pages that were deposited in the context of a guest partition (i.e.
> > > > > > > > > > > > > > > with the guest partition ID), would be withdrawn when you kill the VMs,
> > > > > > > > > > > > > > > right? What other deposited pages would be left?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > The driver deposits two types of pages: one for the guests (withdrawn
> > > > > > > > > > > > > > > upon guest shutdown) and the other - for the host itself (never
> > > > > > > > > > > > > > withdrawn).
> > > > > > > > > > > > > > See hv_call_create_partition, for example: it deposits pages for the
> > > > > > > > > > > > > > host partition.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Hmm.. I see. Is it not possible to reclaim this memory in module_exit?
> > > > > > > > > > > > > Also, can't we forcefully kill all running partitions in module_exit and
> > > > > > > > > > > > > then reclaim memory? Would this help with kernel consistency
> > > > > > > > > > > > > irrespective of userspace behavior?
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > It would, but this is sloppy and cannot be a long-term solution.
> > > > > > > > > > > >
> > > > > > > > > > > > It is also not reliable. We have no hook to prevent kexec. So if we fail
> > > > > > > > > > > > to kill the guest or reclaim the memory for any reason, the new kernel
> > > > > > > > > > > > may still crash.
> > > > > > > > > > >
> > > > > > > > > > > Actually guests won't be running by the time we reach our module_exit
> > > > > > > > > > > function during a kexec. Userspace processes would've been killed by
> > > > > > > > > > > then.
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > No, they will not: "kexec -e" doesn't kill user processes.
> > > > > > > > > > We must not rely on OS to do graceful shutdown before doing
> > > > > > > > > > kexec.
> > > > > > > > >
> > > > > > > > > I see kexec -e is too brutal. Something like systemctl kexec is
> > > > > > > > > more graceful and is probably used more commonly. In this case at least
> > > > > > > > > we could register a reboot notifier and attempt to clean things up.
> > > > > > > > >
> > > > > > > > > I think it is better to support kexec to this extent rather than
> > > > > > > > > disabling it entirely.
> > > > > > > > >
> > > > > > > >
> > > > > > > > You do understand that once our kernel is released to third parties, we
> > > > > > > > can’t control how they will use kexec, right?
> > > > > > >
> > > > > > > Yes, we can't. But that's okay. It is fine for us to say that only some
> > > > > > > kexec scenarios are supported and some aren't (iff you're creating VMs
> > > > > > > using MSHV; if you're not creating VMs all of kexec is supported).
> > > > > > >
> > > > > >
> > > > > > Well, I disagree here. If we say the kernel supports MSHV, we must
> > > > > > provide a robust solution. A partially working solution is not
> > > > > > acceptable. It makes us look careless and can damage our reputation as a
> > > > > > team (and as a company).
> > > > >
> > > > > It won't if we call out upfront what is supported and what is not.
> > > > >
> > > > > >
> > > > > > > >
> > > > > > > > This is a valid and existing option. We have to account for it. Yet
> > > > > > > > again, L1VH will be used by arbitrary third parties out there, not just
> > > > > > > > by us.
> > > > > > > >
> > > > > > > > We can’t say the kernel supports MSHV until we close these gaps. We must
> > > > > > >
> > > > > > > > We can. It is okay to say some scenarios are supported and some aren't.
> > > > > > >
> > > > > > > All kexecs are supported if they never create VMs using MSHV. If they do
> > > > > > > create VMs using MSHV and we implement cleanup in a reboot notifier at
> > > > > > > > least systemctl kexec and crashdump kexec would work, which are probably the
> > > > > > > most common uses of kexec. It's okay to say that this is all we support
> > > > > > > as of now.
> > > > > > >
> > > > > >
> > > > > > I'm repeating myself, but I'll try to put it differently.
> > > > > > There won't be any kernel core collected if a page was deposited. You're
> > > > > > arguing for a lost cause here. Once a page is allocated and deposited,
> > > > > > the crash kernel will try to write it into the core.
> > > > >
> > > > > That's why we have to implement something where we attempt to destroy
> > > > > partitions and reclaim memory (and BUG() out if that fails; which
> > > > > hopefully should happen very rarely if at all). This should be *the*
> > > > > solution we work towards. We don't need a temporary disable kexec
> > > > > solution.
> > > > >
> > > >
> > > > No, the solution is to preserve the shared state and pass it over via KHO.
> > >
> > > Okay, then work towards it without doing temporary KEXEC disable. We can
> > > call out that kexec is not supported until then. Disabling KEXEC is too
> > > intrusive.
> > >
> >
> > What do you mean by "too intrusive"? The change is local to driver's
> > Kconfig. There are no verbal "callouts" in upstream Linux - that's
> > exactly what Kconfig is used for. Once the proper solution is
> > implemented, we can remove the restriction.
> >
> > > Is there any precedent for this? Do you know if any driver ever disabled
> > > KEXEC this way?
> > >
> >
> > No, but there is no other similar driver like this one.
>
> Doesn't have to be like this one. There could be issues with device
> states during kexec state.
>
> > Why does it matter though?
>
> To learn from past precedents.
>
> >
> > > >
> > > > > >
> > > > > > > Also, what makes you think customers would even be interested in enabling
> > > > > > > our module in their kernel configs if it takes away kexec?
> > > > > > >
> > > > > >
> > > > > > It's simple: L1VH isn't a host, so I can spin up new VMs instead of
> > > > > > servicing the existing ones.
> > > > >
> > > > > And what about the L2 VM state then? They might not be throwaway in all
> > > > > cases.
> > > > >
> > > >
> > > > L2 guest can (and likely will) be migrated from the old L1VH to the new
> > > > one.
> > > > And this is most likely the current scenario customers are using.
> > > >
> > > > > >
> > > > > > Why do you think there won’t be customers interested in using MSHV in
> > > > > > L1VH without kexec support?
> > > > >
> > > > > Because they could already be using kexec for their servicing needs or
> > > > > whatever. And no we can't just say "don't service these VMs just spin up
> > > > > new ones".
> > > > >
> > > >
> > > > Are you speculating or know for sure?
> > >
> > > It's a reasonable assumption that people are using kexec for servicing.
> > >
> >
> > Again, using kexec for servicing is not supported: why pretend it is?
>
> What this patch effectively asserts is that kexec is unsupported whenever the
> MSHV driver is enabled. But that is not accurate. Enabling MSHV does not
> necessarily imply that it is being used. The correct statement is that kexec is
> unsupported only when MSHV is *in use*, i.e. when one or more VMs are
> running.
>
> By disabling kexec unconditionally, the patch prevents a valid workflow in
> situations where no VMs exist and kexec would work without issue. This imposes a
> blanket restriction instead of enforcing the actual requirement.
>
> And sure, I understand there is no way to enforce that actual
> requirement. So this is what I propose:
>
> The statement "kexec is not supported when the MSHV driver is used" can be
> documented on docs.microsoft.com once direct virtualization becomes broadly
> available. The documentation can also provide operational guidance, such as
> shutting down all VMs before invoking kexec for servicing. This preserves a
> practical path for users who rely on kexec. If kexec is disabled entirely, that
> flexibility is lost.
>
> The stricter approach ensures users cannot accidentally make a mistake, which
> has its merits. However, my approach gives more power and discretion to
> the user. In parallel, we of course continue to work on making it
> robust.
>
The flexibility is much smaller than you described. The host can’t kexec
if a VM was ever created, because we don’t withdraw the host pages.
Even if we try to withdraw pages during kexec, it won’t help with crash
collection. Those pages will be in use and won’t be available to
withdraw.
So the trade-off is between being able to kexec safely only before any
VM has been launched, or blocking it completely.
> >
> > > >
> > > > > Also, keep in mind that once L1VH is available in Azure, the distros
> > > > > that run on it would be the same distros that run on all other Azure
> > > > > VMs. There won't be special distros with a kernel specifically built for
> > > > > L1VH. And KEXEC is generally enabled in distros. Distro vendors won't be
> > > > > happy that they would need to publish a separate version of their image with
> > > > > MSHV_ROOT enabled and KEXEC disabled because they wouldn't want KEXEC to
> > > > > be disabled for all Azure VMs. Also, the customers will be confused why
> > > > > the same distro doesn't work on L1VH.
> > > > >
> > > >
> > > > I don't think distro happiness is our concern. They already build custom
> > >
> > > If distros are not happy they won't package this and consequently
> > > nobody will use it.
> > >
> >
> > Could you provide an example of such issues in the past?
> >
> > > > versions for Azure. They can build another custom version for L1VH if
> > > > needed.
> > >
> > > We should at least check if they are ready to do this.
> > >
> >
> > This is a labor-intensive and long-term check. Unless there is a solid
> > evidence that they won't do it, I don't see the point in doing this.
>
> It is reasonable to assume that maintaining an additional flavor of a
> distro is an overhead (maintain new package(s), maintain Azure
> marketplace images etc etc). This should be enough reason to check. Not
> everything needs a solid evidence. Often times a reasonable suspicion
> will do.
>
There will be a new kernel flavor anyway. That means a new kernel
package. If we also need a separate distro image for MSHV on Azure VMs,
it will be needed regardless of kexec support. There won’t be a generic
Ubuntu build that works both for regular guest VMs and for L1VH VMs any
time soon.
Thanks,
Stanislav
> Thanks,
> Anirudh.
>
> >
> > Thanks,
> > Stanislav
> >
> > > Thanks,
> > > Anirudh.
> > >
> > > >
> > > > Anyway, I don't see the point in continuing this discussion. All points
> > > > have been made, and solutions have been proposed.
> > > >
> > > > If you can come up with something better in the next few days, so we at
> > > > least have a chance to get it merged in the next merge window, great. If
> > > > not, we should explicitly forbid the unsupported feature and move on.
> > > >
> > > > Thanks,
> > > > Thanks,
> > > > Stanislav
> > > >
> > > > > Thanks,
> > > > > Anirudh.
^ permalink raw reply
* Re: [PATCH v0 15/15] mshv: Populate mmio mappings for PCI passthru
From: Stanislav Kinsburskii @ 2026-02-05 16:28 UTC (permalink / raw)
To: Mukesh R
Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
linux-arch, kys, haiyangz, wei.liu, decui, longli,
catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
mhklinux
In-Reply-To: <596c9549-9edc-91f3-7473-e206ddc68e76@linux.microsoft.com>
On Wed, Feb 04, 2026 at 02:52:54PM -0800, Mukesh R wrote:
> On 2/2/26 08:30, Stanislav Kinsburskii wrote:
> > On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
> > > On 1/27/26 10:57, Stanislav Kinsburskii wrote:
> > > > On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
> > > > > On 1/26/26 10:15, Stanislav Kinsburskii wrote:
> > > > > > On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
> > > > > > > On 1/20/26 17:53, Stanislav Kinsburskii wrote:
> > > > > > > > On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
> > > > > > > > > From: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > > >
> > > > > > > > > Upon guest access, in case of missing mmio mapping, the hypervisor
> > > > > > > > > generates an unmapped gpa intercept. In this path, lookup the PCI
> > > > > > > > > resource pfn for the guest gpa, and ask the hypervisor to map it
> > > > > > > > > via hypercall. The PCI resource pfn is maintained by the VFIO driver,
> > > > > > > > > and obtained via fixup_user_fault call (similar to KVM).
> > > > > > > > >
> > > > > > > > > Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
> > > > > > > > > ---
> > > > > > > > > drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
> > > > > > > > > 1 file changed, 115 insertions(+)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> > > > > > > > > index 03f3aa9f5541..4c8bc7cd0888 100644
> > > > > > > > > --- a/drivers/hv/mshv_root_main.c
> > > > > > > > > +++ b/drivers/hv/mshv_root_main.c
> > > > > > > > > @@ -56,6 +56,14 @@ struct hv_stats_page {
> > > > > > > > > };
> > > > > > > > > } __packed;
> > > > > > > > > +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
> > > > > > > > > +static int __init setup_hv_full_mmio(char *str)
> > > > > > > > > +{
> > > > > > > > > + hv_nofull_mmio = true;
> > > > > > > > > + return 0;
> > > > > > > > > +}
> > > > > > > > > +__setup("hv_nofull_mmio", setup_hv_full_mmio);
> > > > > > > > > +
> > > > > > > > > struct mshv_root mshv_root;
> > > > > > > > > enum hv_scheduler_type hv_scheduler_type;
> > > > > > > > > @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
> > > > > > > > > }
> > > > > > > > > #ifdef CONFIG_X86_64
> > > > > > > > > +
> > > > > > > > > +/*
> > > > > > > > > + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
> > > > > > > > > + * else just return -errno.
> > > > > > > > > + */
> > > > > > > > > +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
> > > > > > > > > + u64 *mmio_pfnp)
> > > > > > > > > +{
> > > > > > > > > + struct vm_area_struct *vma;
> > > > > > > > > + bool is_mmio;
> > > > > > > > > + u64 uaddr;
> > > > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > > > + struct follow_pfnmap_args pfnmap_args;
> > > > > > > > > + int rc = -EINVAL;
> > > > > > > > > +
> > > > > > > > > + /*
> > > > > > > > > + * Do not allow mem region to be deleted beneath us. VFIO uses
> > > > > > > > > + * useraddr vma to lookup pci bar pfn.
> > > > > > > > > + */
> > > > > > > > > + spin_lock(&pt->pt_mem_regions_lock);
> > > > > > > > > +
> > > > > > > > > + /* Get the region again under the lock */
> > > > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > > > + goto unlock_pt_out;
> > > > > > > > > +
> > > > > > > > > + uaddr = mreg->start_uaddr +
> > > > > > > > > + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
> > > > > > > > > +
> > > > > > > > > + mmap_read_lock(current->mm);
> > > > > > > >
> > > > > > > > Semaphore can't be taken under spinlock.
> > > > > >
> > > > > > >
> > > > > > > Yeah, something didn't feel right here and I meant to recheck, now regret
> > > > > > > rushing to submit the patch.
> > > > > > >
> > > > > > > Rethinking, I think the pt_mem_regions_lock is not needed to protect
> > > > > > > the uaddr because unmap will properly serialize via the mm lock.
> > > > > > >
> > > > > > >
> > > > > > > > > + vma = vma_lookup(current->mm, uaddr);
> > > > > > > > > + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
> > > > > > > >
> > > > > > > > Why this check is needed again?
> > > > > > >
> > > > > > > To make sure region did not change. This check is under lock.
> > > > > > >
> > > > > >
> > > > > > How can this happen? One can't change VMA type without unmapping it
> > > > > > first. And unmapping it leads to a kernel MMIO region state dangling
> > > > > > around without corresponding user space mapping.
> > > > >
> > > > > Right, and vm_flags would not be mmio expected then.
> > > > >
> > > > > > This is similar to dangling pinned regions and should likely be
> > > > > > > addressed the same way by utilizing MMU notifiers to destroy memory
> > > > > > > regions if VMA is detached.
> > > > >
> > > > > I don't think we need that. Either it succeeds if the region did not
> > > > > change at all, or just fails.
> > > > >
> > > >
> > > > I'm afraid we do, as if the driver mapped a page with the previous
> > > > memory region, and then the region is unmapped, the page will stay
> > > > mapped in the hypervisor, but will be considered free by kernel, which
> > > > in turn will lead to GPF upon next allocation.
> > >
> > > There are no ram pages for mmio regions. Also, we don't do much with
> > > mmio regions other than tell the hyp about it.
> > >
> >
> > So, are you saying that the hypervisor does not use these pages and only
> > tracks them? That would make things easier.
> > However, if we later try to map a GPA that is already mapped, will the
> > hypervisor return an error?
>
> Hypervisor does not return an error.
>
So, what happens if we map a GPA that is already mapped? Does it just
remap it to the new PFN?
Thanks,
Stanislav
>
>
> > Thanks,
> > Stanislav
> >
> > > Thanks,
> > > -Mukesh
> > >
> > >
> > > > With pinned regions the issue is similar but less impacting: pages can't
> > > > be released by user space unmapping and thus will be simply leaked, but
> > > > the system stays intact.
> > > >
> > > > MMIO regions are similar to movable regions in this regard: they don't
> > > > reference the user pages, and thus this guest region replacement is a
> > > > straight way to kernel panic.
> > > >
> > > > >
> > > > > > > > The region type is stored on the region itself.
> > > > > > > > And the type is checked on the caller side.
> > > > > > > >
> > > > > > > > > + if (!is_mmio)
> > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > +
> > > > > > > > > + pfnmap_args.vma = vma;
> > > > > > > > > + pfnmap_args.address = uaddr;
> > > > > > > > > +
> > > > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > > > + if (rc) {
> > > > > > > > > + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
> > > > > > > > > + NULL);
> > > > > > > > > + if (rc)
> > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > +
> > > > > > > > > + rc = follow_pfnmap_start(&pfnmap_args);
> > > > > > > > > + if (rc)
> > > > > > > > > + goto unlock_mmap_out;
> > > > > > > > > + }
> > > > > > > > > +
> > > > > > > > > + *mmio_pfnp = pfnmap_args.pfn;
> > > > > > > > > + follow_pfnmap_end(&pfnmap_args);
> > > > > > > > > +d
> > > > > > > > > +unlock_mmap_out:
> > > > > > > > > + mmap_read_unlock(current->mm);
> > > > > > > > > +unlock_pt_out:
> > > > > > > > > + spin_unlock(&pt->pt_mem_regions_lock);
> > > > > > > > > + return rc;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +/*
> > > > > > > > > + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
> > > > > > > > > + * and resolve if possible.
> > > > > > > > > + * Returns: True if valid mmio intercept and it was handled, else false
> > > > > > > > > + */
> > > > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
> > > > > > > > > +{
> > > > > > > > > + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
> > > > > > > > > + struct hv_x64_memory_intercept_message *msg;
> > > > > > > > > + union hv_x64_memory_access_info accinfo;
> > > > > > > > > + u64 gfn, mmio_spa, numpgs;
> > > > > > > > > + struct mshv_mem_region *mreg;
> > > > > > > > > + int rc;
> > > > > > > > > + struct mshv_partition *pt = vp->vp_partition;
> > > > > > > > > +
> > > > > > > > > + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
> > > > > > > > > + accinfo = msg->memory_access_info;
> > > > > > > > > +
> > > > > > > > > + if (!accinfo.gva_gpa_valid)
> > > > > > > > > + return false;
> > > > > > > > > +
> > > > > > > > > + /* Do a fast check and bail if non mmio intercept */
> > > > > > > > > + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
> > > > > > > > > + mreg = mshv_partition_region_by_gfn(pt, gfn);
> > > > > > > >
> > > > > > > > This call needs to be protected by the spinlock.
> > > > > > >
> > > > > > > This is sorta fast path to bail. We recheck under partition lock above.
> > > > > > >
> > > > > >
> > > > > > Accessing the list of regions without lock is unsafe.
> > > > >
> > > > > I am not sure why? This check is done by a vcpu thread, so regions
> > > > > will not have just gone away.
> > > > >
> > > >
> > > > This is a shared resource. Multiple VP threads get into this function
> > > > simultaneously, so there is a race already. But this one we can live
> > > > with without locking as they don't mutate the list of the regions.
> > > >
> > > > The issue happens when VMM adds or removes another region as it mutates
> > > > the list and races with VP threads doing this lookup.
> > > >
> > > > Thanks,
> > > > Stanislav
> > > >
> > > >
> > > > > Thanks,
> > > > > -Mukesh
> > > > >
> > > > >
> > > > > > Thanks,
> > > > > > Stanislav
> > > > > >
> > > > > > > Thanks,
> > > > > > > -Mukesh
> > > > > > >
> > > > > > >
> > > > > > > > Thanks,
> > > > > > > > Stanislav
> > > > > > > >
> > > > > > > > > + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
> > > > > > > > > + return false;
> > > > > > > > > +
> > > > > > > > > + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
> > > > > > > > > + if (rc)
> > > > > > > > > + return false;
> > > > > > > > > +
> > > > > > > > > + if (!hv_nofull_mmio) { /* default case */
> > > > > > > > > + gfn = mreg->start_gfn;
> > > > > > > > > + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
> > > > > > > > > + numpgs = mreg->nr_pages;
> > > > > > > > > + } else
> > > > > > > > > + numpgs = 1;
> > > > > > > > > +
> > > > > > > > > + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
> > > > > > > > > +
> > > > > > > > > + return rc == 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > static struct mshv_mem_region *
> > > > > > > > > mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
> > > > > > > > > {
> > > > > > > > > @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
> > > > > > > > > return ret;
> > > > > > > > > }
> > > > > > > > > +
> > > > > > > > > #else /* CONFIG_X86_64 */
> > > > > > > > > +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
> > > > > > > > > static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
> > > > > > > > > #endif /* CONFIG_X86_64 */
> > > > > > > > > static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
> > > > > > > > > {
> > > > > > > > > switch (vp->vp_intercept_msg_page->header.message_type) {
> > > > > > > > > + case HVMSG_UNMAPPED_GPA:
> > > > > > > > > + return mshv_handle_unmapped_gpa(vp);
> > > > > > > > > case HVMSG_GPA_INTERCEPT:
> > > > > > > > > return mshv_handle_gpa_intercept(vp);
> > > > > > > > > }
> > > > > > > > > --
> > > > > > > > > 2.51.2.vfs.0.1
> > > > > > > > >
^ permalink raw reply
* Re: [PATCH] x86: mshyperv: Use kthread for vmbus interrupts on PREEMPT_RT
From: Bezdeka, Florian @ 2026-02-05 14:12 UTC (permalink / raw)
To: kys@microsoft.com, decui@microsoft.com, bp@alien8.de,
longli@microsoft.com, dave.hansen@linux.intel.com,
mingo@redhat.com, wei.liu@kernel.org, tglx@kernel.org,
Kiszka, Jan, haiyangz@microsoft.com, x86@kernel.org
Cc: linux-rt-users@vger.kernel.org, linux-hyperv@vger.kernel.org,
linux-kernel@vger.kernel.org, levymitchell0@gmail.com
In-Reply-To: <133a95d9-8148-40ea-9acc-edfd8e3ceef4@siemens.com>
On Tue, 2026-02-03 at 17:01 +0100, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> Resolves the following lockdep report when booting PREEMPT_RT on Hyper-V
> with related guest support enabled:
>
> [ 1.127941] hv_vmbus: registering driver hyperv_drm
>
> [ 1.132518] =============================
> [ 1.132519] [ BUG: Invalid wait context ]
> [ 1.132521] 6.19.0-rc8+ #9 Not tainted
> [ 1.132524] -----------------------------
> [ 1.132525] swapper/0/0 is trying to lock:
> [ 1.132526] ffff8b9381bb3c90 (&channel->sched_lock){....}-{3:3}, at: vmbus_chan_sched+0xc4/0x2b0
> [ 1.132543] other info that might help us debug this:
> [ 1.132544] context-{2:2}
> [ 1.132545] 1 lock held by swapper/0/0:
> [ 1.132547] #0: ffffffffa010c4c0 (rcu_read_lock){....}-{1:3}, at: vmbus_chan_sched+0x31/0x2b0
> [ 1.132557] stack backtrace:
> [ 1.132560] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.19.0-rc8+ #9 PREEMPT_{RT,(lazy)}
> [ 1.132565] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/25/2025
> [ 1.132567] Call Trace:
> [ 1.132570] <IRQ>
> [ 1.132573] dump_stack_lvl+0x6e/0xa0
> [ 1.132581] __lock_acquire+0xee0/0x21b0
> [ 1.132592] lock_acquire+0xd5/0x2d0
> [ 1.132598] ? vmbus_chan_sched+0xc4/0x2b0
> [ 1.132606] ? lock_acquire+0xd5/0x2d0
> [ 1.132613] ? vmbus_chan_sched+0x31/0x2b0
> [ 1.132619] rt_spin_lock+0x3f/0x1f0
> [ 1.132623] ? vmbus_chan_sched+0xc4/0x2b0
> [ 1.132629] ? vmbus_chan_sched+0x31/0x2b0
> [ 1.132634] vmbus_chan_sched+0xc4/0x2b0
> [ 1.132641] vmbus_isr+0x2c/0x150
> [ 1.132648] __sysvec_hyperv_callback+0x5f/0xa0
> [ 1.132654] sysvec_hyperv_callback+0x88/0xb0
> [ 1.132658] </IRQ>
> [ 1.132659] <TASK>
> [ 1.132660] asm_sysvec_hyperv_callback+0x1a/0x20
>
> As code paths that handle vmbus IRQs use sleepy locks under PREEMPT_RT,
> the complete vmbus_handler execution needs to be moved into thread
> context. Open-coding this allows to skip the IPI that irq_work would
> additionally bring and which we do not need, being an IRQ, never an NMI.
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Tested-by: Florian Bezdeka <florian.bezdeka@siemens.com>
This patch survived a 24h stress test with CONFIG_PREEMPT_RT enabled and
heavy load applied to the system.
There was no lockup happening without this patch. The lockdep warning is
gone now.
Best regards,
Florian
--
Siemens AG, Foundational Technologies
Linux Expert Center
^ permalink raw reply
* Re: [PATCH] scsi: storvsc: Fix scheduling while atomic on PREEMPT_RT
From: Bezdeka, Florian @ 2026-02-05 14:09 UTC (permalink / raw)
To: kys@microsoft.com, decui@microsoft.com, longli@microsoft.com,
linux-hyperv@vger.kernel.org, wei.liu@kernel.org, Kiszka, Jan,
martin.petersen@oracle.com, James.Bottomley@HansenPartnership.com,
haiyangz@microsoft.com
Cc: linux-scsi@vger.kernel.org, linux-rt-users@vger.kernel.org,
linux-kernel@vger.kernel.org, levymitchell0@gmail.com
In-Reply-To: <0c7fb5cd-fb21-4760-8593-e04bade84744@siemens.com>
On Thu, 2026-01-29 at 15:30 +0100, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> This resolves the following splat and lock-up when running with PREEMPT_RT
> enabled on Hyper-V:
>
> [ 415.140818] BUG: scheduling while atomic: stress-ng-iomix/1048/0x00000002
> [ 415.140822] INFO: lockdep is turned off.
> [ 415.140823] Modules linked in: intel_rapl_msr intel_rapl_common intel_uncore_frequency_common intel_pmc_core pmt_telemetry pmt_discovery pmt_class intel_pmc_ssram_telemetry intel_vsec ghash_clmulni_intel aesni_intel rapl binfmt_misc nls_ascii nls_cp437 vfat fat snd_pcm hyperv_drm snd_timer drm_client_lib drm_shmem_helper snd sg soundcore drm_kms_helper pcspkr hv_balloon hv_utils evdev joydev drm configfs efi_pstore nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vsock vmw_vmci efivarfs autofs4 ext4 crc16 mbcache jbd2 sr_mod sd_mod cdrom hv_storvsc serio_raw hid_generic scsi_transport_fc hid_hyperv scsi_mod hid hv_netvsc hyperv_keyboard scsi_common
> [ 415.140846] Preemption disabled at:
> [ 415.140847] [<ffffffffc0656171>] storvsc_queuecommand+0x2e1/0xbe0 [hv_storvsc]
> [ 415.140854] CPU: 8 UID: 0 PID: 1048 Comm: stress-ng-iomix Not tainted 6.19.0-rc7 #30 PREEMPT_{RT,(full)}
> [ 415.140856] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 09/04/2024
> [ 415.140857] Call Trace:
> [ 415.140861] <TASK>
> [ 415.140861] ? storvsc_queuecommand+0x2e1/0xbe0 [hv_storvsc]
> [ 415.140863] dump_stack_lvl+0x91/0xb0
> [ 415.140870] __schedule_bug+0x9c/0xc0
> [ 415.140875] __schedule+0xdf6/0x1300
> [ 415.140877] ? rtlock_slowlock_locked+0x56c/0x1980
> [ 415.140879] ? rcu_is_watching+0x12/0x60
> [ 415.140883] schedule_rtlock+0x21/0x40
> [ 415.140885] rtlock_slowlock_locked+0x502/0x1980
> [ 415.140891] rt_spin_lock+0x89/0x1e0
> [ 415.140893] hv_ringbuffer_write+0x87/0x2a0
> [ 415.140899] vmbus_sendpacket_mpb_desc+0xb6/0xe0
> [ 415.140900] ? rcu_is_watching+0x12/0x60
> [ 415.140902] storvsc_queuecommand+0x669/0xbe0 [hv_storvsc]
> [ 415.140904] ? HARDIRQ_verbose+0x10/0x10
> [ 415.140908] ? __rq_qos_issue+0x28/0x40
> [ 415.140911] scsi_queue_rq+0x760/0xd80 [scsi_mod]
> [ 415.140926] __blk_mq_issue_directly+0x4a/0xc0
> [ 415.140928] blk_mq_issue_direct+0x87/0x2b0
> [ 415.140931] blk_mq_dispatch_queue_requests+0x120/0x440
> [ 415.140933] blk_mq_flush_plug_list+0x7a/0x1a0
> [ 415.140935] __blk_flush_plug+0xf4/0x150
> [ 415.140940] __submit_bio+0x2b2/0x5c0
> [ 415.140944] ? submit_bio_noacct_nocheck+0x272/0x360
> [ 415.140946] submit_bio_noacct_nocheck+0x272/0x360
> [ 415.140951] ext4_read_bh_lock+0x3e/0x60 [ext4]
> [ 415.140995] ext4_block_write_begin+0x396/0x650 [ext4]
> [ 415.141018] ? __pfx_ext4_da_get_block_prep+0x10/0x10 [ext4]
> [ 415.141038] ext4_da_write_begin+0x1c4/0x350 [ext4]
> [ 415.141060] generic_perform_write+0x14e/0x2c0
> [ 415.141065] ext4_buffered_write_iter+0x6b/0x120 [ext4]
> [ 415.141083] vfs_write+0x2ca/0x570
> [ 415.141087] ksys_write+0x76/0xf0
> [ 415.141089] do_syscall_64+0x99/0x1490
> [ 415.141093] ? rcu_is_watching+0x12/0x60
> [ 415.141095] ? finish_task_switch.isra.0+0xdf/0x3d0
> [ 415.141097] ? rcu_is_watching+0x12/0x60
> [ 415.141098] ? lock_release+0x1f0/0x2a0
> [ 415.141100] ? rcu_is_watching+0x12/0x60
> [ 415.141101] ? finish_task_switch.isra.0+0xe4/0x3d0
> [ 415.141103] ? rcu_is_watching+0x12/0x60
> [ 415.141104] ? __schedule+0xb34/0x1300
> [ 415.141106] ? hrtimer_try_to_cancel+0x1d/0x170
> [ 415.141109] ? do_nanosleep+0x8b/0x160
> [ 415.141111] ? hrtimer_nanosleep+0x89/0x100
> [ 415.141114] ? __pfx_hrtimer_wakeup+0x10/0x10
> [ 415.141116] ? xfd_validate_state+0x26/0x90
> [ 415.141118] ? rcu_is_watching+0x12/0x60
> [ 415.141120] ? do_syscall_64+0x1e0/0x1490
> [ 415.141121] ? do_syscall_64+0x1e0/0x1490
> [ 415.141123] ? rcu_is_watching+0x12/0x60
> [ 415.141124] ? do_syscall_64+0x1e0/0x1490
> [ 415.141125] ? do_syscall_64+0x1e0/0x1490
> [ 415.141127] ? irqentry_exit+0x140/0x7e0
> [ 415.141129] entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> get_cpu() disables preemption while the spinlock hv_ringbuffer_write is
> using is converted to an rt-mutex under PREEMPT_RT.
Tested-by: Florian Bezdeka <florian.bezdeka@siemens.com>
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
This patch survived a 24h stress test with CONFIG_PREEMPT_RT enabled and
heavy load applied to the system.
Without this patch - and very same system configuration - the system
will lock up within 2 minutes.
Best regards,
Florian
--
Siemens AG, Foundational Technologies
Linux Expert Center
^ permalink raw reply
* [PATCH] mshv: fix SRCU protection in irqfd resampler ack handler
From: lirongqing @ 2026-02-05 9:40 UTC (permalink / raw)
To: K . Y . Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
linux-hyperv, linux-kernel
Cc: Li RongQing
From: Li RongQing <lirongqing@baidu.com>
Replace hlist_for_each_entry_rcu() with hlist_for_each_entry_srcu()
in mshv_irqfd_resampler_ack() to correctly handle SRCU-protected
linked list traversal.
The function uses SRCU (sleepable RCU) synchronization via
partition->pt_irq_srcu, but was incorrectly using the RCU variant
for list iteration. This could lead to race conditions when the
list is modified concurrently.
Also add srcu_read_lock_held() assertion as required by
hlist_for_each_entry_srcu() to ensure we're in the proper
read-side critical section.
Signed-off-by: Li RongQing <lirongqing@baidu.com>
---
drivers/hv/mshv_eventfd.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 0b75ff1..6d176ed 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -87,8 +87,9 @@ static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
idx = srcu_read_lock(&partition->pt_irq_srcu);
- hlist_for_each_entry_rcu(irqfd, &resampler->rsmplr_irqfd_list,
- irqfd_resampler_hnode) {
+ hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list,
+ irqfd_resampler_hnode,
+ srcu_read_lock_held(&partition->pt_irq_srcu)) {
if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
hv_call_clear_virtual_interrupt(partition->pt_id);
--
2.9.4
^ permalink raw reply related
* Re: [EXTERNAL] [PATCH] scsi: storvsc: Fix scheduling while atomic on PREEMPT_RT
From: Jan Kiszka @ 2026-02-05 6:37 UTC (permalink / raw)
To: Michael Kelley, Long Li, KY Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, James E.J. Bottomley, Martin K. Petersen,
linux-hyperv@vger.kernel.org
Cc: linux-scsi@vger.kernel.org, Linux Kernel Mailing List,
Florian Bezdeka, RT, Mitchell Levy
In-Reply-To: <SN6PR02MB41572C9E3650A6E581AA32C2D499A@SN6PR02MB4157.namprd02.prod.outlook.com>
On 05.02.26 06:42, Michael Kelley wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com> Sent: Monday, February 2, 2026 9:58 PM
>>
>> On 03.02.26 00:47, Long Li wrote:
>>>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>>>
>>>> This resolves the following splat and lock-up when running with PREEMPT_RT
>>>> enabled on Hyper-V:
>>>
>>> Hi Jan,
>>>
>>> It's interesting to know the use-case of running a RT kernel over Hyper-V.
>>>
>>> Can you give an example?
>>>
>>
>> - functional testing of an RT base image over Hyper-V
>> - re-use of a common RT base image, without exploiting RT properties
>>
>>> As far as I know, Hyper-V makes no RT guarantees of scheduling VPs for a VM.
>>
>> This is well understood and not our goal. We only need the kernel to run
>> correctly over Hyper-V with PREEMPT-RT enabled, and that is not the case
>> right now.
>>
>> Thanks,
>> Jan
>>
>> PS: Who had the idea to drop a virtual UART from Gen 2 VMs? Early boot
>> guest debugging is true fun now...
>>
>
> Hmmm. I often do printk()-based debugging via a virtual UART in a Gen 2
> VM. The Linux serial console outputs to that virtual UART and I see the
> printk() output in PuTTY on the Windows host. What specifically are you
> trying to do? I'm trying to remember if there's any unique setup required
> on a Gen 2 VM vs. a Gen 1 VM, and nothing immediately comes to mind.
> Though maybe it's just so baked into my process that I don't remember it!
>
Indeed:
Powershell> Set-VMComPort -VMName "Debian 13" 1 \\.\pipe\comport
<Start VM>
Powershell> putty -serial \\.\pipe\comport
Well hidden...
Jan
--
Siemens AG, Foundational Technologies
Linux Expert Center
^ permalink raw reply
* Re: [PATCH 1/3] x86/x2apic: disable x2apic on resume if the kernel expects so
From: Shashank Balaji @ 2026-02-05 6:07 UTC (permalink / raw)
To: Sohil Mehta
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
H. Peter Anvin, Suresh Siddha, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
Broadcom internal kernel review list, Jan Kiszka, Paolo Bonzini,
Vitaly Kuznetsov, Juergen Gross, Boris Ostrovsky, Ingo Molnar,
linux-kernel, linux-hyperv, virtualization, jailhouse-dev, kvm,
xen-devel, Rahul Bukte, Daniel Palmer, Tim Bird, stable
In-Reply-To: <722b53a7-7560-4a1b-ab26-73eeed3dffa5@intel.com>
On Wed, Feb 04, 2026 at 10:53:28AM -0800, Sohil Mehta wrote:
> On 2/4/2026 1:17 AM, Shashank Balaji wrote:
>
> > __x2apic_disable disables x2apic only if boot_cpu_has(X86_FEATURE_APIC)
> > and x2apic is already enabled.
>
> I meant the X86_FEATURE_X2APIC and not X86_FEATURE_APIC.
My bad, I got that wrong. __x2apic_disable checks for X86_FEATURE_APIC,
while x2apic_enabled checks for X86_FEATURE_X2APIC.
> But, thinking about it more, checking that the CPU is really in X2APIC mode
> by reading the MSR is good enough.
But yes, I agree.
> > x2apic_enabled also does the same checks,
> > the only difference being, it uses rdmsrq_safe instead of just rdmsrq,
> > which is what __x2apic_disable uses. The safe version is because of
> > Boris' suggestion [1]. If that's applicable here as well, then rdmsrq in
> > __x2apic_disable should be changed to rdmsrq_safe.
>
> I don't know if there is a strong justification for changing to
> rdmsrq_safe() over here. Also, that would be beyond the scope of this
> patch. In general, it's better to avoid such changes unless an actual
> issue pops up.
Makes sense.
> >> I considered if an error message should be printed along with this. But,
> >> I am not sure if it can really be called a firmware issue. It's probably
> >> just that newer CPUs might have started defaulting to x2apic on.
> >>
> >> Can you specify what platform you are encountering this?
> >
> >
> > I'm not sure it's the CPU defaulting to x2apic on. As per Section
> > 12.12.5.1 of the Intel SDM:
> >
> > On coming out of reset, the local APIC unit is enabled and is in
> > the xAPIC mode: IA32_APIC_BASE[EN]=1 and IA32_APIC_BASE[EXTD]=0.
> >
> > So, the CPU should be turning on in xapic mode. In fact, when x2apic is
> > disabled in the firmware, this problem doesn't happen.
> >
>
> It's a bit odd then that the firmware chooses to enable x2apic without
> the OS requesting it.
Well, the firmware has a setting saying "Enable x2apic", which was
enabled. So it did what the setting says.
> Linux maintains a concept of X2APIC_ON_LOCKED in x2apic_state which is
> based on the hardware preference to keep the apic in X2APIC mode.
>
> When you have x2apic enabled in firmware, but the system is in XAPIC
> mode, can you read the values in MSR_IA32_ARCH_CAPABILITIES and
> MSR_IA32_XAPIC_DISABLE_STATUS?
>
> XAPIC shouldn't be disabled because you are running in that mode. But,
> it would be good to confirm.
With x2apic enabled by the firmware, and after kernel switches to xapic
(because no interrupt remapping support), bit 21 (XAPIC_DISABLE_STATUS)
of MSR_IA32_ARCH_CAPABILITIES is 0, and MSR_IA32_XAPIC_DISABLE_STATUS
MSR is not available.
> > Either way, a pr_warn may be helpful. How about "x2apic re-enabled by the
> > firmware during resume. Disabling\n"?
>
> I mainly want to make sure the firmware is really at fault before we add
> such a print. But it seems likely now that the firmware messed up.
^ permalink raw reply
* RE: [EXTERNAL] [PATCH] scsi: storvsc: Fix scheduling while atomic on PREEMPT_RT
From: Michael Kelley @ 2026-02-05 5:42 UTC (permalink / raw)
To: Jan Kiszka, Long Li, KY Srinivasan, Haiyang Zhang, Wei Liu,
Dexuan Cui, James E.J. Bottomley, Martin K. Petersen,
linux-hyperv@vger.kernel.org
Cc: linux-scsi@vger.kernel.org, Linux Kernel Mailing List,
Florian Bezdeka, RT, Mitchell Levy
In-Reply-To: <6b4933df-6af2-449c-922b-30ef8fd4c8b8@siemens.com>
From: Jan Kiszka <jan.kiszka@siemens.com> Sent: Monday, February 2, 2026 9:58 PM
>
> On 03.02.26 00:47, Long Li wrote:
> >> From: Jan Kiszka <jan.kiszka@siemens.com>
> >>
> >> This resolves the following splat and lock-up when running with PREEMPT_RT
> >> enabled on Hyper-V:
> >
> > Hi Jan,
> >
> > It's interesting to know the use-case of running a RT kernel over Hyper-V.
> >
> > Can you give an example?
> >
>
> - functional testing of an RT base image over Hyper-V
> - re-use of a common RT base image, without exploiting RT properties
>
> > As far as I know, Hyper-V makes no RT guarantees of scheduling VPs for a VM.
>
> This is well understood and not our goal. We only need the kernel to run
> correctly over Hyper-V with PREEMPT-RT enabled, and that is not the case
> right now.
>
> Thanks,
> Jan
>
> PS: Who had the idea to drop a virtual UART from Gen 2 VMs? Early boot
> guest debugging is true fun now...
>
Hmmm. I often do printk()-based debugging via a virtual UART in a Gen 2
VM. The Linux serial console outputs to that virtual UART and I see the
printk() output in PuTTY on the Windows host. What specifically are you
trying to do? I'm trying to remember if there's any unique setup required
on a Gen 2 VM vs. a Gen 1 VM, and nothing immediately comes to mind.
Though maybe it's just so baked into my process that I don't remember it!
Michael
^ permalink raw reply
* Re: [PATCH] mshv: Make MSHV mutually exclusive with KEXEC
From: Anirudh Rayabharam @ 2026-02-05 4:59 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aYOQ5-yHp_FrsTBF@skinsburskii.localdomain>
On Wed, Feb 04, 2026 at 10:33:11AM -0800, Stanislav Kinsburskii wrote:
> On Wed, Feb 04, 2026 at 05:33:29AM +0000, Anirudh Rayabharam wrote:
> > On Tue, Feb 03, 2026 at 11:42:58AM -0800, Stanislav Kinsburskii wrote:
> > > On Tue, Feb 03, 2026 at 04:46:03PM +0000, Anirudh Rayabharam wrote:
> > > > On Tue, Feb 03, 2026 at 07:40:36AM -0800, Stanislav Kinsburskii wrote:
> > > > > On Tue, Feb 03, 2026 at 10:34:28AM +0530, Anirudh Rayabharam wrote:
> > > > > > On Mon, Feb 02, 2026 at 11:18:27AM -0800, Stanislav Kinsburskii wrote:
> > > > > > > On Mon, Feb 02, 2026 at 07:01:01PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > On Mon, Feb 02, 2026 at 09:10:00AM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > On Fri, Jan 30, 2026 at 08:32:45PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > > On Fri, Jan 30, 2026 at 10:46:45AM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > > On Fri, Jan 30, 2026 at 05:11:12PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > > > > On Wed, Jan 28, 2026 at 03:11:14PM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > > On Wed, Jan 28, 2026 at 04:16:31PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > > > > > > On Mon, Jan 26, 2026 at 12:46:44PM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > > > > On Tue, Jan 27, 2026 at 12:19:24AM +0530, Anirudh Rayabharam wrote:
> > > > > > > > > > > > > > > > On Fri, Jan 23, 2026 at 10:20:53PM +0000, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > > > > > > The MSHV driver deposits kernel-allocated pages to the hypervisor during
> > > > > > > > > > > > > > > > > runtime and never withdraws them. This creates a fundamental incompatibility
> > > > > > > > > > > > > > > > > with KEXEC, as these deposited pages remain unavailable to the new kernel
> > > > > > > > > > > > > > > > > loaded via KEXEC, leading to potential system crashes upon kernel accessing
> > > > > > > > > > > > > > > > > hypervisor deposited pages.
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > Make MSHV mutually exclusive with KEXEC until proper page lifecycle
> > > > > > > > > > > > > > > > > management is implemented.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Someone might want to stop all guest VMs and do a kexec. Which is valid
> > > > > > > > > > > > > > > > and would work without any issue for L1VH.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > No, it won't work and hypervsisor depostied pages won't be withdrawn.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > All pages that were deposited in the context of a guest partition (i.e.
> > > > > > > > > > > > > > with the guest partition ID), would be withdrawn when you kill the VMs,
> > > > > > > > > > > > > > right? What other deposited pages would be left?
> > > > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > The driver deposits two types of pages: one for the guests (withdrawn
> > > > > > > > > > > > > upon gust shutdown) and the other - for the host itself (never
> > > > > > > > > > > > > withdrawn).
> > > > > > > > > > > > > See hv_call_create_partition, for example: it deposits pages for the
> > > > > > > > > > > > > host partition.
> > > > > > > > > > > >
> > > > > > > > > > > > Hmm.. I see. Is it not possible to reclaim this memory in module_exit?
> > > > > > > > > > > > Also, can't we forcefully kill all running partitions in module_exit and
> > > > > > > > > > > > then reclaim memory? Would this help with kernel consistency
> > > > > > > > > > > > irrespective of userspace behavior?
> > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > It would, but this is sloppy and cannot be a long-term solution.
> > > > > > > > > > >
> > > > > > > > > > > It is also not reliable. We have no hook to prevent kexec. So if we fail
> > > > > > > > > > > to kill the guest or reclaim the memory for any reason, the new kernel
> > > > > > > > > > > may still crash.
> > > > > > > > > >
> > > > > > > > > > Actually guests won't be running by the time we reach our module_exit
> > > > > > > > > > function during a kexec. Userspace processes would've been killed by
> > > > > > > > > > then.
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > No, they will not: "kexec -e" doesn't kill user processes.
> > > > > > > > > We must not rely on OS to do graceful shutdown before doing
> > > > > > > > > kexec.
> > > > > > > >
> > > > > > > > I see kexec -e is too brutal. Something like systemctl kexec is
> > > > > > > > more graceful and is probably used more commonly. In this case at least
> > > > > > > > we could register a reboot notifier and attempt to clean things up.
> > > > > > > >
> > > > > > > > I think it is better to support kexec to this extent rather than
> > > > > > > > disabling it entirely.
> > > > > > > >
> > > > > > >
> > > > > > > You do understand that once our kernel is released to third parties, we
> > > > > > > can’t control how they will use kexec, right?
> > > > > >
> > > > > > Yes, we can't. But that's okay. It is fine for us to say that only some
> > > > > > kexec scenarios are supported and some aren't (iff you're creating VMs
> > > > > > using MSHV; if you're not creating VMs all of kexec is supported).
> > > > > >
> > > > >
> > > > > Well, I disagree here. If we say the kernel supports MSHV, we must
> > > > > provide a robust solution. A partially working solution is not
> > > > > acceptable. It makes us look careless and can damage our reputation as a
> > > > > team (and as a company).
> > > >
> > > > It won't if we call out upfront what is supported and what is not.
> > > >
> > > > >
> > > > > > >
> > > > > > > This is a valid and existing option. We have to account for it. Yet
> > > > > > > again, L1VH will be used by arbitrary third parties out there, not just
> > > > > > > by us.
> > > > > > >
> > > > > > > We can’t say the kernel supports MSHV until we close these gaps. We must
> > > > > >
> > > > > > We can. It is okay say some scenarios are supported and some aren't.
> > > > > >
> > > > > > All kexecs are supported if they never create VMs using MSHV. If they do
> > > > > > create VMs using MSHV and we implement cleanup in a reboot notifier at
> > > > > > least systemctl kexec and crashdump kexec would which are probably the
> > > > > > most common uses of kexec. It's okay to say that this is all we support
> > > > > > as of now.
> > > > > >
> > > > >
> > > > > I'm repeating myself, but I'll try to put it differently.
> > > > > There won't be any kernel core collected if a page was deposited. You're
> > > > > arguing for a lost cause here. Once a page is allocated and deposited,
> > > > > the crash kernel will try to write it into the core.
> > > >
> > > > That's why we have to implement something where we attempt to destroy
> > > > partitions and reclaim memory (and BUG() out if that fails; which
> > > > hopefully should happen very rarely if at all). This should be *the*
> > > > solution we work towards. We don't need a temporary disable kexec
> > > > solution.
> > > >
> > >
> > > No, the solution is to preserve the shared state and pass it over via KHO.
> >
> > Okay, then work towards it without doing temporary KEXEC disable. We can
> > call out that kexec is not supported until then. Disabling KEXEC is too
> > intrusive.
> >
>
> What do you mean by "too intrusive"? The change is local to driver's
> Kconfig. There are no verbal "callouts" in upstream Linux - that's
> exactly what Kconfig is used for. Once the proper solution is
> implemented, we can remove the restriction.
>
> > Is there any precedent for this? Do you know if any driver ever disabled
> > KEXEC this way?
> >
>
> No, but there is no other similar driver like this one.
Doesn't have to be like this one. There could be issues with device
states during kexec state.
> Why does it matter though?
To learn from past precedents.
>
> > >
> > > > >
> > > > > > Also, what makes you think customers would even be interested in enabling
> > > > > > our module in their kernel configs if it takes away kexec?
> > > > > >
> > > > >
> > > > > It's simple: L1VH isn't a host, so I can spin up new VMs instead of
> > > > > servicing the existing ones.
> > > >
> > > > And what about the L2 VM state then? They might not be throwaway in all
> > > > cases.
> > > >
> > >
> > > L2 guest can (and likely will) be migrated from the old L1VH to the new
> > > one.
> > > And this is most likely the current scenario customers are using.
> > >
> > > > >
> > > > > Why do you think there won’t be customers interested in using MSHV in
> > > > > L1VH without kexec support?
> > > >
> > > > Because they could already be using kexec for their servicing needs or
> > > > whatever. And no we can't just say "don't service these VMs just spin up
> > > > new ones".
> > > >
> > >
> > > Are you speculating or know for sure?
> >
> > It's a reasonable assumption that people are using kexec for servicing.
> >
>
> Again, using kexec for servicing is not supported: why pretend it is?
What this patch effectively asserts is that kexec is unsupported whenever the
MSHV driver is enabled. But that is not accurate. Enabling MSHV does not
necessarily imply that it is being used. The correct statement is that kexec is
unsupported only when MSHV is *in use*, i.e. when one or more VMs are
running.
By disabling kexec unconditionally, the patch prevents a valid workflow in
situations where no VMs exist and kexec would work without issue. This imposes a
blanket restriction instead of enforcing the actual requirement.
And sure, I understand there is no way to enforce that actual
requirement. So this is what I propose:
The statement "kexec is not supported when the MSHV driver is used" can be
documented on docs.microsoft.com once direct virtualization becomes broadly
available. The documentation can also provide operational guidance, such as
shutting down all VMs before invoking kexec for servicing. This preserves a
practical path for users who rely on kexec. If kexec is disabled entirely, that
flexibility is lost.
The stricter approach ensures users cannot accidentally make a mistake, which
has its merits. However, my approach gives more power and discretion to
the user. In parallel, we of course continue to work on making it
robust.
>
> > >
> > > > Also, keep in mind that once L1VH is available in Azure, the distros
> > > > that run on it would be the same distros that run on all other Azure
> > > > VMs. There won't be special distros with a kernel specifically built for
> > > > L1VH. And KEXEC is generally enabled in distros. Distro vendors won't be
> > > > happy that they would need to publish a separate version of their image with
> > > > MSHV_ROOT enabled and KEXEC disabled because they wouldn't want KEXEC to
> > > > be disabled for all Azure VMs. Also, the customers will be confused why
> > > > the same distro doesn't work on L1VH.
> > > >
> > >
> > > I don't think distro happiness is our concern. They already build custom
> >
> > If distros are not happy they won't package this and consequently
> > nobody will use it.
> >
>
> Could you provide an example of such issues in the past?
>
> > > versions for Azure. They can build another custom version for L1VH if
> > > needed.
> >
> > We should at least check if they are ready to do this.
> >
>
> This is a labor-intensive and long-term check. Unless there is solid
> evidence that they won't do it, I don't see the point in doing this.
It is reasonable to assume that maintaining an additional flavor of a
distro is an overhead (maintain new package(s), maintain Azure
marketplace images etc etc). This should be enough reason to check. Not
everything needs solid evidence. Oftentimes a reasonable suspicion
will do.
Thanks,
Anirudh.
>
> Thanks,
> Stanislav
>
> > Thanks,
> > Anirudh.
> >
> > >
> > > Anyway, I don't see the point in continuing this discussion. All points
> > > have been made, and solutions have been proposed.
> > >
> > > If you can come up with something better in the next few days, so we at
> > > least have a chance to get it merged in the next merge window, great. If
> > > not, we should explicitly forbid the unsupported feature and move on.
> > >
> > > Thanks,
> > > Thanks,
> > > Stanislav
> > >
> > > > Thanks,
> > > > Anirudh.
^ permalink raw reply
* Re: [PATCH v0 15/15] mshv: Populate mmio mappings for PCI passthru
From: Mukesh R @ 2026-02-04 22:52 UTC (permalink / raw)
To: Stanislav Kinsburskii
Cc: linux-kernel, linux-hyperv, linux-arm-kernel, iommu, linux-pci,
linux-arch, kys, haiyangz, wei.liu, decui, longli,
catalin.marinas, will, tglx, mingo, bp, dave.hansen, hpa, joro,
lpieralisi, kwilczynski, mani, robh, bhelgaas, arnd, nunodasneves,
mhklinux
In-Reply-To: <aYDROXpR5kvlylGG@skinsburskii.localdomain>
On 2/2/26 08:30, Stanislav Kinsburskii wrote:
> On Fri, Jan 30, 2026 at 02:17:24PM -0800, Mukesh R wrote:
>> On 1/27/26 10:57, Stanislav Kinsburskii wrote:
>>> On Mon, Jan 26, 2026 at 07:07:22PM -0800, Mukesh R wrote:
>>>> On 1/26/26 10:15, Stanislav Kinsburskii wrote:
>>>>> On Fri, Jan 23, 2026 at 06:19:15PM -0800, Mukesh R wrote:
>>>>>> On 1/20/26 17:53, Stanislav Kinsburskii wrote:
>>>>>>> On Mon, Jan 19, 2026 at 10:42:30PM -0800, Mukesh R wrote:
>>>>>>>> From: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>>>
>>>>>>>> Upon guest access, in case of missing mmio mapping, the hypervisor
>>>>>>>> generates an unmapped gpa intercept. In this path, lookup the PCI
>>>>>>>> resource pfn for the guest gpa, and ask the hypervisor to map it
>>>>>>>> via hypercall. The PCI resource pfn is maintained by the VFIO driver,
>>>>>>>> and obtained via fixup_user_fault call (similar to KVM).
>>>>>>>>
>>>>>>>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>>>>>>>> ---
>>>>>>>> drivers/hv/mshv_root_main.c | 115 ++++++++++++++++++++++++++++++++++++
>>>>>>>> 1 file changed, 115 insertions(+)
>>>>>>>>
>>>>>>>> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
>>>>>>>> index 03f3aa9f5541..4c8bc7cd0888 100644
>>>>>>>> --- a/drivers/hv/mshv_root_main.c
>>>>>>>> +++ b/drivers/hv/mshv_root_main.c
>>>>>>>> @@ -56,6 +56,14 @@ struct hv_stats_page {
>>>>>>>> };
>>>>>>>> } __packed;
>>>>>>>> +bool hv_nofull_mmio; /* don't map entire mmio region upon fault */
>>>>>>>> +static int __init setup_hv_full_mmio(char *str)
>>>>>>>> +{
>>>>>>>> + hv_nofull_mmio = true;
>>>>>>>> + return 0;
>>>>>>>> +}
>>>>>>>> +__setup("hv_nofull_mmio", setup_hv_full_mmio);
>>>>>>>> +
>>>>>>>> struct mshv_root mshv_root;
>>>>>>>> enum hv_scheduler_type hv_scheduler_type;
>>>>>>>> @@ -612,6 +620,109 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
>>>>>>>> }
>>>>>>>> #ifdef CONFIG_X86_64
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * Check if uaddr is for mmio range. If yes, return 0 with mmio_pfn filled in
>>>>>>>> + * else just return -errno.
>>>>>>>> + */
>>>>>>>> +static int mshv_chk_get_mmio_start_pfn(struct mshv_partition *pt, u64 gfn,
>>>>>>>> + u64 *mmio_pfnp)
>>>>>>>> +{
>>>>>>>> + struct vm_area_struct *vma;
>>>>>>>> + bool is_mmio;
>>>>>>>> + u64 uaddr;
>>>>>>>> + struct mshv_mem_region *mreg;
>>>>>>>> + struct follow_pfnmap_args pfnmap_args;
>>>>>>>> + int rc = -EINVAL;
>>>>>>>> +
>>>>>>>> + /*
>>>>>>>> + * Do not allow mem region to be deleted beneath us. VFIO uses
>>>>>>>> + * useraddr vma to lookup pci bar pfn.
>>>>>>>> + */
>>>>>>>> + spin_lock(&pt->pt_mem_regions_lock);
>>>>>>>> +
>>>>>>>> + /* Get the region again under the lock */
>>>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>>>> + goto unlock_pt_out;
>>>>>>>> +
>>>>>>>> + uaddr = mreg->start_uaddr +
>>>>>>>> + ((gfn - mreg->start_gfn) << HV_HYP_PAGE_SHIFT);
>>>>>>>> +
>>>>>>>> + mmap_read_lock(current->mm);
>>>>>>>
>>>>>>> Semaphore can't be taken under spinlock.
>>>>>
>>>>>>
>>>>>> Yeah, something didn't feel right here and I meant to recheck, now regret
>>>>>> rushing to submit the patch.
>>>>>>
>>>>>> Rethinking, I think the pt_mem_regions_lock is not needed to protect
>>>>>> the uaddr because unmap will properly serialize via the mm lock.
>>>>>>
>>>>>>
>>>>>>>> + vma = vma_lookup(current->mm, uaddr);
>>>>>>>> + is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
>>>>>>>
>>>>>>> Why this check is needed again?
>>>>>>
>>>>>> To make sure region did not change. This check is under lock.
>>>>>>
>>>>>
>>>>> How can this happen? One can't change VMA type without unmapping it
>>>>> first. And unmapping it leads to a kernel MMIO region state dangling
>>>>> around without corresponding user space mapping.
>>>>
>>>> Right, and vm_flags would not be mmio expected then.
>>>>
>>>>> This is similar to dangling pinned regions and should likely be
>>>>> addressed the same way by utilizing MMU notifiers to destroy memory
>>>>> regions if the VMA is detached.
>>>>
>>>> I don't think we need that. Either it succeeds if the region did not
>>>> change at all, or just fails.
>>>>
>>>
>>> I'm afraid we do, as if the driver mapped a page with the previous
>>> memory region, and then the region is unmapped, the page will stay
>>> mapped in the hypervisor, but will be considered free by kernel, which
>>> in turn will lead to GPF upon next allocation.
>>
>> There are no ram pages for mmio regions. Also, we don't do much with
>> mmio regions other than tell the hyp about it.
>>
>
> So, are you saying that the hypervisor does not use these pages and only
> tracks them? That would make things easier.
> However, if we later try to map a GPA that is already mapped, will the
> hypervisor return an error?
Hypervisor does not return an error.
> Thanks,
> Stanislav
>
>> Thanks,
>> -Mukesh
>>
>>
>>> With pinned regions the issue is similar but less impactful: pages can't
>>> be released by user space unmapping and thus will simply be leaked, but
>>> the system stays intact.
>>>
>>> MMIO regions are similar to movable regions in this regard: they don't
>>> reference the user pages, and thus this guest region replacement is a
>>> straight way to a kernel panic.
>>>
>>>>
>>>>>>> The region type is stored on the region itself.
>>>>>>> And the type is checked on the caller side.
>>>>>>>
>>>>>>>> + if (!is_mmio)
>>>>>>>> + goto unlock_mmap_out;
>>>>>>>> +
>>>>>>>> + pfnmap_args.vma = vma;
>>>>>>>> + pfnmap_args.address = uaddr;
>>>>>>>> +
>>>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>>>> + if (rc) {
>>>>>>>> + rc = fixup_user_fault(current->mm, uaddr, FAULT_FLAG_WRITE,
>>>>>>>> + NULL);
>>>>>>>> + if (rc)
>>>>>>>> + goto unlock_mmap_out;
>>>>>>>> +
>>>>>>>> + rc = follow_pfnmap_start(&pfnmap_args);
>>>>>>>> + if (rc)
>>>>>>>> + goto unlock_mmap_out;
>>>>>>>> + }
>>>>>>>> +
>>>>>>>> + *mmio_pfnp = pfnmap_args.pfn;
>>>>>>>> + follow_pfnmap_end(&pfnmap_args);
>>>>>>>> +d
>>>>>>>> +unlock_mmap_out:
>>>>>>>> + mmap_read_unlock(current->mm);
>>>>>>>> +unlock_pt_out:
>>>>>>>> + spin_unlock(&pt->pt_mem_regions_lock);
>>>>>>>> + return rc;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +/*
>>>>>>>> + * At present, the only unmapped gpa is mmio space. Verify if it's mmio
>>>>>>>> + * and resolve if possible.
>>>>>>>> + * Returns: True if valid mmio intercept and it was handled, else false
>>>>>>>> + */
>>>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp)
>>>>>>>> +{
>>>>>>>> + struct hv_message *hvmsg = vp->vp_intercept_msg_page;
>>>>>>>> + struct hv_x64_memory_intercept_message *msg;
>>>>>>>> + union hv_x64_memory_access_info accinfo;
>>>>>>>> + u64 gfn, mmio_spa, numpgs;
>>>>>>>> + struct mshv_mem_region *mreg;
>>>>>>>> + int rc;
>>>>>>>> + struct mshv_partition *pt = vp->vp_partition;
>>>>>>>> +
>>>>>>>> + msg = (struct hv_x64_memory_intercept_message *)hvmsg->u.payload;
>>>>>>>> + accinfo = msg->memory_access_info;
>>>>>>>> +
>>>>>>>> + if (!accinfo.gva_gpa_valid)
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + /* Do a fast check and bail if non mmio intercept */
>>>>>>>> + gfn = msg->guest_physical_address >> HV_HYP_PAGE_SHIFT;
>>>>>>>> + mreg = mshv_partition_region_by_gfn(pt, gfn);
>>>>>>>
>>>>>>> This call needs to be protected by the spinlock.
>>>>>>
>>>>>> This is sorta fast path to bail. We recheck under partition lock above.
>>>>>>
>>>>>
>>>>> Accessing the list of regions without lock is unsafe.
>>>>
>>>> I am not sure why? This check is done by a vcpu thread, so regions
>>>> will not have just gone away.
>>>>
>>>
>>> This is a shared resource. Multiple VP threads get into this function
>>> simultaneously, so there is a race already. But this one we can live
>>> with without locking as they don't mutate the list of regions.
>>>
>>> The issue happens when the VMM adds or removes another region, as it mutates
>>> the list and races with VP threads doing this lookup.
>>>
>>> Thanks,
>>> Stanislav
>>>
>>>
>>>> Thanks,
>>>> -Mukesh
>>>>
>>>>
>>>>> Thanks,
>>>>> Stanislav
>>>>>
>>>>>> Thanks,
>>>>>> -Mukesh
>>>>>>
>>>>>>
>>>>>>> Thanks,
>>>>>>> Stanislav
>>>>>>>
>>>>>>>> + if (mreg == NULL || mreg->type != MSHV_REGION_TYPE_MMIO)
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + rc = mshv_chk_get_mmio_start_pfn(pt, gfn, &mmio_spa);
>>>>>>>> + if (rc)
>>>>>>>> + return false;
>>>>>>>> +
>>>>>>>> + if (!hv_nofull_mmio) { /* default case */
>>>>>>>> + gfn = mreg->start_gfn;
>>>>>>>> + mmio_spa = mmio_spa - (gfn - mreg->start_gfn);
>>>>>>>> + numpgs = mreg->nr_pages;
>>>>>>>> + } else
>>>>>>>> + numpgs = 1;
>>>>>>>> +
>>>>>>>> + rc = hv_call_map_mmio_pages(pt->pt_id, gfn, mmio_spa, numpgs);
>>>>>>>> +
>>>>>>>> + return rc == 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> static struct mshv_mem_region *
>>>>>>>> mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
>>>>>>>> {
>>>>>>>> @@ -666,13 +777,17 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
>>>>>>>> return ret;
>>>>>>>> }
>>>>>>>> +
>>>>>>>> #else /* CONFIG_X86_64 */
>>>>>>>> +static bool mshv_handle_unmapped_gpa(struct mshv_vp *vp) { return false; }
>>>>>>>> static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
>>>>>>>> #endif /* CONFIG_X86_64 */
>>>>>>>> static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
>>>>>>>> {
>>>>>>>> switch (vp->vp_intercept_msg_page->header.message_type) {
>>>>>>>> + case HVMSG_UNMAPPED_GPA:
>>>>>>>> + return mshv_handle_unmapped_gpa(vp);
>>>>>>>> case HVMSG_GPA_INTERCEPT:
>>>>>>>> return mshv_handle_gpa_intercept(vp);
>>>>>>>> }
>>>>>>>> --
>>>>>>>> 2.51.2.vfs.0.1
>>>>>>>>
^ permalink raw reply
* Re: [PATCH v2] mshv: make certain field names descriptive in a header struct
From: Mukesh R @ 2026-02-04 20:24 UTC (permalink / raw)
To: Wei Liu; +Cc: linux-hyperv
In-Reply-To: <20260204060620.GB79272@liuwe-devbox-debian-v2.local>
On 2/3/26 22:06, Wei Liu wrote:
> On Fri, Jan 16, 2026 at 02:49:04PM -0800, Mukesh Rathor wrote:
>> When struct fields use very common names like "pages" or "type", it makes
>> it difficult to find uses of these fields with tools like grep, cscope,
>> etc when the struct is in a header file included in many places. Add the
>> prefix mreg_ to some fields in struct mshv_mem_region to make it easier
>> to find them.
>>
>> There is no functional change.
>>
>> Signed-off-by: Mukesh Rathor <mrathor@linux.microsoft.com>
>
> I generally don't mind such changes, but this patch doesn't apply
> anymore. Please rebase to the latest hyperv-next.
>
> Wei
Done, please find V3. Thank you.
-Mukesh
^ permalink raw reply
* [PATCH v3] mshv: make field names descriptive in a header struct
From: Mukesh R @ 2026-02-04 20:23 UTC (permalink / raw)
To: linux-hyperv; +Cc: wei.liu
When struct fields use very common names like "pages" or "type", it makes
it difficult to find uses of these fields with tools like grep, cscope,
etc when the struct is in a header file included in many places. Add
prefix mreg_ to some fields in struct mshv_mem_region to make it easier
to find them.
There is no functional change.
Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
---
V3: rebase to afefdb2bc945 (origin/hyperv-next)
---
drivers/hv/mshv_regions.c | 60 ++++++++++++++++++-------------------
drivers/hv/mshv_root.h | 10 +++----
drivers/hv/mshv_root_main.c | 10 +++----
3 files changed, 40 insertions(+), 40 deletions(-)
diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index adba3564d9f1..c28aac0726de 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -88,7 +88,7 @@ static long mshv_region_process_chunk(struct mshv_mem_region *region,
struct page *page;
int stride, ret;
- page = region->pages[page_offset];
+ page = region->mreg_pages[page_offset];
if (!page)
return -EINVAL;
@@ -98,7 +98,7 @@ static long mshv_region_process_chunk(struct mshv_mem_region *region,
/* Start at stride since the first stride is validated */
for (count = stride; count < page_count; count += stride) {
- page = region->pages[page_offset + count];
+ page = region->mreg_pages[page_offset + count];
/* Break if current page is not present */
if (!page)
@@ -152,7 +152,7 @@ static int mshv_region_process_range(struct mshv_mem_region *region,
while (page_count) {
/* Skip non-present pages */
- if (!region->pages[page_offset]) {
+ if (!region->mreg_pages[page_offset]) {
page_offset++;
page_count--;
continue;
@@ -190,7 +190,7 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
- kref_init(&region->refcount);
+ kref_init(&region->mreg_refcount);
return region;
}
@@ -204,7 +204,7 @@ static int mshv_region_chunk_share(struct mshv_mem_region *region,
flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
return hv_call_modify_spa_host_access(region->partition->pt_id,
- region->pages + page_offset,
+ region->mreg_pages + page_offset,
page_count,
HV_MAP_GPA_READABLE |
HV_MAP_GPA_WRITABLE,
@@ -229,7 +229,7 @@ static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
return hv_call_modify_spa_host_access(region->partition->pt_id,
- region->pages + page_offset,
+ region->mreg_pages + page_offset,
page_count, 0,
flags, false);
}
@@ -254,7 +254,7 @@ static int mshv_region_chunk_remap(struct mshv_mem_region *region,
return hv_call_map_gpa_pages(region->partition->pt_id,
region->start_gfn + page_offset,
page_count, flags,
- region->pages + page_offset);
+ region->mreg_pages + page_offset);
}
static int mshv_region_remap_pages(struct mshv_mem_region *region,
@@ -277,10 +277,10 @@ int mshv_region_map(struct mshv_mem_region *region)
static void mshv_region_invalidate_pages(struct mshv_mem_region *region,
u64 page_offset, u64 page_count)
{
- if (region->type == MSHV_REGION_TYPE_MEM_PINNED)
- unpin_user_pages(region->pages + page_offset, page_count);
+ if (region->mreg_type == MSHV_REGION_TYPE_MEM_PINNED)
+ unpin_user_pages(region->mreg_pages + page_offset, page_count);
- memset(region->pages + page_offset, 0,
+ memset(region->mreg_pages + page_offset, 0,
page_count * sizeof(struct page *));
}
@@ -297,7 +297,7 @@ int mshv_region_pin(struct mshv_mem_region *region)
int ret;
for (done_count = 0; done_count < region->nr_pages; done_count += ret) {
- pages = region->pages + done_count;
+ pages = region->mreg_pages + done_count;
userspace_addr = region->start_uaddr +
done_count * HV_HYP_PAGE_SIZE;
nr_pages = min(region->nr_pages - done_count,
@@ -348,11 +348,11 @@ static int mshv_region_unmap(struct mshv_mem_region *region)
static void mshv_region_destroy(struct kref *ref)
{
struct mshv_mem_region *region =
- container_of(ref, struct mshv_mem_region, refcount);
+ container_of(ref, struct mshv_mem_region, mreg_refcount);
struct mshv_partition *partition = region->partition;
int ret;
- if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
mshv_region_movable_fini(region);
if (mshv_partition_encrypted(partition)) {
@@ -374,12 +374,12 @@ static void mshv_region_destroy(struct kref *ref)
void mshv_region_put(struct mshv_mem_region *region)
{
- kref_put(&region->refcount, mshv_region_destroy);
+ kref_put(&region->mreg_refcount, mshv_region_destroy);
}
int mshv_region_get(struct mshv_mem_region *region)
{
- return kref_get_unless_zero(&region->refcount);
+ return kref_get_unless_zero(&region->mreg_refcount);
}
/**
@@ -405,16 +405,16 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
int ret;
range->notifier_seq = mmu_interval_read_begin(range->notifier);
- mmap_read_lock(region->mni.mm);
+ mmap_read_lock(region->mreg_mni.mm);
ret = hmm_range_fault(range);
- mmap_read_unlock(region->mni.mm);
+ mmap_read_unlock(region->mreg_mni.mm);
if (ret)
return ret;
- mutex_lock(&region->mutex);
+ mutex_lock(&region->mreg_mutex);
if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
- mutex_unlock(&region->mutex);
+ mutex_unlock(&region->mreg_mutex);
cond_resched();
return -EBUSY;
}
@@ -438,7 +438,7 @@ static int mshv_region_range_fault(struct mshv_mem_region *region,
u64 page_offset, u64 page_count)
{
struct hmm_range range = {
- .notifier = &region->mni,
+ .notifier = &region->mreg_mni,
.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
};
unsigned long *pfns;
@@ -461,12 +461,12 @@ static int mshv_region_range_fault(struct mshv_mem_region *region,
goto out;
for (i = 0; i < page_count; i++)
- region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
+ region->mreg_pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
ret = mshv_region_remap_pages(region, region->hv_map_flags,
page_offset, page_count);
- mutex_unlock(&region->mutex);
+ mutex_unlock(&region->mreg_mutex);
out:
kfree(pfns);
return ret;
@@ -520,7 +520,7 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
{
struct mshv_mem_region *region = container_of(mni,
struct mshv_mem_region,
- mni);
+ mreg_mni);
u64 page_offset, page_count;
unsigned long mstart, mend;
int ret = -EPERM;
@@ -533,8 +533,8 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
page_count = HVPFN_DOWN(mend - mstart);
if (mmu_notifier_range_blockable(range))
- mutex_lock(&region->mutex);
- else if (!mutex_trylock(&region->mutex))
+ mutex_lock(&region->mreg_mutex);
+ else if (!mutex_trylock(&region->mreg_mutex))
goto out_fail;
mmu_interval_set_seq(mni, cur_seq);
@@ -546,12 +546,12 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
mshv_region_invalidate_pages(region, page_offset, page_count);
- mutex_unlock(&region->mutex);
+ mutex_unlock(&region->mreg_mutex);
return true;
out_unlock:
- mutex_unlock(&region->mutex);
+ mutex_unlock(&region->mreg_mutex);
out_fail:
WARN_ONCE(ret,
"Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n",
@@ -568,21 +568,21 @@ static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
void mshv_region_movable_fini(struct mshv_mem_region *region)
{
- mmu_interval_notifier_remove(&region->mni);
+ mmu_interval_notifier_remove(&region->mreg_mni);
}
bool mshv_region_movable_init(struct mshv_mem_region *region)
{
int ret;
- ret = mmu_interval_notifier_insert(&region->mni, current->mm,
+ ret = mmu_interval_notifier_insert(&region->mreg_mni, current->mm,
region->start_uaddr,
region->nr_pages << HV_HYP_PAGE_SHIFT,
&mshv_region_mni_ops);
if (ret)
return false;
- mutex_init(&region->mutex);
+ mutex_init(&region->mreg_mutex);
return true;
}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 7332d9af8373..04c2a1910a8a 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -82,16 +82,16 @@ enum mshv_region_type {
struct mshv_mem_region {
struct hlist_node hnode;
- struct kref refcount;
+ struct kref mreg_refcount;
u64 nr_pages;
u64 start_gfn;
u64 start_uaddr;
u32 hv_map_flags;
struct mshv_partition *partition;
- enum mshv_region_type type;
- struct mmu_interval_notifier mni;
- struct mutex mutex; /* protects region pages remapping */
- struct page *pages[];
+ enum mshv_region_type mreg_type;
+ struct mmu_interval_notifier mreg_mni;
+ struct mutex mreg_mutex; /* protects region pages remapping */
+ struct page *mreg_pages[];
};
struct mshv_irq_ack_notifier {
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index c633014ceb96..431aebf95bc7 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -650,7 +650,7 @@ static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
return false;
/* Only movable memory ranges are supported for GPA intercepts */
- if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
ret = mshv_region_handle_gfn_fault(region, gfn);
else
ret = false;
@@ -1193,12 +1193,12 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
return PTR_ERR(rg);
if (is_mmio)
- rg->type = MSHV_REGION_TYPE_MMIO;
+ rg->mreg_type = MSHV_REGION_TYPE_MMIO;
else if (mshv_partition_encrypted(partition) ||
!mshv_region_movable_init(rg))
- rg->type = MSHV_REGION_TYPE_MEM_PINNED;
+ rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
else
- rg->type = MSHV_REGION_TYPE_MEM_MOVABLE;
+ rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;
rg->partition = partition;
@@ -1315,7 +1315,7 @@ mshv_map_user_memory(struct mshv_partition *partition,
if (ret)
return ret;
- switch (region->type) {
+ switch (region->mreg_type) {
case MSHV_REGION_TYPE_MEM_PINNED:
ret = mshv_prepare_pinned_region(region);
break;
--
2.51.2.vfs.0.1
^ permalink raw reply related
* Re: [PATCH 1/3] x86/x2apic: disable x2apic on resume if the kernel expects so
From: Sohil Mehta @ 2026-02-04 18:53 UTC (permalink / raw)
To: Shashank Balaji
Cc: Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
H. Peter Anvin, Suresh Siddha, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
Broadcom internal kernel review list, Jan Kiszka, Paolo Bonzini,
Vitaly Kuznetsov, Juergen Gross, Boris Ostrovsky, Ingo Molnar,
linux-kernel, linux-hyperv, virtualization, jailhouse-dev, kvm,
xen-devel, Rahul Bukte, Daniel Palmer, Tim Bird, stable
In-Reply-To: <aYMOqXTYMJ_IlEFA@JPC00244420>
On 2/4/2026 1:17 AM, Shashank Balaji wrote:
> __x2apic_disable disables x2apic only if boot_cpu_has(X86_FEATURE_APIC)
> and x2apic is already enabled.
I meant the X86_FEATURE_X2APIC and not X86_FEATURE_APIC. But, thinking
about it more, checking that the CPU is really in X2APIC mode by reading
the MSR is good enough.
> x2apic_enabled also does the same checks,
> the only difference being, it uses rdmsrq_safe instead of just rdmsrq,
> which is what __x2apic_disable uses. The safe version is because of
> Boris' suggestion [1]. If that's applicable here as well, then rdmsrq in
> __x2apic_disable should be changed to rdmsrq_safe.
I don't know if there is a strong justification for changing to
rdmsrq_safe() over here. Also, that would be beyond the scope of this
patch. In general, it's better to avoid such changes unless an actual
issue pops up.
>
>> I considered if an error message should be printed along with this. But,
>> I am not sure if it can really be called a firmware issue. It's probably
>> just that newer CPUs might have started defaulting to x2apic on.
>>
>> Can you specify what platform you are encountering this?
>
>
> I'm not sure it's the CPU defaulting to x2apic on. As per Section
> 12.12.5.1 of the Intel SDM:
>
> On coming out of reset, the local APIC unit is enabled and is in
> the xAPIC mode: IA32_APIC_BASE[EN]=1 and IA32_APIC_BASE[EXTD]=0.
>
> So, the CPU should be turning on in xapic mode. In fact, when x2apic is
> disabled in the firmware, this problem doesn't happen.
>
It's a bit odd then that the firmware chooses to enable x2apic without
the OS requesting it.
Linux maintains a concept of X2APIC_ON_LOCKED in x2apic_state which is
based on the hardware preference to keep the apic in X2APIC mode.
When you have x2apic enabled in firmware, but the system is in XAPIC
mode, can you read the values in MSR_IA32_ARCH_CAPABILITIES and
MSR_IA32_XAPIC_DISABLE_STATUS?
XAPIC shouldn't be disabled because you are running in that mode. But,
it would be good to confirm.
> Either way, a pr_warn maybe helpful. How about "x2apic re-enabled by the
> firmware during resume. Disabling\n"?
I mainly want to make sure the firmware is really at fault before we add
such a print. But it seems likely now that the firmware messed up.
^ permalink raw reply
* Re: [PATCH] mshv: Make MSHV mutually exclusive with KEXEC
From: Stanislav Kinsburskii @ 2026-02-04 18:33 UTC (permalink / raw)
To: Anirudh Rayabharam
Cc: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel
In-Reply-To: <aYLaKUEp23n2gxLU@anirudh-surface.localdomain>
On Wed, Feb 04, 2026 at 05:33:29AM +0000, Anirudh Rayabharam wrote:
> On Tue, Feb 03, 2026 at 11:42:58AM -0800, Stanislav Kinsburskii wrote:
> > On Tue, Feb 03, 2026 at 04:46:03PM +0000, Anirudh Rayabharam wrote:
> > > On Tue, Feb 03, 2026 at 07:40:36AM -0800, Stanislav Kinsburskii wrote:
> > > > On Tue, Feb 03, 2026 at 10:34:28AM +0530, Anirudh Rayabharam wrote:
> > > > > On Mon, Feb 02, 2026 at 11:18:27AM -0800, Stanislav Kinsburskii wrote:
> > > > > > On Mon, Feb 02, 2026 at 07:01:01PM +0000, Anirudh Rayabharam wrote:
> > > > > > > On Mon, Feb 02, 2026 at 09:10:00AM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > On Fri, Jan 30, 2026 at 08:32:45PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > On Fri, Jan 30, 2026 at 10:46:45AM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > On Fri, Jan 30, 2026 at 05:11:12PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > > > On Wed, Jan 28, 2026 at 03:11:14PM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > On Wed, Jan 28, 2026 at 04:16:31PM +0000, Anirudh Rayabharam wrote:
> > > > > > > > > > > > > On Mon, Jan 26, 2026 at 12:46:44PM -0800, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > > > On Tue, Jan 27, 2026 at 12:19:24AM +0530, Anirudh Rayabharam wrote:
> > > > > > > > > > > > > > > On Fri, Jan 23, 2026 at 10:20:53PM +0000, Stanislav Kinsburskii wrote:
> > > > > > > > > > > > > > > > The MSHV driver deposits kernel-allocated pages to the hypervisor during
> > > > > > > > > > > > > > > > runtime and never withdraws them. This creates a fundamental incompatibility
> > > > > > > > > > > > > > > > with KEXEC, as these deposited pages remain unavailable to the new kernel
> > > > > > > > > > > > > > > > loaded via KEXEC, leading to potential system crashes upon kernel accessing
> > > > > > > > > > > > > > > > hypervisor deposited pages.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Make MSHV mutually exclusive with KEXEC until proper page lifecycle
> > > > > > > > > > > > > > > > management is implemented.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Someone might want to stop all guest VMs and do a kexec. Which is valid
> > > > > > > > > > > > > > > and would work without any issue for L1VH.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > No, it won't work and hypervsisor depostied pages won't be withdrawn.
> > > > > > > > > > > > >
> > > > > > > > > > > > > All pages that were deposited in the context of a guest partition (i.e.
> > > > > > > > > > > > > with the guest partition ID), would be withdrawn when you kill the VMs,
> > > > > > > > > > > > > right? What other deposited pages would be left?
> > > > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > The driver deposits two types of pages: one for the guests (withdrawn
> > > > > > > > > > > > upon gust shutdown) and the other - for the host itself (never
> > > > > > > > > > > > withdrawn).
> > > > > > > > > > > > See hv_call_create_partition, for example: it deposits pages for the
> > > > > > > > > > > > host partition.
> > > > > > > > > > >
> > > > > > > > > > > Hmm.. I see. Is it not possible to reclaim this memory in module_exit?
> > > > > > > > > > > Also, can't we forcefully kill all running partitions in module_exit and
> > > > > > > > > > > then reclaim memory? Would this help with kernel consistency
> > > > > > > > > > > irrespective of userspace behavior?
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > It would, but this is sloppy and cannot be a long-term solution.
> > > > > > > > > >
> > > > > > > > > > It is also not reliable. We have no hook to prevent kexec. So if we fail
> > > > > > > > > > to kill the guest or reclaim the memory for any reason, the new kernel
> > > > > > > > > > may still crash.
> > > > > > > > >
> > > > > > > > > Actually guests won't be running by the time we reach our module_exit
> > > > > > > > > function during a kexec. Userspace processes would've been killed by
> > > > > > > > > then.
> > > > > > > > >
> > > > > > > >
> > > > > > > > No, they will not: "kexec -e" doesn't kill user processes.
> > > > > > > > We must not rely on OS to do graceful shutdown before doing
> > > > > > > > kexec.
> > > > > > >
> > > > > > > I see kexec -e is too brutal. Something like systemctl kexec is
> > > > > > > more graceful and is probably used more commonly. In this case at least
> > > > > > > we could register a reboot notifier and attempt to clean things up.
> > > > > > >
> > > > > > > I think it is better to support kexec to this extent rather than
> > > > > > > disabling it entirely.
> > > > > > >
> > > > > >
> > > > > > You do understand that once our kernel is released to third parties, we
> > > > > > can’t control how they will use kexec, right?
> > > > >
> > > > > Yes, we can't. But that's okay. It is fine for us to say that only some
> > > > > kexec scenarios are supported and some aren't (iff you're creating VMs
> > > > > using MSHV; if you're not creating VMs all of kexec is supported).
> > > > >
> > > >
> > > > Well, I disagree here. If we say the kernel supports MSHV, we must
> > > > provide a robust solution. A partially working solution is not
> > > > acceptable. It makes us look careless and can damage our reputation as a
> > > > team (and as a company).
> > >
> > > It won't if we call out upfront what is supported and what is not.
> > >
> > > >
> > > > > >
> > > > > > This is a valid and existing option. We have to account for it. Yet
> > > > > > again, L1VH will be used by arbitrary third parties out there, not just
> > > > > > by us.
> > > > > >
> > > > > > We can’t say the kernel supports MSHV until we close these gaps. We must
> > > > >
> > > > > We can. It is okay say some scenarios are supported and some aren't.
> > > > >
> > > > > All kexecs are supported if they never create VMs using MSHV. If they do
> > > > > create VMs using MSHV and we implement cleanup in a reboot notifier at
> > > > > least systemctl kexec and crashdump kexec would which are probably the
> > > > > most common uses of kexec. It's okay to say that this is all we support
> > > > > as of now.
> > > > >
> > > >
> > > > I'm repeating myself, but I'll try to put it differently.
> > > > There won't be any kernel core collected if a page was deposited. You're
> > > > arguing for a lost cause here. Once a page is allocated and deposited,
> > > > the crash kernel will try to write it into the core.
> > >
> > > That's why we have to implement something where we attempt to destroy
> > > partitions and reclaim memory (and BUG() out if that fails; which
> > > hopefully should happen very rarely if at all). This should be *the*
> > > solution we work towards. We don't need a temporary disable kexec
> > > solution.
> > >
> >
> > No, the solution is to preserve the shared state and pass it over via KHO.
>
> Okay, then work towards it without doing temporary KEXEC disable. We can
> call out that kexec is not supported until then. Disabling KEXEC is too
> intrusive.
>
What do you mean by "too intrusive"? The change is local to the driver's
Kconfig. There are no verbal "callouts" in upstream Linux - that's
exactly what Kconfig is used for. Once the proper solution is
implemented, we can remove the restriction.
> Is there any precedent for this? Do you know if any driver ever disabled
> KEXEC this way?
>
No, but there is no other similar driver like this one.
Why does it matter though?
> >
> > > >
> > > > > Also, what makes you think customers would even be interested in enabling
> > > > > our module in their kernel configs if it takes away kexec?
> > > > >
> > > >
> > > > It's simple: L1VH isn't a host, so I can spin up new VMs instead of
> > > > servicing the existing ones.
> > >
> > > And what about the L2 VM state then? They might not be throwaway in all
> > > cases.
> > >
> >
> > L2 guest can (and likely will) be migrated fromt he old L1VH to the new
> > one.
> > And this is most likely the current scenario customers are using.
> >
> > > >
> > > > Why do you think there won’t be customers interested in using MSHV in
> > > > L1VH without kexec support?
> > >
> > > Because they could already be using kexec for their servicing needs or
> > > whatever. And no we can't just say "don't service these VMs just spin up
> > > new ones".
> > >
> >
> > Are you speculating or know for sure?
>
> It's a reasonable assumption that people are using kexec for servicing.
>
Again, using kexec for servicing is not supported: why pretend it is?
> >
> > > Also, keep in mind that once L1VH is available in Azure, the distros
> > > that run on it would be the same distros that run on all other Azure
> > > VMs. There won't be special distros with a kernel specifically built for
> > > L1VH. And KEXEC is generally enabled in distros. Distro vendors won't be
> > > happy that they would need to publish a separate version of their image with
> > > MSHV_ROOT enabled and KEXEC disabled because they wouldn't want KEXEC to
> > > be disabled for all Azure VMs. Also, the customers will be confused why
> > > the same distro doesn't work on L1VH.
> > >
> >
> > I don't think distro happiness is our concern. They already build custom
>
> If distros are not happy they won't package this and consequently
> nobody will use it.
>
Could you provide an example of such issues in the past?
> > versions for Azure. They can build another custom version for L1VH if
> > needed.
>
> We should at least check if they are ready to do this.
>
This is a labor-intensive and long-term check. Unless there is solid
evidence that they won't do it, I don't see the point in doing this.
Thanks,
Stanislav
> Thanks,
> Anirudh.
>
> >
> > Anyway, I don't see the point in continuing this discussion. All points
> > have been made, and solutions have been proposed.
> >
> > If you can come up with something better in the next few days, so we at
> > least have a chance to get it merged in the next merge window, great. If
> > not, we should explicitly forbid the unsupported feature and move on.
> >
> > Thanks,
> > Thanks,
> > Stanislav
> >
> > > Thanks,
> > > Anirudh.
^ permalink raw reply
* [PATCH v3 2/2] mshv: add arm64 support for doorbell & intercept SINTs
From: Anirudh Rayabharam @ 2026-02-04 17:42 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel; +Cc: anirudh
In-Reply-To: <20260204174237.1201153-1-anirudh@anirudhrb.com>
From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
On x86, the HYPERVISOR_CALLBACK_VECTOR is used to receive synthetic
interrupts (SINTs) from the hypervisor for doorbells and intercepts.
There is no such vector reserved for arm64.
On arm64, the INTID for SINTs should be in the SGI or PPI range. The
hypervisor exposes a virtual device in the ACPI that reserves a
PPI for this use. Introduce a platform_driver that binds to this ACPI
device and obtains the interrupt vector that can be used for SINTs.
To better unify the code paths, introduce mshv_sint_vector_init() that
either registers the platform_driver and obtains the INTID (arm64) or
just uses HYPERVISOR_CALLBACK_VECTOR as the interrupt vector (x86).
Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
---
drivers/hv/mshv_synic.c | 163 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 156 insertions(+), 7 deletions(-)
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index 074e37c48876..eefb1e8fc1b4 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -10,17 +10,24 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/random.h>
#include <linux/cpuhotplug.h>
#include <linux/reboot.h>
#include <asm/mshyperv.h>
+#include <linux/platform_device.h>
+#include <linux/acpi.h>
#include "mshv_eventfd.h"
#include "mshv.h"
static int synic_cpuhp_online;
static struct hv_synic_pages __percpu *synic_pages;
+static int mshv_sint_vector = -1; /* hwirq for the SynIC SINTs */
+#ifndef HYPERVISOR_CALLBACK_VECTOR
+static int mshv_sint_irq = -1; /* Linux IRQ for mshv_sint_vector */
+#endif
static u32 synic_event_ring_get_queued_port(u32 sint_index)
{
@@ -456,9 +463,7 @@ static int mshv_synic_cpu_init(unsigned int cpu)
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sirbp sirbp;
-#ifdef HYPERVISOR_CALLBACK_VECTOR
union hv_synic_sint sint;
-#endif
union hv_synic_scontrol sctrl;
struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
@@ -501,10 +506,13 @@ static int mshv_synic_cpu_init(unsigned int cpu)
hv_set_non_nested_msr(HV_MSR_SIRBP, sirbp.as_uint64);
-#ifdef HYPERVISOR_CALLBACK_VECTOR
+#ifndef HYPERVISOR_CALLBACK_VECTOR
+ enable_percpu_irq(mshv_sint_irq, 0);
+#endif
+
/* Enable intercepts */
sint.as_uint64 = 0;
- sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.vector = mshv_sint_vector;
sint.masked = false;
sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
@@ -512,13 +520,12 @@ static int mshv_synic_cpu_init(unsigned int cpu)
/* Doorbell SINT */
sint.as_uint64 = 0;
- sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.vector = mshv_sint_vector;
sint.masked = false;
sint.as_intercept = 1;
sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
sint.as_uint64);
-#endif
/* Enable global synic bit */
sctrl.as_uint64 = hv_get_non_nested_msr(HV_MSR_SCONTROL);
@@ -573,6 +580,10 @@ static int mshv_synic_cpu_exit(unsigned int cpu)
hv_set_non_nested_msr(HV_MSR_SINT0 + HV_SYNIC_DOORBELL_SINT_INDEX,
sint.as_uint64);
+#ifndef HYPERVISOR_CALLBACK_VECTOR
+ disable_percpu_irq(mshv_sint_irq);
+#endif
+
/* Disable Synic's event ring page */
sirbp.as_uint64 = hv_get_non_nested_msr(HV_MSR_SIRBP);
sirbp.sirbp_enabled = false;
@@ -683,14 +694,149 @@ static struct notifier_block mshv_synic_reboot_nb = {
.notifier_call = mshv_synic_reboot_notify,
};
+#ifndef HYPERVISOR_CALLBACK_VECTOR
+#ifdef CONFIG_ACPI
+static long __percpu *mshv_evt;
+
+static acpi_status mshv_walk_resources(struct acpi_resource *res, void *ctx)
+{
+ struct resource r;
+
+ if (res->type == ACPI_RESOURCE_TYPE_EXTENDED_IRQ) {
+ if (!acpi_dev_resource_interrupt(res, 0, &r)) {
+ pr_err("Unable to parse MSHV ACPI interrupt\n");
+ return AE_ERROR;
+ }
+ /* ARM64 INTID */
+ mshv_sint_vector = res->data.extended_irq.interrupts[0];
+ /* Linux IRQ number */
+ mshv_sint_irq = r.start;
+ }
+
+ return AE_OK;
+}
+
+static irqreturn_t mshv_percpu_isr(int irq, void *dev_id)
+{
+ mshv_isr();
+ return IRQ_HANDLED;
+}
+
+static int mshv_sint_probe(struct platform_device *pdev)
+{
+ acpi_status result;
+ int ret;
+ struct acpi_device *device = ACPI_COMPANION(&pdev->dev);
+
+ result = acpi_walk_resources(device->handle, METHOD_NAME__CRS,
+ mshv_walk_resources, NULL);
+ if (ACPI_FAILURE(result)) {
+ ret = -ENODEV;
+ goto out_fail;
+ }
+
+ mshv_evt = alloc_percpu(long);
+ if (!mshv_evt) {
+ ret = -ENOMEM;
+ goto out_fail;
+ }
+
+ ret = request_percpu_irq(mshv_sint_irq, mshv_percpu_isr, "MSHV",
+ mshv_evt);
+ if (ret)
+ goto free_evt;
+
+ return 0;
+
+free_evt:
+ free_percpu(mshv_evt);
+out_fail:
+ mshv_sint_vector = -1;
+ mshv_sint_irq = -1;
+ return ret;
+}
+
+static void mshv_sint_remove(struct platform_device *pdev)
+{
+ free_percpu_irq(mshv_sint_irq, mshv_evt);
+ free_percpu(mshv_evt);
+}
+#else
+static int mshv_sint_probe(struct platform_device *pdev)
+{
+ return -ENODEV;
+}
+
+static void mshv_sint_remove(struct platform_device *pdev)
+{
+}
+#endif
+
+static const __maybe_unused struct acpi_device_id mshv_sint_device_ids[] = {
+ {"MSFT1003", 0},
+ {"", 0},
+};
+
+static struct platform_driver mshv_sint_drv = {
+ .probe = mshv_sint_probe,
+ .remove = mshv_sint_remove,
+ .driver = {
+ .name = "mshv_sint",
+ .acpi_match_table = ACPI_PTR(mshv_sint_device_ids),
+ .probe_type = PROBE_FORCE_SYNCHRONOUS,
+ },
+};
+
+static int __init mshv_sint_vector_init(void)
+{
+ int ret;
+
+ if (acpi_disabled)
+ return -ENODEV;
+
+ ret = platform_driver_register(&mshv_sint_drv);
+ if (ret)
+ return ret;
+
+ if (mshv_sint_vector == -1 || mshv_sint_irq == -1) {
+ platform_driver_unregister(&mshv_sint_drv);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static void mshv_sint_vector_cleanup(void)
+{
+ platform_driver_unregister(&mshv_sint_drv);
+}
+#else /* HYPERVISOR_CALLBACK_VECTOR */
+static int __init mshv_sint_vector_init(void)
+{
+ mshv_sint_vector = HYPERVISOR_CALLBACK_VECTOR;
+ return 0;
+}
+
+static void mshv_sint_vector_cleanup(void)
+{
+}
+#endif /* HYPERVISOR_CALLBACK_VECTOR */
+
int __init mshv_synic_init(struct device *dev)
{
int ret = 0;
+ ret = mshv_sint_vector_init();
+ if (ret) {
+ dev_err(dev, "Failed to get MSHV SINT vector: %i\n", ret);
+ return ret;
+ }
+
synic_pages = alloc_percpu(struct hv_synic_pages);
if (!synic_pages) {
dev_err(dev, "Failed to allocate percpu synic page\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto sint_vector_cleanup;
}
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
@@ -713,6 +859,8 @@ int __init mshv_synic_init(struct device *dev)
cpuhp_remove_state(synic_cpuhp_online);
free_synic_pages:
free_percpu(synic_pages);
+sint_vector_cleanup:
+ mshv_sint_vector_cleanup();
return ret;
}
@@ -721,4 +869,5 @@ void mshv_synic_cleanup(void)
unregister_reboot_notifier(&mshv_synic_reboot_nb);
cpuhp_remove_state(synic_cpuhp_online);
free_percpu(synic_pages);
+ mshv_sint_vector_cleanup();
}
--
2.34.1
^ permalink raw reply related
* [PATCH v3 1/2] mshv: refactor synic init and cleanup
From: Anirudh Rayabharam @ 2026-02-04 17:42 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel; +Cc: anirudh
In-Reply-To: <20260204174237.1201153-1-anirudh@anirudhrb.com>
From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
Rename mshv_synic_init() to mshv_synic_cpu_init() and
mshv_synic_cleanup() to mshv_synic_cpu_exit() to better reflect that
these functions handle per-cpu synic setup and teardown.
Use mshv_synic_init/cleanup() to perform init/cleanup that is not per-cpu.
Move all the synic related setup from mshv_parent_partition_init.
Move the reboot notifier to mshv_synic.c because it currently only
operates on the synic cpuhp state.
Move out synic_pages from the global mshv_root since its use is now
completely local to mshv_synic.c.
This is in preparation for the next patch which will add more stuff to
mshv_synic_init().
No functional change.
Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
---
drivers/hv/mshv_root.h | 5 ++-
drivers/hv/mshv_root_main.c | 59 +++++-------------------------
drivers/hv/mshv_synic.c | 71 +++++++++++++++++++++++++++++++++----
3 files changed, 75 insertions(+), 60 deletions(-)
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 3c1d88b36741..26e0320c8097 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -183,7 +183,6 @@ struct hv_synic_pages {
};
struct mshv_root {
- struct hv_synic_pages __percpu *synic_pages;
spinlock_t pt_ht_lock;
DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS);
struct hv_partition_property_vmm_capabilities vmm_caps;
@@ -242,8 +241,8 @@ int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
void mshv_unregister_doorbell(u64 partition_id, int doorbell_portid);
void mshv_isr(void);
-int mshv_synic_init(unsigned int cpu);
-int mshv_synic_cleanup(unsigned int cpu);
+int mshv_synic_init(struct device *dev);
+void mshv_synic_cleanup(void);
static inline bool mshv_partition_encrypted(struct mshv_partition *partition)
{
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 681b58154d5e..7c1666456e78 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -2035,7 +2035,6 @@ mshv_dev_release(struct inode *inode, struct file *filp)
return 0;
}
-static int mshv_cpuhp_online;
static int mshv_root_sched_online;
static const char *scheduler_type_to_string(enum hv_scheduler_type type)
@@ -2198,40 +2197,14 @@ root_scheduler_deinit(void)
free_percpu(root_scheduler_output);
}
-static int mshv_reboot_notify(struct notifier_block *nb,
- unsigned long code, void *unused)
-{
- cpuhp_remove_state(mshv_cpuhp_online);
- return 0;
-}
-
-struct notifier_block mshv_reboot_nb = {
- .notifier_call = mshv_reboot_notify,
-};
-
static void mshv_root_partition_exit(void)
{
- unregister_reboot_notifier(&mshv_reboot_nb);
root_scheduler_deinit();
}
static int __init mshv_root_partition_init(struct device *dev)
{
- int err;
-
- err = root_scheduler_init(dev);
- if (err)
- return err;
-
- err = register_reboot_notifier(&mshv_reboot_nb);
- if (err)
- goto root_sched_deinit;
-
- return 0;
-
-root_sched_deinit:
- root_scheduler_deinit();
- return err;
+ return root_scheduler_init(dev);
}
static void mshv_init_vmm_caps(struct device *dev)
@@ -2276,31 +2249,18 @@ static int __init mshv_parent_partition_init(void)
MSHV_HV_MAX_VERSION);
}
- mshv_root.synic_pages = alloc_percpu(struct hv_synic_pages);
- if (!mshv_root.synic_pages) {
- dev_err(dev, "Failed to allocate percpu synic page\n");
- ret = -ENOMEM;
+ ret = mshv_synic_init(dev);
+ if (ret)
goto device_deregister;
- }
-
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
- mshv_synic_init,
- mshv_synic_cleanup);
- if (ret < 0) {
- dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
- goto free_synic_pages;
- }
-
- mshv_cpuhp_online = ret;
ret = mshv_retrieve_scheduler_type(dev);
if (ret)
- goto remove_cpu_state;
+ goto synic_cleanup;
if (hv_root_partition())
ret = mshv_root_partition_init(dev);
if (ret)
- goto remove_cpu_state;
+ goto synic_cleanup;
mshv_init_vmm_caps(dev);
@@ -2318,10 +2278,8 @@ static int __init mshv_parent_partition_init(void)
exit_partition:
if (hv_root_partition())
mshv_root_partition_exit();
-remove_cpu_state:
- cpuhp_remove_state(mshv_cpuhp_online);
-free_synic_pages:
- free_percpu(mshv_root.synic_pages);
+synic_cleanup:
+ mshv_synic_cleanup();
device_deregister:
misc_deregister(&mshv_dev);
return ret;
@@ -2335,8 +2293,7 @@ static void __exit mshv_parent_partition_exit(void)
mshv_irqfd_wq_cleanup();
if (hv_root_partition())
mshv_root_partition_exit();
- cpuhp_remove_state(mshv_cpuhp_online);
- free_percpu(mshv_root.synic_pages);
+ mshv_synic_cleanup();
}
module_init(mshv_parent_partition_init);
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index f8b0337cdc82..074e37c48876 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -12,11 +12,16 @@
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/random.h>
+#include <linux/cpuhotplug.h>
+#include <linux/reboot.h>
#include <asm/mshyperv.h>
#include "mshv_eventfd.h"
#include "mshv.h"
+static int synic_cpuhp_online;
+static struct hv_synic_pages __percpu *synic_pages;
+
static u32 synic_event_ring_get_queued_port(u32 sint_index)
{
struct hv_synic_event_ring_page **event_ring_page;
@@ -26,7 +31,7 @@ static u32 synic_event_ring_get_queued_port(u32 sint_index)
u32 message;
u8 tail;
- spages = this_cpu_ptr(mshv_root.synic_pages);
+ spages = this_cpu_ptr(synic_pages);
event_ring_page = &spages->synic_event_ring_page;
synic_eventring_tail = (u8 **)this_cpu_ptr(hv_synic_eventring_tail);
@@ -393,7 +398,7 @@ mshv_intercept_isr(struct hv_message *msg)
void mshv_isr(void)
{
- struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_message *msg;
bool handled;
@@ -446,7 +451,7 @@ void mshv_isr(void)
}
}
-int mshv_synic_init(unsigned int cpu)
+static int mshv_synic_cpu_init(unsigned int cpu)
{
union hv_synic_simp simp;
union hv_synic_siefp siefp;
@@ -455,7 +460,7 @@ int mshv_synic_init(unsigned int cpu)
union hv_synic_sint sint;
#endif
union hv_synic_scontrol sctrl;
- struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
@@ -542,14 +547,14 @@ int mshv_synic_init(unsigned int cpu)
return -EFAULT;
}
-int mshv_synic_cleanup(unsigned int cpu)
+static int mshv_synic_cpu_exit(unsigned int cpu)
{
union hv_synic_sint sint;
union hv_synic_simp simp;
union hv_synic_siefp siefp;
union hv_synic_sirbp sirbp;
union hv_synic_scontrol sctrl;
- struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages);
+ struct hv_synic_pages *spages = this_cpu_ptr(synic_pages);
struct hv_message_page **msg_page = &spages->hyp_synic_message_page;
struct hv_synic_event_flags_page **event_flags_page =
&spages->synic_event_flags_page;
@@ -663,3 +668,57 @@ mshv_unregister_doorbell(u64 partition_id, int doorbell_portid)
mshv_portid_free(doorbell_portid);
}
+
+static int mshv_synic_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (!hv_root_partition())
+ return 0;
+
+ cpuhp_remove_state(synic_cpuhp_online);
+ return 0;
+}
+
+static struct notifier_block mshv_synic_reboot_nb = {
+ .notifier_call = mshv_synic_reboot_notify,
+};
+
+int __init mshv_synic_init(struct device *dev)
+{
+ int ret = 0;
+
+ synic_pages = alloc_percpu(struct hv_synic_pages);
+ if (!synic_pages) {
+ dev_err(dev, "Failed to allocate percpu synic page\n");
+ return -ENOMEM;
+ }
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mshv_synic",
+ mshv_synic_cpu_init,
+ mshv_synic_cpu_exit);
+ if (ret < 0) {
+ dev_err(dev, "Failed to setup cpu hotplug state: %i\n", ret);
+ goto free_synic_pages;
+ }
+
+ synic_cpuhp_online = ret;
+
+ ret = register_reboot_notifier(&mshv_synic_reboot_nb);
+ if (ret)
+ goto remove_cpuhp_state;
+
+ return 0;
+
+remove_cpuhp_state:
+ cpuhp_remove_state(synic_cpuhp_online);
+free_synic_pages:
+ free_percpu(synic_pages);
+ return ret;
+}
+
+void mshv_synic_cleanup(void)
+{
+ unregister_reboot_notifier(&mshv_synic_reboot_nb);
+ cpuhp_remove_state(synic_cpuhp_online);
+ free_percpu(synic_pages);
+}
--
2.34.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox