* [PATCH v4] x86/hyperv: Fix kdump on Azure CVMs
@ 2025-08-28 9:16 Vitaly Kuznetsov
2025-08-29 17:03 ` Michael Kelley
0 siblings, 1 reply; 3+ messages in thread
From: Vitaly Kuznetsov @ 2025-08-28 9:16 UTC (permalink / raw)
To: linux-hyperv, Michael Kelley
Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, x86,
linux-kernel, Nuno Das Neves, Tianyu Lan, Li Tian, Philipp Rudo
Azure CVM instance types featuring a paravisor hang upon kdump. The
investigation shows that makedumpfile causes a hang when it steps on a
page which was previously shared with the host
(HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY). The new kernel has no
knowledge of these 'special' regions (which are VMBus connection pages,
GPADL buffers, ...). There are several ways to approach the issue:
- Convey the knowledge about these regions to the new kernel somehow.
- Unshare these regions before accessing them in the new kernel (it is
  unclear if there's a way to query the status for a given GPA range).
- Unshare these regions before jumping to the new kernel (which this
  patch implements).

To make the procedure as robust as possible, store PFN ranges of shared
regions in a linked list instead of storing GVAs and re-using
hv_vtom_set_host_visibility(). This also makes it possible to avoid
memory allocation on the kdump/kexec path.
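For illustration only, a rough user-space model of the PFN-range
coalescing this relies on might look like the sketch below (not part of
the patch; the struct, track_pfn() and the sample PFNs are invented for
the example -- the real, locked implementation is hv_list_enc_add() /
hv_list_enc_remove() in the diff):

#include <stdio.h>
#include <stdlib.h>

/* One contiguous run of PFNs that were shared with the host. */
struct pfn_region {
        unsigned long long pfn;         /* first PFN in the run */
        int count;                      /* number of consecutive PFNs */
        struct pfn_region *next;
};

static struct pfn_region *regions;

/* Record one shared PFN, coalescing with an adjacent run when possible. */
static int track_pfn(unsigned long long pfn)
{
        struct pfn_region *r;

        /* Already covered by an existing run? Nothing to do. */
        for (r = regions; r; r = r->next)
                if (pfn >= r->pfn && pfn < r->pfn + r->count)
                        return 0;

        /* Adjacent to an existing run? Grow it up or down. */
        for (r = regions; r; r = r->next) {
                if (pfn == r->pfn + r->count) {
                        r->count++;
                        return 0;
                }
                if (pfn + 1 == r->pfn) {
                        r->pfn--;
                        r->count++;
                        return 0;
                }
        }

        /* No adjacent run found -- start a new one. */
        r = calloc(1, sizeof(*r));
        if (!r)
                return -1;
        r->pfn = pfn;
        r->count = 1;
        r->next = regions;
        regions = r;
        return 0;
}

int main(void)
{
        unsigned long long pfns[] = { 100, 101, 102, 200, 99 };
        struct pfn_region *r;
        unsigned int i;

        for (i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
                track_pfn(pfns[i]);

        /* Expect two runs: [99..102] and [200]. */
        for (r = regions; r; r = r->next)
                printf("region: %llu +%d\n", r->pfn, r->count);
        return 0;
}

The kernel-side code additionally has to take hv_list_enc_lock and cope
with runs shrinking or splitting when pages are converted back to
private.
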
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
Changes since v3 [Michael Kelley]:
- Employ x86_platform.guest.enc_kexec_{begin,finish} hooks.
- Don't use spinlock in what's now hv_vtom_kexec_finish().
- Handle possible hypercall failures in hv_mark_gpa_visibility()
symmetrically; change hv_list_enc_remove() to return -ENOMEM as well.
- Rebase to the latest hyperv/next.
---
arch/x86/hyperv/ivm.c | 211 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 210 insertions(+), 1 deletion(-)
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index ade6c665c97e..a4615b889f3e 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -462,6 +462,195 @@ void hv_ivm_msr_read(u64 msr, u64 *value)
hv_ghcb_msr_read(msr, value);
}
+/*
+ * Keep track of the PFN regions which were shared with the host. The access
+ * must be revoked upon kexec/kdump (see hv_vtom_kexec_finish()).
+ */
+struct hv_enc_pfn_region {
+ struct list_head list;
+ u64 pfn;
+ int count;
+};
+
+static LIST_HEAD(hv_list_enc);
+static DEFINE_RAW_SPINLOCK(hv_list_enc_lock);
+
+static int hv_list_enc_add(const u64 *pfn_list, int count)
+{
+ struct hv_enc_pfn_region *ent;
+ unsigned long flags;
+ u64 pfn;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pfn = pfn_list[i];
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ /* Check if the PFN already exists in some region first */
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn))
+ /* Nothing to do - pfn is already in the list */
+ goto unlock_done;
+ }
+
+ /*
+ * Check if the PFN is adjacent to an existing region. Growing
+ * a region can make it adjacent to another one but merging is
+ * not (yet) implemented for simplicity. A PFN cannot be added
+ * to two regions to keep the logic in hv_list_enc_remove()
+ * correct.
+ */
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ if (ent->pfn + ent->count == pfn) {
+ /* Grow existing region up */
+ ent->count++;
+ goto unlock_done;
+ } else if (pfn + 1 == ent->pfn) {
+ /* Grow existing region down */
+ ent->pfn--;
+ ent->count++;
+ goto unlock_done;
+ }
+ }
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+ /* No adjacent region found -- create a new one */
+ ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pfn = pfn;
+ ent->count = 1;
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_add(&ent->list, &hv_list_enc);
+
+unlock_done:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ }
+
+ return 0;
+}
+
+static int hv_list_enc_remove(const u64 *pfn_list, int count)
+{
+ struct hv_enc_pfn_region *ent, *t;
+ struct hv_enc_pfn_region new_region;
+ unsigned long flags;
+ u64 pfn;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pfn = pfn_list[i];
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_for_each_entry_safe(ent, t, &hv_list_enc, list) {
+ if (pfn == ent->pfn + ent->count - 1) {
+ /* Removing tail pfn */
+ ent->count--;
+ if (!ent->count) {
+ list_del(&ent->list);
+ kfree(ent);
+ }
+ goto unlock_done;
+ } else if (pfn == ent->pfn) {
+ /* Removing head pfn */
+ ent->count--;
+ ent->pfn++;
+ if (!ent->count) {
+ list_del(&ent->list);
+ kfree(ent);
+ }
+ goto unlock_done;
+ } else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) {
+ /*
+ * Removing a pfn in the middle. Cut off the tail
+ * of the existing region and create a template for
+ * the new one.
+ */
+ new_region.pfn = pfn + 1;
+ new_region.count = ent->count - (pfn - ent->pfn + 1);
+ ent->count = pfn - ent->pfn;
+ goto unlock_split;
+ }
+
+ }
+unlock_done:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ continue;
+
+unlock_split:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+ ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pfn = new_region.pfn;
+ ent->count = new_region.count;
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_add(&ent->list, &hv_list_enc);
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ }
+
+ return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void hv_vtom_kexec_begin(void)
+{
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ /*
+ * Crash kernel reaches here with interrupts disabled: can't wait for
+ * conversions to finish.
+ *
+ * If race happened, just report and proceed.
+ */
+ if (!set_memory_enc_stop_conversion())
+ pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+static void hv_vtom_kexec_finish(void)
+{
+ struct hv_gpa_range_for_visibility *input;
+ struct hv_enc_pfn_region *ent;
+ unsigned long flags;
+ u64 hv_status;
+ int cur, i;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (unlikely(!input))
+ goto out;
+
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ for (i = 0, cur = 0; i < ent->count; i++) {
+ input->gpa_page_list[cur] = ent->pfn + i;
+ cur++;
+
+ if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) {
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->host_visibility = VMBUS_PAGE_NOT_VISIBLE;
+ input->reserved0 = 0;
+ input->reserved1 = 0;
+ hv_status = hv_do_rep_hypercall(
+ HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY,
+ cur, 0, input, NULL);
+ WARN_ON_ONCE(!hv_result_success(hv_status));
+ cur = 0;
+ }
+ }
+
+ }
+
+out:
+ local_irq_restore(flags);
+}
+
/*
* hv_mark_gpa_visibility - Set pages visible to host via hvcall.
*
@@ -475,6 +664,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
struct hv_gpa_range_for_visibility *input;
u64 hv_status;
unsigned long flags;
+ int ret;
/* no-op if partition isolation is not enabled */
if (!hv_is_isolation_supported())
@@ -486,6 +676,13 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
return -EINVAL;
}
+ if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+ ret = hv_list_enc_remove(pfn, count);
+ else
+ ret = hv_list_enc_add(pfn, count);
+ if (ret)
+ return ret;
+
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -506,8 +703,18 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
if (hv_result_success(hv_status))
return 0;
+
+ if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+ ret = hv_list_enc_add(pfn, count);
else
- return -EFAULT;
+ ret = hv_list_enc_remove(pfn, count);
+ /*
+ * There's no good way to recover from -ENOMEM here, the accounting is
+ * wrong either way.
+ */
+ WARN_ON_ONCE(ret);
+
+ return -EFAULT;
}
/*
@@ -669,6 +876,8 @@ void __init hv_vtom_init(void)
x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present;
x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+ x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin;
+ x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish;
/* Set WB as the default cache mode. */
guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
--
2.50.1
* RE: [PATCH v4] x86/hyperv: Fix kdump on Azure CVMs
2025-08-28 9:16 [PATCH v4] x86/hyperv: Fix kdump on Azure CVMs Vitaly Kuznetsov
@ 2025-08-29 17:03 ` Michael Kelley
2025-08-31 17:37 ` Tianyu Lan
0 siblings, 1 reply; 3+ messages in thread
From: Michael Kelley @ 2025-08-29 17:03 UTC (permalink / raw)
To: Vitaly Kuznetsov, linux-hyperv@vger.kernel.org,
linux-coco@lists.linux.dev
Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
x86@kernel.org, linux-kernel@vger.kernel.org, Nuno Das Neves,
Tianyu Lan, Li Tian, Philipp Rudo
From: Vitaly Kuznetsov <vkuznets@redhat.com> Sent: Thursday, August 28, 2025 2:16 AM
> [...]

Looks good!

Adding Confidential Computing mailing list (linux-coco@lists.linux.dev)
for visibility.

Reviewed-by: Michael Kelley <mhklinux@outlook.com>
* Re: [PATCH v4] x86/hyperv: Fix kdump on Azure CVMs
2025-08-29 17:03 ` Michael Kelley
@ 2025-08-31 17:37 ` Tianyu Lan
0 siblings, 0 replies; 3+ messages in thread
From: Tianyu Lan @ 2025-08-31 17:37 UTC (permalink / raw)
To: Michael Kelley
Cc: Vitaly Kuznetsov, linux-hyperv@vger.kernel.org,
linux-coco@lists.linux.dev, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, x86@kernel.org, linux-kernel@vger.kernel.org,
Nuno Das Neves, Tianyu Lan, Li Tian, Philipp Rudo
On Sat, Aug 30, 2025 at 1:03 AM Michael Kelley <mhklinux@outlook.com> wrote:
> [...]

Reviewed-by: Tianyu Lan <tiala@microsoft.com>
--
Thanks
Tianyu Lan