* [PATCH v4] x86/hyperv: Fix kdump on Azure CVMs
@ 2025-08-28 9:16 Vitaly Kuznetsov
2025-08-29 17:03 ` Michael Kelley
0 siblings, 1 reply; 3+ messages in thread
From: Vitaly Kuznetsov @ 2025-08-28 9:16 UTC (permalink / raw)
To: linux-hyperv, Michael Kelley
Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, x86,
linux-kernel, Nuno Das Neves, Tianyu Lan, Li Tian, Philipp Rudo
Azure CVM instance types featuring a paravisor hang upon kdump. The
investigation shows that makedumpfile causes a hang when it steps on a
page which was previously shared with the host
(HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY). The new kernel has no
knowledge of these 'special' regions (which are VMBus connection pages,
GPADL buffers, ...). There are several ways to approach the issue:
- Convey the knowledge about these regions to the new kernel somehow.
- Unshare these regions before accessing them in the new kernel (it is
  unclear if there's a way to query the status for a given GPA range).
- Unshare these regions before jumping to the new kernel (which this
  patch implements).

To make the procedure as robust as possible, store PFN ranges of shared
regions in a linked list instead of storing GVAs and re-using
hv_vtom_set_host_visibility(). This also makes it possible to avoid
memory allocation on the kdump/kexec path.
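For illustration only, a rough user-space model of the PFN-range
coalescing this relies on might look like the sketch below (not part of
the patch; the struct, track_pfn() and the sample PFNs are invented for
the example -- the real, locked implementation is hv_list_enc_add() /
hv_list_enc_remove() in the diff):

#include <stdio.h>
#include <stdlib.h>

/* One contiguous run of PFNs that were shared with the host. */
struct pfn_region {
        unsigned long long pfn;         /* first PFN in the run */
        int count;                      /* number of consecutive PFNs */
        struct pfn_region *next;
};

static struct pfn_region *regions;

/* Record one shared PFN, coalescing with an adjacent run when possible. */
static int track_pfn(unsigned long long pfn)
{
        struct pfn_region *r;

        /* Already covered by an existing run? Nothing to do. */
        for (r = regions; r; r = r->next)
                if (pfn >= r->pfn && pfn < r->pfn + r->count)
                        return 0;

        /* Adjacent to an existing run? Grow it up or down. */
        for (r = regions; r; r = r->next) {
                if (pfn == r->pfn + r->count) {
                        r->count++;
                        return 0;
                }
                if (pfn + 1 == r->pfn) {
                        r->pfn--;
                        r->count++;
                        return 0;
                }
        }

        /* No adjacent run found -- start a new one. */
        r = calloc(1, sizeof(*r));
        if (!r)
                return -1;
        r->pfn = pfn;
        r->count = 1;
        r->next = regions;
        regions = r;
        return 0;
}

int main(void)
{
        unsigned long long pfns[] = { 100, 101, 102, 200, 99 };
        struct pfn_region *r;
        unsigned int i;

        for (i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
                track_pfn(pfns[i]);

        /* Expect two runs: [99..102] and [200]. */
        for (r = regions; r; r = r->next)
                printf("region: %llu +%d\n", r->pfn, r->count);
        return 0;
}

The kernel-side code additionally has to take hv_list_enc_lock and cope
with runs shrinking or splitting when pages are converted back to
private.
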
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
---
Changes since v3 [Michael Kelley]:
- Employ x86_platform.guest.enc_kexec_{begin,finish} hooks.
- Don't use spinlock in what's now hv_vtom_kexec_finish().
- Handle possible hypercall failures in hv_mark_gpa_visibility()
symmetrically; change hv_list_enc_remove() to return -ENOMEM as well.
- Rebase to the latest hyperv/next.
---
arch/x86/hyperv/ivm.c | 211 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 210 insertions(+), 1 deletion(-)
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index ade6c665c97e..a4615b889f3e 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -462,6 +462,195 @@ void hv_ivm_msr_read(u64 msr, u64 *value)
hv_ghcb_msr_read(msr, value);
}
+/*
+ * Keep track of the PFN regions which were shared with the host. The access
+ * must be revoked upon kexec/kdump (see hv_vtom_kexec_finish()).
+ */
+struct hv_enc_pfn_region {
+ struct list_head list;
+ u64 pfn;
+ int count;
+};
+
+static LIST_HEAD(hv_list_enc);
+static DEFINE_RAW_SPINLOCK(hv_list_enc_lock);
+
+static int hv_list_enc_add(const u64 *pfn_list, int count)
+{
+ struct hv_enc_pfn_region *ent;
+ unsigned long flags;
+ u64 pfn;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pfn = pfn_list[i];
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ /* Check if the PFN already exists in some region first */
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn))
+ /* Nothing to do - pfn is already in the list */
+ goto unlock_done;
+ }
+
+ /*
+ * Check if the PFN is adjacent to an existing region. Growing
+ * a region can make it adjacent to another one but merging is
+ * not (yet) implemented for simplicity. A PFN cannot be added
+ * to two regions to keep the logic in hv_list_enc_remove()
+ * correct.
+ */
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ if (ent->pfn + ent->count == pfn) {
+ /* Grow existing region up */
+ ent->count++;
+ goto unlock_done;
+ } else if (pfn + 1 == ent->pfn) {
+ /* Grow existing region down */
+ ent->pfn--;
+ ent->count++;
+ goto unlock_done;
+ }
+ }
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+ /* No adjacent region found -- create a new one */
+ ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pfn = pfn;
+ ent->count = 1;
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_add(&ent->list, &hv_list_enc);
+
+unlock_done:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ }
+
+ return 0;
+}
+
+static int hv_list_enc_remove(const u64 *pfn_list, int count)
+{
+ struct hv_enc_pfn_region *ent, *t;
+ struct hv_enc_pfn_region new_region;
+ unsigned long flags;
+ u64 pfn;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ pfn = pfn_list[i];
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_for_each_entry_safe(ent, t, &hv_list_enc, list) {
+ if (pfn == ent->pfn + ent->count - 1) {
+ /* Removing tail pfn */
+ ent->count--;
+ if (!ent->count) {
+ list_del(&ent->list);
+ kfree(ent);
+ }
+ goto unlock_done;
+ } else if (pfn == ent->pfn) {
+ /* Removing head pfn */
+ ent->count--;
+ ent->pfn++;
+ if (!ent->count) {
+ list_del(&ent->list);
+ kfree(ent);
+ }
+ goto unlock_done;
+ } else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) {
+ /*
+ * Removing a pfn in the middle. Cut off the tail
+ * of the existing region and create a template for
+ * the new one.
+ */
+ new_region.pfn = pfn + 1;
+ new_region.count = ent->count - (pfn - ent->pfn + 1);
+ ent->count = pfn - ent->pfn;
+ goto unlock_split;
+ }
+
+ }
+unlock_done:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ continue;
+
+unlock_split:
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+ ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pfn = new_region.pfn;
+ ent->count = new_region.count;
+
+ raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+ list_add(&ent->list, &hv_list_enc);
+ raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+ }
+
+ return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void hv_vtom_kexec_begin(void)
+{
+ if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ return;
+
+ /*
+ * Crash kernel reaches here with interrupts disabled: can't wait for
+ * conversions to finish.
+ *
+ * If race happened, just report and proceed.
+ */
+ if (!set_memory_enc_stop_conversion())
+ pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+static void hv_vtom_kexec_finish(void)
+{
+ struct hv_gpa_range_for_visibility *input;
+ struct hv_enc_pfn_region *ent;
+ unsigned long flags;
+ u64 hv_status;
+ int cur, i;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+ if (unlikely(!input))
+ goto out;
+
+ list_for_each_entry(ent, &hv_list_enc, list) {
+ for (i = 0, cur = 0; i < ent->count; i++) {
+ input->gpa_page_list[cur] = ent->pfn + i;
+ cur++;
+
+ if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) {
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->host_visibility = VMBUS_PAGE_NOT_VISIBLE;
+ input->reserved0 = 0;
+ input->reserved1 = 0;
+ hv_status = hv_do_rep_hypercall(
+ HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY,
+ cur, 0, input, NULL);
+ WARN_ON_ONCE(!hv_result_success(hv_status));
+ cur = 0;
+ }
+ }
+
+ }
+
+out:
+ local_irq_restore(flags);
+}
+
/*
* hv_mark_gpa_visibility - Set pages visible to host via hvcall.
*
@@ -475,6 +664,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
struct hv_gpa_range_for_visibility *input;
u64 hv_status;
unsigned long flags;
+ int ret;
/* no-op if partition isolation is not enabled */
if (!hv_is_isolation_supported())
@@ -486,6 +676,13 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
return -EINVAL;
}
+ if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+ ret = hv_list_enc_remove(pfn, count);
+ else
+ ret = hv_list_enc_add(pfn, count);
+ if (ret)
+ return ret;
+
local_irq_save(flags);
input = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -506,8 +703,18 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
if (hv_result_success(hv_status))
return 0;
+
+ if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+ ret = hv_list_enc_add(pfn, count);
else
- return -EFAULT;
+ ret = hv_list_enc_remove(pfn, count);
+ /*
+ * There's no good way to recover from -ENOMEM here, the accounting is
+ * wrong either way.
+ */
+ WARN_ON_ONCE(ret);
+
+ return -EFAULT;
}
/*
@@ -669,6 +876,8 @@ void __init hv_vtom_init(void)
x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present;
x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+ x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin;
+ x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish;
/* Set WB as the default cache mode. */
guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
--
2.50.1
* RE: [PATCH v4] x86/hyperv: Fix kdump on Azure CVMs
2025-08-28 9:16 [PATCH v4] x86/hyperv: Fix kdump on Azure CVMs Vitaly Kuznetsov
@ 2025-08-29 17:03 ` Michael Kelley
2025-08-31 17:37 ` Tianyu Lan
0 siblings, 1 reply; 3+ messages in thread
From: Michael Kelley @ 2025-08-29 17:03 UTC (permalink / raw)
To: Vitaly Kuznetsov, linux-hyperv@vger.kernel.org,
linux-coco@lists.linux.dev
Cc: K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui,
x86@kernel.org, linux-kernel@vger.kernel.org, Nuno Das Neves,
Tianyu Lan, Li Tian, Philipp Rudo
From: Vitaly Kuznetsov <vkuznets@redhat.com> Sent: Thursday, August 28, 2025 2:16 AM
> [...]

Looks good!

Adding Confidential Computing mailing list (linux-coco@lists.linux.dev)
for visibility.

Reviewed-by: Michael Kelley <mhklinux@outlook.com>
* Re: [PATCH v4] x86/hyperv: Fix kdump on Azure CVMs
2025-08-29 17:03 ` Michael Kelley
@ 2025-08-31 17:37 ` Tianyu Lan
0 siblings, 0 replies; 3+ messages in thread
From: Tianyu Lan @ 2025-08-31 17:37 UTC (permalink / raw)
To: Michael Kelley
Cc: Vitaly Kuznetsov, linux-hyperv@vger.kernel.org,
linux-coco@lists.linux.dev, K. Y. Srinivasan, Haiyang Zhang,
Wei Liu, Dexuan Cui, x86@kernel.org, linux-kernel@vger.kernel.org,
Nuno Das Neves, Tianyu Lan, Li Tian, Philipp Rudo
On Sat, Aug 30, 2025 at 1:03 AM Michael Kelley <mhklinux@outlook.com> wrote:
> [...]

Reviewed-by: Tianyu Lan <tiala@microsoft.com>
--
Thanks
Tianyu Lan