Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted()
@ 2026-05-21 20:58 Kameron Carr
  2026-05-22  3:14 ` Matthew Wilcox
  2026-06-08 15:37 ` Catalin Marinas
  0 siblings, 2 replies; 7+ messages in thread
From: Kameron Carr @ 2026-05-21 20:58 UTC (permalink / raw)
  To: akpm, urezki; +Cc: linux-mm, linux-kernel, rppt, catalin.marinas, mhklinux

In confidential computing environments (arm64 CCA, x86 SEV/TDX), guest
memory is encrypted by default and must be explicitly transitioned to a
decrypted/shared state for host-visible access.  Calling
set_memory_decrypted() on a vmalloc address is not supported, and not
recommended as it would be inefficient to decrypt the pages after they
have been mapped.

Add vmalloc_decrypted() and vzalloc_decrypted() which decrypt pages on
the linear map before creating the vmalloc mapping via vmap(), so
physical pages are never mapped with conflicting encryption attributes
across aliases.  A new VM_DECRYPTED flag marks these allocations so that
vfree() automatically re-encrypts pages before returning them to the
page allocator.

Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/linux-arm-kernel/ZmNJdSxSz-sYpVgI@arm.com/
Signed-off-by: Kameron Carr <kameroncarr@linux.microsoft.com>
---
 include/linux/vmalloc.h |   7 ++
 mm/vmalloc.c            | 163 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 170 insertions(+)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3b02c0c6b371..d87e1953da55 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -38,6 +38,7 @@ struct iov_iter;		/* in uio.h */
 #define VM_DEFER_KMEMLEAK	0
 #endif
 #define VM_SPARSE		0x00001000	/* sparse vm_area. not all pages are present. */
+#define VM_DECRYPTED		0x00002000	/* pages decrypted for host-shared access, re-encrypt on vfree */
 
 /* bits [20..32] reserved for arch specific ioremap internals */
 
@@ -153,6 +154,12 @@ extern void *vmalloc_noprof(unsigned long size) __alloc_size(1);
 extern void *vzalloc_noprof(unsigned long size) __alloc_size(1);
 #define vzalloc(...)		alloc_hooks(vzalloc_noprof(__VA_ARGS__))
 
+extern void *vmalloc_decrypted_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc_decrypted(...)	alloc_hooks(vmalloc_decrypted_noprof(__VA_ARGS__))
+
+extern void *vzalloc_decrypted_noprof(unsigned long size) __alloc_size(1);
+#define vzalloc_decrypted(...)	alloc_hooks(vzalloc_decrypted_noprof(__VA_ARGS__))
+
 extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1);
 #define vmalloc_user(...)	alloc_hooks(vmalloc_user_noprof(__VA_ARGS__))
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index eabb86b13b7e..0e7f0033aa84 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3416,6 +3416,103 @@ void vfree_atomic(const void *addr)
 		schedule_work(&p->wq);
 }
 
+/*
+ * Transition a single contiguous block of @nr pages at index @idx in
+ * @area->pages to encrypted or decrypted state.  On failure, the block's
+ * page-pointer slots are cleared so the standard free path will not return
+ * the pages to the allocator (they are leaked).
+ */
+static int __vm_pages_enc_dec(struct vm_struct *area, unsigned int idx,
+			      unsigned int nr, bool encrypt)
+{
+	unsigned long addr =
+		(unsigned long)kasan_reset_tag(page_address(area->pages[idx]));
+	int err = encrypt ? set_memory_encrypted(addr, nr) :
+			    set_memory_decrypted(addr, nr);
+
+	if (err)
+		memset(&area->pages[idx], 0, nr * sizeof(*area->pages));
+	return err;
+}
+
+/*
+ * Compact @area->pages, removing slots previously zeroed by
+ * __vm_pages_enc_dec().  Returns the number of leaked pages
+ * (old nr_pages - new nr_pages).
+ */
+static unsigned int vm_compact_leaked_pages(struct vm_struct *area)
+{
+	unsigned int i, dst;
+	unsigned int old_nr = area->nr_pages;
+
+	for (i = 0, dst = 0; i < area->nr_pages; i++) {
+		if (area->pages[i])
+			area->pages[dst++] = area->pages[i];
+	}
+	area->nr_pages = dst;
+	return old_nr - dst;
+}
+
+/*
+ * Re-encrypt the linear-map alias of all pages backing a VM_DECRYPTED area.
+ * Best-effort: on per-block failure the loop continues so as many pages as
+ * possible are returned to the encrypted state.  Pages that fail to
+ * transition are left out of area->pages and leaked.
+ */
+static int vm_pages_encrypt(struct vm_struct *area)
+{
+	unsigned int nr = 1U << vm_area_page_order(area);
+	unsigned int i;
+	unsigned int leaked;
+	int ret = 0;
+
+	for (i = 0; i < area->nr_pages; i += nr) {
+		int err = __vm_pages_enc_dec(area, i, nr, true);
+
+		if (err && !ret)
+			ret = err;
+	}
+
+	leaked = vm_compact_leaked_pages(area);
+	if (leaked)
+		pr_warn("vmalloc: re-encryption failed, leaked %u pages\n",
+			leaked);
+	return ret;
+}
+
+/*
+ * Decrypt the linear-map alias of all pages backing a VM_DECRYPTED area.
+ * On failure, the already-decrypted prefix is rolled back to encrypted.
+ * Pages that fail either the initial decrypt or the rollback re-encrypt are
+ * left out of area->pages and leaked.
+ */
+static int vm_pages_decrypt(struct vm_struct *area)
+{
+	unsigned int nr = 1U << vm_area_page_order(area);
+	unsigned int i;
+	unsigned int leaked;
+	int ret = 0;
+
+	for (i = 0; i < area->nr_pages; i += nr) {
+		ret = __vm_pages_enc_dec(area, i, nr, false);
+		if (ret)
+			goto rollback;
+	}
+	return 0;
+
+rollback:
+	while (i) {
+		i -= nr;
+		__vm_pages_enc_dec(area, i, nr, true);
+	}
+
+	leaked = vm_compact_leaked_pages(area);
+	if (leaked)
+		pr_warn("vmalloc: decryption failed, leaked %u pages\n",
+			leaked);
+	return ret;
+}
+
 /**
  * vfree - Release memory allocated by vmalloc()
  * @addr:  Memory base address
@@ -3457,6 +3554,9 @@ void vfree(const void *addr)
 		return;
 	}
 
+	if (unlikely(vm->flags & VM_DECRYPTED))
+		vm_pages_encrypt(vm);
+
 	if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
 		vm_reset_perms(vm);
 
@@ -3895,6 +3995,22 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		goto fail;
 	}
 
+	/*
+	 * For VM_DECRYPTED areas, decrypt each
+	 * page on the linear map before creating the vmalloc alias.
+	 */
+	if (area->flags & VM_DECRYPTED) {
+		if (vm_pages_decrypt(area)) {
+			/*
+			 * vm_pages_decrypt() re-encrypted what it could;
+			 * clear VM_DECRYPTED so the deferred cleanup path
+			 * doesn't try to re-encrypt again.
+			 */
+			area->flags &= ~VM_DECRYPTED;
+			goto fail;
+		}
+	}
+
 	/*
 	 * page tables allocations ignore external gfp mask, enforce it
 	 * by the scope API
@@ -4203,6 +4319,50 @@ void *vzalloc_noprof(unsigned long size)
 }
 EXPORT_SYMBOL(vzalloc_noprof);
 
+/**
+ * vmalloc_decrypted - allocate virtually contiguous decrypted memory
+ * @size: allocation size
+ *
+ * Allocate pages in decrypted/shared state for host-visible access in
+ * confidential computing environments.  Pages are automatically
+ * re-encrypted on vfree().
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_decrypted_noprof(unsigned long size)
+{
+	return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+					   GFP_KERNEL,
+					   pgprot_decrypted(PAGE_KERNEL),
+					   VM_DECRYPTED, NUMA_NO_NODE,
+					   __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_decrypted_noprof);
+
+/**
+ * vzalloc_decrypted - allocate zeroed virtually contiguous decrypted memory
+ * @size:    allocation size
+ *
+ * Like vmalloc_decrypted(), but the memory is set to zero.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vzalloc_decrypted_noprof(unsigned long size)
+{
+	void *addr;
+
+	addr = __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
+					   GFP_KERNEL,
+					   pgprot_decrypted(PAGE_KERNEL),
+					   VM_DECRYPTED, NUMA_NO_NODE,
+					   __builtin_return_address(0));
+	if (addr)
+		memset(addr, 0, size);
+
+	return addr;
+}
+EXPORT_SYMBOL(vzalloc_decrypted_noprof);
+
 /**
  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
  * @size: allocation size
@@ -5271,6 +5431,9 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
 			if (v->flags & VM_DMA_COHERENT)
 				seq_puts(m, " dma-coherent");
 
+			if (v->flags & VM_DECRYPTED)
+				seq_puts(m, " decrypted");
+
 			if (is_vmalloc_addr(v->pages))
 				seq_puts(m, " vpages");
 

base-commit: e9add7501ad3297dad9b90ce201266830a68ab47
-- 
2.45.4



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted()
  2026-05-21 20:58 [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted() Kameron Carr
@ 2026-05-22  3:14 ` Matthew Wilcox
  2026-06-08 15:37 ` Catalin Marinas
  1 sibling, 0 replies; 7+ messages in thread
From: Matthew Wilcox @ 2026-05-22  3:14 UTC (permalink / raw)
  To: Kameron Carr
  Cc: akpm, urezki, linux-mm, linux-kernel, rppt, catalin.marinas,
	mhklinux

On Thu, May 21, 2026 at 01:58:34PM -0700, Kameron Carr wrote:
> +/*
> + * Transition a single contiguous block of @nr pages at index @idx in

There's no parameter called @nr_pages; you probably meant @nr.

> + * @area->pages to encrypted or decrypted state.  On failure, the block's
> + * page-pointer slots are cleared so the standard free path will not return
> + * the pages to the allocator (they are leaked).
> + */
> +static int __vm_pages_enc_dec(struct vm_struct *area, unsigned int idx,
> +			      unsigned int nr, bool encrypt)

This 'bool encrypt' parameter is an antipattern.  Just split this into
two functions.

> +{
> +	unsigned long addr =
> +		(unsigned long)kasan_reset_tag(page_address(area->pages[idx]));
> +	int err = encrypt ? set_memory_encrypted(addr, nr) :
> +			    set_memory_decrypted(addr, nr);
> +
> +	if (err)
> +		memset(&area->pages[idx], 0, nr * sizeof(*area->pages));
> +	return err;
> +}

Does it really make sense to pass in 'area' and 'idx' rather than
passing in &area->pages[idx]?



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted()
  2026-05-21 20:58 [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted() Kameron Carr
  2026-05-22  3:14 ` Matthew Wilcox
@ 2026-06-08 15:37 ` Catalin Marinas
  2026-06-11 11:49   ` Jason Gunthorpe
  1 sibling, 1 reply; 7+ messages in thread
From: Catalin Marinas @ 2026-06-08 15:37 UTC (permalink / raw)
  To: Kameron Carr
  Cc: akpm, urezki, linux-mm, linux-kernel, rppt, mhklinux, linux-coco,
	Suzuki K Poulose

+ linux-coco, Suzuki (for the arm64 behaviour)

On Thu, May 21, 2026 at 01:58:34PM -0700, Kameron Carr wrote:
> In confidential computing environments (arm64 CCA, x86 SEV/TDX), guest
> memory is encrypted by default and must be explicitly transitioned to a
> decrypted/shared state for host-visible access.  Calling
> set_memory_decrypted() on a vmalloc address is not supported, and not
> recommended as it would be inefficient to decrypt the pages after they
> have been mapped.
> 
> Add vmalloc_decrypted() and vzalloc_decrypted() which decrypt pages on
> the linear map before creating the vmalloc mapping via vmap(), so
> physical pages are never mapped with conflicting encryption attributes
> across aliases.  A new VM_DECRYPTED flag marks these allocations so that
> vfree() automatically re-encrypts pages before returning them to the
> page allocator.
> 
> Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
> Link: https://lore.kernel.org/linux-arm-kernel/ZmNJdSxSz-sYpVgI@arm.com/
> Signed-off-by: Kameron Carr <kameroncarr@linux.microsoft.com>
> ---
>  include/linux/vmalloc.h |   7 ++
>  mm/vmalloc.c            | 163 ++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 170 insertions(+)

There are a few Sashiko comments worth reviewing:

https://sashiko.dev/#/patchset/20260521205834.1012925-1-kameroncarr@linux.microsoft.com

[...]
> +/*
> + * Re-encrypt the linear-map alias of all pages backing a VM_DECRYPTED area.
> + * Best-effort: on per-block failure the loop continues so as many pages as
> + * possible are returned to the encrypted state.  Pages that fail to
> + * transition are left out of area->pages and leaked.
> + */
> +static int vm_pages_encrypt(struct vm_struct *area)
> +{
> +	unsigned int nr = 1U << vm_area_page_order(area);
> +	unsigned int i;
> +	unsigned int leaked;
> +	int ret = 0;
> +
> +	for (i = 0; i < area->nr_pages; i += nr) {
> +		int err = __vm_pages_enc_dec(area, i, nr, true);
> +
> +		if (err && !ret)
> +			ret = err;
> +	}
> +
> +	leaked = vm_compact_leaked_pages(area);
> +	if (leaked)
> +		pr_warn("vmalloc: re-encryption failed, leaked %u pages\n",
> +			leaked);
> +	return ret;
> +}
> +
> +/*
> + * Decrypt the linear-map alias of all pages backing a VM_DECRYPTED area.
> + * On failure, the already-decrypted prefix is rolled back to encrypted.
> + * Pages that fail either the initial decrypt or the rollback re-encrypt are
> + * left out of area->pages and leaked.
> + */
> +static int vm_pages_decrypt(struct vm_struct *area)
> +{
> +	unsigned int nr = 1U << vm_area_page_order(area);
> +	unsigned int i;
> +	unsigned int leaked;
> +	int ret = 0;
> +
> +	for (i = 0; i < area->nr_pages; i += nr) {
> +		ret = __vm_pages_enc_dec(area, i, nr, false);
> +		if (ret)
> +			goto rollback;
> +	}
> +	return 0;
> +
> +rollback:
> +	while (i) {
> +		i -= nr;
> +		__vm_pages_enc_dec(area, i, nr, true);
> +	}
> +
> +	leaked = vm_compact_leaked_pages(area);
> +	if (leaked)
> +		pr_warn("vmalloc: decryption failed, leaked %u pages\n",
> +			leaked);
> +	return ret;
> +}
> +
>  /**
>   * vfree - Release memory allocated by vmalloc()
>   * @addr:  Memory base address
> @@ -3457,6 +3554,9 @@ void vfree(const void *addr)
>  		return;
>  	}
>  
> +	if (unlikely(vm->flags & VM_DECRYPTED))
> +		vm_pages_encrypt(vm);

I think we still have the vmalloc aliases at this point as we lazily
reclaim them. We should call vm_unmap_aliases() before
vm_pages_encrypt(). It matches the x86 __set_memory_enc_pgtable() as
well with the explicit call to vm_unmap_aliases().

The vrealloc() path may have some issues as well but I haven't looked in
detail. Not sure it actually re-allocs decrypted pages. The simplest is
to reject vrealloc() for such vms until we have a use-case.

> +/**
> + * vzalloc_decrypted - allocate zeroed virtually contiguous decrypted memory
> + * @size:    allocation size
> + *
> + * Like vmalloc_decrypted(), but the memory is set to zero.
> + *
> + * Return: pointer to the allocated memory or %NULL on error
> + */
> +void *vzalloc_decrypted_noprof(unsigned long size)
> +{
> +	void *addr;
> +
> +	addr = __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
> +					   GFP_KERNEL,
> +					   pgprot_decrypted(PAGE_KERNEL),
> +					   VM_DECRYPTED, NUMA_NO_NODE,
> +					   __builtin_return_address(0));
> +	if (addr)
> +		memset(addr, 0, size);

Talking to Suzuki, the small window between set_memory_decrypted() and
memset() potentially exposing stale data is safe, at least for Arm CCA
as the memory would be scrubbed (there are other places in the kernel
where we do something similar). I assume that's also the case for other
architectures, although not sure what pKVM does.

-- 
Catalin


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted()
  2026-06-08 15:37 ` Catalin Marinas
@ 2026-06-11 11:49   ` Jason Gunthorpe
  2026-06-12 17:49     ` Catalin Marinas
  0 siblings, 1 reply; 7+ messages in thread
From: Jason Gunthorpe @ 2026-06-11 11:49 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Kameron Carr, akpm, urezki, linux-mm, linux-kernel, rppt,
	mhklinux, linux-coco, Suzuki K Poulose

On Mon, Jun 08, 2026 at 04:37:02PM +0100, Catalin Marinas wrote:
> > +/**
> > + * vzalloc_decrypted - allocate zeroed virtually contiguous decrypted memory
> > + * @size:    allocation size
> > + *
> > + * Like vmalloc_decrypted(), but the memory is set to zero.
> > + *
> > + * Return: pointer to the allocated memory or %NULL on error
> > + */
> > +void *vzalloc_decrypted_noprof(unsigned long size)
> > +{
> > +	void *addr;
> > +
> > +	addr = __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
> > +					   GFP_KERNEL,
> > +					   pgprot_decrypted(PAGE_KERNEL),
> > +					   VM_DECRYPTED, NUMA_NO_NODE,
> > +					   __builtin_return_address(0));
> > +	if (addr)
> > +		memset(addr, 0, size);
> 
> Talking to Suzuki, the small window between set_memory_decrypted() and
> memset() potentially exposing stale data is safe, at least for Arm CCA
> as the memory would be scrubbed (there are other places in the kernel
> where we do something similar). I assume that's also the case for other
> architectures, although not sure what pKVM does.

It seems like a poor practice though, this should probably be
re-organized to use __GFP_ZERO so things are ordered sensibly.

But what is the purpose of this? I guess some hyperv thing - but
shouldn't we have a more structured way to "DMA map" things for the
hypervisor instead of stuff like this? Why can't you use
dma_alloc_coherent() which actually gives you an address that is
sensible to pass to the hypervisor?

Jason


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted()
  2026-06-11 11:49   ` Jason Gunthorpe
@ 2026-06-12 17:49     ` Catalin Marinas
  2026-06-12 18:18       ` Jason Gunthorpe
  0 siblings, 1 reply; 7+ messages in thread
From: Catalin Marinas @ 2026-06-12 17:49 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Kameron Carr, akpm, urezki, linux-mm, linux-kernel, rppt,
	mhklinux, linux-coco, Suzuki K Poulose

On Thu, Jun 11, 2026 at 08:49:54AM -0300, Jason Gunthorpe wrote:
> On Mon, Jun 08, 2026 at 04:37:02PM +0100, Catalin Marinas wrote:
> > > +/**
> > > + * vzalloc_decrypted - allocate zeroed virtually contiguous decrypted memory
> > > + * @size:    allocation size
> > > + *
> > > + * Like vmalloc_decrypted(), but the memory is set to zero.
> > > + *
> > > + * Return: pointer to the allocated memory or %NULL on error
> > > + */
> > > +void *vzalloc_decrypted_noprof(unsigned long size)
> > > +{
> > > +	void *addr;
> > > +
> > > +	addr = __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
> > > +					   GFP_KERNEL,
> > > +					   pgprot_decrypted(PAGE_KERNEL),
> > > +					   VM_DECRYPTED, NUMA_NO_NODE,
> > > +					   __builtin_return_address(0));
> > > +	if (addr)
> > > +		memset(addr, 0, size);
> > 
> > Talking to Suzuki, the small window between set_memory_decrypted() and
> > memset() potentially exposing stale data is safe, at least for Arm CCA
> > as the memory would be scrubbed (there are other places in the kernel
> > where we do something similar). I assume that's also the case for other
> > architectures, although not sure what pKVM does.
> 
> It seems like a poor practice though, this should probably be
> re-organized to use __GFP_ZERO so things are ordered sensibly.

__GFP_ZERO doesn't work if the intermediate set_memory_decrypted()
mangles the data (e.g. changes encryption keys) and it no longer reads
as zeros.

> But what is the purpose of this? I guess some hyperv thing - but
> shouldn't we have a more structured way to "DMA map" things for the
> hypervisor instead of stuff like this? Why can't you use
> dma_alloc_coherent() which actually gives you an address that is
> sensible to pass to the hypervisor?

IIRC netvsc_init_buf() uses vzalloc() to allocate some memory and that
buffer ends up in set_memory_decrypted() via vmbus_establish_gpadl().
arm64 does not support changing the decrypted/shared attribute of
vmalloc mappings and I don't think we should add it. Better to just
allocate it properly upfront.

We might be able to use the DMA API but we won't get something like
vmalloc() - physically non-contiguous. I think dma_alloc_noncontiguous()
just falls back to dma_direct_alloc_pages() in the absence of an iommu.

-- 
Catalin


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted()
  2026-06-12 17:49     ` Catalin Marinas
@ 2026-06-12 18:18       ` Jason Gunthorpe
  2026-06-12 19:06         ` Michael Kelley
  0 siblings, 1 reply; 7+ messages in thread
From: Jason Gunthorpe @ 2026-06-12 18:18 UTC (permalink / raw)
  To: Catalin Marinas, Christoph Hellwig
  Cc: Kameron Carr, akpm, urezki, linux-mm, linux-kernel, rppt,
	mhklinux, linux-coco, Suzuki K Poulose

On Fri, Jun 12, 2026 at 06:49:28PM +0100, Catalin Marinas wrote:
> On Thu, Jun 11, 2026 at 08:49:54AM -0300, Jason Gunthorpe wrote:
> > On Mon, Jun 08, 2026 at 04:37:02PM +0100, Catalin Marinas wrote:
> > > > +/**
> > > > + * vzalloc_decrypted - allocate zeroed virtually contiguous decrypted memory
> > > > + * @size:    allocation size
> > > > + *
> > > > + * Like vmalloc_decrypted(), but the memory is set to zero.
> > > > + *
> > > > + * Return: pointer to the allocated memory or %NULL on error
> > > > + */
> > > > +void *vzalloc_decrypted_noprof(unsigned long size)
> > > > +{
> > > > +	void *addr;
> > > > +
> > > > +	addr = __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
> > > > +					   GFP_KERNEL,
> > > > +					   pgprot_decrypted(PAGE_KERNEL),
> > > > +					   VM_DECRYPTED, NUMA_NO_NODE,
> > > > +					   __builtin_return_address(0));
> > > > +	if (addr)
> > > > +		memset(addr, 0, size);
> > > 
> > > Talking to Suzuki, the small window between set_memory_decrypted() and
> > > memset() potentially exposing stale data is safe, at least for Arm CCA
> > > as the memory would be scrubbed (there are other places in the kernel
> > > where we do something similar). I assume that's also the case for other
> > > architectures, although not sure what pKVM does.
> > 
> > It seems like a poor practice though, this should probably be
> > re-organized to use __GFP_ZERO so things are ordered sensibly.
> 
> __GFP_ZERO doesn't work if the intermediate set_memory_decrypted()
> mangles the data (e.g. changes encryption keys) and it no longer reads
> as zeros.

I thought arches are either preserving the memory content or zeroing
it, you are saying some arch leaves it as garbage? I'd argue that's an
arch bug and they should clear it in their path.

Otherwise this sharp edge is not documented and we have many other
places getting it wrong, eg system_heap_allocate() doesn't re-zero the
memory after decrypting it.

> > But what is the purpose of this? I guess some hyperv thing - but
> > shouldn't we have a more structured way to "DMA map" things for the
> > hypervisor instead of stuff like this? Why can't you use
> > dma_alloc_coherent() which actually gives you an address that is
> > sensible to pass to the hypervisor?
> 
> IIRC netvsc_init_buf() uses vzalloc() to allocate some memory and that
> buffer ends up in set_memory_decrypted() via vmbus_establish_gpadl().
> arm64 does not support changing the decrypted/shared attribute of
> vmalloc mappings and I don't think we should add it. Better to just
> allocate it properly upfront.

Sure
 
> We might be able to use the DMA API but we won't get something like
> vmalloc() - physically non-contiguous. 

The entry point is dma_alloc_noncontiguous() and you get a scatterlist
back.

> I think dma_alloc_noncontiguous() just falls back to
> dma_direct_alloc_pages() in the absence of an iommu.

In all cases you get a scatterlist with a CPU list and a DMA
list. iommu gives a smaller DMA list.

If you want a vmap then you can feed that CPU page list from the sgl
into vmap().

A dma_alloc_noncontiguous_vmap() helper would not be hard to make, and
IMHO, would make a lot more sense for hyperv to treat the memory access
from the hypervisor as "DMA" instead of trying to re-invent the DMA
API.. :\

HCH was already saying we should not be allowing drivers to use
set_memory_decrypted() at all, and hyperv is the biggest non-core user
right now...

Jason


^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted()
  2026-06-12 18:18       ` Jason Gunthorpe
@ 2026-06-12 19:06         ` Michael Kelley
  0 siblings, 0 replies; 7+ messages in thread
From: Michael Kelley @ 2026-06-12 19:06 UTC (permalink / raw)
  To: Jason Gunthorpe, Catalin Marinas, Christoph Hellwig
  Cc: Kameron Carr, akpm@linux-foundation.org, urezki@gmail.com,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org, rppt@kernel.org,
	Michael Kelley, linux-coco@lists.linux.dev, Suzuki K Poulose

From: Jason Gunthorpe <jgg@ziepe.ca> Sent: Friday, June 12, 2026 11:18 AM
> 
> On Fri, Jun 12, 2026 at 06:49:28PM +0100, Catalin Marinas wrote:
> > On Thu, Jun 11, 2026 at 08:49:54AM -0300, Jason Gunthorpe wrote:
> > > On Mon, Jun 08, 2026 at 04:37:02PM +0100, Catalin Marinas wrote:
> > > > > +/**
> > > > > + * vzalloc_decrypted - allocate zeroed virtually contiguous decrypted memory
> > > > > + * @size:    allocation size
> > > > > + *
> > > > > + * Like vmalloc_decrypted(), but the memory is set to zero.
> > > > > + *
> > > > > + * Return: pointer to the allocated memory or %NULL on error
> > > > > + */
> > > > > +void *vzalloc_decrypted_noprof(unsigned long size)
> > > > > +{
> > > > > +	void *addr;
> > > > > +
> > > > > +	addr = __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
> > > > > +					   GFP_KERNEL,
> > > > > +					   pgprot_decrypted(PAGE_KERNEL),
> > > > > +					   VM_DECRYPTED, NUMA_NO_NODE,
> > > > > +					   __builtin_return_address(0));
> > > > > +	if (addr)
> > > > > +		memset(addr, 0, size);
> > > >
> > > > Talking to Suzuki, the small window between set_memory_decrypted() and
> > > > memset() potentially exposing stale data is safe, at least for Arm CCA
> > > > as the memory would be scrubbed (there are other places in the kernel
> > > > where we do something similar). I assume that's also the case for other
> > > > architectures, although not sure what pKVM does.
> > >
> > > It seems like a poor practice though, this should probably be
> > > re-organized to use __GFP_ZERO so things are ordered sensibly.
> >
> > __GFP_ZERO doesn't work if the intermediate set_memory_decrypted()
> > mangles the data (e.g. changes encryption keys) and it no longer reads
> > as zeros.
> 
> I thought arches are either preserving the memory content or zeroing
> it, you are saying some arch leaves it as garbage? I'd argue that's an
> arch bug and they should clear it in their path.

AMD SEV-SNP leaves the memory contents as garbage after an encryption
or decryption state change. On the flip side, my understanding has been
that TDX zeroes the memory (or at least has an option to do so) after
such a state change, though a couple of AI chats say TDX also leaves
garbage. To be sure, I'd have to run an experiment to check in a TDX
guest on Hyper-V.

> 
> Otherwise this sharp edge is not documented and we have many other
> places getting it wrong, eg system_heap_allocate() doesn't re-zero the
> memory after decrypting it.

In the Hyper-V code that uses set_memory_decrypted()/encrypted(),
there's always an explicit call to set the memory to zero afterwards.

Michael

> 
> > > But what is the purpose of this? I guess some hyperv thing - but
> > > shouldn't we have a more structured way to "DMA map" things for the
> > > hypervisor instead of stuff like this? Why can't you use
> > > dma_alloc_coherent() which actually gives you an address that is
> > > sensible to pass to the hypervisor?
> >
> > IIRC netvsc_init_buf() uses vzalloc() to allocate some memory and that
> > buffer ends up in set_memory_decrypted() via vmbus_establish_gpadl().
> > arm64 does not support changing the decrypted/shared attribute of
> > vmalloc mappings and I don't think we should add it. Better to just
> > allocate it properly upfront.
> 
> Sure
> 
> > We might be able to use the DMA API but we won't get something like
> > vmalloc() - physically non-contiguous.
> 
> The entry point is dma_alloc_noncontiguous() and you get a scatterlist
> back.
> 
> > I think dma_alloc_noncontiguous() just falls back to
> > dma_direct_alloc_pages() in the absence of an iommu.
> 
> In all cases you get a scatterlist with a CPU list and a DMA
> list. iommu gives a smaller DMA list.
> 
> If you want a vmap then you can feed that CPU page list from the sgl
> into vmap().
> 
> A dma_alloc_noncontiguous_vmap() helper would not be hard to make, and
> IMHO, would make a lot more sense for hyperv to treat the memory access
> from the hypervisor as "DMA" instead of trying to re-invent the DMA
> API.. :\
> 
> HCH was already saying we should not be allowing drivers to use
> set_memory_decrypted() at all, and hyperv is the biggest non-core user
> right now...
> 
> Jason



^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2026-06-12 19:06 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-05-21 20:58 [RFC PATCH] mm/vmalloc: add vmalloc_decrypted() and vzalloc_decrypted() Kameron Carr
2026-05-22  3:14 ` Matthew Wilcox
2026-06-08 15:37 ` Catalin Marinas
2026-06-11 11:49   ` Jason Gunthorpe
2026-06-12 17:49     ` Catalin Marinas
2026-06-12 18:18       ` Jason Gunthorpe
2026-06-12 19:06         ` Michael Kelley

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox