All of lore.kernel.org
 help / color / mirror / Atom feed
From: Gavin Shan <gshan@redhat.com>
To: Fuad Tabba <tabba@google.com>,
	kvm@vger.kernel.org, linux-arm-msm@vger.kernel.org,
	linux-mm@kvack.org
Cc: pbonzini@redhat.com, chenhuacai@kernel.org, mpe@ellerman.id.au,
	anup@brainfault.org, paul.walmsley@sifive.com,
	palmer@dabbelt.com, aou@eecs.berkeley.edu, seanjc@google.com,
	viro@zeniv.linux.org.uk, brauner@kernel.org, willy@infradead.org,
	akpm@linux-foundation.org, xiaoyao.li@intel.com,
	yilun.xu@intel.com, chao.p.peng@linux.intel.com,
	jarkko@kernel.org, amoorthy@google.com, dmatlack@google.com,
	isaku.yamahata@intel.com, mic@digikod.net, vbabka@suse.cz,
	vannapurve@google.com, ackerleytng@google.com,
	mail@maciej.szmigiero.name, david@redhat.com,
	michael.roth@amd.com, wei.w.wang@intel.com,
	liam.merwick@oracle.com, isaku.yamahata@gmail.com,
	kirill.shutemov@linux.intel.com, suzuki.poulose@arm.com,
	steven.price@arm.com, quic_eberman@quicinc.com,
	quic_mnalajal@quicinc.com, quic_tsoni@quicinc.com,
	quic_svaddagi@quicinc.com, quic_cvanscha@quicinc.com,
	quic_pderrin@quicinc.com, quic_pheragu@quicinc.com,
	catalin.marinas@arm.com, james.morse@arm.com,
	yuzenghui@huawei.com, oliver.upton@linux.dev, maz@kernel.org,
	will@kernel.org, qperret@google.com, keirf@google.com,
	roypat@amazon.co.uk, shuah@kernel.org, hch@infradead.org,
	jgg@nvidia.com, rientjes@google.com, jhubbard@nvidia.com,
	fvdl@google.com, hughd@google.com, jthoughton@google.com,
	peterx@redhat.com, pankaj.gupta@amd.com, ira.weiny@intel.com
Subject: Re: [PATCH v10 08/16] KVM: guest_memfd: Allow host to map guest_memfd pages
Date: Thu, 5 Jun 2025 16:40:58 +1000	[thread overview]
Message-ID: <a3d6ff25-236b-4dfd-8a04-6df437ecb4bb@redhat.com> (raw)
In-Reply-To: <20250527180245.1413463-9-tabba@google.com>

Hi Fuad,

On 5/28/25 4:02 AM, Fuad Tabba wrote:
> This patch enables support for shared memory in guest_memfd, including
> mapping that memory at the host userspace. This support is gated by the
> configuration option KVM_GMEM_SHARED_MEM, and toggled by the guest_memfd
> flag GUEST_MEMFD_FLAG_SUPPORT_SHARED, which can be set when creating a
> guest_memfd instance.
> 
> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> ---
>   arch/x86/include/asm/kvm_host.h | 10 ++++
>   arch/x86/kvm/x86.c              |  3 +-
>   include/linux/kvm_host.h        | 13 ++++++
>   include/uapi/linux/kvm.h        |  1 +
>   virt/kvm/Kconfig                |  5 ++
>   virt/kvm/guest_memfd.c          | 81 +++++++++++++++++++++++++++++++++
>   6 files changed, 112 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 709cc2a7ba66..ce9ad4cd93c5 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -2255,8 +2255,18 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>   
>   #ifdef CONFIG_KVM_GMEM
>   #define kvm_arch_supports_gmem(kvm) ((kvm)->arch.supports_gmem)
> +
> +/*
> + * CoCo VMs with hardware support that use guest_memfd only for backing private
> + * memory, e.g., TDX, cannot use guest_memfd with userspace mapping enabled.
> + */
> +#define kvm_arch_supports_gmem_shared_mem(kvm)			\
> +	(IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) &&			\
> +	 ((kvm)->arch.vm_type == KVM_X86_SW_PROTECTED_VM ||		\
> +	  (kvm)->arch.vm_type == KVM_X86_DEFAULT_VM))
>   #else
>   #define kvm_arch_supports_gmem(kvm) false
> +#define kvm_arch_supports_gmem_shared_mem(kvm) false
>   #endif
>   
>   #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 035ced06b2dd..2a02f2457c42 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -12718,7 +12718,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>   		return -EINVAL;
>   
>   	kvm->arch.vm_type = type;
> -	kvm->arch.supports_gmem = (type == KVM_X86_SW_PROTECTED_VM);
> +	kvm->arch.supports_gmem =
> +		type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
>   	/* Decided by the vendor code for other VM types.  */
>   	kvm->arch.pre_fault_allowed =
>   		type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 80371475818f..ba83547e62b0 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -729,6 +729,19 @@ static inline bool kvm_arch_supports_gmem(struct kvm *kvm)
>   }
>   #endif
>   
> +/*
> + * Returns true if this VM supports shared mem in guest_memfd.
> + *
> + * Arch code must define kvm_arch_supports_gmem_shared_mem if support for
> + * guest_memfd is enabled.
> + */
> +#if !defined(kvm_arch_supports_gmem_shared_mem) && !IS_ENABLED(CONFIG_KVM_GMEM)
> +static inline bool kvm_arch_supports_gmem_shared_mem(struct kvm *kvm)
> +{
> +	return false;
> +}
> +#endif
> +
>   #ifndef kvm_arch_has_readonly_mem
>   static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
>   {
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index b6ae8ad8934b..c2714c9d1a0e 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1566,6 +1566,7 @@ struct kvm_memory_attributes {
>   #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>   
>   #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> +#define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1ULL << 0)
>   
>   struct kvm_create_guest_memfd {
>   	__u64 size;
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 559c93ad90be..df225298ab10 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -128,3 +128,8 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
>   config HAVE_KVM_ARCH_GMEM_INVALIDATE
>          bool
>          depends on KVM_GMEM
> +
> +config KVM_GMEM_SHARED_MEM
> +       select KVM_GMEM
> +       bool
> +       prompt "Enable support for non-private (shared) memory in guest_memfd"
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 6db515833f61..5d34712f64fc 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -312,7 +312,81 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
>   	return gfn - slot->base_gfn + slot->gmem.pgoff;
>   }
>   
> +static bool kvm_gmem_supports_shared(struct inode *inode)
> +{
> +	u64 flags;
> +
> +	if (!IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM))
> +		return false;
> +
> +	flags = (u64)inode->i_private;
> +
> +	return flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED;
> +}
> +
> +
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
> +{
> +	struct inode *inode = file_inode(vmf->vma->vm_file);
> +	struct folio *folio;
> +	vm_fault_t ret = VM_FAULT_LOCKED;
> +
> +	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> +	if (IS_ERR(folio)) {
> +		int err = PTR_ERR(folio);
> +
> +		if (err == -EAGAIN)
> +			return VM_FAULT_RETRY;
> +
> +		return vmf_error(err);
> +	}
> +
> +	if (WARN_ON_ONCE(folio_test_large(folio))) {
> +		ret = VM_FAULT_SIGBUS;
> +		goto out_folio;
> +	}
> +
> +	if (!folio_test_uptodate(folio)) {
> +		clear_highpage(folio_page(folio, 0));
> +		kvm_gmem_mark_prepared(folio);
> +	}
> +
> +	vmf->page = folio_file_page(folio, vmf->pgoff);
> +
> +out_folio:
> +	if (ret != VM_FAULT_LOCKED) {
> +		folio_unlock(folio);
> +		folio_put(folio);
> +	}
> +
> +	return ret;
> +}
> +
> +static const struct vm_operations_struct kvm_gmem_vm_ops = {
> +	.fault = kvm_gmem_fault_shared,
> +};
> +
> +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	if (!kvm_gmem_supports_shared(file_inode(file)))
> +		return -ENODEV;
> +
> +	if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
> +	    (VM_SHARED | VM_MAYSHARE)) {
> +		return -EINVAL;
> +	}
> +
> +	vma->vm_ops = &kvm_gmem_vm_ops;
> +
> +	return 0;
> +}
> +#else
> +#define kvm_gmem_mmap NULL
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> +
>   static struct file_operations kvm_gmem_fops = {
> +	.mmap		= kvm_gmem_mmap,
>   	.open		= generic_file_open,
>   	.release	= kvm_gmem_release,
>   	.fallocate	= kvm_gmem_fallocate,
> @@ -463,6 +537,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
>   	u64 flags = args->flags;
>   	u64 valid_flags = 0;
>   

It seems there is an uncovered corner case, which exists in current code (not directly
caused by this patch): After .mmap is hooked, the address space (inode->i_mapping) is
exposed to user space for futher requests like madvise().madvise(MADV_COLLAPSE) can
potentially collapse the pages to a huge page (folio) with the following assumptions.
It's not the expected behavior since huge page isn't supported yet.

- CONFIG_READ_ONLY_THP_FOR_FS = y
- the folios in the pagecache have been fully populated, it can be done by kvm_gmem_fallocate()
   or kvm_gmem_get_pfn().
- mmap(0x00000f0100000000, ..., MAP_FIXED_NOREPLACE) on the guest-memfd, and then do
   madvise(buf, size, MADV_COLLAPSE).

sys_madvise
   do_madvise
     madvise_do_behavior
       madvise_vma_behavior
         madvise_collapse
           thp_vma_allowable_order
             file_thp_enabled             // need to return false to bail from the path earlier at least
           hpage_collapse_scan_file
           collapse_pte_mapped_thp

The fix would be to increase inode->i_writecount using allow_write_access() in
__kvm_gmem_create() to break the check done by file_thp_enabled().

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 0cd12f94958b..fe706c9f21cf 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -502,6 +502,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
         }
  
         file->f_flags |= O_LARGEFILE;
+       allow_write_access(file);


> +	if (kvm_arch_supports_gmem_shared_mem(kvm))
> +		valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
> +
>   	if (flags & ~valid_flags)
>   		return -EINVAL;
>   
> @@ -501,6 +578,10 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
>   	    offset + size > i_size_read(inode))
>   		goto err;
>   
> +	if (kvm_gmem_supports_shared(inode) &&
> +	    !kvm_arch_supports_gmem_shared_mem(kvm))
> +		goto err;
> +
>   	filemap_invalidate_lock(inode->i_mapping);
>   
>   	start = offset >> PAGE_SHIFT;

Thanks,
Gavin


  parent reply	other threads:[~2025-06-05  6:41 UTC|newest]

Thread overview: 62+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-27 18:02 [PATCH v10 00/16] KVM: Mapping guest_memfd backed memory at the host for software protected VMs Fuad Tabba
2025-05-27 18:02 ` [PATCH v10 01/16] KVM: Rename CONFIG_KVM_PRIVATE_MEM to CONFIG_KVM_GMEM Fuad Tabba
2025-05-31 19:05   ` Shivank Garg
2025-06-05  8:19   ` Vlastimil Babka
2025-05-27 18:02 ` [PATCH v10 02/16] KVM: Rename CONFIG_KVM_GENERIC_PRIVATE_MEM to CONFIG_KVM_GENERIC_GMEM_POPULATE Fuad Tabba
2025-05-31 19:07   ` Shivank Garg
2025-06-05  8:19   ` Vlastimil Babka
2025-05-27 18:02 ` [PATCH v10 03/16] KVM: Rename kvm_arch_has_private_mem() to kvm_arch_supports_gmem() Fuad Tabba
2025-05-31 19:12   ` Shivank Garg
2025-06-05  8:20   ` Vlastimil Babka
2025-05-27 18:02 ` [PATCH v10 04/16] KVM: x86: Rename kvm->arch.has_private_mem to kvm->arch.supports_gmem Fuad Tabba
2025-05-31 19:13   ` Shivank Garg
2025-06-05  8:21   ` Vlastimil Babka
2025-05-27 18:02 ` [PATCH v10 05/16] KVM: Rename kvm_slot_can_be_private() to kvm_slot_has_gmem() Fuad Tabba
2025-05-31 19:13   ` Shivank Garg
2025-06-05  8:22   ` Vlastimil Babka
2025-05-27 18:02 ` [PATCH v10 06/16] KVM: Fix comments that refer to slots_lock Fuad Tabba
2025-05-31 19:14   ` Shivank Garg
2025-06-05  8:22   ` Vlastimil Babka
2025-05-27 18:02 ` [PATCH v10 07/16] KVM: Fix comment that refers to kvm uapi header path Fuad Tabba
2025-05-31 19:19   ` Shivank Garg
2025-06-02 10:10     ` Fuad Tabba
2025-06-02 10:23   ` David Hildenbrand
2025-06-04  9:00   ` Gavin Shan
2025-06-05  8:22   ` Vlastimil Babka
2025-05-27 18:02 ` [PATCH v10 08/16] KVM: guest_memfd: Allow host to map guest_memfd pages Fuad Tabba
2025-05-28 23:17   ` kernel test robot
2025-06-02 10:05   ` Fuad Tabba
2025-06-02 10:43   ` Shivank Garg
2025-06-02 11:13     ` Fuad Tabba
2025-06-04  6:02   ` Gavin Shan
2025-06-04  8:37     ` Fuad Tabba
2025-06-04 12:26   ` David Hildenbrand
2025-06-04 12:32     ` Fuad Tabba
2025-06-04 13:02       ` David Hildenbrand
2025-06-05  6:40   ` Gavin Shan [this message]
2025-06-05  8:25     ` Fuad Tabba
2025-06-05  9:53       ` Gavin Shan
2025-06-05  8:28   ` Vlastimil Babka
2025-06-05  8:44     ` Fuad Tabba
2025-05-27 18:02 ` [PATCH v10 09/16] KVM: guest_memfd: Track shared memory support in memslot Fuad Tabba
2025-06-04 12:25   ` David Hildenbrand
2025-06-04 12:31     ` Fuad Tabba
2025-05-27 18:02 ` [PATCH v10 10/16] KVM: x86/mmu: Handle guest page faults for guest_memfd with shared memory Fuad Tabba
2025-05-27 18:02 ` [PATCH v10 11/16] KVM: x86: Compute max_mapping_level with input from guest_memfd Fuad Tabba
2025-06-04 13:09   ` David Hildenbrand
2025-05-27 18:02 ` [PATCH v10 12/16] KVM: arm64: Refactor user_mem_abort() calculation of force_pte Fuad Tabba
2025-06-04  6:05   ` Gavin Shan
2025-05-27 18:02 ` [PATCH v10 13/16] KVM: arm64: Handle guest_memfd-backed guest page faults Fuad Tabba
2025-06-04 13:17   ` David Hildenbrand
2025-06-04 13:30     ` Fuad Tabba
2025-06-04 13:33       ` David Hildenbrand
2025-05-27 18:02 ` [PATCH v10 14/16] KVM: arm64: Enable mapping guest_memfd in arm64 Fuad Tabba
2025-06-04 13:17   ` David Hildenbrand
2025-06-04 13:31     ` Fuad Tabba
2025-05-27 18:02 ` [PATCH v10 15/16] KVM: Introduce the KVM capability KVM_CAP_GMEM_SHARED_MEM Fuad Tabba
2025-06-05  9:59   ` Gavin Shan
2025-05-27 18:02 ` [PATCH v10 16/16] KVM: selftests: guest_memfd mmap() test when mapping is allowed Fuad Tabba
2025-06-04  9:19   ` Gavin Shan
2025-06-04  9:48     ` Fuad Tabba
2025-06-04 10:05       ` Gavin Shan
2025-06-04 10:25         ` Fuad Tabba

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a3d6ff25-236b-4dfd-8a04-6df437ecb4bb@redhat.com \
    --to=gshan@redhat.com \
    --cc=ackerleytng@google.com \
    --cc=akpm@linux-foundation.org \
    --cc=amoorthy@google.com \
    --cc=anup@brainfault.org \
    --cc=aou@eecs.berkeley.edu \
    --cc=brauner@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=chao.p.peng@linux.intel.com \
    --cc=chenhuacai@kernel.org \
    --cc=david@redhat.com \
    --cc=dmatlack@google.com \
    --cc=fvdl@google.com \
    --cc=hch@infradead.org \
    --cc=hughd@google.com \
    --cc=ira.weiny@intel.com \
    --cc=isaku.yamahata@gmail.com \
    --cc=isaku.yamahata@intel.com \
    --cc=james.morse@arm.com \
    --cc=jarkko@kernel.org \
    --cc=jgg@nvidia.com \
    --cc=jhubbard@nvidia.com \
    --cc=jthoughton@google.com \
    --cc=keirf@google.com \
    --cc=kirill.shutemov@linux.intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=liam.merwick@oracle.com \
    --cc=linux-arm-msm@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mail@maciej.szmigiero.name \
    --cc=maz@kernel.org \
    --cc=mic@digikod.net \
    --cc=michael.roth@amd.com \
    --cc=mpe@ellerman.id.au \
    --cc=oliver.upton@linux.dev \
    --cc=palmer@dabbelt.com \
    --cc=pankaj.gupta@amd.com \
    --cc=paul.walmsley@sifive.com \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=qperret@google.com \
    --cc=quic_cvanscha@quicinc.com \
    --cc=quic_eberman@quicinc.com \
    --cc=quic_mnalajal@quicinc.com \
    --cc=quic_pderrin@quicinc.com \
    --cc=quic_pheragu@quicinc.com \
    --cc=quic_svaddagi@quicinc.com \
    --cc=quic_tsoni@quicinc.com \
    --cc=rientjes@google.com \
    --cc=roypat@amazon.co.uk \
    --cc=seanjc@google.com \
    --cc=shuah@kernel.org \
    --cc=steven.price@arm.com \
    --cc=suzuki.poulose@arm.com \
    --cc=tabba@google.com \
    --cc=vannapurve@google.com \
    --cc=vbabka@suse.cz \
    --cc=viro@zeniv.linux.org.uk \
    --cc=wei.w.wang@intel.com \
    --cc=will@kernel.org \
    --cc=willy@infradead.org \
    --cc=xiaoyao.li@intel.com \
    --cc=yilun.xu@intel.com \
    --cc=yuzenghui@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.