All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Garg, Shivank" <shivankg@amd.com>
To: seanjc@google.com, david@redhat.com, vbabka@suse.cz,
	willy@infradead.org, akpm@linux-foundation.org, shuah@kernel.org,
	pbonzini@redhat.com, brauner@kernel.org, viro@zeniv.linux.org.uk
Cc: ackerleytng@google.com, paul@paul-moore.com, jmorris@namei.org,
	serge@hallyn.com, pvorel@suse.cz, bfoster@redhat.com,
	tabba@google.com, vannapurve@google.com, chao.gao@intel.com,
	bharata@amd.com, nikunj@amd.com, michael.day@amd.com,
	shdhiman@amd.com, yan.y.zhao@intel.com, Neeraj.Upadhyay@amd.com,
	thomas.lendacky@amd.com, michael.roth@amd.com, aik@amd.com,
	jgg@nvidia.com, kalyazin@amazon.com, peterx@redhat.com,
	jack@suse.cz, rppt@kernel.org, hch@infradead.org,
	cgzones@googlemail.com, ira.weiny@intel.com, rientjes@google.com,
	roypat@amazon.co.uk, ziy@nvidia.com, matthew.brost@intel.com,
	joshua.hahnjy@gmail.com, rakie.kim@sk.com, byungchul@sk.com,
	gourry@gourry.net, kent.overstreet@linux.dev,
	ying.huang@linux.alibaba.com, apopple@nvidia.com,
	chao.p.peng@intel.com, amit@infradead.org, ddutile@redhat.com,
	dan.j.williams@intel.com, ashish.kalra@amd.com, gshan@redhat.com,
	jgowans@amazon.com, pankaj.gupta@amd.com, papaluri@amd.com,
	yuzhao@google.com, suzuki.poulose@arm.com,
	quic_eberman@quicinc.com, aneeshkumar.kizhakeveetil@arm.com,
	linux-fsdevel@vger.kernel.org, linux-mm@kvack.org,
	linux-kernel@vger.kernel.org,
	linux-security-module@vger.kernel.org, kvm@vger.kernel.org,
	linux-kselftest@vger.kernel.org, linux-coco@lists.linux.dev
Subject: Re: [PATCH RFC V10 5/7] KVM: guest_memfd: Add slab-allocated inode cache
Date: Wed, 13 Aug 2025 11:40:50 +0530	[thread overview]
Message-ID: <e7f7703d-fe76-4ab2-bef4-8d4c54da03ad@amd.com> (raw)
In-Reply-To: <20250811090605.16057-11-shivankg@amd.com>



On 8/11/2025 2:36 PM, Shivank Garg wrote:
> Add dedicated inode structure (kvm_gmem_inode_info) and slab-allocated
> inode cache for guest memory backing, similar to how shmem handles inodes.
> 
> This adds the necessary allocation/destruction functions and prepares
> for upcoming guest_memfd NUMA policy support changes.
> 
> Signed-off-by: Shivank Garg <shivankg@amd.com>
> ---
>  virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 67 insertions(+), 2 deletions(-)
> 
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 0e93323fc839..d9c23401e770 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -17,6 +17,15 @@ struct kvm_gmem {
>  	struct list_head entry;
>  };
>  
> +struct kvm_gmem_inode_info {
> +	struct inode vfs_inode;
> +};
> +
> +static inline struct kvm_gmem_inode_info *KVM_GMEM_I(struct inode *inode)
> +{
> +	return container_of(inode, struct kvm_gmem_inode_info, vfs_inode);
> +}
> +
>  /**
>   * folio_file_pfn - like folio_file_page, but return a pfn.
>   * @folio: The folio which contains this index.
> @@ -389,13 +398,46 @@ static struct file_operations kvm_gmem_fops = {
>  	.fallocate	= kvm_gmem_fallocate,
>  };
>  
> +static struct kmem_cache *kvm_gmem_inode_cachep;
> +
> +static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
> +{
> +	struct kvm_gmem_inode_info *info;
> +
> +	info = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
> +	if (!info)
> +		return NULL;
> +
> +	return &info->vfs_inode;
> +}
> +
> +static void kvm_gmem_destroy_inode(struct inode *inode)
> +{
> +}
> +
> +static void kvm_gmem_free_inode(struct inode *inode)
> +{
> +	kmem_cache_free(kvm_gmem_inode_cachep, KVM_GMEM_I(inode));
> +}
> +
> +static const struct super_operations kvm_gmem_super_operations = {
> +	.statfs		= simple_statfs,
> +	.alloc_inode	= kvm_gmem_alloc_inode,
> +	.destroy_inode	= kvm_gmem_destroy_inode,
> +	.free_inode	= kvm_gmem_free_inode,
> +};
> +
>  static int kvm_gmem_init_fs_context(struct fs_context *fc)
>  {
> +	struct pseudo_fs_context *ctx;
> +
>  	if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
>  		return -ENOMEM;
>  
>  	fc->s_iflags |= SB_I_NOEXEC;
>  	fc->s_iflags |= SB_I_NODEV;
> +	ctx = fc->fs_private;
> +	ctx->ops = &kvm_gmem_super_operations;
>  
>  	return 0;
>  }
> @@ -417,17 +459,40 @@ static int kvm_gmem_init_mount(void)
>  	return 0;
>  }
>  
> +static void kvm_gmem_init_inode(void *foo)
> +{
> +	struct kvm_gmem_inode_info *info = foo;
> +
> +	inode_init_once(&info->vfs_inode);
> +}
> +
>  int kvm_gmem_init(struct module *module)
>  {
> -	kvm_gmem_fops.owner = module;
> +	int ret;
> +	struct kmem_cache_args args = {
> +		.align = 0,
> +		.ctor = kvm_gmem_init_inode,
> +	};
>  
> -	return kvm_gmem_init_mount();
> +	kvm_gmem_fops.owner = module;
> +	kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
> +						  sizeof(struct kvm_gmem_inode_info),
> +						  &args, SLAB_ACCOUNT);
> +	if (!kvm_gmem_inode_cachep)
> +		return -ENOMEM;
> +	ret = kvm_gmem_init_mount();
> +	if (ret) {
> +		kmem_cache_destroy(kvm_gmem_inode_cachep);
> +		return ret;
> +	}
> +	return 0;
>  }
>  
>  void kvm_gmem_exit(void)
>  {
>  	kern_unmount(kvm_gmem_mnt);
>  	kvm_gmem_mnt = NULL;
> +	kmem_cache_destroy(kvm_gmem_inode_cachep);
>  }

While testing my code, I discovered a bug that occurs when unloading the kvm_amd module
after a guest_memfd-backed VM has run.

dmesg logs:
[  610.075763] =============================================================================
[  610.083933] BUG kvm_gmem_inode_cache (Not tainted): Objects remaining on __kmem_cache_shutdown()
[  610.092711] -----------------------------------------------------------------------------
[  610.102368] Object 0x000000008ee52a58 @offset=19200
[  610.107247] Slab 0x000000004b1b088c objects=51 used=1 fp=0x000000007c55fc00 flags=0x57ffffc0000240(workingset|head|node=1|zone=2|lastcpupid=0x1fffff)
[  610.120733] Disabling lock debugging due to kernel taint
[  610.120741] ------------[ cut here ]------------
[  610.120742] WARNING: CPU: 7 PID: 7554 at mm/slub.c:1171 __kmem_cache_shutdown+0x264/0x370
[  610.120751] Modules linked in: xt_set ip_set xt_addrtype xfrm_user xfrm_algo xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT nf_reject_ipv4 nft_compat nff_defrag_ipv4 nf_tables overlay bridge stp llc cfg80211 rfkill binfmt_misc ipmi_ssif amd_atl intel_rapl_msr wmi_bmof intel_rapl_common amd64_edac edac_mce_amdmem_helper drm_kms_helper i2c_piix4 ptdma i2c_smbus k10temp wmi acpi_power_meter ipmi_si acpi_ipmi ipmi_devintf ipmi_msghandler sg dm_multipath fuse drm dm_mo56 async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 raid0 sd_mod kvm_amd(-) ahci libahci kvm nvme tg3 libata ccp irqbypass nvme_c
[  610.120831] CPU: 7 UID: 0 PID: 7554 Comm: rmmod Kdump: loaded Tainted: G    B               6.16.0+ #10 PREEMPT(none)
[  610.120835] Tainted: [B]=BAD_PAGE
[  610.120836] Hardware name: Dell Inc. PowerEdge R6525/024PW1, BIOS 2.16.2 07/09/2024
[  610.120838] RIP: 0010:__kmem_cache_shutdown+0x264/0x370
[  610.120841] Code: 89 f1 4c 89 f6 4d 8b 46 20 48 c7 c7 08 08 ec 87 81 e2 ff 7f 00 00 e8 fb a7 d7 ff be 01 00 00 00 bf 05 00 00 00 e8 dc e9 cd ff <0f> 0b 48 fe ff ff
[  610.120843] RSP: 0018:ffffcd6962963cb8 EFLAGS: 00010046
[  610.120846] RAX: 0000000000000000 RBX: ffff89fde07d21c0 RCX: 0000000000000027
[  610.120848] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff89fcbe5dbe80
[  610.120850] RBP: ffff89fde07d21c0 R08: 0000000000000000 R09: 0000000000000003
[  610.120851] R10: ffffcd6962963b58 R11: ffffffff889db908 R12: ffff89fdcccd7f80
[  610.120852] R13: ffff89fdcccd0000 R14: fffff96802333400 R15: ffff89fdd6ab6c00
[  610.120854] FS:  00007f066eaab080(0000) GS:ffff89fd3516f000(0000) knlGS:0000000000000000
[  610.120856] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  610.120857] CR2: 00007ffefd577828 CR3: 0000000220406004 CR4: 0000000000770ef0
[  610.120859] PKRU: 55555554
[  610.120860] Call Trace:
[  610.120862]  <TASK>
[  610.120866]  kmem_cache_destroy+0x3a/0x150
[  610.120872]  kvm_exit+0x7b/0xa0 [kvm]
[  610.120919]  svm_exit+0x5/0x10 [kvm_amd]
[  610.120926]  __do_sys_delete_module.isra.0+0x18b/0x2e0
[  610.120933]  ? srso_alias_return_thunk+0x5/0xfbef5
[  610.120937]  ? syscall_trace_enter+0xfa/0x1a0
[  610.120941]  do_syscall_64+0x7b/0x2c0
[  610.120947]  ? srso_alias_return_thunk+0x5/0xfbef5
[  610.120950]  ? __handle_mm_fault+0x2aa/0x670
[  610.120954]  ? iterate_dir+0x11e/0x230
[  610.120960]  ? srso_alias_return_thunk+0x5/0xfbef5
[  610.120963]  ? count_memcg_events+0xb2/0x160
[  610.120967]  ? srso_alias_return_thunk+0x5/0xfbef5
[  610.120969]  ? handle_mm_fault+0xb2/0x2f0
[  610.120972]  ? srso_alias_return_thunk+0x5/0xfbef5
[  610.120975]  ? do_user_addr_fault+0x16f/0x6f0
[  610.120981]  ? srso_alias_return_thunk+0x5/0xfbef5
[  610.120984]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  610.120986] RIP: 0033:0x7f066e12ac9b
[  610.120989] Code: 73 01 c3 48 8b 0d 7d 81 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 b0 00 00 00 0f 05 <48> 3d 01 89 01 48
[  610.120990] RSP: 002b:00007ffc629f1878 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
[  610.120993] RAX: ffffffffffffffda RBX: 00005630e80256f0 RCX: 00007f066e12ac9b
[  610.120994] RDX: 0000000000000000 RSI: 0000000000000800 RDI: 00005630e8025758
[  610.120996] RBP: 00007ffc629f18a0 R08: 1999999999999999 R09: 0000000000000000
[  610.120997] R10: 00007f066e1b1fc0 R11: 0000000000000206 R12: 0000000000000000
[  610.120999] R13: 00007ffc629f1af0 R14: 00005630e80256f0 R15: 0000000000000000
[  610.121003]  </TASK>
[  610.121004] ---[ end trace 0000000000000000 ]---
[  610.121017] ------------[ cut here ]------------

There is a race condition here:
kern_unmount() -> mntput() -> cleanup_mnt() -> deactivate_super() -> deactivate_locked_super() -> fs->kill_sb() (guest_memfd kill_sb) -> generic_shutdown_super() -> evict_inodes() -> destroy_inode() -> call_rcu()

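Because the inode is freed from an RCU callback (via .free_inode), it can still be allocated from the slab when kern_unmount() returns.
I should wait for all pending RCU callbacks to finish before calling kmem_cache_destroy().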

To fix this, I added an rcu_barrier() call before kmem_cache_destroy(), as dax_fs_exit() does.

@@ -561,6 +566,7 @@ void kvm_gmem_exit(void)
 {
        kern_unmount(kvm_gmem_mnt);
        kvm_gmem_mnt = NULL;
+       rcu_barrier();
        kmem_cache_destroy(kvm_gmem_inode_cachep);
 }


I'll incorporate this fix into the next version.

Thanks,
Shivank

  reply	other threads:[~2025-08-13  6:11 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-11  9:05 [PATCH RFC V10 0/7] Add NUMA mempolicy support for KVM guest-memfd Shivank Garg
2025-08-11  9:06 ` [PATCH RFC V10 1/7] mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio() Shivank Garg
2025-08-11  9:06 ` [PATCH RFC V10 2/7] mm/filemap: Extend __filemap_get_folio() to support NUMA memory policies Shivank Garg
2025-08-11  9:06 ` [PATCH RFC V10 3/7] mm/mempolicy: Export memory policy symbols Shivank Garg
2025-08-11  9:06 ` [PATCH RFC V10 4/7] KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes Shivank Garg
2025-08-11 15:33   ` David Hildenbrand
2025-08-11 21:23     ` Ackerley Tng
2025-08-13  5:37       ` Garg, Shivank
2025-08-11  9:06 ` [PATCH RFC V10 5/7] KVM: guest_memfd: Add slab-allocated inode cache Shivank Garg
2025-08-13  6:10   ` Garg, Shivank [this message]
2025-08-11  9:06 ` [PATCH RFC V10 6/7] KVM: guest_memfd: Enforce NUMA mempolicy using shared policy Shivank Garg
2025-08-11  9:06 ` [PATCH RFC V10 7/7] KVM: guest_memfd: selftests: Add tests for mmap and NUMA policy support Shivank Garg
2025-08-11 14:34 ` [PATCH RFC V10 0/7] Add NUMA mempolicy support for KVM guest-memfd Sean Christopherson
2025-08-11 14:41   ` David Hildenbrand

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e7f7703d-fe76-4ab2-bef4-8d4c54da03ad@amd.com \
    --to=shivankg@amd.com \
    --cc=Neeraj.Upadhyay@amd.com \
    --cc=ackerleytng@google.com \
    --cc=aik@amd.com \
    --cc=akpm@linux-foundation.org \
    --cc=amit@infradead.org \
    --cc=aneeshkumar.kizhakeveetil@arm.com \
    --cc=apopple@nvidia.com \
    --cc=ashish.kalra@amd.com \
    --cc=bfoster@redhat.com \
    --cc=bharata@amd.com \
    --cc=brauner@kernel.org \
    --cc=byungchul@sk.com \
    --cc=cgzones@googlemail.com \
    --cc=chao.gao@intel.com \
    --cc=chao.p.peng@intel.com \
    --cc=dan.j.williams@intel.com \
    --cc=david@redhat.com \
    --cc=ddutile@redhat.com \
    --cc=gourry@gourry.net \
    --cc=gshan@redhat.com \
    --cc=hch@infradead.org \
    --cc=ira.weiny@intel.com \
    --cc=jack@suse.cz \
    --cc=jgg@nvidia.com \
    --cc=jgowans@amazon.com \
    --cc=jmorris@namei.org \
    --cc=joshua.hahnjy@gmail.com \
    --cc=kalyazin@amazon.com \
    --cc=kent.overstreet@linux.dev \
    --cc=kvm@vger.kernel.org \
    --cc=linux-coco@lists.linux.dev \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-security-module@vger.kernel.org \
    --cc=matthew.brost@intel.com \
    --cc=michael.day@amd.com \
    --cc=michael.roth@amd.com \
    --cc=nikunj@amd.com \
    --cc=pankaj.gupta@amd.com \
    --cc=papaluri@amd.com \
    --cc=paul@paul-moore.com \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=pvorel@suse.cz \
    --cc=quic_eberman@quicinc.com \
    --cc=rakie.kim@sk.com \
    --cc=rientjes@google.com \
    --cc=roypat@amazon.co.uk \
    --cc=rppt@kernel.org \
    --cc=seanjc@google.com \
    --cc=serge@hallyn.com \
    --cc=shdhiman@amd.com \
    --cc=shuah@kernel.org \
    --cc=suzuki.poulose@arm.com \
    --cc=tabba@google.com \
    --cc=thomas.lendacky@amd.com \
    --cc=vannapurve@google.com \
    --cc=vbabka@suse.cz \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --cc=yan.y.zhao@intel.com \
    --cc=ying.huang@linux.alibaba.com \
    --cc=yuzhao@google.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.