* [PATCH v2 1/5] mm: userfaultfd: generic continue for non hugetlbfs
2025-04-02 16:07 [PATCH v2 0/5] KVM: guest_memfd: support for uffd minor Nikita Kalyazin
@ 2025-04-02 16:07 ` Nikita Kalyazin
2025-04-02 19:04 ` James Houghton
2025-04-02 16:07 ` [PATCH v2 2/5] KVM: guest_memfd: add kvm_gmem_vma_is_gmem Nikita Kalyazin
` (3 subsequent siblings)
4 siblings, 1 reply; 12+ messages in thread
From: Nikita Kalyazin @ 2025-04-02 16:07 UTC (permalink / raw)
To: akpm, pbonzini, shuah
Cc: kvm, linux-kselftest, linux-kernel, linux-mm, lorenzo.stoakes,
david, ryan.roberts, quic_eberman, jthoughton, peterx, graf,
jgowans, roypat, derekmn, nsaenz, xmarcalx, kalyazin
Remove shmem-specific code from UFFDIO_CONTINUE implementation for
non-huge pages by calling vm_ops->fault(). A new VMF flag,
FAULT_FLAG_NO_USERFAULT_MINOR, is introduced to avoid recursive call to
handle_userfault().
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
include/linux/mm_types.h | 3 +++
mm/hugetlb.c | 2 +-
mm/shmem.c | 3 ++-
mm/userfaultfd.c | 25 ++++++++++++++++++-------
4 files changed, 24 insertions(+), 9 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0234f14f2aa6..91a00f2cd565 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1429,6 +1429,8 @@ enum tlb_flush_reason {
* @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
* We should only access orig_pte if this flag set.
* @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
+ * @FAULT_FLAG_NO_USERFAULT_MINOR: The fault handler must not call userfaultfd
+ * minor handler.
*
* About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
* whether we would allow page faults to retry by specifying these two
@@ -1467,6 +1469,7 @@ enum fault_flag {
FAULT_FLAG_UNSHARE = 1 << 10,
FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
FAULT_FLAG_VMA_LOCK = 1 << 12,
+ FAULT_FLAG_NO_USERFAULT_MINOR = 1 << 13,
};
typedef unsigned int __bitwise zap_flags_t;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97930d44d460..ba90d48144fc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6228,7 +6228,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
}
/* Check for page in userfault range. */
- if (userfaultfd_minor(vma)) {
+ if (userfaultfd_minor(vma) && !(vmf->flags & FAULT_FLAG_NO_USERFAULT_MINOR)) {
folio_unlock(folio);
folio_put(folio);
/* See comment in userfaultfd_missing() block above */
diff --git a/mm/shmem.c b/mm/shmem.c
index 1ede0800e846..5e1911e39dec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2467,7 +2467,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
fault_mm = vma ? vma->vm_mm : NULL;
folio = filemap_get_entry(inode->i_mapping, index);
- if (folio && vma && userfaultfd_minor(vma)) {
+ if (folio && vma && userfaultfd_minor(vma) &&
+ !(vmf->flags & FAULT_FLAG_NO_USERFAULT_MINOR)) {
if (!xa_is_value(folio))
folio_put(folio);
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d06453fa8aba..68a995216789 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -386,24 +386,35 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
unsigned long dst_addr,
uffd_flags_t flags)
{
- struct inode *inode = file_inode(dst_vma->vm_file);
- pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
struct folio *folio;
struct page *page;
int ret;
+ struct vm_fault vmf = {
+ .vma = dst_vma,
+ .address = dst_addr,
+ .flags = FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE |
+ FAULT_FLAG_NO_USERFAULT_MINOR,
+ .pte = NULL,
+ .page = NULL,
+ .pgoff = linear_page_index(dst_vma, dst_addr),
+ };
+
+ if (!dst_vma->vm_ops || !dst_vma->vm_ops->fault)
+ return -EINVAL;
- ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
- /* Our caller expects us to return -EFAULT if we failed to find folio */
- if (ret == -ENOENT)
+ ret = dst_vma->vm_ops->fault(&vmf);
+ if (ret & VM_FAULT_ERROR) {
ret = -EFAULT;
- if (ret)
goto out;
+ }
+
+ page = vmf.page;
+ folio = page_folio(page);
if (!folio) {
ret = -EFAULT;
goto out;
}
- page = folio_file_page(folio, pgoff);
if (PageHWPoison(page)) {
ret = -EIO;
goto out_release;
--
2.47.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v2 1/5] mm: userfaultfd: generic continue for non hugetlbfs
2025-04-02 16:07 ` [PATCH v2 1/5] mm: userfaultfd: generic continue for non hugetlbfs Nikita Kalyazin
@ 2025-04-02 19:04 ` James Houghton
2025-04-03 17:01 ` Nikita Kalyazin
0 siblings, 1 reply; 12+ messages in thread
From: James Houghton @ 2025-04-02 19:04 UTC (permalink / raw)
To: Nikita Kalyazin
Cc: akpm, pbonzini, shuah, kvm, linux-kselftest, linux-kernel,
linux-mm, lorenzo.stoakes, david, ryan.roberts, quic_eberman,
peterx, graf, jgowans, roypat, derekmn, nsaenz, xmarcalx
On Wed, Apr 2, 2025 at 9:07 AM Nikita Kalyazin <kalyazin@amazon.com> wrote:
>
> Remove shmem-specific code from UFFDIO_CONTINUE implementation for
> non-huge pages by calling vm_ops->fault(). A new VMF flag,
> FAULT_FLAG_NO_USERFAULT_MINOR, is introduced to avoid recursive call to
> handle_userfault().
>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> include/linux/mm_types.h | 3 +++
> mm/hugetlb.c | 2 +-
> mm/shmem.c | 3 ++-
> mm/userfaultfd.c | 25 ++++++++++++++++++-------
> 4 files changed, 24 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 0234f14f2aa6..91a00f2cd565 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1429,6 +1429,8 @@ enum tlb_flush_reason {
> * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
> * We should only access orig_pte if this flag set.
> * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
> + * @FAULT_FLAG_NO_USERFAULT_MINOR: The fault handler must not call userfaultfd
> + * minor handler.
Perhaps instead a flag that says to avoid the userfaultfd minor fault
handler, maybe we should have a flag to indicate that vm_ops->fault()
has been called by UFFDIO_CONTINUE. See below.
> *
> * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
> * whether we would allow page faults to retry by specifying these two
> @@ -1467,6 +1469,7 @@ enum fault_flag {
> FAULT_FLAG_UNSHARE = 1 << 10,
> FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
> FAULT_FLAG_VMA_LOCK = 1 << 12,
> + FAULT_FLAG_NO_USERFAULT_MINOR = 1 << 13,
> };
>
> typedef unsigned int __bitwise zap_flags_t;
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 97930d44d460..ba90d48144fc 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6228,7 +6228,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
> }
>
> /* Check for page in userfault range. */
> - if (userfaultfd_minor(vma)) {
> + if (userfaultfd_minor(vma) && !(vmf->flags & FAULT_FLAG_NO_USERFAULT_MINOR)) {
> folio_unlock(folio);
> folio_put(folio);
> /* See comment in userfaultfd_missing() block above */
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 1ede0800e846..5e1911e39dec 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2467,7 +2467,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
> fault_mm = vma ? vma->vm_mm : NULL;
>
> folio = filemap_get_entry(inode->i_mapping, index);
> - if (folio && vma && userfaultfd_minor(vma)) {
> + if (folio && vma && userfaultfd_minor(vma) &&
> + !(vmf->flags & FAULT_FLAG_NO_USERFAULT_MINOR)) {
> if (!xa_is_value(folio))
> folio_put(folio);
> *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index d06453fa8aba..68a995216789 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -386,24 +386,35 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
> unsigned long dst_addr,
> uffd_flags_t flags)
> {
> - struct inode *inode = file_inode(dst_vma->vm_file);
> - pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
> struct folio *folio;
> struct page *page;
> int ret;
> + struct vm_fault vmf = {
> + .vma = dst_vma,
> + .address = dst_addr,
> + .flags = FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE |
> + FAULT_FLAG_NO_USERFAULT_MINOR,
> + .pte = NULL,
> + .page = NULL,
> + .pgoff = linear_page_index(dst_vma, dst_addr),
> + };
> +
> + if (!dst_vma->vm_ops || !dst_vma->vm_ops->fault)
> + return -EINVAL;
>
> - ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
> - /* Our caller expects us to return -EFAULT if we failed to find folio */
> - if (ret == -ENOENT)
> + ret = dst_vma->vm_ops->fault(&vmf);
shmem_get_folio() was being called with SGP_NOALLOC, and now it is
being called with SGP_CACHE (by shmem_fault()). This will result in a
UAPI change: UFFDIO_CONTINUE for a VA without a page in the page cache
should result in EFAULT, but now the page will be allocated.
SGP_NOALLOC was carefully chosen[1], so I think a better way to do
this will be to:
1. Have a FAULT_FLAG_USERFAULT_CONTINUE (or something)
2. In shmem_fault(), if FAULT_FLAG_USERFAULT_CONTINUE, use SGP_NOALLOC
instead of SGP_CACHE (and make sure not to drop into
handle_userfault(), of course)
[1]: https://lore.kernel.org/linux-mm/20220610173812.1768919-1-axelrasmussen@google.com/
> + if (ret & VM_FAULT_ERROR) {
> ret = -EFAULT;
> - if (ret)
> goto out;
> + }
> +
> + page = vmf.page;
> + folio = page_folio(page);
> if (!folio) {
What if ret == VM_FAULT_RETRY? I think we should retry instead instead
of returning -EFAULT.
And I'm not sure how VM_FAULT_NOPAGE should be handled, like if we
need special logic for it or not.
> ret = -EFAULT;
> goto out;
> }
>
> - page = folio_file_page(folio, pgoff);
> if (PageHWPoison(page)) {
> ret = -EIO;
> goto out_release;
> --
> 2.47.1
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 1/5] mm: userfaultfd: generic continue for non hugetlbfs
2025-04-02 19:04 ` James Houghton
@ 2025-04-03 17:01 ` Nikita Kalyazin
0 siblings, 0 replies; 12+ messages in thread
From: Nikita Kalyazin @ 2025-04-03 17:01 UTC (permalink / raw)
To: James Houghton
Cc: akpm, pbonzini, shuah, kvm, linux-kselftest, linux-kernel,
linux-mm, lorenzo.stoakes, david, ryan.roberts, quic_eberman,
peterx, graf, jgowans, roypat, derekmn, nsaenz, xmarcalx
On 02/04/2025 20:04, James Houghton wrote:
> On Wed, Apr 2, 2025 at 9:07 AM Nikita Kalyazin <kalyazin@amazon.com> wrote:
>>
>> Remove shmem-specific code from UFFDIO_CONTINUE implementation for
>> non-huge pages by calling vm_ops->fault(). A new VMF flag,
>> FAULT_FLAG_NO_USERFAULT_MINOR, is introduced to avoid recursive call to
>> handle_userfault().
>>
>> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
>> ---
>> include/linux/mm_types.h | 3 +++
>> mm/hugetlb.c | 2 +-
>> mm/shmem.c | 3 ++-
>> mm/userfaultfd.c | 25 ++++++++++++++++++-------
>> 4 files changed, 24 insertions(+), 9 deletions(-)
>>
>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>> index 0234f14f2aa6..91a00f2cd565 100644
>> --- a/include/linux/mm_types.h
>> +++ b/include/linux/mm_types.h
>> @@ -1429,6 +1429,8 @@ enum tlb_flush_reason {
>> * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
>> * We should only access orig_pte if this flag set.
>> * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
>> + * @FAULT_FLAG_NO_USERFAULT_MINOR: The fault handler must not call userfaultfd
>> + * minor handler.
>
> Perhaps instead a flag that says to avoid the userfaultfd minor fault
> handler, maybe we should have a flag to indicate that vm_ops->fault()
> has been called by UFFDIO_CONTINUE. See below.
>
>> *
>> * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
>> * whether we would allow page faults to retry by specifying these two
>> @@ -1467,6 +1469,7 @@ enum fault_flag {
>> FAULT_FLAG_UNSHARE = 1 << 10,
>> FAULT_FLAG_ORIG_PTE_VALID = 1 << 11,
>> FAULT_FLAG_VMA_LOCK = 1 << 12,
>> + FAULT_FLAG_NO_USERFAULT_MINOR = 1 << 13,
>> };
>>
>> typedef unsigned int __bitwise zap_flags_t;
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 97930d44d460..ba90d48144fc 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -6228,7 +6228,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
>> }
>>
>> /* Check for page in userfault range. */
>> - if (userfaultfd_minor(vma)) {
>> + if (userfaultfd_minor(vma) && !(vmf->flags & FAULT_FLAG_NO_USERFAULT_MINOR)) {
>> folio_unlock(folio);
>> folio_put(folio);
>> /* See comment in userfaultfd_missing() block above */
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index 1ede0800e846..5e1911e39dec 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -2467,7 +2467,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
>> fault_mm = vma ? vma->vm_mm : NULL;
>>
>> folio = filemap_get_entry(inode->i_mapping, index);
>> - if (folio && vma && userfaultfd_minor(vma)) {
>> + if (folio && vma && userfaultfd_minor(vma) &&
>> + !(vmf->flags & FAULT_FLAG_NO_USERFAULT_MINOR)) {
>> if (!xa_is_value(folio))
>> folio_put(folio);
>> *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
>> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
>> index d06453fa8aba..68a995216789 100644
>> --- a/mm/userfaultfd.c
>> +++ b/mm/userfaultfd.c
>> @@ -386,24 +386,35 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
>> unsigned long dst_addr,
>> uffd_flags_t flags)
>> {
>> - struct inode *inode = file_inode(dst_vma->vm_file);
>> - pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
>> struct folio *folio;
>> struct page *page;
>> int ret;
>> + struct vm_fault vmf = {
>> + .vma = dst_vma,
>> + .address = dst_addr,
>> + .flags = FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE |
>> + FAULT_FLAG_NO_USERFAULT_MINOR,
>> + .pte = NULL,
>> + .page = NULL,
>> + .pgoff = linear_page_index(dst_vma, dst_addr),
>> + };
>> +
>> + if (!dst_vma->vm_ops || !dst_vma->vm_ops->fault)
>> + return -EINVAL;
>>
>> - ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
>> - /* Our caller expects us to return -EFAULT if we failed to find folio */
>> - if (ret == -ENOENT)
>> + ret = dst_vma->vm_ops->fault(&vmf);
>
> shmem_get_folio() was being called with SGP_NOALLOC, and now it is
> being called with SGP_CACHE (by shmem_fault()). This will result in a
> UAPI change: UFFDIO_CONTINUE for a VA without a page in the page cache
> should result in EFAULT, but now the page will be allocated.
> SGP_NOALLOC was carefully chosen[1], so I think a better way to do
> this will be to:
>
> 1. Have a FAULT_FLAG_USERFAULT_CONTINUE (or something)
> 2. In shmem_fault(), if FAULT_FLAG_USERFAULT_CONTINUE, use SGP_NOALLOC
> instead of SGP_CACHE (and make sure not to drop into
> handle_userfault(), of course)
Yes, that is very true. Thanks for pointing out. Will apply in the
next version.
>
> [1]: https://lore.kernel.org/linux-mm/20220610173812.1768919-1-axelrasmussen@google.com/
>
>> + if (ret & VM_FAULT_ERROR) {
>> ret = -EFAULT;
>> - if (ret)
>> goto out;
>> + }
>> +
>> + page = vmf.page;
>> + folio = page_folio(page);
>> if (!folio) {
>
> What if ret == VM_FAULT_RETRY? I think we should retry instead instead
> of returning -EFAULT.
True. I'm going to retry the vm_ops->fault() call in this case.
>
> And I'm not sure how VM_FAULT_NOPAGE should be handled, like if we
> need special logic for it or not.
As far as I see, the only case it can come up in shmem is during a fault
when a hole is being punched [1]. I'm thinking of returning EAGAIN to
the userspace if this happens.
[1] https://elixir.bootlin.com/linux/v6.14-rc6/source/mm/shmem.c#L2670
>
>> ret = -EFAULT;
>> goto out;
>> }
>>
>> - page = folio_file_page(folio, pgoff);
>> if (PageHWPoison(page)) {
>> ret = -EIO;
>> goto out_release;
>> --
>> 2.47.1
>>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v2 2/5] KVM: guest_memfd: add kvm_gmem_vma_is_gmem
2025-04-02 16:07 [PATCH v2 0/5] KVM: guest_memfd: support for uffd minor Nikita Kalyazin
2025-04-02 16:07 ` [PATCH v2 1/5] mm: userfaultfd: generic continue for non hugetlbfs Nikita Kalyazin
@ 2025-04-02 16:07 ` Nikita Kalyazin
2025-04-02 16:07 ` [PATCH v2 3/5] mm: userfaultfd: allow to register continue for guest_memfd Nikita Kalyazin
` (2 subsequent siblings)
4 siblings, 0 replies; 12+ messages in thread
From: Nikita Kalyazin @ 2025-04-02 16:07 UTC (permalink / raw)
To: akpm, pbonzini, shuah
Cc: kvm, linux-kselftest, linux-kernel, linux-mm, lorenzo.stoakes,
david, ryan.roberts, quic_eberman, jthoughton, peterx, graf,
jgowans, roypat, derekmn, nsaenz, xmarcalx, kalyazin
It will be used to distinguish the vma type in userfaultfd code.
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
virt/kvm/guest_memfd.c | 5 +++++
virt/kvm/kvm_mm.h | 1 +
2 files changed, 6 insertions(+)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fbf89e643add..26b1734b9623 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -416,6 +416,11 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
+
+bool kvm_gmem_vma_is_gmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &kvm_gmem_vm_ops;
+}
#else
#define kvm_gmem_mmap NULL
#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index acef3f5c582a..09fcdf18a072 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -73,6 +73,7 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned int fd, loff_t offset);
void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+bool kvm_gmem_vma_is_gmem(struct vm_area_struct *vma);
#else
static inline void kvm_gmem_init(struct module *module)
{
--
2.47.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v2 3/5] mm: userfaultfd: allow to register continue for guest_memfd
2025-04-02 16:07 [PATCH v2 0/5] KVM: guest_memfd: support for uffd minor Nikita Kalyazin
2025-04-02 16:07 ` [PATCH v2 1/5] mm: userfaultfd: generic continue for non hugetlbfs Nikita Kalyazin
2025-04-02 16:07 ` [PATCH v2 2/5] KVM: guest_memfd: add kvm_gmem_vma_is_gmem Nikita Kalyazin
@ 2025-04-02 16:07 ` Nikita Kalyazin
2025-04-02 21:25 ` James Houghton
2025-04-02 16:07 ` [PATCH v2 4/5] KVM: guest_memfd: add support for userfaultfd minor Nikita Kalyazin
2025-04-02 16:07 ` [PATCH v2 5/5] KVM: selftests: test userfaultfd minor for guest_memfd Nikita Kalyazin
4 siblings, 1 reply; 12+ messages in thread
From: Nikita Kalyazin @ 2025-04-02 16:07 UTC (permalink / raw)
To: akpm, pbonzini, shuah
Cc: kvm, linux-kselftest, linux-kernel, linux-mm, lorenzo.stoakes,
david, ryan.roberts, quic_eberman, jthoughton, peterx, graf,
jgowans, roypat, derekmn, nsaenz, xmarcalx, kalyazin
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
include/linux/userfaultfd_k.h | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 75342022d144..bc184edfbb85 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -212,6 +212,10 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
return vma->vm_flags & __VM_UFFD_FLAGS;
}
+#ifdef CONFIG_KVM_PRIVATE_MEM
+bool kvm_gmem_vma_is_gmem(struct vm_area_struct *vma);
+#endif
+
static inline bool vma_can_userfault(struct vm_area_struct *vma,
unsigned long vm_flags,
bool wp_async)
@@ -222,7 +226,11 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
return false;
if ((vm_flags & VM_UFFD_MINOR) &&
- (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
+ (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ && !kvm_gmem_vma_is_gmem(vma)
+#endif
+ )
return false;
/*
@@ -244,6 +252,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
/* By default, allow any of anon|shmem|hugetlb */
return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ kvm_gmem_vma_is_gmem(vma) ||
+#endif
vma_is_shmem(vma);
}
--
2.47.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v2 3/5] mm: userfaultfd: allow to register continue for guest_memfd
2025-04-02 16:07 ` [PATCH v2 3/5] mm: userfaultfd: allow to register continue for guest_memfd Nikita Kalyazin
@ 2025-04-02 21:25 ` James Houghton
2025-04-03 17:01 ` Nikita Kalyazin
0 siblings, 1 reply; 12+ messages in thread
From: James Houghton @ 2025-04-02 21:25 UTC (permalink / raw)
To: Nikita Kalyazin
Cc: akpm, pbonzini, shuah, kvm, linux-kselftest, linux-kernel,
linux-mm, lorenzo.stoakes, david, ryan.roberts, quic_eberman,
peterx, graf, jgowans, roypat, derekmn, nsaenz, xmarcalx
On Wed, Apr 2, 2025 at 9:08 AM Nikita Kalyazin <kalyazin@amazon.com> wrote:
>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> include/linux/userfaultfd_k.h | 13 ++++++++++++-
> 1 file changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index 75342022d144..bc184edfbb85 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -212,6 +212,10 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
> return vma->vm_flags & __VM_UFFD_FLAGS;
> }
>
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> +bool kvm_gmem_vma_is_gmem(struct vm_area_struct *vma);
> +#endif
> +
> static inline bool vma_can_userfault(struct vm_area_struct *vma,
> unsigned long vm_flags,
> bool wp_async)
> @@ -222,7 +226,11 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
> return false;
>
> if ((vm_flags & VM_UFFD_MINOR) &&
> - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
> + (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> + && !kvm_gmem_vma_is_gmem(vma)
> +#endif
Maybe a better way to do this is to add a vm_ops->can_userfault() or
something, so we could write something like this:
if (vma->vm_ops && !vma->vm_ops->can_userfault)
return false;
if (vma->vm_ops && !vma->vm_ops->can_userfault(vm_flags))
return false;
And shmem/hugetlbfs can advertise support for everything they already
support that way.
> + )
> return false;
>
> /*
> @@ -244,6 +252,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
>
> /* By default, allow any of anon|shmem|hugetlb */
> return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> + kvm_gmem_vma_is_gmem(vma) ||
> +#endif
> vma_is_shmem(vma);
> }
>
> --
> 2.47.1
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 3/5] mm: userfaultfd: allow to register continue for guest_memfd
2025-04-02 21:25 ` James Houghton
@ 2025-04-03 17:01 ` Nikita Kalyazin
0 siblings, 0 replies; 12+ messages in thread
From: Nikita Kalyazin @ 2025-04-03 17:01 UTC (permalink / raw)
To: James Houghton
Cc: akpm, pbonzini, shuah, kvm, linux-kselftest, linux-kernel,
linux-mm, lorenzo.stoakes, david, ryan.roberts, quic_eberman,
peterx, graf, jgowans, roypat, derekmn, nsaenz, xmarcalx
On 02/04/2025 22:25, James Houghton wrote:
> On Wed, Apr 2, 2025 at 9:08 AM Nikita Kalyazin <kalyazin@amazon.com> wrote:
>>
>> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
>> ---
>> include/linux/userfaultfd_k.h | 13 ++++++++++++-
>> 1 file changed, 12 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
>> index 75342022d144..bc184edfbb85 100644
>> --- a/include/linux/userfaultfd_k.h
>> +++ b/include/linux/userfaultfd_k.h
>> @@ -212,6 +212,10 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
>> return vma->vm_flags & __VM_UFFD_FLAGS;
>> }
>>
>> +#ifdef CONFIG_KVM_PRIVATE_MEM
>> +bool kvm_gmem_vma_is_gmem(struct vm_area_struct *vma);
>> +#endif
>> +
>> static inline bool vma_can_userfault(struct vm_area_struct *vma,
>> unsigned long vm_flags,
>> bool wp_async)
>> @@ -222,7 +226,11 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
>> return false;
>>
>> if ((vm_flags & VM_UFFD_MINOR) &&
>> - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
>> + (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))
>> +#ifdef CONFIG_KVM_PRIVATE_MEM
>> + && !kvm_gmem_vma_is_gmem(vma)
>> +#endif
>
> Maybe a better way to do this is to add a vm_ops->can_userfault() or
> something, so we could write something like this:
>
> if (vma->vm_ops && !vma->vm_ops->can_userfault)
> return false;
> if (vma->vm_ops && !vma->vm_ops->can_userfault(vm_flags))
> return false;
I like that, thanks! What do you see passing vm_flags being useful for?
Shall we pass the entire vma struct like in most of other callbacks?
>
> And shmem/hugetlbfs can advertise support for everything they already
> support that way.
>
>> + )
>> return false;
>>
>> /*
>> @@ -244,6 +252,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
>>
>> /* By default, allow any of anon|shmem|hugetlb */
>> return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
>> +#ifdef CONFIG_KVM_PRIVATE_MEM
>> + kvm_gmem_vma_is_gmem(vma) ||
>> +#endif
>> vma_is_shmem(vma);
>> }
>>
>> --
>> 2.47.1
>>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v2 4/5] KVM: guest_memfd: add support for userfaultfd minor
2025-04-02 16:07 [PATCH v2 0/5] KVM: guest_memfd: support for uffd minor Nikita Kalyazin
` (2 preceding siblings ...)
2025-04-02 16:07 ` [PATCH v2 3/5] mm: userfaultfd: allow to register continue for guest_memfd Nikita Kalyazin
@ 2025-04-02 16:07 ` Nikita Kalyazin
2025-04-02 16:07 ` [PATCH v2 5/5] KVM: selftests: test userfaultfd minor for guest_memfd Nikita Kalyazin
4 siblings, 0 replies; 12+ messages in thread
From: Nikita Kalyazin @ 2025-04-02 16:07 UTC (permalink / raw)
To: akpm, pbonzini, shuah
Cc: kvm, linux-kselftest, linux-kernel, linux-mm, lorenzo.stoakes,
david, ryan.roberts, quic_eberman, jthoughton, peterx, graf,
jgowans, roypat, derekmn, nsaenz, xmarcalx, kalyazin
Add support for sending a pagefault event if userfaultfd is registered.
Only page minor event is currently supported.
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
virt/kvm/guest_memfd.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 26b1734b9623..92d3e6b51dc2 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,9 @@
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>
+#ifdef CONFIG_KVM_PRIVATE_MEM
+#include <linux/userfaultfd_k.h>
+#endif /* CONFIG_KVM_PRIVATE_MEM */
#include "kvm_mm.h"
@@ -380,6 +383,13 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
kvm_gmem_mark_prepared(folio);
}
+ if (userfaultfd_minor(vmf->vma) &&
+ !(vmf->flags & FAULT_FLAG_NO_USERFAULT_MINOR)) {
+ folio_unlock(folio);
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+ return handle_userfault(vmf, VM_UFFD_MINOR);
+ }
+
vmf->page = folio_file_page(folio, vmf->pgoff);
out_folio:
--
2.47.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v2 5/5] KVM: selftests: test userfaultfd minor for guest_memfd
2025-04-02 16:07 [PATCH v2 0/5] KVM: guest_memfd: support for uffd minor Nikita Kalyazin
` (3 preceding siblings ...)
2025-04-02 16:07 ` [PATCH v2 4/5] KVM: guest_memfd: add support for userfaultfd minor Nikita Kalyazin
@ 2025-04-02 16:07 ` Nikita Kalyazin
2025-04-02 21:10 ` James Houghton
4 siblings, 1 reply; 12+ messages in thread
From: Nikita Kalyazin @ 2025-04-02 16:07 UTC (permalink / raw)
To: akpm, pbonzini, shuah
Cc: kvm, linux-kselftest, linux-kernel, linux-mm, lorenzo.stoakes,
david, ryan.roberts, quic_eberman, jthoughton, peterx, graf,
jgowans, roypat, derekmn, nsaenz, xmarcalx, kalyazin
The test demonstrates that a minor userfaultfd event in guest_memfd can
be resolved via a memcpy followed by a UFFDIO_CONTINUE ioctl.
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
.../testing/selftests/kvm/guest_memfd_test.c | 94 +++++++++++++++++++
1 file changed, 94 insertions(+)
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index 38c501e49e0e..9b47b796f3aa 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -10,12 +10,16 @@
#include <errno.h>
#include <stdio.h>
#include <fcntl.h>
+#include <pthread.h>
#include <linux/bitmap.h>
#include <linux/falloc.h>
+#include <linux/userfaultfd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
#include "kvm_util.h"
#include "test_util.h"
@@ -206,6 +210,93 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm)
close(fd1);
}
+struct fault_args {
+ char *addr;
+ volatile char value;
+};
+
+static void *fault_thread_fn(void *arg)
+{
+ struct fault_args *args = arg;
+
+ /* Trigger page fault */
+ args->value = *args->addr;
+ return NULL;
+}
+
+static void test_uffd_missing(int fd, size_t page_size, size_t total_size)
+{
+ struct uffdio_register uffd_reg;
+ struct uffdio_continue uffd_cont;
+ struct uffd_msg msg;
+ struct fault_args args;
+ pthread_t fault_thread;
+ void *mem, *mem_nofault, *buf = NULL;
+ int uffd, ret;
+ off_t offset = page_size;
+ void *fault_addr;
+
+ ret = posix_memalign(&buf, page_size, total_size);
+ TEST_ASSERT_EQ(ret, 0);
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ TEST_ASSERT(uffd != -1, "userfaultfd creation should succeed");
+
+ struct uffdio_api uffdio_api = {
+ .api = UFFD_API,
+ .features = UFFD_FEATURE_MISSING_SHMEM,
+ };
+ ret = ioctl(uffd, UFFDIO_API, &uffdio_api);
+ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_API) should succeed");
+
+ mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ TEST_ASSERT(mem != MAP_FAILED, "mmap should succeed");
+
+ mem_nofault = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ TEST_ASSERT(mem_nofault != MAP_FAILED, "mmap should succeed");
+
+ uffd_reg.range.start = (unsigned long)mem;
+ uffd_reg.range.len = total_size;
+ uffd_reg.mode = UFFDIO_REGISTER_MODE_MINOR;
+ ret = ioctl(uffd, UFFDIO_REGISTER, &uffd_reg);
+ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_REGISTER) should succeed");
+
+ ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ offset, page_size);
+ TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed");
+
+ fault_addr = mem + offset;
+ args.addr = fault_addr;
+
+ ret = pthread_create(&fault_thread, NULL, fault_thread_fn, &args);
+ TEST_ASSERT(ret == 0, "pthread_create should succeed");
+
+ ret = read(uffd, &msg, sizeof(msg));
+ TEST_ASSERT(ret != -1, "read from userfaultfd should succeed");
+ TEST_ASSERT(msg.event == UFFD_EVENT_PAGEFAULT, "event type should be pagefault");
+ TEST_ASSERT((void *)(msg.arg.pagefault.address & ~(page_size - 1)) == fault_addr,
+ "pagefault should occur at expected address");
+
+ memcpy(mem_nofault + offset, buf + offset, page_size);
+
+ uffd_cont.range.start = (unsigned long)fault_addr;
+ uffd_cont.range.len = page_size;
+ uffd_cont.mode = 0;
+ ret = ioctl(uffd, UFFDIO_CONTINUE, &uffd_cont);
+ TEST_ASSERT(ret != -1, "ioctl(UFFDIO_CONTINUE) should succeed");
+
+ ret = pthread_join(fault_thread, NULL);
+ TEST_ASSERT(ret == 0, "pthread_join should succeed");
+
+ ret = munmap(mem_nofault, total_size);
+ TEST_ASSERT(!ret, "munmap should succeed");
+
+ ret = munmap(mem, total_size);
+ TEST_ASSERT(!ret, "munmap should succeed");
+ free(buf);
+ close(uffd);
+}
+
unsigned long get_shared_type(void)
{
#ifdef __x86_64__
@@ -244,6 +335,9 @@ void test_vm_type(unsigned long type, bool is_shared)
test_fallocate(fd, page_size, total_size);
test_invalid_punch_hole(fd, page_size, total_size);
+ if (is_shared)
+ test_uffd_missing(fd, page_size, total_size);
+
close(fd);
kvm_vm_release(vm);
}
--
2.47.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v2 5/5] KVM: selftests: test userfaultfd minor for guest_memfd
2025-04-02 16:07 ` [PATCH v2 5/5] KVM: selftests: test userfaultfd minor for guest_memfd Nikita Kalyazin
@ 2025-04-02 21:10 ` James Houghton
2025-04-03 17:02 ` Nikita Kalyazin
0 siblings, 1 reply; 12+ messages in thread
From: James Houghton @ 2025-04-02 21:10 UTC (permalink / raw)
To: Nikita Kalyazin
Cc: akpm, pbonzini, shuah, kvm, linux-kselftest, linux-kernel,
linux-mm, lorenzo.stoakes, david, ryan.roberts, quic_eberman,
peterx, graf, jgowans, roypat, derekmn, nsaenz, xmarcalx
On Wed, Apr 2, 2025 at 9:08 AM Nikita Kalyazin <kalyazin@amazon.com> wrote:
>
> The test demonstrates that a minor userfaultfd event in guest_memfd can
> be resolved via a memcpy followed by a UFFDIO_CONTINUE ioctl.
>
> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
> ---
> .../testing/selftests/kvm/guest_memfd_test.c | 94 +++++++++++++++++++
> 1 file changed, 94 insertions(+)
>
> diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
> index 38c501e49e0e..9b47b796f3aa 100644
> --- a/tools/testing/selftests/kvm/guest_memfd_test.c
> +++ b/tools/testing/selftests/kvm/guest_memfd_test.c
> @@ -10,12 +10,16 @@
> #include <errno.h>
> #include <stdio.h>
> #include <fcntl.h>
> +#include <pthread.h>
>
> #include <linux/bitmap.h>
> #include <linux/falloc.h>
> +#include <linux/userfaultfd.h>
> #include <sys/mman.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> +#include <sys/syscall.h>
> +#include <sys/ioctl.h>
>
> #include "kvm_util.h"
> #include "test_util.h"
> @@ -206,6 +210,93 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm)
> close(fd1);
> }
>
> +struct fault_args {
> + char *addr;
> + volatile char value;
I think you should/must put volatile on `addr` and not on `value`.
> +};
> +
> +static void *fault_thread_fn(void *arg)
> +{
> + struct fault_args *args = arg;
> +
> + /* Trigger page fault */
> + args->value = *args->addr;
> + return NULL;
> +}
> +
> +static void test_uffd_missing(int fd, size_t page_size, size_t total_size)
test_uffd_minor? :)
> +{
> + struct uffdio_register uffd_reg;
> + struct uffdio_continue uffd_cont;
> + struct uffd_msg msg;
> + struct fault_args args;
> + pthread_t fault_thread;
> + void *mem, *mem_nofault, *buf = NULL;
> + int uffd, ret;
> + off_t offset = page_size;
> + void *fault_addr;
> +
> + ret = posix_memalign(&buf, page_size, total_size);
> + TEST_ASSERT_EQ(ret, 0);
> +
> + uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
> + TEST_ASSERT(uffd != -1, "userfaultfd creation should succeed");
> +
> + struct uffdio_api uffdio_api = {
> + .api = UFFD_API,
> + .features = UFFD_FEATURE_MISSING_SHMEM,
I think you mean UFFD_FEATURE_MINOR_SHMEM...?
And I'm trying to think through what feature we should expose for
guest_memfd; UFFD_FEATURE_MINOR_SHMEM already indicates support for
shmem.
We could have UFFD_FEATURE_MINOR_GUESTMEMFD, perhaps that's enough.
Or we could have UFFD_FEATURE_MINOR_GENERIC (or nothing at all!). Some
VMAs might not support the minor mode, and the user will figure that
out when UFFDIO_REGISTER fails.
> + };
> + ret = ioctl(uffd, UFFDIO_API, &uffdio_api);
> + TEST_ASSERT(ret != -1, "ioctl(UFFDIO_API) should succeed");
> +
> + mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> + TEST_ASSERT(mem != MAP_FAILED, "mmap should succeed");
> +
> + mem_nofault = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> + TEST_ASSERT(mem_nofault != MAP_FAILED, "mmap should succeed");
> +
> + uffd_reg.range.start = (unsigned long)mem;
> + uffd_reg.range.len = total_size;
> + uffd_reg.mode = UFFDIO_REGISTER_MODE_MINOR;
> + ret = ioctl(uffd, UFFDIO_REGISTER, &uffd_reg);
> + TEST_ASSERT(ret != -1, "ioctl(UFFDIO_REGISTER) should succeed");
> +
> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
> + offset, page_size);
> + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed");
> +
> + fault_addr = mem + offset;
> + args.addr = fault_addr;
> +
> + ret = pthread_create(&fault_thread, NULL, fault_thread_fn, &args);
> + TEST_ASSERT(ret == 0, "pthread_create should succeed");
> +
> + ret = read(uffd, &msg, sizeof(msg));
> + TEST_ASSERT(ret != -1, "read from userfaultfd should succeed");
> + TEST_ASSERT(msg.event == UFFD_EVENT_PAGEFAULT, "event type should be pagefault");
> + TEST_ASSERT((void *)(msg.arg.pagefault.address & ~(page_size - 1)) == fault_addr,
> + "pagefault should occur at expected address");
> +
> + memcpy(mem_nofault + offset, buf + offset, page_size);
> +
> + uffd_cont.range.start = (unsigned long)fault_addr;
> + uffd_cont.range.len = page_size;
> + uffd_cont.mode = 0;
> + ret = ioctl(uffd, UFFDIO_CONTINUE, &uffd_cont);
> + TEST_ASSERT(ret != -1, "ioctl(UFFDIO_CONTINUE) should succeed");
> +
> + ret = pthread_join(fault_thread, NULL);
> + TEST_ASSERT(ret == 0, "pthread_join should succeed");
And maybe also:
/* Right value? */
TEST_ASSERT(args.value == *(char *)mem_nofault));
/* No second fault? */
TEST_ASSERT(args.value == *(char *)mem);
> +
> + ret = munmap(mem_nofault, total_size);
> + TEST_ASSERT(!ret, "munmap should succeed");
> +
> + ret = munmap(mem, total_size);
> + TEST_ASSERT(!ret, "munmap should succeed");
> + free(buf);
> + close(uffd);
> +}
> +
> unsigned long get_shared_type(void)
> {
> #ifdef __x86_64__
> @@ -244,6 +335,9 @@ void test_vm_type(unsigned long type, bool is_shared)
> test_fallocate(fd, page_size, total_size);
> test_invalid_punch_hole(fd, page_size, total_size);
>
> + if (is_shared)
> + test_uffd_missing(fd, page_size, total_size);
> +
> close(fd);
> kvm_vm_release(vm);
> }
> --
> 2.47.1
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2 5/5] KVM: selftests: test userfaultfd minor for guest_memfd
2025-04-02 21:10 ` James Houghton
@ 2025-04-03 17:02 ` Nikita Kalyazin
0 siblings, 0 replies; 12+ messages in thread
From: Nikita Kalyazin @ 2025-04-03 17:02 UTC (permalink / raw)
To: James Houghton
Cc: akpm, pbonzini, shuah, kvm, linux-kselftest, linux-kernel,
linux-mm, lorenzo.stoakes, david, ryan.roberts, quic_eberman,
peterx, graf, jgowans, roypat, derekmn, nsaenz, xmarcalx
On 02/04/2025 22:10, James Houghton wrote:
> On Wed, Apr 2, 2025 at 9:08 AM Nikita Kalyazin <kalyazin@amazon.com> wrote:
>>
>> The test demonstrates that a minor userfaultfd event in guest_memfd can
>> be resolved via a memcpy followed by a UFFDIO_CONTINUE ioctl.
>>
>> Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
>> ---
>> .../testing/selftests/kvm/guest_memfd_test.c | 94 +++++++++++++++++++
>> 1 file changed, 94 insertions(+)
>>
>> diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
>> index 38c501e49e0e..9b47b796f3aa 100644
>> --- a/tools/testing/selftests/kvm/guest_memfd_test.c
>> +++ b/tools/testing/selftests/kvm/guest_memfd_test.c
>> @@ -10,12 +10,16 @@
>> #include <errno.h>
>> #include <stdio.h>
>> #include <fcntl.h>
>> +#include <pthread.h>
>>
>> #include <linux/bitmap.h>
>> #include <linux/falloc.h>
>> +#include <linux/userfaultfd.h>
>> #include <sys/mman.h>
>> #include <sys/types.h>
>> #include <sys/stat.h>
>> +#include <sys/syscall.h>
>> +#include <sys/ioctl.h>
>>
>> #include "kvm_util.h"
>> #include "test_util.h"
>> @@ -206,6 +210,93 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm)
>> close(fd1);
>> }
>>
>> +struct fault_args {
>> + char *addr;
>> + volatile char value;
>
> I think you should/must put volatile on `addr` and not on `value`.
This was to prevent the compiler from omitting the write to the value,
because it's never read later on.
>
>> +};
>> +
>> +static void *fault_thread_fn(void *arg)
>> +{
>> + struct fault_args *args = arg;
>> +
>> + /* Trigger page fault */
>> + args->value = *args->addr;
>> + return NULL;
>> +}
>> +
>> +static void test_uffd_missing(int fd, size_t page_size, size_t total_size)
>
> test_uffd_minor? :)
>
>> +{
>> + struct uffdio_register uffd_reg;
>> + struct uffdio_continue uffd_cont;
>> + struct uffd_msg msg;
>> + struct fault_args args;
>> + pthread_t fault_thread;
>> + void *mem, *mem_nofault, *buf = NULL;
>> + int uffd, ret;
>> + off_t offset = page_size;
>> + void *fault_addr;
>> +
>> + ret = posix_memalign(&buf, page_size, total_size);
>> + TEST_ASSERT_EQ(ret, 0);
>> +
>> + uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
>> + TEST_ASSERT(uffd != -1, "userfaultfd creation should succeed");
>> +
>> + struct uffdio_api uffdio_api = {
>> + .api = UFFD_API,
>> + .features = UFFD_FEATURE_MISSING_SHMEM,
>
> I think you mean UFFD_FEATURE_MINOR_SHMEM...?
>
> And I'm trying to think through what feature we should expose for
> guest_memfd; UFFD_FEATURE_MINOR_SHMEM already indicates support for
> shmem.
>
> We could have UFFD_FEATURE_MINOR_GUESTMEMFD, perhaps that's enough.
Yes, I will introduce UFFD_FEATURE_MINOR_GUEST_MEMFD in the next version.
>
> Or we could have UFFD_FEATURE_MINOR_GENERIC (or nothing at all!). Some
> VMAs might not support the minor mode, and the user will figure that
> out when UFFDIO_REGISTER fails.
My concern is the exact reason of the failure may not be apparent to the
caller in that case.
>
>> + };
>> + ret = ioctl(uffd, UFFDIO_API, &uffdio_api);
>> + TEST_ASSERT(ret != -1, "ioctl(UFFDIO_API) should succeed");
>> +
>> + mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>> + TEST_ASSERT(mem != MAP_FAILED, "mmap should succeed");
>> +
>> + mem_nofault = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>> + TEST_ASSERT(mem_nofault != MAP_FAILED, "mmap should succeed");
>> +
>> + uffd_reg.range.start = (unsigned long)mem;
>> + uffd_reg.range.len = total_size;
>> + uffd_reg.mode = UFFDIO_REGISTER_MODE_MINOR;
>> + ret = ioctl(uffd, UFFDIO_REGISTER, &uffd_reg);
>> + TEST_ASSERT(ret != -1, "ioctl(UFFDIO_REGISTER) should succeed");
>> +
>> + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
>> + offset, page_size);
>> + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed");
>> +
>> + fault_addr = mem + offset;
>> + args.addr = fault_addr;
>> +
>> + ret = pthread_create(&fault_thread, NULL, fault_thread_fn, &args);
>> + TEST_ASSERT(ret == 0, "pthread_create should succeed");
>> +
>> + ret = read(uffd, &msg, sizeof(msg));
>> + TEST_ASSERT(ret != -1, "read from userfaultfd should succeed");
>> + TEST_ASSERT(msg.event == UFFD_EVENT_PAGEFAULT, "event type should be pagefault");
>> + TEST_ASSERT((void *)(msg.arg.pagefault.address & ~(page_size - 1)) == fault_addr,
>> + "pagefault should occur at expected address");
>> +
>> + memcpy(mem_nofault + offset, buf + offset, page_size);
>> +
>> + uffd_cont.range.start = (unsigned long)fault_addr;
>> + uffd_cont.range.len = page_size;
>> + uffd_cont.mode = 0;
>> + ret = ioctl(uffd, UFFDIO_CONTINUE, &uffd_cont);
>> + TEST_ASSERT(ret != -1, "ioctl(UFFDIO_CONTINUE) should succeed");
>> +
>> + ret = pthread_join(fault_thread, NULL);
>> + TEST_ASSERT(ret == 0, "pthread_join should succeed");
>
> And maybe also:
>
> /* Right value? */
> TEST_ASSERT(args.value == *(char *)mem_nofault));
> /* No second fault? */
> TEST_ASSERT(args.value == *(char *)mem);
Good idea, thanks. I don't need the volatile anymore :)
>
>> +
>> + ret = munmap(mem_nofault, total_size);
>> + TEST_ASSERT(!ret, "munmap should succeed");
>> +
>> + ret = munmap(mem, total_size);
>> + TEST_ASSERT(!ret, "munmap should succeed");
>> + free(buf);
>> + close(uffd);
>> +}
>> +
>> unsigned long get_shared_type(void)
>> {
>> #ifdef __x86_64__
>> @@ -244,6 +335,9 @@ void test_vm_type(unsigned long type, bool is_shared)
>> test_fallocate(fd, page_size, total_size);
>> test_invalid_punch_hole(fd, page_size, total_size);
>>
>> + if (is_shared)
>> + test_uffd_missing(fd, page_size, total_size);
>> +
>> close(fd);
>> kvm_vm_release(vm);
>> }
>> --
>> 2.47.1
>>
^ permalink raw reply [flat|nested] 12+ messages in thread