* [PATCH 1/5] KVM: e500: retry if no memslot is found
From: Paolo Bonzini @ 2025-01-09 13:38 UTC (permalink / raw)
To: linux-kernel, kvm
Cc: oliver.upton, Will Deacon, Anup Patel, Andrew Jones, seanjc,
linuxppc-dev, regressions, stable
Avoid a NULL pointer dereference if the memslot table changes between the
exit and the call to kvmppc_e500_shadow_map().
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
arch/powerpc/kvm/e500_mmu_host.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index e5a145b578a4..732335444d68 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -349,6 +349,11 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
* pointer through from the first lookup.
*/
slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
+ if (!slot) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
hva = gfn_to_hva_memslot(slot, gfn);
if (tlbsel == 1) {
--
2.47.1
* Re: [PATCH 1/5] KVM: e500: retry if no memslot is found
From: Sean Christopherson @ 2025-01-09 19:00 UTC (permalink / raw)
To: Paolo Bonzini
Cc: linux-kernel, kvm, oliver.upton, Will Deacon, Anup Patel,
Andrew Jones, linuxppc-dev, regressions, stable
On Thu, Jan 09, 2025, Paolo Bonzini wrote:
> Avoid a NULL pointer dereference if the memslot table changes between the
> exit and the call to kvmppc_e500_shadow_map().
>
> Cc: stable@vger.kernel.org
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> arch/powerpc/kvm/e500_mmu_host.c | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
> index e5a145b578a4..732335444d68 100644
> --- a/arch/powerpc/kvm/e500_mmu_host.c
> +++ b/arch/powerpc/kvm/e500_mmu_host.c
> @@ -349,6 +349,11 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
> * pointer through from the first lookup.
> */
> slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
> + if (!slot) {
> + ret = -EAGAIN;
> + goto out;
> + }
This is unnecessary, __gfn_to_hva_many() checks for a NULL @slot.
static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return KVM_HVA_ERR_BAD;
if (memslot_is_readonly(slot) && write)
return KVM_HVA_ERR_RO_BAD;
if (nr_pages)
*nr_pages = slot->npages - (gfn - slot->base_gfn);
return __gfn_to_hva_memslot(slot, gfn);
}
unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
gfn_t gfn)
{
return gfn_to_hva_many(slot, gfn, NULL);
}
Not checking the return value and doing a VMA lookup on hva=-1 when tlbsel==1 is
gross, but it should be functionally safe.
Returning -EAGAIN is nicer (kvmppc_e500_shadow_map() will inevitably return -EINVAL),
but in practice it doesn't matter because all callers ultimately ignore the return
value.
Since there's a ratelimited printk that yells if there's no slot, it's probably
best to let sleeping dogs lie.
if (likely(!pfnmap)) {
tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, NULL, &page);
if (is_error_noslot_pfn(pfn)) {
if (printk_ratelimit())
pr_err("%s: real page not found for gfn %lx\n",
__func__, (long)gfn);
return -EINVAL;
}
> +
> hva = gfn_to_hva_memslot(slot, gfn);
>
> if (tlbsel == 1) {
> --
> 2.47.1
>
* [PATCH 2/5] KVM: e500: use shadow TLB entry as witness for writability
From: Paolo Bonzini @ 2025-01-09 13:38 UTC (permalink / raw)
To: linux-kernel, kvm
Cc: oliver.upton, Will Deacon, Anup Patel, Andrew Jones, seanjc,
linuxppc-dev, regressions
kvmppc_e500_ref_setup is returning whether the guest TLB entry is writable,
which is than passed to kvm_release_faultin_page. This makes little sense
for two reasons: first, because the function sets up the private data for
the page and the return value feels like it has been bolted on the side;
second, because what really matters is whether the _shadow_ TLB entry is
writable. If it is not writable, the page can be released as non-dirty.
Shift from using tlbe_is_writable(gtlbe) to doing the same check on
the shadow TLB entry.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
arch/powerpc/kvm/e500_mmu_host.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 732335444d68..06e23c625be0 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -242,7 +242,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
}
-static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref,
+static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
struct kvm_book3e_206_tlb_entry *gtlbe,
kvm_pfn_t pfn, unsigned int wimg)
{
@@ -251,8 +251,6 @@ static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref,
/* Use guest supplied MAS2_G and MAS2_E */
ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;
-
- return tlbe_is_writable(gtlbe);
}
static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
@@ -493,10 +491,11 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
goto out;
}
}
- writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
+ kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
ref, gvaddr, stlbe);
+ writable = tlbe_is_writable(stlbe);
/* Clear i-cache for new pages */
kvmppc_mmu_flush_icache(pfn);
--
2.47.1
* Re: [PATCH 2/5] KVM: e500: use shadow TLB entry as witness for writability
From: Sean Christopherson @ 2025-01-09 19:01 UTC (permalink / raw)
To: Paolo Bonzini
Cc: linux-kernel, kvm, oliver.upton, Will Deacon, Anup Patel,
Andrew Jones, linuxppc-dev, regressions
On Thu, Jan 09, 2025, Paolo Bonzini wrote:
> kvmppc_e500_ref_setup is returning whether the guest TLB entry is writable,
> which is than passed to kvm_release_faultin_page. This makes little sense
s/than/then
> for two reasons: first, because the function sets up the private data for
> the page and the return value feels like it has been bolted on the side;
> second, because what really matters is whether the _shadow_ TLB entry is
> writable. If it is not writable, the page can be released as non-dirty.
> Shift from using tlbe_is_writable(gtlbe) to doing the same check on
> the shadow TLB entry.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> arch/powerpc/kvm/e500_mmu_host.c | 7 +++----
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
> index 732335444d68..06e23c625be0 100644
> --- a/arch/powerpc/kvm/e500_mmu_host.c
> +++ b/arch/powerpc/kvm/e500_mmu_host.c
> @@ -242,7 +242,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
> return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
> }
>
> -static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref,
> +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
> struct kvm_book3e_206_tlb_entry *gtlbe,
> kvm_pfn_t pfn, unsigned int wimg)
> {
> @@ -251,8 +251,6 @@ static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref,
>
> /* Use guest supplied MAS2_G and MAS2_E */
> ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;
> -
> - return tlbe_is_writable(gtlbe);
> }
>
> static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
> @@ -493,10 +491,11 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
> goto out;
> }
> }
> - writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
>
> + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
> kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
> ref, gvaddr, stlbe);
> + writable = tlbe_is_writable(stlbe);
>
> /* Clear i-cache for new pages */
> kvmppc_mmu_flush_icache(pfn);
> --
> 2.47.1
>
* [PATCH 3/5] KVM: e500: track host-writability of pages
From: Paolo Bonzini @ 2025-01-09 13:38 UTC (permalink / raw)
To: linux-kernel, kvm
Cc: oliver.upton, Will Deacon, Anup Patel, Andrew Jones, seanjc,
linuxppc-dev, regressions
Add the possibility of marking a page so that the UW and SW bits are
force-cleared. This is stored in the tlbe_ref private data (a new E500_TLB_WRITABLE flag) so that it persists
across multiple calls to kvmppc_e500_setup_stlbe.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
arch/powerpc/kvm/e500.h | 2 ++
arch/powerpc/kvm/e500_mmu_host.c | 15 +++++++++++----
2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 6d0d329cbb35..f9acf866c709 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -34,6 +34,8 @@ enum vcpu_ftr {
#define E500_TLB_BITMAP (1 << 30)
/* TLB1 entry is mapped by host TLB0 */
#define E500_TLB_TLB0 (1 << 29)
+/* entry is writable on the host */
+#define E500_TLB_WRITABLE (1 << 28)
/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */
#define E500_TLB_MAS2_ATTR (0x7f)
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 06e23c625be0..e332a10fff00 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -45,11 +45,14 @@ static inline unsigned int tlb1_max_shadow_size(void)
return host_tlb_params[1].entries - tlbcam_index - 1;
}
-static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
+static inline u32 e500_shadow_mas3_attrib(u32 mas3, bool writable, int usermode)
{
/* Mask off reserved bits. */
mas3 &= MAS3_ATTRIB_MASK;
+ if (!writable)
+ mas3 &= ~(MAS3_UW|MAS3_SW);
+
#ifndef CONFIG_KVM_BOOKE_HV
if (!usermode) {
/* Guest is in supervisor mode,
@@ -244,10 +247,13 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
struct kvm_book3e_206_tlb_entry *gtlbe,
- kvm_pfn_t pfn, unsigned int wimg)
+ kvm_pfn_t pfn, unsigned int wimg,
+ bool writable)
{
ref->pfn = pfn;
ref->flags = E500_TLB_VALID;
+ if (writable)
+ ref->flags |= E500_TLB_WRITABLE;
/* Use guest supplied MAS2_G and MAS2_E */
ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;
@@ -303,6 +309,7 @@ static void kvmppc_e500_setup_stlbe(
{
kvm_pfn_t pfn = ref->pfn;
u32 pr = vcpu->arch.shared->msr & MSR_PR;
+ bool writable = !!(ref->flags & E500_TLB_WRITABLE);
BUG_ON(!(ref->flags & E500_TLB_VALID));
@@ -310,7 +317,7 @@ static void kvmppc_e500_setup_stlbe(
stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR);
stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
- e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
+ e500_shadow_mas3_attrib(gtlbe->mas7_3, writable, pr);
}
static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -492,7 +499,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
}
}
- kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
+ kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, true);
kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
ref, gvaddr, stlbe);
writable = tlbe_is_writable(stlbe);
--
2.47.1
* [PATCH 4/5] KVM: e500: map readonly host pages for read
From: Paolo Bonzini @ 2025-01-09 13:38 UTC (permalink / raw)
To: linux-kernel, kvm
Cc: oliver.upton, Will Deacon, Anup Patel, Andrew Jones, seanjc,
linuxppc-dev, regressions, Christian Zigotzky
The new __kvm_faultin_pfn() function is upset by the fact that e500 KVM
ignores host page permissions - __kvm_faultin_pfn() requires a "writable"
outgoing argument, but e500 KVM is nonchalantly passing NULL.
If the host page permissions do not include writability, the shadow
TLB entry is forcibly mapped read-only.
Reported-by: Christian Zigotzky <chzigotzky@xenosoft.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
arch/powerpc/kvm/e500_mmu_host.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index e332a10fff00..7752b7f24c51 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -379,6 +379,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
unsigned long slot_start, slot_end;
pfnmap = 1;
+ writable = vma->vm_flags & VM_WRITE;
start = vma->vm_pgoff;
end = start +
@@ -454,7 +455,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
if (likely(!pfnmap)) {
tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
- pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, NULL, &page);
+ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page);
if (is_error_noslot_pfn(pfn)) {
if (printk_ratelimit())
pr_err("%s: real page not found for gfn %lx\n",
@@ -499,7 +500,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
}
}
- kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, true);
+ kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable);
kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
ref, gvaddr, stlbe);
writable = tlbe_is_writable(stlbe);
--
2.47.1
* [PATCH 5/5] KVM: e500: perform hugepage check after looking up the PFN
From: Paolo Bonzini @ 2025-01-09 13:38 UTC (permalink / raw)
To: linux-kernel, kvm
Cc: oliver.upton, Will Deacon, Anup Patel, Andrew Jones, seanjc,
linuxppc-dev, regressions
e500 KVM tries to bypass __kvm_faultin_pfn() in order to map VM_PFNMAP
VMAs as huge pages. This is a Bad Idea because VM_PFNMAP VMAs could
become noncontiguous as a result of calls to remap_pfn_range().
Instead, use the already existing host PTE lookup to retrieve a
valid host-side mapping level after __kvm_faultin_pfn() has
returned. Then find the largest size that will satisfy the
guest's request while staying within a single host PTE.
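To make the size selection concrete, here is a worked example with made-up numbers (assuming a 4 KiB PAGE_SIZE host; purely illustrative, not taken from a real trace): suppose find_linux_pte() reports a 2 MiB host mapping (psize = 21) while the guest TLB1 entry asks for a 4 MiB page (tsize = 12):
	tsize = min(psize - PAGE_SHIFT + BOOK3E_PAGESZ_4K, tsize);  /* min(21 - 12 + 2, 12) = 11 */
	tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);                  /* e500 has no odd tsizes -> 10, i.e. 1 MiB */
The loop that follows then only has to verify that gfn and pfn are mutually aligned to 1 MiB and that the resulting range stays inside the memslot.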
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
arch/powerpc/kvm/e500_mmu_host.c | 180 ++++++++++++-------------------
1 file changed, 70 insertions(+), 110 deletions(-)
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 7752b7f24c51..0457bbc2526f 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -326,15 +326,14 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
struct tlbe_ref *ref)
{
struct kvm_memory_slot *slot;
- unsigned long pfn = 0; /* silence GCC warning */
+ unsigned int psize;
+ unsigned long pfn;
struct page *page = NULL;
unsigned long hva;
- int pfnmap = 0;
int tsize = BOOK3E_PAGESZ_4K;
int ret = 0;
unsigned long mmu_seq;
struct kvm *kvm = vcpu_e500->vcpu.kvm;
- unsigned long tsize_pages = 0;
pte_t *ptep;
unsigned int wimg = 0;
pgd_t *pgdir;
@@ -361,111 +360,12 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
hva = gfn_to_hva_memslot(slot, gfn);
- if (tlbsel == 1) {
- struct vm_area_struct *vma;
- mmap_read_lock(kvm->mm);
-
- vma = find_vma(kvm->mm, hva);
- if (vma && hva >= vma->vm_start &&
- (vma->vm_flags & VM_PFNMAP)) {
- /*
- * This VMA is a physically contiguous region (e.g.
- * /dev/mem) that bypasses normal Linux page
- * management. Find the overlap between the
- * vma and the memslot.
- */
-
- unsigned long start, end;
- unsigned long slot_start, slot_end;
-
- pfnmap = 1;
- writable = vma->vm_flags & VM_WRITE;
-
- start = vma->vm_pgoff;
- end = start +
- vma_pages(vma);
-
- pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
-
- slot_start = pfn - (gfn - slot->base_gfn);
- slot_end = slot_start + slot->npages;
-
- if (start < slot_start)
- start = slot_start;
- if (end > slot_end)
- end = slot_end;
-
- tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
- MAS1_TSIZE_SHIFT;
-
- /*
- * e500 doesn't implement the lowest tsize bit,
- * or 1K pages.
- */
- tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
-
- /*
- * Now find the largest tsize (up to what the guest
- * requested) that will cover gfn, stay within the
- * range, and for which gfn and pfn are mutually
- * aligned.
- */
-
- for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
- unsigned long gfn_start, gfn_end;
- tsize_pages = 1UL << (tsize - 2);
-
- gfn_start = gfn & ~(tsize_pages - 1);
- gfn_end = gfn_start + tsize_pages;
-
- if (gfn_start + pfn - gfn < start)
- continue;
- if (gfn_end + pfn - gfn > end)
- continue;
- if ((gfn & (tsize_pages - 1)) !=
- (pfn & (tsize_pages - 1)))
- continue;
-
- gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
- pfn &= ~(tsize_pages - 1);
- break;
- }
- } else if (vma && hva >= vma->vm_start &&
- is_vm_hugetlb_page(vma)) {
- unsigned long psize = vma_kernel_pagesize(vma);
-
- tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
- MAS1_TSIZE_SHIFT;
-
- /*
- * Take the largest page size that satisfies both host
- * and guest mapping
- */
- tsize = min(__ilog2(psize) - 10, tsize);
-
- /*
- * e500 doesn't implement the lowest tsize bit,
- * or 1K pages.
- */
- tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
- }
-
- mmap_read_unlock(kvm->mm);
- }
-
- if (likely(!pfnmap)) {
- tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
- pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page);
- if (is_error_noslot_pfn(pfn)) {
- if (printk_ratelimit())
- pr_err("%s: real page not found for gfn %lx\n",
- __func__, (long)gfn);
- return -EINVAL;
- }
-
- /* Align guest and physical address to page map boundaries */
- pfn &= ~(tsize_pages - 1);
- gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page);
+ if (is_error_noslot_pfn(pfn)) {
+ if (printk_ratelimit())
+ pr_err("%s: real page not found for gfn %lx\n",
+ __func__, (long)gfn);
+ return -EINVAL;
}
spin_lock(&kvm->mmu_lock);
@@ -483,7 +383,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
* can't run hence pfn won't change.
*/
local_irq_save(flags);
- ptep = find_linux_pte(pgdir, hva, NULL, NULL);
+ ptep = find_linux_pte(pgdir, hva, NULL, &psize);
if (ptep) {
pte_t pte = READ_ONCE(*ptep);
@@ -500,6 +400,66 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
}
}
+ if (psize && tlbsel == 1) {
+ unsigned long psize_pages, tsize_pages;
+ unsigned long start, end;
+ unsigned long slot_start, slot_end;
+
+ psize_pages = 1UL << (psize - PAGE_SHIFT);
+ start = pfn & ~(psize_pages - 1);
+ end = start + psize_pages;
+
+ slot_start = pfn - (gfn - slot->base_gfn);
+ slot_end = slot_start + slot->npages;
+
+ if (start < slot_start)
+ start = slot_start;
+ if (end > slot_end)
+ end = slot_end;
+
+ tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+ MAS1_TSIZE_SHIFT;
+
+ /*
+ * Any page size that doesn't satisfy the host mapping
+ * will fail the start and end tests.
+ */
+ tsize = min(psize - PAGE_SHIFT + BOOK3E_PAGESZ_4K, tsize);
+
+ /*
+ * e500 doesn't implement the lowest tsize bit,
+ * or 1K pages.
+ */
+ tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+
+ /*
+ * Now find the largest tsize (up to what the guest
+ * requested) that will cover gfn, stay within the
+ * range, and for which gfn and pfn are mutually
+ * aligned.
+ */
+
+ for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
+ unsigned long gfn_start, gfn_end;
+ tsize_pages = 1UL << (tsize - 2);
+
+ gfn_start = gfn & ~(tsize_pages - 1);
+ gfn_end = gfn_start + tsize_pages;
+
+ if (gfn_start + pfn - gfn < start)
+ continue;
+ if (gfn_end + pfn - gfn > end)
+ continue;
+ if ((gfn & (tsize_pages - 1)) !=
+ (pfn & (tsize_pages - 1)))
+ continue;
+
+ gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+ pfn &= ~(tsize_pages - 1);
+ break;
+ }
+ }
+
kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable);
kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
ref, gvaddr, stlbe);
--
2.47.1
* Re: [PATCH 5/5] KVM: e500: perform hugepage check after looking up the PFN
From: Sean Christopherson @ 2025-01-09 19:07 UTC (permalink / raw)
To: Paolo Bonzini
Cc: linux-kernel, kvm, oliver.upton, Will Deacon, Anup Patel,
Andrew Jones, linuxppc-dev, regressions
On Thu, Jan 09, 2025, Paolo Bonzini wrote:
> @@ -483,7 +383,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
> * can't run hence pfn won't change.
> */
> local_irq_save(flags);
> - ptep = find_linux_pte(pgdir, hva, NULL, NULL);
> + ptep = find_linux_pte(pgdir, hva, NULL, &psize);
> if (ptep) {
> pte_t pte = READ_ONCE(*ptep);
LOL, this code is such a mess. If no ptep is found, IRQs are left disabled. The
bug has existed since commit 691e95fd7396 ("powerpc/mm/thp: Make page table walk
safe against thp split/collapse"), i.e. we didn't accidentally delete a
local_irq_restore() at some point.
@@ -468,14 +469,23 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
pgdir = vcpu_e500->vcpu.arch.pgdir;
+ /*
+ * We are just looking at the wimg bits, so we don't
+ * care much about the trans splitting bit.
+ * We are holding kvm->mmu_lock so a notifier invalidate
+ * can't run hence pfn won't change.
+ */
+ local_irq_save(flags);
ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL);
if (ptep) {
pte_t pte = READ_ONCE(*ptep);
- if (pte_present(pte))
+ if (pte_present(pte)) {
wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
MAS2_WIMGE_MASK;
- else {
+ local_irq_restore(flags);
+ } else {
+ local_irq_restore(flags);
pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
__func__, (long)gfn, pfn);
ret = -EINVAL;
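One possible shape for a fix (an untested sketch based only on the hunks quoted above; the !ptep error handling and the reliance on the function's existing out: path are assumptions, not something posted in this thread) is to do the restore exactly once, after the lookup, so the !ptep case no longer returns with IRQs disabled:
	local_irq_save(flags);
	ptep = find_linux_pte(pgdir, hva, NULL, &psize);
	if (ptep) {
		pte_t pte = READ_ONCE(*ptep);

		if (pte_present(pte)) {
			wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
				MAS2_WIMGE_MASK;
		} else {
			pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
					__func__, (long)gfn, pfn);
			ret = -EINVAL;
		}
	} else {
		/* no host PTE: fail the fault instead of leaking disabled IRQs */
		ret = -EINVAL;
	}
	local_irq_restore(flags);

	if (ret)
		goto out;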
* [PATCH 5/5] KVM: e500: perform hugepage check after looking up the PFN
From: Paolo Bonzini @ 2025-01-12 9:55 UTC (permalink / raw)
To: linux-kernel, kvm; +Cc: seanjc, linuxppc-dev, regressions, Christian Zigotzky
e500 KVM tries to bypass __kvm_faultin_pfn() in order to map VM_PFNMAP
VMAs as huge pages. This is a Bad Idea because VM_PFNMAP VMAs could
become noncontiguous as a result of calls to remap_pfn_range().
Instead, use the already existing host PTE lookup to retrieve a
valid host-side mapping level after __kvm_faultin_pfn() has
returned. Then find the largest size that will satisfy the
guest's request while staying within a single host PTE.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
arch/powerpc/kvm/e500_mmu_host.c | 178 ++++++++++++-------------------
1 file changed, 69 insertions(+), 109 deletions(-)
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index b38679e5821b..06caf8bbbe2b 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -326,15 +326,14 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
struct tlbe_ref *ref)
{
struct kvm_memory_slot *slot;
- unsigned long pfn = 0; /* silence GCC warning */
+ unsigned int psize;
+ unsigned long pfn;
struct page *page = NULL;
unsigned long hva;
- int pfnmap = 0;
int tsize = BOOK3E_PAGESZ_4K;
int ret = 0;
unsigned long mmu_seq;
struct kvm *kvm = vcpu_e500->vcpu.kvm;
- unsigned long tsize_pages = 0;
pte_t *ptep;
unsigned int wimg = 0;
pgd_t *pgdir;
@@ -356,111 +355,12 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
hva = gfn_to_hva_memslot(slot, gfn);
- if (tlbsel == 1) {
- struct vm_area_struct *vma;
- mmap_read_lock(kvm->mm);
-
- vma = find_vma(kvm->mm, hva);
- if (vma && hva >= vma->vm_start &&
- (vma->vm_flags & VM_PFNMAP)) {
- /*
- * This VMA is a physically contiguous region (e.g.
- * /dev/mem) that bypasses normal Linux page
- * management. Find the overlap between the
- * vma and the memslot.
- */
-
- unsigned long start, end;
- unsigned long slot_start, slot_end;
-
- pfnmap = 1;
- writable = vma->vm_flags & VM_WRITE;
-
- start = vma->vm_pgoff;
- end = start +
- vma_pages(vma);
-
- pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
-
- slot_start = pfn - (gfn - slot->base_gfn);
- slot_end = slot_start + slot->npages;
-
- if (start < slot_start)
- start = slot_start;
- if (end > slot_end)
- end = slot_end;
-
- tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
- MAS1_TSIZE_SHIFT;
-
- /*
- * e500 doesn't implement the lowest tsize bit,
- * or 1K pages.
- */
- tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
-
- /*
- * Now find the largest tsize (up to what the guest
- * requested) that will cover gfn, stay within the
- * range, and for which gfn and pfn are mutually
- * aligned.
- */
-
- for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
- unsigned long gfn_start, gfn_end;
- tsize_pages = 1UL << (tsize - 2);
-
- gfn_start = gfn & ~(tsize_pages - 1);
- gfn_end = gfn_start + tsize_pages;
-
- if (gfn_start + pfn - gfn < start)
- continue;
- if (gfn_end + pfn - gfn > end)
- continue;
- if ((gfn & (tsize_pages - 1)) !=
- (pfn & (tsize_pages - 1)))
- continue;
-
- gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
- pfn &= ~(tsize_pages - 1);
- break;
- }
- } else if (vma && hva >= vma->vm_start &&
- is_vm_hugetlb_page(vma)) {
- unsigned long psize = vma_kernel_pagesize(vma);
-
- tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
- MAS1_TSIZE_SHIFT;
-
- /*
- * Take the largest page size that satisfies both host
- * and guest mapping
- */
- tsize = min(__ilog2(psize) - 10, tsize);
-
- /*
- * e500 doesn't implement the lowest tsize bit,
- * or 1K pages.
- */
- tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
- }
-
- mmap_read_unlock(kvm->mm);
- }
-
- if (likely(!pfnmap)) {
- tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
- pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page);
- if (is_error_noslot_pfn(pfn)) {
- if (printk_ratelimit())
- pr_err("%s: real page not found for gfn %lx\n",
- __func__, (long)gfn);
- return -EINVAL;
- }
-
- /* Align guest and physical address to page map boundaries */
- pfn &= ~(tsize_pages - 1);
- gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &page);
+ if (is_error_noslot_pfn(pfn)) {
+ if (printk_ratelimit())
+ pr_err("%s: real page not found for gfn %lx\n",
+ __func__, (long)gfn);
+ return -EINVAL;
}
spin_lock(&kvm->mmu_lock);
@@ -478,7 +378,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
* can't run hence pfn won't change.
*/
local_irq_save(flags);
- ptep = find_linux_pte(pgdir, hva, NULL, NULL);
+ ptep = find_linux_pte(pgdir, hva, NULL, &psize);
if (ptep) {
pte_t pte = READ_ONCE(*ptep);
@@ -495,6 +395,66 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
}
local_irq_restore(flags);
+ if (psize && tlbsel == 1) {
+ unsigned long psize_pages, tsize_pages;
+ unsigned long start, end;
+ unsigned long slot_start, slot_end;
+
+ psize_pages = 1UL << (psize - PAGE_SHIFT);
+ start = pfn & ~(psize_pages - 1);
+ end = start + psize_pages;
+
+ slot_start = pfn - (gfn - slot->base_gfn);
+ slot_end = slot_start + slot->npages;
+
+ if (start < slot_start)
+ start = slot_start;
+ if (end > slot_end)
+ end = slot_end;
+
+ tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+ MAS1_TSIZE_SHIFT;
+
+ /*
+ * Any page size that doesn't satisfy the host mapping
+ * will fail the start and end tests.
+ */
+ tsize = min(psize - PAGE_SHIFT + BOOK3E_PAGESZ_4K, tsize);
+
+ /*
+ * e500 doesn't implement the lowest tsize bit,
+ * or 1K pages.
+ */
+ tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+
+ /*
+ * Now find the largest tsize (up to what the guest
+ * requested) that will cover gfn, stay within the
+ * range, and for which gfn and pfn are mutually
+ * aligned.
+ */
+
+ for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
+ unsigned long gfn_start, gfn_end;
+ tsize_pages = 1UL << (tsize - 2);
+
+ gfn_start = gfn & ~(tsize_pages - 1);
+ gfn_end = gfn_start + tsize_pages;
+
+ if (gfn_start + pfn - gfn < start)
+ continue;
+ if (gfn_end + pfn - gfn > end)
+ continue;
+ if ((gfn & (tsize_pages - 1)) !=
+ (pfn & (tsize_pages - 1)))
+ continue;
+
+ gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+ pfn &= ~(tsize_pages - 1);
+ break;
+ }
+ }
+
kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg, writable);
kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
ref, gvaddr, stlbe);
--
2.47.1