* [PATCH] powerpc/kvm: Handle transparent hugepage in KVM
From: Aneesh Kumar K.V @ 2013-06-19 6:44 UTC
To: benh, paulus; +Cc: linuxppc-dev, Aneesh Kumar K.V
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
We can find PTEs that are in the middle of a THP split while walking
page tables. Return a none PTE in that case.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/kvm_book3s_64.h | 51 ++++++++++++++++++--------------
arch/powerpc/kvm/book3s_64_mmu_hv.c | 7 +++--
arch/powerpc/kvm/book3s_hv_rm_mmu.c | 4 +--
3 files changed, 34 insertions(+), 28 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9c1ff33..ce20f7e 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -162,33 +162,40 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
* Lock and read a linux PTE. If it's present and writable, atomically
* set dirty and referenced bits and return the PTE, otherwise return 0.
*/
-static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
+ unsigned int hugepage)
{
- pte_t pte, tmp;
-
- /* wait until _PAGE_BUSY is clear then set it atomically */
- __asm__ __volatile__ (
- "1: ldarx %0,0,%3\n"
- " andi. %1,%0,%4\n"
- " bne- 1b\n"
- " ori %1,%0,%4\n"
- " stdcx. %1,0,%3\n"
- " bne- 1b"
- : "=&r" (pte), "=&r" (tmp), "=m" (*p)
- : "r" (p), "i" (_PAGE_BUSY)
- : "cc");
-
- if (pte_present(pte)) {
- pte = pte_mkyoung(pte);
- if (writing && pte_write(pte))
- pte = pte_mkdirty(pte);
- }
+ pte_t old_pte, new_pte = __pte(0);
+repeat:
+ do {
+ old_pte = pte_val(*ptep);
+ /*
+ * wait until _PAGE_BUSY is clear then set it atomically
+ */
+ if (unlikely(old_pte & _PAGE_BUSY))
+ goto repeat;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* If hugepage and is trans splitting return None */
+ if (unlikely(hugepage &&
+ pmd_trans_splitting(pte_pmd(old_pte))))
+ return __pte(0);
+#endif
- *p = pte; /* clears _PAGE_BUSY */
+ /* If pte is not present return None */
+ if (unlikely(!(old_pte & _PAGE_PRESENT)))
+ return __pte(0);
- return pte;
+ new_pte = pte_mkyoung(old_pte);
+ if (writing && pte_write(old_pte))
+ new_pte = pte_mkdirty(new_pte);
+
+ } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
+ old_pte, new_pte));
+ return new_pte;
}
+
/* Return HPTE cache control bits corresponding to Linux pte bits */
static inline unsigned long hpte_cache_bits(unsigned long pte_val)
{
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 5880dfb..e1a9415 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -675,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
/* if the guest wants write access, see if that is OK */
if (!writing && hpte_is_writable(r)) {
+ unsigned int shift;
pte_t *ptep, pte;
/*
@@ -683,9 +684,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
rcu_read_lock_sched();
ptep = find_linux_pte_or_hugepte(current->mm->pgd,
- hva, NULL);
- if (ptep && pte_present(*ptep)) {
- pte = kvmppc_read_update_linux_pte(ptep, 1);
+ hva, &shift);
+ if (ptep) {
+ pte = kvmppc_read_update_linux_pte(ptep, 1, shift);
if (pte_write(pte))
write_ok = 1;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index dcf892d..39ae723 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -150,9 +150,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
*pte_sizep = PAGE_SIZE;
if (ps > *pte_sizep)
return __pte(0);
- if (!pte_present(*ptep))
- return __pte(0);
- return kvmppc_read_update_linux_pte(ptep, writing);
+ return kvmppc_read_update_linux_pte(ptep, writing, shift);
}
static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
--
1.8.1.2
* Re: [PATCH] powerpc/kvm: Handle transparent hugepage in KVM
From: Michael Neuling @ 2013-06-19 7:11 UTC
To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev
Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>
> We can find PTEs that are in the middle of a THP split while walking
> page tables. Return a none PTE in that case.
Can you expand on this some more, please? There are a lot of details
below, like removing an ldarx/stdcx loop, that should be better
described here.
>
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
> arch/powerpc/include/asm/kvm_book3s_64.h | 51 ++++++++++++++++++--------------
> arch/powerpc/kvm/book3s_64_mmu_hv.c | 7 +++--
> arch/powerpc/kvm/book3s_hv_rm_mmu.c | 4 +--
> 3 files changed, 34 insertions(+), 28 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 9c1ff33..ce20f7e 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -162,33 +162,40 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
> * Lock and read a linux PTE. If it's present and writable, atomically
> * set dirty and referenced bits and return the PTE, otherwise return 0.
Is this comment still valid now that the ldarx/stdcx loop is gone?
> */
> -static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
> +static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
> + unsigned int hugepage)
> {
> - pte_t pte, tmp;
> -
> - /* wait until _PAGE_BUSY is clear then set it atomically */
> - __asm__ __volatile__ (
> - "1: ldarx %0,0,%3\n"
> - " andi. %1,%0,%4\n"
> - " bne- 1b\n"
> - " ori %1,%0,%4\n"
> - " stdcx. %1,0,%3\n"
> - " bne- 1b"
> - : "=&r" (pte), "=&r" (tmp), "=m" (*p)
> - : "r" (p), "i" (_PAGE_BUSY)
> - : "cc");
> -
> - if (pte_present(pte)) {
> - pte = pte_mkyoung(pte);
> - if (writing && pte_write(pte))
> - pte = pte_mkdirty(pte);
> - }
> + pte_t old_pte, new_pte = __pte(0);
> +repeat:
> + do {
> + old_pte = pte_val(*ptep);
> + /*
> + * wait until _PAGE_BUSY is clear then set it atomically
> + */
> + if (unlikely(old_pte & _PAGE_BUSY))
> + goto repeat;
continue here? Please don't create looping primitives.
> +
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> + /* If hugepage and is trans splitting return None */
> + if (unlikely(hugepage &&
> + pmd_trans_splitting(pte_pmd(old_pte))))
Comment looks much like the code... seems redundant.
> + return __pte(0);
> +#endif
>
> - *p = pte; /* clears _PAGE_BUSY */
> + /* If pte is not present return None */
> + if (unlikely(!(old_pte & _PAGE_PRESENT)))
> + return __pte(0);
>
> - return pte;
> + new_pte = pte_mkyoung(old_pte);
> + if (writing && pte_write(old_pte))
> + new_pte = pte_mkdirty(new_pte);
> +
> + } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
> + old_pte, new_pte));
> + return new_pte;
> }
>
> +
Whitespace
> /* Return HPTE cache control bits corresponding to Linux pte bits */
> static inline unsigned long hpte_cache_bits(unsigned long pte_val)
> {
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 5880dfb..e1a9415 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -675,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> }
> /* if the guest wants write access, see if that is OK */
> if (!writing && hpte_is_writable(r)) {
> + unsigned int shift;
> pte_t *ptep, pte;
>
> /*
> @@ -683,9 +684,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> */
> rcu_read_lock_sched();
> ptep = find_linux_pte_or_hugepte(current->mm->pgd,
> - hva, NULL);
> - if (ptep && pte_present(*ptep)) {
> - pte = kvmppc_read_update_linux_pte(ptep, 1);
> + hva, &shift);
> + if (ptep) {
> + pte = kvmppc_read_update_linux_pte(ptep, 1, shift);
> if (pte_write(pte))
> write_ok = 1;
> }
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> index dcf892d..39ae723 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -150,9 +150,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
> *pte_sizep = PAGE_SIZE;
> if (ps > *pte_sizep)
> return __pte(0);
> - if (!pte_present(*ptep))
> - return __pte(0);
> - return kvmppc_read_update_linux_pte(ptep, writing);
> + return kvmppc_read_update_linux_pte(ptep, writing, shift);
'shift' goes into the new 'hugepage' parameter? Doesn't seem logical?
Can we harmonise the name to make it less confusing?
Mikey
> }
>
> static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
> --
> 1.8.1.2
>
* Re: [PATCH] powerpc/kvm: Handle transparent hugepage in KVM
From: Aneesh Kumar K.V @ 2013-06-19 12:30 UTC
To: Michael Neuling; +Cc: paulus, linuxppc-dev
Michael Neuling <mikey@neuling.org> writes:
> Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> wrote:
>
>> From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
>>
>> We can find PTEs that are in the middle of a THP split while walking
>> page tables. Return a none PTE in that case.
>
> Can you expand on this some more, please? There are a lot of details
> below, like removing an ldarx/stdcx loop, that should be better
> described here.
>
>>
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
>> ---
>> arch/powerpc/include/asm/kvm_book3s_64.h | 51 ++++++++++++++++++--------------
>> arch/powerpc/kvm/book3s_64_mmu_hv.c | 7 +++--
>> arch/powerpc/kvm/book3s_hv_rm_mmu.c | 4 +--
>> 3 files changed, 34 insertions(+), 28 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
>> index 9c1ff33..ce20f7e 100644
>> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
>> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
>> @@ -162,33 +162,40 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
>> * Lock and read a linux PTE. If it's present and writable, atomically
>> * set dirty and referenced bits and return the PTE, otherwise return 0.
>
> Is this comment still valid now that the ldarx/stdcx loop is gone?
In a way, yes. Instead of lock-and-read as before, it is now done via
cmpxchg, which still uses ldarx/stdcx underneath.
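For reference, __cmpxchg_u64 on powerpc is itself built from an
ldarx/stdcx loop. A simplified sketch of its shape (illustrative only;
the real kernel version also has memory barriers around the loop):

	static unsigned long cmpxchg_u64_sketch(volatile unsigned long *p,
						unsigned long old,
						unsigned long new)
	{
		unsigned long prev;

		__asm__ __volatile__ (
		"1:	ldarx	%0,0,%2\n"	/* load and reserve */
		"	cmpd	0,%0,%3\n"	/* still the expected value? */
		"	bne-	2f\n"		/* no: bail, return what we saw */
		"	stdcx.	%4,0,%2\n"	/* yes: store conditionally */
		"	bne-	1b\n"		/* lost reservation: retry */
		"2:"
		: "=&r" (prev), "+m" (*p)
		: "r" (p), "r" (old), "r" (new)
		: "cc", "memory");

		return prev;
	}

So the locked update is still there, just expressed as a compare-and-swap
instead of the open-coded busy-bit loop.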
>
>> */
>> -static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing)
>> +static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
>> + unsigned int hugepage)
>> {
>> - pte_t pte, tmp;
>> -
>> - /* wait until _PAGE_BUSY is clear then set it atomically */
>> - __asm__ __volatile__ (
>> - "1: ldarx %0,0,%3\n"
>> - " andi. %1,%0,%4\n"
>> - " bne- 1b\n"
>> - " ori %1,%0,%4\n"
>> - " stdcx. %1,0,%3\n"
>> - " bne- 1b"
>> - : "=&r" (pte), "=&r" (tmp), "=m" (*p)
>> - : "r" (p), "i" (_PAGE_BUSY)
>> - : "cc");
>> -
>> - if (pte_present(pte)) {
>> - pte = pte_mkyoung(pte);
>> - if (writing && pte_write(pte))
>> - pte = pte_mkdirty(pte);
>> - }
>> + pte_t old_pte, new_pte = __pte(0);
>> +repeat:
>> + do {
>> + old_pte = pte_val(*ptep);
>> + /*
>> + * wait until _PAGE_BUSY is clear then set it atomically
>> + */
>> + if (unlikely(old_pte & _PAGE_BUSY))
>> + goto repeat;
>
> continue here? Please don't create looping primitives.
No, that would be wrong (I did that in an earlier version :). We really
don't want the cmpxchg below to run if we find _PAGE_BUSY.
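To spell it out: in a do { } while () loop, continue jumps straight to
the loop condition, and the condition here is the cmpxchg itself:

	do {
		old_pte = pte_val(*ptep);
		if (unlikely(old_pte & _PAGE_BUSY))
			continue;	/* jumps to the while test below... */
		...
	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
					  old_pte, new_pte));
		/* ...which would cmpxchg a busy pte with a stale new_pte */

The goto restarts from the read of *ptep without ever touching the pte.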
>
>> +
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> + /* If hugepage and is trans splitting return None */
>> + if (unlikely(hugepage &&
>> + pmd_trans_splitting(pte_pmd(old_pte))))
>
> Comment looks much like the code... seems redundant.
>
>> + return __pte(0);
>> +#endif
>>
>> - *p = pte; /* clears _PAGE_BUSY */
>> + /* If pte is not present return None */
>> + if (unlikely(!(old_pte & _PAGE_PRESENT)))
>> + return __pte(0);
>>
>> - return pte;
>> + new_pte = pte_mkyoung(old_pte);
>> + if (writing && pte_write(old_pte))
>> + new_pte = pte_mkdirty(new_pte);
>> +
>> + } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
>> + old_pte, new_pte));
>> + return new_pte;
>> }
>>
>> +
>
> Whitespace
>
>> /* Return HPTE cache control bits corresponding to Linux pte bits */
>> static inline unsigned long hpte_cache_bits(unsigned long pte_val)
>> {
>> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
>> index 5880dfb..e1a9415 100644
>> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
>> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
>> @@ -675,6 +675,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>> }
>> /* if the guest wants write access, see if that is OK */
>> if (!writing && hpte_is_writable(r)) {
>> + unsigned int shift;
>> pte_t *ptep, pte;
>>
>> /*
>> @@ -683,9 +684,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>> */
>> rcu_read_lock_sched();
>> ptep = find_linux_pte_or_hugepte(current->mm->pgd,
>> - hva, NULL);
>> - if (ptep && pte_present(*ptep)) {
>> - pte = kvmppc_read_update_linux_pte(ptep, 1);
>> + hva, &shift);
>> + if (ptep) {
>> + pte = kvmppc_read_update_linux_pte(ptep, 1, shift);
>> if (pte_write(pte))
>> write_ok = 1;
>> }
>> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
>> index dcf892d..39ae723 100644
>> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
>> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
>> @@ -150,9 +150,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
>> *pte_sizep = PAGE_SIZE;
>> if (ps > *pte_sizep)
>> return __pte(0);
>> - if (!pte_present(*ptep))
>> - return __pte(0);
>> - return kvmppc_read_update_linux_pte(ptep, writing);
>> + return kvmppc_read_update_linux_pte(ptep, writing, shift);
>
> 'shift' goes into the new 'hugepage' parameter? Doesn't seem logical?
> Can we harmonise the name to make it less confusing?
>
It is actually the shift corresponding to the hugepage size. We set it
to 0 if we don't find a hugepage in find_linux_pte_or_hugepte. Maybe
something like hugepage_shift would be better?
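With that rename the prototype would read something like this (naming
change only, behaviour as in the patch):

	/*
	 * hugepage_shift is whatever find_linux_pte_or_hugepte() returned
	 * through its shift argument: the page-size shift for a hugepage
	 * mapping, 0 for a normal page.
	 */
	static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
							 unsigned int hugepage_shift);

and lookup_linux_pte() would pass its local shift straight through as
hugepage_shift.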
-aneesh
* Re: [PATCH] powerpc/kvm: Handle transparent hugepage in KVM
From: Michael Neuling @ 2013-06-19 23:59 UTC
To: Aneesh Kumar K.V; +Cc: paulus, linuxppc-dev
> >> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> >> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> >> @@ -162,33 +162,40 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
> >> * Lock and read a linux PTE. If it's present and writable, atomically
> >> * set dirty and referenced bits and return the PTE, otherwise return 0.
> >
> > Is this comment still valid now that the ldarx/stdcx loop is gone?
>
> In a way, yes. Instead of lock-and-read as before, it is now done via
> cmpxchg, which still uses ldarx/stdcx underneath.
OK, maybe you can update it to reflect that.
> >> + pte_t old_pte, new_pte = __pte(0);
> >> +repeat:
> >> + do {
> >> + old_pte = pte_val(*ptep);
> >> + /*
> >> + * wait until _PAGE_BUSY is clear then set it atomically
> >> + */
> >> + if (unlikely(old_pte & _PAGE_BUSY))
> >> + goto repeat;
> >
> > continue here? Please don't create looping primitives.
>
> No, that would be wrong (I did that in an earlier version :). We really
> don't want the cmpxchg below to run if we find _PAGE_BUSY.
How about something like this then?
while (1) {
if (unlikely(old_pte & _PAGE_BUSY))
continue;
.....
if cmpxchg(foo)
break;
}
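Filled out against the patch (a sketch only, using the hugepage_shift
name from above; note old_pte is re-read at the top of the loop so that
continue picks up a fresh value):

	pte_t old_pte, new_pte = __pte(0);

	while (1) {
		old_pte = pte_val(*ptep);
		/* wait until _PAGE_BUSY is clear, then update atomically */
		if (unlikely(old_pte & _PAGE_BUSY))
			continue;
	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		/* a pmd in the middle of a THP split is not usable */
		if (unlikely(hugepage_shift &&
			     pmd_trans_splitting(pte_pmd(old_pte))))
			return __pte(0);
	#endif
		if (unlikely(!(old_pte & _PAGE_PRESENT)))
			return __pte(0);

		new_pte = pte_mkyoung(old_pte);
		if (writing && pte_write(old_pte))
			new_pte = pte_mkdirty(new_pte);

		if (old_pte == __cmpxchg_u64((unsigned long *)ptep,
					     old_pte, new_pte))
			break;
	}
	return new_pte;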
>
> >
> >> +
> >> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >> + /* If hugepage and is trans splitting return None */
> >> + if (unlikely(hugepage &&
> >> + pmd_trans_splitting(pte_pmd(old_pte))))
> >
> > Comment looks much like the code... seems redundant.
> >
> >> + return __pte(0);
> >> +#endif
> >>
> >> - *p = pte; /* clears _PAGE_BUSY */
> >> + /* If pte is not present return None */
> >> + if (unlikely(!(old_pte & _PAGE_PRESENT)))
> >> + return __pte(0);
> >>
> >> - return pte;
> >> + new_pte = pte_mkyoung(old_pte);
> >> + if (writing && pte_write(old_pte))
> >> + new_pte = pte_mkdirty(new_pte);
> >> +
> >> + } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
> >> + old_pte, new_pte));
> >> + return new_pte;
> >> }
> >>
> >> +
> >
> > Whitespace
> >> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> >> index dcf892d..39ae723 100644
> >> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> >> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> >> @@ -150,9 +150,7 @@ static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
> >> *pte_sizep = PAGE_SIZE;
> >> if (ps > *pte_sizep)
> >> return __pte(0);
> >> - if (!pte_present(*ptep))
> >> - return __pte(0);
> >> - return kvmppc_read_update_linux_pte(ptep, writing);
> >> + return kvmppc_read_update_linux_pte(ptep, writing, shift);
> >
> > 'shift' goes into the new 'hugepage' parameter? Doesn't seem logical?
> > Can we harmonise the name to make it less confusing?
> >
>
> It is actually the shift corresponding to the hugepage size. We set it
> to 0 if we don't find a hugepage in find_linux_pte_or_hugepte. Maybe
> something like hugepage_shift would be better?
Sure.
Mikey