* Re: [PATCH 4/4] KVM: PPC: Book3S HV: Flush guest mappings when turning dirty tracking on/off
2018-12-12 4:17 ` [PATCH 4/4] KVM: PPC: Book3S HV: Flush guest mappings when turning dirty tracking on/off Paul Mackerras
@ 2018-12-12 5:22 ` Suraj Jitindar Singh
2018-12-12 23:54 ` David Gibson
2020-04-28 15:57 ` Laurent Vivier
2 siblings, 0 replies; 8+ messages in thread
From: Suraj Jitindar Singh @ 2018-12-12 5:22 UTC (permalink / raw)
To: kvm-ppc
On Wed, 2018-12-12 at 15:17 +1100, Paul Mackerras wrote:
> This adds code to flush the partition-scoped page tables for a radix
> guest when dirty tracking is turned on or off for a memslot. Only
> the
> guest real addresses covered by the memslot are flushed. The reason
> for this is to get rid of any 2M PTEs in the partition-scoped page
> tables that correspond to host transparent huge pages, so that page
> dirtiness is tracked at a system page (4k or 64k) granularity rather
> than a 2M granularity. The page tables are also flushed when turning
> dirty tracking off so that the memslot's address space can be
> repopulated with THPs if possible.
>
> To do this, we add a new function
> kvmppc_radix_flush_memslot(). Since
> this does what's needed for kvmppc_core_flush_memslot_hv() on a radix
> guest, we now make kvmppc_core_flush_memslot_hv() call the new
> kvmppc_radix_flush_memslot() rather than calling kvm_unmap_radix()
> for each page in the memslot. This has the effect of fixing a bug in
> that kvmppc_core_flush_memslot_hv() was previously calling
> kvm_unmap_radix() without holding the kvm->mmu_lock spinlock, which
> is required to be held.
>
One comment below.
Either way:
Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> ---
> arch/powerpc/include/asm/kvm_book3s.h | 2 ++
> arch/powerpc/kvm/book3s_64_mmu_hv.c | 9 +++++----
> arch/powerpc/kvm/book3s_64_mmu_radix.c | 20 ++++++++++++++++++++
> arch/powerpc/kvm/book3s_hv.c | 17 +++++++++++++++++
> 4 files changed, 44 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h
> b/arch/powerpc/include/asm/kvm_book3s.h
> index 728d2b7..f8a5ac8 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -222,6 +222,8 @@ extern int kvm_test_age_radix(struct kvm *kvm,
> struct kvm_memory_slot *memslot,
> unsigned long gfn);
> extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
> struct kvm_memory_slot *memslot, unsigned
> long *map);
> +extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
> + const struct kvm_memory_slot *memslot);
> extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct
> kvm_ppc_rmmu_info *info);
>
> /* XXX remove this export when load_last_inst() is generic */
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index a18afda..6f2d2fb 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -899,11 +899,12 @@ void kvmppc_core_flush_memslot_hv(struct kvm
> *kvm,
>
> gfn = memslot->base_gfn;
> rmapp = memslot->arch.rmap;
> + if (kvm_is_radix(kvm)) {
> + kvmppc_radix_flush_memslot(kvm, memslot);
> + return;
> + }
> +
> for (n = memslot->npages; n; --n, ++gfn) {
> - if (kvm_is_radix(kvm)) {
> - kvm_unmap_radix(kvm, memslot, gfn);
> - continue;
> - }
> /*
> * Testing the present bit without locking is OK
> because
> * the memslot has been marked invalid already, and
> hence
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 52711eb..d675ad9 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -958,6 +958,26 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm
> *kvm,
> return 0;
> }
>
> +void kvmppc_radix_flush_memslot(struct kvm *kvm,
> + const struct kvm_memory_slot
> *memslot)
> +{
> + unsigned long n;
> + pte_t *ptep;
> + unsigned long gpa;
> + unsigned int shift;
> +
> + gpa = memslot->base_gfn << PAGE_SHIFT;
> + spin_lock(&kvm->mmu_lock);
> + for (n = memslot->npages; n; --n) {
> + ptep = __find_linux_pte(kvm->arch.pgtable, gpa,
> NULL, &shift);
> + if (ptep && pte_present(*ptep))
> + kvmppc_unmap_pte(kvm, ptep, gpa, shift,
> memslot,
> + kvm->arch.lpid);
> + gpa += PAGE_SIZE;
> + }
> + spin_unlock(&kvm->mmu_lock);
I don't know how expensive it is calling __find_linux_pte() may times
for a 2M or 1G page when we know we've already invalidated the mapping?
Could be improved with:
end_gpa = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
for (; gpa < end_gpa; gpa += (1UL << shift)) {
...
if (!shift)
shift = PAGE_SHIFT;
}
> +}
> +
> static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
> int psize, int *indexp)
> {
> diff --git a/arch/powerpc/kvm/book3s_hv.c
> b/arch/powerpc/kvm/book3s_hv.c
> index f4fbb7b5..074ff5b 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4384,6 +4384,23 @@ static void
> kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
> */
> if (npages)
> atomic64_inc(&kvm->arch.mmio_update);
> +
> + /*
> + * For change = KVM_MR_MOVE or KVM_MR_DELETE, higher levels
> + * have already called kvm_arch_flush_shadow_memslot() to
> + * flush shadow mappings. For KVM_MR_CREATE we have no
> + * previous mappings. So the only case to handle is
> + * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
> + * has been changed.
> + * For radix guests, we flush on setting
> KVM_MEM_LOG_DIRTY_PAGES
> + * to get rid of any THP PTEs in the partition-scoped page
> tables
> + * so we can track dirtiness at the page level; we flush
> when
> + * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back
> to
> + * using THP PTEs.
> + */
> + if (change = KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> + ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> + kvmppc_radix_flush_memslot(kvm, old);
> }
>
> /*
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH 4/4] KVM: PPC: Book3S HV: Flush guest mappings when turning dirty tracking on/off
2018-12-12 4:17 ` [PATCH 4/4] KVM: PPC: Book3S HV: Flush guest mappings when turning dirty tracking on/off Paul Mackerras
2018-12-12 5:22 ` Suraj Jitindar Singh
@ 2018-12-12 23:54 ` David Gibson
2020-04-28 15:57 ` Laurent Vivier
2 siblings, 0 replies; 8+ messages in thread
From: David Gibson @ 2018-12-12 23:54 UTC (permalink / raw)
To: kvm-ppc
[-- Attachment #1: Type: text/plain, Size: 5507 bytes --]
On Wed, Dec 12, 2018 at 03:17:17PM +1100, Paul Mackerras wrote:
> This adds code to flush the partition-scoped page tables for a radix
> guest when dirty tracking is turned on or off for a memslot. Only the
> guest real addresses covered by the memslot are flushed. The reason
> for this is to get rid of any 2M PTEs in the partition-scoped page
> tables that correspond to host transparent huge pages, so that page
> dirtiness is tracked at a system page (4k or 64k) granularity rather
> than a 2M granularity. The page tables are also flushed when turning
> dirty tracking off so that the memslot's address space can be
> repopulated with THPs if possible.
>
> To do this, we add a new function kvmppc_radix_flush_memslot(). Since
> this does what's needed for kvmppc_core_flush_memslot_hv() on a radix
> guest, we now make kvmppc_core_flush_memslot_hv() call the new
> kvmppc_radix_flush_memslot() rather than calling kvm_unmap_radix()
> for each page in the memslot. This has the effect of fixing a bug in
> that kvmppc_core_flush_memslot_hv() was previously calling
> kvm_unmap_radix() without holding the kvm->mmu_lock spinlock, which
> is required to be held.
>
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> arch/powerpc/include/asm/kvm_book3s.h | 2 ++
> arch/powerpc/kvm/book3s_64_mmu_hv.c | 9 +++++----
> arch/powerpc/kvm/book3s_64_mmu_radix.c | 20 ++++++++++++++++++++
> arch/powerpc/kvm/book3s_hv.c | 17 +++++++++++++++++
> 4 files changed, 44 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 728d2b7..f8a5ac8 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -222,6 +222,8 @@ extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
> unsigned long gfn);
> extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
> struct kvm_memory_slot *memslot, unsigned long *map);
> +extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
> + const struct kvm_memory_slot *memslot);
> extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
>
> /* XXX remove this export when load_last_inst() is generic */
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index a18afda..6f2d2fb 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -899,11 +899,12 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
>
> gfn = memslot->base_gfn;
> rmapp = memslot->arch.rmap;
> + if (kvm_is_radix(kvm)) {
> + kvmppc_radix_flush_memslot(kvm, memslot);
> + return;
> + }
> +
> for (n = memslot->npages; n; --n, ++gfn) {
> - if (kvm_is_radix(kvm)) {
> - kvm_unmap_radix(kvm, memslot, gfn);
> - continue;
> - }
> /*
> * Testing the present bit without locking is OK because
> * the memslot has been marked invalid already, and hence
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 52711eb..d675ad9 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -958,6 +958,26 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
> return 0;
> }
>
> +void kvmppc_radix_flush_memslot(struct kvm *kvm,
> + const struct kvm_memory_slot *memslot)
> +{
> + unsigned long n;
> + pte_t *ptep;
> + unsigned long gpa;
> + unsigned int shift;
> +
> + gpa = memslot->base_gfn << PAGE_SHIFT;
> + spin_lock(&kvm->mmu_lock);
> + for (n = memslot->npages; n; --n) {
> + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
> + if (ptep && pte_present(*ptep))
> + kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
> + kvm->arch.lpid);
> + gpa += PAGE_SIZE;
> + }
> + spin_unlock(&kvm->mmu_lock);
> +}
> +
> static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
> int psize, int *indexp)
> {
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index f4fbb7b5..074ff5b 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4384,6 +4384,23 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
> */
> if (npages)
> atomic64_inc(&kvm->arch.mmio_update);
> +
> + /*
> + * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
> + * have already called kvm_arch_flush_shadow_memslot() to
> + * flush shadow mappings. For KVM_MR_CREATE we have no
> + * previous mappings. So the only case to handle is
> + * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
> + * has been changed.
> + * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
> + * to get rid of any THP PTEs in the partition-scoped page tables
> + * so we can track dirtiness at the page level; we flush when
> + * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
> + * using THP PTEs.
> + */
> + if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> + ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> + kvmppc_radix_flush_memslot(kvm, old);
> }
>
> /*
--
David Gibson | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH 4/4] KVM: PPC: Book3S HV: Flush guest mappings when turning dirty tracking on/off
2018-12-12 4:17 ` [PATCH 4/4] KVM: PPC: Book3S HV: Flush guest mappings when turning dirty tracking on/off Paul Mackerras
2018-12-12 5:22 ` Suraj Jitindar Singh
2018-12-12 23:54 ` David Gibson
@ 2020-04-28 15:57 ` Laurent Vivier
2020-05-06 5:12 ` Paul Mackerras
2 siblings, 1 reply; 8+ messages in thread
From: Laurent Vivier @ 2020-04-28 15:57 UTC (permalink / raw)
To: Paul Mackerras, kvm, kvm-ppc; +Cc: David Gibson, Nicholas Piggin
On 12/12/2018 05:17, Paul Mackerras wrote:
> This adds code to flush the partition-scoped page tables for a radix
> guest when dirty tracking is turned on or off for a memslot. Only the
> guest real addresses covered by the memslot are flushed. The reason
> for this is to get rid of any 2M PTEs in the partition-scoped page
> tables that correspond to host transparent huge pages, so that page
> dirtiness is tracked at a system page (4k or 64k) granularity rather
> than a 2M granularity. The page tables are also flushed when turning
> dirty tracking off so that the memslot's address space can be
> repopulated with THPs if possible.
>
> To do this, we add a new function kvmppc_radix_flush_memslot(). Since
> this does what's needed for kvmppc_core_flush_memslot_hv() on a radix
> guest, we now make kvmppc_core_flush_memslot_hv() call the new
> kvmppc_radix_flush_memslot() rather than calling kvm_unmap_radix()
> for each page in the memslot. This has the effect of fixing a bug in
> that kvmppc_core_flush_memslot_hv() was previously calling
> kvm_unmap_radix() without holding the kvm->mmu_lock spinlock, which
> is required to be held.
>
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> Reviewed-by: Suraj Jitindar Singh <sjitindarsingh@gmail.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> arch/powerpc/include/asm/kvm_book3s.h | 2 ++
> arch/powerpc/kvm/book3s_64_mmu_hv.c | 9 +++++----
> arch/powerpc/kvm/book3s_64_mmu_radix.c | 20 ++++++++++++++++++++
> arch/powerpc/kvm/book3s_hv.c | 17 +++++++++++++++++
> 4 files changed, 44 insertions(+), 4 deletions(-)
>
...
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index f4fbb7b5..074ff5b 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4384,6 +4384,23 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
> */
> if (npages)
> atomic64_inc(&kvm->arch.mmio_update);
> +
> + /*
> + * For change = KVM_MR_MOVE or KVM_MR_DELETE, higher levels
> + * have already called kvm_arch_flush_shadow_memslot() to
> + * flush shadow mappings. For KVM_MR_CREATE we have no
> + * previous mappings. So the only case to handle is
> + * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
> + * has been changed.
> + * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
> + * to get rid of any THP PTEs in the partition-scoped page tables
> + * so we can track dirtiness at the page level; we flush when
> + * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
> + * using THP PTEs.
> + */
> + if (change = KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> + ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> + kvmppc_radix_flush_memslot(kvm, old);
> }
Hi,
this part introduces a regression in QEMU test suite.
We have the following warning in the host kernel log:
[ 96.631336] ------------[ cut here ]------------
[ 96.631372] WARNING: CPU: 32 PID: 4095 at
arch/powerpc/kvm/book3s_64_mmu_radix.c:436
kvmppc_unmap_free_pte+0x84/0x150 [kvm_hv]
[ 96.631394] Modules linked in: xt_CHECKSUM nft_chain_nat
xt_MASQUERADE nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6
nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 nft_counter nft_compat
nf_tables nfnetlink tun bridge stp llc rpcsec_gss_krb5 auth_rpcgss nfsv4
dns_resolver nfs lockd grace fscache rfkill kvm_hv kvm i2c_dev sunrpc
ext4 mbcache jbd2 xts ofpart ses enclosure vmx_crypto scsi_transport_sas
at24 ipmi_powernv powernv_flash ipmi_devintf opal_prd mtd
ipmi_msghandler uio_pdrv_genirq uio ibmpowernv ip_tables xfs libcrc32c
ast i2c_algo_bit drm_vram_helper drm_ttm_helper ttm drm_kms_helper
syscopyarea sysfillrect sysimgblt fb_sys_fops cec drm sd_mod i40e t10_pi
sg aacraid drm_panel_orientation_quirks dm_mirror dm_region_hash dm_log
dm_mod
[ 96.631553] CPU: 32 PID: 4095 Comm: CPU 0/KVM Not tainted 5.7.0-rc3 #24
[ 96.631571] NIP: c008000007745d1c LR: c008000007746c20 CTR:
c000000000098aa0
[ 96.631598] REGS: c0000003b842f440 TRAP: 0700 Not tainted (5.7.0-rc3)
[ 96.631615] MSR: 900000010282b033
<SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE,TM[E]> CR: 24222448 XER: 20040000
[ 96.631648] CFAR: c008000007746c1c IRQMASK: 0
[ 96.631648] GPR00: c008000007746c20 c0000003b842f6d0 c00800000775b300
c000000008a60000
[ 96.631648] GPR04: c00000039b835200 c0000003aae00387 0000000000000001
000000009b835205
[ 96.631648] GPR08: 0552839b03000080 0000000000000020 0000000000000005
c00800000774dd88
[ 96.631648] GPR12: c000000000098aa0 c0000007fffdb000 0000000000000000
0000000000000000
[ 96.631648] GPR16: 0000000000000000 0000000000000000 0000000000000000
0000000000000000
[ 96.631648] GPR20: 0000000000000000 0000000000000001 8703e0aa030000c0
c00000000183d9d8
[ 96.631648] GPR24: c00000000183d9e0 c00000039b835200 0000000000000001
c000000008a60000
[ 96.631648] GPR28: 0000000000000001 c00000000183d9e8 0000000000000000
c00000039b835200
[ 96.631784] NIP [c008000007745d1c] kvmppc_unmap_free_pte+0x84/0x150
[kvm_hv]
[ 96.631813] LR [c008000007746c20] kvmppc_create_pte+0x948/0xa90 [kvm_hv]
[ 96.631830] Call Trace:
[ 96.631845] [c0000003b842f6d0] [00007fff60740000] 0x7fff60740000
(unreliable)
[ 96.631866] [c0000003b842f730] [c008000007746c20]
kvmppc_create_pte+0x948/0xa90 [kvm_hv]
[ 96.631886] [c0000003b842f7d0] [c0080000077470f8]
kvmppc_book3s_instantiate_page+0x270/0x5c0 [kvm_hv]
[ 96.631920] [c0000003b842f8c0] [c008000007747618]
kvmppc_book3s_radix_page_fault+0x1d0/0x300 [kvm_hv]
[ 96.631951] [c0000003b842f970] [c008000007741f7c]
kvmppc_book3s_hv_page_fault+0x1f4/0xd50 [kvm_hv]
[ 96.631981] [c0000003b842fa60] [c00800000773da34]
kvmppc_vcpu_run_hv+0x9bc/0xba0 [kvm_hv]
[ 96.632006] [c0000003b842fb30] [c008000007c6aabc]
kvmppc_vcpu_run+0x34/0x48 [kvm]
[ 96.632030] [c0000003b842fb50] [c008000007c6686c]
kvm_arch_vcpu_ioctl_run+0x2f4/0x400 [kvm]
[ 96.632061] [c0000003b842fbe0] [c008000007c57fb8]
kvm_vcpu_ioctl+0x340/0x7d0 [kvm]
[ 96.632082] [c0000003b842fd50] [c0000000004c9598] ksys_ioctl+0xf8/0x150
[ 96.632100] [c0000003b842fda0] [c0000000004c9618] sys_ioctl+0x28/0x80
[ 96.632118] [c0000003b842fdc0] [c000000000034b34]
system_call_exception+0x104/0x1d0
[ 96.632146] [c0000003b842fe20] [c00000000000c970]
system_call_common+0xf0/0x278
[ 96.632165] Instruction dump:
[ 96.632181] 7cda3378 7c9f2378 3bc00000 3b800001 e95d0000 7d295030
2f890000 419e0054
[ 96.632211] 60000000 7ca0fc28 2fa50000 419e0028 <0fe00000> 78a55924
7f48d378 7fe4fb78
[ 96.632241] ---[ end trace 4f8aa0280f1215fb ]---
The problem is detected with the "migration-test" test program in qemu,
on a POWER9 host in radix mode with THP. It happens only the first time
the test program is run after loading kvm_hv. The warning is an explicit
detection [1] of a problem:
arch/powerpc/kvm/book3s_64_mmu_radix.c:
414 /*
415 * kvmppc_free_p?d are used to free existing page tables, and
recursively
416 * descend and clear and free children.
417 * Callers are responsible for flushing the PWC.
418 *
419 * When page tables are being unmapped/freed as part of page fault path
420 * (full = false), ptes are not expected. There is code to unmap them
421 * and emit a warning if encountered, but there may already be data
422 * corruption due to the unexpected mappings.
423 */
424 static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool
full,
425 unsigned int lpid)
426 {
427 if (full) {
428 memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
429 } else {
430 pte_t *p = pte;
431 unsigned long it;
432
433 for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
434 if (pte_val(*p) = 0)
435 continue;
436 WARN_ON_ONCE(1);
437 kvmppc_unmap_pte(kvm, p,
438 pte_pfn(*p) << PAGE_SHIFT,
439 PAGE_SHIFT, NULL, lpid);
440 }
441 }
442
443 kvmppc_pte_free(pte);
444 }
I reproduce the problem in QEMU 4.2 build directory with :
sudo rmmod kvm_hv
sudo modprobe kvm_hv
make check-qtest-ppc64
or once the test binary is built with
sudo rmmod kvm_hv
sudo modprobe kvm_hv
export QTEST_QEMU_BINARY=ppc64-softmmu/qemu-system-ppc64
tests/qtest/migration-test
The sub-test is "/migration/validate_uuid_error" that generates some
memory stress with a SLOF program in the source guest and fails because
of a mismatch with the destination guest. So the source guest continues
to execute the stress program while the migration is aborted.
Another way to reproduce the problem is:
Source guest:
sudo rmmod kvm-hv
sudo modprobe kvm-hv
qemu-system-ppc64 -display none -accel kvm -name source -m 256M \
-nodefaults -prom-env use-nvramrc?=true \
-prom-env 'nvramrc=hex ." _" begin 6400000 100000 \
do i c@ 1 + i c! 1000 +loop ." B" 0 until' -monitor \
stdio
Destination guest (on the same host):
qemu-system-ppc64 -display none -accel kvm -name source -m 512M \
-nodefaults -monitor stdio -incoming tcp:0:4444
Then in source guest monitor:
(qemu) migrate tcp:localhost:4444
The migration intentionally fails because of a memory size mismatch and
the warning is generated in the host kernel log.
It was not detected earlier because the problem is only detected with a
test added in qemu-4.2.0 [2] with THP enabled that starts a migration
and fails, so the memory stress program continues to run in the source
guest while the dirty log bitmap is released. The problem cannot be
shown with qemu-5.0 because a change in the migration speed has been
added [3]
The problem seems to happen because QEMU modifies the memory map while a
vCPU is running:
(qemu) migrate tcp:localhost:4444
108264@1578573121.242431:kvm_run_exit cpu_index 0, reason 19
108264@1578573121.242454:kvm_vcpu_ioctl cpu_index 0, type 0x2000ae80,
arg (nil)
108264@1578573121.248110:kvm_run_exit cpu_index 0, reason 19
108281@1578573121.248710:kvm_set_user_memory Slot#0 flags=0x1 gpa=0x0
size=0x10000000 ua=0x7fff8fe00000 ret=0
-> 108264@1578573121.249335:kvm_vcpu_ioctl cpu_index 0, type 0x2000ae80,
arg (nil)
-> 108259@1578573121.253111:kvm_set_user_memory Slot#0 flags=0x0 gpa=0x0
size=0x10000000 ua=0x7fff8fe00000 ret=0
-> 108264@1578573121.256593:kvm_run_exit cpu_index 0, reason 19
This is part of the cleanup function after the migration has failed:
migration_iteration_finish
migrate_fd_cleanup_schedule()
migrate_fd_cleanup_bh
...
migrate_fd_cleanup
qemu_savevm_state_cleanup
ram_save_cleanup
memory_global_dirty_log_stop
memory_global_dirty_log_do_stop
..
If I pause the VM in qemu_savevm_state_cleanup() while save_cleanup()
functions are called (ram_save_cleanup()), the warning disappears.
Any idea?
Thanks,
Laurent
[1] warning introduced by
a5704e83aa3d KVM: PPC: Book3S HV: Recursively unmap all page table
entries when unmapping
[2] 3af31a3469 "tests/migration: Add a test for validate-uuid capability"
[3] 97e1e06780 "migration: Rate limit inside host pages"
^ permalink raw reply [flat|nested] 8+ messages in thread* Re: [PATCH 4/4] KVM: PPC: Book3S HV: Flush guest mappings when turning dirty tracking on/off
2020-04-28 15:57 ` Laurent Vivier
@ 2020-05-06 5:12 ` Paul Mackerras
[not found] ` <e7aae742-d189-2882-5c41-3dd993c029bb@redhat.com>
0 siblings, 1 reply; 8+ messages in thread
From: Paul Mackerras @ 2020-05-06 5:12 UTC (permalink / raw)
To: Laurent Vivier; +Cc: kvm, kvm-ppc, David Gibson, Nicholas Piggin
On Tue, Apr 28, 2020 at 05:57:59PM +0200, Laurent Vivier wrote:
> On 12/12/2018 05:17, Paul Mackerras wrote:
> > + if (change = KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
> > + ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
> > + kvmppc_radix_flush_memslot(kvm, old);
> > }
>
> Hi,
>
> this part introduces a regression in QEMU test suite.
>
> We have the following warning in the host kernel log:
>
[snip]
>
> The problem is detected with the "migration-test" test program in qemu,
> on a POWER9 host in radix mode with THP. It happens only the first time
> the test program is run after loading kvm_hv. The warning is an explicit
> detection [1] of a problem:
Yes, we found a valid PTE where we didn't expect there to be one.
[snip]
> I reproduce the problem in QEMU 4.2 build directory with :
>
> sudo rmmod kvm_hv
> sudo modprobe kvm_hv
> make check-qtest-ppc64
>
> or once the test binary is built with
>
> sudo rmmod kvm_hv
> sudo modprobe kvm_hv
> export QTEST_QEMU_BINARY=ppc64-softmmu/qemu-system-ppc64
> tests/qtest/migration-test
>
> The sub-test is "/migration/validate_uuid_error" that generates some
> memory stress with a SLOF program in the source guest and fails because
> of a mismatch with the destination guest. So the source guest continues
> to execute the stress program while the migration is aborted.
>
> Another way to reproduce the problem is:
>
> Source guest:
>
> sudo rmmod kvm-hv
> sudo modprobe kvm-hv
> qemu-system-ppc64 -display none -accel kvm -name source -m 256M \
> -nodefaults -prom-env use-nvramrc?=true \
> -prom-env 'nvramrc=hex ." _" begin 6400000 100000 \
> do i c@ 1 + i c! 1000 +loop ." B" 0 until' -monitor \
> stdio
>
> Destination guest (on the same host):
>
> qemu-system-ppc64 -display none -accel kvm -name source -m 512M \
> -nodefaults -monitor stdio -incoming tcp:0:4444
>
> Then in source guest monitor:
>
> (qemu) migrate tcp:localhost:4444
>
> The migration intentionally fails because of a memory size mismatch and
> the warning is generated in the host kernel log.
I built QEMU 4.2 and tried all three methods you list without seeing
the problem manifest itself. Is there any other configuration
regarding THP or anything necessary?
I was using the patch below, which you could try also, since you are
better able to trigger the problem. I never saw the "flush_memslot
was necessary" message nor the WARN_ON. If you see the flush_memslot
message then that will give us a clue as to where the problem is
coming from.
Regards,
Paul.
--
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 506e4df2d730..14ddf1411279 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -220,7 +220,7 @@ extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn);
extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map);
-extern void kvmppc_radix_flush_memslot(struct kvm *kvm,
+extern int kvmppc_radix_flush_memslot(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index aa12cd4078b3..930042632d8f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -433,7 +433,7 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
if (pte_val(*p) = 0)
continue;
- WARN_ON_ONCE(1);
+ WARN_ON(1);
kvmppc_unmap_pte(kvm, p,
pte_pfn(*p) << PAGE_SHIFT,
PAGE_SHIFT, NULL, lpid);
@@ -1092,30 +1092,34 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
return 0;
}
-void kvmppc_radix_flush_memslot(struct kvm *kvm,
+int kvmppc_radix_flush_memslot(struct kvm *kvm,
const struct kvm_memory_slot *memslot)
{
unsigned long n;
pte_t *ptep;
unsigned long gpa;
unsigned int shift;
+ int ret = 0;
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
kvmppc_uvmem_drop_pages(memslot, kvm, true);
if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
- return;
+ return 0;
gpa = memslot->base_gfn << PAGE_SHIFT;
spin_lock(&kvm->mmu_lock);
for (n = memslot->npages; n; --n) {
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
- if (ptep && pte_present(*ptep))
+ if (ptep && pte_present(*ptep)) {
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
kvm->arch.lpid);
+ ret = 1;
+ }
gpa += PAGE_SIZE;
}
spin_unlock(&kvm->mmu_lock);
+ return ret;
}
static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 93493f0cbfe8..40b50f867835 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4508,6 +4508,10 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
if (change = KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
kvmppc_radix_flush_memslot(kvm, old);
+ else if (kvm_is_radix(kvm) && kvmppc_radix_flush_memslot(kvm, old))
+ pr_err("flush_memslot was necessary - change = %d flags %x -> %x\n",
+ change, old->flags, new->flags);
+
/*
* If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
*/
^ permalink raw reply related [flat|nested] 8+ messages in thread