LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] KVM: PPC: BOOK3S: HV: Don't try to allocate from kernel page allocator for hash page table.
From: Aneesh Kumar K.V @ 2014-05-04 17:25 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc, Aneesh Kumar K.V

We reserve 5% of total RAM for CMA allocation, and not using it can
result in us running out of NUMA node memory with specific
configurations. One caveat is that we may not have a node-local HPT with
a pinned vcpu configuration. But currently libvirt also pins the vcpu to
the cpuset after creating the hash page table.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index fb25ebc0af0c..f32896ffd784 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -52,7 +52,7 @@ static void kvmppc_rmap_reset(struct kvm *kvm);
 
 long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 {
-	unsigned long hpt;
+	unsigned long hpt = 0;
 	struct revmap_entry *rev;
 	struct page *page = NULL;
 	long order = KVM_DEFAULT_HPT_ORDER;
@@ -64,22 +64,11 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 	}
 
 	kvm->arch.hpt_cma_alloc = 0;
-	/*
-	 * try first to allocate it from the kernel page allocator.
-	 * We keep the CMA reserved for failed allocation.
-	 */
-	hpt = __get_free_pages(GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT |
-			       __GFP_NOWARN, order - PAGE_SHIFT);
-
-	/* Next try to allocate from the preallocated pool */
-	if (!hpt) {
-		VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
-		page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
-		if (page) {
-			hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
-			kvm->arch.hpt_cma_alloc = 1;
-		} else
-			--order;
+	VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER);
+	page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT));
+	if (page) {
+		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+		kvm->arch.hpt_cma_alloc = 1;
 	}
 
 	/* Lastly try successively smaller sizes from the page allocator */
-- 
1.9.1

^ permalink raw reply related

* [PATCH] KVM: PPC: BOOK3S: PR: Fix WARN_ON with debug options on
From: Aneesh Kumar K.V @ 2014-05-04 17:26 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc, Aneesh Kumar K.V

With the debug option "sleep inside atomic section checking" enabled we get
the below WARN_ON during a PR KVM boot. This is because upstream now
has PREEMPT_COUNT enabled even if we have preempt disabled. Fix the
warning by adding preempt_disable/enable around floating point and altivec
enable.

WARNING: at arch/powerpc/kernel/process.c:156
Modules linked in: kvm_pr kvm
CPU: 1 PID: 3990 Comm: qemu-system-ppc Tainted: G        W     3.15.0-rc1+ #4
task: c0000000eb85b3a0 ti: c0000000ec59c000 task.ti: c0000000ec59c000
NIP: c000000000015c84 LR: d000000003334644 CTR: c000000000015c00
REGS: c0000000ec59f140 TRAP: 0700   Tainted: G        W      (3.15.0-rc1+)
MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>  CR: 42000024  XER: 20000000
CFAR: c000000000015c24 SOFTE: 1
GPR00: d000000003334644 c0000000ec59f3c0 c000000000e2fa40 c0000000e2f80000
GPR04: 0000000000000800 0000000000002000 0000000000000001 8000000000000000
GPR08: 0000000000000001 0000000000000001 0000000000002000 c000000000015c00
GPR12: d00000000333da18 c00000000fb80900 0000000000000000 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 00003fffce4e0fa1
GPR20: 0000000000000010 0000000000000001 0000000000000002 00000000100b9a38
GPR24: 0000000000000002 0000000000000000 0000000000000000 0000000000000013
GPR28: 0000000000000000 c0000000eb85b3a0 0000000000002000 c0000000e2f80000
NIP [c000000000015c84] .enable_kernel_fp+0x84/0x90
LR [d000000003334644] .kvmppc_handle_ext+0x134/0x190 [kvm_pr]
Call Trace:
[c0000000ec59f3c0] [0000000000000010] 0x10 (unreliable)
[c0000000ec59f430] [d000000003334644] .kvmppc_handle_ext+0x134/0x190 [kvm_pr]
[c0000000ec59f4c0] [d00000000324b380] .kvmppc_set_msr+0x30/0x50 [kvm]
[c0000000ec59f530] [d000000003337cac] .kvmppc_core_emulate_op_pr+0x16c/0x5e0 [kvm_pr]
[c0000000ec59f5f0] [d00000000324a944] .kvmppc_emulate_instruction+0x284/0xa80 [kvm]
[c0000000ec59f6c0] [d000000003336888] .kvmppc_handle_exit_pr+0x488/0xb70 [kvm_pr]
[c0000000ec59f790] [d000000003338d34] kvm_start_lightweight+0xcc/0xdc [kvm_pr]
[c0000000ec59f960] [d000000003336288] .kvmppc_vcpu_run_pr+0xc8/0x190 [kvm_pr]
[c0000000ec59f9f0] [d00000000324c880] .kvmppc_vcpu_run+0x30/0x50 [kvm]
[c0000000ec59fa60] [d000000003249e74] .kvm_arch_vcpu_ioctl_run+0x54/0x1b0 [kvm]
[c0000000ec59faf0] [d000000003244948] .kvm_vcpu_ioctl+0x478/0x760 [kvm]
[c0000000ec59fcb0] [c000000000224e34] .do_vfs_ioctl+0x4d4/0x790
[c0000000ec59fd90] [c000000000225148] .SyS_ioctl+0x58/0xb0
[c0000000ec59fe30] [c00000000000a1e4] syscall_exit+0x0/0x98

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/book3s_pr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index c5c052a9729c..f30cdfee800d 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -683,16 +683,20 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #endif
 
 	if (msr & MSR_FP) {
+		preempt_disable();
 		enable_kernel_fp();
 		load_fp_state(&vcpu->arch.fp);
 		t->fp_save_area = &vcpu->arch.fp;
+		preempt_enable();
 	}
 
 	if (msr & MSR_VEC) {
 #ifdef CONFIG_ALTIVEC
+		preempt_disable();
 		enable_kernel_altivec();
 		load_vr_state(&vcpu->arch.vr);
 		t->vr_save_area = &vcpu->arch.vr;
+		preempt_enable();
 #endif
 	}
 
@@ -716,13 +720,17 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
 		return;
 
 	if (lost_ext & MSR_FP) {
+		preempt_disable();
 		enable_kernel_fp();
 		load_fp_state(&vcpu->arch.fp);
+		preempt_enable();
 	}
 #ifdef CONFIG_ALTIVEC
 	if (lost_ext & MSR_VEC) {
+		preempt_disable();
 		enable_kernel_altivec();
 		load_vr_state(&vcpu->arch.vr);
+		preempt_enable();
 	}
 #endif
 	current->thread.regs->msr |= lost_ext;
-- 
1.9.1

^ permalink raw reply related

* [RFC PATCH] KVM: PPC: BOOK3S: HV: THP support for guest
From: Aneesh Kumar K.V @ 2014-05-04 17:30 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc, Aneesh Kumar K.V

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 146 ++++++++++++++++++++++++++-----
 arch/powerpc/kvm/book3s_hv.c             |   7 ++
 2 files changed, 130 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 51388befeddb..f03ea8f90576 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -77,34 +77,122 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
 	return old == 0;
 }
 
+static inline int __hpte_actual_psize(unsigned int lp, int psize)
+{
+	int i, shift;
+	unsigned int mask;
+
+	/* start from 1 ignoring MMU_PAGE_4K */
+	for (i = 1; i < MMU_PAGE_COUNT; i++) {
+
+		/* invalid penc */
+		if (mmu_psize_defs[psize].penc[i] == -1)
+			continue;
+		/*
+		 * encoding bits per actual page size
+		 *        PTE LP     actual page size
+		 *    rrrr rrrz		>=8KB
+		 *    rrrr rrzz		>=16KB
+		 *    rrrr rzzz		>=32KB
+		 *    rrrr zzzz		>=64KB
+		 * .......
+		 */
+		shift = mmu_psize_defs[i].shift - LP_SHIFT;
+		if (shift > LP_BITS)
+			shift = LP_BITS;
+		mask = (1 << shift) - 1;
+		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
+			return i;
+	}
+	return -1;
+}
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
-	unsigned long rb, va_low;
+	int b_size, a_size;
+	unsigned int penc;
+	unsigned long rb = 0, va_low, sllp;
+	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
+	if (!(v & HPTE_V_LARGE)) {
+		/* both base and actual psize is 4k */
+		b_size = MMU_PAGE_4K;
+		a_size = MMU_PAGE_4K;
+	} else {
+		for (b_size = 0; b_size < MMU_PAGE_COUNT; b_size++) {
+
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[b_size].shift)
+				continue;
 
+			a_size = __hpte_actual_psize(lp, b_size);
+			if (a_size != -1)
+				break;
+		}
+	}
+	/*
+	 * Ignore the top 14 bits of va
+	 * v have top two bits covering segment size, hence move
+	 * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
+	 * AVA field in v also have the lower 23 bits ignored.
+	 * For base page size 4K we need 14 .. 65 bits (so need to
+	 * collect extra 11 bits)
+	 * For others we need 14..14+i
+	 */
+	/* This covers 14..54 bits of va*/
 	rb = (v & ~0x7fUL) << 16;		/* AVA field */
+	/*
+	 * AVA in v had cleared lower 23 bits. We need to derive
+	 * that from pteg index
+	 */
 	va_low = pte_index >> 3;
 	if (v & HPTE_V_SECONDARY)
 		va_low = ~va_low;
-	/* xor vsid from AVA */
+	/*
+	 * get the vpn bits from va_low using reverse of hashing.
+	 * In v we have va with 23 bits dropped and then left shifted
+	 * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need
+	 * right shift it with (SID_SHIFT - (23 - 7))
+	 */
 	if (!(v & HPTE_V_1TB_SEG))
-		va_low ^= v >> 12;
+		va_low ^= v >> (SID_SHIFT - 16);
 	else
-		va_low ^= v >> 24;
+		va_low ^= v >> (SID_SHIFT_1T - 16);
 	va_low &= 0x7ff;
-	if (v & HPTE_V_LARGE) {
-		rb |= 1;			/* L field */
-		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
-		    (r & 0xff000)) {
-			/* non-16MB large page, must be 64k */
-			/* (masks depend on page size) */
-			rb |= 0x1000;		/* page encoding in LP field */
-			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
-			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
-		}
-	} else {
-		/* 4kB page */
-		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
+
+	switch (b_size) {
+	case MMU_PAGE_4K:
+		sllp = ((mmu_psize_defs[a_size].sllp & SLB_VSID_L) >> 6) |
+			((mmu_psize_defs[a_size].sllp & SLB_VSID_LP) >> 4);
+		rb |= sllp << 5;	/*  AP field */
+		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
+		break;
+	default:
+	{
+		int aval_shift;
+		/*
+		 * remaining 7bits of AVA/LP fields
+		 * Also contain the rr bits of LP
+		 */
+		rb |= (va_low & 0x7f) << 16;
+		/*
+		 * Now clear not needed LP bits based on actual psize
+		 */
+		rb &= ~((1ul << mmu_psize_defs[a_size].shift) - 1);
+		/*
+		 * AVAL field 58..77 - base_page_shift bits of va
+		 * we have space for 58..64 bits, Missing bits should
+		 * be zero filled. +1 is to take care of L bit shift
+		 */
+		aval_shift = 64 - (77 - mmu_psize_defs[b_size].shift) + 1;
+		rb |= ((va_low << aval_shift) & 0xfe);
+
+		rb |= 1;		/* L field */
+		penc = mmu_psize_defs[b_size].penc[a_size];
+		rb |= penc << 12;	/* LP field */
+		break;
+	}
 	}
 	rb |= (v >> 54) & 0x300;		/* B field */
 	return rb;
@@ -112,14 +200,26 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 
 static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
 {
+	int size, a_size;
+	/* Look at the 8 bit LP value */
+	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
+
 	/* only handle 4k, 64k and 16M pages for now */
 	if (!(h & HPTE_V_LARGE))
-		return 1ul << 12;		/* 4k page */
-	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
-		return 1ul << 16;		/* 64k page */
-	if ((l & 0xff000) == 0)
-		return 1ul << 24;		/* 16M page */
-	return 0;				/* error */
+		return 1ul << 12;
+	else {
+		for (size = 0; size < MMU_PAGE_COUNT; size++) {
+			/* valid entries have a shift value */
+			if (!mmu_psize_defs[size].shift)
+				continue;
+
+			a_size = __hpte_actual_psize(lp, size);
+			if (a_size != -1)
+				return 1ul << mmu_psize_defs[a_size].shift;
+		}
+
+	}
+	return 0;
 }
 
 static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8227dba5af0f..a38d3289320a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
 	 * support pte_enc here
 	 */
 	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
+	/*
+	 * Add 16MB MPSS support
+	 */
+	if (linux_psize != MMU_PAGE_16M) {
+		(*sps)->enc[1].page_shift = 24;
+		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+	}
 	(*sps)++;
 }
 
-- 
1.9.1

^ permalink raw reply related

* Re: [RFC PATCH] KVM: PPC: BOOK3S: HV: THP support for guest
From: Aneesh Kumar K.V @ 2014-05-04 17:36 UTC (permalink / raw)
  To: agraf, benh, paulus; +Cc: linuxppc-dev, kvm, kvm-ppc
In-Reply-To: <1399224616-25142-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com> writes:

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/kvm_book3s_64.h | 146 ++++++++++++++++++++++++++-----
>  arch/powerpc/kvm/book3s_hv.c             |   7 ++
>  2 files changed, 130 insertions(+), 23 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 51388befeddb..f03ea8f90576 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -77,34 +77,122 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
>  	return old == 0;
>  }
>
> +static inline int __hpte_actual_psize(unsigned int lp, int psize)
> +{
> +	int i, shift;
> +	unsigned int mask;
> +
> +	/* start from 1 ignoring MMU_PAGE_4K */
> +	for (i = 1; i < MMU_PAGE_COUNT; i++) {
> +
> +		/* invalid penc */
> +		if (mmu_psize_defs[psize].penc[i] == -1)
> +			continue;
> +		/*
> +		 * encoding bits per actual page size
> +		 *        PTE LP     actual page size
> +		 *    rrrr rrrz		>=8KB
> +		 *    rrrr rrzz		>=16KB
> +		 *    rrrr rzzz		>=32KB
> +		 *    rrrr zzzz		>=64KB
> +		 * .......
> +		 */
> +		shift = mmu_psize_defs[i].shift - LP_SHIFT;
> +		if (shift > LP_BITS)
> +			shift = LP_BITS;
> +		mask = (1 << shift) - 1;
> +		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
> +			return i;
> +	}
> +	return -1;
> +}
> +
>  static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>  					     unsigned long pte_index)
>  {
> -	unsigned long rb, va_low;
> +	int b_size, a_size;
> +	unsigned int penc;
> +	unsigned long rb = 0, va_low, sllp;
> +	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
> +	if (!(v & HPTE_V_LARGE)) {
> +		/* both base and actual psize is 4k */
> +		b_size = MMU_PAGE_4K;
> +		a_size = MMU_PAGE_4K;
> +	} else {
> +		for (b_size = 0; b_size < MMU_PAGE_COUNT; b_size++) {
> +
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[b_size].shift)
> +				continue;
>
> +			a_size = __hpte_actual_psize(lp, b_size);
> +			if (a_size != -1)
> +				break;
> +		}
> +	}
> +	/*
> +	 * Ignore the top 14 bits of va
> +	 * v have top two bits covering segment size, hence move
> +	 * by 16 bits, Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
> +	 * AVA field in v also have the lower 23 bits ignored.
> +	 * For base page size 4K we need 14 .. 65 bits (so need to
> +	 * collect extra 11 bits)
> +	 * For others we need 14..14+i
> +	 */
> +	/* This covers 14..54 bits of va*/
>  	rb = (v & ~0x7fUL) << 16;		/* AVA field */
> +	/*
> +	 * AVA in v had cleared lower 23 bits. We need to derive
> +	 * that from pteg index
> +	 */
>  	va_low = pte_index >> 3;
>  	if (v & HPTE_V_SECONDARY)
>  		va_low = ~va_low;
> -	/* xor vsid from AVA */
> +	/*
> +	 * get the vpn bits from va_low using reverse of hashing.
> +	 * In v we have va with 23 bits dropped and then left shifted
> +	 * HPTE_V_AVPN_SHIFT (7) bits. Now to find vsid we need
> +	 * right shift it with (SID_SHIFT - (23 - 7))
> +	 */
>  	if (!(v & HPTE_V_1TB_SEG))
> -		va_low ^= v >> 12;
> +		va_low ^= v >> (SID_SHIFT - 16);
>  	else
> -		va_low ^= v >> 24;
> +		va_low ^= v >> (SID_SHIFT_1T - 16);
>  	va_low &= 0x7ff;
> -	if (v & HPTE_V_LARGE) {
> -		rb |= 1;			/* L field */
> -		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> -		    (r & 0xff000)) {
> -			/* non-16MB large page, must be 64k */
> -			/* (masks depend on page size) */
> -			rb |= 0x1000;		/* page encoding in LP field */
> -			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
> -			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
> -		}
> -	} else {
> -		/* 4kB page */
> -		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
> +
> +	switch (b_size) {
> +	case MMU_PAGE_4K:
> +		sllp = ((mmu_psize_defs[a_size].sllp & SLB_VSID_L) >> 6) |
> +			((mmu_psize_defs[a_size].sllp & SLB_VSID_LP) >> 4);
> +		rb |= sllp << 5;	/*  AP field */
> +		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
> +		break;
> +	default:
> +	{
> +		int aval_shift;
> +		/*
> +		 * remaining 7bits of AVA/LP fields
> +		 * Also contain the rr bits of LP
> +		 */
> +		rb |= (va_low & 0x7f) << 16;
> +		/*
> +		 * Now clear not needed LP bits based on actual psize
> +		 */
> +		rb &= ~((1ul << mmu_psize_defs[a_size].shift) - 1);
> +		/*
> +		 * AVAL field 58..77 - base_page_shift bits of va
> +		 * we have space for 58..64 bits, Missing bits should
> +		 * be zero filled. +1 is to take care of L bit shift
> +		 */
> +		aval_shift = 64 - (77 - mmu_psize_defs[b_size].shift) + 1;
> +		rb |= ((va_low << aval_shift) & 0xfe);
> +
> +		rb |= 1;		/* L field */
> +		penc = mmu_psize_defs[b_size].penc[a_size];
> +		rb |= penc << 12;	/* LP field */
> +		break;
> +	}
>  	}
>  	rb |= (v >> 54) & 0x300;		/* B field */
>  	return rb;
> @@ -112,14 +200,26 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>
>  static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
>  {
> +	int size, a_size;
> +	/* Look at the 8 bit LP value */
> +	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
>  	/* only handle 4k, 64k and 16M pages for now */
>  	if (!(h & HPTE_V_LARGE))
> -		return 1ul << 12;		/* 4k page */
> -	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
> -		return 1ul << 16;		/* 64k page */
> -	if ((l & 0xff000) == 0)
> -		return 1ul << 24;		/* 16M page */
> -	return 0;				/* error */
> +		return 1ul << 12;
> +	else {
> +		for (size = 0; size < MMU_PAGE_COUNT; size++) {
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[size].shift)
> +				continue;
> +
> +			a_size = __hpte_actual_psize(lp, size);
> +			if (a_size != -1)
> +				return 1ul << mmu_psize_defs[a_size].shift;
> +		}
> +
> +	}
> +	return 0;
>  }
>
>  static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 8227dba5af0f..a38d3289320a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
>  	 * support pte_enc here
>  	 */
>  	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
> +	/*
> +	 * Add 16MB MPSS support
> +	 */
> +	if (linux_psize != MMU_PAGE_16M) {
> +		(*sps)->enc[1].page_shift = 24;
> +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
> +	}

We ideally want to do this only when the guest memory is backed by
hugetlbfs. I was thinking qemu should ensure that. But then I am not
sure existing qemu works that way. So we may want to look at how to
enable MPSS.

-aneesh

^ permalink raw reply

* [PATCH 3/3] powerpc/powernv: Don't escalate non-existing frozen PE
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan
In-Reply-To: <1399246144-20247-1-git-send-email-gwshan@linux.vnet.ibm.com>

Commit cb5b242c ("powerpc/eeh: Escalate error on non-existing PE")
escalates the frozen state on non-existing PE to fenced PHB. It
was to improve kdump reliability. After that, commit 361f2a2a
("powrpc/powernv: Reset PHB in kdump kernel") was introduced to
issue complete reset on all PHBs to increase the reliability of
kdump kernel.

Commit cb5b242c is therefore no longer useful, so it is reverted.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-ioda.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 65feec6..1b5982f 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -884,13 +884,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 			 * it again.
 			 */
 			if (ioda_eeh_get_pe(hose, frozen_pe_no, pe)) {
-				*pe = phb_pe;
-				pr_err("EEH: Escalated frozen PHB#%x-"
-				       "PE#%llx (%s) detected\n",
-					hose->global_number,
-					frozen_pe_no,
-					eeh_pe_loc_get(phb_pe));
-				ret = EEH_NEXT_ERR_FENCED_PHB;
+				/* Try best to clear it */
+				pr_info("EEH: Clear non-existing PHB#%x-PE#%llx\n",
+					hose->global_number, frozen_pe_no);
+				opal_pci_eeh_freeze_clear(phb->opal_id, frozen_pe_no,
+					OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+				ret = EEH_NEXT_ERR_NONE;
 			} else if ((*pe)->state & EEH_PE_ISOLATED) {
 				ret = EEH_NEXT_ERR_NONE;
 			} else {
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 2/3] powerpc/eeh: Report frozen parent PE prior to child PE
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan
In-Reply-To: <1399246144-20247-1-git-send-email-gwshan@linux.vnet.ibm.com>

When we have the corner case of frozen parent and child PE at the
same time, we have to handle the frozen parent PE prior to the
child. Without clearing the frozen state on parent PE, the child
PE can't be recovered successfully.

The patch searches the EEH PE hierarchy tree and returns the topmost
frozen PE to be handled. It ensures the frozen parent PE will be
handled prior to child PE.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh.c                 | 27 ++++++++++++++++++++++++---
 arch/powerpc/platforms/powernv/eeh-ioda.c | 30 ++++++++++++++++++++++++++++--
 2 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9f8de75..33d683a 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -357,10 +357,11 @@ out:
 int eeh_dev_check_failure(struct eeh_dev *edev)
 {
 	int ret;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 	unsigned long flags;
 	struct device_node *dn;
 	struct pci_dev *dev;
-	struct eeh_pe *pe;
+	struct eeh_pe *pe, *parent_pe;
 	int rc = 0;
 	const char *location;
 
@@ -438,14 +439,34 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 	 */
 	if ((ret < 0) ||
 	    (ret == EEH_STATE_NOT_SUPPORT) ||
-	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
-	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+	    ((ret & active_flags) == active_flags)) {
 		eeh_stats.false_positives++;
 		pe->false_positives++;
 		rc = 0;
 		goto dn_unlock;
 	}
 
+	/*
+	 * It should be corner case that the parent PE has been
+	 * put into frozen state as well. We should take care
+	 * that at first.
+	 */
+	parent_pe = pe->parent;
+	while (parent_pe) {
+		/* Hit the ceiling ? */
+		if (parent_pe->type & EEH_PE_PHB)
+			break;
+
+		/* Frozen parent PE ? */
+		ret = eeh_ops->get_state(parent_pe, NULL);
+		if (ret > 0 &&
+		    (ret & active_flags) != active_flags)
+			pe = parent_pe;
+
+		/* Next parent level */
+		parent_pe = parent_pe->parent;
+	}
+
 	eeh_stats.slot_resets++;
 
 	/* Avoid repeated reports of this failure, including problems
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index e5b88c5..65feec6 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -783,11 +783,12 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
-	struct eeh_pe *phb_pe;
+	struct eeh_pe *phb_pe, *parent_pe;
 	u64 frozen_pe_no;
+	int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
 	u16 err_type, severity;
 	long rc;
-	int ret = EEH_NEXT_ERR_NONE;
+	int state, ret = EEH_NEXT_ERR_NONE;
 
 	/*
 	 * While running here, it's safe to purge the event queue.
@@ -920,6 +921,31 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		}
 
 		/*
+		 * We probably have the frozen parent PE out there and
+		 * we need have to handle frozen parent PE firstly.
+		 */
+		if (ret == EEH_NEXT_ERR_FROZEN_PE) {
+			parent_pe = (*pe)->parent;
+			while (parent_pe) {
+				/* Hit the ceiling ? */
+				if (parent_pe->type & EEH_PE_PHB)
+					break;
+
+				/* Frozen parent PE ? */
+				state = ioda_eeh_get_state(parent_pe);
+				if (state > 0 &&
+				    (state & active_flags) != active_flags)
+					*pe = parent_pe;
+
+				/* Next parent level */
+				parent_pe = parent_pe->parent;
+			}
+
+			/* We possibly migrate to another PE */
+			eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
+		}
+
+		/*
 		 * If we have no errors on the specific PHB or only
 		 * informative error there, we continue poking it.
 		 * Otherwise, we need actions to be taken by upper
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 1/3] powerpc/eeh: Clear frozen state for child PE
From: Gavin Shan @ 2014-05-04 23:29 UTC (permalink / raw)
  To: benh; +Cc: linuxppc-dev, Gavin Shan

Since commit cb523e09 ("powerpc/eeh: Avoid I/O access during PE
reset"), the PE is kept as frozen state on hardware level until
the PE reset is done completely. After that, we explicitly clear
the frozen state of the affected PE. However, the affected PE might
have frozen child PEs, and we need to clear their frozen state as
well. Otherwise, the recovery is going to fail.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh_driver.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7100a5b..8bb40e7 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -447,8 +447,9 @@ static void *eeh_pe_detach_dev(void *data, void *userdata)
  * PE reset (for 3 times), we try to clear the frozen state
  * for 3 times as well.
  */
-static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
+static void *__eeh_clear_pe_frozen_state(void *data, void *flag)
 {
+	struct eeh_pe *pe = (struct eeh_pe *)data;
 	int i, rc;
 
 	for (i = 0; i < 3; i++) {
@@ -461,13 +462,24 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
 	}
 
 	/* The PE has been isolated, clear it */
-	if (rc)
+	if (rc) {
 		pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n",
 			__func__, pe->phb->global_number, pe->addr, rc);
-	else
+		return (void *)pe;
+	}
+
+	return NULL;
+}
+
+static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
+{
+	void *rc;
+
+	rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL);
+	if (!rc)
 		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
 
-	return rc;
+	return rc ? -EIO : 0;
 }
 
 /**
-- 
1.8.3.2

^ permalink raw reply related

* Re: [PATCH V4] KVM: PPC: BOOK3S: PR: Enable Little Endian PR guest
From: Paul Mackerras @ 2014-05-04 23:59 UTC (permalink / raw)
  To: Aneesh Kumar K.V; +Cc: linuxppc-dev, agraf, kvm-ppc, kvm
In-Reply-To: <1399223932-17840-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>

On Sun, May 04, 2014 at 10:48:52PM +0530, Aneesh Kumar K.V wrote:
> This patch make sure we inherit the LE bit correctly in different case
> so that we can run Little Endian distro in PR mode

[snip]

> +static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr)
> +{
> +	/*
> +	 * If ILE (interrupt little-endian) has changed, update the
> +	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
> +	 */
> +	if ((new_lpcr & LPCR_ILE) != (vcpu->arch.intr_msr & MSR_LE)) {

Since LPCR_ILE != MSR_LE, this condition is always going to be true.
I suggest you remove this if statement and just do the body
unconditionally.

Paul.

^ permalink raw reply

* [PATCH RFC 00/22] EEH Support for VFIO PCI devices on PowerKVM guest
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan

The series of patches intends to support EEH for PCI devices, which have been
passed through to PowerKVM based guest via VFIO. The implementation is
straightforward based on the issues or problems we have to resolve to support
EEH for PowerKVM based guest.

- Emulation for EEH RTAS requests. Thankfully, we already have infrastructure
  to emulate XICS. Without introducing new mechanism, we just extend that
  existing infrastructure to support EEH RTAS emulation. EEH RTAS requests
  initiated from guest are posted to host where the requests get handled or
  delivered to underlying firmware for further handling. For that, the host kernel
  has to maintain the PCI address (host domain/bus/slot/function to guest's
  PHB BUID/bus/slot/function) mapping via KVM VFIO device. The address mapping
  will be built when initializing VFIO device in QEMU and destroyed when the
  VFIO device in QEMU goes offline or the VM is destroyed.

- The infrastructure for error injection is introduced. The emulation for the
  related RTAS services is similar to what we do for EEH/XICS RTAS requests.
  For now, we just support PCI error injection. We need to extend it for
  injecting other types of errors in the future.

The series of patches requires corresponding firmware changes from Mike Qiu to
support error injection, as well as QEMU changes to support EEH for guest. The
QEMU patchset will be sent separately.

I usually use command line (not virsh) to start PowerKVM based guests on Firebird-L
machine with different types of PCI devices assigned (passed through) to guest.
The following cases have been tested. The EEH error can be injected by utility "errinjct"
running on guest successfully and we can recover from the EEH error successfully.

Testing on P7
=============

- Emulex adapter
- USB (OHCI) PCI adapter

Testing on P8
=============

- MLX4 adapter (Partially)
- USB (xHCI) PCI adapter

-----

arch/powerpc/include/asm/book3s_errinjct.h     |  97 ++++++++++++++++++++++++
arch/powerpc/include/asm/eeh.h                 |  78 ++++++++++++++++++++
arch/powerpc/include/asm/kvm_ppc.h             |   7 ++
arch/powerpc/include/asm/opal.h                |  65 ++++++++++++++++
arch/powerpc/kernel/eeh.c                      |   8 ++
arch/powerpc/kernel/eeh_pe.c                   | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/kvm/Kconfig                       |  17 +++++
arch/powerpc/kvm/Makefile                      |   6 ++
arch/powerpc/kvm/book3s_errinjct.c             | 329 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/kvm/book3s_hv.c                   |   2 +
arch/powerpc/kvm/book3s_rtas.c                 |  67 +++++++++++++++++
arch/powerpc/platforms/powernv/Makefile        |   2 +
arch/powerpc/platforms/powernv/eeh-ioda.c      |   3 +-
arch/powerpc/platforms/powernv/eeh-rtas.c      | 551 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/errinjct.c      | 215 +++++++++++++++++++++++++++++++++++++++++++++++++++++
arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
include/linux/kvm_host.h                       |  21 ++++++
include/uapi/linux/kvm.h                       |  10 +++
virt/kvm/vfio.c                                |  60 ++++++++++++++-
19 files changed, 1834 insertions(+), 2 deletions(-)
create mode 100644 arch/powerpc/include/asm/book3s_errinjct.h
create mode 100644 arch/powerpc/kvm/book3s_errinjct.c
create mode 100644 arch/powerpc/platforms/powernv/eeh-rtas.c
create mode 100644 arch/powerpc/platforms/powernv/errinjct.c

Thanks,
Gavin

^ permalink raw reply

* [PATCH 02/22] powerpc/eeh: Info to trace passed devices
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The address of passed PCI devices (domain:bus:slot:func) might be
quite different from the perspective of host and guest. We have to
trace the address mapping so that we can emulate EEH RTAS requests
from guest. The patch introduces additional fields to eeh_pe and
eeh_dev for the purpose.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h | 49 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7782056..8bfb167 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -48,6 +48,17 @@ struct device_node;
 #define EEH_PE_RST_HOLD_TIME		250
 #define EEH_PE_RST_SETTLE_TIME		1800
 
+#ifdef CONFIG_KVM_EEH
+struct eeh_vfio_pci_addr {
+	struct kvm	*kvm;		/* KVM identifier		*/
+	unsigned int	buid_hi;	/* PHB BUID high		*/
+	unsigned int	buid_lo;	/* PHB BUID low			*/
+	unsigned char	bus;		/* Bus number			*/
+	unsigned char	devfn;		/* Slot and function		*/
+	int		pe_addr;	/* PE configuration address	*/
+};
+#endif /* CONFIG_KVM_EEH */
+
 /*
  * The struct is used to trace PE related EEH functionality.
  * In theory, there will have one instance of the struct to
@@ -72,6 +83,7 @@ struct device_node;
 #define EEH_PE_RESET		(1 << 2)	/* PE reset in progress	*/
 
 #define EEH_PE_KEEP		(1 << 8)	/* Keep PE on hotplug	*/
+#define EEH_PE_PASSTHROUGH	(1 << 9)	/* PE owned by guest	*/
 
 struct eeh_pe {
 	int type;			/* PE type: PHB/Bus/Device	*/
@@ -85,6 +97,9 @@ struct eeh_pe {
 	struct timeval tstamp;		/* Time on first-time freeze	*/
 	int false_positives;		/* Times of reported #ff's	*/
 	struct eeh_pe *parent;		/* Parent PE			*/
+#ifdef CONFIG_KVM_EEH
+	struct eeh_vfio_pci_addr gaddr;	/* Associated KVM guest address */
+#endif
 	struct list_head child_list;	/* Link PE to the child list	*/
 	struct list_head edevs;		/* Link list of EEH devices	*/
 	struct list_head child;		/* Child PEs			*/
@@ -93,6 +108,21 @@ struct eeh_pe {
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
 		list_for_each_entry_safe(edev, tmp, &pe->edevs, list)
 
+static inline bool eeh_pe_passed(struct eeh_pe *pe)
+{
+	return pe ? !!(pe->state & EEH_PE_PASSTHROUGH) : false;
+}
+
+static inline void eeh_pe_set_passed(struct eeh_pe *pe, bool passed)
+{
+	if (pe) {
+		if (passed)
+			pe->state |= EEH_PE_PASSTHROUGH;
+		else
+			pe->state &= ~EEH_PE_PASSTHROUGH;
+	}
+}
+
 /*
  * The struct is used to trace EEH state for the associated
  * PCI device node or PCI device. In future, it might
@@ -110,6 +140,7 @@ struct eeh_pe {
 #define EEH_DEV_SYSFS		(1 << 9)	/* Sysfs created	*/
 #define EEH_DEV_REMOVED		(1 << 10)	/* Removed permanently	*/
 #define EEH_DEV_FRESET		(1 << 11)	/* Fundamental reset	*/
+#define EEH_DEV_PASSTHROUGH	(1 << 12)	/* Owned by guest	*/
 
 struct eeh_dev {
 	int mode;			/* EEH mode			*/
@@ -126,6 +157,9 @@ struct eeh_dev {
 	struct device_node *dn;		/* Associated device node	*/
 	struct pci_dev *pdev;		/* Associated PCI device	*/
 	struct pci_bus *bus;		/* PCI bus for partial hotplug	*/
+#ifdef CONFIG_KVM_EEH
+	struct eeh_vfio_pci_addr gaddr;	/* Address in guest		*/
+#endif
 };
 
 static inline struct device_node *eeh_dev_to_of_node(struct eeh_dev *edev)
@@ -138,6 +172,21 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 	return edev ? edev->pdev : NULL;
 }
 
+static inline bool eeh_dev_passed(struct eeh_dev *dev)
+{
+	return dev ? !!(dev->mode & EEH_DEV_PASSTHROUGH) : false;
+}
+
+static inline void eeh_dev_set_passed(struct eeh_dev *dev, bool passed)
+{
+	if (dev) {
+		if (passed)
+			dev->mode |= EEH_DEV_PASSTHROUGH;
+		else
+			dev->mode &= ~EEH_DEV_PASSTHROUGH;
+	}
+}
+
 /* Return values from eeh_ops::next_error */
 enum {
 	EEH_NEXT_ERR_NONE = 0,
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 01/22] powerpc: Introduce CONFIG_KVM_EEH
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The patch introduces kernel configuration option KVM_EEH, which
depends on PPC_POWERNV, KVM_BOOK3S_64, EEH and VFIO_IOMMU_SPAPR_TCE.
The option enables emulation of the EEH RTAS services that are
required by the EEH module in pSeries-based guests.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/Kconfig | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 141b202..743d2d9 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -189,6 +189,14 @@ config KVM_XICS
 	  Specification) interrupt controller architecture used on
 	  IBM POWER (pSeries) servers.
 
+config KVM_EEH
+	bool "KVM in-kernel EEH RTAS emulation"
+	depends on PPC_POWERNV && KVM_BOOK3S_64 && EEH && VFIO_IOMMU_SPAPR_TCE
+	default y
+	---help---
+	  Enable support for emulating EEH RTAS services used on IBM
+	  POWER (pSeries) servers.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 05/22] powerpc/eeh: Release VFIO dev on VM destruction
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

When the VM is destroyed, the EEH devices and PEs that have been
marked as being owned by guest should be returned to host. The
patch introduces kvmppc_vfio_pci_free() to do it.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h |  6 +++++-
 arch/powerpc/kernel/eeh_pe.c   | 42 ++++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv.c   |  2 ++
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 3807167..677c719 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -380,6 +380,8 @@ static inline void eeh_add_sysfs_files(struct pci_bus *bus) { }
 
 static inline void eeh_remove_device(struct pci_dev *dev) { }
 
+static inline void kvmppc_eeh_vfio_release(struct kvm *kvm) { }
+
 #define EEH_POSSIBLE_ERROR(val, type) (0)
 #define EEH_IO_ERROR_VALUE(size) (-1UL)
 #endif /* CONFIG_EEH */
@@ -388,7 +390,9 @@ static inline void eeh_remove_device(struct pci_dev *dev) { }
 #ifdef CONFIG_KVM_EEH
 struct eeh_dev *eeh_vfio_dev_get(struct eeh_vfio_pci_addr *addr);
 struct eeh_pe *eeh_vfio_pe_get(struct eeh_vfio_pci_addr *addr);
-
+void kvmppc_eeh_vfio_release(struct kvm *kvm);
+#else
+static inline void kvmppc_eeh_vfio_release(void *kvm) { };
 #endif /* CONFIG_KVM_EEH */
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 1bd7b1f..9e73188 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -331,6 +331,48 @@ struct eeh_dev *eeh_vfio_dev_get(struct eeh_vfio_pci_addr *addr)
 
 	return NULL;
 }
+
+static void *__kvmppc_eeh_vfio_release(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	struct kvm *kvm = (struct kvm *)flag;
+	struct eeh_dev *edev, *tmp;
+
+	if (!eeh_pe_passed(pe))
+		return NULL;
+
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (!eeh_dev_passed(edev))
+			continue;
+
+		if (edev->gaddr.kvm == kvm)
+			eeh_dev_set_passed(edev, false);
+	}
+
+	eeh_pe_set_passed(pe, false);
+
+	return NULL;
+}
+
+/**
+ * kvmppc_eeh_vfio_release - Release VFIO devices for the given VM
+ * @kvm: VM indicator
+ *
+ * The function is expected to be called while the VM is destroyed.
+ * In turn, the PCI devices that have been passed to that VM should
+ * be released and their address mapping maintained will be destroyed.
+ */
+void kvmppc_eeh_vfio_release(struct kvm *kvm)
+{
+	struct eeh_pe *root;
+	void *ret;
+
+	list_for_each_entry(root, &eeh_phb_pe, child) {
+		ret = eeh_pe_traverse(root, __kvmppc_eeh_vfio_release, kvm);
+		if (ret) return;
+	}
+}
+EXPORT_SYMBOL_GPL(kvmppc_eeh_vfio_release);
 #endif /* CONFIG_KVM_EEH */
 
 /**
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8227dba..f07a12d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -49,6 +49,7 @@
 #include <asm/hvcall.h>
 #include <asm/switch_to.h>
 #include <asm/smp.h>
+#include <asm/eeh.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -2344,6 +2345,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 		kvm->arch.rma = NULL;
 	}
 
+	kvmppc_eeh_vfio_release(kvm);
 	kvmppc_free_hpt(kvm);
 }
 
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 04/22] powerpc/eeh: Search EEH PE by guest address
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The patch introduces function eeh_vfio_pe_get() to search the EEH
PE according to its guest address, which is made up of KVM indicator,
PHB ID and PE configuration address. The function will be useful in
backends for EEH RTAS emulation.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h |  1 +
 arch/powerpc/kernel/eeh_pe.c   | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index b12e3e9..3807167 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -387,6 +387,7 @@ static inline void eeh_remove_device(struct pci_dev *dev) { }
 
 #ifdef CONFIG_KVM_EEH
 struct eeh_dev *eeh_vfio_dev_get(struct eeh_vfio_pci_addr *addr);
+struct eeh_pe *eeh_vfio_pe_get(struct eeh_vfio_pci_addr *addr);
 
 #endif /* CONFIG_KVM_EEH */
 
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index dba7c82..1bd7b1f 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -249,6 +249,46 @@ struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
 }
 
 #ifdef CONFIG_KVM_EEH
+static void *__eeh_vfio_pe_get(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	struct eeh_vfio_pci_addr *addr = (struct eeh_vfio_pci_addr *)flag;
+
+	if (!eeh_pe_passed(pe))
+		return NULL;
+
+	/* Comparing the address */
+	if (addr->kvm     == pe->gaddr.kvm &&
+	    addr->buid_hi == pe->gaddr.buid_hi &&
+	    addr->buid_lo == pe->gaddr.buid_lo &&
+	    addr->pe_addr == pe->gaddr.pe_addr)
+		return pe;
+
+	return NULL;
+}
+
+/**
+ * eeh_vfio_pe_get - Search EEH PE based on guest's address
+ * @addr: EEH PE guest address
+ *
+ * Search the EEH PE according to the guest address, which
+ * is made up of VM indicator, PHB BUID, and PE configuration
+ * address.
+ */
+struct eeh_pe *eeh_vfio_pe_get(struct eeh_vfio_pci_addr *addr)
+{
+	struct eeh_pe *root;
+	struct eeh_pe *pe;
+
+	list_for_each_entry(root, &eeh_phb_pe, child) {
+		pe = eeh_pe_traverse(root, __eeh_vfio_pe_get, addr);
+		if (pe)
+			return pe;
+	}
+
+	return NULL;
+}
+
 static void *__eeh_vfio_dev_get(void *data, void *flag)
 {
 	struct eeh_pe *pe = (struct eeh_pe *)data;
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 06/22] powerpc/eeh: Function for address mapping
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The patch introduces function kvm_vfio_eeh_dev_map(), which is
expected to be called on IOCTL command issued to the VM device, in
order to build the address mapping for VFIO PCI device.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh_pe.c | 88 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm_host.h     | 14 +++++++
 2 files changed, 102 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 9e73188..200cd5a 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -332,6 +332,94 @@ struct eeh_dev *eeh_vfio_dev_get(struct eeh_vfio_pci_addr *addr)
 	return NULL;
 }
 
+/**
+ * kvm_vfio_eeh_dev_map - Build the address mapping for VFIO device
+ *
+ * @kvm: VM descriptor
+ * @domain: host domain of PCI device
+ * @bdn: host bus/device/function number
+ * @buid: BUID of guest PHB
+ * @gbdn: guest bus/device/function number
+ *
+ * Build the address mapping between host and guest device. It's called
+ * while passing through PCI device from host to guest.
+ */
+int kvm_vfio_eeh_dev_map(struct kvm *kvm, int domain,
+			 int bdn, unsigned long buid, int gbdn)
+{
+	struct pci_bus *bus, *pe_bus;
+	struct pci_dev *dev;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int bus_no, devfn;
+
+	/* Find the PCI device in host side */
+	bus_no = (bdn >> 8) & 0xff;
+	devfn = bdn & 0xff;
+	bus = pci_find_bus(domain, bus_no);
+	if (!bus) {
+		pr_warn("%s: PCI bus %04x:%02x not found\n",
+			__func__, domain, bus_no);
+		return -ENODEV;
+	}
+
+	dev = pci_get_slot(bus, devfn);
+	if (!dev) {
+		pr_warn("%s: PCI device %04x:%02x:%02x.%01x not found\n",
+			__func__, domain, bus_no,
+			PCI_SLOT(devfn), PCI_FUNC(devfn));
+		return -ENODEV;
+	}
+
+	/*
+	 * Mark the EEH device as passed. We allow dynamic change
+	 * on the address mapping.
+	 */
+	edev = pci_dev_to_eeh_dev(dev);
+	if (!edev) {
+		pr_warn("%s: No EEH dev for PCI device %s\n",
+			__func__, pci_name(dev));
+		return -ENODEV;
+	}
+
+	/*
+	 * The PE configuration address is exactly PCI config address
+	 * of the PE primary bus. That has format 00BBSS00 defined in
+	 * PAPR.
+	 */
+	pe = edev->pe;
+	if (!eeh_pe_passed(pe)) {
+		pe_bus = eeh_pe_bus_get(pe);
+		BUG_ON(!pe_bus);
+
+		pe->gaddr.kvm		= kvm;
+		pe->gaddr.buid_hi	= BUID_HI(buid);
+		pe->gaddr.buid_lo	= BUID_LO(buid);
+		pe->gaddr.pe_addr	= pe_bus->number << 16;
+		eeh_pe_set_passed(pe, true);
+	} else if (pe->gaddr.kvm != kvm ||
+		   pe->gaddr.buid_hi != BUID_HI(buid) ||
+		   pe->gaddr.buid_lo != BUID_LO(buid)) {
+		pr_warn("%s: Mismatched VM or PHB on passing %s\n",
+			__func__, pci_name(dev));
+		return -EINVAL;
+	}
+
+	edev->gaddr.kvm		= kvm;
+	edev->gaddr.buid_hi	= BUID_HI(buid);
+	edev->gaddr.buid_lo	= BUID_LO(buid);
+	edev->gaddr.bus		= (gbdn >> 8) & 0xff;
+	edev->gaddr.devfn	= gbdn & 0xff;
+	eeh_dev_set_passed(edev, true);
+
+	pr_debug("EEH: Host PCI device %s passed to %lx-%02x:%02x.%01x\n",
+		 pci_name(dev), buid, (gbdn >> 8) & 0xff,
+		 PCI_SLOT(gbdn & 0xff), PCI_FUNC(gbdn & 0xff));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vfio_eeh_dev_map);
+
 static void *__kvmppc_eeh_vfio_release(void *data, void *flag)
 {
 	struct eeh_pe *pe = (struct eeh_pe *)data;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d21cf9..294ce48 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1102,5 +1102,19 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 {
 }
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+#ifdef CONFIG_KVM_EEH
+typedef int (*kvm_vfio_dev_eeh_map)(struct kvm *kvm, int domain,
+				    int bdn, unsigned long buid, int gbdn);
+extern int kvm_vfio_eeh_dev_map(struct kvm *kvm, int domain,
+				int bdn, unsigned long buid, int gbdn);
+#else
+static inline int kvm_vfio_eeh_dev_map(struct kvm *kvm, int domain,
+				       int bdn, unsigned long buid, int gbdn)
+{
+	return 0;
+}
+#endif /* CONFIG_KVM_EEH */
+
 #endif
 
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 03/22] powerpc/eeh: Search EEH device by guest address
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The patch introduces function eeh_vfio_dev_get() to search the EEH
device according to its guest address, which is made up of VM indicator,
PHB BUID, bus, slot and function number. The function is useful in the
backends for EEH RTAS emulation.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h |  6 ++++++
 arch/powerpc/kernel/eeh_pe.c   | 45 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8bfb167..b12e3e9 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -384,6 +384,12 @@ static inline void eeh_remove_device(struct pci_dev *dev) { }
 #define EEH_IO_ERROR_VALUE(size) (-1UL)
 #endif /* CONFIG_EEH */
 
+
+#ifdef CONFIG_KVM_EEH
+struct eeh_dev *eeh_vfio_dev_get(struct eeh_vfio_pci_addr *addr);
+
+#endif /* CONFIG_KVM_EEH */
+
 #ifdef CONFIG_PPC64
 /*
  * MMIO read/write operations with EEH support.
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index fbd01eb..dba7c82 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -248,6 +248,51 @@ struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
 	return pe;
 }
 
+#ifdef CONFIG_KVM_EEH
+static void *__eeh_vfio_dev_get(void *data, void *flag)
+{
+	struct eeh_pe *pe = (struct eeh_pe *)data;
+	struct eeh_vfio_pci_addr *addr = (struct eeh_vfio_pci_addr *)flag;
+	struct eeh_dev *edev, *tmp;
+
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		if (!eeh_dev_passed(edev))
+			continue;
+
+		/* Comparing the address in the guest */
+		if (addr->kvm     == edev->gaddr.kvm &&
+		    addr->buid_hi == edev->gaddr.buid_hi &&
+		    addr->buid_lo == edev->gaddr.buid_lo &&
+		    addr->bus     == edev->gaddr.bus &&
+		    addr->devfn   == edev->gaddr.devfn)
+			return edev;
+	}
+
+	return NULL;
+}
+
+/**
+ * eeh_vfio_dev_get - Search EEH device based on guest's address
+ * @addr: EEH device guest address
+ *
+ * Search the EEH device according to its guest's address, which
+ * is made up of PHB BUID, and PCI config address.
+ */
+struct eeh_dev *eeh_vfio_dev_get(struct eeh_vfio_pci_addr *addr)
+{
+	struct eeh_pe *root;
+	struct eeh_dev *edev;
+
+	list_for_each_entry(root, &eeh_phb_pe, child) {
+		edev = eeh_pe_traverse(root, __eeh_vfio_dev_get, addr);
+		if (edev)
+			return edev;
+	}
+
+	return NULL;
+}
+#endif /* CONFIG_KVM_EEH */
+
 /**
  * eeh_pe_get_parent - Retrieve the parent PE
  * @edev: EEH device
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 08/22] kvm: Address mapping for VFIO device
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The address (domain/bus/slot/function) looks different from the
perspective of host and guest. We have to set up the mapping for
EEH and tear it down accordingly. The patch introduces additional
attributes to KVM VFIO device for address mapping or unmapping.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kvm/Kconfig  |  1 +
 arch/powerpc/kvm/Makefile |  3 +++
 include/uapi/linux/kvm.h  | 10 ++++++++
 virt/kvm/vfio.c           | 60 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 743d2d9..6764fc5 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -64,6 +64,7 @@ config KVM_BOOK3S_64
 	select KVM_BOOK3S_64_HANDLER
 	select KVM
 	select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE
+	select KVM_VFIO if VFIO
 	---help---
 	  Support running unmodified book3s_64 and book3s_32 guest kernels
 	  in virtual machines on book3s_64 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index ce569b6..673038d 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -97,6 +97,9 @@ endif
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
 	book3s_xics.o
 
+kvm-book3s_64-objs-$(CONFIG_KVM_VFIO) += \
+	$(addprefix ../../../virt/kvm/, vfio.o)
+
 kvm-book3s_64-module-objs += \
 	$(KVM)/kvm_main.o \
 	$(KVM)/eventfd.o \
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a8f4ee5..97b4d1e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -932,9 +932,19 @@ struct kvm_device_attr {
 #define  KVM_DEV_VFIO_GROUP			1
 #define   KVM_DEV_VFIO_GROUP_ADD			1
 #define   KVM_DEV_VFIO_GROUP_DEL			2
+#define  KVM_DEV_VFIO_DEV			2
+#define   KVM_DEV_VFIO_DEV_EEH_MAP			1
+#define   KVM_DEV_VFIO_DEV_EEH_UNMAP			2
 #define KVM_DEV_TYPE_ARM_VGIC_V2	5
 #define KVM_DEV_TYPE_FLIC		6
 
+struct kvm_vfio_pci_addr {
+	__u32 domain;	/* Host PHB domain	*/
+	__u32 bdn;	/* Host bus/dev/func	*/
+	__u64 gbuid;	/* Guet PHB BUID	*/
+	__u32 gbdn;	/* Guest bus/dev/func	*/
+};
+
 /*
  * ioctls for VM fds
  */
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index ba1a93f..778015d 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -28,6 +28,10 @@ struct kvm_vfio {
 	struct list_head group_list;
 	struct mutex lock;
 	bool noncoherent;
+#ifdef CONFIG_KVM_EEH
+	kvm_vfio_dev_eeh_map eeh_map;
+	kvm_vfio_dev_eeh_unmap eeh_unmap;
+#endif
 };
 
 static struct vfio_group *kvm_vfio_group_get_external_user(struct file *filep)
@@ -201,12 +205,53 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 	return -ENXIO;
 }
 
+static int kvm_vfio_set_dev(struct kvm_device *dev, long attr, u64 arg)
+{
+	struct kvm_vfio *kv = dev->private;
+	struct kvm_vfio_pci_addr addr;
+	int ret = -ENXIO;
+
+	switch (attr) {
+#ifdef CONFIG_KVM_EEH
+	case KVM_DEV_VFIO_DEV_EEH_MAP:
+		if (copy_from_user(&addr, (void __user *)arg, sizeof(addr))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (kv->eeh_map)
+			ret = kv->eeh_map(dev->kvm, addr.domain,
+					  addr.bdn, addr.gbuid, addr.gbdn);
+		else
+			ret = 0;
+
+		break;
+	case KVM_DEV_VFIO_DEV_EEH_UNMAP:
+		if (copy_from_user(&addr, (void __user *)arg, sizeof(addr))) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (kv->eeh_unmap)
+			ret = kv->eeh_unmap(dev->kvm, addr.domain, addr.bdn);
+		else
+			ret = 0;
+
+		break;
+#endif
+	}
+
+	return ret;
+}
+
 static int kvm_vfio_set_attr(struct kvm_device *dev,
 			     struct kvm_device_attr *attr)
 {
 	switch (attr->group) {
 	case KVM_DEV_VFIO_GROUP:
 		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
+	case KVM_DEV_VFIO_DEV:
+		return kvm_vfio_set_dev(dev, attr->attr, attr->addr);
 	}
 
 	return -ENXIO;
@@ -224,6 +269,16 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
 		}
 
 		break;
+	case KVM_DEV_VFIO_DEV:
+		switch (attr->attr) {
+#ifdef CONFIG_KVM_EEH
+		case KVM_DEV_VFIO_DEV_EEH_MAP:
+		case KVM_DEV_VFIO_DEV_EEH_UNMAP:
+			return 0;
+#endif
+		}
+
+		break;
 	}
 
 	return -ENXIO;
@@ -262,7 +317,10 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
 
 	INIT_LIST_HEAD(&kv->group_list);
 	mutex_init(&kv->lock);
-
+#ifdef CONFIG_KVM_EEH
+	kv->eeh_map = kvm_vfio_eeh_dev_map;
+	kv->eeh_unmap = kvm_vfio_eeh_dev_unmap;
+#endif
 	dev->private = kv;
 
 	return 0;
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 07/22] powerpc/eeh: Function to tear down address mapping
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The patch introduces function kvm_vfio_eeh_dev_unmap(), which is
expected to be called on IOCTL command issued to the VM device, in
order to tear down the address mapping for VFIO PCI device.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/eeh_pe.c | 82 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm_host.h     |  7 ++++
 2 files changed, 89 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 200cd5a..8398efc 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -420,6 +420,88 @@ int kvm_vfio_eeh_dev_map(struct kvm *kvm, int domain,
 }
 EXPORT_SYMBOL_GPL(kvm_vfio_eeh_dev_map);
 
+ /**
+  * kvm_vfio_eeh_dev_unmap - Tear down address mapping for VFIO PCI device
+  *
+  * @kvm: VM descriptor
+  * @domain: host domain
+  * @bdn: host bus/device/function number
+  *
+  * Tear down address mapping for VFIO PCI device.
+  */
+int kvm_vfio_eeh_dev_unmap(struct kvm *kvm, int domain, int bdn)
+{
+	struct pci_bus *bus;
+	struct pci_dev *dev;
+	struct eeh_pe *pe;
+	struct eeh_dev *edev, *tmp;
+	int bus_no, devfn;
+	bool passed;
+
+	/* Find the PCI device in host side */
+	bus_no = (bdn >> 8) & 0xff;
+	devfn = bdn & 0xff;
+	bus = pci_find_bus(domain, bus_no);
+	if (!bus) {
+		pr_warn("%s: PCI bus %04x:%02x not found\n",
+			__func__, domain, bus_no);
+		return -ENODEV;
+	}
+
+	dev = pci_get_slot(bus, devfn);
+	if (!dev) {
+		pr_warn("%s: PCI device %04x:%02x:%02x.%01x not found\n",
+			__func__, domain, bus_no,
+			PCI_SLOT(devfn), PCI_FUNC(devfn));
+		return -ENODEV;
+	}
+
+	/* Mark the EEH device as non-passed */
+	edev = pci_dev_to_eeh_dev(dev);
+	if (!edev) {
+		pr_warn("%s: No EEH dev for PCI device %s\n",
+			__func__, pci_name(dev));
+		return -ENODEV;
+	} else if (!eeh_dev_passed(edev)    ||
+		   !eeh_pe_passed(edev->pe) ||
+		   edev->gaddr.kvm != kvm   ||
+		   edev->pe->gaddr.kvm != kvm) {
+		pr_warn("%s: Non-passsed PCI dev %s or PE\n",
+			__func__, pci_name(dev));
+		return 0;
+	}
+	memset(&edev->gaddr, 0, sizeof(edev->gaddr));
+	eeh_dev_set_passed(edev, false);
+	pr_debug("EEH: Host PCI device %s returned\n",
+		pci_name(dev));
+
+	/*
+	 * Mark the PE as non-passed if all PCI devices
+	 * except P2P bridges are non-passed.
+	 */
+	pe = edev->pe;
+	passed = false;
+	eeh_pe_for_each_dev(pe, edev, tmp) {
+		dev = eeh_dev_to_pci_dev(edev);
+		if (dev && dev->subordinate)
+			continue;
+		if (eeh_dev_passed(edev)) {
+			passed = true;
+			break;
+		}
+	}
+
+	if (!passed) {
+		memset(&pe->gaddr, 0, sizeof(pe->gaddr));
+		eeh_pe_set_passed(pe, false);
+		pr_debug("EEH: PHB#%x-PE#%x returned to host\n",
+			pe->phb->global_number, pe->addr);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vfio_eeh_dev_unmap);
+
 static void *__kvmppc_eeh_vfio_release(void *data, void *flag)
 {
 	struct eeh_pe *pe = (struct eeh_pe *)data;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 294ce48..520b3d0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1106,14 +1106,21 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 #ifdef CONFIG_KVM_EEH
 typedef int (*kvm_vfio_dev_eeh_map)(struct kvm *kvm, int domain,
 				    int bdn, unsigned long buid, int gbdn);
+typedef int (*kvm_vfio_dev_eeh_unmap)(struct kvm *kvm, int domain, int bdn);
 extern int kvm_vfio_eeh_dev_map(struct kvm *kvm, int domain,
 				int bdn, unsigned long buid, int gbdn);
+extern int kvm_vfio_eeh_dev_unmap(struct kvm *kvm, int domain, int bdn);
 #else
 static inline int kvm_vfio_eeh_dev_map(struct kvm *kvm, int domain,
 				       int bdn, unsigned long buid, int gbdn)
 {
 	return 0;
 }
+
+static inline int kvm_vfio_eeh_dev_unmap(struct kvm *kvm, int domain, int bdn)
+{
+	return 0;
+}
 #endif /* CONFIG_KVM_EEH */
 
 #endif
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 10/22] powerpc/eeh: Introduce kvmppc_eeh_format_addr()
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The guest will pass 2 kinds of addresses: the traditional bus/device/
function combo, and the guest-sensitive PE address returned from the host.
The patch introduces function kvmppc_eeh_format_addr() to convert
the guest address information from the RTAS call arguments (struct rtas_args)
and retrieve the EEH device or PE instance if necessary. The function
will be used by subsequent patches.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-rtas.c | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/eeh-rtas.c b/arch/powerpc/platforms/powernv/eeh-rtas.c
index fded461..f04b820 100644
--- a/arch/powerpc/platforms/powernv/eeh-rtas.c
+++ b/arch/powerpc/platforms/powernv/eeh-rtas.c
@@ -39,6 +39,58 @@
 #include "powernv.h"
 #include "pci.h"
 
+/*
+ * Guest is passing 2 types of addresses. First one would be
+ * traditional bus/device/function combo and another one is
+ * PE address, which starts from 0x10000
+ */
+static int kvmppc_eeh_format_addr(struct kvm_vcpu *vcpu,
+				  struct rtas_args *args,
+				  struct eeh_vfio_pci_addr *addr,
+				  bool is_legacy,
+				  struct eeh_dev **pedev,
+				  struct eeh_pe **ppe)
+{
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+
+	if (pedev) *pedev = NULL;
+	if (ppe) *ppe = NULL;
+
+	addr->kvm       = vcpu->kvm;
+	addr->buid_hi   = args->args[1];
+	addr->buid_lo   = args->args[2];
+	if (is_legacy) {
+		addr->bus   = (args->args[0] >> 16) & 0xFF;
+		addr->devfn = (args->args[0] >> 8) & 0xFF;
+
+		edev = eeh_vfio_dev_get(addr);
+		if (!edev) {
+			pr_warn("%s: Can't find VFIO device "
+				"(%08x-%08x-%02x-%02x)\n",
+				__func__, addr->buid_hi,
+				addr->buid_lo, addr->bus, addr->devfn);
+			return -EEXIST;
+		}
+
+		if (pedev) *pedev = edev;
+		if (ppe)   *ppe = edev->pe;
+	} else {
+		addr->pe_addr = args->args[0];
+		pe = eeh_vfio_pe_get(addr);
+		if (!pe) {
+			pr_warn("%s: Can't find PE (%08x-%08x-%x)\n",
+				__func__, addr->buid_hi,
+				addr->buid_lo, addr->pe_addr);
+			return -EEXIST;
+		}
+
+		if (ppe) *ppe = pe;
+	}
+
+	return 0;
+}
+
 /**
  * kvmppc_eeh_rtas - Backend for EEH RTAS emulation
  * @vcpu: KVM virtual CPU
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 09/22] powerpc/powernv: EEH RTAS emulation backend
From: Gavin Shan @ 2014-05-05  1:27 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The implementation of EEH RTAS emulation is split up into 2 layers:
the KVM layer and the powernv platform layer. The KVM layer simply
dispatches RTAS requests from the guest to the powernv platform layer.
After that, the powernv platform layer takes care of the details,
processes the request and returns the result to the KVM layer.

The patch implements the infrastructure of powernv platform layer
for EEH RTAS emulation.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h            | 18 +++++++++
 arch/powerpc/platforms/powernv/Makefile   |  1 +
 arch/powerpc/platforms/powernv/eeh-rtas.c | 64 +++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)
 create mode 100644 arch/powerpc/platforms/powernv/eeh-rtas.c

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 677c719..7384dee 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -49,6 +49,24 @@ struct device_node;
 #define EEH_PE_RST_SETTLE_TIME		1800
 
 #ifdef CONFIG_KVM_EEH
+
+/*
+ * Those EEH RTAS operations are going to be emulated.
+ * According to PAPR specification, there're much more
+ * operations. However, the following RTAS operations
+ * are enough for EEH in guest to work properly.
+ */
+enum {
+	eeh_rtas_first			= 0,
+	eeh_rtas_set_option		= 0,
+	eeh_rtas_set_slot_reset		= 1,
+	eeh_rtas_read_slot_reset_state2	= 2,
+	eeh_rtas_get_config_addr_info2	= 3,
+	eeh_rtas_slot_error_detail	= 4,
+	eeh_rtas_configure_pe		= 5,
+	eeh_rtas_last			= 5
+};
+
 struct eeh_vfio_pci_addr {
 	struct kvm	*kvm;		/* KVM identifier		*/
 	unsigned int	buid_hi;	/* PHB BUID high		*/
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 63cebb9..d8ea670 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,5 +6,6 @@ obj-y			+= opal-msglog.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
 obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o
+obj-$(CONFIG_KVM_EEH)	+= eeh-rtas.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
diff --git a/arch/powerpc/platforms/powernv/eeh-rtas.c b/arch/powerpc/platforms/powernv/eeh-rtas.c
new file mode 100644
index 0000000..fded461
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/eeh-rtas.c
@@ -0,0 +1,64 @@
+/*
+ * The file intends to implement emulation for EEH related RTAS services,
+ * which is expected to be done inside hypervisor. The specific RTAS
+ * service is identified by its unique token. Currently, the tokens
+ * are assigned by QEMU in a dynamic way and the dedicated hcall (0xf000)
+ * was introduced for the purpose of RTAS emulation either in hypervisor
+ * or QEMU.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2014.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/rtas.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/opal.h>
+#include <asm/msi_bitmap.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/tce.h>
+
+#include "powernv.h"
+#include "pci.h"
+
+/**
+ * kvmppc_eeh_rtas - Backend for EEH RTAS emulation
+ * @vcpu: KVM virtual CPU
+ * @args: RTAS parameter; the status word is stored to @args->rets[0]
+ * @op: identifier of the specific EEH RTAS service
+ *
+ * The function will be called when the hypervisor receives an emulation
+ * request on EEH RTAS from the guest. Accordingly, it will dispatch to
+ * specific functions to handle the request.
+ *
+ * No service is handled yet: every @op reaches the default case, which
+ * logs a warning, and the initial status of -3 is handed back through
+ * @args->rets[0] (presumably the RTAS "parameter error" status - to be
+ * confirmed against PAPR).
+ */
+void kvmppc_eeh_rtas(struct kvm_vcpu *vcpu, struct rtas_args *args, int op)
+{
+	int ret = -3;
+
+	/* Parse the requested service */
+	switch (op) {
+	default:
+		pr_warn("%s: Unsupported EEH RTAS service#%d\n",
+			__func__, op);
+	}
+
+	args->rets[0] = ret;
+}
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 12/22] powerpc/eeh: Emulate RTAS call ibm,set-slot-reset
From: Gavin Shan @ 2014-05-05  1:28 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The RTAS call "ibm,set-slot-reset" is used to reset one particular
PE, with either a fundamental or a hot reset. The patch implements
the backend to emulate the RTAS call.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-rtas.c | 92 +++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/eeh-rtas.c b/arch/powerpc/platforms/powernv/eeh-rtas.c
index 1a037fd..3e38d13 100644
--- a/arch/powerpc/platforms/powernv/eeh-rtas.c
+++ b/arch/powerpc/platforms/powernv/eeh-rtas.c
@@ -171,6 +171,95 @@ out:
 	return ret;
 }
 
+/**
+ * kvmppc_eeh_set_reset - Emulate RTAS call "ibm,set-slot-reset"
+ * @vcpu: KVM virtual CPU that issued the call
+ * @args: RTAS parameter; args[3] is the reset opcode
+ *
+ * Validate the request, resolve the PE it addresses and ask the PHB's
+ * EEH backend to assert or deassert the reset. When the reset is
+ * deasserted, the PHB error state is cleared first and the PE is
+ * thawed (MMIO, then DMA) afterwards. Failures of those two auxiliary
+ * steps are logged but do not fail the call.
+ *
+ * Return: 0 on success, -3 on malformed arguments or an address that
+ * can't be resolved, -7 if EEH or the required backend operations
+ * aren't available on the PHB, -1 if the backend fails the reset.
+ */
+static int kvmppc_eeh_set_reset(struct kvm_vcpu *vcpu,
+				struct rtas_args *args)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct eeh_vfio_pci_addr addr;
+	int opcode;
+	s64 rc;
+	int ret = 0;
+
+	/* Sanity check on parameter */
+	if (args->nargs != 4 || args->nret != 1) {
+		pr_warn("%s: Non-matched arguments (%d, %d) - (4, 1)\n",
+			__func__, args->nargs, args->nret);
+		ret = -3;
+		goto out;
+	}
+
+	/* Sanity check on opcode */
+	opcode = args->args[3];
+	if (opcode != EEH_RESET_DEACTIVATE &&
+	    opcode != EEH_RESET_HOT &&
+	    opcode != EEH_RESET_FUNDAMENTAL) {
+		pr_warn("%s: Unsupported opcode %d\n",
+			__func__, opcode);
+		ret = -3;
+		goto out;
+	}
+
+	/* Figure out the address. We always have PE address */
+	if (kvmppc_eeh_format_addr(vcpu, args, &addr, false, &edev, &pe)) {
+		ret = -3;
+		goto out;
+	}
+
+	/* Ensure that the EEH stuff has been initialized */
+	hose = pe->phb;
+	phb = hose->private_data;
+	if (!(phb->flags & PNV_PHB_FLAG_EEH)) {
+		pr_warn("%s: EEH disabled on PHB#%d\n",
+			__func__, hose->global_number);
+		ret = -7;
+		goto out;
+	}
+
+	/* The IODA dependent backend has to provide both operations */
+	if (!phb->eeh_ops ||
+	    !phb->eeh_ops->set_option ||
+	    !phb->eeh_ops->reset) {
+		pr_warn("%s: Unsupported request\n", __func__);
+		ret = -7;
+		goto out;
+	}
+
+	/*
+	 * The frozen PE might be caused by the mechanism called
+	 * PAPR error injection, which is supposed to be one-shot
+	 * without "sticky" bit as being stated by the spec. But
+	 * the reality isn't that, at least on P7IOC. So we have
+	 * to clear that to avoid recursive errors, which would
+	 * fail the recovery.
+	 */
+	if (opcode == EEH_RESET_DEACTIVATE) {
+		rc = opal_pci_reset(phb->opal_id,
+				    OPAL_PHB_ERROR,
+				    OPAL_ASSERT_RESET);
+		if (rc != OPAL_SUCCESS)
+			pr_warn("%s: Failure %lld clearing PHB error\n",
+				__func__, rc);
+	}
+
+	/* Call into the IODA dependent backend to do the reset */
+	if (phb->eeh_ops->reset(pe, opcode)) {
+		pr_warn("%s: Failure from backend\n",
+			__func__);
+		ret = -1;
+		goto out;
+	}
+
+	/*
+	 * The PE is still in frozen state and we need to clear that.
+	 * It's good to clear the frozen state after deassert to avoid
+	 * messy IO access during reset, which might cause a recursive
+	 * frozen PE. Both thaws are always attempted; a failure is
+	 * only reported in the log.
+	 */
+	if (opcode == EEH_RESET_DEACTIVATE) {
+		if (phb->eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO))
+			pr_warn("%s: Failure thawing MMIO\n", __func__);
+		if (phb->eeh_ops->set_option(pe, EEH_OPT_THAW_DMA))
+			pr_warn("%s: Failure thawing DMA\n", __func__);
+	}
+
+out:
+	return ret;
+}
+
 /**
  * kvmppc_eeh_rtas - Backend for EEH RTAS emulation
  * @vcpu: KVM virtual CPU
@@ -190,6 +279,9 @@ void kvmppc_eeh_rtas(struct kvm_vcpu *vcpu, struct rtas_args *args, int op)
 	case eeh_rtas_set_option:
 		ret = kvmppc_eeh_set_option(vcpu, args);
 		break;
+	case eeh_rtas_set_slot_reset:
+		ret = kvmppc_eeh_set_reset(vcpu, args);
+		break;
 	default:
 		pr_warn("%s: Unsupported EEH RTAS service#%d\n",
 			__func__, op);
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 11/22] powerpc/eeh: Emulate RTAS call ibm,set-eeh-option
From: Gavin Shan @ 2014-05-05  1:28 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The RTAS call "ibm,set-eeh-option" is being used to enable/disable
EEH functionality on the specified PE, or enable MMIO/DMA for the
frozen PE. The patch emulates the RTAS call.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-rtas.c | 83 +++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/eeh-rtas.c b/arch/powerpc/platforms/powernv/eeh-rtas.c
index f04b820..1a037fd 100644
--- a/arch/powerpc/platforms/powernv/eeh-rtas.c
+++ b/arch/powerpc/platforms/powernv/eeh-rtas.c
@@ -91,6 +91,86 @@ static int kvmppc_eeh_format_addr(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * Emulate RTAS call "ibm,set-eeh-option".
+ *
+ * args[3] carries the opcode. Enabling or disabling EEH is answered
+ * with success without touching the hardware; thawing MMIO or DMA is
+ * forwarded to the PHB's EEH backend.
+ *
+ * Returns 0 on success, -3 on malformed arguments, -7 if the address
+ * can't be resolved or EEH (or the backend operation) isn't available
+ * on the PHB, and -1 if the backend reports a failure.
+ */
+static int kvmppc_eeh_set_option(struct kvm_vcpu *vcpu,
+				 struct rtas_args *args)
+{
+	struct eeh_vfio_pci_addr addr;
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	bool is_legacy;
+	int opcode;
+	int ret = 0;
+
+	/* The call takes exactly 4 inputs and produces 1 output */
+	if (args->nargs != 4 || args->nret != 1) {
+		pr_warn("%s: Non-matched arguments (%d, %d) - (4, 1)\n",
+			__func__, args->nargs, args->nret);
+		return -3;
+	}
+
+	/* Reject opcodes outside of the known range */
+	opcode = args->args[3];
+	if (opcode < EEH_OPT_DISABLE || opcode > EEH_OPT_THAW_DMA) {
+		pr_warn("%s: opcode %d out of range (%d, %d)\n",
+			__func__, opcode, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+		return -3;
+	}
+
+	/* Only the enable request uses the legacy address format */
+	is_legacy = (opcode == EEH_OPT_ENABLE);
+	if (kvmppc_eeh_format_addr(vcpu, args, &addr, is_legacy, &edev, &pe))
+		return -7;
+
+	/* EEH must have been initialized on the PHB */
+	hose = pe->phb;
+	phb = hose->private_data;
+	if (!(phb->flags & PNV_PHB_FLAG_EEH)) {
+		pr_warn("%s: EEH disabled on PHB#%d\n",
+			__func__, hose->global_number);
+		return -7;
+	}
+
+	/*
+	 * EEH is enabled on all PEs by default, so enabling it is a
+	 * no-op that succeeds. Disabling is answered the same way,
+	 * although the guest isn't expected to ask for that at all.
+	 */
+	if (opcode == EEH_OPT_ENABLE || opcode == EEH_OPT_DISABLE)
+		return 0;
+
+	/* Let the IODA dependent backend thaw MMIO or DMA for the PE */
+	if (!phb->eeh_ops || !phb->eeh_ops->set_option) {
+		pr_warn("%s: Unsupported request\n",
+			__func__);
+		ret = -7;
+	} else if (phb->eeh_ops->set_option(pe, opcode)) {
+		pr_warn("%s: Failure from backend\n",
+			__func__);
+		ret = -1;
+	}
+
+	return ret;
+}
+
 /**
  * kvmppc_eeh_rtas - Backend for EEH RTAS emulation
  * @vcpu: KVM virtual CPU
@@ -107,6 +187,9 @@ void kvmppc_eeh_rtas(struct kvm_vcpu *vcpu, struct rtas_args *args, int op)
 
 	/* Parse the requested service */
 	switch (op) {
+	case eeh_rtas_set_option:
+		ret = kvmppc_eeh_set_option(vcpu, args);
+		break;
 	default:
 		pr_warn("%s: Unsupported EEH RTAS service#%d\n",
 			__func__, op);
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 15/22] powerpc/eeh: Emulate RTAS call ibm,slot-error-detail
From: Gavin Shan @ 2014-05-05  1:28 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The RTAS call "ibm,slot-error-detail" is being used to retrieve the
error log (either permanent or temporary) from the underlying firmware.
The patch implements the backend to emulate the RTAS call.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-rtas.c | 75 +++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/eeh-rtas.c b/arch/powerpc/platforms/powernv/eeh-rtas.c
index 4a9c2c7..8934564 100644
--- a/arch/powerpc/platforms/powernv/eeh-rtas.c
+++ b/arch/powerpc/platforms/powernv/eeh-rtas.c
@@ -390,6 +390,78 @@ out:
 	return ret;
 }
 
+/**
+ * kvmppc_eeh_get_error - Emulate RTAS call "ibm,slot-error-detail"
+ * @vcpu: KVM virtual CPU that issued the call
+ * @args: RTAS parameter; args[5] is the guest address of the log
+ *        buffer, args[6] its length and args[7] the log type (1 or 2)
+ *
+ * Fetch the error log of the addressed PE from the PHB's EEH backend
+ * into a bounce buffer and copy it to the guest supplied buffer.
+ *
+ * The buffer length comes from the guest, so it is validated before it
+ * is used as allocation size: non-positive lengths are rejected and
+ * anything beyond RTAS_ERROR_LOG_MAX is clamped. The guest address is
+ * kept in a gpa_t so that addresses with the top bit of the 32-bit
+ * argument set are not sign-extended.
+ *
+ * Return: 0 if the log was copied to the guest, 1 otherwise.
+ */
+static int kvmppc_eeh_get_error(struct kvm_vcpu *vcpu,
+				struct rtas_args *args)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct eeh_vfio_pci_addr addr;
+	char *log;
+	gpa_t guest_log;
+	int len, severity;
+	int ret = 0;
+
+	/* Sanity check on parameter */
+	if (args->nargs != 8 || args->nret != 1) {
+		pr_warn("%s: Non-matched arguments (%d, %d) - (8, 1)\n",
+			__func__, args->nargs, args->nret);
+		ret = 1;
+		goto out;
+	}
+
+	severity = args->args[7];
+	if (severity != 1 && severity != 2) {
+		pr_warn("%s: Invalid Log type\n", __func__);
+		ret = 1;
+		goto out;
+	}
+
+	/* Never trust the guest supplied buffer length */
+	guest_log = args->args[5];
+	len = args->args[6];
+	if (len <= 0) {
+		pr_warn("%s: Invalid log buffer length %d\n",
+			__func__, len);
+		ret = 1;
+		goto out;
+	}
+	if (len > RTAS_ERROR_LOG_MAX)
+		len = RTAS_ERROR_LOG_MAX;
+
+	/* Figure out the address */
+	if (kvmppc_eeh_format_addr(vcpu, args, &addr, false, &edev, &pe)) {
+		ret = 1;
+		goto out;
+	}
+
+	/* Make sure that the EEH stuff has been initialized */
+	hose = pe->phb;
+	phb = hose->private_data;
+	if (!(phb->flags & PNV_PHB_FLAG_EEH)) {
+		pr_warn("%s: EEH disabled on PHB#%d\n",
+			__func__, hose->global_number);
+		ret = 1;
+		goto out;
+	}
+
+	/* The backend has to be able to retrieve the log */
+	if (!phb->eeh_ops || !phb->eeh_ops->get_log) {
+		ret = 1;
+		goto out;
+	}
+
+	/*
+	 * Retrieve error log from PE. We don't have cached error
+	 * log for one specific PE yet, which need to be figured
+	 * out later.
+	 */
+	log = kzalloc(len, GFP_KERNEL);
+	if (!log) {
+		pr_err("%s: Out of memory!\n", __func__);
+		ret = 1;
+		goto out;
+	}
+
+	/* Don't hand an empty buffer to the guest as a valid log */
+	if (phb->eeh_ops->get_log(pe, severity, log, len)) {
+		pr_warn("%s: Failure from backend\n", __func__);
+		ret = 1;
+		goto out_free;
+	}
+
+	if (kvm_write_guest(vcpu->kvm, guest_log, log, len)) {
+		pr_warn("%s: Fail pushing log to guest\n",
+			__func__);
+		ret = 1;
+	}
+
+out_free:
+	kfree(log);
+out:
+	return ret;
+}
+
 /**
  * kvmppc_eeh_rtas - Backend for EEH RTAS emulation
  * @vcpu: KVM virtual CPU
@@ -418,6 +490,9 @@ void kvmppc_eeh_rtas(struct kvm_vcpu *vcpu, struct rtas_args *args, int op)
 	case eeh_rtas_get_config_addr_info2:
 		ret = kvmppc_eeh_get_addr2(vcpu, args);
 		break;
+	case eeh_rtas_slot_error_detail:
+		ret = kvmppc_eeh_get_error(vcpu, args);
+		break;
 	default:
 		pr_warn("%s: Unsupported EEH RTAS service#%d\n",
 			__func__, op);
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 13/22] powerpc/eeh: Emulate RTAS call ibm, read-slot-reset-state2
From: Gavin Shan @ 2014-05-05  1:28 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The RTAS call "ibm,read-slot-reset-state2" is being used to retrieve
the various states of the specified PE, e.g. reset state, frozen DMA,
frozen MMIO etc. The patch implements the backend to emulate the
RTAS call.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-rtas.c | 77 +++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/eeh-rtas.c b/arch/powerpc/platforms/powernv/eeh-rtas.c
index 3e38d13..031ee8c 100644
--- a/arch/powerpc/platforms/powernv/eeh-rtas.c
+++ b/arch/powerpc/platforms/powernv/eeh-rtas.c
@@ -260,6 +260,80 @@ out:
 	return ret;
 }
 
+/**
+ * kvmppc_eeh_get_state2 - Emulate RTAS call "ibm,read-slot-reset-state2"
+ * @vcpu: KVM virtual CPU that issued the call
+ * @args: RTAS parameter; 3 inputs and 4 or 5 outputs are expected
+ *
+ * Query the PE state from the PHB's EEH backend and report it to the
+ * guest through the output words:
+ *
+ *   rets[1] - PE state (see the table in the function body)
+ *   rets[2] - 1 if EEH is supported on the PE, 0 if EEH is disabled
+ *             on the PHB
+ *   rets[3] - 1000 while the PE state is unavailable (presumably the
+ *             time in milliseconds the guest should wait before asking
+ *             again - to be confirmed against PAPR), otherwise 0
+ *
+ * Return: 0 on success, -3 on malformed arguments, an address that
+ * can't be resolved, or missing EEH support on the PHB.
+ */
+static int kvmppc_eeh_get_state2(struct kvm_vcpu *vcpu,
+				 struct rtas_args *args)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct eeh_vfio_pci_addr addr;
+	bool in_reset, dma_on, mmio_on;
+	int result, ret = 0;
+
+	/* Sanity check on parameter */
+	if (args->nargs != 3 || (args->nret != 4 && args->nret != 5)) {
+		pr_warn("%s: Non-matched argument (%d, %d) - (3, 4/5)\n",
+			__func__, args->nargs, args->nret);
+		ret = -3;
+		goto out;
+	}
+
+	/* Figure out the address */
+	if (kvmppc_eeh_format_addr(vcpu, args, &addr, false, &edev, &pe)) {
+		ret = -3;
+		goto out;
+	}
+
+	/* Make sure that the EEH stuff has been initialized */
+	hose = pe->phb;
+	phb = hose->private_data;
+	if (!(phb->flags & PNV_PHB_FLAG_EEH)) {
+		pr_warn("%s: EEH disabled on PHB#%d\n",
+			__func__, hose->global_number);
+		ret = -3;
+		args->rets[2] = 0;
+		goto out;
+	}
+
+	/*
+	 * Mark EEH supported on the PCI device. Otherwise,
+	 * the PE state is meaningless to the guest
+	 */
+	args->rets[2] = 1;
+
+	/* Call to the IOC dependent function */
+	if (!phb->eeh_ops || !phb->eeh_ops->get_state) {
+		pr_warn("%s: Unsupported request\n",
+			__func__);
+		ret = -3;
+		goto out;
+	}
+
+	result = phb->eeh_ops->get_state(pe);
+	in_reset = !!(result & EEH_STATE_RESET_ACTIVE);
+	dma_on = !!(result & EEH_STATE_DMA_ENABLED);
+	mmio_on = !!(result & EEH_STATE_MMIO_ENABLED);
+
+	/*
+	 * Translate the backend state to the PE state seen by the
+	 * guest:
+	 *
+	 *   0 - not in reset, MMIO and DMA enabled
+	 *   1 - reset active
+	 *   2 - not in reset, MMIO and DMA stopped
+	 *   4 - not in reset, MMIO enabled but DMA stopped
+	 *   5 - state unavailable, retry later
+	 *
+	 * "MMIO stopped but DMA enabled" has no state of its own, so
+	 * it is reported as unavailable. The wait time is only set for
+	 * the unavailable state and cleared in all other cases.
+	 */
+	args->rets[3] = 0;
+	if (in_reset) {
+		args->rets[1] = 1;
+	} else if (dma_on && mmio_on) {
+		args->rets[1] = 0;
+	} else if (!dma_on && !mmio_on) {
+		args->rets[1] = 2;
+	} else if (mmio_on) {
+		args->rets[1] = 4;
+	} else {
+		args->rets[1] = 5;
+		args->rets[3] = 1000;
+	}
+
+out:
+	return ret;
+}
+
 /**
  * kvmppc_eeh_rtas - Backend for EEH RTAS emulation
  * @vcpu: KVM virtual CPU
@@ -282,6 +356,9 @@ void kvmppc_eeh_rtas(struct kvm_vcpu *vcpu, struct rtas_args *args, int op)
 	case eeh_rtas_set_slot_reset:
 		ret = kvmppc_eeh_set_reset(vcpu, args);
 		break;
+	case eeh_rtas_read_slot_reset_state2:
+		ret = kvmppc_eeh_get_state2(vcpu, args);
+		break;
 	default:
 		pr_warn("%s: Unsupported EEH RTAS service#%d\n",
 			__func__, op);
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 16/22] powerpc/eeh: Emulate RTAS call ibm,configure-pe
From: Gavin Shan @ 2014-05-05  1:28 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The RTAS call "ibm,configure-pe" is used to restore everything
after a PE reset. The patch implements the backend to emulate the
RTAS call. As part of that, we restore the BARs of the affected PCI
devices on the host side, because the guest might not have full
access to the config space.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-rtas.c | 49 +++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/eeh-rtas.c b/arch/powerpc/platforms/powernv/eeh-rtas.c
index 8934564..a663cd8 100644
--- a/arch/powerpc/platforms/powernv/eeh-rtas.c
+++ b/arch/powerpc/platforms/powernv/eeh-rtas.c
@@ -462,6 +462,52 @@ out:
 	return ret;
 }
 
+/**
+ * kvmppc_eeh_configure_pe - Emulate RTAS call "ibm,configure-pe"
+ * @vcpu: KVM virtual CPU that issued the call
+ * @args: RTAS parameter; 3 inputs and 1 output are expected
+ *
+ * Resolve the addressed PE and restore its BARs on the host side.
+ *
+ * Return: 0 on success, -3 on malformed arguments, an address that
+ * can't be resolved, or missing EEH support on the PHB.
+ */
+static int kvmppc_eeh_configure_pe(struct kvm_vcpu *vcpu,
+				   struct rtas_args *args)
+{
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	struct eeh_vfio_pci_addr addr;
+	int ret = 0;
+
+	/* Sanity check on parameter */
+	if (args->nargs != 3 || args->nret != 1) {
+		pr_warn("%s: Non-matched arguments (%d, %d) - (3, 1)\n",
+			__func__, args->nargs, args->nret);
+		ret = -3;
+		goto out;
+	}
+
+	/* Figure out the address */
+	if (kvmppc_eeh_format_addr(vcpu, args, &addr, false, &edev, &pe)) {
+		ret = -3;
+		goto out;
+	}
+
+	/*
+	 * Make sure that the EEH stuff has been initialized. The PHB
+	 * number is printed in decimal like in the other EEH RTAS
+	 * handlers.
+	 */
+	hose = pe->phb;
+	phb = hose->private_data;
+	if (!(phb->flags & PNV_PHB_FLAG_EEH)) {
+		pr_warn("%s: EEH disabled on PHB#%d\n",
+			__func__, hose->global_number);
+		ret = -3;
+		goto out;
+	}
+
+	/*
+	 * The access to PCI config space on a VFIO device has some
+	 * limitations. Part of the PCI config space, including the BAR
+	 * registers, is neither readable nor writable. So the guest
+	 * may have stale values for those registers and we have to
+	 * restore them on the host side.
+	 */
+	eeh_pe_restore_bars(pe);
+out:
+	return ret;
+}
+
 /**
  * kvmppc_eeh_rtas - Backend for EEH RTAS emulation
  * @vcpu: KVM virtual CPU
@@ -493,6 +539,9 @@ void kvmppc_eeh_rtas(struct kvm_vcpu *vcpu, struct rtas_args *args, int op)
 	case eeh_rtas_slot_error_detail:
 		ret = kvmppc_eeh_get_error(vcpu, args);
 		break;
+	case eeh_rtas_configure_pe:
+		ret = kvmppc_eeh_configure_pe(vcpu, args);
+		break;
 	default:
 		pr_warn("%s: Unsupported EEH RTAS service#%d\n",
 			__func__, op);
-- 
1.8.3.2

^ permalink raw reply related

* [PATCH 14/22] powerpc/eeh: Emulate RTAS call ibm, get-config-addr-info2
From: Gavin Shan @ 2014-05-05  1:28 UTC (permalink / raw)
  To: linuxppc-dev, kvm, kvm-ppc; +Cc: aik, alex.williamson, qiudayu, Gavin Shan
In-Reply-To: <1399253291-3975-1-git-send-email-gwshan@linux.vnet.ibm.com>

The RTAS call "ibm,get-config-addr-info2" is being used by guest
to retrieve the corresponding PE number for the specified PCI device.
The patch implements the backend to support the emulation of the
RTAS call.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/powernv/eeh-rtas.c | 59 +++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/eeh-rtas.c b/arch/powerpc/platforms/powernv/eeh-rtas.c
index 031ee8c..4a9c2c7 100644
--- a/arch/powerpc/platforms/powernv/eeh-rtas.c
+++ b/arch/powerpc/platforms/powernv/eeh-rtas.c
@@ -334,6 +334,62 @@ out:
 	return ret;
 }
 
+/*
+ * Emulate RTAS call "ibm,get-config-addr-info2".
+ *
+ * args[3] selects what is stored to rets[1]: opcode 0 yields the
+ * guest PE address of the resolved PE, opcode 1 yields the constant 1.
+ *
+ * Returns 0 on success and -3 on malformed arguments, an address that
+ * can't be resolved, or missing EEH support on the PHB.
+ */
+static int kvmppc_eeh_get_addr2(struct kvm_vcpu *vcpu,
+				struct rtas_args *args)
+{
+	struct eeh_vfio_pci_addr addr;
+	struct pci_controller *hose;
+	struct pnv_phb *phb;
+	struct eeh_dev *edev;
+	struct eeh_pe *pe;
+	int opcode;
+	int ret = 0;
+
+	/* The call takes exactly 4 inputs and produces 2 outputs */
+	if (args->nargs != 4 || args->nret != 2) {
+		pr_warn("%s: Non-matched arguments (%d, %d) - (4, 2)\n",
+			__func__, args->nargs, args->nret);
+		return -3;
+	}
+
+	/* Only operation codes 0 and 1 are known */
+	opcode = args->args[3];
+	if (opcode != 0 && opcode != 1) {
+		pr_warn("%s: opcode %d out of range (0, 1)\n",
+			__func__, opcode);
+		return -3;
+	}
+
+	/* Resolve the PE from the legacy style address */
+	if (kvmppc_eeh_format_addr(vcpu, args, &addr, true, &edev, &pe))
+		return -3;
+
+	/* EEH must have been initialized on the PHB */
+	hose = pe->phb;
+	phb = hose->private_data;
+	if (!(phb->flags & PNV_PHB_FLAG_EEH)) {
+		pr_warn("%s: EEH disabled on PHB#%d\n",
+			__func__, hose->global_number);
+		return -3;
+	}
+
+	/*
+	 * Fill the result according to the opcode. PCI bus and
+	 * device sensitive PEs are not told apart here.
+	 */
+	args->rets[1] = (opcode == 0) ? pe->gaddr.pe_addr : 1;
+
+	return ret;
+}
+
 /**
  * kvmppc_eeh_rtas - Backend for EEH RTAS emulation
  * @vcpu: KVM virtual CPU
@@ -359,6 +415,9 @@ void kvmppc_eeh_rtas(struct kvm_vcpu *vcpu, struct rtas_args *args, int op)
 	case eeh_rtas_read_slot_reset_state2:
 		ret = kvmppc_eeh_get_state2(vcpu, args);
 		break;
+	case eeh_rtas_get_config_addr_info2:
+		ret = kvmppc_eeh_get_addr2(vcpu, args);
+		break;
 	default:
 		pr_warn("%s: Unsupported EEH RTAS service#%d\n",
 			__func__, op);
-- 
1.8.3.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox