From mboxrd@z Thu Jan 1 00:00:00 1970 From: Xiao Guangrong Subject: Re: [PATCH V4 4/7] KVM, pkeys: add pkeys support for permission_fault logic Date: Sun, 6 Mar 2016 16:00:07 +0800 Message-ID: <56DBE387.4080704@linux.intel.com> References: <1457177252-7577-1-git-send-email-huaitong.han@intel.com> <1457177252-7577-5-git-send-email-huaitong.han@intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 7bit Cc: kvm@vger.kernel.org To: Huaitong Han , pbonzini@redhat.com, gleb@kernel.org Return-path: Received: from mga01.intel.com ([192.55.52.88]:4282 "EHLO mga01.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751035AbcCFIA1 (ORCPT ); Sun, 6 Mar 2016 03:00:27 -0500 In-Reply-To: <1457177252-7577-5-git-send-email-huaitong.han@intel.com> Sender: kvm-owner@vger.kernel.org List-ID: On 03/05/2016 07:27 PM, Huaitong Han wrote: > Protection keys define a new 4-bit protection key field (PKEY) in bits > 62:59 of leaf entries of the page tables, the PKEY is an index to PKRU > register(16 domains), every domain has 2 bits(write disable bit, access > disable bit). > > Static logic has been produced in update_permission_bitmask, dynamic logic > need read pkey from page table entries, get pkru value, and deduce the > correct result. > > Signed-off-by: Huaitong Han > --- > Changes in v4: > *Patch has rebased on http://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/log/?h=mm/pkeys > pte_pkey function has been deleted in kernel PKU support, so KVM patches use > pte_flags_pkey(pte_flags(pte)) instead of pte_pkey(pte). > > arch/x86/kvm/mmu.h | 56 +++++++++++++++++++++++++++++++++++++++++----- > arch/x86/kvm/paging_tmpl.h | 18 ++++++++++++--- > arch/x86/kvm/x86.c | 2 +- > 3 files changed, 67 insertions(+), 9 deletions(-) > > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h > index 55ffb7b..7a9f627 100644 > --- a/arch/x86/kvm/mmu.h > +++ b/arch/x86/kvm/mmu.h > @@ -3,6 +3,7 @@ > > #include > #include "kvm_cache_regs.h" > +#include "x86.h" > > #define PT64_PT_BITS 9 > #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) > @@ -24,6 +25,7 @@ > #define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT) > #define PT_PAT_MASK (1ULL << 7) > #define PT_GLOBAL_MASK (1ULL << 8) > + > #define PT64_NX_SHIFT 63 > #define PT64_NX_MASK (1ULL << PT64_NX_SHIFT) > > @@ -45,6 +47,10 @@ > #define PT_PAGE_TABLE_LEVEL 1 > #define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) > > +#define PKRU_READ 0 > +#define PKRU_WRITE 1 > +#define PKRU_ATTRS 2 > + > static inline u64 rsvd_bits(int s, int e) > { > return ((1ULL << (e - s + 1)) - 1) << s; > @@ -145,10 +151,50 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) > * fault with the given access (in ACC_* format)? > */ > static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > - unsigned pte_access, unsigned pfec) > + unsigned pte_access, unsigned pte_pkeys, unsigned pfec) > { > - int cpl = kvm_x86_ops->get_cpl(vcpu); > - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); > + unsigned long smap, rflags; > + u32 pkru, pkru_bits; > + int cpl, index; > + bool wf, uf; > + > + cpl = kvm_x86_ops->get_cpl(vcpu); > + rflags = kvm_x86_ops->get_rflags(vcpu); > + > + /* > + * PKU is computed dynamically in permission_fault. > + * 2nd and 6th conditions: > + * 2.EFER_LMA=1 > + * 6.PKRU.AD=1 > + * or The access is a data write and PKRU.WD=1 and > + * either CR0.WP=1 or it is a user mode access > + */ > + pkru = is_long_mode(vcpu) ? read_pkru() : 0; > + if (unlikely(pkru) && (pfec & PFERR_PK_MASK)) > + { > + /* > + * PKRU defines 32 bits, there are 16 domains and 2 attribute bits per > + * domain in pkru, pkey is the index to a defined domain, so the value > + * of pkey * PKRU_ATTRS is offset of a defined domain. > + */ > + pkru_bits = (pkru >> (pte_pkeys * PKRU_ATTRS)) & 3; > + > + wf = pfec & PFERR_WRITE_MASK; > + uf = pfec & PFERR_USER_MASK; > + > + /* > + * Ignore PKRU.WD if not relevant to this access (a read, > + * or a supervisor mode access if CR0.WP=0). > + * So 6th conditions is equivalent to "pkru_bits != 0" > + */ > + if (!wf || (!uf && !is_write_protection(vcpu))) > + pkru_bits &= ~(1 << PKRU_WRITE); > + > + /* Flip pfec on PK bit if pkru_bits is zero */ > + pfec ^= pkru_bits ? 0 : PFERR_PK_MASK; > + } > + else > + pfec &= ~PFERR_PK_MASK; > > /* > * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. > @@ -163,8 +209,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > * but it will be one in index if SMAP checks are being overridden. > * It is important to keep this branchless. > */ > - unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); > - int index = (pfec >> 1) + > + smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); > + index = (pfec >> 1) + > (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); See my comments in the previous patch. > > WARN_ON(pfec & PFERR_RSVD_MASK); > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h > index 6c9fed9..9afc066 100644 > --- a/arch/x86/kvm/paging_tmpl.h > +++ b/arch/x86/kvm/paging_tmpl.h > @@ -253,6 +253,15 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, > } > return 0; > } > +static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) > +{ > + unsigned pkeys = 0; > +#if PTTYPE == 64 > + pte_t pte = {.pte = gpte}; > + pkeys = pte_flags_pkey(pte_flags(pte)); > +#endif It works for PAE guest either. It is not a problem as the bits used by PKEY are reserved on PAE mode which should be figured out by is_rsvd_bits_set() prior to this call and PKEY is ignored if EFER.LMA = 0. But please add some comments here to explain why it is safe. > + return pkeys; > +} > > /* > * Fetch a guest pte for a guest virtual address > @@ -265,12 +274,13 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, > pt_element_t pte; > pt_element_t __user *uninitialized_var(ptep_user); > gfn_t table_gfn; > - unsigned index, pt_access, pte_access, accessed_dirty; > + unsigned index, pt_access, pte_access, accessed_dirty, pte_pkeys; > gpa_t pte_gpa; > int offset; > const int write_fault = access & PFERR_WRITE_MASK; > const int user_fault = access & PFERR_USER_MASK; > const int fetch_fault = access & PFERR_FETCH_MASK; > + const int pk_fault = access & PFERR_PK_MASK; > u16 errcode = 0; > gpa_t real_gpa; > gfn_t gfn; > @@ -356,7 +366,9 @@ retry_walk: > walker->ptes[walker->level - 1] = pte; > } while (!is_last_gpte(mmu, walker->level, pte)); > > - if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { > + pte_pkeys = FNAME(gpte_pkeys)(vcpu, pte); > + if (unlikely(permission_fault(vcpu, mmu, pte_access, pte_pkeys, > + access))) { > errcode |= PFERR_PRESENT_MASK; > goto error; > } > @@ -399,7 +411,7 @@ retry_walk: > return 1; > > error: > - errcode |= write_fault | user_fault; > + errcode |= write_fault | user_fault | pk_fault; No. The pk_fault is not always caused by guest page table as it's depends on CR0.WP which is always true on shadow page table. > if (fetch_fault && (mmu->nx || > kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) > errcode |= PFERR_FETCH_MASK; > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 05e7838..b697a99 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -4317,7 +4317,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, > > if (vcpu_match_mmio_gva(vcpu, gva) > && !permission_fault(vcpu, vcpu->arch.walk_mmu, > - vcpu->arch.access, access)) { > + vcpu->arch.access, 0, access)) { No. The pkey is not always 0. We should cache PKEY for the mmio access and use it here to check if the right is adequate.