From: Tomasz Wroblewski <tomasz.wroblewski@gmail.com>
To: Jan Beulich <JBeulich@suse.com>
Cc: xen-devel@lists.xenproject.org
Subject: Re: GPU passthrough performance regression in >4GB vms due to XSA-60 changes
Date: Mon, 19 May 2014 16:20:12 +0200
Message-ID: <537A131C.8010303@gmail.com>
In-Reply-To: <537A18B90200007800013A09@mail.emea.novell.com>
[-- Attachment #1: Type: text/plain, Size: 1561 bytes --]
On 05/19/2014 02:44 PM, Jan Beulich wrote:
>>>> On 19.05.14 at 14:17, <tomasz.wroblewski@gmail.com> wrote:
>> On 05/19/2014 02:06 PM, Jan Beulich wrote:
>>>>>> On 19.05.14 at 13:32, <tomasz.wroblewski@gmail.com> wrote:
>>>> Yeah, I spent about a day trying to port us onto unstable and test
>>>> there, but it sadly looks to be a bigger job, so I'm leaving that as
>>>> a last resort (though I plan to spend a couple more days on it soon).
>>> Then as an alternative did you try pulling over the EPT changes
>>> from -unstable?
>> That would indeed be preferable. I've looked over them but couldn't
>> figure out which particular change fixes the EPT update after MTRR
>> enable. Do you remember which one that was? I could test it and try to
>> narrow down any other commits it'd require (it seems there were a lot
>> of EPT-related changes).
> I used plural for a reason - I'm afraid you would need to start out with
> taking them all, and then possibly determine which ones to drop as
> being unrelated to the issue at hand.
It looks like a partial backport of your commit

commit aa9114edd97b292cd89b3616e3f2089471fd2201
Author: Jan Beulich <jbeulich@suse.com>
Date:   Thu Apr 10 16:01:41 2014 +0200

    x86/EPT: force re-evaluation of memory type as necessary

is all that's necessary; I'm attaching it against 4.3.2. I only kept the
memory_type_changed() calls in the MTRR-related areas, since those are
the only ones relevant to this particular issue. This is probably good
enough for us, thanks for the pointers! Do you think this one is
relatively safe for the stable branches?
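
For the record, the mechanism this relies on is roughly the following
(a simplified sketch condensed from the attached patch, using its names;
not literal tree code):

    /*
     * 1. A guest MTRR MSR write lands in mtrr_def_type_msr_set(),
     *    mtrr_fix_range_msr_set() or mtrr_var_range_msr_set(), which
     *    now call memory_type_changed() whenever the cached MTRR state
     *    actually changes.
     *
     * 2. memory_type_changed() only acts when EPT memory types matter,
     *    i.e. with an IOMMU active and no snoop control:
     */
    void memory_type_changed(struct domain *d)
    {
        if ( iommu_enabled && !iommu_snoop && d->vcpu && d->vcpu[0] )
            p2m_memory_type_changed(d); /* -> ept_memory_type_changed() */
    }

    /*
     * 3. ept_memory_type_changed() runs ept_invalidate_emt() on the EPT
     *    root page only, stamping each present top-level entry with the
     *    invalid type MTRR_NUM_TYPES.  Such entries are misconfigured,
     *    so the next guest access to that range takes an
     *    EXIT_REASON_EPT_MISCONFIG VM exit, and ept_handle_misconfig()
     *    recomputes the EMT via epte_get_entry_emt(), pushing the
     *    invalidation one level further down each time.  The cost is
     *    thus proportional to what the guest touches after the MTRR
     *    change, instead of a full p2m revalidation on every MSR write,
     *    which is where the >4GB regression was biting.
     */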
[-- Attachment #2: ept-bp-aa9114edd97b292cd89b3616e3f2089471fd2201 --]
[-- Type: text/plain, Size: 13286 bytes --]
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3249,13 +3250,13 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
case MSR_MTRRdefType:
if ( !mtrr )
goto gp_fault;
- if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
+ if ( !mtrr_def_type_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, msr_content) )
goto gp_fault;
break;
case MSR_MTRRfix64K_00000:
if ( !mtrr )
goto gp_fault;
- if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
+ if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, 0, msr_content) )
goto gp_fault;
break;
case MSR_MTRRfix16K_80000:
@@ -3263,7 +3264,7 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
if ( !mtrr )
goto gp_fault;
index = msr - MSR_MTRRfix16K_80000 + 1;
- if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
+ if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr,
index, msr_content) )
goto gp_fault;
break;
@@ -3271,7 +3272,7 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
if ( !mtrr )
goto gp_fault;
index = msr - MSR_MTRRfix4K_C0000 + 3;
- if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
+ if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr,
index, msr_content) )
goto gp_fault;
break;
diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c
index 83ff1ff..6763322 100644
--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -403,7 +403,7 @@ uint32_t get_pat_flags(struct vcpu *v,
return pat_type_2_pte_flags(pat_entry_value);
}
-bool_t mtrr_def_type_msr_set(struct mtrr_state *m, uint64_t msr_content)
+bool_t mtrr_def_type_msr_set(struct domain *d, struct mtrr_state *m, uint64_t msr_content)
{
uint8_t def_type = msr_content & 0xff;
uint8_t enabled = (msr_content >> 10) & 0x3;
@@ -422,13 +422,17 @@ bool_t mtrr_def_type_msr_set(struct mtrr_state *m, uint64_t msr_content)
return 0;
}
- m->enabled = enabled;
- m->def_type = def_type;
+ if ( m->enabled != enabled || m->def_type != def_type )
+ {
+ m->enabled = enabled;
+ m->def_type = def_type;
+ memory_type_changed(d);
+ }
return 1;
}
-bool_t mtrr_fix_range_msr_set(struct mtrr_state *m, uint32_t row,
+bool_t mtrr_fix_range_msr_set(struct domain *d, struct mtrr_state *m, uint32_t row,
uint64_t msr_content)
{
uint64_t *fixed_range_base = (uint64_t *)m->fixed_ranges;
@@ -447,6 +451,7 @@ bool_t mtrr_fix_range_msr_set(struct mtrr_state *m, uint32_t row,
}
fixed_range_base[row] = msr_content;
+ memory_type_changed(d);
}
return 1;
@@ -488,6 +493,8 @@ bool_t mtrr_var_range_msr_set(
m->overlapped = is_var_mtrr_overlapped(m);
+ memory_type_changed(d);
+
return 1;
}
@@ -662,7 +669,7 @@ static int hvm_load_mtrr_msr(struct domain *d, hvm_domain_context_t *h)
mtrr_state->mtrr_cap = hw_mtrr.msr_mtrr_cap;
for ( i = 0; i < NUM_FIXED_MSR; i++ )
- mtrr_fix_range_msr_set(mtrr_state, i, hw_mtrr.msr_mtrr_fixed[i]);
+ mtrr_fix_range_msr_set(d, mtrr_state, i, hw_mtrr.msr_mtrr_fixed[i]);
for ( i = 0; i < MTRR_VCNT; i++ )
{
@@ -672,7 +679,7 @@ static int hvm_load_mtrr_msr(struct domain *d, hvm_domain_context_t *h)
MTRRphysMask_MSR(i), hw_mtrr.msr_mtrr_var[i*2+1]);
}
- mtrr_def_type_msr_set(mtrr_state, hw_mtrr.msr_mtrr_def_type);
+ mtrr_def_type_msr_set(d, mtrr_state, hw_mtrr.msr_mtrr_def_type);
return 0;
}
@@ -680,6 +687,12 @@ static int hvm_load_mtrr_msr(struct domain *d, hvm_domain_context_t *h)
HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr,
1, HVMSR_PER_VCPU);
+void memory_type_changed(struct domain *d)
+{
+ if ( iommu_enabled && !iommu_snoop && d->vcpu && d->vcpu[0] )
+ p2m_memory_type_changed(d);
+}
+
uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn,
uint8_t *ipat, bool_t direct_mmio)
{
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 03216b6..2e9d12d 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2907,6 +2907,14 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
break;
}
+ case EXIT_REASON_EPT_MISCONFIG:
+ {
+ paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+ if ( !ept_handle_misconfig(gpa) )
+ goto exit_and_crash;
+ break;
+ }
+
case EXIT_REASON_MONITOR_TRAP_FLAG:
v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
vmx_update_cpu_exec_control(v);
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 92d9e2d..16ba317 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -271,6 +271,125 @@ static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
return GUEST_TABLE_NORMAL_PAGE;
}
+static bool_t ept_invalidate_emt(mfn_t mfn)
+{
+ ept_entry_t *epte = map_domain_page(mfn_x(mfn));
+ unsigned int i;
+ bool_t changed = 0;
+
+ for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+ {
+ ept_entry_t e = atomic_read_ept_entry(&epte[i]);
+
+ if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
+ e.emt == MTRR_NUM_TYPES )
+ continue;
+
+ e.emt = MTRR_NUM_TYPES;
+ atomic_write_ept_entry(&epte[i], e);
+ changed = 1;
+ }
+
+ unmap_domain_page(epte);
+
+ return changed;
+}
+
+bool_t ept_handle_misconfig(uint64_t gpa)
+{
+ struct vcpu *curr = current;
+ struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+ struct ept_data *ept = &p2m->ept;
+ unsigned int level = ept_get_wl(ept);
+ unsigned long gfn = PFN_DOWN(gpa);
+ unsigned long mfn = ept_get_asr(ept);
+ ept_entry_t *epte;
+ int okay;
+
+ if ( !mfn )
+ return 0;
+
+ p2m_lock(p2m);
+
+ okay = -curr->arch.hvm_vmx.ept_spurious_misconfig;
+ for ( ; ; --level )
+ {
+ ept_entry_t e;
+ unsigned int i;
+
+ epte = map_domain_page(mfn);
+ i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
+ e = atomic_read_ept_entry(&epte[i]);
+
+ if ( level == 0 || is_epte_superpage(&e) )
+ {
+ uint8_t ipat = 0;
+
+ if ( e.emt != MTRR_NUM_TYPES )
+ break;
+
+ if ( level == 0 )
+ {
+ for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i )
+ {
+ e = atomic_read_ept_entry(&epte[i]);
+ if ( e.emt == MTRR_NUM_TYPES )
+ e.emt = 0;
+ if ( !is_epte_valid(&e) || !is_epte_present(&e) )
+ continue;
+ e.emt = epte_get_entry_emt(p2m->domain, gfn + i,
+ _mfn(e.mfn), &ipat,
+ e.sa_p2mt == p2m_mmio_direct);
+ e.ipat = ipat;
+ atomic_write_ept_entry(&epte[i], e);
+ }
+ }
+ else
+ {
+ e.emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
+ &ipat,
+ e.sa_p2mt == p2m_mmio_direct);
+ e.ipat = ipat;
+ atomic_write_ept_entry(&epte[i], e);
+ }
+
+ okay = 1;
+ break;
+ }
+
+ if ( e.emt == MTRR_NUM_TYPES )
+ {
+ ASSERT(is_epte_present(&e));
+ ept_invalidate_emt(_mfn(e.mfn));
+ smp_wmb();
+ e.emt = 0;
+ atomic_write_ept_entry(&epte[i], e);
+ unmap_domain_page(epte);
+ okay = 1;
+ }
+ else if ( is_epte_present(&e) && !e.emt )
+ unmap_domain_page(epte);
+ else
+ break;
+
+ mfn = e.mfn;
+ }
+
+ unmap_domain_page(epte);
+ if ( okay > 0 )
+ {
+ struct vcpu *v;
+
+ for_each_vcpu ( curr->domain, v )
+ v->arch.hvm_vmx.ept_spurious_misconfig = 1;
+ }
+ curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
+ ept_sync_domain(p2m);
+ p2m_unlock(p2m);
+
+ return !!okay;
+}
+
/*
* ept_set_entry() computes 'need_modify_vtd_table' for itself,
* by observing whether any gfn->mfn translations are modified.
@@ -687,6 +806,17 @@ static void ept_change_entry_type_global(struct p2m_domain *p2m,
ept_sync_domain(p2m);
}
+static void ept_memory_type_changed(struct p2m_domain *p2m)
+{
+ unsigned long mfn = ept_get_asr(&p2m->ept);
+
+ if ( !mfn )
+ return;
+
+ if ( ept_invalidate_emt(_mfn(mfn)) )
+ ept_sync_domain(p2m);
+}
+
static void __ept_sync_domain(void *info)
{
struct ept_data *ept = &((struct p2m_domain *)info)->ept;
@@ -724,6 +854,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
p2m->set_entry = ept_set_entry;
p2m->get_entry = ept_get_entry;
p2m->change_entry_type_global = ept_change_entry_type_global;
+ p2m->memory_type_changed = ept_memory_type_changed;
p2m->audit_p2m = NULL;
/* Set the memory type used when accessing EPT paging structures. */
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index f5ddd20..a3ecb36 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -200,6 +200,18 @@ void p2m_change_entry_type_global(struct domain *d,
p2m_unlock(p2m);
}
+void p2m_memory_type_changed(struct domain *d)
+{
+ struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+ if ( p2m->memory_type_changed )
+ {
+ p2m_lock(p2m);
+ p2m->memory_type_changed(p2m);
+ p2m_unlock(p2m);
+ }
+}
+
mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
unsigned int *page_order, bool_t locked)
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 4d55573..0d85347 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -124,6 +124,9 @@ struct arch_vmx_struct {
unsigned long host_cr0;
+ /* Do we need to tolerate a spurious EPT_MISCONFIG VM exit? */
+ bool_t ept_spurious_misconfig;
+
/* Is the guest in real mode? */
uint8_t vmx_realmode;
/* Are we emulating rather than VMENTERing? */
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index f4d759b..55daed9 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -455,6 +455,7 @@ int ept_p2m_init(struct p2m_domain *p2m);
void ept_p2m_uninit(struct p2m_domain *p2m);
void ept_walk_table(struct domain *d, unsigned long gfn);
+bool_t ept_handle_misconfig(uint64_t gpa);
void setup_ept_dump(void);
void update_guest_eip(void);
diff --git a/xen/include/asm-x86/mtrr.h b/xen/include/asm-x86/mtrr.h
index 6b4d632..a6f426e 100644
--- a/xen/include/asm-x86/mtrr.h
+++ b/xen/include/asm-x86/mtrr.h
@@ -78,10 +78,11 @@ extern void mtrr_bp_restore(void);
extern bool_t mtrr_var_range_msr_set(
struct domain *d, struct mtrr_state *m,
uint32_t msr, uint64_t msr_content);
-extern bool_t mtrr_fix_range_msr_set(struct mtrr_state *v,
+extern bool_t mtrr_fix_range_msr_set(struct domain *d, struct mtrr_state *v,
uint32_t row, uint64_t msr_content);
-extern bool_t mtrr_def_type_msr_set(struct mtrr_state *v, uint64_t msr_content);
+extern bool_t mtrr_def_type_msr_set(struct domain *d, struct mtrr_state *v, uint64_t msr_content);
extern bool_t pat_msr_set(uint64_t *pat, uint64_t msr);
+extern void memory_type_changed(struct domain *);
bool_t is_var_mtrr_overlapped(struct mtrr_state *m);
bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index f4e7253..facd318 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -233,6 +233,7 @@ struct p2m_domain {
void (*change_entry_type_global)(struct p2m_domain *p2m,
p2m_type_t ot,
p2m_type_t nt);
+ void (*memory_type_changed)(struct p2m_domain *p2m);
void (*write_p2m_entry)(struct p2m_domain *p2m,
unsigned long gfn, l1_pgentry_t *p,
@@ -506,6 +507,9 @@ void p2m_change_type_range(struct domain *d,
p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
p2m_type_t ot, p2m_type_t nt);
+/* Report a change affecting memory types. */
+void p2m_memory_type_changed(struct domain *d);
+
/* Set mmio addresses in the p2m table (for pass-through) */
int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);