From mboxrd@z Thu Jan  1 00:00:00 1970
From: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: Re: [PATCH] x86: don't drop guest visible state updates
 when 64-bit PV guest is in user mode
Date: Tue, 21 Jan 2014 16:55:57 +0000
Message-ID: <52DEA69D.4010102@citrix.com>
References: <52DEA16B02000078001156E8@nat28.tlf.novell.com>
Mime-Version: 1.0
Content-Type: multipart/mixed; boundary="===============2147995466282644967=="
Return-path: <xen-devel-bounces@lists.xen.org>
Received: from mail6.bemta4.messagelabs.com ([85.158.143.247])
	by lists.xen.org with esmtp (Exim 4.72)
	(envelope-from <Andrew.Cooper3@citrix.com>) id 1W5ecN-0006Ee-K7
	for xen-devel@lists.xenproject.org; Tue, 21 Jan 2014 16:56:03 +0000
In-Reply-To: <52DEA16B02000078001156E8@nat28.tlf.novell.com>
List-Unsubscribe: <http://lists.xen.org/cgi-bin/mailman/options/xen-devel>,
	<mailto:xen-devel-request@lists.xen.org?subject=unsubscribe>
List-Post: <mailto:xen-devel@lists.xen.org>
List-Help: <mailto:xen-devel-request@lists.xen.org?subject=help>
List-Subscribe: <http://lists.xen.org/cgi-bin/mailman/listinfo/xen-devel>,
	<mailto:xen-devel-request@lists.xen.org?subject=subscribe>
Sender: xen-devel-bounces@lists.xen.org
Errors-To: xen-devel-bounces@lists.xen.org
To: Jan Beulich <JBeulich@suse.com>, xen-devel <xen-devel@lists.xenproject.org>
Cc: Keir Fraser <keir@xen.org>
List-Id: xen-devel@lists.xenproject.org

--===============2147995466282644967==
Content-Type: multipart/alternative;
	boundary="------------000303050600060402060603"

--------------000303050600060402060603
Content-Type: text/plain; charset="ISO-8859-1"
Content-Transfer-Encoding: 7bit

On 21/01/2014 15:33, Jan Beulich wrote:
> Since 64-bit PV uses separate kernel and user mode page tables, kernel
> addresses (as usually provided via VCPUOP_register_runstate_memory_area
> and possibly via VCPUOP_register_vcpu_time_memory_area) aren't
> necessarily accessible when the respective updating occurs. Add logic
> for toggle_guest_mode() to take care of this (if necessary) the next
> time the vCPU switches to kernel mode.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1323,10 +1323,10 @@ static void paravirt_ctxt_switch_to(stru
>  }
>  
>  /* Update per-VCPU guest runstate shared memory area (if registered). */
> -static void update_runstate_area(struct vcpu *v)
> +bool_t update_runstate_area(const struct vcpu *v)

Can you adjust the comment to indicate what the return value means.  The
logic is quite opaque, but I believe it is "true if the runstate has
been suitably updated".

>  {
>      if ( guest_handle_is_null(runstate_guest(v)) )
> -        return;
> +        return 1;
>  
>      if ( has_32bit_shinfo(v->domain) )
>      {
> @@ -1334,10 +1334,18 @@ static void update_runstate_area(struct 
>  
>          XLAT_vcpu_runstate_info(&info, &v->runstate);
>          __copy_to_guest(v->runstate_guest.compat, &info, 1);
> -        return;
> +        return 1;
>      }
>  
> -    __copy_to_guest(runstate_guest(v), &v->runstate, 1);
> +    return __copy_to_guest(runstate_guest(v), &v->runstate, 1) !=
> +           sizeof(v->runstate);
> +}
> +
> +static void _update_runstate_area(struct vcpu *v)
> +{
> +    if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
> +         !(v->arch.flags & TF_kernel_mode) )
> +        v->arch.pv_vcpu.need_update_runstate_area = 1;
>  }
>  
>  static inline int need_full_gdt(struct vcpu *v)
> @@ -1443,8 +1451,8 @@ void context_switch(struct vcpu *prev, s
>          flush_tlb_mask(&dirty_mask);
>      }
>  
> -    if (prev != next)
> -        update_runstate_area(prev);
> +    if ( prev != next )
> +        _update_runstate_area(prev);
>  
>      if ( is_hvm_vcpu(prev) )
>      {
> @@ -1497,8 +1505,8 @@ void context_switch(struct vcpu *prev, s
>  
>      context_saved(prev);
>  
> -    if (prev != next)
> -        update_runstate_area(next);
> +    if ( prev != next )
> +        _update_runstate_area(next);
>  
>      /* Ensure that the vcpu has an up-to-date time base. */
>      update_vcpu_system_time(next);
> --- a/xen/arch/x86/time.c
> +++ b/xen/arch/x86/time.c
> @@ -740,7 +740,6 @@ static void __update_vcpu_system_time(st
>  {
>      struct cpu_time       *t;
>      struct vcpu_time_info *u, _u;
> -    XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
>      struct domain *d = v->domain;
>      s_time_t tsc_stamp = 0;
>  
> @@ -805,19 +804,31 @@ static void __update_vcpu_system_time(st
>      /* 3. Update guest kernel version. */
>      u->version = version_update_end(u->version);
>  
> -    user_u = v->arch.time_info_guest;
> -    if ( !guest_handle_is_null(user_u) )
> -    {
> -        /* 1. Update userspace version. */
> -        __copy_field_to_guest(user_u, &_u, version);
> -        wmb();
> -        /* 2. Update all other userspavce fields. */
> -        __copy_to_guest(user_u, &_u, 1);
> -        wmb();
> -        /* 3. Update userspace version. */
> -        _u.version = version_update_end(_u.version);
> -        __copy_field_to_guest(user_u, &_u, version);
> -    }
> +    if ( !update_secondary_system_time(v, &_u) && is_pv_domain(d) &&
> +         !is_pv_32bit_domain(d) && !(v->arch.flags & TF_kernel_mode) )
> +        v->arch.pv_vcpu.pending_system_time = _u;
> +}
> +
> +bool_t update_secondary_system_time(const struct vcpu *v,
> +                                    struct vcpu_time_info *u)
> +{
> +    XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest;
> +
> +    if ( guest_handle_is_null(user_u) )
> +        return 1;
> +
> +    /* 1. Update userspace version. */
> +    if ( __copy_field_to_guest(user_u, u, version) == sizeof(u->version) )
> +        return 0;
> +    wmb();
> +    /* 2. Update all other userspace fields. */
> +    __copy_to_guest(user_u, u, 1);
> +    wmb();
> +    /* 3. Update userspace version. */
> +    u->version = version_update_end(u->version);
> +    __copy_field_to_guest(user_u, u, version);
> +
> +    return 1;
>  }
>  
>  void update_vcpu_system_time(struct vcpu *v)
> --- a/xen/arch/x86/x86_64/traps.c
> +++ b/xen/arch/x86/x86_64/traps.c
> @@ -266,6 +266,18 @@ void toggle_guest_mode(struct vcpu *v)
>  #else
>      write_ptbase(v);
>  #endif
> +
> +    if ( !(v->arch.flags & TF_kernel_mode) )
> +        return;
> +
> +    if ( v->arch.pv_vcpu.need_update_runstate_area &&
> +         update_runstate_area(v) )
> +        v->arch.pv_vcpu.need_update_runstate_area = 0;
> +
> +    if ( v->arch.pv_vcpu.pending_system_time.version &&
> +         update_secondary_system_time(v,
> +                                      &v->arch.pv_vcpu.pending_system_time) )
> +        v->arch.pv_vcpu.pending_system_time.version = 0;

What would happen now if a guest kernel loads a faulting address for its
runstate info (or edits its pagetables so a previously good address now
faults)?  It appears as if we could retry writing to the same bad
address each time we try to toggle into kernel mode.

Given that we know we are running on the correct set of pagetables, I
think both of these pending variable can be unconditionally cleared,
whether or not update{runstate_area,secondary_system_time} succeed or fail.

~Andrew

>  }
>  
>  unsigned long do_iret(void)
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -373,6 +373,10 @@ struct pv_vcpu
>      /* Current LDT details. */
>      unsigned long shadow_ldt_mapcnt;
>      spinlock_t shadow_ldt_lock;
> +
> +    /* Deferred VA-based update state. */
> +    bool_t need_update_runstate_area;
> +    struct vcpu_time_info pending_system_time;
>  };
>  
>  struct arch_vcpu
> @@ -445,6 +449,10 @@ struct arch_vcpu
>  #define hvm_vmx         hvm_vcpu.u.vmx
>  #define hvm_svm         hvm_vcpu.u.svm
>  
> +bool_t update_runstate_area(const struct vcpu *);
> +bool_t update_secondary_system_time(const struct vcpu *,
> +                                    struct vcpu_time_info *);
> +
>  void vcpu_show_execution_state(struct vcpu *);
>  void vcpu_show_registers(const struct vcpu *);
>  
>
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


--------------000303050600060402060603
Content-Type: text/html; charset="ISO-8859-1"
Content-Transfer-Encoding: 7bit

<html>
  <head>
    <meta content="text/html; charset=ISO-8859-1"
      http-equiv="Content-Type">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <div class="moz-cite-prefix">On 21/01/2014 15:33, Jan Beulich wrote:<br>
    </div>
    <blockquote cite="mid:52DEA16B02000078001156E8@nat28.tlf.novell.com"
      type="cite">
      <pre wrap="">Since 64-bit PV uses separate kernel and user mode page tables, kernel
addresses (as usually provided via VCPUOP_register_runstate_memory_area
and possibly via VCPUOP_register_vcpu_time_memory_area) aren't
necessarily accessible when the respective updating occurs. Add logic
for toggle_guest_mode() to take care of this (if necessary) the next
time the vCPU switches to kernel mode.

Signed-off-by: Jan Beulich <a class="moz-txt-link-rfc2396E" href="mailto:jbeulich@suse.com">&lt;jbeulich@suse.com&gt;</a>

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1323,10 +1323,10 @@ static void paravirt_ctxt_switch_to(stru
 }
 
 /* Update per-VCPU guest runstate shared memory area (if registered). */
-static void update_runstate_area(struct vcpu *v)
+bool_t update_runstate_area(const struct vcpu *v)</pre>
    </blockquote>
    <br>
    Can you adjust the comment to indicate what the return value means.&nbsp;
    The logic is quite opaque, but I believe it is "true if the runstate
    has been suitably updated".<br>
    <br>
    <blockquote cite="mid:52DEA16B02000078001156E8@nat28.tlf.novell.com"
      type="cite">
      <pre wrap="">
 {
     if ( guest_handle_is_null(runstate_guest(v)) )
-        return;
+        return 1;
 
     if ( has_32bit_shinfo(v-&gt;domain) )
     {
@@ -1334,10 +1334,18 @@ static void update_runstate_area(struct 
 
         XLAT_vcpu_runstate_info(&amp;info, &amp;v-&gt;runstate);
         __copy_to_guest(v-&gt;runstate_guest.compat, &amp;info, 1);
-        return;
+        return 1;
     }
 
-    __copy_to_guest(runstate_guest(v), &amp;v-&gt;runstate, 1);
+    return __copy_to_guest(runstate_guest(v), &amp;v-&gt;runstate, 1) !=
+           sizeof(v-&gt;runstate);
+}
+
+static void _update_runstate_area(struct vcpu *v)
+{
+    if ( !update_runstate_area(v) &amp;&amp; is_pv_vcpu(v) &amp;&amp;
+         !(v-&gt;arch.flags &amp; TF_kernel_mode) )
+        v-&gt;arch.pv_vcpu.need_update_runstate_area = 1;
 }
 
 static inline int need_full_gdt(struct vcpu *v)
@@ -1443,8 +1451,8 @@ void context_switch(struct vcpu *prev, s
         flush_tlb_mask(&amp;dirty_mask);
     }
 
-    if (prev != next)
-        update_runstate_area(prev);
+    if ( prev != next )
+        _update_runstate_area(prev);
 
     if ( is_hvm_vcpu(prev) )
     {
@@ -1497,8 +1505,8 @@ void context_switch(struct vcpu *prev, s
 
     context_saved(prev);
 
-    if (prev != next)
-        update_runstate_area(next);
+    if ( prev != next )
+        _update_runstate_area(next);
 
     /* Ensure that the vcpu has an up-to-date time base. */
     update_vcpu_system_time(next);
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -740,7 +740,6 @@ static void __update_vcpu_system_time(st
 {
     struct cpu_time       *t;
     struct vcpu_time_info *u, _u;
-    XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
     struct domain *d = v-&gt;domain;
     s_time_t tsc_stamp = 0;
 
@@ -805,19 +804,31 @@ static void __update_vcpu_system_time(st
     /* 3. Update guest kernel version. */
     u-&gt;version = version_update_end(u-&gt;version);
 
-    user_u = v-&gt;arch.time_info_guest;
-    if ( !guest_handle_is_null(user_u) )
-    {
-        /* 1. Update userspace version. */
-        __copy_field_to_guest(user_u, &amp;_u, version);
-        wmb();
-        /* 2. Update all other userspavce fields. */
-        __copy_to_guest(user_u, &amp;_u, 1);
-        wmb();
-        /* 3. Update userspace version. */
-        _u.version = version_update_end(_u.version);
-        __copy_field_to_guest(user_u, &amp;_u, version);
-    }
+    if ( !update_secondary_system_time(v, &amp;_u) &amp;&amp; is_pv_domain(d) &amp;&amp;
+         !is_pv_32bit_domain(d) &amp;&amp; !(v-&gt;arch.flags &amp; TF_kernel_mode) )
+        v-&gt;arch.pv_vcpu.pending_system_time = _u;
+}
+
+bool_t update_secondary_system_time(const struct vcpu *v,
+                                    struct vcpu_time_info *u)
+{
+    XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v-&gt;arch.time_info_guest;
+
+    if ( guest_handle_is_null(user_u) )
+        return 1;
+
+    /* 1. Update userspace version. */
+    if ( __copy_field_to_guest(user_u, u, version) == sizeof(u-&gt;version) )
+        return 0;
+    wmb();
+    /* 2. Update all other userspace fields. */
+    __copy_to_guest(user_u, u, 1);
+    wmb();
+    /* 3. Update userspace version. */
+    u-&gt;version = version_update_end(u-&gt;version);
+    __copy_field_to_guest(user_u, u, version);
+
+    return 1;
 }
 
 void update_vcpu_system_time(struct vcpu *v)
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -266,6 +266,18 @@ void toggle_guest_mode(struct vcpu *v)
 #else
     write_ptbase(v);
 #endif
+
+    if ( !(v-&gt;arch.flags &amp; TF_kernel_mode) )
+        return;
+
+    if ( v-&gt;arch.pv_vcpu.need_update_runstate_area &amp;&amp;
+         update_runstate_area(v) )
+        v-&gt;arch.pv_vcpu.need_update_runstate_area = 0;
+
+    if ( v-&gt;arch.pv_vcpu.pending_system_time.version &amp;&amp;
+         update_secondary_system_time(v,
+                                      &amp;v-&gt;arch.pv_vcpu.pending_system_time) )
+        v-&gt;arch.pv_vcpu.pending_system_time.version = 0;</pre>
    </blockquote>
    <br>
    What would happen now if a guest kernel loads a faulting address for
    its runstate info (or edits its pagetables so a previously good
    address now faults)?&nbsp; It appears as if we could retry writing to the
    same bad address each time we try to toggle into kernel mode.<br>
    <br>
    Given that we know we are running on the correct set of pagetables,
    I think both of these pending variable can be unconditionally
    cleared, whether or not update{runstate_area,secondary_system_time}
    succeed or fail.<br>
    <br>
    ~Andrew<br>
    <br>
    <blockquote cite="mid:52DEA16B02000078001156E8@nat28.tlf.novell.com"
      type="cite">
      <pre wrap="">
 }
 
 unsigned long do_iret(void)
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -373,6 +373,10 @@ struct pv_vcpu
     /* Current LDT details. */
     unsigned long shadow_ldt_mapcnt;
     spinlock_t shadow_ldt_lock;
+
+    /* Deferred VA-based update state. */
+    bool_t need_update_runstate_area;
+    struct vcpu_time_info pending_system_time;
 };
 
 struct arch_vcpu
@@ -445,6 +449,10 @@ struct arch_vcpu
 #define hvm_vmx         hvm_vcpu.u.vmx
 #define hvm_svm         hvm_vcpu.u.svm
 
+bool_t update_runstate_area(const struct vcpu *);
+bool_t update_secondary_system_time(const struct vcpu *,
+                                    struct vcpu_time_info *);
+
 void vcpu_show_execution_state(struct vcpu *);
 void vcpu_show_registers(const struct vcpu *);
 

</pre>
      <br>
      <fieldset class="mimeAttachmentHeader"></fieldset>
      <br>
      <pre wrap="">_______________________________________________
Xen-devel mailing list
<a class="moz-txt-link-abbreviated" href="mailto:Xen-devel@lists.xen.org">Xen-devel@lists.xen.org</a>
<a class="moz-txt-link-freetext" href="http://lists.xen.org/xen-devel">http://lists.xen.org/xen-devel</a>
</pre>
    </blockquote>
    <br>
  </body>
</html>

--------------000303050600060402060603--


--===============2147995466282644967==
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

--===============2147995466282644967==--