From: Andrew Cooper
Subject: Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
Date: Fri, 13 Dec 2013 15:09:02 +0000
Message-ID: <52AB230E.90607@citrix.com>
In-Reply-To: <52AB20DB020000780010D060@nat28.tlf.novell.com>
References: <52A744B7020000780010BEF1@nat28.tlf.novell.com>
 <52A74539020000780010BF1F@nat28.tlf.novell.com>
 <52A8B1A5.9040302@citrix.com>
 <52AB20DB020000780010D060@nat28.tlf.novell.com>
To: Jan Beulich, xen-devel
Cc: George Dunlap, Tim Deegan, Keir Fraser, xiantao.zhang@intel.com
List-Id: xen-devel@lists.xenproject.org

On 13/12/2013 13:59, Jan Beulich wrote:
Since this can take an arbitrary amount of time, the rooting domctl as
well as all involved code must become aware of this requiring a
continuation.

The subject domain's relmem_list is being (ab)used for this, in a way
similar to and compatible with broken page offlining.

Further, operations get slightly re-ordered in assign_device(): IOMMU
page tables now get set up _before_ the first device gets assigned, at
once closing a small timing window in which the guest may already see
the device but wouldn't be able to access it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Extend comment on struct domain's need_iommu field.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1924,6 +1924,12 @@ int domain_relinquish_resources(struct d
         }
 
         d->arch.relmem = RELMEM_xen;
+
+        spin_lock(&d->page_alloc_lock);
+        page_list_splice(&d->arch.relmem_list, &d->page_list);
+        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
+        spin_unlock(&d->page_alloc_lock);
+
         /* Fallthrough. Relinquish every page of memory. */
     case RELMEM_xen:
         ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag
 
 pod_hit:
     lock_page_alloc(p2m);
-    page_list_add_tail(p, &d->arch.relmem_list);
+    /* Insertion must be at list head (see iommu_populate_page_table()). */
+    page_list_add(p, &d->arch.relmem_list);
     unlock_page_alloc(p2m);
     pod_unlock(p2m);
     return 1;
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -18,6 +18,7 @@
 #include <asm/hvm/iommu.h>
 #include <xen/paging.h>
 #include <xen/guest_access.h>
+#include <xen/event.h>
 #include <xen/softirq.h>
 #include <xen/keyhandler.h>
 #include <xsm/xsm.h>
@@ -265,7 +266,23 @@ static int assign_device(struct domain *
              d->mem_event->paging.ring_page)) )
         return -EXDEV;
 
-    spin_lock(&pcidevs_lock);
+    if ( !spin_trylock(&pcidevs_lock) )
+        return -ERESTART;
+
+    if ( need_iommu(d) <= 0 )
+    {
+        if ( !iommu_use_hap_pt(d) )
+        {
+            rc = iommu_populate_page_table(d);
+            if ( rc )
+            {
+                spin_unlock(&pcidevs_lock);
+                return rc;
+            }
+        }
+        d->need_iommu = 1;
+    }
+
     pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn);
     if ( !pdev )
     {
@@ -290,15 +307,14 @@ static int assign_device(struct domain *
                    rc);
     }
 
-    if ( has_arch_pdevs(d) && !need_iommu(d) )
+ done:
+    if ( !has_arch_pdevs(d) && need_iommu(d) )

We now have a case where, for the first device, we could set up pagetables for a large domain, get an error with assignment, then tear them all back down.  (-EBUSY from pci_get_pdev() looks like a good non-fatal candidate for causing this behaviour)

I am wondering whether this is better or worse than the race condition where a guest couldn't use the device.  A guest could not reasonably expect to use a device before the toolstack is done setting it up.  A buggy toolstack could quite easily tie up a lot of Xen time creating and destroying complete iommu pagetable sets.

~Andrew
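
To make the concern concrete, below is a minimal stand-alone model of the reordered assign_device() flow; every identifier in it is a stand-in invented for illustration, not the real Xen code.  The expensive population step now precedes the device lookup, so any non-fatal failure after it, repeated by a retrying toolstack, becomes a full build-and-teardown cycle each time.

/* Stand-alone illustration only; all identifiers are stand-ins. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool need_iommu;      /* models d->need_iommu                        */
static bool has_arch_pdevs;  /* models has_arch_pdevs(d): no devices yet    */

static int populate_page_tables(void)   /* O(domain size) for a large dom   */
{
    printf("  populate IOMMU page tables (expensive)\n");
    return 0;
}

static void teardown_page_tables(void)  /* also O(domain size)              */
{
    printf("  tear down IOMMU page tables (expensive)\n");
}

static int lookup_pdev(void)            /* models pci_get_pdev_by_domain()  */
{
    return -EBUSY;                      /* non-fatal; toolstack may retry   */
}

static int assign_device(void)
{
    int rc;

    if ( !need_iommu )                  /* page tables built *before* ...   */
    {
        rc = populate_page_tables();
        if ( rc )
            return rc;
        need_iommu = true;
    }

    rc = lookup_pdev();                 /* ... the first device lookup,     */
                                        /* so a failure here ...            */
    if ( !has_arch_pdevs && need_iommu )
    {
        need_iommu = false;             /* ... undoes all the work above.   */
        teardown_page_tables();
    }
    return rc;
}

int main(void)
{
    for ( int i = 0; i < 3; i++ )       /* a retrying toolstack             */
        printf("assign_device() = %d\n", assign_device());
    return 0;
}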

     {
-        d->need_iommu = 1;
-        if ( !iommu_use_hap_pt(d) )
-            rc = iommu_populate_page_table(d);
-        goto done;
+        d->need_iommu = 0;
+        hd->platform_ops->teardown(d);
     }
-done:
     spin_unlock(&pcidevs_lock);
+
     return rc;
 }
 
@@ -306,12 +322,17 @@ static int iommu_populate_page_table(str
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
     struct page_info *page;
-    int rc = 0;
+    int rc = 0, n = 0;
+
+    d->need_iommu = -1;
 
     this_cpu(iommu_dont_flush_iotlb) = 1;
     spin_lock(&d->page_alloc_lock);
 
-    page_list_for_each ( page, &d->page_list )
+    if ( unlikely(d->is_dying) )
+        rc = -ESRCH;
+
+    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
     {
         if ( is_hvm_domain(d) ||
             (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
@@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
                 d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
                 IOMMUF_readable|IOMMUF_writable);
             if ( rc )
+            {
+                page_list_add(page, &d->page_list);
                 break;
+            }
+        }
+        page_list_add_tail(page, &d->arch.relmem_list);
+        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
+             hypercall_preempt_check() )
+            rc = -ERESTART;
+    }
+
+    if ( !rc )
+    {
+        /*
+         * The expectation here is that generally there are many normal pages
+         * on relmem_list (the ones we put there) and only few being in an
+         * offline/broken state. The latter ones are always at the head of the
+         * list. Hence we first move the whole list, and then move back the
+         * first few entries.
+         */
+        page_list_move(&d->page_list, &d->arch.relmem_list);
+        while ( (page = page_list_first(&d->page_list)) != NULL &&
+                (page->count_info & (PGC_state|PGC_broken)) )
+        {
+            page_list_del(page, &d->page_list);
+            page_list_add_tail(page, &d->arch.relmem_list);
         }
     }
 
@@ -330,8 +376,11 @@ static int iommu_populate_page_table(str
 
     if ( !rc )
         iommu_iotlb_flush_all(d);
-    else
+    else if ( rc != -ERESTART )
+    {
+        d->need_iommu = 0;
         hd->platform_ops->teardown(d);
+    }
 
     return rc;
 }
@@ -688,7 +737,10 @@ int iommu_do_domctl(
 
         ret = device_assigned(seg, bus, devfn) ?:
               assign_device(d, seg, bus, devfn);
-        if ( ret )
+        if ( ret == -ERESTART )
+            ret = hypercall_create_continuation(__HYPERVISOR_domctl,
+                                                "h", u_domctl);
+        else if ( ret )
             printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
                    "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
                    seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -322,8 +322,8 @@ struct domain
     enum guest_type guest_type;
 
 #ifdef HAS_PASSTHROUGH
-    /* Does this guest need iommu mappings? */
-    bool_t           need_iommu;
+    /* Does this guest need iommu mappings (-1 meaning "being set up")? */
+    s8               need_iommu;
 #endif
     /* is node-affinity automatically computed? */
     bool_t           auto_node_affinity;
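
For readers less familiar with the preemption idiom the hunks above implement, here is a minimal stand-alone model of the pattern; the names and values below are stand-ins, not the real hypervisor code.  The worker consumes the page list in bounded chunks and returns -ERESTART when a preemption check fires, and the domctl layer re-invokes it as a continuation until it completes, with progress preserved because already-mapped pages have been moved off the input list.

/* Stand-alone illustration only; identifiers and values are stand-ins. */
#include <stdio.h>

#define ERESTART    85      /* local stand-in; the value is irrelevant    */
#define TOTAL_PAGES 1000
#define CHUNK       256     /* models the "!(++n & 0xff)" preempt points  */

static int pages_left = TOTAL_PAGES;  /* models d->page_list              */
static int pages_mapped;              /* models d->arch.relmem_list       */

static int populate_page_table(void)
{
    int n = 0;

    while ( pages_left )
    {
        pages_left--;                 /* page_list_remove_head()           */
        pages_mapped++;               /* map page; park it on relmem_list  */
        if ( !(++n % CHUNK) && pages_left )
            return -ERESTART;         /* hypercall_preempt_check() fired   */
    }
    return 0;
}

int main(void)
{
    int rc, calls = 0;

    do {                              /* models the domctl continuation    */
        rc = populate_page_table();
        calls++;
    } while ( rc == -ERESTART );

    printf("mapped %d pages in %d invocations, rc=%d\n",
           pages_mapped, calls, rc);
    return 0;
}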




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
