All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Jan Beulich <JBeulich@suse.com>,
	xen-devel <xen-devel@lists.xenproject.org>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>,
	Keir Fraser <keir@xen.org>, Tim Deegan <tim@xen.org>,
	xiantao.zhang@intel.com
Subject: Re: [PATCH 1/5] IOMMU: make page table population preemptible
Date: Wed, 11 Dec 2013 18:40:37 +0000	[thread overview]
Message-ID: <52A8B1A5.9040302@citrix.com> (raw)
In-Reply-To: <52A74539020000780010BF1F@nat28.tlf.novell.com>

On 10/12/2013 15:45, Jan Beulich wrote:
> Since this can take an arbitrary amount of time, the rooting domctl as
> well as all involved code must become aware of this requiring a
> continuation.
>
> The subject domain's rel_mem_list is being (ab)used for this, in a way
> similar to and compatible with broken page offlining.
>
> Further, operations get slightly re-ordered in assign_device(): IOMMU
> page tables now get set up _before_ the first device gets assigned, at
> once closing a small timing window in which the guest may already see
> the device but wouldn't be able to access it.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1923,6 +1923,12 @@ int domain_relinquish_resources(struct d
>          }
>  
>          d->arch.relmem = RELMEM_xen;
> +
> +        spin_lock(&d->page_alloc_lock);
> +        page_list_splice(&d->arch.relmem_list, &d->page_list);
> +        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
> +        spin_unlock(&d->page_alloc_lock);
> +
>          /* Fallthrough. Relinquish every page of memory. */
>      case RELMEM_xen:
>          ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag
>  
>  pod_hit:
>      lock_page_alloc(p2m);
> -    page_list_add_tail(p, &d->arch.relmem_list);
> +    /* Insertion must be at list head (see iommu_populate_page_table()). */
> +    page_list_add(p, &d->arch.relmem_list);
>      unlock_page_alloc(p2m);
>      pod_unlock(p2m);
>      return 1;
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -18,6 +18,7 @@
>  #include <asm/hvm/iommu.h>
>  #include <xen/paging.h>
>  #include <xen/guest_access.h>
> +#include <xen/event.h>
>  #include <xen/softirq.h>
>  #include <xen/keyhandler.h>
>  #include <xsm/xsm.h>
> @@ -265,7 +266,23 @@ static int assign_device(struct domain *
>               d->mem_event->paging.ring_page)) )
>          return -EXDEV;
>  
> -    spin_lock(&pcidevs_lock);
> +    if ( !spin_trylock(&pcidevs_lock) )
> +        return -ERESTART;
> +
> +    if ( need_iommu(d) <= 0 )
> +    {
> +        if ( !iommu_use_hap_pt(d) )
> +        {
> +            rc = iommu_populate_page_table(d);
> +            if ( rc )
> +            {
> +                spin_unlock(&pcidevs_lock);
> +                return rc;
> +            }
> +        }
> +        d->need_iommu = 1;
> +    }
> +
>      pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn);
>      if ( !pdev )
>      {
> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>                     rc);
>      }
>  
> -    if ( has_arch_pdevs(d) && !need_iommu(d) )
> + done:
> +    if ( !has_arch_pdevs(d) && need_iommu(d) )
>      {
> -        d->need_iommu = 1;
> -        if ( !iommu_use_hap_pt(d) )
> -            rc = iommu_populate_page_table(d);
> -        goto done;
> +        d->need_iommu = 0;
> +        hd->platform_ops->teardown(d);
>      }
> -done:
>      spin_unlock(&pcidevs_lock);
> +
>      return rc;
>  }
>  
> @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str
>  {
>      struct hvm_iommu *hd = domain_hvm_iommu(d);
>      struct page_info *page;
> -    int rc = 0;
> +    int rc = 0, n = 0;
> +
> +    d->need_iommu = -1;
>  
>      this_cpu(iommu_dont_flush_iotlb) = 1;
>      spin_lock(&d->page_alloc_lock);
>  
> -    page_list_for_each ( page, &d->page_list )
> +    if ( unlikely(d->is_dying) )
> +        rc = -ESRCH;
> +
> +    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
>      {
>          if ( is_hvm_domain(d) ||
>              (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
>                  d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
>                  IOMMUF_readable|IOMMUF_writable);
>              if ( rc )
> +            {
> +                page_list_add(page, &d->page_list);
>                  break;
> +            }
> +        }
> +        page_list_add_tail(page, &d->arch.relmem_list);
> +        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&

Why the forced restart here?  If nothing needs pre-empting, surely it is
better to continue?

Or is this about equality on the pcidevs_lock ?

> +             hypercall_preempt_check() )
> +            rc = -ERESTART;
> +    }
> +
> +    if ( !rc )
> +    {
> +        /*
> +         * The expectation here is that generally there are many normal pages
> +         * on relmem_list (the ones we put there) and only few being in an
> +         * offline/broken state. The latter ones are always at the head of the
> +         * list. Hence we first move the whole list, and then move back the
> +         * first few entries.
> +         */
> +        page_list_move(&d->page_list, &d->arch.relmem_list);
> +        while ( (page = page_list_first(&d->page_list)) != NULL &&
> +                (page->count_info & (PGC_state|PGC_broken)) )
> +        {
> +            page_list_del(page, &d->page_list);
> +            page_list_add_tail(page, &d->arch.relmem_list);
>          }
>      }
>  
> @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str
>  
>      if ( !rc )
>          iommu_iotlb_flush_all(d);
> -    else
> +    else if ( rc != -ERESTART )
> +    {
>          hd->platform_ops->teardown(d);
> +        d->need_iommu = 0;
> +    }
>  
>      return rc;
>  }
> @@ -688,7 +737,10 @@ int iommu_do_domctl(
>  
>          ret = device_assigned(seg, bus, devfn) ?:
>                assign_device(d, seg, bus, devfn);
> -        if ( ret )
> +        if ( ret == -ERESTART )
> +            ret = hypercall_create_continuation(__HYPERVISOR_domctl,
> +                                                "h", u_domctl);
> +        else if ( ret )
>              printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
>                     "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
>                     seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -323,7 +323,7 @@ struct domain
>  
>  #ifdef HAS_PASSTHROUGH
>      /* Does this guest need iommu mappings? */
> -    bool_t           need_iommu;
> +    s8               need_iommu;

I think this change from bool_t to s8 needs a comment explaining that -1
indicates "the iommu mappings are pending creation"

Is there any particular reason that -ERESTART is used when -EAGAIN is
the prevailing style for hypercall continuations?

~Andrew

  reply	other threads:[~2013-12-11 18:40 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-12-10 15:43 [PATCH 0/5] XSA-77 follow-ups Jan Beulich
2013-12-10 15:45 ` [PATCH 1/5] IOMMU: make page table population preemptible Jan Beulich
2013-12-11 18:40   ` Andrew Cooper [this message]
2013-12-13  9:51     ` Jan Beulich
2013-12-13 12:18       ` Andrew Cooper
2013-12-13 12:34         ` Jan Beulich
2013-12-13 13:57           ` Andrew Cooper
2013-12-13 13:59     ` [PATCH v2 " Jan Beulich
2013-12-13 14:16       ` Tim Deegan
2013-12-13 15:09       ` Andrew Cooper
2013-12-13 15:41         ` Jan Beulich
2013-12-13 15:44           ` Andrew Cooper
2013-12-30 13:43             ` Zhang, Xiantao
2014-01-07 13:23               ` Jan Beulich
2013-12-20 13:06   ` [PATCH " Jan Beulich
2013-12-10 15:46 ` [PATCH 2/5] IOMMU: make page table deallocation preemptible Jan Beulich
2013-12-11 19:01   ` Andrew Cooper
2013-12-13  9:55     ` Jan Beulich
2013-12-13 14:00     ` [PATCH v2 " Jan Beulich
2013-12-13 15:26       ` Andrew Cooper
2014-01-07 14:51       ` Zhang, Xiantao
2013-12-10 15:47 ` [PATCH 3/5] x86/p2m: restrict auditing to debug builds Jan Beulich
2013-12-10 15:58   ` Andrew Cooper
2013-12-10 17:31   ` George Dunlap
2013-12-13 13:46   ` Tim Deegan
2013-12-10 15:48 ` [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one() Jan Beulich
2013-12-10 16:01   ` Andrew Cooper
2013-12-10 17:32   ` George Dunlap
2013-12-17  9:16   ` Ping: " Jan Beulich
2013-12-17 10:45     ` Andrew Cooper
2013-12-17 11:02       ` Jan Beulich
2013-12-10 15:48 ` [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest() Jan Beulich
2013-12-10 17:23   ` Andrew Cooper
2013-12-10 17:33   ` George Dunlap
2013-12-10 18:17     ` Keir Fraser

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=52A8B1A5.9040302@citrix.com \
    --to=andrew.cooper3@citrix.com \
    --cc=George.Dunlap@eu.citrix.com \
    --cc=JBeulich@suse.com \
    --cc=keir@xen.org \
    --cc=tim@xen.org \
    --cc=xen-devel@lists.xenproject.org \
    --cc=xiantao.zhang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.