xen-devel.lists.xenproject.org archive mirror
* [PATCH 0/5] XSA-77 follow-ups
@ 2013-12-10 15:43 Jan Beulich
  2013-12-10 15:45 ` [PATCH 1/5] IOMMU: make page table population preemptible Jan Beulich
                   ` (4 more replies)
  0 siblings, 5 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-10 15:43 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Keir Fraser

1: IOMMU: make page table population preemptible
2: IOMMU: make page table deallocation preemptible
3: x86/p2m: restrict auditing to debug builds
4: HVM: prevent leaking heap data from hvm_save_one()
5: x86/PV: don't commit debug register values early in arch_set_info_guest()

Signed-off-by: Jan Beulich <jbeulich@suse.com>


* [PATCH 1/5] IOMMU: make page table population preemptible
  2013-12-10 15:43 [PATCH 0/5] XSA-77 follow-ups Jan Beulich
@ 2013-12-10 15:45 ` Jan Beulich
  2013-12-11 18:40   ` Andrew Cooper
  2013-12-20 13:06   ` [PATCH " Jan Beulich
  2013-12-10 15:46 ` [PATCH 2/5] IOMMU: make page table deallocation preemptible Jan Beulich
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-10 15:45 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Tim Deegan, Keir Fraser, xiantao.zhang

Since this can take an arbitrary amount of time, the initiating domctl as
well as all involved code must become aware that the operation may require
a continuation.

The subject domain's relmem_list is being (ab)used for this, in a way
similar to and compatible with broken page offlining.

Further, operations get slightly re-ordered in assign_device(): IOMMU
page tables now get set up _before_ the first device gets assigned,
thereby also closing a small timing window in which the guest could
already see the device but would not yet be able to access it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
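
A condensed sketch of the preemption flow this patch introduces may help
when reading the diff; all names below are taken from the patch itself,
with locking, the dying-domain check and error handling trimmed.

/* Preemptible population loop (condensed; see the full diff below). */
static int iommu_populate_page_table(struct domain *d)
{
    struct page_info *page;
    int rc = 0, n = 0;

    d->need_iommu = -1;                /* mark: mappings being created */

    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
    {
        /* ... establish the IOMMU mapping for this page ... */

        page_list_add_tail(page, &d->arch.relmem_list); /* record progress */

        /* Every 256 pages, check whether we have been running too long. */
        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
             hypercall_preempt_check() )
            rc = -ERESTART;            /* ask to be re-invoked */
    }

    return rc;
}

/* The domctl handler turns -ERESTART into a continuation, so the same
 * hypercall gets re-issued and the loop resumes where it stopped:
 *
 *     if ( ret == -ERESTART )
 *         ret = hypercall_create_continuation(__HYPERVISOR_domctl,
 *                                             "h", u_domctl);
 */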

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1923,6 +1923,12 @@ int domain_relinquish_resources(struct d
         }
 
         d->arch.relmem = RELMEM_xen;
+
+        spin_lock(&d->page_alloc_lock);
+        page_list_splice(&d->arch.relmem_list, &d->page_list);
+        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
+        spin_unlock(&d->page_alloc_lock);
+
         /* Fallthrough. Relinquish every page of memory. */
     case RELMEM_xen:
         ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag
 
 pod_hit:
     lock_page_alloc(p2m);
-    page_list_add_tail(p, &d->arch.relmem_list);
+    /* Insertion must be at list head (see iommu_populate_page_table()). */
+    page_list_add(p, &d->arch.relmem_list);
     unlock_page_alloc(p2m);
     pod_unlock(p2m);
     return 1;
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -18,6 +18,7 @@
 #include <asm/hvm/iommu.h>
 #include <xen/paging.h>
 #include <xen/guest_access.h>
+#include <xen/event.h>
 #include <xen/softirq.h>
 #include <xen/keyhandler.h>
 #include <xsm/xsm.h>
@@ -265,7 +266,23 @@ static int assign_device(struct domain *
              d->mem_event->paging.ring_page)) )
         return -EXDEV;
 
-    spin_lock(&pcidevs_lock);
+    if ( !spin_trylock(&pcidevs_lock) )
+        return -ERESTART;
+
+    if ( need_iommu(d) <= 0 )
+    {
+        if ( !iommu_use_hap_pt(d) )
+        {
+            rc = iommu_populate_page_table(d);
+            if ( rc )
+            {
+                spin_unlock(&pcidevs_lock);
+                return rc;
+            }
+        }
+        d->need_iommu = 1;
+    }
+
     pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn);
     if ( !pdev )
     {
@@ -290,15 +307,14 @@ static int assign_device(struct domain *
                    rc);
     }
 
-    if ( has_arch_pdevs(d) && !need_iommu(d) )
+ done:
+    if ( !has_arch_pdevs(d) && need_iommu(d) )
     {
-        d->need_iommu = 1;
-        if ( !iommu_use_hap_pt(d) )
-            rc = iommu_populate_page_table(d);
-        goto done;
+        d->need_iommu = 0;
+        hd->platform_ops->teardown(d);
     }
-done:
     spin_unlock(&pcidevs_lock);
+
     return rc;
 }
 
@@ -306,12 +322,17 @@ static int iommu_populate_page_table(str
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
     struct page_info *page;
-    int rc = 0;
+    int rc = 0, n = 0;
+
+    d->need_iommu = -1;
 
     this_cpu(iommu_dont_flush_iotlb) = 1;
     spin_lock(&d->page_alloc_lock);
 
-    page_list_for_each ( page, &d->page_list )
+    if ( unlikely(d->is_dying) )
+        rc = -ESRCH;
+
+    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
     {
         if ( is_hvm_domain(d) ||
             (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
@@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
                 d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
                 IOMMUF_readable|IOMMUF_writable);
             if ( rc )
+            {
+                page_list_add(page, &d->page_list);
                 break;
+            }
+        }
+        page_list_add_tail(page, &d->arch.relmem_list);
+        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
+             hypercall_preempt_check() )
+            rc = -ERESTART;
+    }
+
+    if ( !rc )
+    {
+        /*
+         * The expectation here is that generally there are many normal pages
+         * on relmem_list (the ones we put there) and only few being in an
+         * offline/broken state. The latter ones are always at the head of the
+         * list. Hence we first move the whole list, and then move back the
+         * first few entries.
+         */
+        page_list_move(&d->page_list, &d->arch.relmem_list);
+        while ( (page = page_list_first(&d->page_list)) != NULL &&
+                (page->count_info & (PGC_state|PGC_broken)) )
+        {
+            page_list_del(page, &d->page_list);
+            page_list_add_tail(page, &d->arch.relmem_list);
         }
     }
 
@@ -330,8 +376,11 @@ static int iommu_populate_page_table(str
 
     if ( !rc )
         iommu_iotlb_flush_all(d);
-    else
+    else if ( rc != -ERESTART )
+    {
         hd->platform_ops->teardown(d);
+        d->need_iommu = 0;
+    }
 
     return rc;
 }
@@ -688,7 +737,10 @@ int iommu_do_domctl(
 
         ret = device_assigned(seg, bus, devfn) ?:
               assign_device(d, seg, bus, devfn);
-        if ( ret )
+        if ( ret == -ERESTART )
+            ret = hypercall_create_continuation(__HYPERVISOR_domctl,
+                                                "h", u_domctl);
+        else if ( ret )
             printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
                    "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
                    seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -323,7 +323,7 @@ struct domain
 
 #ifdef HAS_PASSTHROUGH
     /* Does this guest need iommu mappings? */
-    bool_t           need_iommu;
+    s8               need_iommu;
 #endif
     /* is node-affinity automatically computed? */
     bool_t           auto_node_affinity;




* [PATCH 2/5] IOMMU: make page table deallocation preemptible
  2013-12-10 15:43 [PATCH 0/5] XSA-77 follow-ups Jan Beulich
  2013-12-10 15:45 ` [PATCH 1/5] IOMMU: make page table population preemptible Jan Beulich
@ 2013-12-10 15:46 ` Jan Beulich
  2013-12-11 19:01   ` Andrew Cooper
  2013-12-10 15:47 ` [PATCH 3/5] x86/p2m: restrict auditing to debug builds Jan Beulich
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 35+ messages in thread
From: Jan Beulich @ 2013-12-10 15:46 UTC (permalink / raw)
  To: xen-devel
  Cc: George Dunlap, Keir Fraser, xiantao.zhang, suravee.suthikulpanit

This too can take an arbitrary amount of time.

In fact, the bulk of the work is being moved to a tasklet, as handling
the necessary preemption logic inline seems close to impossible given
that the teardown may also be invoked on error paths.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
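
As a condensed, annotated sketch of the pattern added to both the AMD and
VT-d drivers below (names from the diff; the per-level walking and freeing
details are omitted): page-table pages get queued on a global list instead
of being freed synchronously, and a tasklet drains that list while yielding
whenever other softirq work is pending.

/* Producer: stash the level in the page and queue it for later cleanup. */
static void deallocate_next_page_table(struct page_info *pg, int level)
{
    spin_lock(&iommu_pt_cleanup_lock);
    PFN_ORDER(pg) = level;             /* field reused to carry the level */
    page_list_add_tail(pg, &iommu_pt_cleanup_list);
    spin_unlock(&iommu_pt_cleanup_lock);
}

/* Consumer: free queued pages until other softirq work is pending, then
 * re-schedule on the next online CPU so that a large tear-down cannot
 * monopolise a single CPU. */
static void deallocate_page_tables(unsigned long unused)
{
    do {
        struct page_info *pg;

        spin_lock(&iommu_pt_cleanup_lock);
        pg = page_list_remove_head(&iommu_pt_cleanup_list);
        spin_unlock(&iommu_pt_cleanup_lock);
        if ( !pg )
            return;                    /* nothing left to clean up */
        deallocate_page_table(pg);     /* the actual (recursive) free */
    } while ( !softirq_pending(smp_processor_id()) );

    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
}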

--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -195,6 +195,8 @@ static int __init amd_iommu_setup_dom0_d
     return 0;
 }
 
+static void deallocate_page_tables(unsigned long unused);
+
 int __init amd_iov_detect(void)
 {
     INIT_LIST_HEAD(&amd_iommu_head);
@@ -215,6 +217,7 @@ int __init amd_iov_detect(void)
         return -ENODEV;
     }
 
+    tasklet_init(&iommu_pt_cleanup_tasklet, deallocate_page_tables, 0);
     init_done = 1;
 
     if ( !amd_iommu_perdev_intremap )
@@ -405,11 +408,21 @@ static int amd_iommu_assign_device(struc
     return reassign_device(dom0, d, devfn, pdev);
 }
 
-static void deallocate_next_page_table(struct page_info* pg, int level)
+static void deallocate_next_page_table(struct page_info *pg, int level)
+{
+    spin_lock(&iommu_pt_cleanup_lock);
+    PFN_ORDER(pg) = level;
+    page_list_add_tail(pg, &iommu_pt_cleanup_list);
+    spin_unlock(&iommu_pt_cleanup_lock);
+}
+
+static void deallocate_page_table(struct page_info *pg)
 {
     void *table_vaddr, *pde;
     u64 next_table_maddr;
-    int index, next_level;
+    unsigned int index, level = PFN_ORDER(pg), next_level;
+
+    PFN_ORDER(pg) = 0;
 
     if ( level <= 1 )
     {
@@ -439,6 +452,23 @@ static void deallocate_next_page_table(s
     free_amd_iommu_pgtable(pg);
 }
 
+static void deallocate_page_tables(unsigned long unused)
+{
+    do {
+        struct page_info *pg;
+
+        spin_lock(&iommu_pt_cleanup_lock);
+        pg = page_list_remove_head(&iommu_pt_cleanup_list);
+        spin_unlock(&iommu_pt_cleanup_lock);
+        if ( !pg )
+            return;
+        deallocate_page_table(pg);
+    } while ( !softirq_pending(smp_processor_id()) );
+
+    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
+                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
+}
+
 static void deallocate_iommu_page_tables(struct domain *d)
 {
     struct hvm_iommu *hd  = domain_hvm_iommu(d);
@@ -451,6 +481,7 @@ static void deallocate_iommu_page_tables
     {
         deallocate_next_page_table(hd->root_table, hd->paging_mode);
         hd->root_table = NULL;
+        tasklet_schedule(&iommu_pt_cleanup_tasklet);
     }
     spin_unlock(&hd->mapping_lock);
 }
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in
 
 DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
 
+DEFINE_SPINLOCK(iommu_pt_cleanup_lock);
+PAGE_LIST_HEAD(iommu_pt_cleanup_list);
+struct tasklet iommu_pt_cleanup_tasklet;
+
 static struct keyhandler iommu_p2m_table = {
     .diagnostic = 0,
     .u.fn = iommu_dump_p2m_table,
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom
 
 static void iommu_free_pagetable(u64 pt_maddr, int level)
 {
-    int i;
-    struct dma_pte *pt_vaddr, *pte;
-    int next_level = level - 1;
+    struct page_info *pg = maddr_to_page(pt_maddr);
 
     if ( pt_maddr == 0 )
         return;
 
+    spin_lock(&iommu_pt_cleanup_lock);
+    PFN_ORDER(pg) = level;
+    page_list_add_tail(pg, &iommu_pt_cleanup_list);
+    spin_unlock(&iommu_pt_cleanup_lock);
+}
+
+static void iommu_free_page_table(struct page_info *pg)
+{
+    unsigned int i, next_level = PFN_ORDER(pg) - 1;
+    u64 pt_maddr = page_to_maddr(pg);
+    struct dma_pte *pt_vaddr, *pte;
+
+    PFN_ORDER(pg) = 0;
     pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
 
     for ( i = 0; i < PTE_NUM; i++ )
@@ -694,6 +705,23 @@ static void iommu_free_pagetable(u64 pt_
     free_pgtable_maddr(pt_maddr);
 }
 
+static void iommu_free_pagetables(unsigned long unused)
+{
+    do {
+        struct page_info *pg;
+
+        spin_lock(&iommu_pt_cleanup_lock);
+        pg = page_list_remove_head(&iommu_pt_cleanup_list);
+        spin_unlock(&iommu_pt_cleanup_lock);
+        if ( !pg )
+            return;
+        iommu_free_page_table(pg);
+    } while ( !softirq_pending(smp_processor_id()) );
+
+    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
+                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
+}
+
 static int iommu_set_root_entry(struct iommu *iommu)
 {
     u32 sts;
@@ -1704,6 +1732,8 @@ void iommu_domain_teardown(struct domain
     iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
     hd->pgd_maddr = 0;
     spin_unlock(&hd->mapping_lock);
+
+    tasklet_schedule(&iommu_pt_cleanup_tasklet);
 }
 
 static int intel_iommu_map_page(
@@ -2204,6 +2234,7 @@ int __init intel_vtd_setup(void)
     if ( ret )
         goto error;
 
+    tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0);
     register_keyhandler('V', &dump_iommu_info_keyhandler);
 
     return 0;
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -151,4 +151,8 @@ int adjust_vtd_irq_affinities(void);
  */
 DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
 
+extern struct spinlock iommu_pt_cleanup_lock;
+extern struct page_list_head iommu_pt_cleanup_list;
+extern struct tasklet iommu_pt_cleanup_tasklet;
+
 #endif /* _IOMMU_H_ */




* [PATCH 3/5] x86/p2m: restrict auditing to debug builds
  2013-12-10 15:43 [PATCH 0/5] XSA-77 follow-ups Jan Beulich
  2013-12-10 15:45 ` [PATCH 1/5] IOMMU: make page table population preemptible Jan Beulich
  2013-12-10 15:46 ` [PATCH 2/5] IOMMU: make page table deallocation preemptible Jan Beulich
@ 2013-12-10 15:47 ` Jan Beulich
  2013-12-10 15:58   ` Andrew Cooper
                     ` (2 more replies)
  2013-12-10 15:48 ` [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one() Jan Beulich
  2013-12-10 15:48 ` [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest() Jan Beulich
  4 siblings, 3 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-10 15:47 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Tim Deegan, Keir Fraser

... since iterating through all of a guest's pages may take unduly
long.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -600,7 +600,11 @@ int set_p2m_entry(struct p2m_domain *p2m
 extern void p2m_pt_init(struct p2m_domain *p2m);
 
 /* Debugging and auditing of the P2M code? */
+#ifndef NDEBUG
 #define P2M_AUDIT     1
+#else
+#define P2M_AUDIT     0
+#endif
 #define P2M_DEBUGGING 0
 
 #if P2M_AUDIT





* [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
  2013-12-10 15:43 [PATCH 0/5] XSA-77 follow-ups Jan Beulich
                   ` (2 preceding siblings ...)
  2013-12-10 15:47 ` [PATCH 3/5] x86/p2m: restrict auditing to debug builds Jan Beulich
@ 2013-12-10 15:48 ` Jan Beulich
  2013-12-10 16:01   ` Andrew Cooper
                     ` (2 more replies)
  2013-12-10 15:48 ` [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest() Jan Beulich
  4 siblings, 3 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-10 15:48 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Keir Fraser, Don Slutz

When one or more of a guest's vCPUs are offline, no data gets written
into the space allocated for them and, due to another bug, that
uninitialized data may then be passed back to the caller.

Signed-off-by: Don Slutz <dslutz@verizon.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
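
A short sketch of the scenario being fixed: the buffer sizing and the
switch to xzalloc_bytes() are from the diff below, while the loop and
save_one_vcpu() are hypothetical stand-ins for the real per-type save
handlers.

ctxt.size = sz;                  /* sz covers a record slot per vCPU */
ctxt.data = xzalloc_bytes(sz);   /* was xmalloc_bytes(): not zeroed */
if ( !ctxt.data )
    return -ENOMEM;

for_each_vcpu ( d, v )
    if ( !test_bit(_VPF_down, &v->pause_flags) )
        save_one_vcpu(&ctxt, v); /* stand-in: offline vCPUs write nothing */

/* With xmalloc_bytes(), the slots of offline vCPUs kept whatever the heap
 * previously held, and a second bug could copy those bytes back to the
 * caller.  Zero-filling the buffer turns that leak into harmless zeros. */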

--- a/xen/common/hvm/save.c
+++ b/xen/common/hvm/save.c
@@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1
         return -EINVAL;
 
     ctxt.size = sz;
-    ctxt.data = xmalloc_bytes(sz);
+    ctxt.data = xzalloc_bytes(sz);
     if ( !ctxt.data )
         return -ENOMEM;


* [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
  2013-12-10 15:43 [PATCH 0/5] XSA-77 follow-ups Jan Beulich
                   ` (3 preceding siblings ...)
  2013-12-10 15:48 ` [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one() Jan Beulich
@ 2013-12-10 15:48 ` Jan Beulich
  2013-12-10 17:23   ` Andrew Cooper
  2013-12-10 17:33   ` George Dunlap
  4 siblings, 2 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-10 15:48 UTC (permalink / raw)
  To: xen-devel; +Cc: George Dunlap, Keir Fraser

They're being taken care of later (via set_debugreg()), and temporarily
copying them into struct vcpu means that bad values may end up getting
loaded during context switch if the vCPU is already running and the
function errors out between the premature and real commit step, leading
to the same issue that XSA-12 dealt with.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
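
A condensed sketch of the ordering hazard and of the fix (both fragments
are taken from the diff below; validation and error paths are trimmed):

/* Before: guest-supplied values were committed to struct vcpu before all
 * later checks in arch_set_info_guest() had passed ... */
for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
    v->arch.debugreg[i] = c(debugreg[i]);
/* ... so if the function errored out afterwards, a context switch of an
 * already running vCPU could load these unvalidated values into the
 * hardware debug registers (the XSA-12 class of problem). */

/* After: only HVM container vCPUs take the early copy; for PV the values
 * are committed later through set_debugreg(), which validates them. */
if ( has_hvm_container_vcpu(v) )
{
    for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
        v->arch.debugreg[i] = c(debugreg[i]);
    /* ... */
}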

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -740,11 +740,12 @@ int arch_set_info_guest(
             XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
                            c.cmp->trap_ctxt + i);
     }
-    for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
-        v->arch.debugreg[i] = c(debugreg[i]);
 
     if ( has_hvm_container_vcpu(v) )
     {
+        for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
+            v->arch.debugreg[i] = c(debugreg[i]);
+
         /*
          * NB: TF_kernel_mode is set unconditionally for HVM guests,
          * so we always use the gs_base_kernel here. If we change this





* Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
  2013-12-10 15:47 ` [PATCH 3/5] x86/p2m: restrict auditing to debug builds Jan Beulich
@ 2013-12-10 15:58   ` Andrew Cooper
  2013-12-10 17:31   ` George Dunlap
  2013-12-13 13:46   ` Tim Deegan
  2 siblings, 0 replies; 35+ messages in thread
From: Andrew Cooper @ 2013-12-10 15:58 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Keir Fraser, Tim Deegan


On 10/12/13 15:47, Jan Beulich wrote:
> ... since iterating through all of a guest's pages may take unduly
> long.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

>
> --- a/xen/include/asm-x86/p2m.h
> +++ b/xen/include/asm-x86/p2m.h
> @@ -600,7 +600,11 @@ int set_p2m_entry(struct p2m_domain *p2m
>  extern void p2m_pt_init(struct p2m_domain *p2m);
>  
>  /* Debugging and auditing of the P2M code? */
> +#ifndef NDEBUG
>  #define P2M_AUDIT     1
> +#else
> +#define P2M_AUDIT     0
> +#endif
>  #define P2M_DEBUGGING 0
>  
>  #if P2M_AUDIT

* Re: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
  2013-12-10 15:48 ` [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one() Jan Beulich
@ 2013-12-10 16:01   ` Andrew Cooper
  2013-12-10 17:32   ` George Dunlap
  2013-12-17  9:16   ` Ping: " Jan Beulich
  2 siblings, 0 replies; 35+ messages in thread
From: Andrew Cooper @ 2013-12-10 16:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Keir Fraser, Don Slutz

On 10/12/13 15:48, Jan Beulich wrote:
> When one or more of a guest's vCPUs are offline, no data gets written
> into the space allocated for them and, due to another bug, that
> uninitialized data may then be passed back to the caller.
>
> Signed-off-by: Don Slutz <dslutz@verizon.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

>
> --- a/xen/common/hvm/save.c
> +++ b/xen/common/hvm/save.c
> @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1
>          return -EINVAL;
>  
>      ctxt.size = sz;
> -    ctxt.data = xmalloc_bytes(sz);
> +    ctxt.data = xzalloc_bytes(sz);
>      if ( !ctxt.data )
>          return -ENOMEM;
>  

* Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
  2013-12-10 15:48 ` [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest() Jan Beulich
@ 2013-12-10 17:23   ` Andrew Cooper
  2013-12-10 17:33   ` George Dunlap
  1 sibling, 0 replies; 35+ messages in thread
From: Andrew Cooper @ 2013-12-10 17:23 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Keir Fraser


On 10/12/13 15:48, Jan Beulich wrote:
> They're being taken care of later (via set_debugreg()), and temporarily
> copying them into struct vcpu means that bad values may end up getting
> loaded during context switch if the vCPU is already running and the
> function errors out between the premature and real commit step, leading
> to the same issue that XSA-12 dealt with.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -740,11 +740,12 @@ int arch_set_info_guest(
>              XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
>                             c.cmp->trap_ctxt + i);
>      }
> -    for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
> -        v->arch.debugreg[i] = c(debugreg[i]);
>  
>      if ( has_hvm_container_vcpu(v) )
>      {
> +        for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
> +            v->arch.debugreg[i] = c(debugreg[i]);
> +
>          /*
>           * NB: TF_kernel_mode is set unconditionally for HVM guests,
>           * so we always use the gs_base_kernel here. If we change this

* Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
  2013-12-10 15:47 ` [PATCH 3/5] x86/p2m: restrict auditing to debug builds Jan Beulich
  2013-12-10 15:58   ` Andrew Cooper
@ 2013-12-10 17:31   ` George Dunlap
  2013-12-13 13:46   ` Tim Deegan
  2 siblings, 0 replies; 35+ messages in thread
From: George Dunlap @ 2013-12-10 17:31 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Tim Deegan, Keir Fraser

On 12/10/2013 03:47 PM, Jan Beulich wrote:
> ... since iterating through all of a guest's pages may take unduly
> long.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>

Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>


* Re: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
  2013-12-10 15:48 ` [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one() Jan Beulich
  2013-12-10 16:01   ` Andrew Cooper
@ 2013-12-10 17:32   ` George Dunlap
  2013-12-17  9:16   ` Ping: " Jan Beulich
  2 siblings, 0 replies; 35+ messages in thread
From: George Dunlap @ 2013-12-10 17:32 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser, Don Slutz

On 12/10/2013 03:48 PM, Jan Beulich wrote:
> When one or more of a guest's vCPUs are offline, no data gets written
> into the space allocated for them and, due to another bug, that
> uninitialized data may then be passed back to the caller.
>
> Signed-off-by: Don Slutz <dslutz@verizon.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>

>
> --- a/xen/common/hvm/save.c
> +++ b/xen/common/hvm/save.c
> @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1
>           return -EINVAL;
>   
>       ctxt.size = sz;
> -    ctxt.data = xmalloc_bytes(sz);
> +    ctxt.data = xzalloc_bytes(sz);
>       if ( !ctxt.data )
>           return -ENOMEM;
>   

* Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
  2013-12-10 15:48 ` [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest() Jan Beulich
  2013-12-10 17:23   ` Andrew Cooper
@ 2013-12-10 17:33   ` George Dunlap
  2013-12-10 18:17     ` Keir Fraser
  1 sibling, 1 reply; 35+ messages in thread
From: George Dunlap @ 2013-12-10 17:33 UTC (permalink / raw)
  To: Jan Beulich, xen-devel; +Cc: Keir Fraser

On 12/10/2013 03:48 PM, Jan Beulich wrote:
> They're being taken care of later (via set_debugreg()), and temporarily
> copying them into struct vcpu means that bad values may end up getting
> loaded during context switch if the vCPU is already running and the
> function errors out between the premature and real commit step, leading
> to the same issue that XSA-12 dealt with.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>

Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>

>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -740,11 +740,12 @@ int arch_set_info_guest(
>               XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
>                              c.cmp->trap_ctxt + i);
>       }
> -    for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
> -        v->arch.debugreg[i] = c(debugreg[i]);
>   
>       if ( has_hvm_container_vcpu(v) )
>       {
> +        for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
> +            v->arch.debugreg[i] = c(debugreg[i]);
> +
>           /*
>            * NB: TF_kernel_mode is set unconditionally for HVM guests,
>            * so we always use the gs_base_kernel here. If we change this

* Re: [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest()
  2013-12-10 17:33   ` George Dunlap
@ 2013-12-10 18:17     ` Keir Fraser
  0 siblings, 0 replies; 35+ messages in thread
From: Keir Fraser @ 2013-12-10 18:17 UTC (permalink / raw)
  To: George Dunlap, Jan Beulich, xen-devel

On 10/12/2013 17:33, "George Dunlap" <george.dunlap@eu.citrix.com> wrote:

> On 12/10/2013 03:48 PM, Jan Beulich wrote:
>> They're being taken care of later (via set_debugreg()), and temporarily
>> copying them into struct vcpu means that bad values may end up getting
>> loaded during context switch if the vCPU is already running and the
>> function errors out between the premature and real commit step, leading
>> to the same issue that XSA-12 dealt with.
>> 
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> 
> Release-acked-by: George Dunlap <george.dunlap@eu.citrix.com>

Acked-by: Keir Fraser <keir@xen.org>

>> 
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -740,11 +740,12 @@ int arch_set_info_guest(
>>               XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
>>                              c.cmp->trap_ctxt + i);
>>       }
>> -    for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
>> -        v->arch.debugreg[i] = c(debugreg[i]);
>>   
>>       if ( has_hvm_container_vcpu(v) )
>>       {
>> +        for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
>> +            v->arch.debugreg[i] = c(debugreg[i]);
>> +
>>           /*
>>            * NB: TF_kernel_mode is set unconditionally for HVM guests,
>>            * so we always use the gs_base_kernel here. If we change this

* Re: [PATCH 1/5] IOMMU: make page table population preemptible
  2013-12-10 15:45 ` [PATCH 1/5] IOMMU: make page table population preemptible Jan Beulich
@ 2013-12-11 18:40   ` Andrew Cooper
  2013-12-13  9:51     ` Jan Beulich
  2013-12-13 13:59     ` [PATCH v2 " Jan Beulich
  2013-12-20 13:06   ` [PATCH " Jan Beulich
  1 sibling, 2 replies; 35+ messages in thread
From: Andrew Cooper @ 2013-12-11 18:40 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: George Dunlap, Keir Fraser, Tim Deegan, xiantao.zhang

On 10/12/2013 15:45, Jan Beulich wrote:
> Since this can take an arbitrary amount of time, the initiating domctl as
> well as all involved code must become aware that the operation may require
> a continuation.
>
> The subject domain's relmem_list is being (ab)used for this, in a way
> similar to and compatible with broken page offlining.
>
> Further, operations get slightly re-ordered in assign_device(): IOMMU
> page tables now get set up _before_ the first device gets assigned,
> thereby also closing a small timing window in which the guest could
> already see the device but would not yet be able to access it.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1923,6 +1923,12 @@ int domain_relinquish_resources(struct d
>          }
>  
>          d->arch.relmem = RELMEM_xen;
> +
> +        spin_lock(&d->page_alloc_lock);
> +        page_list_splice(&d->arch.relmem_list, &d->page_list);
> +        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
> +        spin_unlock(&d->page_alloc_lock);
> +
>          /* Fallthrough. Relinquish every page of memory. */
>      case RELMEM_xen:
>          ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag
>  
>  pod_hit:
>      lock_page_alloc(p2m);
> -    page_list_add_tail(p, &d->arch.relmem_list);
> +    /* Insertion must be at list head (see iommu_populate_page_table()). */
> +    page_list_add(p, &d->arch.relmem_list);
>      unlock_page_alloc(p2m);
>      pod_unlock(p2m);
>      return 1;
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -18,6 +18,7 @@
>  #include <asm/hvm/iommu.h>
>  #include <xen/paging.h>
>  #include <xen/guest_access.h>
> +#include <xen/event.h>
>  #include <xen/softirq.h>
>  #include <xen/keyhandler.h>
>  #include <xsm/xsm.h>
> @@ -265,7 +266,23 @@ static int assign_device(struct domain *
>               d->mem_event->paging.ring_page)) )
>          return -EXDEV;
>  
> -    spin_lock(&pcidevs_lock);
> +    if ( !spin_trylock(&pcidevs_lock) )
> +        return -ERESTART;
> +
> +    if ( need_iommu(d) <= 0 )
> +    {
> +        if ( !iommu_use_hap_pt(d) )
> +        {
> +            rc = iommu_populate_page_table(d);
> +            if ( rc )
> +            {
> +                spin_unlock(&pcidevs_lock);
> +                return rc;
> +            }
> +        }
> +        d->need_iommu = 1;
> +    }
> +
>      pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn);
>      if ( !pdev )
>      {
> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>                     rc);
>      }
>  
> -    if ( has_arch_pdevs(d) && !need_iommu(d) )
> + done:
> +    if ( !has_arch_pdevs(d) && need_iommu(d) )
>      {
> -        d->need_iommu = 1;
> -        if ( !iommu_use_hap_pt(d) )
> -            rc = iommu_populate_page_table(d);
> -        goto done;
> +        d->need_iommu = 0;
> +        hd->platform_ops->teardown(d);
>      }
> -done:
>      spin_unlock(&pcidevs_lock);
> +
>      return rc;
>  }
>  
> @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str
>  {
>      struct hvm_iommu *hd = domain_hvm_iommu(d);
>      struct page_info *page;
> -    int rc = 0;
> +    int rc = 0, n = 0;
> +
> +    d->need_iommu = -1;
>  
>      this_cpu(iommu_dont_flush_iotlb) = 1;
>      spin_lock(&d->page_alloc_lock);
>  
> -    page_list_for_each ( page, &d->page_list )
> +    if ( unlikely(d->is_dying) )
> +        rc = -ESRCH;
> +
> +    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
>      {
>          if ( is_hvm_domain(d) ||
>              (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
>                  d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
>                  IOMMUF_readable|IOMMUF_writable);
>              if ( rc )
> +            {
> +                page_list_add(page, &d->page_list);
>                  break;
> +            }
> +        }
> +        page_list_add_tail(page, &d->arch.relmem_list);
> +        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&

Why the forced restart here?  If nothing needs pre-empting, surely it is
better to continue?

Or is this about equality on the pcidevs_lock ?

> +             hypercall_preempt_check() )
> +            rc = -ERESTART;
> +    }
> +
> +    if ( !rc )
> +    {
> +        /*
> +         * The expectation here is that generally there are many normal pages
> +         * on relmem_list (the ones we put there) and only few being in an
> +         * offline/broken state. The latter ones are always at the head of the
> +         * list. Hence we first move the whole list, and then move back the
> +         * first few entries.
> +         */
> +        page_list_move(&d->page_list, &d->arch.relmem_list);
> +        while ( (page = page_list_first(&d->page_list)) != NULL &&
> +                (page->count_info & (PGC_state|PGC_broken)) )
> +        {
> +            page_list_del(page, &d->page_list);
> +            page_list_add_tail(page, &d->arch.relmem_list);
>          }
>      }
>  
> @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str
>  
>      if ( !rc )
>          iommu_iotlb_flush_all(d);
> -    else
> +    else if ( rc != -ERESTART )
> +    {
>          hd->platform_ops->teardown(d);
> +        d->need_iommu = 0;
> +    }
>  
>      return rc;
>  }
> @@ -688,7 +737,10 @@ int iommu_do_domctl(
>  
>          ret = device_assigned(seg, bus, devfn) ?:
>                assign_device(d, seg, bus, devfn);
> -        if ( ret )
> +        if ( ret == -ERESTART )
> +            ret = hypercall_create_continuation(__HYPERVISOR_domctl,
> +                                                "h", u_domctl);
> +        else if ( ret )
>              printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
>                     "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
>                     seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -323,7 +323,7 @@ struct domain
>  
>  #ifdef HAS_PASSTHROUGH
>      /* Does this guest need iommu mappings? */
> -    bool_t           need_iommu;
> +    s8               need_iommu;

I think this change from bool_t to s8 needs a comment explaining that -1
indicates "the iommu mappings are pending creation"

Is there any particular reason that -ERESTART is used when -EAGAIN is
the prevailing style for hypercall continuations?

~Andrew


* Re: [PATCH 2/5] IOMMU: make page table deallocation preemptible
  2013-12-10 15:46 ` [PATCH 2/5] IOMMU: make page table deallocation preemptible Jan Beulich
@ 2013-12-11 19:01   ` Andrew Cooper
  2013-12-13  9:55     ` Jan Beulich
  2013-12-13 14:00     ` [PATCH v2 " Jan Beulich
  0 siblings, 2 replies; 35+ messages in thread
From: Andrew Cooper @ 2013-12-11 19:01 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: George Dunlap, Keir Fraser, suravee.suthikulpanit, xiantao.zhang


On 10/12/2013 15:46, Jan Beulich wrote:
> This too can take an arbitrary amount of time.
>
> In fact, the bulk of the work is being moved to a tasklet, as handling
> the necessary preemption logic inline seems close to impossible given
> that the teardown may also be invoked on error paths.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

There is a lot of duplicated code here.  I would suggest making most of
this infrastructure common, and having a new iommu_op for "void
free_io_page_table(struct page_info*);"

~Andrew

>
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -195,6 +195,8 @@ static int __init amd_iommu_setup_dom0_d
>      return 0;
>  }
>  
> +static void deallocate_page_tables(unsigned long unused);
> +
>  int __init amd_iov_detect(void)
>  {
>      INIT_LIST_HEAD(&amd_iommu_head);
> @@ -215,6 +217,7 @@ int __init amd_iov_detect(void)
>          return -ENODEV;
>      }
>  
> +    tasklet_init(&iommu_pt_cleanup_tasklet, deallocate_page_tables, 0);
>      init_done = 1;
>  
>      if ( !amd_iommu_perdev_intremap )
> @@ -405,11 +408,21 @@ static int amd_iommu_assign_device(struc
>      return reassign_device(dom0, d, devfn, pdev);
>  }
>  
> -static void deallocate_next_page_table(struct page_info* pg, int level)
> +static void deallocate_next_page_table(struct page_info *pg, int level)
> +{
> +    spin_lock(&iommu_pt_cleanup_lock);
> +    PFN_ORDER(pg) = level;
> +    page_list_add_tail(pg, &iommu_pt_cleanup_list);
> +    spin_unlock(&iommu_pt_cleanup_lock);
> +}
> +
> +static void deallocate_page_table(struct page_info *pg)
>  {
>      void *table_vaddr, *pde;
>      u64 next_table_maddr;
> -    int index, next_level;
> +    unsigned int index, level = PFN_ORDER(pg), next_level;
> +
> +    PFN_ORDER(pg) = 0;
>  
>      if ( level <= 1 )
>      {
> @@ -439,6 +452,23 @@ static void deallocate_next_page_table(s
>      free_amd_iommu_pgtable(pg);
>  }
>  
> +static void deallocate_page_tables(unsigned long unused)
> +{
> +    do {
> +        struct page_info *pg;
> +
> +        spin_lock(&iommu_pt_cleanup_lock);
> +        pg = page_list_remove_head(&iommu_pt_cleanup_list);
> +        spin_unlock(&iommu_pt_cleanup_lock);
> +        if ( !pg )
> +            return;
> +        deallocate_page_table(pg);
> +    } while ( !softirq_pending(smp_processor_id()) );
> +
> +    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
> +                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
> +}
> +
>  static void deallocate_iommu_page_tables(struct domain *d)
>  {
>      struct hvm_iommu *hd  = domain_hvm_iommu(d);
> @@ -451,6 +481,7 @@ static void deallocate_iommu_page_tables
>      {
>          deallocate_next_page_table(hd->root_table, hd->paging_mode);
>          hd->root_table = NULL;
> +        tasklet_schedule(&iommu_pt_cleanup_tasklet);
>      }
>      spin_unlock(&hd->mapping_lock);
>  }
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in
>  
>  DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
>  
> +DEFINE_SPINLOCK(iommu_pt_cleanup_lock);
> +PAGE_LIST_HEAD(iommu_pt_cleanup_list);
> +struct tasklet iommu_pt_cleanup_tasklet;
> +
>  static struct keyhandler iommu_p2m_table = {
>      .diagnostic = 0,
>      .u.fn = iommu_dump_p2m_table,
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom
>  
>  static void iommu_free_pagetable(u64 pt_maddr, int level)
>  {
> -    int i;
> -    struct dma_pte *pt_vaddr, *pte;
> -    int next_level = level - 1;
> +    struct page_info *pg = maddr_to_page(pt_maddr);
>  
>      if ( pt_maddr == 0 )
>          return;
>  
> +    spin_lock(&iommu_pt_cleanup_lock);
> +    PFN_ORDER(pg) = level;
> +    page_list_add_tail(pg, &iommu_pt_cleanup_list);
> +    spin_unlock(&iommu_pt_cleanup_lock);
> +}
> +
> +static void iommu_free_page_table(struct page_info *pg)
> +{
> +    unsigned int i, next_level = PFN_ORDER(pg) - 1;
> +    u64 pt_maddr = page_to_maddr(pg);
> +    struct dma_pte *pt_vaddr, *pte;
> +
> +    PFN_ORDER(pg) = 0;
>      pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
>  
>      for ( i = 0; i < PTE_NUM; i++ )
> @@ -694,6 +705,23 @@ static void iommu_free_pagetable(u64 pt_
>      free_pgtable_maddr(pt_maddr);
>  }
>  
> +static void iommu_free_pagetables(unsigned long unused)
> +{
> +    do {
> +        struct page_info *pg;
> +
> +        spin_lock(&iommu_pt_cleanup_lock);
> +        pg = page_list_remove_head(&iommu_pt_cleanup_list);
> +        spin_unlock(&iommu_pt_cleanup_lock);
> +        if ( !pg )
> +            return;
> +        iommu_free_page_table(pg);
> +    } while ( !softirq_pending(smp_processor_id()) );
> +
> +    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
> +                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
> +}
> +
>  static int iommu_set_root_entry(struct iommu *iommu)
>  {
>      u32 sts;
> @@ -1704,6 +1732,8 @@ void iommu_domain_teardown(struct domain
>      iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
>      hd->pgd_maddr = 0;
>      spin_unlock(&hd->mapping_lock);
> +
> +    tasklet_schedule(&iommu_pt_cleanup_tasklet);
>  }
>  
>  static int intel_iommu_map_page(
> @@ -2204,6 +2234,7 @@ int __init intel_vtd_setup(void)
>      if ( ret )
>          goto error;
>  
> +    tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0);
>      register_keyhandler('V', &dump_iommu_info_keyhandler);
>  
>      return 0;
> --- a/xen/include/xen/iommu.h
> +++ b/xen/include/xen/iommu.h
> @@ -151,4 +151,8 @@ int adjust_vtd_irq_affinities(void);
>   */
>  DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
>  
> +extern struct spinlock iommu_pt_cleanup_lock;
> +extern struct page_list_head iommu_pt_cleanup_list;
> +extern struct tasklet iommu_pt_cleanup_tasklet;
> +
>  #endif /* _IOMMU_H_ */

* Re: [PATCH 1/5] IOMMU: make page table population preemptible
  2013-12-11 18:40   ` Andrew Cooper
@ 2013-12-13  9:51     ` Jan Beulich
  2013-12-13 12:18       ` Andrew Cooper
  2013-12-13 13:59     ` [PATCH v2 " Jan Beulich
  1 sibling, 1 reply; 35+ messages in thread
From: Jan Beulich @ 2013-12-13  9:51 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, xen-devel, Keir Fraser, xiantao.zhang, Tim Deegan

>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 10/12/2013 15:45, Jan Beulich wrote:
>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
>>                  d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
>>                  IOMMUF_readable|IOMMUF_writable);
>>              if ( rc )
>> +            {
>> +                page_list_add(page, &d->page_list);
>>                  break;
>> +            }
>> +        }
>> +        page_list_add_tail(page, &d->arch.relmem_list);
>> +        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
> 
> Why the forced restart here?  If nothing needs pre-empting, surely it is
> better to continue?
> 
> Or is this about equality on the pcidevs_lock ?
> 
>> +             hypercall_preempt_check() )

Did you overlook this part of the condition?

>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -323,7 +323,7 @@ struct domain
>>  
>>  #ifdef HAS_PASSTHROUGH
>>      /* Does this guest need iommu mappings? */
>> -    bool_t           need_iommu;
>> +    s8               need_iommu;
> 
> I think this change from bool_t to s8 needs a comment explaining that -1
> indicates "the iommu mappings are pending creation"

Will do.

> Is there any particular reason that -ERESTART is used when -EAGAIN is
> the prevailing style for hypercall continuations?

I meanwhile realized that using -EAGAIN was a mistake (iirc taken
from certain domctl-s having passed this back up to the caller to
request re-invocation a long time ago) - -EAGAIN really has a
different meaning, and hence we ought to switch all its current
mis-uses to -ERESTART.
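
To make the distinction concrete, a minimal sketch of the resulting
convention (the surrounding code is illustrative; only the error codes
and hypercall_create_continuation(), as used in patch 1, are real):

    ret = some_long_running_op(d, u_domctl);   /* hypothetical handler */

    if ( ret == -ERESTART )
        /* Consumed inside Xen: the domctl is transparently re-executed. */
        ret = hypercall_create_continuation(__HYPERVISOR_domctl,
                                            "h", u_domctl);
    /* -EAGAIN, by contrast, would simply be handed back, leaving the
     * caller to decide whether and when to retry. */

    return ret;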

Jan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/5] IOMMU: make page table deallocation preemptible
  2013-12-11 19:01   ` Andrew Cooper
@ 2013-12-13  9:55     ` Jan Beulich
  2013-12-13 14:00     ` [PATCH v2 " Jan Beulich
  1 sibling, 0 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-13  9:55 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, xen-devel, Keir Fraser, xiantao.zhang,
	suravee.suthikulpanit

>>> On 11.12.13 at 20:01, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 10/12/2013 15:46, Jan Beulich wrote:
>> This too can take an arbitrary amount of time.
>>
>> In fact, the bulk of the work is being moved to a tasklet, as handling
>> the necessary preemption logic in line seems close to impossible given
>> that the teardown may also be invoked on error paths.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> There is a lot of duplicated code here.  I would suggest making most of
> this infrastructure common, and having a new iommu_op for "void
> free_io_page_table(struct page_info*);"

I guess that could be done - let me see how this works out.
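
For reference, the rough shape such a common hook takes (this mirrors
what the v2 patch later in the thread ends up adding; the fragment is
a sketch, not a literal quote):

    struct iommu_ops {
        /* ... existing per-vendor callbacks ... */
        void (*free_page_table)(struct page_info *);
    };

    /* Common code can then free a queued table without knowing whether
     * it belongs to VT-d or AMD-Vi: */
    iommu_get_ops()->free_page_table(pg);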

Jan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/5] IOMMU: make page table population preemptible
  2013-12-13  9:51     ` Jan Beulich
@ 2013-12-13 12:18       ` Andrew Cooper
  2013-12-13 12:34         ` Jan Beulich
  0 siblings, 1 reply; 35+ messages in thread
From: Andrew Cooper @ 2013-12-13 12:18 UTC (permalink / raw)
  To: Jan Beulich
  Cc: George Dunlap, xen-devel, Keir Fraser, xiantao.zhang, Tim Deegan

On 13/12/2013 09:51, Jan Beulich wrote:
>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 10/12/2013 15:45, Jan Beulich wrote:
>>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
>>>                  d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
>>>                  IOMMUF_readable|IOMMUF_writable);
>>>              if ( rc )
>>> +            {
>>> +                page_list_add(page, &d->page_list);
>>>                  break;
>>> +            }
>>> +        }
>>> +        page_list_add_tail(page, &d->arch.relmem_list);
>>> +        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
>> Why the forced restart here?  If nothing needs pre-empting, surely it is
>> better to continue?
>>
>> Or is this about equality on the pcidevs_lock ?
>>
>>> +             hypercall_preempt_check() )
> Did you overlook this part of the condition?

No, but I did mentally get the logic inverted when trying to work out
what was going on.

How about (++n > 0xff) ?

If we have already spent a while in this loop, and the
hypercall_preempt_check() doesn't flip to 1 until a few iterations after
n is congruent with 0x100, waiting for another 0x100 iterations before
checking again seems a little long.
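
Side by side, for readers following the thread (illustrative only, not
taken from any posted version):

    /* As posted: hypercall_preempt_check() is reachable only on every
     * 256th iteration (n == 0x100, 0x200, ...). */
    if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
         hypercall_preempt_check() )
        rc = -ERESTART;

    /* As suggested: once n exceeds 0xff the check is reachable on every
     * iteration, reacting sooner once preemption becomes due at the cost
     * of many more hypercall_preempt_check() calls. */
    if ( ++n > 0xff && !page_list_empty(&d->page_list) &&
         hypercall_preempt_check() )
        rc = -ERESTART;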

>
>>> --- a/xen/include/xen/sched.h
>>> +++ b/xen/include/xen/sched.h
>>> @@ -323,7 +323,7 @@ struct domain
>>>  
>>>  #ifdef HAS_PASSTHROUGH
>>>      /* Does this guest need iommu mappings? */
>>> -    bool_t           need_iommu;
>>> +    s8               need_iommu;
>> I think this change from bool_t to s8 needs a comment explaining that -1
>> indicates "the iommu mappings are pending creation"
> Will do.
>
>> Is there any particular reason that -ERESTART is used when -EAGAIN is
>> the prevailing style for hypercall continuations?
> I meanwhile realized that using -EAGAIN was a mistake (iirc taken
> from certain domctl-s having passed this back up to the caller to
> request re-invocation a long time ago) - -EAGAIN really has a
> different meaning, and hence we ought to switch all its current
> mis-uses to -ERESTART.
>
> Jan
>

Ok.

~Andrew

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/5] IOMMU: make page table population preemptible
  2013-12-13 12:18       ` Andrew Cooper
@ 2013-12-13 12:34         ` Jan Beulich
  2013-12-13 13:57           ` Andrew Cooper
  0 siblings, 1 reply; 35+ messages in thread
From: Jan Beulich @ 2013-12-13 12:34 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, xen-devel, Keir Fraser, xiantao.zhang, Tim Deegan

>>> On 13.12.13 at 13:18, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 13/12/2013 09:51, Jan Beulich wrote:
>>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>>> On 10/12/2013 15:45, Jan Beulich wrote:
>>>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
>>>>                  d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
>>>>                  IOMMUF_readable|IOMMUF_writable);
>>>>              if ( rc )
>>>> +            {
>>>> +                page_list_add(page, &d->page_list);
>>>>                  break;
>>>> +            }
>>>> +        }
>>>> +        page_list_add_tail(page, &d->arch.relmem_list);
>>>> +        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
>>> Why the forced restart here?  If nothing needs pre-empting, surely it is
>>> better to continue?
>>>
>>> Or is this about equality on the pcidevs_lock ?
>>>
>>>> +             hypercall_preempt_check() )
>> Did you overlook this part of the condition?
> 
> No, but I did mentally get the logic inverted when trying to work out
> what was going on.
> 
> How about (++n > 0xff) ?
> 
> If we have already spent a while in this loop, and the
> hypercall_preempt_check() doesn't flip to 1 until a few iterations after
> n is congruent with 0x100, waiting for another 0x100 iterations before
> checking again seems a little long.

The thing I'm trying to avoid is calling hypercall_preempt_check()
overly often - especially when the prior if()'s body doesn't get
entered, we might otherwise do very little useful work per check.

While 256 isn't overly much (covering merely 1Mb of guest space),
I could see two possibilities to get this a little more towards what
you want: Either we right shift the mask each time we actually
call hypercall_preempt_check(), or we bias the increment (adding
e.g. 10 when going the expensive route, else 1).
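
Rough sketches of the two alternatives, purely for illustration (the
counter handling and the 'mapped' flag are made up, this is not code
from any posted version):

    /* Alternative 1: right-shift the mask after each actual check, so
     * checks become progressively more frequent as the loop runs on. */
    unsigned int n = 0, mask = 0xff;
    /* ... inside the population loop ... */
    if ( !(++n & mask) && !page_list_empty(&d->page_list) )
    {
        if ( hypercall_preempt_check() )
            rc = -ERESTART;
        mask >>= 1;
    }

    /* Alternative 2: bias the increment so the threshold is reached
     * sooner when the expensive mapping path was actually taken
     * ('mapped' being a hypothetical per-iteration flag). */
    n += mapped ? 10 : 1;
    if ( n >= 0x100 && !page_list_empty(&d->page_list) )
    {
        n = 0;
        if ( hypercall_preempt_check() )
            rc = -ERESTART;
    }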

Jan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] x86/p2m: restrict auditing to debug builds
  2013-12-10 15:47 ` [PATCH 3/5] x86/p2m: restrict auditing to debug builds Jan Beulich
  2013-12-10 15:58   ` Andrew Cooper
  2013-12-10 17:31   ` George Dunlap
@ 2013-12-13 13:46   ` Tim Deegan
  2 siblings, 0 replies; 35+ messages in thread
From: Tim Deegan @ 2013-12-13 13:46 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Keir Fraser

At 15:47 +0000 on 10 Dec (1386686837), Jan Beulich wrote:
> ... since iterating through all of a guest's pages may take unduly
> long.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>

Acked-by: Tim Deegan <tim@xen.org>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/5] IOMMU: make page table population preemptible
  2013-12-13 12:34         ` Jan Beulich
@ 2013-12-13 13:57           ` Andrew Cooper
  0 siblings, 0 replies; 35+ messages in thread
From: Andrew Cooper @ 2013-12-13 13:57 UTC (permalink / raw)
  To: Jan Beulich
  Cc: George Dunlap, xen-devel, Keir Fraser, xiantao.zhang, Tim Deegan

On 13/12/2013 12:34, Jan Beulich wrote:
>>>> On 13.12.13 at 13:18, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 13/12/2013 09:51, Jan Beulich wrote:
>>>>>> On 11.12.13 at 19:40, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>>>> On 10/12/2013 15:45, Jan Beulich wrote:
>>>>> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
>>>>>                  d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
>>>>>                  IOMMUF_readable|IOMMUF_writable);
>>>>>              if ( rc )
>>>>> +            {
>>>>> +                page_list_add(page, &d->page_list);
>>>>>                  break;
>>>>> +            }
>>>>> +        }
>>>>> +        page_list_add_tail(page, &d->arch.relmem_list);
>>>>> +        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
>>>> Why the forced restart here?  If nothing needs pre-empting, surely it is
>>>> better to continue?
>>>>
>>>> Or is this about equality on the pcidevs_lock ?
>>>>
>>>>> +             hypercall_preempt_check() )
>>> Did you overlook this part of the condition?
>> No, but I did mentally get the logic inverted when trying to work out
>> what was going on.
>>
>> How about (++n > 0xff) ?
>>
>> If we have already spent a while in this loop, and the
>> hypercall_preempt_check() doesn't flip to 1 until a few iterations after
>> n is congruent with 0x100, waiting for another 0x100 iterations before
>> checking again seems a little long.
> The thing I'm trying to avoid is calling hypercall_preempt_check()
> overly often - especially when the prior if()'s body doesn't get
> entered we might otherwise do very little useful for per check.
>
> While 256 isn't overly much (covering merely 1Mb of guest space),
> I could see two possibilities to get this a little more towards what
> you want: Either we right shift the mask each time we actually
> call hypercall_preempt_check(), or we bias the increment (adding
> e.g. 10 when going the expensive route, else 1).
>
> Jan

I suppose it probably doesn't matter too much.  In the slim case where
the loop does manage to get to 257 iterations before the preempt check
returns true, doing another 254 iterations isn't going to skew the
timings too much.  And the net time till completion will be fractionally
shorter.
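
Back-of-the-envelope, for concreteness: with !(++n & 0xff) the check
lands at n = 0x100, 0x200, ...; if the check at n = 0x100 comes back
false and preemption becomes due shortly afterwards, the loop runs for
up to roughly 255 further pages before the next check, i.e. about 1MiB
of guest memory at 4KiB per page - in line with the "1Mb" figure
quoted above.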

~Andrew

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH v2 1/5] IOMMU: make page table population preemptible
  2013-12-11 18:40   ` Andrew Cooper
  2013-12-13  9:51     ` Jan Beulich
@ 2013-12-13 13:59     ` Jan Beulich
  2013-12-13 14:16       ` Tim Deegan
  2013-12-13 15:09       ` Andrew Cooper
  1 sibling, 2 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-13 13:59 UTC (permalink / raw)
  To: xen-devel
  Cc: George Dunlap, Andrew Cooper, Keir Fraser, xiantao.zhang,
	Tim Deegan

[-- Attachment #1: Type: text/plain, Size: 6253 bytes --]

Since this can take an arbitrary amount of time, the rooting domctl as
well as all involved code must become aware of this requiring a
continuation.

The subject domain's rel_mem_list is being (ab)used for this, in a way
similar to and compatible with broken page offlining.

Further, operations get slightly re-ordered in assign_device(): IOMMU
page tables now get set up _before_ the first device gets assigned, at
once closing a small timing window in which the guest may already see
the device but wouldn't be able to access it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Extend comment on struct domain's need_iommu field.

--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1924,6 +1924,12 @@ int domain_relinquish_resources(struct d
         }
 
         d->arch.relmem = RELMEM_xen;
+
+        spin_lock(&d->page_alloc_lock);
+        page_list_splice(&d->arch.relmem_list, &d->page_list);
+        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
+        spin_unlock(&d->page_alloc_lock);
+
         /* Fallthrough. Relinquish every page of memory. */
     case RELMEM_xen:
         ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
--- a/xen/arch/x86/mm/p2m-pod.c
+++ b/xen/arch/x86/mm/p2m-pod.c
@@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag
 
 pod_hit:
     lock_page_alloc(p2m);
-    page_list_add_tail(p, &d->arch.relmem_list);
+    /* Insertion must be at list head (see iommu_populate_page_table()). */
+    page_list_add(p, &d->arch.relmem_list);
     unlock_page_alloc(p2m);
     pod_unlock(p2m);
     return 1;
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -18,6 +18,7 @@
 #include <asm/hvm/iommu.h>
 #include <xen/paging.h>
 #include <xen/guest_access.h>
+#include <xen/event.h>
 #include <xen/softirq.h>
 #include <xen/keyhandler.h>
 #include <xsm/xsm.h>
@@ -265,7 +266,23 @@ static int assign_device(struct domain *
              d->mem_event->paging.ring_page)) )
         return -EXDEV;
 
-    spin_lock(&pcidevs_lock);
+    if ( !spin_trylock(&pcidevs_lock) )
+        return -ERESTART;
+
+    if ( need_iommu(d) <= 0 )
+    {
+        if ( !iommu_use_hap_pt(d) )
+        {
+            rc = iommu_populate_page_table(d);
+            if ( rc )
+            {
+                spin_unlock(&pcidevs_lock);
+                return rc;
+            }
+        }
+        d->need_iommu = 1;
+    }
+
     pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn);
     if ( !pdev )
     {
@@ -290,15 +307,14 @@ static int assign_device(struct domain *
                    rc);
     }
 
-    if ( has_arch_pdevs(d) && !need_iommu(d) )
+ done:
+    if ( !has_arch_pdevs(d) && need_iommu(d) )
     {
-        d->need_iommu = 1;
-        if ( !iommu_use_hap_pt(d) )
-            rc = iommu_populate_page_table(d);
-        goto done;
+        d->need_iommu = 0;
+        hd->platform_ops->teardown(d);
     }
-done:
     spin_unlock(&pcidevs_lock);
+
     return rc;
 }
 
@@ -306,12 +322,17 @@ static int iommu_populate_page_table(str
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
     struct page_info *page;
-    int rc = 0;
+    int rc = 0, n = 0;
+
+    d->need_iommu = -1;
 
     this_cpu(iommu_dont_flush_iotlb) = 1;
     spin_lock(&d->page_alloc_lock);
 
-    page_list_for_each ( page, &d->page_list )
+    if ( unlikely(d->is_dying) )
+        rc = -ESRCH;
+
+    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
     {
         if ( is_hvm_domain(d) ||
             (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
@@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
                 d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
                 IOMMUF_readable|IOMMUF_writable);
             if ( rc )
+            {
+                page_list_add(page, &d->page_list);
                 break;
+            }
+        }
+        page_list_add_tail(page, &d->arch.relmem_list);
+        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
+             hypercall_preempt_check() )
+            rc = -ERESTART;
+    }
+
+    if ( !rc )
+    {
+        /*
+         * The expectation here is that generally there are many normal pages
+         * on relmem_list (the ones we put there) and only few being in an
+         * offline/broken state. The latter ones are always at the head of the
+         * list. Hence we first move the whole list, and then move back the
+         * first few entries.
+         */
+        page_list_move(&d->page_list, &d->arch.relmem_list);
+        while ( (page = page_list_first(&d->page_list)) != NULL &&
+                (page->count_info & (PGC_state|PGC_broken)) )
+        {
+            page_list_del(page, &d->page_list);
+            page_list_add_tail(page, &d->arch.relmem_list);
         }
     }
 
@@ -330,8 +376,11 @@ static int iommu_populate_page_table(str
 
     if ( !rc )
         iommu_iotlb_flush_all(d);
-    else
+    else if ( rc != -ERESTART )
+    {
+        d->need_iommu = 0;
         hd->platform_ops->teardown(d);
+    }
 
     return rc;
 }
@@ -688,7 +737,10 @@ int iommu_do_domctl(
 
         ret = device_assigned(seg, bus, devfn) ?:
               assign_device(d, seg, bus, devfn);
-        if ( ret )
+        if ( ret == -ERESTART )
+            ret = hypercall_create_continuation(__HYPERVISOR_domctl,
+                                                "h", u_domctl);
+        else if ( ret )
             printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
                    "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
                    seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -322,8 +322,8 @@ struct domain
     enum guest_type guest_type;
 
 #ifdef HAS_PASSTHROUGH
-    /* Does this guest need iommu mappings? */
-    bool_t           need_iommu;
+    /* Does this guest need iommu mappings (-1 meaning "being set up")? */
+    s8               need_iommu;
 #endif
     /* is node-affinity automatically computed? */
     bool_t           auto_node_affinity;
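
Condensed view of the restart flow the patch introduces, assembled from
the hunks above (simplified, not a literal quote):

    /* iommu_populate_page_table(): back off once preemption is due;
     * pages already handled stay on d->arch.relmem_list. */
    if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
         hypercall_preempt_check() )
        rc = -ERESTART;

    /* assign_device(): the partial result is passed straight up ... */
    rc = iommu_populate_page_table(d);
    if ( rc )
    {
        spin_unlock(&pcidevs_lock);
        return rc;
    }

    /* ... and iommu_do_domctl() turns -ERESTART into a continuation, so
     * the toolstack's domctl is transparently re-issued and the
     * population loop resumes where it stopped. */
    if ( ret == -ERESTART )
        ret = hypercall_create_continuation(__HYPERVISOR_domctl,
                                            "h", u_domctl);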




^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH v2 2/5] IOMMU: make page table deallocation preemptible
  2013-12-11 19:01   ` Andrew Cooper
  2013-12-13  9:55     ` Jan Beulich
@ 2013-12-13 14:00     ` Jan Beulich
  2013-12-13 15:26       ` Andrew Cooper
  2014-01-07 14:51       ` Zhang, Xiantao
  1 sibling, 2 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-13 14:00 UTC (permalink / raw)
  To: xen-devel
  Cc: George Dunlap, Andrew Cooper, Keir Fraser, xiantao.zhang,
	suravee.suthikulpanit

[-- Attachment #1: Type: text/plain, Size: 7054 bytes --]

This too can take an arbitrary amount of time.

In fact, the bulk of the work is being moved to a tasklet, as handling
the necessary preemption logic in line seems close to impossible given
that the teardown may also be invoked on error paths.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: abstract out tasklet logic

--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -405,11 +405,21 @@ static int amd_iommu_assign_device(struc
     return reassign_device(dom0, d, devfn, pdev);
 }
 
-static void deallocate_next_page_table(struct page_info* pg, int level)
+static void deallocate_next_page_table(struct page_info *pg, int level)
+{
+    PFN_ORDER(pg) = level;
+    spin_lock(&iommu_pt_cleanup_lock);
+    page_list_add_tail(pg, &iommu_pt_cleanup_list);
+    spin_unlock(&iommu_pt_cleanup_lock);
+}
+
+static void deallocate_page_table(struct page_info *pg)
 {
     void *table_vaddr, *pde;
     u64 next_table_maddr;
-    int index, next_level;
+    unsigned int index, level = PFN_ORDER(pg), next_level;
+
+    PFN_ORDER(pg) = 0;
 
     if ( level <= 1 )
     {
@@ -599,6 +609,7 @@ const struct iommu_ops amd_iommu_ops = {
     .teardown = amd_iommu_domain_destroy,
     .map_page = amd_iommu_map_page,
     .unmap_page = amd_iommu_unmap_page,
+    .free_page_table = deallocate_page_table,
     .reassign_device = reassign_device,
     .get_device_group_id = amd_iommu_group_id,
     .update_ire_from_apic = amd_iommu_ioapic_update_ire,
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in
 
 DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
 
+DEFINE_SPINLOCK(iommu_pt_cleanup_lock);
+PAGE_LIST_HEAD(iommu_pt_cleanup_list);
+static struct tasklet iommu_pt_cleanup_tasklet;
+
 static struct keyhandler iommu_p2m_table = {
     .diagnostic = 0,
     .u.fn = iommu_dump_p2m_table,
@@ -235,6 +239,15 @@ int iommu_remove_device(struct pci_dev *
     return hd->platform_ops->remove_device(pdev->devfn, pdev);
 }
 
+static void iommu_teardown(struct domain *d)
+{
+    const struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    d->need_iommu = 0;
+    hd->platform_ops->teardown(d);
+    tasklet_schedule(&iommu_pt_cleanup_tasklet);
+}
+
 /*
  * If the device isn't owned by dom0, it means it already
  * has been assigned to other domain, or it doesn't exist.
@@ -309,10 +322,7 @@ static int assign_device(struct domain *
 
  done:
     if ( !has_arch_pdevs(d) && need_iommu(d) )
-    {
-        d->need_iommu = 0;
-        hd->platform_ops->teardown(d);
-    }
+        iommu_teardown(d);
     spin_unlock(&pcidevs_lock);
 
     return rc;
@@ -377,10 +387,7 @@ static int iommu_populate_page_table(str
     if ( !rc )
         iommu_iotlb_flush_all(d);
     else if ( rc != -ERESTART )
-    {
-        d->need_iommu = 0;
-        hd->platform_ops->teardown(d);
-    }
+        iommu_teardown(d);
 
     return rc;
 }
@@ -397,10 +404,7 @@ void iommu_domain_destroy(struct domain 
         return;
 
     if ( need_iommu(d) )
-    {
-        d->need_iommu = 0;
-        hd->platform_ops->teardown(d);
-    }
+        iommu_teardown(d);
 
     list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list )
     {
@@ -438,6 +442,23 @@ int iommu_unmap_page(struct domain *d, u
     return hd->platform_ops->unmap_page(d, gfn);
 }
 
+static void iommu_free_pagetables(unsigned long unused)
+{
+    do {
+        struct page_info *pg;
+
+        spin_lock(&iommu_pt_cleanup_lock);
+        pg = page_list_remove_head(&iommu_pt_cleanup_list);
+        spin_unlock(&iommu_pt_cleanup_lock);
+        if ( !pg )
+            return;
+        iommu_get_ops()->free_page_table(pg);
+    } while ( !softirq_pending(smp_processor_id()) );
+
+    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
+                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
+}
+
 void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count)
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
@@ -500,10 +521,7 @@ int deassign_device(struct domain *d, u1
     pdev->fault.count = 0;
 
     if ( !has_arch_pdevs(d) && need_iommu(d) )
-    {
-        d->need_iommu = 0;
-        hd->platform_ops->teardown(d);
-    }
+        iommu_teardown(d);
 
     return ret;
 }
@@ -542,6 +560,7 @@ int __init iommu_setup(void)
                iommu_passthrough ? "Passthrough" :
                iommu_dom0_strict ? "Strict" : "Relaxed");
         printk("Interrupt remapping %sabled\n", iommu_intremap ? "en" : "dis");
+        tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0);
     }
 
     return rc;
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom
 
 static void iommu_free_pagetable(u64 pt_maddr, int level)
 {
-    int i;
-    struct dma_pte *pt_vaddr, *pte;
-    int next_level = level - 1;
+    struct page_info *pg = maddr_to_page(pt_maddr);
 
     if ( pt_maddr == 0 )
         return;
 
+    PFN_ORDER(pg) = level;
+    spin_lock(&iommu_pt_cleanup_lock);
+    page_list_add_tail(pg, &iommu_pt_cleanup_list);
+    spin_unlock(&iommu_pt_cleanup_lock);
+}
+
+static void iommu_free_page_table(struct page_info *pg)
+{
+    unsigned int i, next_level = PFN_ORDER(pg) - 1;
+    u64 pt_maddr = page_to_maddr(pg);
+    struct dma_pte *pt_vaddr, *pte;
+
+    PFN_ORDER(pg) = 0;
     pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
 
     for ( i = 0; i < PTE_NUM; i++ )
@@ -2430,6 +2441,7 @@ const struct iommu_ops intel_iommu_ops =
     .teardown = iommu_domain_teardown,
     .map_page = intel_iommu_map_page,
     .unmap_page = intel_iommu_unmap_page,
+    .free_page_table = iommu_free_page_table,
     .reassign_device = reassign_device_ownership,
     .get_device_group_id = intel_iommu_group_id,
     .update_ire_from_apic = io_apic_write_remap_rte,
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -88,6 +88,7 @@ bool_t pt_irq_need_timer(uint32_t flags)
 
 struct msi_desc;
 struct msi_msg;
+struct page_info;
 
 struct iommu_ops {
     int (*init)(struct domain *d);
@@ -100,6 +101,7 @@ struct iommu_ops {
     int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn,
                     unsigned int flags);
     int (*unmap_page)(struct domain *d, unsigned long gfn);
+    void (*free_page_table)(struct page_info *);
     int (*reassign_device)(struct domain *s, struct domain *t,
 			   u8 devfn, struct pci_dev *);
     int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn);
@@ -151,4 +153,7 @@ int adjust_vtd_irq_affinities(void);
  */
 DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
 
+extern struct spinlock iommu_pt_cleanup_lock;
+extern struct page_list_head iommu_pt_cleanup_list;
+
 #endif /* _IOMMU_H_ */
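
Condensed view of the deferral mechanism, assembled from the hunks
above (simplified): the page-table level is stashed in PFN_ORDER(pg),
the table is queued on a global list, and a self-rescheduling tasklet
drains that list in bounded chunks.

    /* Producer side (teardown paths): queue the table, kick the tasklet. */
    PFN_ORDER(pg) = level;
    spin_lock(&iommu_pt_cleanup_lock);
    page_list_add_tail(pg, &iommu_pt_cleanup_list);
    spin_unlock(&iommu_pt_cleanup_lock);
    tasklet_schedule(&iommu_pt_cleanup_tasklet);

    /* Consumer side: free entries until a softirq is pending, then move
     * the tasklet to another online CPU so no single CPU is monopolised
     * by a large teardown. */
    do {
        pg = page_list_remove_head(&iommu_pt_cleanup_list); /* under lock */
        if ( !pg )
            return;
        iommu_get_ops()->free_page_table(pg);
    } while ( !softirq_pending(smp_processor_id()) );
    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
                            cpumask_cycle(smp_processor_id(),
                                          &cpu_online_map));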




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
  2013-12-13 13:59     ` [PATCH v2 " Jan Beulich
@ 2013-12-13 14:16       ` Tim Deegan
  2013-12-13 15:09       ` Andrew Cooper
  1 sibling, 0 replies; 35+ messages in thread
From: Tim Deegan @ 2013-12-13 14:16 UTC (permalink / raw)
  To: Jan Beulich
  Cc: George Dunlap, xen-devel, Keir Fraser, xiantao.zhang,
	Andrew Cooper

At 13:59 +0000 on 13 Dec (1386939579), Jan Beulich wrote:
> Since this can take an arbitrary amount of time, the rooting domctl as
> well as all involved code must become aware of this requiring a
> continuation.
> 
> The subject domain's rel_mem_list is being (ab)used for this, in a way
> similar to and compatible with broken page offlining.
> 
> Further, operations get slightly re-ordered in assign_device(): IOMMU
> page tables now get set up _before_ the first device gets assigned, at
> once closing a small timing window in which the guest may already see
> the device but wouldn't be able to access it.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Tim Deegan <tim@xen.org>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
  2013-12-13 13:59     ` [PATCH v2 " Jan Beulich
  2013-12-13 14:16       ` Tim Deegan
@ 2013-12-13 15:09       ` Andrew Cooper
  2013-12-13 15:41         ` Jan Beulich
  1 sibling, 1 reply; 35+ messages in thread
From: Andrew Cooper @ 2013-12-13 15:09 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: George Dunlap, Tim Deegan, Keir Fraser, xiantao.zhang


[-- Attachment #1.1: Type: text/plain, Size: 7207 bytes --]

On 13/12/2013 13:59, Jan Beulich wrote:
> Since this can take an arbitrary amount of time, the rooting domctl as
> well as all involved code must become aware of this requiring a
> continuation.
>
> The subject domain's rel_mem_list is being (ab)used for this, in a way
> similar to and compatible with broken page offlining.
>
> Further, operations get slightly re-ordered in assign_device(): IOMMU
> page tables now get set up _before_ the first device gets assigned, at
> once closing a small timing window in which the guest may already see
> the device but wouldn't be able to access it.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v2: Extend comment on struct domain's need_iommu field.
>
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1924,6 +1924,12 @@ int domain_relinquish_resources(struct d
>          }
>  
>          d->arch.relmem = RELMEM_xen;
> +
> +        spin_lock(&d->page_alloc_lock);
> +        page_list_splice(&d->arch.relmem_list, &d->page_list);
> +        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
> +        spin_unlock(&d->page_alloc_lock);
> +
>          /* Fallthrough. Relinquish every page of memory. */
>      case RELMEM_xen:
>          ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag
>  
>  pod_hit:
>      lock_page_alloc(p2m);
> -    page_list_add_tail(p, &d->arch.relmem_list);
> +    /* Insertion must be at list head (see iommu_populate_page_table()). */
> +    page_list_add(p, &d->arch.relmem_list);
>      unlock_page_alloc(p2m);
>      pod_unlock(p2m);
>      return 1;
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -18,6 +18,7 @@
>  #include <asm/hvm/iommu.h>
>  #include <xen/paging.h>
>  #include <xen/guest_access.h>
> +#include <xen/event.h>
>  #include <xen/softirq.h>
>  #include <xen/keyhandler.h>
>  #include <xsm/xsm.h>
> @@ -265,7 +266,23 @@ static int assign_device(struct domain *
>               d->mem_event->paging.ring_page)) )
>          return -EXDEV;
>  
> -    spin_lock(&pcidevs_lock);
> +    if ( !spin_trylock(&pcidevs_lock) )
> +        return -ERESTART;
> +
> +    if ( need_iommu(d) <= 0 )
> +    {
> +        if ( !iommu_use_hap_pt(d) )
> +        {
> +            rc = iommu_populate_page_table(d);
> +            if ( rc )
> +            {
> +                spin_unlock(&pcidevs_lock);
> +                return rc;
> +            }
> +        }
> +        d->need_iommu = 1;
> +    }
> +
>      pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn);
>      if ( !pdev )
>      {
> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>                     rc);
>      }
>  
> -    if ( has_arch_pdevs(d) && !need_iommu(d) )
> + done:
> +    if ( !has_arch_pdevs(d) && need_iommu(d) )

We now have a case where, for the first device, we could set up
pagetables for a large domain, get an error with assignment, then tear
them all back down.  (-EBUSY from pci_get_pdev() looks like a good
non-fatal candidate for causing this behaviour)

I am wondering whether this is better or worse than the race condition
where a guest couldn't use the device.  A guest could not reasonably
expect to use a device before the toolstack is done setting it up.   A
buggy toolstack could quite easily tie up a lot of Xen time creating and
destroying complete iommu pagetable sets.

~Andrew

>      {
> -        d->need_iommu = 1;
> -        if ( !iommu_use_hap_pt(d) )
> -            rc = iommu_populate_page_table(d);
> -        goto done;
> +        d->need_iommu = 0;
> +        hd->platform_ops->teardown(d);
>      }
> -done:
>      spin_unlock(&pcidevs_lock);
> +
>      return rc;
>  }
>  
> @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str
>  {
>      struct hvm_iommu *hd = domain_hvm_iommu(d);
>      struct page_info *page;
> -    int rc = 0;
> +    int rc = 0, n = 0;
> +
> +    d->need_iommu = -1;
>  
>      this_cpu(iommu_dont_flush_iotlb) = 1;
>      spin_lock(&d->page_alloc_lock);
>  
> -    page_list_for_each ( page, &d->page_list )
> +    if ( unlikely(d->is_dying) )
> +        rc = -ESRCH;
> +
> +    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
>      {
>          if ( is_hvm_domain(d) ||
>              (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
>                  d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
>                  IOMMUF_readable|IOMMUF_writable);
>              if ( rc )
> +            {
> +                page_list_add(page, &d->page_list);
>                  break;
> +            }
> +        }
> +        page_list_add_tail(page, &d->arch.relmem_list);
> +        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
> +             hypercall_preempt_check() )
> +            rc = -ERESTART;
> +    }
> +
> +    if ( !rc )
> +    {
> +        /*
> +         * The expectation here is that generally there are many normal pages
> +         * on relmem_list (the ones we put there) and only few being in an
> +         * offline/broken state. The latter ones are always at the head of the
> +         * list. Hence we first move the whole list, and then move back the
> +         * first few entries.
> +         */
> +        page_list_move(&d->page_list, &d->arch.relmem_list);
> +        while ( (page = page_list_first(&d->page_list)) != NULL &&
> +                (page->count_info & (PGC_state|PGC_broken)) )
> +        {
> +            page_list_del(page, &d->page_list);
> +            page_list_add_tail(page, &d->arch.relmem_list);
>          }
>      }
>  
> @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str
>  
>      if ( !rc )
>          iommu_iotlb_flush_all(d);
> -    else
> +    else if ( rc != -ERESTART )
> +    {
> +        d->need_iommu = 0;
>          hd->platform_ops->teardown(d);
> +    }
>  
>      return rc;
>  }
> @@ -688,7 +737,10 @@ int iommu_do_domctl(
>  
>          ret = device_assigned(seg, bus, devfn) ?:
>                assign_device(d, seg, bus, devfn);
> -        if ( ret )
> +        if ( ret == -ERESTART )
> +            ret = hypercall_create_continuation(__HYPERVISOR_domctl,
> +                                                "h", u_domctl);
> +        else if ( ret )
>              printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
>                     "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
>                     seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -322,8 +322,8 @@ struct domain
>      enum guest_type guest_type;
>  
>  #ifdef HAS_PASSTHROUGH
> -    /* Does this guest need iommu mappings? */
> -    bool_t           need_iommu;
> +    /* Does this guest need iommu mappings (-1 meaning "being set up")? */
> +    s8               need_iommu;
>  #endif
>      /* is node-affinity automatically computed? */
>      bool_t           auto_node_affinity;

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 2/5] IOMMU: make page table deallocation preemptible
  2013-12-13 14:00     ` [PATCH v2 " Jan Beulich
@ 2013-12-13 15:26       ` Andrew Cooper
  2014-01-07 14:51       ` Zhang, Xiantao
  1 sibling, 0 replies; 35+ messages in thread
From: Andrew Cooper @ 2013-12-13 15:26 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: George Dunlap, Keir Fraser, xiantao.zhang, suravee.suthikulpanit

On 13/12/2013 14:00, Jan Beulich wrote:
> This too can take an arbitrary amount of time.
>
> In fact, the bulk of the work is being moved to a tasklet, as handling
> the necessary preemption logic in line seems close to impossible given
> that the teardown may also be invoked on error paths.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

> ---
> v2: abstract out tasklet logic
>
> --- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
> +++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
> @@ -405,11 +405,21 @@ static int amd_iommu_assign_device(struc
>      return reassign_device(dom0, d, devfn, pdev);
>  }
>  
> -static void deallocate_next_page_table(struct page_info* pg, int level)
> +static void deallocate_next_page_table(struct page_info *pg, int level)
> +{
> +    PFN_ORDER(pg) = level;
> +    spin_lock(&iommu_pt_cleanup_lock);
> +    page_list_add_tail(pg, &iommu_pt_cleanup_list);
> +    spin_unlock(&iommu_pt_cleanup_lock);
> +}
> +
> +static void deallocate_page_table(struct page_info *pg)
>  {
>      void *table_vaddr, *pde;
>      u64 next_table_maddr;
> -    int index, next_level;
> +    unsigned int index, level = PFN_ORDER(pg), next_level;
> +
> +    PFN_ORDER(pg) = 0;
>  
>      if ( level <= 1 )
>      {
> @@ -599,6 +609,7 @@ const struct iommu_ops amd_iommu_ops = {
>      .teardown = amd_iommu_domain_destroy,
>      .map_page = amd_iommu_map_page,
>      .unmap_page = amd_iommu_unmap_page,
> +    .free_page_table = deallocate_page_table,
>      .reassign_device = reassign_device,
>      .get_device_group_id = amd_iommu_group_id,
>      .update_ire_from_apic = amd_iommu_ioapic_update_ire,
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in
>  
>  DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
>  
> +DEFINE_SPINLOCK(iommu_pt_cleanup_lock);
> +PAGE_LIST_HEAD(iommu_pt_cleanup_list);
> +static struct tasklet iommu_pt_cleanup_tasklet;
> +
>  static struct keyhandler iommu_p2m_table = {
>      .diagnostic = 0,
>      .u.fn = iommu_dump_p2m_table,
> @@ -235,6 +239,15 @@ int iommu_remove_device(struct pci_dev *
>      return hd->platform_ops->remove_device(pdev->devfn, pdev);
>  }
>  
> +static void iommu_teardown(struct domain *d)
> +{
> +    const struct hvm_iommu *hd = domain_hvm_iommu(d);
> +
> +    d->need_iommu = 0;
> +    hd->platform_ops->teardown(d);
> +    tasklet_schedule(&iommu_pt_cleanup_tasklet);
> +}
> +
>  /*
>   * If the device isn't owned by dom0, it means it already
>   * has been assigned to other domain, or it doesn't exist.
> @@ -309,10 +322,7 @@ static int assign_device(struct domain *
>  
>   done:
>      if ( !has_arch_pdevs(d) && need_iommu(d) )
> -    {
> -        d->need_iommu = 0;
> -        hd->platform_ops->teardown(d);
> -    }
> +        iommu_teardown(d);
>      spin_unlock(&pcidevs_lock);
>  
>      return rc;
> @@ -377,10 +387,7 @@ static int iommu_populate_page_table(str
>      if ( !rc )
>          iommu_iotlb_flush_all(d);
>      else if ( rc != -ERESTART )
> -    {
> -        d->need_iommu = 0;
> -        hd->platform_ops->teardown(d);
> -    }
> +        iommu_teardown(d);
>  
>      return rc;
>  }
> @@ -397,10 +404,7 @@ void iommu_domain_destroy(struct domain 
>          return;
>  
>      if ( need_iommu(d) )
> -    {
> -        d->need_iommu = 0;
> -        hd->platform_ops->teardown(d);
> -    }
> +        iommu_teardown(d);
>  
>      list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list )
>      {
> @@ -438,6 +442,23 @@ int iommu_unmap_page(struct domain *d, u
>      return hd->platform_ops->unmap_page(d, gfn);
>  }
>  
> +static void iommu_free_pagetables(unsigned long unused)
> +{
> +    do {
> +        struct page_info *pg;
> +
> +        spin_lock(&iommu_pt_cleanup_lock);
> +        pg = page_list_remove_head(&iommu_pt_cleanup_list);
> +        spin_unlock(&iommu_pt_cleanup_lock);
> +        if ( !pg )
> +            return;
> +        iommu_get_ops()->free_page_table(pg);
> +    } while ( !softirq_pending(smp_processor_id()) );
> +
> +    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
> +                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
> +}
> +
>  void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count)
>  {
>      struct hvm_iommu *hd = domain_hvm_iommu(d);
> @@ -500,10 +521,7 @@ int deassign_device(struct domain *d, u1
>      pdev->fault.count = 0;
>  
>      if ( !has_arch_pdevs(d) && need_iommu(d) )
> -    {
> -        d->need_iommu = 0;
> -        hd->platform_ops->teardown(d);
> -    }
> +        iommu_teardown(d);
>  
>      return ret;
>  }
> @@ -542,6 +560,7 @@ int __init iommu_setup(void)
>                 iommu_passthrough ? "Passthrough" :
>                 iommu_dom0_strict ? "Strict" : "Relaxed");
>          printk("Interrupt remapping %sabled\n", iommu_intremap ? "en" : "dis");
> +        tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0);
>      }
>  
>      return rc;
> --- a/xen/drivers/passthrough/vtd/iommu.c
> +++ b/xen/drivers/passthrough/vtd/iommu.c
> @@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom
>  
>  static void iommu_free_pagetable(u64 pt_maddr, int level)
>  {
> -    int i;
> -    struct dma_pte *pt_vaddr, *pte;
> -    int next_level = level - 1;
> +    struct page_info *pg = maddr_to_page(pt_maddr);
>  
>      if ( pt_maddr == 0 )
>          return;
>  
> +    PFN_ORDER(pg) = level;
> +    spin_lock(&iommu_pt_cleanup_lock);
> +    page_list_add_tail(pg, &iommu_pt_cleanup_list);
> +    spin_unlock(&iommu_pt_cleanup_lock);
> +}
> +
> +static void iommu_free_page_table(struct page_info *pg)
> +{
> +    unsigned int i, next_level = PFN_ORDER(pg) - 1;
> +    u64 pt_maddr = page_to_maddr(pg);
> +    struct dma_pte *pt_vaddr, *pte;
> +
> +    PFN_ORDER(pg) = 0;
>      pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
>  
>      for ( i = 0; i < PTE_NUM; i++ )
> @@ -2430,6 +2441,7 @@ const struct iommu_ops intel_iommu_ops =
>      .teardown = iommu_domain_teardown,
>      .map_page = intel_iommu_map_page,
>      .unmap_page = intel_iommu_unmap_page,
> +    .free_page_table = iommu_free_page_table,
>      .reassign_device = reassign_device_ownership,
>      .get_device_group_id = intel_iommu_group_id,
>      .update_ire_from_apic = io_apic_write_remap_rte,
> --- a/xen/include/xen/iommu.h
> +++ b/xen/include/xen/iommu.h
> @@ -88,6 +88,7 @@ bool_t pt_irq_need_timer(uint32_t flags)
>  
>  struct msi_desc;
>  struct msi_msg;
> +struct page_info;
>  
>  struct iommu_ops {
>      int (*init)(struct domain *d);
> @@ -100,6 +101,7 @@ struct iommu_ops {
>      int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn,
>                      unsigned int flags);
>      int (*unmap_page)(struct domain *d, unsigned long gfn);
> +    void (*free_page_table)(struct page_info *);
>      int (*reassign_device)(struct domain *s, struct domain *t,
>  			   u8 devfn, struct pci_dev *);
>      int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn);
> @@ -151,4 +153,7 @@ int adjust_vtd_irq_affinities(void);
>   */
>  DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
>  
> +extern struct spinlock iommu_pt_cleanup_lock;
> +extern struct page_list_head iommu_pt_cleanup_list;
> +
>  #endif /* _IOMMU_H_ */
>
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
  2013-12-13 15:09       ` Andrew Cooper
@ 2013-12-13 15:41         ` Jan Beulich
  2013-12-13 15:44           ` Andrew Cooper
  0 siblings, 1 reply; 35+ messages in thread
From: Jan Beulich @ 2013-12-13 15:41 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: George Dunlap, xen-devel, Keir Fraser, xiantao.zhang, Tim Deegan

>>> On 13.12.13 at 16:09, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 13/12/2013 13:59, Jan Beulich wrote:
>> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>>                     rc);
>>      }
>>  
>> -    if ( has_arch_pdevs(d) && !need_iommu(d) )
>> + done:
>> +    if ( !has_arch_pdevs(d) && need_iommu(d) )
> 
> We now have a case where, for the first device, we could set up
> pagetables for a large domain, get an error with assignment, then tear
> them all back down.  (-EBUSY from pci_get_pdev() looks like a good
> non-fatal candidate for causing this behaviour)
> 
> I am wondering whether this is better or worse than the race condition
> where a guest couldn't use the device.  A guest could not reasonably
> expect to use a device before the toolstack is done setting it up.   A
> buggy toolstack could quite easily tie up a lot of Xen time creating and
> destroying complete iommu pagetable sets.

I don't think it's worth worrying about buggy tool stacks here -
they should simply get fixed.

Furthermore this change in operation ordering is only a (nice)
side effect, the necessary cleanup seemed easier with the
order changed. And said time window would have grown with
the added preemption handling.

Jan
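
(A minimal stand-alone sketch of the preemption/continuation pattern discussed
in this sub-thread, for readers following along.  This is not Xen code: the
ERESTART_SKETCH constant, preempt_pending() and the plain main() loop are
illustrative stand-ins; in the series itself the restart is driven by
hypercall_preempt_check() and hypercall_create_continuation() re-issuing the
domctl.)

/* Do a bounded batch of work, check for "preemption" every 256 items, and
 * hand back -ERESTART so the caller can re-invoke the operation. */
#include <stdio.h>
#include <stdbool.h>

#define ERESTART_SKETCH 85          /* stand-in for Xen's -ERESTART */

static unsigned int done;           /* progress survives across invocations */

/* Pretend "preemption is pending" once an invocation has done 1024 items. */
static bool preempt_pending(unsigned int processed)
{
    return processed >= 1024;
}

/* One (re)invocation of the preemptible operation. */
static int populate(unsigned int total)
{
    unsigned int n = 0;

    while ( done < total )
    {
        ++done;                     /* "map" one page */
        /* Check only every 256 iterations, as the patch does with
         * "!(++n & 0xff)", to keep the check cheap. */
        if ( !(++n & 0xff) && preempt_pending(n) )
            return -ERESTART_SKETCH;
    }
    return 0;
}

int main(void)
{
    const unsigned int total = 5000;
    int rc, invocations = 0;

    /* The caller's continuation loop: re-issue the operation until it no
     * longer asks to be restarted. */
    do {
        rc = populate(total);
        ++invocations;
    } while ( rc == -ERESTART_SKETCH );

    printf("completed %u items in %d invocations, rc=%d\n",
           done, invocations, rc);
    return 0;
}

(Built with any C99 compiler, this finishes the 5000 simulated mappings in
five invocations, mirroring how the domctl is re-entered until it stops
returning -ERESTART.)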


* Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
  2013-12-13 15:41         ` Jan Beulich
@ 2013-12-13 15:44           ` Andrew Cooper
  2013-12-30 13:43             ` Zhang, Xiantao
  0 siblings, 1 reply; 35+ messages in thread
From: Andrew Cooper @ 2013-12-13 15:44 UTC (permalink / raw)
  To: Jan Beulich
  Cc: George Dunlap, xen-devel, Keir Fraser, xiantao.zhang, Tim Deegan

On 13/12/2013 15:41, Jan Beulich wrote:
>>>> On 13.12.13 at 16:09, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 13/12/2013 13:59, Jan Beulich wrote:
>>> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>>>                     rc);
>>>      }
>>>  
>>> -    if ( has_arch_pdevs(d) && !need_iommu(d) )
>>> + done:
>>> +    if ( !has_arch_pdevs(d) && need_iommu(d) )
>> We now have a case where, for the first device, we could set up
>> pagetables for a large domain, get an error with assignment, then tear
>> them all back down.  (-EBUSY from pci_get_pdev() looks like a good
>> non-fatal candidate for causing this behaviour)
>>
>> I am wondering whether this is better or worse than the race condition
>> where a guest couldn't use the device.  A guest could not reasonably
>> expect to use a device before the toolstack is done setting it up.   A
>> buggy toolstack could quite easily tie up a lot of Xen time creating and
>> destroying complete iommu pagetable sets.
> I don't think it's worth worrying about buggy tool stacks here -
> they should simply get fixed.
>
> Furthermore this change in operation ordering is only a (nice)
> side effect, the necessary cleanup seemed easier with the
> order changed. And said time window would have grown with
> the added preemption handling.
>
> Jan
>

All true.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Ping: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
  2013-12-10 15:48 ` [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one() Jan Beulich
  2013-12-10 16:01   ` Andrew Cooper
  2013-12-10 17:32   ` George Dunlap
@ 2013-12-17  9:16   ` Jan Beulich
  2013-12-17 10:45     ` Andrew Cooper
  2 siblings, 1 reply; 35+ messages in thread
From: Jan Beulich @ 2013-12-17  9:16 UTC (permalink / raw)
  To: Keir Fraser; +Cc: George Dunlap, xen-devel, Don Slutz

>>> On 10.12.13 at 16:48, "Jan Beulich" <JBeulich@suse.com> wrote:
> When one or more of the vCPU-s of a guest are offline, no data may be
> put into the allocated space for them and, due to another bug, such
> uninitialized data may be passed back to the caller.
> 
> Signed-off-by: Don Slutz <dslutz@verizon.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

Keir?

> --- a/xen/common/hvm/save.c
> +++ b/xen/common/hvm/save.c
> @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1
>          return -EINVAL;
>  
>      ctxt.size = sz;
> -    ctxt.data = xmalloc_bytes(sz);
> +    ctxt.data = xzalloc_bytes(sz);
>      if ( !ctxt.data )
>          return -ENOMEM;
>  


* Re: Ping: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
  2013-12-17  9:16   ` Ping: " Jan Beulich
@ 2013-12-17 10:45     ` Andrew Cooper
  2013-12-17 11:02       ` Jan Beulich
  0 siblings, 1 reply; 35+ messages in thread
From: Andrew Cooper @ 2013-12-17 10:45 UTC (permalink / raw)
  To: Jan Beulich; +Cc: George Dunlap, xen-devel, Keir Fraser, Don Slutz

On 17/12/13 09:16, Jan Beulich wrote:
>>>> On 10.12.13 at 16:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>> When one or more of the vCPU-s of a guest are offline, no data may be
>> put into the allocated space for them and, due to another bug, such
>> uninitialized data may be passed back to the caller.
>>
>> Signed-off-by: Don Slutz <dslutz@verizon.com>
>> Acked-by: Ian Campbell <ian.campbell@citrix.com>
>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> Keir?

This issue is completely fixed by the latest patch, is it not?

With the latest patch, we always copy out of the written subset of
ctxt.data, even if ctxt.size is larger.

~Andrew

>
>> --- a/xen/common/hvm/save.c
>> +++ b/xen/common/hvm/save.c
>> @@ -102,7 +102,7 @@ int hvm_save_one(struct domain *d, uint1
>>          return -EINVAL;
>>  
>>      ctxt.size = sz;
>> -    ctxt.data = xmalloc_bytes(sz);
>> +    ctxt.data = xzalloc_bytes(sz);
>>      if ( !ctxt.data )
>>          return -ENOMEM;
>>  
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel


* Re: Ping: [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one()
  2013-12-17 10:45     ` Andrew Cooper
@ 2013-12-17 11:02       ` Jan Beulich
  0 siblings, 0 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-17 11:02 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: George Dunlap, xen-devel, Keir Fraser, Don Slutz

>>> On 17.12.13 at 11:45, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 17/12/13 09:16, Jan Beulich wrote:
>>>>> On 10.12.13 at 16:48, "Jan Beulich" <JBeulich@suse.com> wrote:
>>> When one or more of the vCPU-s of a guest are offline, no data may be
>>> put into the allocated space for them and, due to another bug, such
>>> uninitialized data may be passed back to the caller.
>>>
>>> Signed-off-by: Don Slutz <dslutz@verizon.com>
>>> Acked-by: Ian Campbell <ian.campbell@citrix.com>
>>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>> Keir?
> 
> This issue is completely fixed by the latest patch is it not?
> 
> With the latest patch, we always copy out of the written subset of
> ctxt.data, even if ctxt.size is larger.

Indeed, the patch here shouldn't be needed anymore with the
latest version of Don's.

Jan
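
(To illustrate the class of bug addressed here, a stand-alone C sketch
follows; it is not the Xen code path.  malloc() and calloc() stand in for
xmalloc_bytes() and xzalloc_bytes(), the vCPU count and per-vCPU slot size
are arbitrary, and the heap-dirtying step only exists to make the leak
visible on most allocators.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define VCPUS     4
#define PER_VCPU 16

/* Pretend only vCPU 0 is online, so only its slot gets written. */
static void save_online_vcpus(unsigned char *buf)
{
    memset(buf, 0xAB, PER_VCPU);
}

static void dump(const char *tag, const unsigned char *buf, size_t len)
{
    printf("%-12s:", tag);
    for ( size_t i = 0; i < len; i++ )
        printf("%s%02x", i % PER_VCPU ? "" : " | ", buf[i]);
    printf("\n");
}

int main(void)
{
    size_t sz = VCPUS * PER_VCPU;

    /* Dirty the heap so the uninitialized allocation below is likely to
     * reuse a chunk that still contains old data (demonstration aid only). */
    unsigned char *scratch = malloc(sz);
    if ( scratch )
    {
        memset(scratch, 0xEE, sz);
        free(scratch);
    }

    unsigned char *leaky = malloc(sz);    /* like xmalloc_bytes(): uninitialized */
    unsigned char *clean = calloc(1, sz); /* like xzalloc_bytes(): zero-filled  */
    if ( !leaky || !clean )
        return 1;

    save_online_vcpus(leaky);
    save_online_vcpus(clean);

    /* Copying the whole buffer back to the caller exposes slots 1..3:
     * stale heap bytes in the first case, zeros in the second. */
    dump("xmalloc-like", leaky, sz);
    dump("xzalloc-like", clean, sz);

    free(leaky);
    free(clean);
    return 0;
}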


* Re: [PATCH 1/5] IOMMU: make page table population preemptible
  2013-12-10 15:45 ` [PATCH 1/5] IOMMU: make page table population preemptible Jan Beulich
  2013-12-11 18:40   ` Andrew Cooper
@ 2013-12-20 13:06   ` Jan Beulich
  1 sibling, 0 replies; 35+ messages in thread
From: Jan Beulich @ 2013-12-20 13:06 UTC (permalink / raw)
  To: xiantao.zhang; +Cc: George Dunlap, xen-devel, Keir Fraser, Tim Deegan

>>> On 10.12.13 at 16:45, "Jan Beulich" <JBeulich@suse.com> wrote:
> Since this can take an arbitrary amount of time, the rooting domctl as
> well as all involved code must become aware of this requiring a
> continuation.
> 
> The subject domain's rel_mem_list is being (ab)used for this, in a way
> similar to and compatible with broken page offlining.
> 
> Further, operations get slightly re-ordered in assign_device(): IOMMU
> page tables now get set up _before_ the first device gets assigned, at
> once closing a small timing window in which the guest may already see
> the device but wouldn't be able to access it.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Ping (also for patch 2 of this series)?

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1923,6 +1923,12 @@ int domain_relinquish_resources(struct d
>          }
>  
>          d->arch.relmem = RELMEM_xen;
> +
> +        spin_lock(&d->page_alloc_lock);
> +        page_list_splice(&d->arch.relmem_list, &d->page_list);
> +        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
> +        spin_unlock(&d->page_alloc_lock);
> +
>          /* Fallthrough. Relinquish every page of memory. */
>      case RELMEM_xen:
>          ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
> --- a/xen/arch/x86/mm/p2m-pod.c
> +++ b/xen/arch/x86/mm/p2m-pod.c
> @@ -459,7 +459,8 @@ p2m_pod_offline_or_broken_hit(struct pag
>  
>  pod_hit:
>      lock_page_alloc(p2m);
> -    page_list_add_tail(p, &d->arch.relmem_list);
> +    /* Insertion must be at list head (see iommu_populate_page_table()). */
> +    page_list_add(p, &d->arch.relmem_list);
>      unlock_page_alloc(p2m);
>      pod_unlock(p2m);
>      return 1;
> --- a/xen/drivers/passthrough/iommu.c
> +++ b/xen/drivers/passthrough/iommu.c
> @@ -18,6 +18,7 @@
>  #include <asm/hvm/iommu.h>
>  #include <xen/paging.h>
>  #include <xen/guest_access.h>
> +#include <xen/event.h>
>  #include <xen/softirq.h>
>  #include <xen/keyhandler.h>
>  #include <xsm/xsm.h>
> @@ -265,7 +266,23 @@ static int assign_device(struct domain *
>               d->mem_event->paging.ring_page)) )
>          return -EXDEV;
>  
> -    spin_lock(&pcidevs_lock);
> +    if ( !spin_trylock(&pcidevs_lock) )
> +        return -ERESTART;
> +
> +    if ( need_iommu(d) <= 0 )
> +    {
> +        if ( !iommu_use_hap_pt(d) )
> +        {
> +            rc = iommu_populate_page_table(d);
> +            if ( rc )
> +            {
> +                spin_unlock(&pcidevs_lock);
> +                return rc;
> +            }
> +        }
> +        d->need_iommu = 1;
> +    }
> +
>      pdev = pci_get_pdev_by_domain(dom0, seg, bus, devfn);
>      if ( !pdev )
>      {
> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>                     rc);
>      }
>  
> -    if ( has_arch_pdevs(d) && !need_iommu(d) )
> + done:
> +    if ( !has_arch_pdevs(d) && need_iommu(d) )
>      {
> -        d->need_iommu = 1;
> -        if ( !iommu_use_hap_pt(d) )
> -            rc = iommu_populate_page_table(d);
> -        goto done;
> +        d->need_iommu = 0;
> +        hd->platform_ops->teardown(d);
>      }
> -done:
>      spin_unlock(&pcidevs_lock);
> +
>      return rc;
>  }
>  
> @@ -306,12 +322,17 @@ static int iommu_populate_page_table(str
>  {
>      struct hvm_iommu *hd = domain_hvm_iommu(d);
>      struct page_info *page;
> -    int rc = 0;
> +    int rc = 0, n = 0;
> +
> +    d->need_iommu = -1;
>  
>      this_cpu(iommu_dont_flush_iotlb) = 1;
>      spin_lock(&d->page_alloc_lock);
>  
> -    page_list_for_each ( page, &d->page_list )
> +    if ( unlikely(d->is_dying) )
> +        rc = -ESRCH;
> +
> +    while ( !rc && (page = page_list_remove_head(&d->page_list)) )
>      {
>          if ( is_hvm_domain(d) ||
>              (page->u.inuse.type_info & PGT_type_mask) == PGT_writable_page )
> @@ -321,7 +342,32 @@ static int iommu_populate_page_table(str
>                  d, mfn_to_gmfn(d, page_to_mfn(page)), page_to_mfn(page),
>                  IOMMUF_readable|IOMMUF_writable);
>              if ( rc )
> +            {
> +                page_list_add(page, &d->page_list);
>                  break;
> +            }
> +        }
> +        page_list_add_tail(page, &d->arch.relmem_list);
> +        if ( !(++n & 0xff) && !page_list_empty(&d->page_list) &&
> +             hypercall_preempt_check() )
> +            rc = -ERESTART;
> +    }
> +
> +    if ( !rc )
> +    {
> +        /*
> +         * The expectation here is that generally there are many normal pages
> +         * on relmem_list (the ones we put there) and only few being in an
> +         * offline/broken state. The latter ones are always at the head of the
> +         * list. Hence we first move the whole list, and then move back the
> +         * first few entries.
> +         */
> +        page_list_move(&d->page_list, &d->arch.relmem_list);
> +        while ( (page = page_list_first(&d->page_list)) != NULL &&
> +                (page->count_info & (PGC_state|PGC_broken)) )
> +        {
> +            page_list_del(page, &d->page_list);
> +            page_list_add_tail(page, &d->arch.relmem_list);
>          }
>      }
>  
> @@ -330,8 +376,11 @@ static int iommu_populate_page_table(str
>  
>      if ( !rc )
>          iommu_iotlb_flush_all(d);
> -    else
> +    else if ( rc != -ERESTART )
> +    {
>          hd->platform_ops->teardown(d);
> +        d->need_iommu = 0;
> +    }
>  
>      return rc;
>  }
> @@ -688,7 +737,10 @@ int iommu_do_domctl(
>  
>          ret = device_assigned(seg, bus, devfn) ?:
>                assign_device(d, seg, bus, devfn);
> -        if ( ret )
> +        if ( ret == -ERESTART )
> +            ret = hypercall_create_continuation(__HYPERVISOR_domctl,
> +                                                "h", u_domctl);
> +        else if ( ret )
>              printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
>                     "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
>                     seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -323,7 +323,7 @@ struct domain
>  
>  #ifdef HAS_PASSTHROUGH
>      /* Does this guest need iommu mappings? */
> -    bool_t           need_iommu;
> +    s8               need_iommu;
>  #endif
>      /* is node-affinity automatically computed? */
>      bool_t           auto_node_affinity;
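
(The comment in the hunk above carries the key reasoning, so here is a
stand-alone sketch of the list shuffling it describes.  This is not Xen's
page_list implementation: it is a plain singly linked list in which a
"broken" flag stands in for PGC_state/PGC_broken, used only to show why
splicing the whole list back and then moving the flagged head entries
restores both lists correctly.)

#include <stdio.h>

struct page {
    int broken;                 /* stand-in for PGC_state/PGC_broken */
    struct page *next;
};

struct list { struct page *head, *tail; };

static void add_head(struct list *l, struct page *p)
{
    p->next = l->head;
    l->head = p;
    if ( !l->tail )
        l->tail = p;
}

static void add_tail(struct list *l, struct page *p)
{
    p->next = NULL;
    if ( l->tail )
        l->tail->next = p;
    else
        l->head = p;
    l->tail = p;
}

static struct page *remove_head(struct list *l)
{
    struct page *p = l->head;

    if ( p )
    {
        l->head = p->next;
        if ( !l->head )
            l->tail = NULL;
        p->next = NULL;
    }
    return p;
}

int main(void)
{
    struct list page_list = { 0 }, relmem_list = { 0 };
    struct page pages[6] = { { 1 }, { 1 }, { 0 }, { 0 }, { 0 }, { 0 } };
    struct page *p;

    /* Offline/broken pages sit at the head of relmem_list; normal pages
     * start out on page_list. */
    add_head(&relmem_list, &pages[0]);
    add_head(&relmem_list, &pages[1]);
    for ( int i = 2; i < 6; i++ )
        add_tail(&page_list, &pages[i]);

    /* "Population": processed pages move to relmem_list's tail. */
    while ( (p = remove_head(&page_list)) != NULL )
        add_tail(&relmem_list, p);

    /* Completion: move the whole list back, then return only the broken
     * entries (always at the head) to relmem_list. */
    page_list = relmem_list;
    relmem_list.head = relmem_list.tail = NULL;
    while ( (p = page_list.head) != NULL && p->broken )
    {
        remove_head(&page_list);
        add_tail(&relmem_list, p);
    }

    printf("page_list  :");
    for ( p = page_list.head; p; p = p->next )
        printf(" %s", p->broken ? "broken" : "normal");
    printf("\nrelmem_list:");
    for ( p = relmem_list.head; p; p = p->next )
        printf(" %s", p->broken ? "broken" : "normal");
    printf("\n");
    return 0;
}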


* Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
  2013-12-13 15:44           ` Andrew Cooper
@ 2013-12-30 13:43             ` Zhang, Xiantao
  2014-01-07 13:23               ` Jan Beulich
  0 siblings, 1 reply; 35+ messages in thread
From: Zhang, Xiantao @ 2013-12-30 13:43 UTC (permalink / raw)
  To: Andrew Cooper, Jan Beulich
  Cc: George Dunlap, xen-devel, Keir Fraser, Tim Deegan

Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Xiantao
-----Original Message-----
From: Andrew Cooper [mailto:andrew.cooper3@citrix.com] 
Sent: Friday, December 13, 2013 11:44 PM
To: Jan Beulich
Cc: George Dunlap; Zhang, Xiantao; xen-devel; Keir Fraser; Tim Deegan
Subject: Re: [Xen-devel] [PATCH v2 1/5] IOMMU: make page table population preemptible

On 13/12/2013 15:41, Jan Beulich wrote:
>>>> On 13.12.13 at 16:09, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>> On 13/12/2013 13:59, Jan Beulich wrote:
>>> @@ -290,15 +307,14 @@ static int assign_device(struct domain *
>>>                     rc);
>>>      }
>>>  
>>> -    if ( has_arch_pdevs(d) && !need_iommu(d) )
>>> + done:
>>> +    if ( !has_arch_pdevs(d) && need_iommu(d) )
>> We now have a case where, for the first device, we could set up 
>> pagetables for a large domain, get an error with assignment, then 
>> tear them all back down.  (-EBUSY from pci_get_pdev() looks like a 
>> good non-fatal candidate for causing this behaviour)
>>
>> I am wondering whether this is better or worse than the race 
>> condition where a guest couldn't use the device.  A guest could not reasonably
>> expect to use a device before the toolstack is done setting it up.   A
>> buggy toolstack could quite easily tie up a lot of Xen time creating 
>> and destroying complete iommu pagetable sets.
> I don't think it's worth worrying about buggy tool stacks here - they 
> should simply get fixed.
>
> Furthermore this change in operation ordering is only a (nice) side 
> effect, the necessary cleanup seemed easier with the order changed. 
> And said time window would have grown with the added preemption 
> handling.
>
> Jan
>

All true.

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>


* Re: [PATCH v2 1/5] IOMMU: make page table population preemptible
  2013-12-30 13:43             ` Zhang, Xiantao
@ 2014-01-07 13:23               ` Jan Beulich
  0 siblings, 0 replies; 35+ messages in thread
From: Jan Beulich @ 2014-01-07 13:23 UTC (permalink / raw)
  To: Xiantao Zhang
  Cc: George Dunlap, Andrew Cooper, Tim Deegan, Keir Fraser, xen-devel

>>> On 30.12.13 at 14:43, "Zhang, Xiantao" <xiantao.zhang@intel.com> wrote:
> Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>

Thanks - but what about patch 2 of this same series?

Jan


* Re: [PATCH v2 2/5] IOMMU: make page table deallocation preemptible
  2013-12-13 14:00     ` [PATCH v2 " Jan Beulich
  2013-12-13 15:26       ` Andrew Cooper
@ 2014-01-07 14:51       ` Zhang, Xiantao
  1 sibling, 0 replies; 35+ messages in thread
From: Zhang, Xiantao @ 2014-01-07 14:51 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: George Dunlap, Andrew Cooper, Keir Fraser,
	suravee.suthikulpanit@amd.com

Jan, Thanks! 
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>

-----Original Message-----
From: Jan Beulich [mailto:JBeulich@suse.com] 
Sent: Friday, December 13, 2013 10:00 PM
To: xen-devel
Cc: suravee.suthikulpanit@amd.com; Andrew Cooper; George Dunlap; Zhang, Xiantao; Keir Fraser
Subject: [PATCH v2 2/5] IOMMU: make page table deallocation preemptible

This too can take an arbitrary amount of time.

In fact, the bulk of the work is being moved to a tasklet, as handling the necessary preemption logic in line seems close to impossible given that the teardown may also be invoked on error paths.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: abstract out tasklet logic

--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -405,11 +405,21 @@ static int amd_iommu_assign_device(struc
     return reassign_device(dom0, d, devfn, pdev);
 }
 
-static void deallocate_next_page_table(struct page_info* pg, int level)
+static void deallocate_next_page_table(struct page_info *pg, int level) 
+{
+    PFN_ORDER(pg) = level;
+    spin_lock(&iommu_pt_cleanup_lock);
+    page_list_add_tail(pg, &iommu_pt_cleanup_list);
+    spin_unlock(&iommu_pt_cleanup_lock);
+}
+
+static void deallocate_page_table(struct page_info *pg)
 {
     void *table_vaddr, *pde;
     u64 next_table_maddr;
-    int index, next_level;
+    unsigned int index, level = PFN_ORDER(pg), next_level;
+
+    PFN_ORDER(pg) = 0;
 
     if ( level <= 1 )
     {
@@ -599,6 +609,7 @@ const struct iommu_ops amd_iommu_ops = {
     .teardown = amd_iommu_domain_destroy,
     .map_page = amd_iommu_map_page,
     .unmap_page = amd_iommu_unmap_page,
+    .free_page_table = deallocate_page_table,
     .reassign_device = reassign_device,
     .get_device_group_id = amd_iommu_group_id,
     .update_ire_from_apic = amd_iommu_ioapic_update_ire,
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -58,6 +58,10 @@ bool_t __read_mostly amd_iommu_perdev_in
 
 DEFINE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
 
+DEFINE_SPINLOCK(iommu_pt_cleanup_lock);
+PAGE_LIST_HEAD(iommu_pt_cleanup_list);
+static struct tasklet iommu_pt_cleanup_tasklet;
+
 static struct keyhandler iommu_p2m_table = {
     .diagnostic = 0,
     .u.fn = iommu_dump_p2m_table,
@@ -235,6 +239,15 @@ int iommu_remove_device(struct pci_dev *
     return hd->platform_ops->remove_device(pdev->devfn, pdev);
 }
 
+static void iommu_teardown(struct domain *d)
+{
+    const struct hvm_iommu *hd = domain_hvm_iommu(d);
+
+    d->need_iommu = 0;
+    hd->platform_ops->teardown(d);
+    tasklet_schedule(&iommu_pt_cleanup_tasklet);
+}
+
 /*
  * If the device isn't owned by dom0, it means it already
  * has been assigned to other domain, or it doesn't exist.
@@ -309,10 +322,7 @@ static int assign_device(struct domain *
 
  done:
     if ( !has_arch_pdevs(d) && need_iommu(d) )
-    {
-        d->need_iommu = 0;
-        hd->platform_ops->teardown(d);
-    }
+        iommu_teardown(d);
     spin_unlock(&pcidevs_lock);
 
     return rc;
@@ -377,10 +387,7 @@ static int iommu_populate_page_table(str
     if ( !rc )
         iommu_iotlb_flush_all(d);
     else if ( rc != -ERESTART )
-    {
-        d->need_iommu = 0;
-        hd->platform_ops->teardown(d);
-    }
+        iommu_teardown(d);
 
     return rc;
 }
@@ -397,10 +404,7 @@ void iommu_domain_destroy(struct domain 
         return;
 
     if ( need_iommu(d) )
-    {
-        d->need_iommu = 0;
-        hd->platform_ops->teardown(d);
-    }
+        iommu_teardown(d);
 
     list_for_each_safe ( ioport_list, tmp, &hd->g2m_ioport_list )
     {
@@ -438,6 +442,23 @@ int iommu_unmap_page(struct domain *d, u
     return hd->platform_ops->unmap_page(d, gfn);
 }
 
+static void iommu_free_pagetables(unsigned long unused)
+{
+    do {
+        struct page_info *pg;
+
+        spin_lock(&iommu_pt_cleanup_lock);
+        pg = page_list_remove_head(&iommu_pt_cleanup_list);
+        spin_unlock(&iommu_pt_cleanup_lock);
+        if ( !pg )
+            return;
+        iommu_get_ops()->free_page_table(pg);
+    } while ( !softirq_pending(smp_processor_id()) );
+
+    tasklet_schedule_on_cpu(&iommu_pt_cleanup_tasklet,
+                            cpumask_cycle(smp_processor_id(), &cpu_online_map));
+}
+
 void iommu_iotlb_flush(struct domain *d, unsigned long gfn, unsigned int page_count)
 {
     struct hvm_iommu *hd = domain_hvm_iommu(d);
@@ -500,10 +521,7 @@ int deassign_device(struct domain *d, u1
     pdev->fault.count = 0;
 
     if ( !has_arch_pdevs(d) && need_iommu(d) )
-    {
-        d->need_iommu = 0;
-        hd->platform_ops->teardown(d);
-    }
+        iommu_teardown(d);
 
     return ret;
 }
@@ -542,6 +560,7 @@ int __init iommu_setup(void)
                iommu_passthrough ? "Passthrough" :
                iommu_dom0_strict ? "Strict" : "Relaxed");
         printk("Interrupt remapping %sabled\n", iommu_intremap ? "en" : "dis");
+        tasklet_init(&iommu_pt_cleanup_tasklet, iommu_free_pagetables, 0);
     }
 
     return rc;
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -668,13 +668,24 @@ static void dma_pte_clear_one(struct dom
 
 static void iommu_free_pagetable(u64 pt_maddr, int level)
 {
-    int i;
-    struct dma_pte *pt_vaddr, *pte;
-    int next_level = level - 1;
+    struct page_info *pg = maddr_to_page(pt_maddr);
 
     if ( pt_maddr == 0 )
         return;
 
+    PFN_ORDER(pg) = level;
+    spin_lock(&iommu_pt_cleanup_lock);
+    page_list_add_tail(pg, &iommu_pt_cleanup_list);
+    spin_unlock(&iommu_pt_cleanup_lock);
+}
+
+static void iommu_free_page_table(struct page_info *pg)
+{
+    unsigned int i, next_level = PFN_ORDER(pg) - 1;
+    u64 pt_maddr = page_to_maddr(pg);
+    struct dma_pte *pt_vaddr, *pte;
+
+    PFN_ORDER(pg) = 0;
     pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
 
     for ( i = 0; i < PTE_NUM; i++ )
@@ -2430,6 +2441,7 @@ const struct iommu_ops intel_iommu_ops =
     .teardown = iommu_domain_teardown,
     .map_page = intel_iommu_map_page,
     .unmap_page = intel_iommu_unmap_page,
+    .free_page_table = iommu_free_page_table,
     .reassign_device = reassign_device_ownership,
     .get_device_group_id = intel_iommu_group_id,
     .update_ire_from_apic = io_apic_write_remap_rte,
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -88,6 +88,7 @@ bool_t pt_irq_need_timer(uint32_t flags)
 
 struct msi_desc;
 struct msi_msg;
+struct page_info;
 
 struct iommu_ops {
     int (*init)(struct domain *d);
@@ -100,6 +101,7 @@ struct iommu_ops {
     int (*map_page)(struct domain *d, unsigned long gfn, unsigned long mfn,
                     unsigned int flags);
     int (*unmap_page)(struct domain *d, unsigned long gfn);
+    void (*free_page_table)(struct page_info *);
     int (*reassign_device)(struct domain *s, struct domain *t,
 			   u8 devfn, struct pci_dev *);
     int (*get_device_group_id)(u16 seg, u8 bus, u8 devfn);
@@ -151,4 +153,7 @@ int adjust_vtd_irq_affinities(void);
  */
 DECLARE_PER_CPU(bool_t, iommu_dont_flush_iotlb);
 
+extern struct spinlock iommu_pt_cleanup_lock;
+extern struct page_list_head iommu_pt_cleanup_list;
+
 #endif /* _IOMMU_H_ */
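
(A stand-alone sketch of the deferred-free scheme in the patch above, for
illustration only: teardown merely queues page-table pages on a cleanup
list, and a worker later drains that list in bounded chunks, rescheduling
itself when other work becomes pending.  The scheduler loop, work_pending()
and the budget of three frees per pass are stand-ins for Xen's tasklet
machinery and softirq_pending(); the real list locking is omitted because
this sketch is single-threaded.)

#include <stdio.h>
#include <stdbool.h>

#define NPAGES 10

static int  cleanup_list[NPAGES];     /* queued "page table pages" */
static int  cleanup_count;
static bool tasklet_scheduled;

static void tasklet_schedule(void)
{
    tasklet_scheduled = true;
}

/* Pretend other work shows up after a few frees (softirq_pending() analogue). */
static bool work_pending(unsigned int freed)
{
    return freed >= 3;
}

/* The cleanup worker: free entries until the list is empty or other work
 * is pending, in which case it asks to be run again later. */
static void free_pagetables(void)
{
    unsigned int freed = 0;

    do {
        if ( cleanup_count == 0 )
            return;                   /* list drained, nothing to reschedule */
        printf("freeing page table %d\n", cleanup_list[--cleanup_count]);
    } while ( !work_pending(++freed) );

    tasklet_schedule();               /* more to do: run again later */
}

int main(void)
{
    /* "Teardown": queue everything instead of freeing synchronously. */
    for ( cleanup_count = 0; cleanup_count < NPAGES; cleanup_count++ )
        cleanup_list[cleanup_count] = cleanup_count;
    tasklet_schedule();

    /* Minimal scheduler loop standing in for the tasklet infrastructure. */
    while ( tasklet_scheduled )
    {
        tasklet_scheduled = false;
        free_pagetables();
    }
    return 0;
}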


Thread overview: 35+ messages
2013-12-10 15:43 [PATCH 0/5] XSA-77 follow-ups Jan Beulich
2013-12-10 15:45 ` [PATCH 1/5] IOMMU: make page table population preemptible Jan Beulich
2013-12-11 18:40   ` Andrew Cooper
2013-12-13  9:51     ` Jan Beulich
2013-12-13 12:18       ` Andrew Cooper
2013-12-13 12:34         ` Jan Beulich
2013-12-13 13:57           ` Andrew Cooper
2013-12-13 13:59     ` [PATCH v2 " Jan Beulich
2013-12-13 14:16       ` Tim Deegan
2013-12-13 15:09       ` Andrew Cooper
2013-12-13 15:41         ` Jan Beulich
2013-12-13 15:44           ` Andrew Cooper
2013-12-30 13:43             ` Zhang, Xiantao
2014-01-07 13:23               ` Jan Beulich
2013-12-20 13:06   ` [PATCH " Jan Beulich
2013-12-10 15:46 ` [PATCH 2/5] IOMMU: make page table deallocation preemptible Jan Beulich
2013-12-11 19:01   ` Andrew Cooper
2013-12-13  9:55     ` Jan Beulich
2013-12-13 14:00     ` [PATCH v2 " Jan Beulich
2013-12-13 15:26       ` Andrew Cooper
2014-01-07 14:51       ` Zhang, Xiantao
2013-12-10 15:47 ` [PATCH 3/5] x86/p2m: restrict auditing to debug builds Jan Beulich
2013-12-10 15:58   ` Andrew Cooper
2013-12-10 17:31   ` George Dunlap
2013-12-13 13:46   ` Tim Deegan
2013-12-10 15:48 ` [PATCH 4/5] HVM: prevent leaking heap data from hvm_save_one() Jan Beulich
2013-12-10 16:01   ` Andrew Cooper
2013-12-10 17:32   ` George Dunlap
2013-12-17  9:16   ` Ping: " Jan Beulich
2013-12-17 10:45     ` Andrew Cooper
2013-12-17 11:02       ` Jan Beulich
2013-12-10 15:48 ` [PATCH 5/5] x86/PV: don't commit debug register values early in arch_set_info_guest() Jan Beulich
2013-12-10 17:23   ` Andrew Cooper
2013-12-10 17:33   ` George Dunlap
2013-12-10 18:17     ` Keir Fraser
