[PATCH][v2] Hybrid extension support in Xen

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH][v2] Hybrid extension support in Xen
@ 2010-02-02  8:16 Sheng Yang
  2010-02-02 11:22 ` Ian Campbell
                   ` (4 more replies)
  0 siblings, 5 replies; 35+ messages in thread
From: Sheng Yang @ 2010-02-02  8:16 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel@lists.xensource.com

[-- Attachment #1: Type: Text/Plain, Size: 1254 bytes --]

Hi Keir

Here is the second version of Hybrid extension support in Xen. Mostly the 
patch did:

1. Enable SMP support through VCPU_OP in arch_set_info_guest().
2. A new hypercall in hvm_ops to enable hybrid.
3. Mapping IRQ to VIRQ when deliver to the guest.
4. Inject a guest defined vector to deliver notification for events.
5. Use CPUID leaf 0x40000002 to support hybrid feature
6. Reserve some space at the end of MMIO hole for grant table use.

Please review. Thanks!

-- 
regards
Yang, Sheng

--
 tools/firmware/hvmloader/config.h    |    6 ++-
 tools/firmware/hvmloader/hvmloader.c |    6 +++
 xen/arch/x86/domain.c                |    9 ++++
 xen/arch/x86/hvm/hvm.c               |   62 +++++++++++++++++++++++++++++---
 xen/arch/x86/hvm/irq.c               |   66 
++++++++++++++++++++++++++++-------
 xen/arch/x86/hvm/vmx/intr.c          |    3 +
 xen/arch/x86/traps.c                 |    9 ++++
 xen/include/asm-x86/hvm/hvm.h        |    4 +-
 xen/include/asm-x86/hvm/irq.h        |    4 +-
 xen/include/public/arch-x86/cpuid.h  |    7 +++
 xen/include/public/hvm/hvm_op.h      |    6 +++
 xen/include/public/xen.h             |    7 +++
 xen/include/xen/sched.h              |    9 ++++
 13 files changed, 175 insertions(+), 23 deletions(-)

[-- Attachment #2: hybrid-xen.patch --]
[-- Type: text/x-patch, Size: 15850 bytes --]

diff --git a/tools/firmware/hvmloader/config.h b/tools/firmware/hvmloader/config.h
--- a/tools/firmware/hvmloader/config.h
+++ b/tools/firmware/hvmloader/config.h
@@ -16,8 +16,12 @@
 
 /* MMIO hole: Hardcoded defaults, which can be dynamically expanded. */
 #define PCI_MEM_START       0xf0000000
-#define PCI_MEM_END         0xfc000000
+#define PCI_MEM_END         0xfbfe0000
 extern unsigned long pci_mem_start, pci_mem_end;
+
+/* Reserve 128KB for grant table */
+#define GNTTAB_MEMBASE	    0xfbfe0000
+#define GNTTAB_MEMSIZE	    0x20000
 
 /* We reserve 16MB for special BIOS mappings, etc. */
 #define RESERVED_MEMBASE    0xfc000000
diff --git a/tools/firmware/hvmloader/hvmloader.c b/tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c
+++ b/tools/firmware/hvmloader/hvmloader.c
@@ -618,6 +618,12 @@ static void build_e820_table(void)
     e820[nr].type = E820_RAM;
     nr++;
 
+    /* Reserved for grant table */
+    e820[nr].addr = GNTTAB_MEMBASE;
+    e820[nr].size = GNTTAB_MEMSIZE;
+    e820[nr].type = E820_RESERVED;
+    nr++;
+
     /*
      * Explicitly reserve space for special pages.
      * This space starts at RESERVED_MEMBASE an extends to cover various
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -682,7 +682,16 @@ int arch_set_info_guest(
 
     if ( is_hvm_vcpu(v) )
     {
+        unsigned long eip, cs;
+
         hvm_set_info_guest(v);
+
+        eip = c(user_regs.eip);
+        if (eip != 0) {
+            cs = eip >> 12 << 8;
+            hvm_vcpu_reset_state(v, cs, 0);
+            hvm_funcs.set_tsc_offset(v, 0);
+        }
         goto out;
     }
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2325,6 +2325,17 @@ static hvm_hypercall_t *hvm_hypercall32_
     HYPERCALL(hvm_op)
 };
 
+static hvm_hypercall_t *hvm_hypercall_hybrid64_table[NR_hypercalls] = {
+    [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
+    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t *)hvm_grant_table_op,
+    HYPERCALL(xen_version),
+    HYPERCALL(console_io),
+    HYPERCALL(vcpu_op),
+    HYPERCALL(sched_op),
+    HYPERCALL(event_channel_op),
+    HYPERCALL(hvm_op),
+};
+
 #endif /* defined(__x86_64__) */
 
 int hvm_do_hypercall(struct cpu_user_regs *regs)
@@ -2355,7 +2366,8 @@ int hvm_do_hypercall(struct cpu_user_reg
     if ( (eax & 0x80000000) && is_viridian_domain(curr->domain) )
         return viridian_hypercall(regs);
 
-    if ( (eax >= NR_hypercalls) || !hvm_hypercall32_table[eax] )
+    if ( (eax >= NR_hypercalls) ||
+         (!hvm_hypercall32_table[eax] && !is_hybrid_vcpu(curr)) )
     {
         regs->eax = -ENOSYS;
         return HVM_HCALL_completed;
@@ -2370,11 +2382,18 @@ int hvm_do_hypercall(struct cpu_user_reg
                     regs->rdi, regs->rsi, regs->rdx, regs->r10, regs->r8);
 
         this_cpu(hvm_64bit_hcall) = 1;
-        regs->rax = hvm_hypercall64_table[eax](regs->rdi,
-                                               regs->rsi,
-                                               regs->rdx,
-                                               regs->r10,
-                                               regs->r8); 
+        if (is_hybrid_vcpu(curr))
+            regs->rax = hvm_hypercall_hybrid64_table[eax](regs->rdi,
+                                                          regs->rsi,
+                                                          regs->rdx,
+                                                          regs->r10,
+                                                          regs->r8);
+        else
+            regs->rax = hvm_hypercall64_table[eax](regs->rdi,
+                                                   regs->rsi,
+                                                   regs->rdx,
+                                                   regs->r10,
+                                                   regs->r8);
         this_cpu(hvm_64bit_hcall) = 0;
     }
     else
@@ -3109,6 +3128,37 @@ long do_hvm_op(unsigned long op, XEN_GUE
         break;
     }
 
+    case HVMOP_enable_hybrid: {
+        struct xen_hvm_hybrid_type a;
+        struct domain *d;
+
+        if ( copy_from_guest(&a, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(a.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        rc = -EINVAL;
+        if ( !is_hvm_domain(d) )
+            goto param_fail5;
+
+        rc = xsm_hvm_param(d, op);
+        if ( rc )
+            goto param_fail5;
+
+        d->hybrid_enabled = XEN_HYBRID_ENABLED;
+        printk("HVM: Hybrid domain enabled\n");
+        if (a.flags & HVM_HYBRID_EVTCHN) {
+            update_domain_wallclock_time(d);
+            hvm_funcs.set_tsc_offset(d->vcpu[0], 0);
+            d->hybrid_enabled |= XEN_HYBRID_EVTCHN_ENABLED;
+        }
+param_fail5:
+        rcu_unlock_domain(d);
+        break;
+    }
+
     default:
     {
         gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
--- a/xen/arch/x86/hvm/irq.c
+++ b/xen/arch/x86/hvm/irq.c
@@ -46,8 +46,18 @@ static void __hvm_pci_intx_assert(
     if ( (hvm_irq->pci_link_assert_count[link]++ == 0) && isa_irq &&
          (hvm_irq->gsi_assert_count[isa_irq]++ == 0) )
     {
-        vioapic_irq_positive_edge(d, isa_irq);
-        vpic_irq_positive_edge(d, isa_irq);
+        if ( !is_hybrid_evtchn_enabled_domain(d) )
+        {
+            vioapic_irq_positive_edge(d, isa_irq);
+            vpic_irq_positive_edge(d, isa_irq);
+        }
+        else
+        {
+            /* TODO fix the critical region here */
+            spin_unlock(&d->arch.hvm_domain.irq_lock);
+            send_guest_global_virq(d, VIRQ_EMUL_PIN(isa_irq));
+            spin_lock(&d->arch.hvm_domain.irq_lock);
+	}
     }
 }
 
@@ -76,8 +86,10 @@ static void __hvm_pci_intx_deassert(
     link    = hvm_pci_intx_link(device, intx);
     isa_irq = hvm_irq->pci_link.route[link];
     if ( (--hvm_irq->pci_link_assert_count[link] == 0) && isa_irq &&
-         (--hvm_irq->gsi_assert_count[isa_irq] == 0) )
-        vpic_irq_negative_edge(d, isa_irq);
+         (--hvm_irq->gsi_assert_count[isa_irq] == 0) ) {
+        if ( !is_hybrid_evtchn_enabled_domain(d) )
+            vpic_irq_negative_edge(d, isa_irq);
+    }
 }
 
 void hvm_pci_intx_deassert(
@@ -93,6 +105,7 @@ void hvm_isa_irq_assert(
 {
     struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq;
     unsigned int gsi = hvm_isa_irq_to_gsi(isa_irq);
+    int send_virq = 0;
 
     ASSERT(isa_irq <= 15);
 
@@ -101,11 +114,21 @@ void hvm_isa_irq_assert(
     if ( !__test_and_set_bit(isa_irq, &hvm_irq->isa_irq.i) &&
          (hvm_irq->gsi_assert_count[gsi]++ == 0) )
     {
-        vioapic_irq_positive_edge(d, gsi);
-        vpic_irq_positive_edge(d, isa_irq);
+        if ( !is_hybrid_evtchn_enabled_domain(d) )
+        {
+            vioapic_irq_positive_edge(d, gsi);
+            vpic_irq_positive_edge(d, isa_irq);
+        }
+        else
+        {
+            send_virq = 1;
+        }
     }
 
     spin_unlock(&d->arch.hvm_domain.irq_lock);
+
+    if (send_virq)
+	    send_guest_global_virq(d, VIRQ_EMUL_PIN(isa_irq));
 }
 
 void hvm_isa_irq_deassert(
@@ -120,7 +143,10 @@ void hvm_isa_irq_deassert(
 
     if ( __test_and_clear_bit(isa_irq, &hvm_irq->isa_irq.i) &&
          (--hvm_irq->gsi_assert_count[gsi] == 0) )
-        vpic_irq_negative_edge(d, isa_irq);
+    {
+        if ( !is_hybrid_evtchn_enabled_domain(d) )
+            vpic_irq_negative_edge(d, isa_irq);
+    }
 
     spin_unlock(&d->arch.hvm_domain.irq_lock);
 }
@@ -165,6 +191,8 @@ static void hvm_set_callback_irq_level(s
             __hvm_pci_intx_assert(d, pdev, pintx);
         else
             __hvm_pci_intx_deassert(d, pdev, pintx);
+    case HVMIRQ_callback_vector:
+        vcpu_kick(v);
     default:
         break;
     }
@@ -185,16 +213,17 @@ void hvm_maybe_deassert_evtchn_irq(void)
 
 void hvm_assert_evtchn_irq(struct vcpu *v)
 {
-    if ( v->vcpu_id != 0 )
-        return;
-
     if ( unlikely(in_irq() || !local_irq_is_enabled()) )
     {
         tasklet_schedule(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet);
         return;
     }
 
-    hvm_set_callback_irq_level(v);
+    /* set_call_irq_level can't deal with vcpu other than 0 */
+    if (v->vcpu_id != 0)
+        vcpu_kick(v);
+    else
+        hvm_set_callback_irq_level(v);
 }
 
 void hvm_set_pci_link_route(struct domain *d, u8 link, u8 isa_irq)
@@ -251,7 +280,7 @@ void hvm_set_callback_via(struct domain 
 
     via_type = (uint8_t)(via >> 56) + 1;
     if ( ((via_type == HVMIRQ_callback_gsi) && (via == 0)) ||
-         (via_type > HVMIRQ_callback_pci_intx) )
+         (via_type > HVMIRQ_callback_vector) )
         via_type = HVMIRQ_callback_none;
 
     spin_lock(&d->arch.hvm_domain.irq_lock);
@@ -297,6 +326,9 @@ void hvm_set_callback_via(struct domain 
         if ( hvm_irq->callback_via_asserted )
              __hvm_pci_intx_assert(d, pdev, pintx);
         break;
+    case HVMIRQ_callback_vector:
+        hvm_irq->callback_via.vector = (uint8_t)via;
+        break;
     default:
         break;
     }
@@ -312,6 +344,10 @@ void hvm_set_callback_via(struct domain 
     case HVMIRQ_callback_pci_intx:
         printk("PCI INTx Dev 0x%02x Int%c\n", pdev, 'A' + pintx);
         break;
+    case HVMIRQ_callback_vector:
+        printk("Set HVMIRQ_callback_vector to %u\n",
+               hvm_irq->callback_via.vector);
+        break;
     default:
         printk("None\n");
         break;
@@ -322,6 +358,10 @@ struct hvm_intack hvm_vcpu_has_pending_i
 {
     struct hvm_domain *plat = &v->domain->arch.hvm_domain;
     int vector;
+
+    if (plat->irq.callback_via_type == HVMIRQ_callback_vector &&
+            vcpu_info(v, evtchn_upcall_pending))
+        return hvm_intack_vector(plat->irq.callback_via.vector);
 
     if ( unlikely(v->nmi_pending) )
         return hvm_intack_nmi;
@@ -363,6 +403,8 @@ struct hvm_intack hvm_vcpu_ack_pending_i
     case hvm_intsrc_lapic:
         if ( !vlapic_ack_pending_irq(v, intack.vector) )
             intack = hvm_intack_none;
+        break;
+    case hvm_intsrc_vector:
         break;
     default:
         intack = hvm_intack_none;
diff --git a/xen/arch/x86/hvm/vmx/intr.c b/xen/arch/x86/hvm/vmx/intr.c
--- a/xen/arch/x86/hvm/vmx/intr.c
+++ b/xen/arch/x86/hvm/vmx/intr.c
@@ -164,7 +164,8 @@ asmlinkage void vmx_intr_assist(void)
     {
         HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
         vmx_inject_extint(intack.vector);
-        pt_intr_post(v, intack);
+        if (intack.source != hvm_intsrc_vector)
+             pt_intr_post(v, intack);
     }
 
     /* Is there another IRQ to queue up behind this one? */
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -686,6 +686,7 @@ int cpuid_hypervisor_leaves( uint32_t id
     struct domain *d = current->domain;
     /* Optionally shift out of the way of Viridian architectural leaves. */
     uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
+    unsigned int tmp_eax, tmp_ebx, tmp_ecx, tmp_edx;
 
     idx -= base;
     if ( idx > 3 ) 
@@ -716,6 +717,14 @@ int cpuid_hypervisor_leaves( uint32_t id
         *edx = 0;          /* Features 2 */
         if ( !is_hvm_vcpu(current) )
             *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
+
+        /* Check if additional feature specified, e.g. Hybrid */
+        if ( !is_viridian_domain(d) ) {
+            domain_cpuid(d, 0x40000002, 0,
+                         &tmp_eax, &tmp_ebx, &tmp_ecx, &tmp_edx);
+            if (tmp_edx != 0)
+                *edx = tmp_edx & XEN_CPUID_FEAT2_MASK;
+        }
         break;
 
     case 3:
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -33,7 +33,8 @@ enum hvm_intsrc {
     hvm_intsrc_pic,
     hvm_intsrc_lapic,
     hvm_intsrc_nmi,
-    hvm_intsrc_mce
+    hvm_intsrc_mce,
+    hvm_intsrc_vector,
 };
 struct hvm_intack {
     uint8_t source; /* enum hvm_intsrc */
@@ -44,6 +45,7 @@ struct hvm_intack {
 #define hvm_intack_lapic(vec) ( (struct hvm_intack) { hvm_intsrc_lapic, vec } )
 #define hvm_intack_nmi        ( (struct hvm_intack) { hvm_intsrc_nmi,   2 } )
 #define hvm_intack_mce        ( (struct hvm_intack) { hvm_intsrc_mce,   18 } )
+#define hvm_intack_vector(vec)( (struct hvm_intack) { hvm_intsrc_vector, vec } )
 enum hvm_intblk {
     hvm_intblk_none,      /* not blocked (deliverable) */
     hvm_intblk_shadow,    /* MOV-SS or STI shadow */
diff --git a/xen/include/asm-x86/hvm/irq.h b/xen/include/asm-x86/hvm/irq.h
--- a/xen/include/asm-x86/hvm/irq.h
+++ b/xen/include/asm-x86/hvm/irq.h
@@ -54,12 +54,14 @@ struct hvm_irq {
         enum {
             HVMIRQ_callback_none,
             HVMIRQ_callback_gsi,
-            HVMIRQ_callback_pci_intx
+            HVMIRQ_callback_pci_intx,
+            HVMIRQ_callback_vector,
         } callback_via_type;
     };
     union {
         uint32_t gsi;
         struct { uint8_t dev, intx; } pci;
+        uint32_t vector;
     } callback_via;
 
     /* Number of INTx wires asserting each PCI-ISA link. */
diff --git a/xen/include/public/arch-x86/cpuid.h b/xen/include/public/arch-x86/cpuid.h
--- a/xen/include/public/arch-x86/cpuid.h
+++ b/xen/include/public/arch-x86/cpuid.h
@@ -65,4 +65,11 @@
 #define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
 #define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
 
+/* Mask unsupported CPUID specified by user */
+#define XEN_CPUID_FEAT2_MASK 0x3ul
+#define _XEN_CPUID_FEAT2_HYBRID 0
+#define XEN_CPUID_FEAT2_HYBRID (1u<<0)
+#define _XEN_CPUID_FEAT2_HYBRID_EVTCHN 1
+#define XEN_CPUID_FEAT2_HYBRID_EVTCHN (1u<<1)
+
 #endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/xen/include/public/hvm/hvm_op.h b/xen/include/public/hvm/hvm_op.h
--- a/xen/include/public/hvm/hvm_op.h
+++ b/xen/include/public/hvm/hvm_op.h
@@ -127,6 +127,12 @@ typedef struct xen_hvm_set_mem_type xen_
 typedef struct xen_hvm_set_mem_type xen_hvm_set_mem_type_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_mem_type_t);
 
+#define HVMOP_enable_hybrid    9
+struct xen_hvm_hybrid_type {
+    domid_t domid;
+    uint32_t flags;
+#define HVM_HYBRID_EVTCHN (1ull<<1)
+};
 
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -158,7 +158,12 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #define VIRQ_ARCH_6    22
 #define VIRQ_ARCH_7    23
 
-#define NR_VIRQS       24
+#define VIRQ_EMUL_PIN_START 24
+#define VIRQ_EMUL_PIN_END 39
+#define VIRQ_EMUL_PIN_NUM 16
+#define VIRQ_EMUL_PIN(x) (VIRQ_EMUL_PIN_START + x)
+
+#define NR_VIRQS       40
 
 /*
  * HYPERVISOR_mmu_update(reqs, count, pdone, foreigndom)
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -309,6 +309,10 @@ struct domain
 
     /* Memory paging support */
     struct mem_event_domain mem_event;
+
+#define XEN_HYBRID_ENABLED          (1u << 0)
+#define XEN_HYBRID_EVTCHN_ENABLED   (1u << 1)
+    uint64_t hybrid_enabled;
 };
 
 struct domain_setup_info
@@ -595,6 +599,11 @@ uint64_t get_cpu_idle_time(unsigned int 
 #define is_hvm_vcpu(v)   (is_hvm_domain(v->domain))
 #define need_iommu(d)    ((d)->need_iommu)
 
+#define is_hybrid_domain(d) ((d)->hybrid_enabled & XEN_HYBRID_ENABLED)
+#define is_hybrid_vcpu(v)   (is_hybrid_domain(v->domain))
+#define is_hybrid_evtchn_enabled_domain(d) (is_hybrid_domain(d) && \
+		(d)->hybrid_enabled & XEN_HYBRID_EVTCHN_ENABLED)
+
 void set_vcpu_migration_delay(unsigned int delay);
 unsigned int get_vcpu_migration_delay(void);
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02  8:16 [PATCH][v2] Hybrid extension support in Xen Sheng Yang
@ 2010-02-02 11:22 ` Ian Campbell
  2010-02-02 12:54   ` Sheng Yang
  2010-02-02 13:35   ` Keir Fraser
  2010-02-02 11:26 ` Ian Campbell
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 35+ messages in thread
From: Ian Campbell @ 2010-02-02 11:22 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> 
> +/* Reserve 128KB for grant table */
> +#define GNTTAB_MEMBASE     0xfbfe0000
> +#define GNTTAB_MEMSIZE     0x20000 

Why is this necessary? Isn't the grant table contained within one of the
BARS on the virtual PCI device? What needs grant tables for prior to the
kernel finding the PCI device which necessitates hardcoding these
addresses in both guest and hypervisor?

Ian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02  8:16 [PATCH][v2] Hybrid extension support in Xen Sheng Yang
  2010-02-02 11:22 ` Ian Campbell
@ 2010-02-02 11:26 ` Ian Campbell
  2010-02-02 13:06   ` Sheng Yang
  2010-02-02 11:32 ` Paul Durrant
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 35+ messages in thread
From: Ian Campbell @ 2010-02-02 11:26 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> 
> +static hvm_hypercall_t *hvm_hypercall_hybrid64_table[NR_hypercalls] =
> {
> +    [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
> +    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t
> *)hvm_grant_table_op,
> +    HYPERCALL(xen_version),
> +    HYPERCALL(console_io),
> +    HYPERCALL(vcpu_op),
> +    HYPERCALL(sched_op),
> +    HYPERCALL(event_channel_op),
> +    HYPERCALL(hvm_op),
> +};

Why not just expand the exiting hvm hypercall table to incorporate these
new hypercalls?

In fact, why is hybrid mode considered a separate mode by the hypervisor
at all? Shouldn't it just be an extension to regular HVM mode which
guests may choose to use? This seems like it would eliminate a bunch of
the random conditionals.

Have you verified that all of the operations you are making available
are safe to be called from (hybrid)hvm mode?

Ian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02  8:16 [PATCH][v2] Hybrid extension support in Xen Sheng Yang
  2010-02-02 11:22 ` Ian Campbell
  2010-02-02 11:26 ` Ian Campbell
@ 2010-02-02 11:32 ` Paul Durrant
  2010-02-02 13:23 ` Keir Fraser
  2010-02-02 13:52 ` Ian Campbell
  4 siblings, 0 replies; 35+ messages in thread
From: Paul Durrant @ 2010-02-02 11:32 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com, Keir Fraser

Sheng Yang wrote:
> 
> 1. Enable SMP support through VCPU_OP in arch_set_info_guest().
> 2. A new hypercall in hvm_ops to enable hybrid.
> 3. Mapping IRQ to VIRQ when deliver to the guest.
> 4. Inject a guest defined vector to deliver notification for events.
> 5. Use CPUID leaf 0x40000002 to support hybrid feature
> 6. Reserve some space at the end of MMIO hole for grant table use.
> 
> Please review. Thanks!
> 

Why a complete new hypercall table; could you not just use the existing 
HVM table and add the extra hypercalls in?
Doing it this way would negate the need for the concept of a 'hybrid' 
HVM guest would it not? HVM guests that were aware of the hybrid 
extensions would use them, those that were unaware would not.

Also, why the extra E820 space for the grant table? You don't need the 
table that early so could you not just use fake PCI BAR space or 
somesuch and do the XENMEM_add_to_physmap hypercall to add 
XENMAPSPACE_grant_table later on?

   Paul

-- 
===============================
Paul Durrant, Software Engineer

Citrix Systems (R&D) Ltd.
First Floor, Building 101
Cambridge Science Park
Milton Road
Cambridge CB4 0FY
United Kingdom
===============================

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 11:22 ` Ian Campbell
@ 2010-02-02 12:54   ` Sheng Yang
  2010-02-02 13:19     ` Ian Campbell
  2010-02-02 13:35   ` Keir Fraser
  1 sibling, 1 reply; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 12:54 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tuesday 02 February 2010 19:22:21 Ian Campbell wrote:
> On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > +/* Reserve 128KB for grant table */
> > +#define GNTTAB_MEMBASE     0xfbfe0000
> > +#define GNTTAB_MEMSIZE     0x20000
> 
> Why is this necessary? Isn't the grant table contained within one of the
> BARS on the virtual PCI device? What needs grant tables for prior to the
> kernel finding the PCI device which necessitates hardcoding these
> addresses in both guest and hypervisor?
> 
Thanks for so quick and detail comments. :)

And this one is purposed, because we don't want to depends on QEmu. As you 
see, we now have PV drivers, QEmu is an alternative way to provide device 
model now. We think that still involving QEmu as a requirement is somehow 
strange. So this reserved region is there, to drop the dependence with QEmu to 
provide PV driver(PV driver depends on QEmu is still strange, right?).

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 11:26 ` Ian Campbell
@ 2010-02-02 13:06   ` Sheng Yang
  2010-02-02 13:52     ` Ian Campbell
  0 siblings, 1 reply; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 13:06 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tuesday 02 February 2010 19:26:55 Ian Campbell wrote:
> On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > +static hvm_hypercall_t *hvm_hypercall_hybrid64_table[NR_hypercalls] =
> > {
> > +    [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
> > +    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t
> > *)hvm_grant_table_op,
> > +    HYPERCALL(xen_version),
> > +    HYPERCALL(console_io),
> > +    HYPERCALL(vcpu_op),
> > +    HYPERCALL(sched_op),
> > +    HYPERCALL(event_channel_op),
> > +    HYPERCALL(hvm_op),
> > +};
> 
> Why not just expand the exiting hvm hypercall table to incorporate these
> new hypercalls?

I am just afraid the normal HVM guest called these hypercalls would result in 
some chaos, so add a limitation(for hybrid only) here. (I admit it didn't much 
improve the security for a malicious guest...)
> 
> In fact, why is hybrid mode considered a separate mode by the hypervisor
> at all? Shouldn't it just be an extension to regular HVM mode which
> guests may choose to use? This seems like it would eliminate a bunch of
> the random conditionals.

There is still something different from normal HVM guest. For example, to use 
PV timer, we should clean the tsc offset in HVM domain; and for event 
delivery, we would use predefined VIRQ rather than emulated IOAPIC/APIC. These 
code are exclusive, we need them wrapped with flag(which we called "hybrid"). 
The word "mode" here may be is inaccuracy, a "extension" should be more 
proper. I would change the phrase next time.
> 
> Have you verified that all of the operations you are making available
> are safe to be called from (hybrid)hvm mode?

You mean malicious guest attack? Sorry, I haven't try it yet...

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 12:54   ` Sheng Yang
@ 2010-02-02 13:19     ` Ian Campbell
  2010-02-02 13:28       ` Sheng Yang
  0 siblings, 1 reply; 35+ messages in thread
From: Ian Campbell @ 2010-02-02 13:19 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tue, 2010-02-02 at 12:54 +0000, Sheng Yang wrote:
> On Tuesday 02 February 2010 19:22:21 Ian Campbell wrote:
> > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > > +/* Reserve 128KB for grant table */
> > > +#define GNTTAB_MEMBASE     0xfbfe0000
> > > +#define GNTTAB_MEMSIZE     0x20000
> > 
> > Why is this necessary? Isn't the grant table contained within one of the
> > BARS on the virtual PCI device? What needs grant tables for prior to the
> > kernel finding the PCI device which necessitates hardcoding these
> > addresses in both guest and hypervisor?
> > 
> Thanks for so quick and detail comments. :)
> 
> And this one is purposed, because we don't want to depends on QEmu. As you 
> see, we now have PV drivers, QEmu is an alternative way to provide device 
> model now. We think that still involving QEmu as a requirement is somehow 
> strange. So this reserved region is there, to drop the dependence with QEmu to 
> provide PV driver(PV driver depends on QEmu is still strange, right?).

So with your patchset you can run an HVM guest with no qemu process at
all? What about the other emulated devices which have no PV equivalent?
How does the VM boot, does the BIOS have pv INT 13 handler?

Ian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02  8:16 [PATCH][v2] Hybrid extension support in Xen Sheng Yang
                   ` (2 preceding siblings ...)
  2010-02-02 11:32 ` Paul Durrant
@ 2010-02-02 13:23 ` Keir Fraser
  2010-02-02 13:37   ` Sheng Yang
  2010-02-02 13:52 ` Ian Campbell
  4 siblings, 1 reply; 35+ messages in thread
From: Keir Fraser @ 2010-02-02 13:23 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com

On 02/02/2010 08:16, "Sheng Yang" <sheng@linux.intel.com> wrote:

> 
> Here is the second version of Hybrid extension support in Xen. Mostly the
> patch did:
> 
> 1. Enable SMP support through VCPU_OP in arch_set_info_guest().

What does that mean and why would we want it?

 -- Keir

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:19     ` Ian Campbell
@ 2010-02-02 13:28       ` Sheng Yang
  2010-02-02 13:50         ` Ian Campbell
  0 siblings, 1 reply; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 13:28 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tuesday 02 February 2010 21:19:32 Ian Campbell wrote:
> On Tue, 2010-02-02 at 12:54 +0000, Sheng Yang wrote:
> > On Tuesday 02 February 2010 19:22:21 Ian Campbell wrote:
> > > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > > > +/* Reserve 128KB for grant table */
> > > > +#define GNTTAB_MEMBASE     0xfbfe0000
> > > > +#define GNTTAB_MEMSIZE     0x20000
> > >
> > > Why is this necessary? Isn't the grant table contained within one of
> > > the BARS on the virtual PCI device? What needs grant tables for prior
> > > to the kernel finding the PCI device which necessitates hardcoding
> > > these addresses in both guest and hypervisor?
> >
> > Thanks for so quick and detail comments. :)
> >
> > And this one is purposed, because we don't want to depends on QEmu. As
> > you see, we now have PV drivers, QEmu is an alternative way to provide
> > device model now. We think that still involving QEmu as a requirement is
> > somehow strange. So this reserved region is there, to drop the dependence
> > with QEmu to provide PV driver(PV driver depends on QEmu is still
> > strange, right?).
> 
> So with your patchset you can run an HVM guest with no qemu process at
> all? What about the other emulated devices which have no PV equivalent?
> How does the VM boot, does the BIOS have pv INT 13 handler?

No, not currently... Sorry for confusion. 

We just think QEmu provided the availability of PV driver seems unelegant, so 
we want to decouple them. Because what QEmu provided is a PCI IRQ and a MMIO 
region for grant table. And event channel is available without that PCI IRQ, 
so we think decouple MMIO from QEmu should be more elegant...

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 11:22 ` Ian Campbell
  2010-02-02 12:54   ` Sheng Yang
@ 2010-02-02 13:35   ` Keir Fraser
  2010-02-02 13:52     ` Sheng Yang
  1 sibling, 1 reply; 35+ messages in thread
From: Keir Fraser @ 2010-02-02 13:35 UTC (permalink / raw)
  To: Ian Campbell, Sheng Yang; +Cc: xen-devel@lists.xensource.com

On 02/02/2010 11:22, "Ian Campbell" <Ian.Campbell@eu.citrix.com> wrote:

> On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
>> 
>> +/* Reserve 128KB for grant table */
>> +#define GNTTAB_MEMBASE     0xfbfe0000
>> +#define GNTTAB_MEMSIZE     0x20000
> 
> Why is this necessary? Isn't the grant table contained within one of the
> BARS on the virtual PCI device? What needs grant tables for prior to the
> kernel finding the PCI device which necessitates hardcoding these
> addresses in both guest and hypervisor?

I didn't even find where these get used, except to reserve an area in e820,
and it wasn't clear why that reservation is necessary.

 -- Keir

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:23 ` Keir Fraser
@ 2010-02-02 13:37   ` Sheng Yang
  2010-02-02 14:03     ` Keir Fraser
  0 siblings, 1 reply; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 13:37 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel@lists.xensource.com

On Tuesday 02 February 2010 21:23:11 Keir Fraser wrote:
> On 02/02/2010 08:16, "Sheng Yang" <sheng@linux.intel.com> wrote:
> > Here is the second version of Hybrid extension support in Xen. Mostly the
> > patch did:
> >
> > 1. Enable SMP support through VCPU_OP in arch_set_info_guest().
> 
> What does that mean and why would we want it?

It's not a good description, sorry... We need a entry address and boot up code 
AP. For that purpose, I reuse VCPUOP_initialise and VCPUOP_up hypercall. The 
former guest set the entry address of AP, the later one boot up them. For 
this, I add some code in arch_set_info_guest() to initialize the AP(mostly 
what we did after INIT-SIPI-SIPI sequence), that's what I means here...

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:28       ` Sheng Yang
@ 2010-02-02 13:50         ` Ian Campbell
  2010-02-02 14:00           ` Tim Deegan
  2010-02-02 14:28           ` Sheng Yang
  0 siblings, 2 replies; 35+ messages in thread
From: Ian Campbell @ 2010-02-02 13:50 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tue, 2010-02-02 at 13:28 +0000, Sheng Yang wrote:
> On Tuesday 02 February 2010 21:19:32 Ian Campbell wrote:
> > On Tue, 2010-02-02 at 12:54 +0000, Sheng Yang wrote:
> > > On Tuesday 02 February 2010 19:22:21 Ian Campbell wrote:
> > > > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > > > > +/* Reserve 128KB for grant table */
> > > > > +#define GNTTAB_MEMBASE     0xfbfe0000
> > > > > +#define GNTTAB_MEMSIZE     0x20000
> > > >
> > > > Why is this necessary? Isn't the grant table contained within one of
> > > > the BARS on the virtual PCI device? What needs grant tables for prior
> > > > to the kernel finding the PCI device which necessitates hardcoding
> > > > these addresses in both guest and hypervisor?
> > >
> > > Thanks for so quick and detail comments. :)
> > >
> > > And this one is purposed, because we don't want to depends on QEmu. As
> > > you see, we now have PV drivers, QEmu is an alternative way to provide
> > > device model now. We think that still involving QEmu as a requirement is
> > > somehow strange. So this reserved region is there, to drop the dependence
> > > with QEmu to provide PV driver(PV driver depends on QEmu is still
> > > strange, right?).
> > 
> > So with your patchset you can run an HVM guest with no qemu process at
> > all? What about the other emulated devices which have no PV equivalent?
> > How does the VM boot, does the BIOS have pv INT 13 handler?
> 
> No, not currently... Sorry for confusion. 
> 
> We just think QEmu provided the availability of PV driver seems unelegant, so 
> we want to decouple them. Because what QEmu provided is a PCI IRQ and a MMIO 
> region for grant table. And event channel is available without that PCI IRQ, 
> so we think decouple MMIO from QEmu should be more elegant...

OK, but in that case I think we should have a mechanism for the guest to
query the location of the grant table pages (hypercall, MSR etc) rather
than hardcoding a magic address. It may well end up being hardcoded on
the tools/hypervisor side for now but there is no reason to expose that
to the guest.

I wonder if we could turn things around and have the guest pick some
pages and tell the hypervisor to put the grant table there, removing the
need to reserve any of the physical address space up front. How do
full-PV guests find their grant table, is that a mechanism which could
be re-used here instead of reserving magic regions?

Ian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:35   ` Keir Fraser
@ 2010-02-02 13:52     ` Sheng Yang
  2010-02-02 14:01       ` Keir Fraser
  0 siblings, 1 reply; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 13:52 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Ian Campbell, xen-devel@lists.xensource.com

On Tuesday 02 February 2010 21:35:23 Keir Fraser wrote:
> On 02/02/2010 11:22, "Ian Campbell" <Ian.Campbell@eu.citrix.com> wrote:
> > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> >> +/* Reserve 128KB for grant table */
> >> +#define GNTTAB_MEMBASE     0xfbfe0000
> >> +#define GNTTAB_MEMSIZE     0x20000
> >
> > Why is this necessary? Isn't the grant table contained within one of the
> > BARS on the virtual PCI device? What needs grant tables for prior to the
> > kernel finding the PCI device which necessitates hardcoding these
> > addresses in both guest and hypervisor?
> 
> I didn't even find where these get used, except to reserve an area in e820,
> and it wasn't clear why that reservation is necessary.
> 
It has been used in the last [6/6] patch of Linux kernel side, which would use 
the pages to map grant table. It works the same as the MMIO region in PVonHVM 
device. Reserve it in BIOS because we think it's more elegant than depends on 
QEmu to provide the reserved memory space.

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:06   ` Sheng Yang
@ 2010-02-02 13:52     ` Ian Campbell
  2010-02-02 14:04       ` Stefano Stabellini
  2010-02-02 14:07       ` Sheng Yang
  0 siblings, 2 replies; 35+ messages in thread
From: Ian Campbell @ 2010-02-02 13:52 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Keir, xen-devel@lists.xensource.com, Fraser

On Tue, 2010-02-02 at 13:06 +0000, Sheng Yang wrote:
> On Tuesday 02 February 2010 19:26:55 Ian Campbell wrote:
> > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > > +static hvm_hypercall_t *hvm_hypercall_hybrid64_table[NR_hypercalls] =
> > > {
> > > +    [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
> > > +    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t
> > > *)hvm_grant_table_op,
> > > +    HYPERCALL(xen_version),
> > > +    HYPERCALL(console_io),
> > > +    HYPERCALL(vcpu_op),
> > > +    HYPERCALL(sched_op),
> > > +    HYPERCALL(event_channel_op),
> > > +    HYPERCALL(hvm_op),
> > > +};
> > 
> > Why not just expand the exiting hvm hypercall table to incorporate these
> > new hypercalls?
> 
> I am just afraid the normal HVM guest called these hypercalls would result in 
> some chaos, so add a limitation(for hybrid only) here. (I admit it didn't much 
> improve the security for a malicious guest...)

I don't think this limitation adds any meaningful security or reduction
in chaos. A non-hybrid aware HVM guest simply won't make those
hypercalls.

> > In fact, why is hybrid mode considered a separate mode by the hypervisor
> > at all? Shouldn't it just be an extension to regular HVM mode which
> > guests may choose to use? This seems like it would eliminate a bunch of
> > the random conditionals.
> 
> There is still something different from normal HVM guest. For example, to use 
> PV timer, we should clean the tsc offset in HVM domain; and for event 
> delivery, we would use predefined VIRQ rather than emulated IOAPIC/APIC. These 
> code are exclusive, we need them wrapped with flag(which we called "hybrid"). 
> The word "mode" here may be is inaccuracy, a "extension" should be more 
> proper. I would change the phrase next time.

But the old mechanisms (emulated IOAPIC etc) are still present until the
enable_hybrid HVMOP is called, aren't they? Why can't you perform the
switch at the point at which the new feature is requested by the guest
e.g. when the VIRQ is configured?

It looks like you are using evtchn's for all interrupt injection,
including any emulated or passthrough devices which may be present.
Using evtchn's for PV devices obviously makes sense but I think this
needs to coexist with emulated interrupt injection for non-PV devices so
the IOAPIC/APIC should not be mutually exclusive with using PV evtchns.

Ian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02  8:16 [PATCH][v2] Hybrid extension support in Xen Sheng Yang
                   ` (3 preceding siblings ...)
  2010-02-02 13:23 ` Keir Fraser
@ 2010-02-02 13:52 ` Ian Campbell
  2010-02-02 13:53   ` Sheng Yang
  4 siblings, 1 reply; 35+ messages in thread
From: Ian Campbell @ 2010-02-02 13:52 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
> --- a/xen/arch/x86/hvm/irq.c
> +++ b/xen/arch/x86/hvm/irq.c
> @@ -46,8 +46,18 @@ static void __hvm_pci_intx_assert(
>      if ( (hvm_irq->pci_link_assert_count[link]++ == 0) && isa_irq &&
>           (hvm_irq->gsi_assert_count[isa_irq]++ == 0) )
>      {
> -        vioapic_irq_positive_edge(d, isa_irq);
> -        vpic_irq_positive_edge(d, isa_irq);
> +        if ( !is_hybrid_evtchn_enabled_domain(d) )
> +        {
> +            vioapic_irq_positive_edge(d, isa_irq);
> +            vpic_irq_positive_edge(d, isa_irq);
> +        }
> +        else
> +        {
> +            /* TODO fix the critical region here */
> +            spin_unlock(&d->arch.hvm_domain.irq_lock);
> +            send_guest_global_virq(d, VIRQ_EMUL_PIN(isa_irq));
> +            spin_lock(&d->arch.hvm_domain.irq_lock);
> +       }
>      }
>  } 

This can't be right, at least not without a big comment explaining why
it is safe to drop the lock here...

Ian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:52 ` Ian Campbell
@ 2010-02-02 13:53   ` Sheng Yang
  0 siblings, 0 replies; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 13:53 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tuesday 02 February 2010 21:52:55 Ian Campbell wrote:
> On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
> > --- a/xen/arch/x86/hvm/irq.c
> > +++ b/xen/arch/x86/hvm/irq.c
> > @@ -46,8 +46,18 @@ static void __hvm_pci_intx_assert(
> >      if ( (hvm_irq->pci_link_assert_count[link]++ == 0) && isa_irq &&
> >           (hvm_irq->gsi_assert_count[isa_irq]++ == 0) )
> >      {
> > -        vioapic_irq_positive_edge(d, isa_irq);
> > -        vpic_irq_positive_edge(d, isa_irq);
> > +        if ( !is_hybrid_evtchn_enabled_domain(d) )
> > +        {
> > +            vioapic_irq_positive_edge(d, isa_irq);
> > +            vpic_irq_positive_edge(d, isa_irq);
> > +        }
> > +        else
> > +        {
> > +            /* TODO fix the critical region here */
> > +            spin_unlock(&d->arch.hvm_domain.irq_lock);
> > +            send_guest_global_virq(d, VIRQ_EMUL_PIN(isa_irq));
> > +            spin_lock(&d->arch.hvm_domain.irq_lock);
> > +       }
> >      }
> >  }
> 
> This can't be right, at least not without a big comment explaining why
> it is safe to drop the lock here...

... Would address this in the next version.

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:50         ` Ian Campbell
@ 2010-02-02 14:00           ` Tim Deegan
  2010-02-02 14:22             ` Sheng Yang
  2010-02-02 14:28           ` Sheng Yang
  1 sibling, 1 reply; 35+ messages in thread
From: Tim Deegan @ 2010-02-02 14:00 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel@lists.xensource.com, Keir Fraser, Sheng Yang

At 13:50 +0000 on 02 Feb (1265118605), Ian Campbell wrote:
> I wonder if we could turn things around and have the guest pick some
> pages and tell the hypervisor to put the grant table there, removing the
> need to reserve any of the physical address space up front. 

Yes; this is how all existing PV-on-HVM drivers work, using the
grant-table hypercall.  I don't see the need to add another mechanism.

Tim.

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Citrix Systems (R&D) Ltd.
[Company #02300071, SL9 0DZ, UK.]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:52     ` Sheng Yang
@ 2010-02-02 14:01       ` Keir Fraser
  2010-02-02 14:13         ` Sheng Yang
  0 siblings, 1 reply; 35+ messages in thread
From: Keir Fraser @ 2010-02-02 14:01 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Ian Campbell, xen-devel@lists.xensource.com

On 02/02/2010 13:52, "Sheng Yang" <sheng@linux.intel.com> wrote:

>> I didn't even find where these get used, except to reserve an area in e820,
>> and it wasn't clear why that reservation is necessary.
>> 
> It has been used in the last [6/6] patch of Linux kernel side, which would use
> the pages to map grant table. It works the same as the MMIO region in PVonHVM
> device. Reserve it in BIOS because we think it's more elegant than depends on
> QEmu to provide the reserved memory space.

Hmm. Can't this be done in phases? It seems unnecessary to be making changes
solely to remove qemu dependencies in the intial patchset, when the patchset
does not actually achieve that aim.

I think an HVM guest with no PCI space is a little way off, and perhaps we
can find a better way than hardcoding an address in two places.

 -- Keir

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:37   ` Sheng Yang
@ 2010-02-02 14:03     ` Keir Fraser
  2010-02-02 14:08       ` Sheng Yang
  0 siblings, 1 reply; 35+ messages in thread
From: Keir Fraser @ 2010-02-02 14:03 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com

On 02/02/2010 13:37, "Sheng Yang" <sheng@linux.intel.com> wrote:

>>> 1. Enable SMP support through VCPU_OP in arch_set_info_guest().
>> 
>> What does that mean and why would we want it?
> 
> It's not a good description, sorry... We need a entry address and boot up code
> AP. For that purpose, I reuse VCPUOP_initialise and VCPUOP_up hypercall. The
> former guest set the entry address of AP, the later one boot up them. For
> this, I add some code in arch_set_info_guest() to initialize the AP(mostly
> what we did after INIT-SIPI-SIPI sequence), that's what I means here...

Okay, so that leads to the obvious next question: why do you want to avoid
using INIT-SIPI-SIPI?

 -- Keir

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:52     ` Ian Campbell
@ 2010-02-02 14:04       ` Stefano Stabellini
  2010-02-02 14:07       ` Sheng Yang
  1 sibling, 0 replies; 35+ messages in thread
From: Stefano Stabellini @ 2010-02-02 14:04 UTC (permalink / raw)
  To: Ian Campbell; +Cc: Keir, Keir Fraser, xen-devel@lists.xensource.com, Sheng Yang

On Tue, 2 Feb 2010, Ian Campbell wrote:
> It looks like you are using evtchn's for all interrupt injection,
> including any emulated or passthrough devices which may be present.
> Using evtchn's for PV devices obviously makes sense but I think this
> needs to coexist with emulated interrupt injection for non-PV devices so
> the IOAPIC/APIC should not be mutually exclusive with using PV evtchns.
> 

Indeed.
Evtchn's for PV devices are more than welcome, but this:

> diff --git a/xen/arch/x86/hvm/irq.c b/xen/arch/x86/hvm/irq.c
> --- a/xen/arch/x86/hvm/irq.c
> +++ b/xen/arch/x86/hvm/irq.c
> @@ -46,8 +46,18 @@ static void __hvm_pci_intx_assert(
>      if ( (hvm_irq->pci_link_assert_count[link]++ == 0) && isa_irq &&
>           (hvm_irq->gsi_assert_count[isa_irq]++ == 0) )
>      {
> -        vioapic_irq_positive_edge(d, isa_irq);
> -        vpic_irq_positive_edge(d, isa_irq);
> +        if ( !is_hybrid_evtchn_enabled_domain(d) )
> +        {
> +            vioapic_irq_positive_edge(d, isa_irq);
> +            vpic_irq_positive_edge(d, isa_irq);
> +        }
> +        else
> +        {
> +            /* TODO fix the critical region here */
> +            spin_unlock(&d->arch.hvm_domain.irq_lock);
> +            send_guest_global_virq(d, VIRQ_EMUL_PIN(isa_irq));
> +            spin_lock(&d->arch.hvm_domain.irq_lock);
> +	}
>      }
>  }
>  

and this:

> @@ -101,11 +114,21 @@ void hvm_isa_irq_assert(
>      if ( !__test_and_set_bit(isa_irq, &hvm_irq->isa_irq.i) &&
>           (hvm_irq->gsi_assert_count[gsi]++ == 0) )
>      {
> -        vioapic_irq_positive_edge(d, gsi);
> -        vpic_irq_positive_edge(d, isa_irq);
> +        if ( !is_hybrid_evtchn_enabled_domain(d) )
> +        {
> +            vioapic_irq_positive_edge(d, gsi);
> +            vpic_irq_positive_edge(d, isa_irq);
> +        }
> +        else
> +        {
> +            send_virq = 1;
> +        }
>      }
>  

and this:

> @@ -120,7 +143,10 @@ void hvm_isa_irq_deassert(
>  
>      if ( __test_and_clear_bit(isa_irq, &hvm_irq->isa_irq.i) &&
>           (--hvm_irq->gsi_assert_count[gsi] == 0) )
> -        vpic_irq_negative_edge(d, isa_irq);
> +    {
> +        if ( !is_hybrid_evtchn_enabled_domain(d) )
> +            vpic_irq_negative_edge(d, isa_irq);
> +    }
>  
>      spin_unlock(&d->arch.hvm_domain.irq_lock);
>  }
> 

I think they are a very bad idea and a maintenance pain, especially when
you think about pci passthrought.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:52     ` Ian Campbell
  2010-02-02 14:04       ` Stefano Stabellini
@ 2010-02-02 14:07       ` Sheng Yang
  2010-02-02 16:15         ` Ian Campbell
  1 sibling, 1 reply; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 14:07 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tuesday 02 February 2010 21:52:44 Ian Campbell wrote:
> On Tue, 2010-02-02 at 13:06 +0000, Sheng Yang wrote:
> > On Tuesday 02 February 2010 19:26:55 Ian Campbell wrote:
> > > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > > > +static hvm_hypercall_t *hvm_hypercall_hybrid64_table[NR_hypercalls]
> > > > = {
> > > > +    [ __HYPERVISOR_memory_op ] = (hvm_hypercall_t *)hvm_memory_op,
> > > > +    [ __HYPERVISOR_grant_table_op ] = (hvm_hypercall_t
> > > > *)hvm_grant_table_op,
> > > > +    HYPERCALL(xen_version),
> > > > +    HYPERCALL(console_io),
> > > > +    HYPERCALL(vcpu_op),
> > > > +    HYPERCALL(sched_op),
> > > > +    HYPERCALL(event_channel_op),
> > > > +    HYPERCALL(hvm_op),
> > > > +};
> > >
> > > Why not just expand the exiting hvm hypercall table to incorporate
> > > these new hypercalls?
> >
> > I am just afraid the normal HVM guest called these hypercalls would
> > result in some chaos, so add a limitation(for hybrid only) here. (I admit
> > it didn't much improve the security for a malicious guest...)
> 
> I don't think this limitation adds any meaningful security or reduction
> in chaos. A non-hybrid aware HVM guest simply won't make those
> hypercalls.

Um, yes... Would update this in the next version.
> 
> > > In fact, why is hybrid mode considered a separate mode by the
> > > hypervisor at all? Shouldn't it just be an extension to regular HVM
> > > mode which guests may choose to use? This seems like it would eliminate
> > > a bunch of the random conditionals.
> >
> > There is still something different from normal HVM guest. For example, to
> > use PV timer, we should clean the tsc offset in HVM domain; and for event
> > delivery, we would use predefined VIRQ rather than emulated IOAPIC/APIC.
> > These code are exclusive, we need them wrapped with flag(which we called
> > "hybrid"). The word "mode" here may be is inaccuracy, a "extension"
> > should be more proper. I would change the phrase next time.
> 
> But the old mechanisms (emulated IOAPIC etc) are still present until the
> enable_hybrid HVMOP is called, aren't they? Why can't you perform the
> switch at the point at which the new feature is requested by the guest
> e.g. when the VIRQ is configured?

Yes, they are there before the enable_hybrid HVMOP called. But sorry for I 
don't quite understand about the point "when the VIRQ is configured?"
> 
> It looks like you are using evtchn's for all interrupt injection,
> including any emulated or passthrough devices which may be present.
> Using evtchn's for PV devices obviously makes sense but I think this
> needs to coexist with emulated interrupt injection for non-PV devices so
> the IOAPIC/APIC should not be mutually exclusive with using PV evtchns.

Yes. similar to dom0(maybe more like pv_ops dom0). Currently passthrough 
devices have issue with it, due to lack of MSI/MSI-X support(which would be 
our next step and can be shared with pv_ops dom0); for INTx method, because 
event channel is edge triggered, but INTx is level triggered, so they also 
can't be used. Passthrough devices can be used with some hack, if MSI2INTX 
translation is worked.

But let IOAPIC/APIC coexisted with event channel is not our target. As we 
know, the overhead brought by them, notably EOI by LAPIC, impact performance 
badly. We want event channel that because event channel have much less 
overhead compared to IOAPIC/LAPIC, a completely virtualization aware solution 
which elimiate all the unnecessary overhead. That's what we want Xen guest to 
be benefit.

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 14:03     ` Keir Fraser
@ 2010-02-02 14:08       ` Sheng Yang
  2010-02-02 14:32         ` Keir Fraser
  0 siblings, 1 reply; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 14:08 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel@lists.xensource.com

On Tuesday 02 February 2010 22:03:31 Keir Fraser wrote:
> On 02/02/2010 13:37, "Sheng Yang" <sheng@linux.intel.com> wrote:
> >>> 1. Enable SMP support through VCPU_OP in arch_set_info_guest().
> >>
> >> What does that mean and why would we want it?
> >
> > It's not a good description, sorry... We need a entry address and boot up
> > code AP. For that purpose, I reuse VCPUOP_initialise and VCPUOP_up
> > hypercall. The former guest set the entry address of AP, the later one
> > boot up them. For this, I add some code in arch_set_info_guest() to
> > initialize the AP(mostly what we did after INIT-SIPI-SIPI sequence),
> > that's what I means here...
> 
> Okay, so that leads to the obvious next question: why do you want to avoid
> using INIT-SIPI-SIPI?
> 
Because we don't have IOAPIC/LAPIC...

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 14:01       ` Keir Fraser
@ 2010-02-02 14:13         ` Sheng Yang
  0 siblings, 0 replies; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 14:13 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Ian Campbell, xen-devel@lists.xensource.com

On Tuesday 02 February 2010 22:01:43 Keir Fraser wrote:
> On 02/02/2010 13:52, "Sheng Yang" <sheng@linux.intel.com> wrote:
> >> I didn't even find where these get used, except to reserve an area in
> >> e820, and it wasn't clear why that reservation is necessary.
> >
> > It has been used in the last [6/6] patch of Linux kernel side, which
> > would use the pages to map grant table. It works the same as the MMIO
> > region in PVonHVM device. Reserve it in BIOS because we think it's more
> > elegant than depends on QEmu to provide the reserved memory space.
> 
> Hmm. Can't this be done in phases? It seems unnecessary to be making
>  changes solely to remove qemu dependencies in the intial patchset, when
>  the patchset does not actually achieve that aim.
> 
> I think an HVM guest with no PCI space is a little way off, and perhaps we
> can find a better way than hardcoding an address in two places.

Sure. I would try stick to old QEmu provided MMIO first. (Maybe try Ian 
Campbell's advice to get some more flexible ones later).

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 14:00           ` Tim Deegan
@ 2010-02-02 14:22             ` Sheng Yang
  0 siblings, 0 replies; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 14:22 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel@lists.xensource.com, Ian Campbell, Keir Fraser

On Tuesday 02 February 2010 22:00:47 Tim Deegan wrote:
> At 13:50 +0000 on 02 Feb (1265118605), Ian Campbell wrote:
> > I wonder if we could turn things around and have the guest pick some
> > pages and tell the hypervisor to put the grant table there, removing the
> > need to reserve any of the physical address space up front.
> 
> Yes; this is how all existing PV-on-HVM drivers work, using the
> grant-table hypercall.  I don't see the need to add another mechanism.

...Not quite understand. I think the PV-on-HVM drivers reserved the memory 
space in QEmu provided device's MMIO space, similar to what I did here(except 
reserved in BIOS)?...

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 13:50         ` Ian Campbell
  2010-02-02 14:00           ` Tim Deegan
@ 2010-02-02 14:28           ` Sheng Yang
  1 sibling, 0 replies; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 14:28 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Tuesday 02 February 2010 21:50:05 Ian Campbell wrote:
> On Tue, 2010-02-02 at 13:28 +0000, Sheng Yang wrote:
> > On Tuesday 02 February 2010 21:19:32 Ian Campbell wrote:
> > > On Tue, 2010-02-02 at 12:54 +0000, Sheng Yang wrote:
> > > > On Tuesday 02 February 2010 19:22:21 Ian Campbell wrote:
> > > > > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > > > > > +/* Reserve 128KB for grant table */
> > > > > > +#define GNTTAB_MEMBASE     0xfbfe0000
> > > > > > +#define GNTTAB_MEMSIZE     0x20000
> > > > >
> > > > > Why is this necessary? Isn't the grant table contained within one
> > > > > of the BARS on the virtual PCI device? What needs grant tables for
> > > > > prior to the kernel finding the PCI device which necessitates
> > > > > hardcoding these addresses in both guest and hypervisor?
> > > >
> > > > Thanks for so quick and detail comments. :)
> > > >
> > > > And this one is purposed, because we don't want to depends on QEmu.
> > > > As you see, we now have PV drivers, QEmu is an alternative way to
> > > > provide device model now. We think that still involving QEmu as a
> > > > requirement is somehow strange. So this reserved region is there, to
> > > > drop the dependence with QEmu to provide PV driver(PV driver depends
> > > > on QEmu is still strange, right?).
> > >
> > > So with your patchset you can run an HVM guest with no qemu process at
> > > all? What about the other emulated devices which have no PV equivalent?
> > > How does the VM boot, does the BIOS have pv INT 13 handler?
> >
> > No, not currently... Sorry for confusion.
> >
> > We just think QEmu provided the availability of PV driver seems
> > unelegant, so we want to decouple them. Because what QEmu provided is a
> > PCI IRQ and a MMIO region for grant table. And event channel is available
> > without that PCI IRQ, so we think decouple MMIO from QEmu should be more
> > elegant...
> 
> OK, but in that case I think we should have a mechanism for the guest to
> query the location of the grant table pages (hypercall, MSR etc) rather
> than hardcoding a magic address. It may well end up being hardcoded on
> the tools/hypervisor side for now but there is no reason to expose that
> to the guest.

Yes. I would try to use QEmu provided device for this - though I think it may 
still have problem if I use PCI probe mechanism to find the address, because 
the initialization of grant table maybe earlier.

Rather than probe as a PCI device, I prefer a query or hardcode in this case. 
> 
> I wonder if we could turn things around and have the guest pick some
> pages and tell the hypervisor to put the grant table there, removing the
> need to reserve any of the physical address space up front. How do
> full-PV guests find their grant table, is that a mechanism which could
> be re-used here instead of reserving magic regions?

I would check if we can follow the PV solution.(I remember I once checked 
that, but forgot the reason for not doing so now...)

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 14:08       ` Sheng Yang
@ 2010-02-02 14:32         ` Keir Fraser
  2010-02-02 14:37           ` Keir Fraser
  2010-02-02 14:39           ` Sheng Yang
  0 siblings, 2 replies; 35+ messages in thread
From: Keir Fraser @ 2010-02-02 14:32 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com

On 02/02/2010 14:08, "Sheng Yang" <sheng@linux.intel.com> wrote:

>> Okay, so that leads to the obvious next question: why do you want to avoid
>> using INIT-SIPI-SIPI?
>> 
> Because we don't have IOAPIC/LAPIC...

Is it necessary to remove the LAPICs completely? If you go very far down the
route of ripping emulated stuff out of HVM, it starts to feel like starting
with a pure PV guest and HVMing it up is closer in spirit to what you might
be aiming for.

 -- Keir

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 14:32         ` Keir Fraser
@ 2010-02-02 14:37           ` Keir Fraser
  2010-02-02 15:51             ` Sheng Yang
  2010-02-02 14:39           ` Sheng Yang
  1 sibling, 1 reply; 35+ messages in thread
From: Keir Fraser @ 2010-02-02 14:37 UTC (permalink / raw)
  To: Sheng Yang; +Cc: xen-devel@lists.xensource.com

On 02/02/2010 14:32, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

>>> Okay, so that leads to the obvious next question: why do you want to avoid
>>> using INIT-SIPI-SIPI?
>>> 
>> Because we don't have IOAPIC/LAPIC...
> 
> Is it necessary to remove the LAPICs completely? If you go very far down the
> route of ripping emulated stuff out of HVM, it starts to feel like starting
> with a pure PV guest and HVMing it up is closer in spirit to what you might
> be aiming for.

The other thing is, removing some of this stuff weakens the argument that
the hybrid approach lets us 'ride the wave' of improving hardware virt
support. For example, LAPIC is pretty architectural these days, and is ripe
for fuller hardware virtualisation. If you put HVM onto event channels and
totally rip out the LAPIC, where does that leave us if full LAPIC virt comes
along, with all its potential for especially improving device passthru
performance?

 -- Keir

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 14:32         ` Keir Fraser
  2010-02-02 14:37           ` Keir Fraser
@ 2010-02-02 14:39           ` Sheng Yang
  1 sibling, 0 replies; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 14:39 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel@lists.xensource.com, Eddie Dong, Jun Nakajima

On Tuesday 02 February 2010 22:32:09 Keir Fraser wrote:
> On 02/02/2010 14:08, "Sheng Yang" <sheng@linux.intel.com> wrote:
> >> Okay, so that leads to the obvious next question: why do you want to
> >> avoid using INIT-SIPI-SIPI?
> >
> > Because we don't have IOAPIC/LAPIC...
> 
> Is it necessary to remove the LAPICs completely? If you go very far down
>  the route of ripping emulated stuff out of HVM, it starts to feel like
>  starting with a pure PV guest and HVMing it up is closer in spirit to what
>  you might be aiming for.

The performance of interrupt delivery is what we care about. The IOAPIC/LAPIC 
don't work well now, so we want event channel to eliminate their overhead, 
notably EOI in LAPIC.

And you know, start from a PV guest is another story...

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 14:37           ` Keir Fraser
@ 2010-02-02 15:51             ` Sheng Yang
  0 siblings, 0 replies; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 15:51 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel@lists.xensource.com, Eddie Dong, Jun Nakajima

On Tuesday 02 February 2010 22:37:06 Keir Fraser wrote:
> On 02/02/2010 14:32, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:
> >>> Okay, so that leads to the obvious next question: why do you want to
> >>> avoid using INIT-SIPI-SIPI?
> >>
> >> Because we don't have IOAPIC/LAPIC...
> >
> > Is it necessary to remove the LAPICs completely? If you go very far down
> > the route of ripping emulated stuff out of HVM, it starts to feel like
> > starting with a pure PV guest and HVMing it up is closer in spirit to
> > what you might be aiming for.
> 
> The other thing is, removing some of this stuff weakens the argument that
> the hybrid approach lets us 'ride the wave' of improving hardware virt
> support. For example, LAPIC is pretty architectural these days, and is ripe
> for fuller hardware virtualisation. If you put HVM onto event channels and
> totally rip out the LAPIC, where does that leave us if full LAPIC virt
>  comes along, with all its potential for especially improving device
>  passthru performance?

In fact, our target is improving device passthru performance(before full LAPIC 
virtualziation ready in every server). And what we mostly targeted is MSI/MSI-
X interrupt intensive devices. This didn't include in this patchset(we have 
experiment patches of course), because that can't avoid touching the some 
generic kernel code, e.g. MSI related things in ioapic.c, make it harder to be 
checked in. The code to support hybrid MSI/MSI-X device can be shared with 
pv_ops dom0, so that is what we planed next step.

And these hybrid features are component based. I admit, sooner or later, these 
PV solutions would be replaced with hardware features. If you got new enough 
hardwares, you can simply disable them; if you didn't, you can still benefit 
from them. Of course, as you know, it takes time to get new enough machine for 
everyone. So I think, it's better to do something now rather than simply 
waiting for hardware feature come. :)

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 14:07       ` Sheng Yang
@ 2010-02-02 16:15         ` Ian Campbell
  2010-02-02 16:31           ` Sheng Yang
  0 siblings, 1 reply; 35+ messages in thread
From: Ian Campbell @ 2010-02-02 16:15 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Keir, xen-devel@lists.xensource.com, Fraser

On Tue, 2010-02-02 at 14:07 +0000, Sheng Yang wrote:
> On Tuesday 02 February 2010 21:52:44 Ian Campbell wrote:
> > On Tue, 2010-02-02 at 13:06 +0000, Sheng Yang wrote:
> > > On Tuesday 02 February 2010 19:26:55 Ian Campbell wrote:
> > > > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > > > In fact, why is hybrid mode considered a separate mode by the
> > > > hypervisor at all? Shouldn't it just be an extension to regular HVM
> > > > mode which guests may choose to use? This seems like it would eliminate
> > > > a bunch of the random conditionals.
> > >
> > > There is still something different from normal HVM guest. For example, to
> > > use PV timer, we should clean the tsc offset in HVM domain; and for event
> > > delivery, we would use predefined VIRQ rather than emulated IOAPIC/APIC.
> > > These code are exclusive, we need them wrapped with flag(which we called
> > > "hybrid"). The word "mode" here may be is inaccuracy, a "extension"
> > > should be more proper. I would change the phrase next time.
> > 
> > But the old mechanisms (emulated IOAPIC etc) are still present until the
> > enable_hybrid HVMOP is called, aren't they? Why can't you perform the
> > switch at the point at which the new feature is requested by the guest
> > e.g. when the VIRQ is configured?
> 
> Yes, they are there before the enable_hybrid HVMOP called. But sorry for I 
> don't quite understand about the point "when the VIRQ is configured?"

I just meant that you should enable evtchn mode at the point at which
the guest tries to use event channels and not in some specific "enable
hybrid mode call", similarly for PV-timer mode etc.

Of course that presupposes that event channels and APIC etc are mutually
exclusive, which I don't think is a given.

I'm of the opinion that the word "hybrid" should not occur anywhere in
the tools or hypervisor -- instead there should simply be PV
functionalities which are made available to HVM guest kernels which
request them.

I'm similarly of the opinion that the hybrid-ness should not leak out of
the core Xen-aware kernel code, for example the word hybrid should not
be used in the front end drivers, rather the differences should be
encapsulated in the evtchn code (etc).

> But let IOAPIC/APIC coexisted with event channel is not our target. As
> we know, the overhead brought by them, notably EOI by LAPIC, impact
> performance badly. We want event channel that because event channel
> have much less overhead compared to IOAPIC/LAPIC, a completely
> virtualization aware solution which elimiate all the unnecessary
> overhead. That's what we want Xen guest to be benefit.

It certainly makes sense to deliver interrupts to PV drivers via
evtchns. I don't think it necessarily follows that all interrupts should
be injected via event channels. For example for low throughput emulated
devices I think it makes sense to continue to inject interrupts via the
emulated *APIC paths such that the treatment of a given device is either
consistently emulated or consistently paravirtualised but not some
mixture.

Ian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 16:15         ` Ian Campbell
@ 2010-02-02 16:31           ` Sheng Yang
  2010-02-02 18:03             ` Ian Campbell
  2010-02-02 18:27             ` Stefano Stabellini
  0 siblings, 2 replies; 35+ messages in thread
From: Sheng Yang @ 2010-02-02 16:31 UTC (permalink / raw)
  To: Ian Campbell; +Cc: xen-devel@lists.xensource.com, Keir Fraser

On Wednesday 03 February 2010 00:15:50 Ian Campbell wrote:
> On Tue, 2010-02-02 at 14:07 +0000, Sheng Yang wrote:
> > On Tuesday 02 February 2010 21:52:44 Ian Campbell wrote:
> > > On Tue, 2010-02-02 at 13:06 +0000, Sheng Yang wrote:
> > > > On Tuesday 02 February 2010 19:26:55 Ian Campbell wrote:
> > > > > On Tue, 2010-02-02 at 08:16 +0000, Sheng Yang wrote:
> > > > > In fact, why is hybrid mode considered a separate mode by the
> > > > > hypervisor at all? Shouldn't it just be an extension to regular HVM
> > > > > mode which guests may choose to use? This seems like it would
> > > > > eliminate a bunch of the random conditionals.
> > > >
> > > > There is still something different from normal HVM guest. For
> > > > example, to use PV timer, we should clean the tsc offset in HVM
> > > > domain; and for event delivery, we would use predefined VIRQ rather
> > > > than emulated IOAPIC/APIC. These code are exclusive, we need them
> > > > wrapped with flag(which we called "hybrid"). The word "mode" here may
> > > > be is inaccuracy, a "extension" should be more proper. I would change
> > > > the phrase next time.
> > >
> > > But the old mechanisms (emulated IOAPIC etc) are still present until
> > > the enable_hybrid HVMOP is called, aren't they? Why can't you perform
> > > the switch at the point at which the new feature is requested by the
> > > guest e.g. when the VIRQ is configured?
> >
> > Yes, they are there before the enable_hybrid HVMOP called. But sorry for
> > I don't quite understand about the point "when the VIRQ is configured?"
> 
> I just meant that you should enable evtchn mode at the point at which
> the guest tries to use event channels and not in some specific "enable
> hybrid mode call", similarly for PV-timer mode etc.
> 
> Of course that presupposes that event channels and APIC etc are mutually
> exclusive, which I don't think is a given.

I would give it try. Seems I need to bind tsc offset clean in event channel 
binding hypercall(if I didn't introduce another hypercall), sounds a little 
messy.
 
> I'm of the opinion that the word "hybrid" should not occur anywhere in
> the tools or hypervisor -- instead there should simply be PV
> functionalities which are made available to HVM guest kernels which
> request them.
> 
> I'm similarly of the opinion that the hybrid-ness should not leak out of
> the core Xen-aware kernel code, for example the word hybrid should not
> be used in the front end drivers, rather the differences should be
> encapsulated in the evtchn code (etc).

That's reasonable. I would try to make them more like natural PV feature.
> 
> > But let IOAPIC/APIC coexisted with event channel is not our target. As
> > we know, the overhead brought by them, notably EOI by LAPIC, impact
> > performance badly. We want event channel that because event channel
> > have much less overhead compared to IOAPIC/LAPIC, a completely
> > virtualization aware solution which elimiate all the unnecessary
> > overhead. That's what we want Xen guest to be benefit.
> 
> It certainly makes sense to deliver interrupts to PV drivers via
> evtchns. I don't think it necessarily follows that all interrupts should
> be injected via event channels. For example for low throughput emulated
> devices I think it makes sense to continue to inject interrupts via the
> emulated *APIC paths such that the treatment of a given device is either
> consistently emulated or consistently paravirtualised but not some
> mixture.

Well, for us, we want evtchn because we want to improve interrupt intensive 
passthru device's performance(though too big for the first step, we have 
experiment patches, but would like to consolidate with the solution of pv_ops 
dom0). The situation won't change if we still use emulated APIC path...

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 16:31           ` Sheng Yang
@ 2010-02-02 18:03             ` Ian Campbell
  2010-02-02 18:27             ` Stefano Stabellini
  1 sibling, 0 replies; 35+ messages in thread
From: Ian Campbell @ 2010-02-02 18:03 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Keir, xen-devel@lists.xensource.com, Fraser

On Tue, 2010-02-02 at 16:31 +0000, Sheng Yang wrote:
> 
> > I just meant that you should enable evtchn mode at the point at
> which
> > the guest tries to use event channels and not in some specific
> "enable
> > hybrid mode call", similarly for PV-timer mode etc.
> > 
> > Of course that presupposes that event channels and APIC etc are
> mutually
> > exclusive, which I don't think is a given.
> 
> I would give it try. Seems I need to bind tsc offset clean in event
> channel binding hypercall(if I didn't introduce another hypercall),
> sounds a little messy. 

I guess to clarify what I really meant was in the code which enables the
PV timer for a domain, rather than specifically in the event channel
hypercall which implements the binding to the timer evtchn. I'm not sure
if enabling the timer is a separate hypercall or is just a side effect
of binding to the evtchn.

Ian.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 16:31           ` Sheng Yang
  2010-02-02 18:03             ` Ian Campbell
@ 2010-02-02 18:27             ` Stefano Stabellini
  2010-02-03  5:15               ` Sheng Yang
  1 sibling, 1 reply; 35+ messages in thread
From: Stefano Stabellini @ 2010-02-02 18:27 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Ian Campbell, xen-devel@lists.xensource.com, Keir Fraser

On Tue, 2 Feb 2010, Sheng Yang wrote:
> Well, for us, we want evtchn because we want to improve interrupt intensive 
> passthru device's performance(though too big for the first step, we have 
> experiment patches, but would like to consolidate with the solution of pv_ops 
> dom0). The situation won't change if we still use emulated APIC path...
> 

I think you should keep this as the last step: once you have all the
other PV features working on HVM like Ian suggested, send a patch to
allow a guest kernel to switch from emulated APIC to evtchn for every
device.
If you manage to keep it simple, it could be a win for everyone.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-02 18:27             ` Stefano Stabellini
@ 2010-02-03  5:15               ` Sheng Yang
  2010-02-03 10:39                 ` Tim Deegan
  0 siblings, 1 reply; 35+ messages in thread
From: Sheng Yang @ 2010-02-03  5:15 UTC (permalink / raw)
  To: Stefano Stabellini
  Cc: Ian Campbell, xen-devel@lists.xensource.com, Keir Fraser

On Wednesday 03 February 2010 02:27:42 Stefano Stabellini wrote:
> On Tue, 2 Feb 2010, Sheng Yang wrote:
> > Well, for us, we want evtchn because we want to improve interrupt
> > intensive passthru device's performance(though too big for the first
> > step, we have experiment patches, but would like to consolidate with the
> > solution of pv_ops dom0). The situation won't change if we still use
> > emulated APIC path...
> 
> I think you should keep this as the last step: once you have all the
> other PV features working on HVM like Ian suggested, send a patch to
> allow a guest kernel to switch from emulated APIC to evtchn for every
> device.
> If you manage to keep it simple, it could be a win for everyone.
> 
Make event channel coexist with IOAPIC/LAPIC is quite different from what we 
are doing now, and Xen already have PV-on-HVM for that.

Of course, I also want to have a elegant solution in the end, so I would try 
hard to keep things simple, and keep the code clear.

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH][v2] Hybrid extension support in Xen
  2010-02-03  5:15               ` Sheng Yang
@ 2010-02-03 10:39                 ` Tim Deegan
  0 siblings, 0 replies; 35+ messages in thread
From: Tim Deegan @ 2010-02-03 10:39 UTC (permalink / raw)
  To: Sheng Yang
  Cc: Ian Campbell, xen-devel@lists.xensource.com, Keir Fraser,
	Stefano Stabellini

At 05:15 +0000 on 03 Feb (1265174106), Sheng Yang wrote:
> Make event channel coexist with IOAPIC/LAPIC is quite different from what we 
> are doing now, and Xen already have PV-on-HVM for that.

Indeed it does.  I don't see any advantage in adding more mechanism to
both Xen and linux to give us something we basically already have.

Cheers,

Tim.

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Citrix Systems (R&D) Ltd.
[Company #02300071, SL9 0DZ, UK.]

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2010-02-03 10:39 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-02-02  8:16 [PATCH][v2] Hybrid extension support in Xen Sheng Yang
2010-02-02 11:22 ` Ian Campbell
2010-02-02 12:54   ` Sheng Yang
2010-02-02 13:19     ` Ian Campbell
2010-02-02 13:28       ` Sheng Yang
2010-02-02 13:50         ` Ian Campbell
2010-02-02 14:00           ` Tim Deegan
2010-02-02 14:22             ` Sheng Yang
2010-02-02 14:28           ` Sheng Yang
2010-02-02 13:35   ` Keir Fraser
2010-02-02 13:52     ` Sheng Yang
2010-02-02 14:01       ` Keir Fraser
2010-02-02 14:13         ` Sheng Yang
2010-02-02 11:26 ` Ian Campbell
2010-02-02 13:06   ` Sheng Yang
2010-02-02 13:52     ` Ian Campbell
2010-02-02 14:04       ` Stefano Stabellini
2010-02-02 14:07       ` Sheng Yang
2010-02-02 16:15         ` Ian Campbell
2010-02-02 16:31           ` Sheng Yang
2010-02-02 18:03             ` Ian Campbell
2010-02-02 18:27             ` Stefano Stabellini
2010-02-03  5:15               ` Sheng Yang
2010-02-03 10:39                 ` Tim Deegan
2010-02-02 11:32 ` Paul Durrant
2010-02-02 13:23 ` Keir Fraser
2010-02-02 13:37   ` Sheng Yang
2010-02-02 14:03     ` Keir Fraser
2010-02-02 14:08       ` Sheng Yang
2010-02-02 14:32         ` Keir Fraser
2010-02-02 14:37           ` Keir Fraser
2010-02-02 15:51             ` Sheng Yang
2010-02-02 14:39           ` Sheng Yang
2010-02-02 13:52 ` Ian Campbell
2010-02-02 13:53   ` Sheng Yang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.