[v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts

All of lore.kernel.org
 help / color / mirror / Atom feed

* [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-07-23 11:35 [v4 00/17] Add VT-d Posted-Interrupts support Feng Wu
@ 2015-07-23 11:35 ` Feng Wu
  2015-07-23 12:50   ` Dario Faggioli
  2015-07-28 14:15   ` Dario Faggioli
  0 siblings, 2 replies; 10+ messages in thread
From: Feng Wu @ 2015-07-23 11:35 UTC (permalink / raw)
  To: xen-devel
  Cc: Kevin Tian, Keir Fraser, George Dunlap, Andrew Cooper,
	Dario Faggioli, Jan Beulich, Feng Wu

This patch adds the following arch hooks in scheduler:
- vmx_pre_ctx_switch_pi():
It is called before context switch, we update the posted
interrupt descriptor when the vCPU is preempted, go to sleep,
or is blocked.

- vmx_post_ctx_switch_pi()
It is called after context switch, we update the posted
interrupt descriptor when the vCPU is going to run.

- arch_vcpu_wake()
It will be called when waking up the vCPU, we update
the posted interrupt descriptor when the vCPU is unblocked.

CC: Keir Fraser <keir@xen.org>
CC: Jan Beulich <jbeulich@suse.com>
CC: Andrew Cooper <andrew.cooper3@citrix.com>
CC: Kevin Tian <kevin.tian@intel.com>
CC: George Dunlap <george.dunlap@eu.citrix.com>
CC: Dario Faggioli <dario.faggioli@citrix.com>
Sugguested-by: Dario Faggioli <dario.faggioli@citrix.com>
Signed-off-by: Feng Wu <feng.wu@intel.com>
---
v4:
- Newly added

 xen/arch/x86/domain.c              |  10 +++
 xen/arch/x86/hvm/vmx/vmx.c         | 145 +++++++++++++++++++++++++++++++++++++
 xen/common/schedule.c              |   2 +
 xen/include/asm-x86/domain.h       |   3 +
 xen/include/asm-x86/hvm/hvm.h      |   2 +
 xen/include/asm-x86/hvm/vmx/vmcs.h |   8 ++
 6 files changed, 170 insertions(+)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index db073a6..36fd2d5 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1550,9 +1550,19 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
 
     set_current(next);
 
+    /*
+     * We need to update posted interrupt descriptor for each context switch,
+     * hence cannot use the lazy context switch for this.
+     */
+    if ( !is_idle_vcpu(prev) && prev->arch.pi_ctxt_switch_from )
+        prev->arch.pi_ctxt_switch_from(prev);
+
     if ( (per_cpu(curr_vcpu, cpu) == next) ||
          (is_idle_vcpu(next) && cpu_online(cpu)) )
     {
+        if ( !is_idle_vcpu(next) && next->arch.pi_ctxt_switch_to )
+            next->arch.pi_ctxt_switch_to(next);
+
         local_irq_enable();
     }
     else
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index f2b7fe0..f8b3601 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -67,6 +67,8 @@ enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
 
 static void vmx_ctxt_switch_from(struct vcpu *v);
 static void vmx_ctxt_switch_to(struct vcpu *v);
+static void vmx_pre_ctx_switch_pi(struct vcpu *v);
+static void vmx_post_ctx_switch_pi(struct vcpu *v);
 
 static int  vmx_alloc_vlapic_mapping(struct domain *d);
 static void vmx_free_vlapic_mapping(struct domain *d);
@@ -116,10 +118,17 @@ static int vmx_vcpu_initialise(struct vcpu *v)
     INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocked_vcpu_list);
     INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_vcpu_on_set_list);
 
+    v->arch.hvm_vmx.pi_block_cpu = -1;
+
+    spin_lock_init(&v->arch.hvm_vmx.pi_lock);
+
     v->arch.schedule_tail    = vmx_do_resume;
     v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
     v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
 
+    v->arch.pi_ctxt_switch_from = vmx_pre_ctx_switch_pi;
+    v->arch.pi_ctxt_switch_to = vmx_post_ctx_switch_pi;
+
     if ( (rc = vmx_create_vmcs(v)) != 0 )
     {
         dprintk(XENLOG_WARNING,
@@ -715,6 +724,141 @@ static void vmx_fpu_leave(struct vcpu *v)
     }
 }
 
+void arch_vcpu_wake(struct vcpu *v)
+{
+    unsigned long gflags;
+
+    if ( !iommu_intpost || !is_hvm_vcpu(v) || !has_arch_pdevs(v->domain) )
+        return;
+
+    spin_lock_irqsave(&v->arch.hvm_vmx.pi_lock, gflags);
+
+    if ( likely(vcpu_runnable(v)) ||
+         !test_bit(_VPF_blocked, &v->pause_flags) )
+    {
+        struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+        unsigned long flags;
+
+        /*
+         * We don't need to send notification event to a non-running
+         * vcpu, the interrupt information will be delivered to it before
+         * VM-ENTRY when the vcpu is scheduled to run next time.
+         */
+        pi_set_sn(pi_desc);
+
+        /*
+         * Set 'NV' feild back to posted_intr_vector, so the
+         * Posted-Interrupts can be delivered to the vCPU by
+         * VT-d HW after it is scheduled to run.
+         */
+        write_atomic((uint8_t*)&pi_desc->nv, posted_intr_vector);
+
+        /*
+         * Delete the vCPU from the related block list
+         * if we are resuming from blocked state
+         */
+        if ( v->arch.hvm_vmx.pi_block_cpu != -1 )
+        {
+            spin_lock_irqsave(&per_cpu(pi_blocked_vcpu_lock,
+                              v->arch.hvm_vmx.pi_block_cpu), flags);
+            list_del_init(&v->arch.hvm_vmx.pi_blocked_vcpu_list);
+            spin_unlock_irqrestore(&per_cpu(pi_blocked_vcpu_lock,
+                                    v->arch.hvm_vmx.pi_block_cpu), flags);
+        }
+    }
+
+    spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_lock, gflags);
+}
+
+static void vmx_pre_ctx_switch_pi(struct vcpu *v)
+{
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+    struct pi_desc old, new;
+    unsigned long flags, gflags;
+
+    if ( !iommu_intpost || !is_hvm_vcpu(v) || !has_arch_pdevs(v->domain) )
+        return;
+
+    spin_lock_irqsave(&v->arch.hvm_vmx.pi_lock, gflags);
+
+    if ( vcpu_runnable(v) || !test_bit(_VPF_blocked, &v->pause_flags) )
+    {
+        /*
+         * The vCPU has been preempted or went to sleep. We don't need to send
+         * notification event to a non-running vcpu, the interrupt information
+         * will be delivered to it before VM-ENTRY when the vcpu is scheduled
+         * to run next time.
+         */
+        pi_set_sn(pi_desc);
+
+    }
+    else if ( test_bit(_VPF_blocked, &v->pause_flags) )
+    {
+        /*
+         * The vCPU is blocking, we need to add it to one of the per pCPU lists.
+         * We save v->processor to v->arch.hvm_vmx.pi_block_cpu and use it for
+         * the per-CPU list, we also save it to posted-interrupt descriptor and
+         * make it as the destination of the wake-up notification event.
+         */
+        v->arch.hvm_vmx.pi_block_cpu = v->processor;
+        spin_lock_irqsave(&per_cpu(pi_blocked_vcpu_lock,
+                          v->arch.hvm_vmx.pi_block_cpu), flags);
+        list_add_tail(&v->arch.hvm_vmx.pi_blocked_vcpu_list,
+                      &per_cpu(pi_blocked_vcpu, v->arch.hvm_vmx.pi_block_cpu));
+        spin_unlock_irqrestore(&per_cpu(pi_blocked_vcpu_lock,
+                           v->arch.hvm_vmx.pi_block_cpu), flags);
+
+        do {
+            old.control = new.control = pi_desc->control;
+
+            /* Should not block the vCPU if an interrupt was posted for it */
+
+            if ( pi_test_on(&old) )
+            {
+                spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_lock, gflags);
+                vcpu_unblock(v);
+                return;
+            }
+
+            /*
+             * Change the 'NDST' field to v->arch.hvm_vmx.pi_block_cpu,
+             * so when external interrupts from assigned deivces happen,
+             * wakeup notifiction event will go to
+             * v->arch.hvm_vmx.pi_block_cpu, then in pi_wakeup_interrupt()
+             * we can find the vCPU in the right list to wake up.
+             */
+            if ( x2apic_enabled )
+                new.ndst = cpu_physical_id(v->arch.hvm_vmx.pi_block_cpu);
+            else
+                new.ndst = MASK_INSR(cpu_physical_id(
+                                 v->arch.hvm_vmx.pi_block_cpu),
+                                 PI_xAPIC_NDST_MASK);
+            pi_clear_sn(&new);
+            new.nv = pi_wakeup_vector;
+        } while ( cmpxchg(&pi_desc->control, old.control, new.control)
+                  != old.control );
+    }
+
+    spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_lock, gflags);
+}
+
+static void vmx_post_ctx_switch_pi(struct vcpu *v)
+{
+    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
+
+    if ( !iommu_intpost || !is_hvm_vcpu(v) || !has_arch_pdevs(v->domain) )
+        return;
+
+    if ( x2apic_enabled )
+        write_atomic(&pi_desc->ndst, cpu_physical_id(v->processor));
+    else
+        write_atomic(&pi_desc->ndst,
+                     MASK_INSR(cpu_physical_id(v->processor),
+                     PI_xAPIC_NDST_MASK));
+
+    pi_clear_sn(pi_desc);
+}
+
 static void vmx_ctxt_switch_from(struct vcpu *v)
 {
     /*
@@ -753,6 +897,7 @@ static void vmx_ctxt_switch_to(struct vcpu *v)
 
     vmx_restore_guest_msrs(v);
     vmx_restore_dr(v);
+    vmx_post_ctx_switch_pi(v);
 }
 
 
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 6b02f98..90616c4 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -381,6 +381,8 @@ void vcpu_wake(struct vcpu *v)
     unsigned long flags;
     spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);
 
+    arch_vcpu_wake(v);
+
     if ( likely(vcpu_runnable(v)) )
     {
         if ( v->runstate.state >= RUNSTATE_blocked )
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index a3c117f..ff7a995 100644
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -446,6 +446,9 @@ struct arch_vcpu
     void (*ctxt_switch_from) (struct vcpu *);
     void (*ctxt_switch_to) (struct vcpu *);
 
+    void (*pi_ctxt_switch_from) (struct vcpu *);
+    void (*pi_ctxt_switch_to) (struct vcpu *);
+
     /* Virtual Machine Extensions */
     union {
         struct pv_vcpu pv_vcpu;
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 77eeac5..eeabb32 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -509,6 +509,8 @@ bool_t nhvm_vmcx_hap_enabled(struct vcpu *v);
 /* interrupt */
 enum hvm_intblk nhvm_interrupt_blocked(struct vcpu *v);
 
+void arch_vcpu_wake(struct vcpu *v);
+
 #ifndef NDEBUG
 /* Permit use of the Forced Emulation Prefix in HVM guests */
 extern bool_t opt_hvm_fep;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index fdaf6fb..6ed2081 100644
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -165,6 +165,14 @@ struct arch_vmx_struct {
 
     struct list_head     pi_blocked_vcpu_list;
     struct list_head     pi_vcpu_on_set_list;
+
+    /*
+     * Before vCPU is blocked, it is added to the global per-cpu list
+     * of 'pi_block_cpu', then VT-d engine can send wakeup notification
+     * event to 'pi_block_cpu' and wakeup the related vCPU.
+     */
+    int                  pi_block_cpu;
+    spinlock_t           pi_lock;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-07-23 11:35 ` [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts Feng Wu
@ 2015-07-23 12:50   ` Dario Faggioli
  2015-07-24  0:49     ` Wu, Feng
  2015-07-28 14:15   ` Dario Faggioli
  1 sibling, 1 reply; 10+ messages in thread
From: Dario Faggioli @ 2015-07-23 12:50 UTC (permalink / raw)
  To: Feng Wu
  Cc: Kevin Tian, Keir Fraser, George Dunlap, Andrew Cooper, xen-devel,
	Jan Beulich


[-- Attachment #1.1: Type: text/plain, Size: 1221 bytes --]

On Thu, 2015-07-23 at 19:35 +0800, Feng Wu wrote:
> This patch adds the following arch hooks in scheduler:
>
So, preliminary question: does this mean that you have identified (and
fixed) the differences in behavior wrt the runstate based model, which
was causing performance issue?

I've been sidetracked a bit in looking at your previous patch, but did
found a couple of differences, which I was about to report... But I
guess that's no longer necessary, I guess, is it?

Dario

> - vmx_pre_ctx_switch_pi():
> It is called before context switch, we update the posted
> interrupt descriptor when the vCPU is preempted, go to sleep,
> or is blocked.
> 
> - vmx_post_ctx_switch_pi()
> It is called after context switch, we update the posted
> interrupt descriptor when the vCPU is going to run.
> 
> - arch_vcpu_wake()
> It will be called when waking up the vCPU, we update
> the posted interrupt descriptor when the vCPU is unblocked.

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-07-23 12:50   ` Dario Faggioli
@ 2015-07-24  0:49     ` Wu, Feng
  0 siblings, 0 replies; 10+ messages in thread
From: Wu, Feng @ 2015-07-24  0:49 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Tian, Kevin, Keir Fraser, George Dunlap, Andrew Cooper,
	xen-devel@lists.xen.org, Jan Beulich, Wu, Feng



> -----Original Message-----
> From: Dario Faggioli [mailto:dario.faggioli@citrix.com]
> Sent: Thursday, July 23, 2015 8:50 PM
> To: Wu, Feng
> Cc: xen-devel@lists.xen.org; Tian, Kevin; Keir Fraser; George Dunlap; Andrew
> Cooper; Jan Beulich
> Subject: Re: [Xen-devel] [v4 16/17] vmx: Add some scheduler hooks for VT-d
> posted interrupts
> 
> On Thu, 2015-07-23 at 19:35 +0800, Feng Wu wrote:
> > This patch adds the following arch hooks in scheduler:
> >
> So, preliminary question: does this mean that you have identified (and
> fixed) the differences in behavior wrt the runstate based model, which
> was causing performance issue?

Yes, the issue has been fixed in this version.

> 
> I've been sidetracked a bit in looking at your previous patch, but did
> found a couple of differences, which I was about to report... But I
> guess that's no longer necessary, I guess, is it?

Thanks for your effort, Dario! Maybe you can give a review about this
new version, if you have time.

Thanks,
Feng

> 
> Dario
> 
> > - vmx_pre_ctx_switch_pi():
> > It is called before context switch, we update the posted
> > interrupt descriptor when the vCPU is preempted, go to sleep,
> > or is blocked.
> >
> > - vmx_post_ctx_switch_pi()
> > It is called after context switch, we update the posted
> > interrupt descriptor when the vCPU is going to run.
> >
> > - arch_vcpu_wake()
> > It will be called when waking up the vCPU, we update
> > the posted interrupt descriptor when the vCPU is unblocked.
> 
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-07-23 11:35 ` [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts Feng Wu
  2015-07-23 12:50   ` Dario Faggioli
@ 2015-07-28 14:15   ` Dario Faggioli
  2015-07-30  2:04     ` Wu, Feng
  1 sibling, 1 reply; 10+ messages in thread
From: Dario Faggioli @ 2015-07-28 14:15 UTC (permalink / raw)
  To: Feng Wu
  Cc: Kevin Tian, Keir Fraser, George Dunlap, Andrew Cooper, xen-devel,
	Jan Beulich


[-- Attachment #1.1: Type: text/plain, Size: 7080 bytes --]

On Thu, 2015-07-23 at 19:35 +0800, Feng Wu wrote:
> This patch adds the following arch hooks in scheduler:
> - vmx_pre_ctx_switch_pi():
> It is called before context switch, we update the posted
> interrupt descriptor when the vCPU is preempted, go to sleep,
> or is blocked.
> 
> - vmx_post_ctx_switch_pi()
> It is called after context switch, we update the posted
> interrupt descriptor when the vCPU is going to run.
> 
> - arch_vcpu_wake()
> It will be called when waking up the vCPU, we update
> the posted interrupt descriptor when the vCPU is unblocked.
> 
Thanks again for doing this. I'll never get tired to say how much better
I like this, wrt the previous RUNSTATE_* based approach. :-)

> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1550,9 +1550,19 @@ void context_switch(struct vcpu *prev, struct vcpu *next)
>  
>      set_current(next);
>  
> +    /*
> +     * We need to update posted interrupt descriptor for each context switch,
> +     * hence cannot use the lazy context switch for this.
> +     */
>
Perhaps it's me, but I don't get the comment. Why do you mention "the
lazy context switch"? We can't use it "for this", as opposed to what
other circumstance where we can use it?

> +    if ( !is_idle_vcpu(prev) && prev->arch.pi_ctxt_switch_from )
> +        prev->arch.pi_ctxt_switch_from(prev);
> +
Flip the order in the conditions, maybe? So that guests with
pi_ctxt_switch_from==NULL suffers the least possible overhead?


>      if ( (per_cpu(curr_vcpu, cpu) == next) ||
>           (is_idle_vcpu(next) && cpu_online(cpu)) )
>      {
> +        if ( !is_idle_vcpu(next) && next->arch.pi_ctxt_switch_to )
>
Same as above.

> +            next->arch.pi_ctxt_switch_to(next);
> +
>          local_irq_enable();
>
Another thing: if prev == next --and let's call such vcpu pp-- you go
through both:

    pp->arch.pi_ctxt_switch_from(pp);
    pp->arch.pi_ctxt_switch_to(pp);

Is this really necessary? AFAICT, it reduces to:

    pi_set_sn(&pp->arch.hvm_vmx.pi_desc);
    if ( x2apic_enabled )
        write_atomic(&pp->arch.hvm_vmx.pi_desc->ndst,
                     cpu_physical_id(v->processor));
    else
        write_atomic(&pp->arch.hvm_vmx.pi_desc->ndst,
                     MASK_INSR(cpu_physical_id(v->processor),
                     PI_xAPIC_NDST_MASK));
    pi_clear_sn(pi_desc);

In fact, if the scheduler picked prev again, as the next vcpu to be run,
I expect

    ( cpu_runnable(v) || !test_bit(_VPF_blocked, &v->pause_flags) )

to be true, and hence vmx_pre_ctx_switch_pi() reducing to just a call to
pi_set_sn().

So, either it is the write_atomic() that you absolutely need, which I
don't think, or you really can check whether such situation is
occurring, and avoid going through _pre_ and _post_ back to back.

> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c

> @@ -116,10 +118,17 @@ static int vmx_vcpu_initialise(struct vcpu *v)
>      INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocked_vcpu_list);
>      INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_vcpu_on_set_list);
>  
> +    v->arch.hvm_vmx.pi_block_cpu = -1;
> +
> +    spin_lock_init(&v->arch.hvm_vmx.pi_lock);
> +
>      v->arch.schedule_tail    = vmx_do_resume;
>      v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
>      v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
>  
> +    v->arch.pi_ctxt_switch_from = vmx_pre_ctx_switch_pi;
> +    v->arch.pi_ctxt_switch_to = vmx_post_ctx_switch_pi;
> +
Would it make sense to get the two initialization above with:

  if ( iommu_intpost && is_hvm_vcpu(v) )

and leave them NULL if that is false? I think it works, and it looks
worthwhile to me.

In fact, this would make the operands flip suggested above actually
effective at cutting the overhead for non-PI enabled guests/hosts.

> +static void vmx_pre_ctx_switch_pi(struct vcpu *v)
> +{
> +    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> +    struct pi_desc old, new;
> +    unsigned long flags, gflags;
> +
> +    if ( !iommu_intpost || !is_hvm_vcpu(v) || !has_arch_pdevs(v->domain) )
> +        return;
> +
And, if you do as I suggest in context_switch() and
vmx_vcpu_initialise(), this will reduce to:

    if ( !has_arch_pdevs(v->domain) )
        return;

> +    else if ( test_bit(_VPF_blocked, &v->pause_flags) )
> +    {
> +        /*
> +         * The vCPU is blocking, we need to add it to one of the per pCPU lists.
> +         * We save v->processor to v->arch.hvm_vmx.pi_block_cpu and use it for
> +         * the per-CPU list, we also save it to posted-interrupt descriptor and
> +         * make it as the destination of the wake-up notification event.
> +         */
> +        v->arch.hvm_vmx.pi_block_cpu = v->processor;
> +        spin_lock_irqsave(&per_cpu(pi_blocked_vcpu_lock,
> +                          v->arch.hvm_vmx.pi_block_cpu), flags);
> +        list_add_tail(&v->arch.hvm_vmx.pi_blocked_vcpu_list,
> +                      &per_cpu(pi_blocked_vcpu, v->arch.hvm_vmx.pi_block_cpu));
> +        spin_unlock_irqrestore(&per_cpu(pi_blocked_vcpu_lock,
> +                           v->arch.hvm_vmx.pi_block_cpu), flags);
> +
> +        do {
> +            old.control = new.control = pi_desc->control;
> +
> +            /* Should not block the vCPU if an interrupt was posted for it */
> +
There's no need for the blank line, and I'd say kill it.

> +            if ( pi_test_on(&old) )
> +            {
> +                spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_lock, gflags);
> +                vcpu_unblock(v);
> +                return;
> +            }

> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -381,6 +381,8 @@ void vcpu_wake(struct vcpu *v)
>      unsigned long flags;
>      spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);
>  
> +    arch_vcpu_wake(v);
> +
So, in the draft you sent a few days back, this was called at the end of
vcpu_wake(), right before releasing the lock. Now it's at the beginning,
before the scheduler's wakeup routine has a chance to run.

IMO, it feels more natural for it to be at the bottom (i.e., generic
stuff first, arch specific stuff afterwards), and, after a quick
inspection, I don't think I see nothing preventing things to be that
way.

However, I recall you mentioning having issues with such draft, which
are now resolved with this version. Since this is one of the differences
between the two, was it the cause of the issues you were seeing? If yes,
can you elaborate on how and why?

In the end, I'm not too opposed to the hook being at the beginning
rather than at the end, but there has to be a reason, which may well end
up better be stated in a comment...

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-07-28 14:15   ` Dario Faggioli
@ 2015-07-30  2:04     ` Wu, Feng
  2015-07-30 18:26       ` Dario Faggioli
  0 siblings, 1 reply; 10+ messages in thread
From: Wu, Feng @ 2015-07-30  2:04 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Tian, Kevin, Keir Fraser, George Dunlap, Andrew Cooper,
	xen-devel@lists.xen.org, Jan Beulich, Wu, Feng



> -----Original Message-----
> From: Dario Faggioli [mailto:dario.faggioli@citrix.com]
> Sent: Tuesday, July 28, 2015 10:16 PM
> To: Wu, Feng
> Cc: xen-devel@lists.xen.org; Keir Fraser; Jan Beulich; Andrew Cooper; Tian,
> Kevin; George Dunlap
> Subject: Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted
> interrupts
> 
> On Thu, 2015-07-23 at 19:35 +0800, Feng Wu wrote:
> > This patch adds the following arch hooks in scheduler:
> > - vmx_pre_ctx_switch_pi():
> > It is called before context switch, we update the posted
> > interrupt descriptor when the vCPU is preempted, go to sleep,
> > or is blocked.
> >
> > - vmx_post_ctx_switch_pi()
> > It is called after context switch, we update the posted
> > interrupt descriptor when the vCPU is going to run.
> >
> > - arch_vcpu_wake()
> > It will be called when waking up the vCPU, we update
> > the posted interrupt descriptor when the vCPU is unblocked.
> >
> Thanks again for doing this. I'll never get tired to say how much better
> I like this, wrt the previous RUNSTATE_* based approach. :-)

I should say thanks to you, it is you that suggested this method! :)

> 
> > --- a/xen/arch/x86/domain.c
> > +++ b/xen/arch/x86/domain.c
> > @@ -1550,9 +1550,19 @@ void context_switch(struct vcpu *prev, struct
> vcpu *next)
> >
> >      set_current(next);
> >
> > +    /*
> > +     * We need to update posted interrupt descriptor for each context
> switch,
> > +     * hence cannot use the lazy context switch for this.
> > +     */
> >
> Perhaps it's me, but I don't get the comment. Why do you mention "the
> lazy context switch"? We can't use it "for this", as opposed to what
> other circumstance where we can use it?

Oh, maybe I shouldn't use the word here, what I want to say here is
__context_switch() isn't called in each context switch, such as,
non-idle vcpu -> idle vcpu, so we need to call prev->arch.pi_ctxt_switch_from
explicitly instead of in __context_switch().

> 
> > +    if ( !is_idle_vcpu(prev) && prev->arch.pi_ctxt_switch_from )
> > +        prev->arch.pi_ctxt_switch_from(prev);
> > +
> Flip the order in the conditions, maybe? So that guests with
> pi_ctxt_switch_from==NULL suffers the least possible overhead?
> 

Sounds good!

> 
> >      if ( (per_cpu(curr_vcpu, cpu) == next) ||
> >           (is_idle_vcpu(next) && cpu_online(cpu)) )
> >      {
> > +        if ( !is_idle_vcpu(next) && next->arch.pi_ctxt_switch_to )
> >
> Same as above.
> 
> > +            next->arch.pi_ctxt_switch_to(next);
> > +
> >          local_irq_enable();
> >
> Another thing: if prev == next --and let's call such vcpu pp-- you go
> through both:
> 
>     pp->arch.pi_ctxt_switch_from(pp);
>     pp->arch.pi_ctxt_switch_to(pp);

In my understanding, if the scheduler chooses the same vcpu to run, it
will return early in schedule() as below:

static void schedule(void)
{
    ....

    /* get policy-specific decision on scheduling... */
    sched = this_cpu(scheduler);
    next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);

    next = next_slice.task;

    sd->curr = next;

    if ( next_slice.time >= 0 ) /* -ve means no limit */
        set_timer(&sd->s_timer, now + next_slice.time);

    if ( unlikely(prev == next) )
    {
        pcpu_schedule_unlock_irq(lock, cpu);
        trace_continue_running(next);
        return continue_running(prev);
    }

    ....

}

If this is that case, when we get context_switch(), the prev and next are
different. Do I miss something?

> 
> Is this really necessary? AFAICT, it reduces to:
> 
>     pi_set_sn(&pp->arch.hvm_vmx.pi_desc);
>     if ( x2apic_enabled )
>         write_atomic(&pp->arch.hvm_vmx.pi_desc->ndst,
>                      cpu_physical_id(v->processor));
>     else
>         write_atomic(&pp->arch.hvm_vmx.pi_desc->ndst,
>                      MASK_INSR(cpu_physical_id(v->processor),
>                      PI_xAPIC_NDST_MASK));
>     pi_clear_sn(pi_desc);
> 
> In fact, if the scheduler picked prev again, as the next vcpu to be run,
> I expect
> 
>     ( cpu_runnable(v) || !test_bit(_VPF_blocked, &v->pause_flags) )
> 
> to be true, and hence vmx_pre_ctx_switch_pi() reducing to just a call to
> pi_set_sn().
> 
> So, either it is the write_atomic() that you absolutely need, which I
> don't think, or you really can check whether such situation is
> occurring, and avoid going through _pre_ and _post_ back to back.
> 
> > --- a/xen/arch/x86/hvm/vmx/vmx.c
> > +++ b/xen/arch/x86/hvm/vmx/vmx.c
> 
> > @@ -116,10 +118,17 @@ static int vmx_vcpu_initialise(struct vcpu *v)
> >      INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_blocked_vcpu_list);
> >      INIT_LIST_HEAD(&v->arch.hvm_vmx.pi_vcpu_on_set_list);
> >
> > +    v->arch.hvm_vmx.pi_block_cpu = -1;
> > +
> > +    spin_lock_init(&v->arch.hvm_vmx.pi_lock);
> > +
> >      v->arch.schedule_tail    = vmx_do_resume;
> >      v->arch.ctxt_switch_from = vmx_ctxt_switch_from;
> >      v->arch.ctxt_switch_to   = vmx_ctxt_switch_to;
> >
> > +    v->arch.pi_ctxt_switch_from = vmx_pre_ctx_switch_pi;
> > +    v->arch.pi_ctxt_switch_to = vmx_post_ctx_switch_pi;
> > +
> Would it make sense to get the two initialization above with:
> 
>   if ( iommu_intpost && is_hvm_vcpu(v) )
> 
> and leave them NULL if that is false? I think it works, and it looks
> worthwhile to me.
> 
> In fact, this would make the operands flip suggested above actually
> effective at cutting the overhead for non-PI enabled guests/hosts.
> 
> > +static void vmx_pre_ctx_switch_pi(struct vcpu *v)
> > +{
> > +    struct pi_desc *pi_desc = &v->arch.hvm_vmx.pi_desc;
> > +    struct pi_desc old, new;
> > +    unsigned long flags, gflags;
> > +
> > +    if ( !iommu_intpost || !is_hvm_vcpu(v)
> || !has_arch_pdevs(v->domain) )
> > +        return;
> > +
> And, if you do as I suggest in context_switch() and
> vmx_vcpu_initialise(), this will reduce to:
> 
>     if ( !has_arch_pdevs(v->domain) )
>         return;
> 
> > +    else if ( test_bit(_VPF_blocked, &v->pause_flags) )
> > +    {
> > +        /*
> > +         * The vCPU is blocking, we need to add it to one of the per pCPU
> lists.
> > +         * We save v->processor to v->arch.hvm_vmx.pi_block_cpu and
> use it for
> > +         * the per-CPU list, we also save it to posted-interrupt descriptor
> and
> > +         * make it as the destination of the wake-up notification event.
> > +         */
> > +        v->arch.hvm_vmx.pi_block_cpu = v->processor;
> > +        spin_lock_irqsave(&per_cpu(pi_blocked_vcpu_lock,
> > +                          v->arch.hvm_vmx.pi_block_cpu), flags);
> > +        list_add_tail(&v->arch.hvm_vmx.pi_blocked_vcpu_list,
> > +                      &per_cpu(pi_blocked_vcpu,
> v->arch.hvm_vmx.pi_block_cpu));
> > +        spin_unlock_irqrestore(&per_cpu(pi_blocked_vcpu_lock,
> > +                           v->arch.hvm_vmx.pi_block_cpu), flags);
> > +
> > +        do {
> > +            old.control = new.control = pi_desc->control;
> > +
> > +            /* Should not block the vCPU if an interrupt was posted for it
> */
> > +
> There's no need for the blank line, and I'd say kill it.
> 
> > +            if ( pi_test_on(&old) )
> > +            {
> > +                spin_unlock_irqrestore(&v->arch.hvm_vmx.pi_lock,
> gflags);
> > +                vcpu_unblock(v);
> > +                return;
> > +            }
> 
> > --- a/xen/common/schedule.c
> > +++ b/xen/common/schedule.c
> > @@ -381,6 +381,8 @@ void vcpu_wake(struct vcpu *v)
> >      unsigned long flags;
> >      spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);
> >
> > +    arch_vcpu_wake(v);
> > +
> So, in the draft you sent a few days back, this was called at the end of
> vcpu_wake(), right before releasing the lock. Now it's at the beginning,
> before the scheduler's wakeup routine has a chance to run.
> 
> IMO, it feels more natural for it to be at the bottom (i.e., generic
> stuff first, arch specific stuff afterwards), and, after a quick
> inspection, I don't think I see nothing preventing things to be that
> way.
> 
> However, I recall you mentioning having issues with such draft, which
> are now resolved with this version. 

The long latency issue mentioned previously is caused by another reason.
Originally I called the ' pi_ctxt_switch_from ' and ' pi_ctxt_switch_to ' in
__context_switch(), however, this function is not called for each context
switch, as I described above, after fixing this, the performance issue
disappeared.

> Since this is one of the differences
> between the two, was it the cause of the issues you were seeing? If yes,
> can you elaborate on how and why?
> 
> In the end, I'm not too opposed to the hook being at the beginning
> rather than at the end, but there has to be a reason, which may well end
> up better be stated in a comment...

Here is the reason I put arch_vcpu_wake() ahead of vcpu_wake():
arch_vcpu_wake() does some prerequisites for a vCPU which is about
to run, such as, setting SN again, changing NV filed back to
' posted_intr_vector ', which should be finished before the vCPU is
actually scheduled to run. However, if we put arch_vcpu_wake() later
in vcpu_wake() right before ' vcpu_schedule_unlock_irqrestore', after
the 'wake' hook get finished, the vcpu can run at any time (maybe in
another pCPU since the current pCPU is protected by the lock), if
this can happen, it is incorrect. Does my understanding make sense?

Thanks,
Feng

> 
> Thanks and Regards,
> Dario
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-07-30  2:04     ` Wu, Feng
@ 2015-07-30 18:26       ` Dario Faggioli
  2015-08-11 10:23         ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: Dario Faggioli @ 2015-07-30 18:26 UTC (permalink / raw)
  To: Wu, Feng
  Cc: Tian, Kevin, Keir Fraser, George Dunlap, Andrew Cooper,
	xen-devel@lists.xen.org, Jan Beulich


[-- Attachment #1.1: Type: text/plain, Size: 7571 bytes --]

On Thu, 2015-07-30 at 02:04 +0000, Wu, Feng wrote:
> > -----Original Message-----
> > From: Dario Faggioli [mailto:dario.faggioli@citrix.com]

> > > --- a/xen/arch/x86/domain.c
> > > +++ b/xen/arch/x86/domain.c
> > > @@ -1550,9 +1550,19 @@ void context_switch(struct vcpu *prev, struct
> > vcpu *next)
> > >
> > >      set_current(next);
> > >
> > > +    /*
> > > +     * We need to update posted interrupt descriptor for each context
> > switch,
> > > +     * hence cannot use the lazy context switch for this.
> > > +     */
> > >
> > Perhaps it's me, but I don't get the comment. Why do you mention "the
> > lazy context switch"? We can't use it "for this", as opposed to what
> > other circumstance where we can use it?
> 
> Oh, maybe I shouldn't use the word here, what I want to say here is
> __context_switch() isn't called in each context switch, such as,
> non-idle vcpu -> idle vcpu, so we need to call prev->arch.pi_ctxt_switch_from
> explicitly instead of in __context_switch().
> 
Ok, I see what you mean now, and it's probably correct, as 'lazy context
switch' is, in this context, exactly that (i.e., not actually context
switching if next is the idle vcpu).

It's just that such term is used, in literature, in different places to
mean (slightly) different thing, and there is no close reference to it
(like in the function), so I still see a bit of room for potential
confusion.

In the end, as you which. If it were me, I'd add a few word to specify
things better, something very similar to what you've put in this email,
e.g.:

"When switching from non-idle to idle, we only do a lazy context switch.
However, in order for posted interrupt (if available and enabled) to
work properly, we at least need to update the descriptors"

Or some better English form of it. :-)

But that's certainly something not critical, and I'll be ok with
everything other maintainers agree on.

> > >      if ( (per_cpu(curr_vcpu, cpu) == next) ||
> > >           (is_idle_vcpu(next) && cpu_online(cpu)) )
> > >      {
> > > +        if ( !is_idle_vcpu(next) && next->arch.pi_ctxt_switch_to )
> > >
> > Same as above.
> > 
> > > +            next->arch.pi_ctxt_switch_to(next);
> > > +
> > >          local_irq_enable();
> > >
> > Another thing: if prev == next --and let's call such vcpu pp-- you go
> > through both:
> > 
> >     pp->arch.pi_ctxt_switch_from(pp);
> >     pp->arch.pi_ctxt_switch_to(pp);
> 
> In my understanding, if the scheduler chooses the same vcpu to run, it
> will return early in schedule() as below:
> 
> static void schedule(void)
> {
>     ....
> 
>     /* get policy-specific decision on scheduling... */
>     sched = this_cpu(scheduler);
>     next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
> 
>     next = next_slice.task;
> 
>     sd->curr = next;
> 
>     if ( next_slice.time >= 0 ) /* -ve means no limit */
>         set_timer(&sd->s_timer, now + next_slice.time);
> 
>     if ( unlikely(prev == next) )
>     {
>         pcpu_schedule_unlock_irq(lock, cpu);
>         trace_continue_running(next);
>         return continue_running(prev);
>     }
> 
>     ....
> 
> }
> 
> If this is that case, when we get context_switch(), the prev and next are
> different. Do I miss something?
> 
That looks correct. Still, there are checks like '(prev!=next)' around
in context_switch(), for both x86 and ARM... weird. I shall have a
deeper look...

In any case, as far as this hunk is concerned, the
'(per_cpu(curr_vcpu,cpu)==next)' is there to deal with the case where we
went from vcpu v to idle, and we're now going from idle to v again,
which is something you want to intercept.

So, at least for now, ignore my comments about it. I'll let you know if
I find something interesting that you should take into account.

> > > --- a/xen/common/schedule.c
> > > +++ b/xen/common/schedule.c
> > > @@ -381,6 +381,8 @@ void vcpu_wake(struct vcpu *v)
> > >      unsigned long flags;
> > >      spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);
> > >
> > > +    arch_vcpu_wake(v);
> > > +
> > So, in the draft you sent a few days back, this was called at the end of
> > vcpu_wake(), right before releasing the lock. Now it's at the beginning,
> > before the scheduler's wakeup routine has a chance to run.
> > 
> > IMO, it feels more natural for it to be at the bottom (i.e., generic
> > stuff first, arch specific stuff afterwards), and, after a quick
> > inspection, I don't think I see nothing preventing things to be that
> > way.
> > 
> > However, I recall you mentioning having issues with such draft, which
> > are now resolved with this version. 
> 
> The long latency issue mentioned previously is caused by another reason.
> Originally I called the ' pi_ctxt_switch_from ' and ' pi_ctxt_switch_to ' in
> __context_switch(), however, this function is not called for each context
> switch, as I described above, after fixing this, the performance issue
> disappeared.
> 
I see, thanks for explaining this.

> > Since this is one of the differences
> > between the two, was it the cause of the issues you were seeing? If yes,
> > can you elaborate on how and why?
> > 
> > In the end, I'm not too opposed to the hook being at the beginning
> > rather than at the end, but there has to be a reason, which may well end
> > up better be stated in a comment...
> 
> Here is the reason I put arch_vcpu_wake() ahead of vcpu_wake():
> arch_vcpu_wake() does some prerequisites for a vCPU which is about
> to run, such as, setting SN again, changing NV filed back to
> ' posted_intr_vector ', which should be finished before the vCPU is
> actually scheduled to run. However, if we put arch_vcpu_wake() later
> in vcpu_wake() right before ' vcpu_schedule_unlock_irqrestore', after
> the 'wake' hook get finished, the vcpu can run at any time (maybe in
> another pCPU since the current pCPU is protected by the lock), if
> this can happen, it is incorrect. Does my understanding make sense?
> 
It's safe in any case. In fact, the spinlock will  prevent both the
vcpu's processor to schedule, as well as any other processors to steal
the waking vcpu from the runqueue to run it.

That's actually why I wanted to double check you changing the position
of the hook (wrt the draft), as it felt weird that the issue were in
there. :-)

So, now that we know that safety is not an issue, where should we put
the hook?

Having it before SCHED_OP(wake) may make people think that arch specific
code is (or can, at some point) somehow influencing the scheduler
specific wakeup code, which is not (and should not become, if possible)
the case.

However, I kind of like the fact that the spinlock is released as soon
as possible, after the call to SCHED_OP(wake). That will make it more
likely, for the processors we may have sent IPIs to, during the
scheduler specific wakeup code, to find the spinlock free. So, looking
at things from this angle, it would be better to avoid putting stuff in
between SCHED_OP(wake) and vcpu_schedule_unlock().

So, all in all, I'd say leave it on top, where it is in this patch. Of
course, if others have opinions, I'm all ears. :-)

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
@ 2015-08-03  1:36 Wu, Feng
  2015-08-03 10:02 ` Dario Faggioli
  0 siblings, 1 reply; 10+ messages in thread
From: Wu, Feng @ 2015-08-03  1:36 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Tian, Kevin, Keir Fraser, George Dunlap, Andrew Cooper,
	xen-devel@lists.xen.org, Jan Beulich, Wu, Feng



> -----Original Message-----
> From: Dario Faggioli [mailto:dario.faggioli@citrix.com]
> Sent: Friday, July 31, 2015 2:27 AM
> To: Wu, Feng
> Cc: xen-devel@lists.xen.org; Keir Fraser; Jan Beulich; Andrew Cooper; Tian,
> Kevin; George Dunlap
> Subject: Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted
> interrupts
> 
> On Thu, 2015-07-30 at 02:04 +0000, Wu, Feng wrote:
> > > -----Original Message-----
> > > From: Dario Faggioli [mailto:dario.faggioli@citrix.com]
> 
> > > > --- a/xen/arch/x86/domain.c
> > > > +++ b/xen/arch/x86/domain.c
> > > > @@ -1550,9 +1550,19 @@ void context_switch(struct vcpu *prev, struct
> > > vcpu *next)
> > > >
> > > >      set_current(next);
> > > >
> > > > +    /*
> > > > +     * We need to update posted interrupt descriptor for each context
> > > switch,
> > > > +     * hence cannot use the lazy context switch for this.
> > > > +     */
> > > >
> > > Perhaps it's me, but I don't get the comment. Why do you mention "the
> > > lazy context switch"? We can't use it "for this", as opposed to what
> > > other circumstance where we can use it?
> >
> > Oh, maybe I shouldn't use the word here, what I want to say here is
> > __context_switch() isn't called in each context switch, such as,
> > non-idle vcpu -> idle vcpu, so we need to call prev->arch.pi_ctxt_switch_from
> > explicitly instead of in __context_switch().
> >
> Ok, I see what you mean now, and it's probably correct, as 'lazy context
> switch' is, in this context, exactly that (i.e., not actually context
> switching if next is the idle vcpu).
> 
> It's just that such term is used, in literature, in different places to
> mean (slightly) different thing, and there is no close reference to it
> (like in the function), so I still see a bit of room for potential
> confusion.
> 
> In the end, as you which. If it were me, I'd add a few word to specify
> things better, something very similar to what you've put in this email,
> e.g.:
> 
> "When switching from non-idle to idle, we only do a lazy context switch.
> However, in order for posted interrupt (if available and enabled) to
> work properly, we at least need to update the descriptors"

Sounds good!

> 
> Or some better English form of it. :-)
> 
> But that's certainly something not critical, and I'll be ok with
> everything other maintainers agree on.
> 
> > > >      if ( (per_cpu(curr_vcpu, cpu) == next) ||
> > > >           (is_idle_vcpu(next) && cpu_online(cpu)) )
> > > >      {
> > > > +        if ( !is_idle_vcpu(next) && next->arch.pi_ctxt_switch_to )
> > > >
> > > Same as above.
> > >
> > > > +            next->arch.pi_ctxt_switch_to(next);
> > > > +
> > > >          local_irq_enable();
> > > >
> > > Another thing: if prev == next --and let's call such vcpu pp-- you go
> > > through both:
> > >
> > >     pp->arch.pi_ctxt_switch_from(pp);
> > >     pp->arch.pi_ctxt_switch_to(pp);
> >
> > In my understanding, if the scheduler chooses the same vcpu to run, it
> > will return early in schedule() as below:
> >
> > static void schedule(void)
> > {
> >     ....
> >
> >     /* get policy-specific decision on scheduling... */
> >     sched = this_cpu(scheduler);
> >     next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
> >
> >     next = next_slice.task;
> >
> >     sd->curr = next;
> >
> >     if ( next_slice.time >= 0 ) /* -ve means no limit */
> >         set_timer(&sd->s_timer, now + next_slice.time);
> >
> >     if ( unlikely(prev == next) )
> >     {
> >         pcpu_schedule_unlock_irq(lock, cpu);
> >         trace_continue_running(next);
> >         return continue_running(prev);
> >     }
> >
> >     ....
> >
> > }
> >
> > If this is that case, when we get context_switch(), the prev and next are
> > different. Do I miss something?
> >
> That looks correct. Still, there are checks like '(prev!=next)' around
> in context_switch(), for both x86 and ARM... weird. I shall have a
> deeper look...
> 
> In any case, as far as this hunk is concerned, the
> '(per_cpu(curr_vcpu,cpu)==next)' is there to deal with the case where we
> went from vcpu v to idle, and we're now going from idle to v again,
> which is something you want to intercept.
> 
> So, at least for now, ignore my comments about it. I'll let you know if
> I find something interesting that you should take into account.
> 
> > > > --- a/xen/common/schedule.c
> > > > +++ b/xen/common/schedule.c
> > > > @@ -381,6 +381,8 @@ void vcpu_wake(struct vcpu *v)
> > > >      unsigned long flags;
> > > >      spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);
> > > >
> > > > +    arch_vcpu_wake(v);
> > > > +
> > > So, in the draft you sent a few days back, this was called at the end of
> > > vcpu_wake(), right before releasing the lock. Now it's at the beginning,
> > > before the scheduler's wakeup routine has a chance to run.
> > >
> > > IMO, it feels more natural for it to be at the bottom (i.e., generic
> > > stuff first, arch specific stuff afterwards), and, after a quick
> > > inspection, I don't think I see nothing preventing things to be that
> > > way.
> > >
> > > However, I recall you mentioning having issues with such draft, which
> > > are now resolved with this version.
> >
> > The long latency issue mentioned previously is caused by another reason.
> > Originally I called the ' pi_ctxt_switch_from ' and ' pi_ctxt_switch_to ' in
> > __context_switch(), however, this function is not called for each context
> > switch, as I described above, after fixing this, the performance issue
> > disappeared.
> >
> I see, thanks for explaining this.
> 
> > > Since this is one of the differences
> > > between the two, was it the cause of the issues you were seeing? If yes,
> > > can you elaborate on how and why?
> > >
> > > In the end, I'm not too opposed to the hook being at the beginning
> > > rather than at the end, but there has to be a reason, which may well end
> > > up better be stated in a comment...
> >
> > Here is the reason I put arch_vcpu_wake() ahead of vcpu_wake():
> > arch_vcpu_wake() does some prerequisites for a vCPU which is about
> > to run, such as, setting SN again, changing NV filed back to
> > ' posted_intr_vector ', which should be finished before the vCPU is
> > actually scheduled to run. However, if we put arch_vcpu_wake() later
> > in vcpu_wake() right before ' vcpu_schedule_unlock_irqrestore', after
> > the 'wake' hook get finished, the vcpu can run at any time (maybe in
> > another pCPU since the current pCPU is protected by the lock), if
> > this can happen, it is incorrect. Does my understanding make sense?
> >
> It's safe in any case. In fact, the spinlock will  prevent both the
> vcpu's processor to schedule, as well as any other processors to steal
> the waking vcpu from the runqueue to run it.

Good to know this. For " as well as any other processors to steal
the waking vcpu from the runqueue to run it ", could you please show
some hints in the code side, so I can better understand how this can
be protected by the spinlock. Thank you!

Thanks,
Feng

> 
> That's actually why I wanted to double check you changing the position
> of the hook (wrt the draft), as it felt weird that the issue were in
> there. :-)
> 
> So, now that we know that safety is not an issue, where should we put
> the hook?
> 
> Having it before SCHED_OP(wake) may make people think that arch specific
> code is (or can, at some point) somehow influencing the scheduler
> specific wakeup code, which is not (and should not become, if possible)
> the case.
> 
> However, I kind of like the fact that the spinlock is released as soon
> as possible, after the call to SCHED_OP(wake). That will make it more
> likely, for the processors we may have sent IPIs to, during the
> scheduler specific wakeup code, to find the spinlock free. So, looking
> at things from this angle, it would be better to avoid putting stuff in
> between SCHED_OP(wake) and vcpu_schedule_unlock().
> 
> So, all in all, I'd say leave it on top, where it is in this patch. Of
> course, if others have opinions, I'm all ears. :-)
> 
> Thanks and Regards,
> Dario
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-08-03  1:36 [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts Wu, Feng
@ 2015-08-03 10:02 ` Dario Faggioli
  2015-08-05  6:06   ` Wu, Feng
  0 siblings, 1 reply; 10+ messages in thread
From: Dario Faggioli @ 2015-08-03 10:02 UTC (permalink / raw)
  To: Wu, Feng
  Cc: Tian, Kevin, Keir Fraser, George Dunlap, Andrew Cooper,
	xen-devel@lists.xen.org, Jan Beulich


[-- Attachment #1.1: Type: text/plain, Size: 1882 bytes --]

On Mon, 2015-08-03 at 01:36 +0000, Wu, Feng wrote:
> > -----Original Message-----
> > From: Dario Faggioli [mailto:dario.faggioli@citrix.com]

> > It's safe in any case. In fact, the spinlock will  prevent both the
> > vcpu's processor to schedule, as well as any other processors to steal
> > the waking vcpu from the runqueue to run it.
> 
> Good to know this. For " as well as any other processors to steal
> the waking vcpu from the runqueue to run it ", could you please show
> some hints in the code side, so I can better understand how this can
> be protected by the spinlock. Thank you!
> 
Sure. For instance, for the Credit1 scheduler, look at
csched_load_balance(). vcpus are moved from one runqueue to another by
means of csched_runq_steal() which, as you can see, is called only if
the spinlock for the pcpu from where we want to try stealing has been
successfully acquired:

    spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);

    if ( !lock )
    {
        SCHED_STAT_CRANK(steal_trylock_failed);
        peer_cpu = cpumask_cycle(peer_cpu, &workers);
        continue;
    }

    /* Any work over there to steal? */
    speer = cpumask_test_cpu(peer_cpu, online) ?
         csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL;
    pcpu_schedule_unlock(lock, peer_cpu);

Same happen in Credit2. Check choose_cpu(), and you'll fine similar
reasoning and code.

In RTDS, finally, there's just one runqueue, for all pcpus, so each time
each one of them has to schedule it has to take the only one spinlock
protecting it.

Hope this helped.

Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

[-- Attachment #1.2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-08-03 10:02 ` Dario Faggioli
@ 2015-08-05  6:06   ` Wu, Feng
  0 siblings, 0 replies; 10+ messages in thread
From: Wu, Feng @ 2015-08-05  6:06 UTC (permalink / raw)
  To: Dario Faggioli
  Cc: Tian, Kevin, Keir Fraser, George Dunlap, Andrew Cooper,
	xen-devel@lists.xen.org, Jan Beulich, Wu, Feng



> -----Original Message-----
> From: Dario Faggioli [mailto:dario.faggioli@citrix.com]
> Sent: Monday, August 03, 2015 6:02 PM
> To: Wu, Feng
> Cc: xen-devel@lists.xen.org; Keir Fraser; Jan Beulich; Andrew Cooper; Tian,
> Kevin; George Dunlap
> Subject: Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted
> interrupts
> 
> On Mon, 2015-08-03 at 01:36 +0000, Wu, Feng wrote:
> > > -----Original Message-----
> > > From: Dario Faggioli [mailto:dario.faggioli@citrix.com]
> 
> > > It's safe in any case. In fact, the spinlock will  prevent both the
> > > vcpu's processor to schedule, as well as any other processors to steal
> > > the waking vcpu from the runqueue to run it.
> >
> > Good to know this. For " as well as any other processors to steal
> > the waking vcpu from the runqueue to run it ", could you please show
> > some hints in the code side, so I can better understand how this can
> > be protected by the spinlock. Thank you!
> >
> Sure. For instance, for the Credit1 scheduler, look at
> csched_load_balance(). vcpus are moved from one runqueue to another by
> means of csched_runq_steal() which, as you can see, is called only if
> the spinlock for the pcpu from where we want to try stealing has been
> successfully acquired:
> 
>     spinlock_t *lock = pcpu_schedule_trylock(peer_cpu);
> 
>     if ( !lock )
>     {
>         SCHED_STAT_CRANK(steal_trylock_failed);
>         peer_cpu = cpumask_cycle(peer_cpu, &workers);
>         continue;
>     }
> 
>     /* Any work over there to steal? */
>     speer = cpumask_test_cpu(peer_cpu, online) ?
>          csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL;
>     pcpu_schedule_unlock(lock, peer_cpu);
> 
> Same happen in Credit2. Check choose_cpu(), and you'll fine similar
> reasoning and code.
> 
> In RTDS, finally, there's just one runqueue, for all pcpus, so each time
> each one of them has to schedule it has to take the only one spinlock
> protecting it.
> 
> Hope this helped.

Thanks for the elaborate description, it is very helpful.

Thanks,
Feng

> 
> Regards,
> Dario
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts
  2015-07-30 18:26       ` Dario Faggioli
@ 2015-08-11 10:23         ` Jan Beulich
  0 siblings, 0 replies; 10+ messages in thread
From: Jan Beulich @ 2015-08-11 10:23 UTC (permalink / raw)
  To: Dario Faggioli, Feng Wu
  Cc: GeorgeDunlap, Andrew Cooper, Kevin Tian, Keir Fraser,
	xen-devel@lists.xen.org

>>> On 30.07.15 at 20:26, <dario.faggioli@citrix.com> wrote:
> On Thu, 2015-07-30 at 02:04 +0000, Wu, Feng wrote:
>> > -----Original Message-----
>> > From: Dario Faggioli [mailto:dario.faggioli@citrix.com]
>> > Since this is one of the differences
>> > between the two, was it the cause of the issues you were seeing? If yes,
>> > can you elaborate on how and why?
>> > 
>> > In the end, I'm not too opposed to the hook being at the beginning
>> > rather than at the end, but there has to be a reason, which may well end
>> > up better be stated in a comment...
>> 
>> Here is the reason I put arch_vcpu_wake() ahead of vcpu_wake():
>> arch_vcpu_wake() does some prerequisites for a vCPU which is about
>> to run, such as, setting SN again, changing NV filed back to
>> ' posted_intr_vector ', which should be finished before the vCPU is
>> actually scheduled to run. However, if we put arch_vcpu_wake() later
>> in vcpu_wake() right before ' vcpu_schedule_unlock_irqrestore', after
>> the 'wake' hook get finished, the vcpu can run at any time (maybe in
>> another pCPU since the current pCPU is protected by the lock), if
>> this can happen, it is incorrect. Does my understanding make sense?
>> 
> It's safe in any case. In fact, the spinlock will  prevent both the
> vcpu's processor to schedule, as well as any other processors to steal
> the waking vcpu from the runqueue to run it.
> 
> That's actually why I wanted to double check you changing the position
> of the hook (wrt the draft), as it felt weird that the issue were in
> there. :-)
> 
> So, now that we know that safety is not an issue, where should we put
> the hook?
> 
> Having it before SCHED_OP(wake) may make people think that arch specific
> code is (or can, at some point) somehow influencing the scheduler
> specific wakeup code, which is not (and should not become, if possible)
> the case.
> 
> However, I kind of like the fact that the spinlock is released as soon
> as possible, after the call to SCHED_OP(wake). That will make it more
> likely, for the processors we may have sent IPIs to, during the
> scheduler specific wakeup code, to find the spinlock free. So, looking
> at things from this angle, it would be better to avoid putting stuff in
> between SCHED_OP(wake) and vcpu_schedule_unlock().
> 
> So, all in all, I'd say leave it on top, where it is in this patch. Of
> course, if others have opinions, I'm all ears. :-)

If it is kept at the beginning, the hook should be renamed to
something like arch_vcpu_wake_prepare().

Jan

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-08-11 10:23 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-08-03  1:36 [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts Wu, Feng
2015-08-03 10:02 ` Dario Faggioli
2015-08-05  6:06   ` Wu, Feng
  -- strict thread matches above, loose matches on Subject: below --
2015-07-23 11:35 [v4 00/17] Add VT-d Posted-Interrupts support Feng Wu
2015-07-23 11:35 ` [v4 16/17] vmx: Add some scheduler hooks for VT-d posted interrupts Feng Wu
2015-07-23 12:50   ` Dario Faggioli
2015-07-24  0:49     ` Wu, Feng
2015-07-28 14:15   ` Dario Faggioli
2015-07-30  2:04     ` Wu, Feng
2015-07-30 18:26       ` Dario Faggioli
2015-08-11 10:23         ` Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.