[PATCH 1/4] CPU online/offline support in Xen

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/4] CPU online/offline support in Xen
@ 2008-09-09  8:59 Shan, Haitao
  2008-09-10 10:43 ` Keir Fraser
  0 siblings, 1 reply; 30+ messages in thread
From: Shan, Haitao @ 2008-09-09  8:59 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 73 bytes --]

This patch implements cpu offline feature.

Best Regards
Haitao Shan

[-- Attachment #2: cpu_offline.patch --]
[-- Type: application/octet-stream, Size: 9919 bytes --]

diff -r be573a356c90 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/arch/x86/irq.c	Sat Sep 06 15:50:12 2008 +0800
@@ -739,6 +739,7 @@
 {
     unsigned int irq;
     static int warned;
+    irq_guest_action_t *action;
 
     for ( irq = 0; irq < NR_IRQS; irq++ )
     {
@@ -756,6 +757,16 @@
             irq_desc[irq].handler->set_affinity(irq, mask);
         else if ( irq_desc[irq].action && !(warned++) )
             printk("Cannot set affinity for irq %i\n", irq);
+
+        if ( !(irq_desc[irq].status & IRQ_GUEST) )
+            continue;
+        action = (irq_guest_action_t *)irq_desc[irq].action;
+        if ( cpu_isset(smp_processor_id(), action->cpu_eoi_map) )
+        {
+            ack_APIC_irq();
+            cpu_clear(smp_processor_id(), action->cpu_eoi_map);
+            printk("Flushing pending eoi for irq %i\n", irq);
+        }
     }
 
     local_irq_enable();
diff -r be573a356c90 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/arch/x86/smpboot.c	Sat Sep 06 15:50:12 2008 +0800
@@ -39,6 +39,7 @@
 #include <xen/mm.h>
 #include <xen/domain.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/irq.h>
 #include <xen/delay.h>
 #include <xen/softirq.h>
@@ -531,6 +532,8 @@
 
 	cpu_set(smp_processor_id(), cpu_online_map);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+    cpu_schedule_map_set(smp_processor_id());
 
 	init_percpu_time();
 
@@ -1180,6 +1183,7 @@
 	cpu_set(smp_processor_id(), cpu_callout_map);
 	cpu_set(smp_processor_id(), cpu_present_map);
 	cpu_set(smp_processor_id(), cpu_possible_map);
+    cpu_schedule_map_set(smp_processor_id());
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 
@@ -1225,15 +1229,6 @@
 	if (cpu == 0)
 		return -EBUSY;
 
-	/*
-	 * Only S3 is using this path, and thus idle vcpus are running on all
-	 * APs when we are called. To support full cpu hotplug, other 
-	 * notification mechanisms should be introduced (e.g., migrate vcpus
-	 * off this physical cpu before rendezvous point).
-	 */
-	if (!is_idle_vcpu(current))
-		return -EINVAL;
-
 	local_irq_disable();
 	clear_local_APIC();
 	/* Allow any queued timer interrupts to get serviced */
@@ -1275,28 +1270,15 @@
     return __cpu_disable();
 }
 
-/* 
- * XXX: One important thing missed here is to migrate vcpus
- * from dead cpu to other online ones and then put whole
- * system into a stop state. It assures a safe environment
- * for a cpu hotplug/remove at normal running state.
- *
- * However for xen PM case, at this point:
- * 	-> All other domains should be notified with PM event,
- *	   and then in following states:
- *		* Suspend state, or
- *		* Paused state, which is a force step to all
- *		  domains if they do nothing to suspend
- *	-> All vcpus of dom0 (except vcpu0) have already beem
- *	   hot removed
- * with the net effect that all other cpus only have idle vcpu
- * running. In this special case, we can avoid vcpu migration
- * then and system can be considered in a stop state.
- *
- * So current cpu hotplug is a special version for PM specific
- * usage, and need more effort later for full cpu hotplug.
- * (ktian1)
- */
+static int fixing_scheduler_map(void *data)
+{
+    unsigned int cpu = *(unsigned int *)data;
+
+    cpu_schedule_map_clear(cpu);
+
+    return 0;
+}
+
 int cpu_down(unsigned int cpu)
 {
 	int err = 0;
@@ -1307,16 +1289,46 @@
 		goto out;
 	}
 
+    /* Can not offline BSP */
+    if ( cpu == 0 )
+    {
+        err = -EINVAL;
+        goto out;
+    }
+
 	if (!cpu_online(cpu)) {
 		err = -EINVAL;
 		goto out;
 	}
 
+    /* Modify vcpu affinity of those whose affinity only includes this dying
+     * cpu. Otherwise, after we modifies the schedule_map, the scheduler
+     * is confused.
+     */
+    prepare_migration_on_cpu(cpu);
+
+    /* Prevent the scheduler from migrating vcpus to this cpu.*/
+    err = stop_machine_run(fixing_scheduler_map, &cpu, smp_processor_id());
+    if ( err < 0 )
+    {
+        printk("stop machine rmove siblinginfo failed\n");
+        goto out;
+    }
+
+    /* Actually migrate all vcpus on this dying cpu away. */
+    migrate_all_vcpus_on_cpu(cpu);
+
 	printk("Prepare to bring CPU%d down...\n", cpu);
+
+    while ( !idle_vcpu[cpu]->is_running )
+        cpu_relax();
 
 	err = stop_machine_run(take_cpu_down, NULL, cpu);
 	if ( err < 0 )
+    {
+        cpu_schedule_map_set(cpu);
 		goto out;
+    }
 
 	__cpu_die(cpu);
 
diff -r be573a356c90 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/common/sched_credit.c	Sat Sep 06 15:50:12 2008 +0800
@@ -407,11 +407,14 @@
 static inline int
 __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
 {
+    cpumask_t mask;
+
     /*
      * Don't pick up work that's in the peer's scheduling tail. Also only pick
      * up work that's allowed to run on our CPU.
      */
-    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+    cpus_and(mask, vc->cpu_affinity, cpu_schedule_map);
+    return !vc->is_running && cpu_isset(dest_cpu, mask);
 }
 
 static int
@@ -425,7 +428,7 @@
      * Pick from online CPUs in VCPU's affinity mask, giving a
      * preference to its current processor if it's in there.
      */
-    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    cpus_and(cpus, cpu_schedule_map, vc->cpu_affinity);
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
             : __cycle_cpu(vc->processor, &cpus);
@@ -1118,7 +1121,7 @@
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
      */
-    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    cpus_andnot(workers, cpu_schedule_map, csched_priv.idlers);
     cpu_clear(cpu, workers);
     peer_cpu = cpu;
 
diff -r be573a356c90 xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/common/sched_sedf.c	Sat Sep 06 15:50:12 2008 +0800
@@ -415,7 +415,7 @@
 {
     cpumask_t online_affinity;
 
-    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
+    cpus_and(online_affinity, v->cpu_affinity, cpu_schedule_map);
     return first_cpu(online_affinity);
 }
 
diff -r be573a356c90 xen/common/schedule.c
--- a/xen/common/schedule.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/common/schedule.c	Sat Sep 06 15:50:12 2008 +0800
@@ -39,6 +39,8 @@
 string_param("sched", opt_sched);
 
 #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
+
+cpumask_t cpu_schedule_map;
 
 /* Various timer handlers. */
 static void s_timer_fn(void *unused);
@@ -266,6 +268,82 @@
         vcpu_sleep_nosync(v);
         vcpu_migrate(v);
     }
+}
+
+static void vcpu_force_migrate(struct vcpu *v, int cpu_from)
+{
+    unsigned long flags;
+
+    vcpu_schedule_lock_irqsave(v, flags);
+
+    if ( v->processor != cpu_from )
+    {
+        vcpu_schedule_unlock_irqrestore(v, flags);
+        return;
+    }
+
+    set_bit(_VPF_migrating, &v->pause_flags);
+    vcpu_schedule_unlock_irqrestore(v, flags);
+
+    if ( test_bit(_VPF_migrating, &v->pause_flags) )
+    {
+        vcpu_sleep_nosync(v);
+        vcpu_migrate(v);
+    }
+}
+
+void prepare_migration_on_cpu(int cpu)
+{
+    struct domain *d = NULL;
+    struct vcpu *v = NULL;
+    unsigned long flags;
+
+    for_each_domain(d)
+        for_each_vcpu(d, v)
+        {
+            if ( is_idle_vcpu(v) )
+                continue;
+
+            /* If vcpu is pinned on dying cpu, give warning here and vcpu can
+             * be continued on any cpus.
+             */
+            if ( cpus_weight(v->cpu_affinity) == 1
+                 && cpu_isset(cpu, v->cpu_affinity) )
+            {
+                printk("Breaking vcpu affinity for domain %d vcpu %d\n",
+                        v->domain->domain_id, v->vcpu_id);
+                vcpu_schedule_lock_irqsave(v, flags);
+                cpus_setall(v->cpu_affinity);
+                vcpu_schedule_unlock_irqrestore(v, flags);
+            }
+        }
+
+}
+
+/* This function is used by cpu_hotplug code. All vcpus but idle vcpu is
+ * migrated to other cpus. The caller should already prevented migration
+ * to this dying cpu.
+ */
+void migrate_all_vcpus_on_cpu(int cpu)
+{
+    struct domain *d = NULL;
+    struct vcpu *v = NULL;
+
+    for_each_domain(d)
+        for_each_vcpu(d, v)
+        {
+            if ( is_idle_vcpu(v) )
+                continue;
+
+            /* Single shot timer might be left active on this cpu, migrate it
+             * to bsp. A new cpu will be automatically shosen when * the timer
+             * is set again.
+             */
+            if ( v->singleshot_timer.cpu == cpu )
+                migrate_timer(&v->singleshot_timer, 0);
+
+            vcpu_force_migrate(v, cpu);
+        }
 }
 
 static int __vcpu_set_affinity(
diff -r be573a356c90 xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/include/xen/sched-if.h	Sat Sep 06 15:50:12 2008 +0800
@@ -79,4 +79,18 @@
     void         (*dump_cpu_state) (int);
 };
 
+extern cpumask_t cpu_schedule_map;
+
+static inline void cpu_schedule_map_set(int cpu)
+{
+    cpu_set(cpu, cpu_schedule_map);
+    smp_mb();
+}
+
+static inline void cpu_schedule_map_clear(int cpu)
+{
+    cpu_clear(cpu, cpu_schedule_map);
+    smp_mb();
+}
+
 #endif /* __XEN_SCHED_IF_H__ */
diff -r be573a356c90 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/include/xen/sched.h	Sat Sep 06 15:50:12 2008 +0800
@@ -524,6 +524,8 @@
 void cpu_init(void);
 
 void vcpu_force_reschedule(struct vcpu *v);
+void prepare_migration_on_cpu(int cpu);
+void migrate_all_vcpus_on_cpu(int cpu);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
 void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-09  8:59 [PATCH 1/4] CPU online/offline support in Xen Shan, Haitao
@ 2008-09-10 10:43 ` Keir Fraser
  2008-09-10 10:59   ` Keir Fraser
  2008-09-10 12:59   ` Haitao Shan
  0 siblings, 2 replies; 30+ messages in thread
From: Keir Fraser @ 2008-09-10 10:43 UTC (permalink / raw)
  To: Shan, Haitao; +Cc: xen-devel

I feel this is more complicated than it needs to be.

How about clearing VCPUs from the offlined CPU's runqueue from the very end
of __cpu_disable()? At that point all other CPUs are safely in softirq
context with IRQs disabled, and we are running on the correct CPU (being
offlined). We could have a hook into the scheduler subsystem at that point
to break affinities, assign to different runqueues, etc. We would just need
to be careful not to try an IPI. :-) This approach would not need a
cpu_schedule_map (which is really increasing code fragility imo, by creating
possible extra confusion about which cpumask is the wright one to use in a
given situation).

My feeling, unless I've missed something, is that this would make the patch
quite a bit smaller and with a smaller spread of code changes.

 -- Keir

On 9/9/08 09:59, "Shan, Haitao" <haitao.shan@intel.com> wrote:

> This patch implements cpu offline feature.
> 
> Best Regards
> Haitao Shan

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-10 10:43 ` Keir Fraser
@ 2008-09-10 10:59   ` Keir Fraser
  2008-09-10 12:59   ` Haitao Shan
  1 sibling, 0 replies; 30+ messages in thread
From: Keir Fraser @ 2008-09-10 10:59 UTC (permalink / raw)
  To: Shan, Haitao; +Cc: xen-devel

On 10/9/08 11:43, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

> I feel this is more complicated than it needs to be.
> 
> How about clearing VCPUs from the offlined CPU's runqueue from the very end
> of __cpu_disable()? At that point all other CPUs are safely in softirq
> context with IRQs disabled, and we are running on the correct CPU (being
> offlined). We could have a hook into the scheduler subsystem at that point
> to break affinities, assign to different runqueues, etc. We would just need
> to be careful not to try an IPI. :-) This approach would not need a
> cpu_schedule_map (which is really increasing code fragility imo, by creating
> possible extra confusion about which cpumask is the wright one to use in a
> given situation).
> 
> My feeling, unless I've missed something, is that this would make the patch
> quite a bit smaller and with a smaller spread of code changes.

Another thought: we may (appear to) need an IPI after VCPUs have been
migrated to other runqueues. And actually that will be safe because
smp_send_event_check_cpu() is non-blocking (does not wait for the remote CPU
to run the IPI handler). So it *is* safe to do non-blocking IPIs from
stop_machine_run() context.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-10 10:43 ` Keir Fraser
  2008-09-10 10:59   ` Keir Fraser
@ 2008-09-10 12:59   ` Haitao Shan
  2008-09-10 16:05     ` Frank van der Linden
  2008-09-11  8:02     ` Shan, Haitao
  1 sibling, 2 replies; 30+ messages in thread
From: Haitao Shan @ 2008-09-10 12:59 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, Shan, Haitao

Agree. Placing migration in stop_machine context will definitely make
our jobs easier. I will start making a new patch tomorrow. :)
I place the migraton code outside the stop_machine_run context, partly
because I am not quite sure how long it will take to migrate all the
vcpus away. If it takes too much time, all useful works are blocked
since all cpus are in the stop_machine context. Of course, I borrowed
the ideas from kernel, which also let me made the desicion.

2008/9/10 Keir Fraser <keir.fraser@eu.citrix.com>:
> I feel this is more complicated than it needs to be.
>
> How about clearing VCPUs from the offlined CPU's runqueue from the very end
> of __cpu_disable()? At that point all other CPUs are safely in softirq
> context with IRQs disabled, and we are running on the correct CPU (being
> offlined). We could have a hook into the scheduler subsystem at that point
> to break affinities, assign to different runqueues, etc. We would just need
> to be careful not to try an IPI. :-) This approach would not need a
> cpu_schedule_map (which is really increasing code fragility imo, by creating
> possible extra confusion about which cpumask is the wright one to use in a
> given situation).
>
> My feeling, unless I've missed something, is that this would make the patch
> quite a bit smaller and with a smaller spread of code changes.
>
>  -- Keir
>
> On 9/9/08 09:59, "Shan, Haitao" <haitao.shan@intel.com> wrote:
>
>> This patch implements cpu offline feature.
>>
>> Best Regards
>> Haitao Shan
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-10 12:59   ` Haitao Shan
@ 2008-09-10 16:05     ` Frank van der Linden
  2008-09-11  7:36       ` Keir Fraser
  2008-09-11  8:02     ` Shan, Haitao
  1 sibling, 1 reply; 30+ messages in thread
From: Frank van der Linden @ 2008-09-10 16:05 UTC (permalink / raw)
  To: Haitao Shan; +Cc: xen-devel, Shan, Haitao, Keir Fraser

Haitao Shan wrote:
> Agree. Placing migration in stop_machine context will definitely make
> our jobs easier. I will start making a new patch tomorrow. :)
> I place the migraton code outside the stop_machine_run context, partly
> because I am not quite sure how long it will take to migrate all the
> vcpus away. If it takes too much time, all useful works are blocked
> since all cpus are in the stop_machine context. Of course, I borrowed
> the ideas from kernel, which also let me made the desicion.
>
> 2008/9/10 Keir Fraser <keir.fraser@eu.citrix.com>:
>   
>> I feel this is more complicated than it needs to be.
>>
>> How about clearing VCPUs from the offlined CPU's runqueue from the very end
>> of __cpu_disable()? At that point all other CPUs are safely in softirq
>> context with IRQs disabled, and we are running on the correct CPU (being
>> offlined). We could have a hook into the scheduler subsystem at that point
>> to break affinities, assign to different runqueues, etc. We would just need
>> to be careful not to try an IPI. :-) This approach would not need a
>> cpu_schedule_map (which is really increasing code fragility imo, by creating
>> possible extra confusion about which cpumask is the wright one to use in a
>> given situation).
>>
>> My feeling, unless I've missed something, is that this would make the patch
>> quite a bit smaller and with a smaller spread of code changes.
>>
>>  -- Keir
>>     
This would also address some problems I saw with the patch: race 
conditions regarding migration of VCPUs, because other CPUs may call 
runq_tickle. Or a hypercall may come in changing the VCPU affinity, 
since things are done in 2 stages.

The changes I have are more complicated, because I was working off 
3.1.4, which is our current Xen version. It doesn't have things like 
stop_machine_run. But if the patch is simplified in this manner, it is 
easier for us to use, and we can just backport things like 
stop_machine_run for the time being.

The other issue I was seeing was that cpu_up sometimes did not succeed 
in actually getting a CPU to boot. But there have been a few fixes to 
smpboot.c, so I'll have to see if that always works now.

- Frank

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-10 16:05     ` Frank van der Linden
@ 2008-09-11  7:36       ` Keir Fraser
  0 siblings, 0 replies; 30+ messages in thread
From: Keir Fraser @ 2008-09-11  7:36 UTC (permalink / raw)
  To: Frank van der Linden, Haitao Shan; +Cc: xen-devel

On 10/9/08 17:05, "Frank van der Linden" <Frank.Vanderlinden@Sun.COM> wrote:

> The other issue I was seeing was that cpu_up sometimes did not succeed
> in actually getting a CPU to boot. But there have been a few fixes to
> smpboot.c, so I'll have to see if that always works now.

Thanks, that would be interesting. Perhaps we need to tweak some delay
parameters in smpboot.c.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-10 12:59   ` Haitao Shan
  2008-09-10 16:05     ` Frank van der Linden
@ 2008-09-11  8:02     ` Shan, Haitao
  2008-09-11 11:12       ` Keir Fraser
  1 sibling, 1 reply; 30+ messages in thread
From: Shan, Haitao @ 2008-09-11  8:02 UTC (permalink / raw)
  To: Haitao Shan, Keir Fraser; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 2006 bytes --]

Hi, Keir,

Attached is the updated patch using the methods as you described in
another mail.
What do you think of the one?

Signed-off-by:	Shan Haitao <haitao.shan@intel.com> 

Best Regards
Haitao Shan

Haitao Shan wrote:
> Agree. Placing migration in stop_machine context will definitely make
> our jobs easier. I will start making a new patch tomorrow. :)
> I place the migraton code outside the stop_machine_run context, partly
> because I am not quite sure how long it will take to migrate all the
> vcpus away. If it takes too much time, all useful works are blocked
> since all cpus are in the stop_machine context. Of course, I borrowed
> the ideas from kernel, which also let me made the desicion.
> 
> 2008/9/10 Keir Fraser <keir.fraser@eu.citrix.com>:
>> I feel this is more complicated than it needs to be.
>> 
>> How about clearing VCPUs from the offlined CPU's runqueue from the
>> very end of __cpu_disable()? At that point all other CPUs are safely
>> in softirq context with IRQs disabled, and we are running on the
>> correct CPU (being offlined). We could have a hook into the
>> scheduler subsystem at that point to break affinities, assign to
>> different runqueues, etc. We would just need to be careful not to
>> try an IPI. :-) This approach would not need a cpu_schedule_map
>> (which is really increasing code fragility imo, by creating possible
>> extra confusion about which cpumask is the wright one to use in a
>> given situation).  
>> 
>> My feeling, unless I've missed something, is that this would make
>> the patch quite a bit smaller and with a smaller spread of code
>> changes. 
>> 
>>  -- Keir
>> 
>> On 9/9/08 09:59, "Shan, Haitao" <haitao.shan@intel.com> wrote:
>> 
>>> This patch implements cpu offline feature.
>>> 
>>> Best Regards
>>> Haitao Shan
>> 
>> 
>> 
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xensource.com
>> http://lists.xensource.com/xen-devel

[-- Attachment #2: cpu_offline.patch --]
[-- Type: application/octet-stream, Size: 6299 bytes --]

diff -r f5e72cbfbb17 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c	Wed Sep 10 11:26:16 2008 +0100
+++ b/xen/arch/x86/irq.c	Thu Sep 11 15:48:05 2008 +0800
@@ -739,6 +739,7 @@
 {
     unsigned int irq;
     static int warned;
+    irq_guest_action_t *action;
 
     for ( irq = 0; irq < NR_IRQS; irq++ )
     {
@@ -756,6 +757,16 @@
             irq_desc[irq].handler->set_affinity(irq, mask);
         else if ( irq_desc[irq].action && !(warned++) )
             printk("Cannot set affinity for irq %i\n", irq);
+
+        if ( !(irq_desc[irq].status & IRQ_GUEST) )
+            continue;
+        action = (irq_guest_action_t *)irq_desc[irq].action;
+        if ( cpu_isset(smp_processor_id(), action->cpu_eoi_map) )
+        {
+            ack_APIC_irq();
+            cpu_clear(smp_processor_id(), action->cpu_eoi_map);
+            printk("Flushing pending eoi for irq %i\n", irq);
+        }
     }
 
     local_irq_enable();
diff -r f5e72cbfbb17 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c	Wed Sep 10 11:26:16 2008 +0100
+++ b/xen/arch/x86/smpboot.c	Thu Sep 11 15:48:05 2008 +0800
@@ -1225,15 +1225,6 @@
 	if (cpu == 0)
 		return -EBUSY;
 
-	/*
-	 * Only S3 is using this path, and thus idle vcpus are running on all
-	 * APs when we are called. To support full cpu hotplug, other 
-	 * notification mechanisms should be introduced (e.g., migrate vcpus
-	 * off this physical cpu before rendezvous point).
-	 */
-	if (!is_idle_vcpu(current))
-		return -EINVAL;
-
 	local_irq_disable();
 	clear_local_APIC();
 	/* Allow any queued timer interrupts to get serviced */
@@ -1249,6 +1240,8 @@
 	fixup_irqs(map);
 	/* It's now safe to remove this processor from the online map */
 	cpu_clear(cpu, cpu_online_map);
+
+    migrate_all_vcpus_on_cpu(smp_processor_id());
 	return 0;
 }
 
@@ -1275,28 +1268,6 @@
     return __cpu_disable();
 }
 
-/* 
- * XXX: One important thing missed here is to migrate vcpus
- * from dead cpu to other online ones and then put whole
- * system into a stop state. It assures a safe environment
- * for a cpu hotplug/remove at normal running state.
- *
- * However for xen PM case, at this point:
- * 	-> All other domains should be notified with PM event,
- *	   and then in following states:
- *		* Suspend state, or
- *		* Paused state, which is a force step to all
- *		  domains if they do nothing to suspend
- *	-> All vcpus of dom0 (except vcpu0) have already beem
- *	   hot removed
- * with the net effect that all other cpus only have idle vcpu
- * running. In this special case, we can avoid vcpu migration
- * then and system can be considered in a stop state.
- *
- * So current cpu hotplug is a special version for PM specific
- * usage, and need more effort later for full cpu hotplug.
- * (ktian1)
- */
 int cpu_down(unsigned int cpu)
 {
 	int err = 0;
@@ -1306,6 +1277,13 @@
 		err = -EBUSY;
 		goto out;
 	}
+
+    /* Can not offline BSP */
+    if ( cpu == 0 )
+    {
+        err = -EINVAL;
+        goto out;
+    }
 
 	if (!cpu_online(cpu)) {
 		err = -EINVAL;
diff -r f5e72cbfbb17 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c	Wed Sep 10 11:26:16 2008 +0100
+++ b/xen/common/sched_credit.c	Thu Sep 11 15:48:05 2008 +0800
@@ -407,11 +407,14 @@
 static inline int
 __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
 {
+    cpumask_t mask;
+
     /*
      * Don't pick up work that's in the peer's scheduling tail. Also only pick
      * up work that's allowed to run on our CPU.
      */
-    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+    cpus_and(mask, vc->cpu_affinity, cpu_online_map);
+    return !vc->is_running && cpu_isset(dest_cpu, mask);
 }
 
 static int
diff -r f5e72cbfbb17 xen/common/schedule.c
--- a/xen/common/schedule.c	Wed Sep 10 11:26:16 2008 +0100
+++ b/xen/common/schedule.c	Thu Sep 11 15:48:05 2008 +0800
@@ -286,6 +286,67 @@
         vcpu_sleep_nosync(v);
         vcpu_migrate(v);
     }
+}
+
+static void vcpu_force_migrate(struct vcpu *v, int cpu_from)
+{
+    unsigned long flags;
+
+    vcpu_schedule_lock_irqsave(v, flags);
+
+    if ( v->processor != cpu_from )
+    {
+        vcpu_schedule_unlock_irqrestore(v, flags);
+        return;
+    }
+
+    set_bit(_VPF_migrating, &v->pause_flags);
+    vcpu_schedule_unlock_irqrestore(v, flags);
+
+    if ( test_bit(_VPF_migrating, &v->pause_flags) )
+    {
+        vcpu_sleep_nosync(v);
+        vcpu_migrate(v);
+    }
+}
+
+/* This function is used by cpu_hotplug code. All vcpus but idle vcpu is
+ * migrated to other cpus. The caller should already prevented migration
+ * to this dying cpu.
+ */
+void migrate_all_vcpus_on_cpu(int cpu)
+{
+    struct domain *d = NULL;
+    struct vcpu *v = NULL;
+    unsigned long flags;
+
+    for_each_domain(d)
+        for_each_vcpu(d, v)
+        {
+            if ( is_idle_vcpu(v) )
+                continue;
+
+            /* If vcpu is pinned on dying cpu, give warning here and vcpu can
+             * be continued on any cpus.
+             */
+            if ( cpus_weight(v->cpu_affinity) == 1
+                 && cpu_isset(cpu, v->cpu_affinity) )
+            {
+                printk("Breaking vcpu affinity for domain %d vcpu %d\n",
+                        v->domain->domain_id, v->vcpu_id);
+                vcpu_schedule_lock_irqsave(v, flags);
+                cpus_setall(v->cpu_affinity);
+                vcpu_schedule_unlock_irqrestore(v, flags);
+            }
+            /* Single shot timer might be left active on this cpu, migrate it
+             * to bsp. A new cpu will be automatically shosen when * the timer
+             * is set again.
+             */
+            if ( v->singleshot_timer.cpu == cpu )
+                migrate_timer(&v->singleshot_timer, 0);
+
+            vcpu_force_migrate(v, cpu);
+        }
 }
 
 static int __vcpu_set_affinity(
diff -r f5e72cbfbb17 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Wed Sep 10 11:26:16 2008 +0100
+++ b/xen/include/xen/sched.h	Thu Sep 11 15:48:05 2008 +0800
@@ -524,6 +524,7 @@
 void cpu_init(void);
 
 void vcpu_force_reschedule(struct vcpu *v);
+void migrate_all_vcpus_on_cpu(int cpu);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
 void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11  8:02     ` Shan, Haitao
@ 2008-09-11 11:12       ` Keir Fraser
  2008-09-11 11:33         ` Shan, Haitao
  0 siblings, 1 reply; 30+ messages in thread
From: Keir Fraser @ 2008-09-11 11:12 UTC (permalink / raw)
  To: Shan, Haitao, Haitao Shan; +Cc: xen-devel

It looks much better. I'll read through, maybe tweak, and most likely then
check it in.

 Thanks,
 Keir

On 11/9/08 09:02, "Shan, Haitao" <haitao.shan@intel.com> wrote:

> Hi, Keir,
> 
> Attached is the updated patch using the methods as you described in
> another mail.
> What do you think of the one?
> 
> Signed-off-by: Shan Haitao <haitao.shan@intel.com>
> 
> Best Regards
> Haitao Shan
> 
> Haitao Shan wrote:
>> Agree. Placing migration in stop_machine context will definitely make
>> our jobs easier. I will start making a new patch tomorrow. :)
>> I place the migraton code outside the stop_machine_run context, partly
>> because I am not quite sure how long it will take to migrate all the
>> vcpus away. If it takes too much time, all useful works are blocked
>> since all cpus are in the stop_machine context. Of course, I borrowed
>> the ideas from kernel, which also let me made the desicion.
>> 
>> 2008/9/10 Keir Fraser <keir.fraser@eu.citrix.com>:
>>> I feel this is more complicated than it needs to be.
>>> 
>>> How about clearing VCPUs from the offlined CPU's runqueue from the
>>> very end of __cpu_disable()? At that point all other CPUs are safely
>>> in softirq context with IRQs disabled, and we are running on the
>>> correct CPU (being offlined). We could have a hook into the
>>> scheduler subsystem at that point to break affinities, assign to
>>> different runqueues, etc. We would just need to be careful not to
>>> try an IPI. :-) This approach would not need a cpu_schedule_map
>>> (which is really increasing code fragility imo, by creating possible
>>> extra confusion about which cpumask is the wright one to use in a
>>> given situation).
>>> 
>>> My feeling, unless I've missed something, is that this would make
>>> the patch quite a bit smaller and with a smaller spread of code
>>> changes. 
>>> 
>>>  -- Keir
>>> 
>>> On 9/9/08 09:59, "Shan, Haitao" <haitao.shan@intel.com> wrote:
>>> 
>>>> This patch implements cpu offline feature.
>>>> 
>>>> Best Regards
>>>> Haitao Shan
>>> 
>>> 
>>> 
>>> _______________________________________________
>>> Xen-devel mailing list
>>> Xen-devel@lists.xensource.com
>>> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 11:12       ` Keir Fraser
@ 2008-09-11 11:33         ` Shan, Haitao
  2008-09-11 12:42           ` Keir Fraser
  2008-09-11 14:15           ` Keir Fraser
  0 siblings, 2 replies; 30+ messages in thread
From: Shan, Haitao @ 2008-09-11 11:33 UTC (permalink / raw)
  To: Keir Fraser, Haitao Shan, Tian, Kevin; +Cc: xen-devel

Thanks!
Concerning cpu online/offline development, I have a small question here.
Since cpu_online_map is very important, code in different subsystems may use it extensively. If such code is not designed with cpu online/offline in mind, it may introduce race conditions, just like the one fixed in cpu calibration rendezvous.
Currently, we solve it in a find-and-fix manner. Do you have any idea that can solve the problem in a cleaner way?
Thanks in advance.

Shan Haitao 

-----Original Message-----
From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
Sent: 2008年9月11日 19:13
To: Shan, Haitao; Haitao Shan
Cc: xen-devel@lists.xensource.com
Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline support in Xen

It looks much better. I'll read through, maybe tweak, and most likely then
check it in.

 Thanks,
 Keir

On 11/9/08 09:02, "Shan, Haitao" <haitao.shan@intel.com> wrote:

> Hi, Keir,
> 
> Attached is the updated patch using the methods as you described in
> another mail.
> What do you think of the one?
> 
> Signed-off-by: Shan Haitao <haitao.shan@intel.com>
> 
> Best Regards
> Haitao Shan
> 
> Haitao Shan wrote:
>> Agree. Placing migration in stop_machine context will definitely make
>> our jobs easier. I will start making a new patch tomorrow. :)
>> I place the migraton code outside the stop_machine_run context, partly
>> because I am not quite sure how long it will take to migrate all the
>> vcpus away. If it takes too much time, all useful works are blocked
>> since all cpus are in the stop_machine context. Of course, I borrowed
>> the ideas from kernel, which also let me made the desicion.
>> 
>> 2008/9/10 Keir Fraser <keir.fraser@eu.citrix.com>:
>>> I feel this is more complicated than it needs to be.
>>> 
>>> How about clearing VCPUs from the offlined CPU's runqueue from the
>>> very end of __cpu_disable()? At that point all other CPUs are safely
>>> in softirq context with IRQs disabled, and we are running on the
>>> correct CPU (being offlined). We could have a hook into the
>>> scheduler subsystem at that point to break affinities, assign to
>>> different runqueues, etc. We would just need to be careful not to
>>> try an IPI. :-) This approach would not need a cpu_schedule_map
>>> (which is really increasing code fragility imo, by creating possible
>>> extra confusion about which cpumask is the wright one to use in a
>>> given situation).
>>> 
>>> My feeling, unless I've missed something, is that this would make
>>> the patch quite a bit smaller and with a smaller spread of code
>>> changes. 
>>> 
>>>  -- Keir
>>> 
>>> On 9/9/08 09:59, "Shan, Haitao" <haitao.shan@intel.com> wrote:
>>> 
>>>> This patch implements cpu offline feature.
>>>> 
>>>> Best Regards
>>>> Haitao Shan
>>> 
>>> 
>>> 
>>> _______________________________________________
>>> Xen-devel mailing list
>>> Xen-devel@lists.xensource.com
>>> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 11:33         ` Shan, Haitao
@ 2008-09-11 12:42           ` Keir Fraser
  2008-09-11 14:15           ` Keir Fraser
  1 sibling, 0 replies; 30+ messages in thread
From: Keir Fraser @ 2008-09-11 12:42 UTC (permalink / raw)
  To: Shan, Haitao, Haitao Shan, Tian, Kevin; +Cc: xen-devel

On 11/9/08 12:33, "Shan, Haitao" <haitao.shan@intel.com> wrote:

> Thanks!
> Concerning cpu online/offline development, I have a small question here.
> Since cpu_online_map is very important, code in different subsystems may use
> it extensively. If such code is not designed with cpu online/offline in mind,
> it may introduce race conditions, just like the one fixed in cpu calibration
> rendezvous.
> Currently, we solve it in a find-and-fix manner. Do you have any idea that can
> solve the problem in a cleaner way?
> Thanks in advance.

Mostly it will be race against CPUs coming online, since stop_machine_run()
is a strong barrier in the offline case. We could use stop_machine_run() for
onlining too (even if just to set a bit in cpu_online_map). Or indeed we
just carry on fixing up the bugs one by one as we find them. I actually
doubt there are that many, so I think this current approach is fine.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 11:33         ` Shan, Haitao
  2008-09-11 12:42           ` Keir Fraser
@ 2008-09-11 14:15           ` Keir Fraser
  2008-09-11 14:23             ` Christoph Egger
  2008-09-11 16:00             ` Shan, Haitao
  1 sibling, 2 replies; 30+ messages in thread
From: Keir Fraser @ 2008-09-11 14:15 UTC (permalink / raw)
  To: Shan, Haitao, Haitao Shan, Tian, Kevin; +Cc: xen-devel

I applied the patch with the following changes:
 * I rewrote your changes to fixup_irqs(). We should force lazy EOIs *after*
we have serviced any straggling interrupts. Also we should actually clear
the EOI stack so it is empty next time the CPU comes online.
 * I simplified your changes to schedule.c in light of the fact we run in
stop_machine context. Hence we can be quite relaxed about locking, for
example.
 * I removed your change to __csched_vcpu_is_migrateable() and instead put a
similar check in csched_load_balance(). I think this is clearer and also
cheaper.

I note that the VCPU currently running on the offlined CPU continues to run
there even after __cpu_disable(), and until that CPU does a final run
through the scheduler soon after. I hope it does not matter there is one
vcpu with v->processor == offlined_cpu for a short while (e.g., what if
another CPU does vcpu_sleep_nosync(v) -> cpu_raise_softirq(v->processor,
...)). I *think* it's actually okay, but I'm not totally certain. Really I
guess this patch needs some stress testing (lots of online/offline cycles
while pausing/unpausing domains, etc). Perhaps we could plumb through a Xen
sysctl and make a small dom0 utility for this purpose?

 -- Keir

On 11/9/08 12:33, "Shan, Haitao" <haitao.shan@intel.com> wrote:

> Thanks!
> Concerning cpu online/offline development, I have a small question here.
> Since cpu_online_map is very important, code in different subsystems may use
> it extensively. If such code is not designed with cpu online/offline in mind,
> it may introduce race conditions, just like the one fixed in cpu calibration
> rendezvous.
> Currently, we solve it in a find-and-fix manner. Do you have any idea that can
> solve the problem in a cleaner way?
> Thanks in advance.
> 
> Shan Haitao 
> 
> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: 2008年9月11日 19:13
> To: Shan, Haitao; Haitao Shan
> Cc: xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline support in Xen
> 
> It looks much better. I'll read through, maybe tweak, and most likely then
> check it in.
> 
>  Thanks,
>  Keir
> 
> On 11/9/08 09:02, "Shan, Haitao" <haitao.shan@intel.com> wrote:
> 
>> Hi, Keir,
>> 
>> Attached is the updated patch using the methods as you described in
>> another mail.
>> What do you think of the one?
>> 
>> Signed-off-by: Shan Haitao <haitao.shan@intel.com>
>> 
>> Best Regards
>> Haitao Shan
>> 
>> Haitao Shan wrote:
>>> Agree. Placing migration in stop_machine context will definitely make
>>> our jobs easier. I will start making a new patch tomorrow. :)
>>> I place the migraton code outside the stop_machine_run context, partly
>>> because I am not quite sure how long it will take to migrate all the
>>> vcpus away. If it takes too much time, all useful works are blocked
>>> since all cpus are in the stop_machine context. Of course, I borrowed
>>> the ideas from kernel, which also let me made the desicion.
>>> 
>>> 2008/9/10 Keir Fraser <keir.fraser@eu.citrix.com>:
>>>> I feel this is more complicated than it needs to be.
>>>> 
>>>> How about clearing VCPUs from the offlined CPU's runqueue from the
>>>> very end of __cpu_disable()? At that point all other CPUs are safely
>>>> in softirq context with IRQs disabled, and we are running on the
>>>> correct CPU (being offlined). We could have a hook into the
>>>> scheduler subsystem at that point to break affinities, assign to
>>>> different runqueues, etc. We would just need to be careful not to
>>>> try an IPI. :-) This approach would not need a cpu_schedule_map
>>>> (which is really increasing code fragility imo, by creating possible
>>>> extra confusion about which cpumask is the wright one to use in a
>>>> given situation).
>>>> 
>>>> My feeling, unless I've missed something, is that this would make
>>>> the patch quite a bit smaller and with a smaller spread of code
>>>> changes. 
>>>> 
>>>>  -- Keir
>>>> 
>>>> On 9/9/08 09:59, "Shan, Haitao" <haitao.shan@intel.com> wrote:
>>>> 
>>>>> This patch implements cpu offline feature.
>>>>> 
>>>>> Best Regards
>>>>> Haitao Shan
>>>> 
>>>> 
>>>> 
>>>> _______________________________________________
>>>> Xen-devel mailing list
>>>> Xen-devel@lists.xensource.com
>>>> http://lists.xensource.com/xen-devel
> 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 14:15           ` Keir Fraser
@ 2008-09-11 14:23             ` Christoph Egger
  2008-09-11 14:32               ` Keir Fraser
  2008-09-17  4:17               ` Gavin Maltby
  2008-09-11 16:00             ` Shan, Haitao
  1 sibling, 2 replies; 30+ messages in thread
From: Christoph Egger @ 2008-09-11 14:23 UTC (permalink / raw)
  To: xen-devel; +Cc: Haitao Shan, Tian, Kevin, Shan, Haitao, Keir Fraser

On Thursday 11 September 2008 16:15:14 Keir Fraser wrote:
> I applied the patch with the following changes:
>  * I rewrote your changes to fixup_irqs(). We should force lazy EOIs
> *after* we have serviced any straggling interrupts. Also we should actually
> clear the EOI stack so it is empty next time the CPU comes online.
>  * I simplified your changes to schedule.c in light of the fact we run in
> stop_machine context. Hence we can be quite relaxed about locking, for
> example.
>  * I removed your change to __csched_vcpu_is_migrateable() and instead put
> a similar check in csched_load_balance(). I think this is clearer and also
> cheaper.
>
> I note that the VCPU currently running on the offlined CPU continues to run
> there even after __cpu_disable(), and until that CPU does a final run
> through the scheduler soon after. I hope it does not matter there is one
> vcpu with v->processor == offlined_cpu for a short while 

This is not acceptable regarding to machine check. When Dom0 offlines a
defect cpu, nothing may continue on it or silent data corruption occurs.

Christoph


> On 11/9/08 12:33, "Shan, Haitao" <haitao.shan@intel.com> wrote:
> > Thanks!
> > Concerning cpu online/offline development, I have a small question here.
> > Since cpu_online_map is very important, code in different subsystems may
> > use it extensively. If such code is not designed with cpu online/offline
> > in mind, it may introduce race conditions, just like the one fixed in cpu
> > calibration rendezvous.
> > Currently, we solve it in a find-and-fix manner. Do you have any idea
> > that can solve the problem in a cleaner way?
> > Thanks in advance.
> >
> > Shan Haitao
> >
> > -----Original Message-----
> > From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> > Sent: 2008年9月11日 19:13
> > To: Shan, Haitao; Haitao Shan
> > Cc: xen-devel@lists.xensource.com
> > Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline support in
> > Xen
> >
> > It looks much better. I'll read through, maybe tweak, and most likely
> > then check it in.
> >
> >  Thanks,
> >  Keir
> >
> > On 11/9/08 09:02, "Shan, Haitao" <haitao.shan@intel.com> wrote:
> >> Hi, Keir,
> >>
> >> Attached is the updated patch using the methods as you described in
> >> another mail.
> >> What do you think of the one?
> >>
> >> Signed-off-by: Shan Haitao <haitao.shan@intel.com>
> >>
> >> Best Regards
> >> Haitao Shan
> >>
> >> Haitao Shan wrote:
> >>> Agree. Placing migration in stop_machine context will definitely make
> >>> our jobs easier. I will start making a new patch tomorrow. :)
> >>> I place the migraton code outside the stop_machine_run context, partly
> >>> because I am not quite sure how long it will take to migrate all the
> >>> vcpus away. If it takes too much time, all useful works are blocked
> >>> since all cpus are in the stop_machine context. Of course, I borrowed
> >>> the ideas from kernel, which also let me made the desicion.
> >>>
> >>> 2008/9/10 Keir Fraser <keir.fraser@eu.citrix.com>:
> >>>> I feel this is more complicated than it needs to be.
> >>>>
> >>>> How about clearing VCPUs from the offlined CPU's runqueue from the
> >>>> very end of __cpu_disable()? At that point all other CPUs are safely
> >>>> in softirq context with IRQs disabled, and we are running on the
> >>>> correct CPU (being offlined). We could have a hook into the
> >>>> scheduler subsystem at that point to break affinities, assign to
> >>>> different runqueues, etc. We would just need to be careful not to
> >>>> try an IPI. :-) This approach would not need a cpu_schedule_map
> >>>> (which is really increasing code fragility imo, by creating possible
> >>>> extra confusion about which cpumask is the wright one to use in a
> >>>> given situation).
> >>>>
> >>>> My feeling, unless I've missed something, is that this would make
> >>>> the patch quite a bit smaller and with a smaller spread of code
> >>>> changes.
> >>>>
> >>>>  -- Keir
> >>>>
> >>>> On 9/9/08 09:59, "Shan, Haitao" <haitao.shan@intel.com> wrote:
> >>>>> This patch implements cpu offline feature.
> >>>>>
> >>>>> Best Regards
> >>>>> Haitao Shan


-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 14:23             ` Christoph Egger
@ 2008-09-11 14:32               ` Keir Fraser
  2008-09-11 14:47                 ` Keir Fraser
  2008-09-17  4:17               ` Gavin Maltby
  1 sibling, 1 reply; 30+ messages in thread
From: Keir Fraser @ 2008-09-11 14:32 UTC (permalink / raw)
  To: Christoph Egger, xen-devel; +Cc: Haitao Shan, Tian, Kevin, Shan, Haitao

On 11/9/08 15:23, "Christoph Egger" <Christoph.Egger@amd.com> wrote:

>> I note that the VCPU currently running on the offlined CPU continues to run
>> there even after __cpu_disable(), and until that CPU does a final run
>> through the scheduler soon after. I hope it does not matter there is one
>> vcpu with v->processor == offlined_cpu for a short while
> 
> This is not acceptable regarding to machine check. When Dom0 offlines a
> defect cpu, nothing may continue on it or silent data corruption occurs.

It doesn't run for unbounded time. The offline CPU will immediately run
softirq work as the very next thing it does, causing a run through the
scheduler, where it will 100% definitely pick up the idle vcpu and hence
play dead in the idle loop.

This won't be guaranteed synchronous with a offline request via a hypercall
though. By which I mean, that hypercall may return but the last vcpu may not
stop running on the cpu until some tiny amount of time later.

If this is unacceptable we have to add a hook into sched_credit.c to
synchronously and forcibly deschedule the currently running VCPU from within
stop_machine context. This is probably moderately tricky, so I hope we can
avoid it.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 14:32               ` Keir Fraser
@ 2008-09-11 14:47                 ` Keir Fraser
  0 siblings, 0 replies; 30+ messages in thread
From: Keir Fraser @ 2008-09-11 14:47 UTC (permalink / raw)
  To: Christoph Egger, xen-devel; +Cc: Haitao Shan, Tian, Kevin, Shan, Haitao

On 11/9/08 15:32, "Keir Fraser" <keir.fraser@eu.citrix.com> wrote:

>>> I note that the VCPU currently running on the offlined CPU continues to run
>>> there even after __cpu_disable(), and until that CPU does a final run
>>> through the scheduler soon after. I hope it does not matter there is one
>>> vcpu with v->processor == offlined_cpu for a short while
>> 
>> This is not acceptable regarding to machine check. When Dom0 offlines a
>> defect cpu, nothing may continue on it or silent data corruption occurs.
> 
> It doesn't run for unbounded time. The offline CPU will immediately run
> softirq work as the very next thing it does, causing a run through the
> scheduler, where it will 100% definitely pick up the idle vcpu and hence
> play dead in the idle loop.
> 
> This won't be guaranteed synchronous with a offline request via a hypercall
> though. By which I mean, that hypercall may return but the last vcpu may not
> stop running on the cpu until some tiny amount of time later.

Actually I'm wrong on this. __cpu_die() will ensure the CPU really is
totally offline before returning. So calls to cpu_down() really are
synchronous already.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 14:15           ` Keir Fraser
  2008-09-11 14:23             ` Christoph Egger
@ 2008-09-11 16:00             ` Shan, Haitao
  2008-09-11 16:52               ` Keir Fraser
  1 sibling, 1 reply; 30+ messages in thread
From: Shan, Haitao @ 2008-09-11 16:00 UTC (permalink / raw)
  To: Keir Fraser, Haitao Shan, Tian, Kevin; +Cc: xen-devel

Hi, Keir,

Concerning the last running vcpu on the dying cpu, I have some thought.
Yes, there would be a short time after the stop_machine_run when this vcpu v->processor == dying_cpu. But anyhow, we set fie __VPF_migrating flag for that vcpu and issued a schedule_softirq on the dying cpu. 
This softirq should run immediately after stop_machine context, am I right? If so, by the time the schedule softirq is executed, this last vcpu is migrated away from this dying cpu. But saving of its context will be delayed to play_dead->sync_lazy_context.
If another cpu issues the schedule request to this dying cpu (vcpu_sleep_nosync->cpu_raise_softirq(vc->processor....)) during this time, the request will be serviced by the above code sequence. So it is safe in such cases.
Am I missing something important? I am not quite confident on the statements, though.

Shan Haitao

-----Original Message-----
From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
Sent: 2008年9月11日 22:15
To: Shan, Haitao; Haitao Shan; Tian, Kevin
Cc: xen-devel@lists.xensource.com
Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline support in Xen

I applied the patch with the following changes:
 * I rewrote your changes to fixup_irqs(). We should force lazy EOIs *after*
we have serviced any straggling interrupts. Also we should actually clear
the EOI stack so it is empty next time the CPU comes online.
 * I simplified your changes to schedule.c in light of the fact we run in
stop_machine context. Hence we can be quite relaxed about locking, for
example.
 * I removed your change to __csched_vcpu_is_migrateable() and instead put a
similar check in csched_load_balance(). I think this is clearer and also
cheaper.

I note that the VCPU currently running on the offlined CPU continues to run
there even after __cpu_disable(), and until that CPU does a final run
through the scheduler soon after. I hope it does not matter there is one
vcpu with v->processor == offlined_cpu for a short while (e.g., what if
another CPU does vcpu_sleep_nosync(v) -> cpu_raise_softirq(v->processor,
...)). I *think* it's actually okay, but I'm not totally certain. Really I
guess this patch needs some stress testing (lots of online/offline cycles
while pausing/unpausing domains, etc). Perhaps we could plumb through a Xen
sysctl and make a small dom0 utility for this purpose?

 -- Keir

On 11/9/08 12:33, "Shan, Haitao" <haitao.shan@intel.com> wrote:

> Thanks!
> Concerning cpu online/offline development, I have a small question here.
> Since cpu_online_map is very important, code in different subsystems may use
> it extensively. If such code is not designed with cpu online/offline in mind,
> it may introduce race conditions, just like the one fixed in cpu calibration
> rendezvous.
> Currently, we solve it in a find-and-fix manner. Do you have any idea that can
> solve the problem in a cleaner way?
> Thanks in advance.
> 
> Shan Haitao 
> 
> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: 2008年9月11日 19:13
> To: Shan, Haitao; Haitao Shan
> Cc: xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline support in Xen
> 
> It looks much better. I'll read through, maybe tweak, and most likely then
> check it in.
> 
>  Thanks,
>  Keir
> 
> On 11/9/08 09:02, "Shan, Haitao" <haitao.shan@intel.com> wrote:
> 
>> Hi, Keir,
>> 
>> Attached is the updated patch using the methods as you described in
>> another mail.
>> What do you think of the one?
>> 
>> Signed-off-by: Shan Haitao <haitao.shan@intel.com>
>> 
>> Best Regards
>> Haitao Shan
>> 
>> Haitao Shan wrote:
>>> Agree. Placing migration in stop_machine context will definitely make
>>> our jobs easier. I will start making a new patch tomorrow. :)
>>> I place the migraton code outside the stop_machine_run context, partly
>>> because I am not quite sure how long it will take to migrate all the
>>> vcpus away. If it takes too much time, all useful works are blocked
>>> since all cpus are in the stop_machine context. Of course, I borrowed
>>> the ideas from kernel, which also let me made the desicion.
>>> 
>>> 2008/9/10 Keir Fraser <keir.fraser@eu.citrix.com>:
>>>> I feel this is more complicated than it needs to be.
>>>> 
>>>> How about clearing VCPUs from the offlined CPU's runqueue from the
>>>> very end of __cpu_disable()? At that point all other CPUs are safely
>>>> in softirq context with IRQs disabled, and we are running on the
>>>> correct CPU (being offlined). We could have a hook into the
>>>> scheduler subsystem at that point to break affinities, assign to
>>>> different runqueues, etc. We would just need to be careful not to
>>>> try an IPI. :-) This approach would not need a cpu_schedule_map
>>>> (which is really increasing code fragility imo, by creating possible
>>>> extra confusion about which cpumask is the wright one to use in a
>>>> given situation).
>>>> 
>>>> My feeling, unless I've missed something, is that this would make
>>>> the patch quite a bit smaller and with a smaller spread of code
>>>> changes. 
>>>> 
>>>>  -- Keir
>>>> 
>>>> On 9/9/08 09:59, "Shan, Haitao" <haitao.shan@intel.com> wrote:
>>>> 
>>>>> This patch implements cpu offline feature.
>>>>> 
>>>>> Best Regards
>>>>> Haitao Shan
>>>> 
>>>> 
>>>> 
>>>> _______________________________________________
>>>> Xen-devel mailing list
>>>> Xen-devel@lists.xensource.com
>>>> http://lists.xensource.com/xen-devel
> 
> 

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 16:00             ` Shan, Haitao
@ 2008-09-11 16:52               ` Keir Fraser
  2008-09-11 23:30                 ` Shan, Haitao
  0 siblings, 1 reply; 30+ messages in thread
From: Keir Fraser @ 2008-09-11 16:52 UTC (permalink / raw)
  To: Shan, Haitao, Haitao Shan, Tian, Kevin; +Cc: xen-devel

On 11/9/08 17:00, "Shan, Haitao" <haitao.shan@intel.com> wrote:

> Hi, Keir,
> 
> Concerning the last running vcpu on the dying cpu, I have some thought.
> Yes, there would be a short time after the stop_machine_run when this vcpu
> v->processor == dying_cpu. But anyhow, we set fie __VPF_migrating flag for
> that vcpu and issued a schedule_softirq on the dying cpu.
> This softirq should run immediately after stop_machine context, am I right? If
> so, by the time the schedule softirq is executed, this last vcpu is migrated
> away from this dying cpu. But saving of its context will be delayed to
> play_dead->sync_lazy_context.
> If another cpu issues the schedule request to this dying cpu
> (vcpu_sleep_nosync->cpu_raise_softirq(vc->processor....)) during this time,
> the request will be serviced by the above code sequence. So it is safe in such
> cases.
> Am I missing something important? I am not quite confident on the statements,
> though.

I agree it looks safe.

By the way, have you considered using this hotplug functionality for power
management? If instead of for(;;) halt(); we instead hooked into Cx
management and tried to get into as deep sleep as possible (possibly even
supporting the really deep sleeps that power off a whole socket and mean you
*have* to come back via real mode) then this would give a nice
coarse-time-scale power management mechanism controllable from dom0.

I consider this might be a nice win for possibly less effort than is being
expended in trying to make idle residency times (and hence Cx residency
times) as long as possible.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 16:52               ` Keir Fraser
@ 2008-09-11 23:30                 ` Shan, Haitao
  0 siblings, 0 replies; 30+ messages in thread
From: Shan, Haitao @ 2008-09-11 23:30 UTC (permalink / raw)
  To: Keir Fraser, Haitao Shan, Tian, Kevin; +Cc: xen-devel

Agree and I ever discussed about this interesting thing with Kevin. So I think he can talk more on this topic. :)

Shan Haitao

-----Original Message-----
From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
Sent: 2008年9月12日 0:53
To: Shan, Haitao; Haitao Shan; Tian, Kevin
Cc: xen-devel@lists.xensource.com
Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline support in Xen

On 11/9/08 17:00, "Shan, Haitao" <haitao.shan@intel.com> wrote:

> Hi, Keir,
> 
> Concerning the last running vcpu on the dying cpu, I have some thought.
> Yes, there would be a short time after the stop_machine_run when this vcpu
> v->processor == dying_cpu. But anyhow, we set fie __VPF_migrating flag for
> that vcpu and issued a schedule_softirq on the dying cpu.
> This softirq should run immediately after stop_machine context, am I right? If
> so, by the time the schedule softirq is executed, this last vcpu is migrated
> away from this dying cpu. But saving of its context will be delayed to
> play_dead->sync_lazy_context.
> If another cpu issues the schedule request to this dying cpu
> (vcpu_sleep_nosync->cpu_raise_softirq(vc->processor....)) during this time,
> the request will be serviced by the above code sequence. So it is safe in such
> cases.
> Am I missing something important? I am not quite confident on the statements,
> though.

I agree it looks safe.

By the way, have you considered using this hotplug functionality for power
management? If instead of for(;;) halt(); we instead hooked into Cx
management and tried to get into as deep sleep as possible (possibly even
supporting the really deep sleeps that power off a whole socket and mean you
*have* to come back via real mode) then this would give a nice
coarse-time-scale power management mechanism controllable from dom0.

I consider this might be a nice win for possibly less effort than is being
expended in trying to make idle residency times (and hence Cx residency
times) as long as possible.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in Xen
@ 2008-09-12  2:22 Tian, Kevin
  2008-09-12  6:02 ` Keir Fraser
  0 siblings, 1 reply; 30+ messages in thread
From: Tian, Kevin @ 2008-09-12  2:22 UTC (permalink / raw)
  To: Shan, Haitao, Haitao Shan, Keir Fraser; +Cc: xen-devel, Wei, Gang

On Friday, September 12, 2008 12:53 AM, Keir Fraser wrote:
> On 11/9/08 17:00, "Shan, Haitao" <haitao.shan@intel.com> wrote:
> 
>> Hi, Keir,
>> 
>> Concerning the last running vcpu on the dying cpu, I have some
thought.
>> Yes, there would be a short time after the stop_machine_run when this
vcpu
>> v->processor == dying_cpu. But anyhow, we set fie __VPF_migrating
flag for
>> that vcpu and issued a schedule_softirq on the dying cpu.
>> This softirq should run immediately after stop_machine context, am I
right?
>> If so, by the time the schedule softirq is executed, this last vcpu
is
>> migrated away from this dying cpu. But saving of its context will be
delayed
>> to play_dead->sync_lazy_context. If another cpu issues the schedule
request
>> to this dying cpu
(vcpu_sleep_nosync->cpu_raise_softirq(vc->processor....))
>> during this time, the request will be serviced by the above code
sequence.
>> So it is safe in such cases. Am I missing something important? I am
not
>> quite confident on the statements, though.
> 
> I agree it looks safe.
> 
> By the way, have you considered using this hotplug functionality for
power
> management? If instead of for(;;) halt(); we instead hooked into Cx
> management and tried to get into as deep sleep as possible (possibly
even
> supporting the really deep sleeps that power off a whole socket and
mean you
> *have* to come back via real mode) then this would give a nice
> coarse-time-scale power management mechanism controllable from dom0.

Yes, that's one good suggestion and we can add deep sleep for offline
path.

> 
> I consider this might be a nice win for possibly less effort than is
being
> expended in trying to make idle residency times (and hence Cx
residency
> times) as long as possible.
> 

These two don't conflict. Cpu online/offline can't be used in small
interval due
to long latency and added overhead to whole system, but it makes sense 
when administrator realizes low cpu utilization in a relatively long
period like
in hrs. Current idle governor instead runs in fine-grained level to fit
the otherwise
cases.

Thanks,
Kevin

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-12  2:22 Tian, Kevin
@ 2008-09-12  6:02 ` Keir Fraser
  2008-09-12  6:04   ` Tian, Kevin
  0 siblings, 1 reply; 30+ messages in thread
From: Keir Fraser @ 2008-09-12  6:02 UTC (permalink / raw)
  To: Tian, Kevin, Shan, Haitao, Haitao Shan; +Cc: xen-devel, Wei, Gang

On 12/9/08 03:22, "Tian, Kevin" <kevin.tian@intel.com> wrote:

>> I consider this might be a nice win for possibly less effort than is
> being
>> expended in trying to make idle residency times (and hence Cx
> residency
>> times) as long as possible.
>> 
> 
> These two don't conflict. Cpu online/offline can't be used in small
> interval due
> to long latency and added overhead to whole system, but it makes sense
> when administrator realizes low cpu utilization in a relatively long
> period like
> in hrs. Current idle governor instead runs in fine-grained level to fit
> the otherwise
> cases.

I certainly agree with that. Just pointing out that, with the fine-graiend
approach, beyond a certain point you'll be investing effort for smaller and
smaller further gains.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-12  6:02 ` Keir Fraser
@ 2008-09-12  6:04   ` Tian, Kevin
  0 siblings, 0 replies; 30+ messages in thread
From: Tian, Kevin @ 2008-09-12  6:04 UTC (permalink / raw)
  To: Keir Fraser, Shan, Haitao, Haitao Shan; +Cc: xen-devel, Wei, Gang

>From: Keir Fraser
>Sent: 2008年9月12日 14:02
>To: Tian, Kevin; Shan, Haitao; Haitao Shan
>Cc: xen-devel@lists.xensource.com; Wei, Gang
>Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline 
>support in Xen
>
>On 12/9/08 03:22, "Tian, Kevin" <kevin.tian@intel.com> wrote:
>
>>> I consider this might be a nice win for possibly less effort than is
>> being
>>> expended in trying to make idle residency times (and hence Cx
>> residency
>>> times) as long as possible.
>>> 
>> 
>> These two don't conflict. Cpu online/offline can't be used in small
>> interval due
>> to long latency and added overhead to whole system, but it 
>makes sense
>> when administrator realizes low cpu utilization in a relatively long
>> period like
>> in hrs. Current idle governor instead runs in fine-grained 
>level to fit
>> the otherwise
>> cases.
>
>I certainly agree with that. Just pointing out that, with the 
>fine-graiend
>approach, beyond a certain point you'll be investing effort 
>for smaller and
>smaller further gains.
>

Sure I agree, and beyond that point a weak-designed governor
may even slow system with no actual gain.

Thanks,
Kevin

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-11 14:23             ` Christoph Egger
  2008-09-11 14:32               ` Keir Fraser
@ 2008-09-17  4:17               ` Gavin Maltby
  2008-09-17  7:05                 ` Jan Beulich
  1 sibling, 1 reply; 30+ messages in thread
From: Gavin Maltby @ 2008-09-17  4:17 UTC (permalink / raw)
  To: Christoph Egger
  Cc: Haitao Shan, Tian, Kevin, xen-devel, Shan, Haitao, Keir Fraser

Christoph Egger wrote:
> On Thursday 11 September 2008 16:15:14 Keir Fraser wrote:
>> I applied the patch with the following changes:
>>  * I rewrote your changes to fixup_irqs(). We should force lazy EOIs
>> *after* we have serviced any straggling interrupts. Also we should actually
>> clear the EOI stack so it is empty next time the CPU comes online.
>>  * I simplified your changes to schedule.c in light of the fact we run in
>> stop_machine context. Hence we can be quite relaxed about locking, for
>> example.
>>  * I removed your change to __csched_vcpu_is_migrateable() and instead put
>> a similar check in csched_load_balance(). I think this is clearer and also
>> cheaper.
>>
>> I note that the VCPU currently running on the offlined CPU continues to run
>> there even after __cpu_disable(), and until that CPU does a final run
>> through the scheduler soon after. I hope it does not matter there is one
>> vcpu with v->processor == offlined_cpu for a short while 
> 
> This is not acceptable regarding to machine check. When Dom0 offlines a
> defect cpu, nothing may continue on it or silent data corruption occurs.

I don't see this as a problem for machine check correctness.

If dom0 asks to offline a cpu (because it believes the cpu is busted and
a threat to uptime), that decision is fundamentally asynchronous
to the actual error handling that occured at machine check exception
time:

  - running in whatever context
  - MCE occurs
  - trap to hypervisor MCE handler
	. this decides on hypervisor panic, or other appropriate
	  immediate (in handler) response
	. telemetry forwarded to dom0 for logging and analysis
  - assume no hypervisor panic
  - eons pass during which any unconstrained bad data remaining
    after initial handling may go anywhere
  - dom0 gets telemetry and let's say diagnoses a fault and
    decides to call back into the hypervisor to offline the
    offending cpu

Note the "eons pass" bit;  tonnes of instructions may run on the
bad cpu in this time, and a few more for some offline delay won't
hurt.

Gavin

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in Xen
  2008-09-17  4:17               ` Gavin Maltby
@ 2008-09-17  7:05                 ` Jan Beulich
  2008-09-17  9:20                   ` Jiang, Yunhong
  0 siblings, 1 reply; 30+ messages in thread
From: Jan Beulich @ 2008-09-17  7:05 UTC (permalink / raw)
  To: Christoph Egger, Gavin Maltby
  Cc: Haitao Shan, Kevin Tian, xen-devel, Haitao Shan, Keir Fraser

>>> Gavin Maltby <Gavin.Maltby@Sun.COM> 17.09.08 06:17 >>>
>I don't see this as a problem for machine check correctness.
>
>If dom0 asks to offline a cpu (because it believes the cpu is busted and
>a threat to uptime), that decision is fundamentally asynchronous
>to the actual error handling that occured at machine check exception
>time:
>
>  - running in whatever context
>  - MCE occurs
>  - trap to hypervisor MCE handler
>	. this decides on hypervisor panic, or other appropriate
>	  immediate (in handler) response
>	. telemetry forwarded to dom0 for logging and analysis
>  - assume no hypervisor panic
>  - eons pass during which any unconstrained bad data remaining
>    after initial handling may go anywhere
>  - dom0 gets telemetry and let's say diagnoses a fault and
>    decides to call back into the hypervisor to offline the
>    offending cpu
>
>Note the "eons pass" bit;  tonnes of instructions may run on the
>bad cpu in this time, and a few more for some offline delay won't
>hurt.

Shouldn't this possibly be handled the other way around: If a recoverable
MCE happened, immediately stop scheduling anything on the affected
CPU(s), until Dom0 tells you otherwise (and of course as long as there
remains at least one CPU to run on).

Jan

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in  Xen
  2008-09-17  7:05                 ` Jan Beulich
@ 2008-09-17  9:20                   ` Jiang, Yunhong
  2008-09-17  9:43                     ` Christoph Egger
  0 siblings, 1 reply; 30+ messages in thread
From: Jiang, Yunhong @ 2008-09-17  9:20 UTC (permalink / raw)
  To: Jan Beulich, Christoph Egger, Gavin Maltby
  Cc: Tian, Kevin, xen-devel@lists.xensource.com, Shan, Haitao, Fraser,
	Keir, Haitao Shan

[-- Attachment #1: Type: text/plain, Size: 2234 bytes --]



>-----Original Message-----
>From: xen-devel-bounces@lists.xensource.com
>[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Jan Beulich
>Sent: 2008年9月17日 15:06
>To: Christoph Egger; Gavin Maltby
>Cc: Haitao Shan; Tian, Kevin; xen-devel@lists.xensource.com;
>Shan, Haitao; Keir Fraser
>Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline
>support in Xen
>
>>>> Gavin Maltby <Gavin.Maltby@Sun.COM> 17.09.08 06:17 >>>
>>I don't see this as a problem for machine check correctness.
>>
>>If dom0 asks to offline a cpu (because it believes the cpu is
>busted and
>>a threat to uptime), that decision is fundamentally asynchronous
>>to the actual error handling that occured at machine check exception
>>time:
>>
>>  - running in whatever context
>>  - MCE occurs
>>  - trap to hypervisor MCE handler
>>       . this decides on hypervisor panic, or other appropriate
>>         immediate (in handler) response
>>       . telemetry forwarded to dom0 for logging and analysis
>>  - assume no hypervisor panic
>>  - eons pass during which any unconstrained bad data remaining
>>    after initial handling may go anywhere
>>  - dom0 gets telemetry and let's say diagnoses a fault and
>>    decides to call back into the hypervisor to offline the
>>    offending cpu
>>
>>Note the "eons pass" bit;  tonnes of instructions may run on the
>>bad cpu in this time, and a few more for some offline delay won't
>>hurt.
>
>Shouldn't this possibly be handled the other way around: If a
>recoverable
>MCE happened, immediately stop scheduling anything on the affected
>CPU(s), until Dom0 tells you otherwise (and of course as long as there
>remains at least one CPU to run on).

Current MCE handling in Xen has no mechanism to achieve this, agree that some initial containment in Xen is needed to reduce the possibility of second MCE, ( will the program locality cause such situation?)
What we are thinking is, when MCA handler happen, all domain's vcpu except dom0's vcpu0 need be bring into xen's execution context.


>
>Jan
>
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@lists.xensource.com
>http://lists.xensource.com/xen-devel
>

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in  Xen
  2008-09-17  9:20                   ` Jiang, Yunhong
@ 2008-09-17  9:43                     ` Christoph Egger
  2008-09-17 13:14                       ` Ke, Liping
  2008-09-18  3:56                       ` Jiang, Yunhong
  0 siblings, 2 replies; 30+ messages in thread
From: Christoph Egger @ 2008-09-17  9:43 UTC (permalink / raw)
  To: Jiang, Yunhong
  Cc: Tian, Kevin, xen-devel@lists.xensource.com, Shan, Haitao,
	Gavin Maltby, Keir Fraser, Haitao Shan

On Wednesday 17 September 2008 11:20:57 Jiang, Yunhong wrote:
> >-----Original Message-----
> >From: xen-devel-bounces@lists.xensource.com
> >[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Jan Beulich
> >Sent: 2008年9月17日 15:06
> >To: Christoph Egger; Gavin Maltby
> >Cc: Haitao Shan; Tian, Kevin; xen-devel@lists.xensource.com;
> >Shan, Haitao; Keir Fraser
> >Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline
> >support in Xen
> >
> >>>> Gavin Maltby <Gavin.Maltby@Sun.COM> 17.09.08 06:17 >>>
> >>
> >>I don't see this as a problem for machine check correctness.
> >>
> >>If dom0 asks to offline a cpu (because it believes the cpu is
> >
> >busted and
> >
> >>a threat to uptime), that decision is fundamentally asynchronous
> >>to the actual error handling that occured at machine check exception
> >>time:
> >>
> >>  - running in whatever context
> >>  - MCE occurs
> >>  - trap to hypervisor MCE handler
> >>       . this decides on hypervisor panic, or other appropriate
> >>         immediate (in handler) response
> >>       . telemetry forwarded to dom0 for logging and analysis
> >>  - assume no hypervisor panic
> >>  - eons pass during which any unconstrained bad data remaining
> >>    after initial handling may go anywhere
> >>  - dom0 gets telemetry and let's say diagnoses a fault and
> >>    decides to call back into the hypervisor to offline the
> >>    offending cpu
> >>
> >>Note the "eons pass" bit;  tonnes of instructions may run on the
> >>bad cpu in this time, and a few more for some offline delay won't
> >>hurt.
> >
> >Shouldn't this possibly be handled the other way around: If a
> >recoverable
> >MCE happened, immediately stop scheduling anything on the affected
> >CPU(s), until Dom0 tells you otherwise (and of course as long as there
> >remains at least one CPU to run on).
>
> Current MCE handling in Xen has no mechanism to achieve this.

It has since c/s 17968.

Christoph



-- 
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in  Xen
  2008-09-17  9:43                     ` Christoph Egger
@ 2008-09-17 13:14                       ` Ke, Liping
  2008-09-18  3:56                       ` Jiang, Yunhong
  1 sibling, 0 replies; 30+ messages in thread
From: Ke, Liping @ 2008-09-17 13:14 UTC (permalink / raw)
  To: Christoph Egger, Jiang, Yunhong
  Cc: Tian, Kevin, xen-devel@lists.xensource.com, Shan, Haitao,
	Gavin Maltby, Keir Fraser, Haitao Shan

[-- Attachment #1: Type: text/plain, Size: 4049 bytes --]

Hi, Egger
Thanks a lot about your answer. Just look through your patch and want to get some help:

1. When MCE happens, will all of the cores (even in different socket) all be brought into MCE handler in AMD platform (such as K8) too? So every core will enter and execute k8_machine_check handler? When mce happened, this handler will enter N (number of cores) times.

2. When doing send_guest_trap, if dom0->vcpu0->processor = 0 while in nmi_mce_softirq, cur_cpu = 1, when set affinity, we should bind dom0->vcpu0 with cur_cpu 1 instead of its original bindings [cpu_set(cpu, affinity) vs cpu_set(st->processor, affinity)]? Otherwise the affinity has e no changes and need not restore? Not sure about this.

3. If several vcpus are running (belongs dom0 or other domains) when MCA happens, if we don't pause other vcpus except dom0.vcpu0 and let them into idle, maybe we can't make sure that those vcpus will still be scheduled in an unstable environment? At the same time, other pcpu might be in k8_machine_check handler concurrently?

Thanks a lot for your answer!
Regards,
Criping

-----Original Message-----
From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Christoph Egger
Sent: 2008年9月17日 17:44
To: Jiang, Yunhong
Cc: Tian, Kevin; xen-devel@lists.xensource.com; Shan, Haitao; Gavin Maltby; Keir Fraser; Haitao Shan
Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline support in Xen

On Wednesday 17 September 2008 11:20:57 Jiang, Yunhong wrote:
> >-----Original Message-----
> >From: xen-devel-bounces@lists.xensource.com
> >[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Jan Beulich
> >Sent: 2008年9月17日 15:06
> >To: Christoph Egger; Gavin Maltby
> >Cc: Haitao Shan; Tian, Kevin; xen-devel@lists.xensource.com;
> >Shan, Haitao; Keir Fraser
> >Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline
> >support in Xen
> >
> >>>> Gavin Maltby <Gavin.Maltby@Sun.COM> 17.09.08 06:17 >>>
> >>
> >>I don't see this as a problem for machine check correctness.
> >>
> >>If dom0 asks to offline a cpu (because it believes the cpu is
> >
> >busted and
> >
> >>a threat to uptime), that decision is fundamentally asynchronous
> >>to the actual error handling that occured at machine check exception
> >>time:
> >>
> >>  - running in whatever context
> >>  - MCE occurs
> >>  - trap to hypervisor MCE handler
> >>       . this decides on hypervisor panic, or other appropriate
> >>         immediate (in handler) response
> >>       . telemetry forwarded to dom0 for logging and analysis
> >>  - assume no hypervisor panic
> >>  - eons pass during which any unconstrained bad data remaining
> >>    after initial handling may go anywhere
> >>  - dom0 gets telemetry and let's say diagnoses a fault and
> >>    decides to call back into the hypervisor to offline the
> >>    offending cpu
> >>
> >>Note the "eons pass" bit;  tonnes of instructions may run on the
> >>bad cpu in this time, and a few more for some offline delay won't
> >>hurt.
> >
> >Shouldn't this possibly be handled the other way around: If a
> >recoverable
> >MCE happened, immediately stop scheduling anything on the affected
> >CPU(s), until Dom0 tells you otherwise (and of course as long as there
> >remains at least one CPU to run on).
>
> Current MCE handling in Xen has no mechanism to achieve this.

It has since c/s 17968.

Christoph

--
AMD Saxony, Dresden, Germany
Operating System Research Center

Legal Information:
AMD Saxony Limited Liability Company & Co. KG
Sitz (Geschäftsanschrift):
   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
Registergericht Dresden: HRA 4896
vertretungsberechtigter Komplementär:
   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
Geschäftsführer der AMD Saxony LLC:
   Dr. Hans-R. Deppe, Thomas McCoy

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in  Xen
  2008-09-17  9:43                     ` Christoph Egger
  2008-09-17 13:14                       ` Ke, Liping
@ 2008-09-18  3:56                       ` Jiang, Yunhong
  2008-09-18  7:20                         ` Keir Fraser
  1 sibling, 1 reply; 30+ messages in thread
From: Jiang, Yunhong @ 2008-09-18  3:56 UTC (permalink / raw)
  To: Christoph Egger
  Cc: Tian, Kevin, xen-devel@lists.xensource.com, Shan, Haitao,
	Gavin Maltby, Keir Fraser, Haitao Shan

>> >
>> >Shouldn't this possibly be handled the other way around: If a
>> >recoverable
>> >MCE happened, immediately stop scheduling anything on the affected
>> >CPU(s), until Dom0 tells you otherwise (and of course as
>long as there
>> >remains at least one CPU to run on).
>>
>> Current MCE handling in Xen has no mechanism to achieve this.
>
>It has since c/s 17968.

Hmm, I think current NMI_MCE_SOFTIRQ can't make sure other guest will not be scheduled. Considering there is a schedule softirq already pending on the pCPU, other guest may run before the impacted guest. Did I missed anything?

Thanks
Yunhong Jiang


>
>Christoph
>
>
>
>--
>AMD Saxony, Dresden, Germany
>Operating System Research Center
>
>Legal Information:
>AMD Saxony Limited Liability Company & Co. KG
>Sitz (Geschäftsanschrift):
>   Wilschdorfer Landstr. 101, 01109 Dresden, Deutschland
>Registergericht Dresden: HRA 4896
>vertretungsberechtigter Komplementär:
>   AMD Saxony LLC (Sitz Wilmington, Delaware, USA)
>Geschäftsführer der AMD Saxony LLC:
>   Dr. Hans-R. Deppe, Thomas McCoy
>
>

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in  Xen
  2008-09-18  3:56                       ` Jiang, Yunhong
@ 2008-09-18  7:20                         ` Keir Fraser
  2008-09-18  8:13                           ` Jiang, Yunhong
  0 siblings, 1 reply; 30+ messages in thread
From: Keir Fraser @ 2008-09-18  7:20 UTC (permalink / raw)
  To: Jiang, Yunhong, Christoph Egger
  Cc: Tian, Kevin, xen-devel@lists.xensource.com, Shan, Haitao,
	Gavin Maltby, Haitao Shan

On 18/9/08 04:56, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:

>> It has since c/s 17968.
> 
> Hmm, I think current NMI_MCE_SOFTIRQ can't make sure other guest will not be
> scheduled. Considering there is a schedule softirq already pending on the
> pCPU, other guest may run before the impacted guest. Did I missed anything?

There are races here in any case. What if #MC happens halfway through the
scheduler, just before set_current(new)?

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in  Xen
  2008-09-18  7:20                         ` Keir Fraser
@ 2008-09-18  8:13                           ` Jiang, Yunhong
  2008-09-18  9:11                             ` Keir Fraser
  0 siblings, 1 reply; 30+ messages in thread
From: Jiang, Yunhong @ 2008-09-18  8:13 UTC (permalink / raw)
  To: Keir Fraser, Christoph Egger
  Cc: Tian, Kevin, xen-devel@lists.xensource.com, Shan, Haitao,
	Gavin Maltby, Haitao Shan

[-- Attachment #1: Type: text/plain, Size: 932 bytes --]



>-----Original Message-----
>From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>Sent: 2008年9月18日 15:21
>To: Jiang, Yunhong; Christoph Egger
>Cc: Jan Beulich; Gavin Maltby; Haitao Shan; Tian, Kevin;
>xen-devel@lists.xensource.com; Shan, Haitao
>Subject: Re: [Xen-devel] Re: [PATCH 1/4] CPU online/offline
>support in Xen
>
>On 18/9/08 04:56, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:
>
>>> It has since c/s 17968.
>>
>> Hmm, I think current NMI_MCE_SOFTIRQ can't make sure other
>guest will not be
>> scheduled. Considering there is a schedule softirq already
>pending on the
>> pCPU, other guest may run before the impacted guest. Did I
>missed anything?
>
>There are races here in any case. What if #MC happens halfway
>through the
>scheduler, just before set_current(new)?

If MCE handler will not cause schedule and not change current, will any issue happen?


>
> -- Keir
>
>
>

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: Re: [PATCH 1/4] CPU online/offline support in  Xen
  2008-09-18  8:13                           ` Jiang, Yunhong
@ 2008-09-18  9:11                             ` Keir Fraser
  2008-09-18 15:17                               ` Jiang, Yunhong
  0 siblings, 1 reply; 30+ messages in thread
From: Keir Fraser @ 2008-09-18  9:11 UTC (permalink / raw)
  To: Jiang, Yunhong, Christoph Egger
  Cc: Tian, Kevin, xen-devel@lists.xensource.com, Shan, Haitao,
	Gavin Maltby, Haitao Shan

On 18/9/08 09:13, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:

>>> Hmm, I think current NMI_MCE_SOFTIRQ can't make sure other
>> guest will not be
>>> scheduled. Considering there is a schedule softirq already
>> pending on the
>>> pCPU, other guest may run before the impacted guest. Did I
>> missed anything?
>> 
>> There are races here in any case. What if #MC happens halfway
>> through the
>> scheduler, just before set_current(new)?
> 
> If MCE handler will not cause schedule and not change current, will any issue
> happen?

I'm not sure exactly what you mean. What *I* meant was that there are
certain points during execution where, if a #MC occurs, it may not be
possible to determine which single vCPU was running on the pCPU. I guess
though that if you ever get unrecoverable errors reported while running
inside the hypervisor, you probably can't recover anyway.

 -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

* RE: Re: [PATCH 1/4] CPU online/offline support in  Xen
  2008-09-18  9:11                             ` Keir Fraser
@ 2008-09-18 15:17                               ` Jiang, Yunhong
  0 siblings, 0 replies; 30+ messages in thread
From: Jiang, Yunhong @ 2008-09-18 15:17 UTC (permalink / raw)
  To: Keir Fraser, Christoph Egger
  Cc: Tian, Kevin, xen-devel@lists.xensource.com, Shan, Haitao,
	Gavin Maltby, Haitao Shan

Keir Fraser <mailto:keir.fraser@eu.citrix.com> wrote:
> On 18/9/08 09:13, "Jiang, Yunhong" <yunhong.jiang@intel.com> wrote:
>
>>>> Hmm, I think current NMI_MCE_SOFTIRQ can't make sure other guest will
>>>> not be scheduled. Considering there is a schedule softirq already
>>>> pending on the pCPU, other guest may run before the impacted guest. Did
>>>> I missed anything?
>>>
>>> There are races here in any case. What if #MC happens halfway through the
>>> scheduler, just before set_current(new)?
>>
>> If MCE handler will not cause schedule and not change current, will any
>> issue happen?
>
> I'm not sure exactly what you mean. What *I* meant was that there are
> certain points during execution where, if a #MC occurs, it may not be
> possible to determine which single vCPU was running on the

Current implementation on k8_machine_check, it determine xen_impacted through if current is idel domain. And it determine which domain is impacted through current. I have no idea of AMD's machine check mechanism, but when considering support on intel platform, it may be a bit different. For example, xen is impacted if MCE caused by sync event happens in Xen's context, even is not in idel domain. Also impacted domain may not always determined by current, memory ownership may help to decide impacted domain.

Another difference we are considering is, we suppose domU's MCA handler is not trusted, so firstly, we may always need dom0's MCE handler support, secondly, after domU MCE handler, some guard may be needed to make sure no error triggered again.

> pCPU. I guess
> though that if you ever get unrecoverable errors reported while running
> inside the hypervisor, you probably can't recover anyway.

I think this may depends on the error type. If the error is an async event, it may be ok to continue after some containment. For example, if EIPV=0, RIPV=1, and ADDRV =1 and happens to xen's execution context, it may because of some async event to the memory side, in that situtaion, we can kill the owner of the page (if that page is owned exclusively by one guest) and continue run. However, if it is a sync event like EIPV=1, then we have to reset the system.

>
> -- Keir

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2008-09-18 15:17 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-09-09  8:59 [PATCH 1/4] CPU online/offline support in Xen Shan, Haitao
2008-09-10 10:43 ` Keir Fraser
2008-09-10 10:59   ` Keir Fraser
2008-09-10 12:59   ` Haitao Shan
2008-09-10 16:05     ` Frank van der Linden
2008-09-11  7:36       ` Keir Fraser
2008-09-11  8:02     ` Shan, Haitao
2008-09-11 11:12       ` Keir Fraser
2008-09-11 11:33         ` Shan, Haitao
2008-09-11 12:42           ` Keir Fraser
2008-09-11 14:15           ` Keir Fraser
2008-09-11 14:23             ` Christoph Egger
2008-09-11 14:32               ` Keir Fraser
2008-09-11 14:47                 ` Keir Fraser
2008-09-17  4:17               ` Gavin Maltby
2008-09-17  7:05                 ` Jan Beulich
2008-09-17  9:20                   ` Jiang, Yunhong
2008-09-17  9:43                     ` Christoph Egger
2008-09-17 13:14                       ` Ke, Liping
2008-09-18  3:56                       ` Jiang, Yunhong
2008-09-18  7:20                         ` Keir Fraser
2008-09-18  8:13                           ` Jiang, Yunhong
2008-09-18  9:11                             ` Keir Fraser
2008-09-18 15:17                               ` Jiang, Yunhong
2008-09-11 16:00             ` Shan, Haitao
2008-09-11 16:52               ` Keir Fraser
2008-09-11 23:30                 ` Shan, Haitao
  -- strict thread matches above, loose matches on Subject: below --
2008-09-12  2:22 Tian, Kevin
2008-09-12  6:02 ` Keir Fraser
2008-09-12  6:04   ` Tian, Kevin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.