All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] CPU online/offline support in Xen
@ 2008-09-09  8:59 Shan, Haitao
  2008-09-10 10:43 ` Keir Fraser
  0 siblings, 1 reply; 30+ messages in thread
From: Shan, Haitao @ 2008-09-09  8:59 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 73 bytes --]

This patch implements the CPU offline feature.

Best Regards
Haitao Shan

[-- Attachment #2: cpu_offline.patch --]
[-- Type: application/octet-stream, Size: 9919 bytes --]

diff -r be573a356c90 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/arch/x86/irq.c	Sat Sep 06 15:50:12 2008 +0800
@@ -739,6 +739,7 @@
 {
     unsigned int irq;
     static int warned;
+    irq_guest_action_t *action;
 
     for ( irq = 0; irq < NR_IRQS; irq++ )
     {
@@ -756,6 +757,16 @@
             irq_desc[irq].handler->set_affinity(irq, mask);
         else if ( irq_desc[irq].action && !(warned++) )
             printk("Cannot set affinity for irq %i\n", irq);
+
+        if ( !(irq_desc[irq].status & IRQ_GUEST) )
+            continue;
+        action = (irq_guest_action_t *)irq_desc[irq].action;
+        if ( cpu_isset(smp_processor_id(), action->cpu_eoi_map) )
+        {
+            ack_APIC_irq();
+            cpu_clear(smp_processor_id(), action->cpu_eoi_map);
+            printk("Flushing pending eoi for irq %i\n", irq);
+        }
     }
 
     local_irq_enable();
diff -r be573a356c90 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/arch/x86/smpboot.c	Sat Sep 06 15:50:12 2008 +0800
@@ -39,6 +39,7 @@
 #include <xen/mm.h>
 #include <xen/domain.h>
 #include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/irq.h>
 #include <xen/delay.h>
 #include <xen/softirq.h>
@@ -531,6 +532,8 @@
 
 	cpu_set(smp_processor_id(), cpu_online_map);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+
+    cpu_schedule_map_set(smp_processor_id());
 
 	init_percpu_time();
 
@@ -1180,6 +1183,7 @@
 	cpu_set(smp_processor_id(), cpu_callout_map);
 	cpu_set(smp_processor_id(), cpu_present_map);
 	cpu_set(smp_processor_id(), cpu_possible_map);
+    cpu_schedule_map_set(smp_processor_id());
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 
@@ -1225,15 +1229,6 @@
 	if (cpu == 0)
 		return -EBUSY;
 
-	/*
-	 * Only S3 is using this path, and thus idle vcpus are running on all
-	 * APs when we are called. To support full cpu hotplug, other 
-	 * notification mechanisms should be introduced (e.g., migrate vcpus
-	 * off this physical cpu before rendezvous point).
-	 */
-	if (!is_idle_vcpu(current))
-		return -EINVAL;
-
 	local_irq_disable();
 	clear_local_APIC();
 	/* Allow any queued timer interrupts to get serviced */
@@ -1275,28 +1270,15 @@
     return __cpu_disable();
 }
 
-/* 
- * XXX: One important thing missed here is to migrate vcpus
- * from dead cpu to other online ones and then put whole
- * system into a stop state. It assures a safe environment
- * for a cpu hotplug/remove at normal running state.
- *
- * However for xen PM case, at this point:
- * 	-> All other domains should be notified with PM event,
- *	   and then in following states:
- *		* Suspend state, or
- *		* Paused state, which is a force step to all
- *		  domains if they do nothing to suspend
- *	-> All vcpus of dom0 (except vcpu0) have already beem
- *	   hot removed
- * with the net effect that all other cpus only have idle vcpu
- * running. In this special case, we can avoid vcpu migration
- * then and system can be considered in a stop state.
- *
- * So current cpu hotplug is a special version for PM specific
- * usage, and need more effort later for full cpu hotplug.
- * (ktian1)
- */
+static int fixing_scheduler_map(void *data)
+{
+    unsigned int cpu = *(unsigned int *)data; /* data points at a cpu id */
+    /* stop_machine_run() callback: remove that cpu from cpu_schedule_map. */
+    cpu_schedule_map_clear(cpu);
+
+    return 0; /* no failure path */
+}
+
 int cpu_down(unsigned int cpu)
 {
 	int err = 0;
@@ -1307,16 +1289,46 @@
 		goto out;
 	}
 
+    /* Can not offline BSP */
+    if ( cpu == 0 )
+    {
+        err = -EINVAL;
+        goto out;
+    }
+
 	if (!cpu_online(cpu)) {
 		err = -EINVAL;
 		goto out;
 	}
 
+    /* Modify the affinity of vcpus whose affinity only includes this dying
+     * cpu. Otherwise, after we modify the schedule_map, the scheduler
+     * would be confused.
+     */
+    prepare_migration_on_cpu(cpu);
+
+    /* Prevent the scheduler from migrating vcpus to this cpu.*/
+    err = stop_machine_run(fixing_scheduler_map, &cpu, smp_processor_id());
+    if ( err < 0 )
+    {
+        printk("stop machine rmove siblinginfo failed\n");
+        goto out;
+    }
+
+    /* Actually migrate all vcpus on this dying cpu away. */
+    migrate_all_vcpus_on_cpu(cpu);
+
 	printk("Prepare to bring CPU%d down...\n", cpu);
+
+    while ( !idle_vcpu[cpu]->is_running )
+        cpu_relax();
 
 	err = stop_machine_run(take_cpu_down, NULL, cpu);
 	if ( err < 0 )
+    {
+        cpu_schedule_map_set(cpu);
 		goto out;
+    }
 
 	__cpu_die(cpu);
 
diff -r be573a356c90 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/common/sched_credit.c	Sat Sep 06 15:50:12 2008 +0800
@@ -407,11 +407,14 @@
 static inline int
 __csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
 {
+    cpumask_t mask;
+
     /*
      * Don't pick up work that's in the peer's scheduling tail. Also only pick
      * up work that's allowed to run on our CPU.
      */
-    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+    cpus_and(mask, vc->cpu_affinity, cpu_schedule_map);
+    return !vc->is_running && cpu_isset(dest_cpu, mask);
 }
 
 static int
@@ -425,7 +428,7 @@
      * Pick from online CPUs in VCPU's affinity mask, giving a
      * preference to its current processor if it's in there.
      */
-    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    cpus_and(cpus, cpu_schedule_map, vc->cpu_affinity);
     cpu = cpu_isset(vc->processor, cpus)
             ? vc->processor
             : __cycle_cpu(vc->processor, &cpus);
@@ -1118,7 +1121,7 @@
      * Peek at non-idling CPUs in the system, starting with our
      * immediate neighbour.
      */
-    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    cpus_andnot(workers, cpu_schedule_map, csched_priv.idlers);
     cpu_clear(cpu, workers);
     peer_cpu = cpu;
 
diff -r be573a356c90 xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/common/sched_sedf.c	Sat Sep 06 15:50:12 2008 +0800
@@ -415,7 +415,7 @@
 {
     cpumask_t online_affinity;
 
-    cpus_and(online_affinity, v->cpu_affinity, cpu_online_map);
+    cpus_and(online_affinity, v->cpu_affinity, cpu_schedule_map);
     return first_cpu(online_affinity);
 }
 
diff -r be573a356c90 xen/common/schedule.c
--- a/xen/common/schedule.c	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/common/schedule.c	Sat Sep 06 15:50:12 2008 +0800
@@ -39,6 +39,8 @@
 string_param("sched", opt_sched);
 
 #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
+
+cpumask_t cpu_schedule_map;
 
 /* Various timer handlers. */
 static void s_timer_fn(void *unused);
@@ -266,6 +268,82 @@
         vcpu_sleep_nosync(v);
         vcpu_migrate(v);
     }
+}
+/* If v->processor is cpu_from, mark v _VPF_migrating and vcpu_migrate() it. */
+static void vcpu_force_migrate(struct vcpu *v, int cpu_from)
+{
+    unsigned long flags;
+
+    vcpu_schedule_lock_irqsave(v, flags);
+    /* v->processor is examined with v's schedule lock held. */
+    if ( v->processor != cpu_from )
+    {
+        vcpu_schedule_unlock_irqrestore(v, flags);
+        return;
+    }
+
+    set_bit(_VPF_migrating, &v->pause_flags);
+    vcpu_schedule_unlock_irqrestore(v, flags);
+    /* Re-test after unlocking; only then put v to sleep and migrate it. */
+    if ( test_bit(_VPF_migrating, &v->pause_flags) )
+    {
+        vcpu_sleep_nosync(v);
+        vcpu_migrate(v);
+    }
+}
+/* Widen the affinity of every non-idle vcpu that is pinned only to cpu. */
+void prepare_migration_on_cpu(int cpu)
+{
+    struct domain *d = NULL;
+    struct vcpu *v = NULL;
+    unsigned long flags;
+    /* NOTE(review): no domain-list lock is taken here - confirm safe. */
+    for_each_domain(d)
+        for_each_vcpu(d, v)
+        {
+            if ( is_idle_vcpu(v) )
+                continue;
+
+            /* A vcpu whose affinity names only the dying cpu gets a warning
+             * logged and its affinity widened to all cpus.
+             */
+            if ( cpus_weight(v->cpu_affinity) == 1
+                 && cpu_isset(cpu, v->cpu_affinity) )
+            {
+                printk("Breaking vcpu affinity for domain %d vcpu %d\n",
+                        v->domain->domain_id, v->vcpu_id);
+                vcpu_schedule_lock_irqsave(v, flags);
+                cpus_setall(v->cpu_affinity);
+                vcpu_schedule_unlock_irqrestore(v, flags);
+            }
+        }
+
+}
+
+/* Used by the cpu hotplug code: every vcpu except the idle vcpus is pushed
+ * off the dying cpu. The caller must already have stopped the scheduler
+ * from migrating vcpus onto that cpu.
+ */
+void migrate_all_vcpus_on_cpu(int cpu)
+{
+    struct domain *d = NULL;
+    struct vcpu *v = NULL;
+
+    for_each_domain(d)
+        for_each_vcpu(d, v)
+        {
+            if ( is_idle_vcpu(v) )
+                continue;
+
+            /* A single-shot timer might be left on this cpu; migrate it to
+             * the BSP (cpu 0). A new cpu will be chosen automatically when
+             * the timer is set again.
+             */
+            if ( v->singleshot_timer.cpu == cpu )
+                migrate_timer(&v->singleshot_timer, 0);
+
+            vcpu_force_migrate(v, cpu);
+        }
+}
 
 static int __vcpu_set_affinity(
diff -r be573a356c90 xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/include/xen/sched-if.h	Sat Sep 06 15:50:12 2008 +0800
@@ -79,4 +79,18 @@
     void         (*dump_cpu_state) (int);
 };
 
+extern cpumask_t cpu_schedule_map;
+/* Set cpu's bit in cpu_schedule_map, then issue smp_mb(). */
+static inline void cpu_schedule_map_set(int cpu)
+{
+    cpu_set(cpu, cpu_schedule_map);
+    smp_mb(); /* NOTE(review): presumably publishes the update - confirm */
+}
+/* Clear cpu's bit in cpu_schedule_map, then issue smp_mb(). */
+static inline void cpu_schedule_map_clear(int cpu)
+{
+    cpu_clear(cpu, cpu_schedule_map);
+    smp_mb(); /* NOTE(review): presumably publishes the update - confirm */
+}
+
 #endif /* __XEN_SCHED_IF_H__ */
diff -r be573a356c90 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h	Fri Sep 05 11:56:35 2008 +0100
+++ b/xen/include/xen/sched.h	Sat Sep 06 15:50:12 2008 +0800
@@ -524,6 +524,8 @@
 void cpu_init(void);
 
 void vcpu_force_reschedule(struct vcpu *v);
+void prepare_migration_on_cpu(int cpu);
+void migrate_all_vcpus_on_cpu(int cpu);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
 void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 30+ messages in thread
* RE: Re: [PATCH 1/4] CPU online/offline support in Xen
@ 2008-09-12  2:22 Tian, Kevin
  2008-09-12  6:02 ` Keir Fraser
  0 siblings, 1 reply; 30+ messages in thread
From: Tian, Kevin @ 2008-09-12  2:22 UTC (permalink / raw)
  To: Shan, Haitao, Haitao Shan, Keir Fraser; +Cc: xen-devel, Wei, Gang

On Friday, September 12, 2008 12:53 AM, Keir Fraser wrote:
> On 11/9/08 17:00, "Shan, Haitao" <haitao.shan@intel.com> wrote:
> 
>> Hi, Keir,
>> 
>> Concerning the last running vcpu on the dying cpu, I have some thoughts.
>> Yes, there would be a short time after the stop_machine_run when this
>> vcpu's v->processor == dying_cpu. But anyhow, we set the _VPF_migrating
>> flag for that vcpu and issued a schedule_softirq on the dying cpu.
>> This softirq should run immediately after the stop_machine context, am I
>> right? If so, by the time the schedule softirq is executed, this last vcpu
>> has been migrated away from this dying cpu. But saving of its context will
>> be delayed to play_dead->sync_lazy_context. If another cpu issues a
>> schedule request to this dying cpu
>> (vcpu_sleep_nosync->cpu_raise_softirq(vc->processor....)) during this
>> time, the request will be serviced by the above code sequence.
>> So it is safe in such cases. Am I missing something important? I am not
>> quite confident about these statements, though.
> 
> I agree it looks safe.
> 
> By the way, have you considered using this hotplug functionality for power
> management? If, instead of for(;;) halt();, we hooked into Cx management
> and tried to get into as deep a sleep as possible (possibly even
> supporting the really deep sleeps that power off a whole socket and mean
> you *have* to come back via real mode) then this would give a nice
> coarse-time-scale power management mechanism controllable from dom0.

Yes, that's a good suggestion, and we can add deep sleep to the offline
path.

> 
> I consider this might be a nice win for possibly less effort than is being
> expended in trying to make idle residency times (and hence Cx residency
> times) as long as possible.
> 

These two don't conflict. CPU online/offline can't be used at small
intervals due to its long latency and the added overhead to the whole
system, but it makes sense when an administrator observes low CPU
utilization over a relatively long period, such as hours. The current idle
governor instead runs at a fine-grained level to cover the other cases.

Thanks,
Kevin

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2008-09-18 15:17 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-09-09  8:59 [PATCH 1/4] CPU online/offline support in Xen Shan, Haitao
2008-09-10 10:43 ` Keir Fraser
2008-09-10 10:59   ` Keir Fraser
2008-09-10 12:59   ` Haitao Shan
2008-09-10 16:05     ` Frank van der Linden
2008-09-11  7:36       ` Keir Fraser
2008-09-11  8:02     ` Shan, Haitao
2008-09-11 11:12       ` Keir Fraser
2008-09-11 11:33         ` Shan, Haitao
2008-09-11 12:42           ` Keir Fraser
2008-09-11 14:15           ` Keir Fraser
2008-09-11 14:23             ` Christoph Egger
2008-09-11 14:32               ` Keir Fraser
2008-09-11 14:47                 ` Keir Fraser
2008-09-17  4:17               ` Gavin Maltby
2008-09-17  7:05                 ` Jan Beulich
2008-09-17  9:20                   ` Jiang, Yunhong
2008-09-17  9:43                     ` Christoph Egger
2008-09-17 13:14                       ` Ke, Liping
2008-09-18  3:56                       ` Jiang, Yunhong
2008-09-18  7:20                         ` Keir Fraser
2008-09-18  8:13                           ` Jiang, Yunhong
2008-09-18  9:11                             ` Keir Fraser
2008-09-18 15:17                               ` Jiang, Yunhong
2008-09-11 16:00             ` Shan, Haitao
2008-09-11 16:52               ` Keir Fraser
2008-09-11 23:30                 ` Shan, Haitao
  -- strict thread matches above, loose matches on Subject: below --
2008-09-12  2:22 Tian, Kevin
2008-09-12  6:02 ` Keir Fraser
2008-09-12  6:04   ` Tian, Kevin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.