xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] x86: introduce and use prefill_possible_map()
@ 2010-05-04 16:00 Jan Beulich
  2010-05-14 16:32 ` Keir Fraser
  0 siblings, 1 reply; 2+ messages in thread
From: Jan Beulich @ 2010-05-04 16:00 UTC (permalink / raw)
  To: xen-devel; +Cc: xen-ia64-devel

[-- Attachment #1: Type: text/plain, Size: 11377 bytes --]

With the larger default NR_CPUS config setting (and the more with
build time settings exceeding this default) the wasting of memory (and
potentially other resources) just because cpu_possible_map doesn't get
set up properly increases. Use Linux' (accordingly modified to fit
Xen) prefill_possible_map() to overcome this.

This makes necessary an adjustment to tasklet initialization (which
must not happen before cpu_possible_map is guaranteed to be fully set
up - according to my static code analysis this was a problem on ia64
anyway).

Tracing code also needed a minor adjustment, as it still had a simple
counted loop accessing per-CPU data in its body.

Signed-off-by: Jan Beulich <jbeulich@novell.com>

--- 2010-05-04.orig/xen/arch/ia64/linux-xen/smpboot.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/ia64/linux-xen/smpboot.c	2010-05-04 13:22:06.000000000 +0200
@@ -776,7 +776,7 @@ void __cpu_die(unsigned int cpu)
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void
-smp_cpus_done (unsigned int dummy)
+smp_cpus_done(void)
 {
 	int cpu;
 	unsigned long bogosum = 0;
--- 2010-05-04.orig/xen/arch/ia64/xen/xensetup.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/ia64/xen/xensetup.c	2010-05-04 16:42:36.000000000 +0200
@@ -562,10 +562,12 @@ skip_move:
     end_boot_allocator();
 
     softirq_init();
-    tasklet_subsys_init();
+    tasklet_early_init();
 
     late_setup_arch(&cmdline);
 
+    tasklet_subsys_init();
+
     scheduler_init();
     idle_vcpu[0] = (struct vcpu*) ia64_r13;
     idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
@@ -626,7 +628,7 @@ printk("num_online_cpus=%d, max_cpus=%d\
     local_irq_disable();
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
-    smp_cpus_done(max_cpus);
+    smp_cpus_done();
 #endif
 
     initialise_gdb(); /* could be moved earlier */
--- 2010-05-04.orig/xen/arch/x86/mpparse.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/mpparse.c	2010-05-04 13:22:06.000000000 +0200
@@ -35,7 +35,6 @@
 
 /* Have we found an MP table */
 int smp_found_config;
-unsigned int __devinitdata maxcpus = NR_CPUS;
 
 /*
  * Various Linux-internal data structures created from the
@@ -66,7 +65,7 @@ unsigned int def_to_bigsmp = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 /* Internal processor count */
-static unsigned int __devinitdata num_processors;
+unsigned int __devinitdata num_processors;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
@@ -105,8 +104,10 @@ static int __devinit MP_processor_info (
  	int ver, apicid, cpu = 0;
 	physid_mask_t phys_cpu;
  	
-	if (!(m->mpc_cpuflag & CPU_ENABLED))
+	if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+		++disabled_cpus;
 		return -EINVAL;
+	}
 
 	apicid = mpc_apic_id(m, translation_table[mpc_record]);
 
@@ -185,9 +186,9 @@ static int __devinit MP_processor_info (
 		return -ENOSPC;
 	}
 
-	if (num_processors >= maxcpus) {
-		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-			" Processor ignored.\n", maxcpus);
+	if (max_cpus && num_processors >= max_cpus) {
+		printk(KERN_WARNING "WARNING: maxcpus limit of %u reached."
+		       " Processor ignored.\n", max_cpus);
 		return -ENOSPC;
 	}
 
--- 2010-05-04.orig/xen/arch/x86/setup.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/setup.c	2010-05-04 16:43:22.000000000 +0200
@@ -61,7 +61,7 @@ static int __initdata opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
 
 /* maxcpus: maximum number of CPUs to activate. */
-static unsigned int __initdata max_cpus = NR_CPUS;
+unsigned int __devinitdata max_cpus;
 integer_param("maxcpus", max_cpus);
 
 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
@@ -568,6 +568,11 @@ void __init __start_xen(unsigned long mb
     if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
         EARLY_FAIL("Misaligned CPU0 stack.\n");
 
+    if ( opt_nosmp )
+        max_cpus = prefill_possible_map(1);
+    else if ( max_cpus )
+        max_cpus = prefill_possible_map(max_cpus);
+
     if ( e820_raw_nr != 0 )
     {
         memmap_type = "Xen-e820";
@@ -978,7 +983,7 @@ void __init __start_xen(unsigned long mb
 #endif
 
     softirq_init();
-    tasklet_subsys_init();
+    tasklet_early_init();
 
     early_cpu_init();
 
@@ -1017,6 +1022,11 @@ void __init __start_xen(unsigned long mb
     zap_low_mappings();
 #endif
 
+    if ( !max_cpus )
+        max_cpus = prefill_possible_map(0);
+
+    tasklet_subsys_init();
+
     init_apic_mappings();
 
     percpu_free_unused_areas();
@@ -1049,12 +1059,9 @@ void __init __start_xen(unsigned long mb
     vesa_mtrr_init();
 #endif
 
-    if ( opt_nosmp )
-        max_cpus = 0;
-
     iommu_setup();    /* setup iommu if available */
 
-    smp_prepare_cpus(max_cpus);
+    smp_prepare_cpus(!opt_nosmp * max_cpus);
 
     spin_debug_enable();
 
@@ -1087,7 +1094,7 @@ void __init __start_xen(unsigned long mb
     }
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
-    smp_cpus_done(max_cpus);
+    smp_cpus_done();
 
     initialise_gdb(); /* could be moved earlier */
 
--- 2010-05-04.orig/xen/arch/x86/smpboot.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/smpboot.c	2010-05-04 13:22:06.000000000 +0200
@@ -83,10 +83,12 @@ EXPORT_SYMBOL(cpu_online_map);
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
+cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
+unsigned int __devinitdata disabled_cpus;
+
 /* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
  * is no way to resync one AP against BP. TBD: for prescott and above, we
  * should use IA64's algorithm
@@ -829,7 +831,11 @@ int alloc_cpu_id(void)
 {
 	cpumask_t	tmp_map;
 	int cpu;
-	cpus_complement(tmp_map, cpu_present_map);
+
+	if (max_cpus)
+		cpus_andnot(tmp_map, cpu_possible_map, cpu_present_map);
+	else
+		cpus_complement(tmp_map, cpu_present_map);
 	cpu = first_cpu(tmp_map);
 	if (cpu >= NR_CPUS)
 		return -ENODEV;
@@ -1243,6 +1249,52 @@ void __devinit smp_prepare_boot_cpu(void
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 
+/*
+ * cpu_possible_mask should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_mask on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - The user can overwrite it with max_cpus=NUM
+ * - Otherwise don't reserve additional CPUs.
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
+ */
+unsigned int __init prefill_possible_map(unsigned int max_cpus)
+{
+	unsigned int i, possible;
+
+	/* no processor from mptable or madt */
+	if (!num_processors)
+		num_processors = 1;
+
+	if (!max_cpus)
+		possible = num_processors + disabled_cpus;
+	else
+		possible = max_cpus;
+
+	if (possible > NR_CPUS) {
+		printk(KERN_WARNING
+		       "%u processors exceeds NR_CPUS limit of %d\n",
+		       possible, NR_CPUS);
+		possible = NR_CPUS;
+	}
+
+	printk(KERN_INFO "SMP: Allowing %u CPUs, %d hotplug CPUs\n",
+	       possible, max_t(int, possible - num_processors, 0));
+
+	for (i = 0; i < possible; i++)
+		cpu_set(i, cpu_possible_map);
+
+	return possible;
+}
+
 static void
 remove_siblinginfo(int cpu)
 {
@@ -1568,7 +1620,7 @@ int __devinit __cpu_up(unsigned int cpu)
 }
 
 
-void __init smp_cpus_done(unsigned int max_cpus)
+void __init smp_cpus_done(void)
 {
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
--- 2010-05-04.orig/xen/common/tasklet.c	2010-04-22 14:43:25.000000000 +0200
+++ 2010-05-04/xen/common/tasklet.c	2010-05-04 16:46:03.000000000 +0200
@@ -18,7 +18,7 @@
 #include <xen/tasklet.h>
 
 /* Some subsystems call into us before we are initialised. We ignore them. */
-static bool_t tasklets_initialised;
+static unsigned int __read_mostly tasklets_initialised = UINT_MAX;
 
 /*
  * NB. Any modification to a tasklet_list requires the scheduler to run
@@ -35,7 +35,8 @@ void tasklet_schedule_on_cpu(struct task
 
     spin_lock_irqsave(&tasklet_lock, flags);
 
-    if ( tasklets_initialised && !t->is_dead )
+    if ( (tasklets_initialised == NR_CPUS || tasklets_initialised == cpu) &&
+         !t->is_dead )
     {
         t->scheduled_on = cpu;
         if ( !t->is_running )
@@ -161,14 +162,24 @@ void tasklet_init(
     t->data = data;
 }
 
+void __init tasklet_early_init(void)
+{
+    unsigned int cpu = smp_processor_id();
+
+    INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
+
+    tasklets_initialised = cpu;
+}
+
 void __init tasklet_subsys_init(void)
 {
     unsigned int cpu;
 
     for_each_possible_cpu ( cpu )
-        INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
+        if ( cpu != tasklets_initialised )
+            INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
 
-    tasklets_initialised = 1;
+    tasklets_initialised = NR_CPUS;
 }
 
 /*
--- 2010-05-04.orig/xen/common/trace.c	2010-04-22 14:43:25.000000000 +0200
+++ 2010-05-04/xen/common/trace.c	2010-05-04 16:26:50.000000000 +0200
@@ -289,7 +289,7 @@ void __init init_trace_bufs(void)
         return;
     }
 
-    for(i = 0; i < NR_CPUS; i++)
+    for_each_possible_cpu(i)
         spin_lock_init(&per_cpu(t_lock, i));
 
     for(i=0; i<T_INFO_PAGES; i++)
--- 2010-05-04.orig/xen/include/asm-x86/smp.h	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/include/asm-x86/smp.h	2010-05-04 13:22:06.000000000 +0200
@@ -34,6 +34,10 @@ extern void smp_alloc_memory(void);
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
 
+extern unsigned int max_cpus, num_processors, disabled_cpus;
+
+unsigned int prefill_possible_map(unsigned int max_cpus);
+
 void smp_send_nmi_allbutself(void);
 
 void  send_IPI_mask(const cpumask_t *mask, int vector);
--- 2010-05-04.orig/xen/include/xen/smp.h	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/include/xen/smp.h	2010-05-04 13:22:06.000000000 +0200
@@ -26,7 +26,7 @@ extern int __cpu_up(unsigned int cpunum)
 /*
  * Final polishing of CPUs
  */
-extern void smp_cpus_done(unsigned int max_cpus);
+extern void smp_cpus_done(void);
 
 /*
  * Call a function on all other processors
--- 2010-05-04.orig/xen/include/xen/tasklet.h	2010-04-22 14:43:25.000000000 +0200
+++ 2010-05-04/xen/include/xen/tasklet.h	2010-05-04 16:42:13.000000000 +0200
@@ -32,6 +32,7 @@ void tasklet_kill(struct tasklet *t);
 void migrate_tasklets_from_cpu(unsigned int cpu);
 void tasklet_init(
     struct tasklet *t, void (*func)(unsigned long), unsigned long data);
+void tasklet_early_init(void);
 void tasklet_subsys_init(void);
 
 #endif /* __XEN_TASKLET_H__ */



[-- Attachment #2: x86-count-disabled-cpus.patch --]
[-- Type: text/plain, Size: 11373 bytes --]

With the larger default NR_CPUS config setting (and the more with
build time settings exceeding this default) the wasting of memory (and
potentially other resources) just because cpu_possible_map doesn't get
set up properly increases. Use Linux' (accordingly modified to fit
Xen) prefill_possible_map() to overcome this.

This makes necessary an adjustment to tasklet initialization (which
must not happen before cpu_possible_map is guaranteed to be fully set
up - according to my static code analysis this was a problem on ia64
anyway).

Tracing code also needed a minor adjustment, as it still had a simple
counted loop accessing per-CPU data in its body.

Signed-off-by: Jan Beulich <jbeulich@novell.com>

--- 2010-05-04.orig/xen/arch/ia64/linux-xen/smpboot.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/ia64/linux-xen/smpboot.c	2010-05-04 13:22:06.000000000 +0200
@@ -776,7 +776,7 @@ void __cpu_die(unsigned int cpu)
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void
-smp_cpus_done (unsigned int dummy)
+smp_cpus_done(void)
 {
 	int cpu;
 	unsigned long bogosum = 0;
--- 2010-05-04.orig/xen/arch/ia64/xen/xensetup.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/ia64/xen/xensetup.c	2010-05-04 16:42:36.000000000 +0200
@@ -562,10 +562,12 @@ skip_move:
     end_boot_allocator();
 
     softirq_init();
-    tasklet_subsys_init();
+    tasklet_early_init();
 
     late_setup_arch(&cmdline);
 
+    tasklet_subsys_init();
+
     scheduler_init();
     idle_vcpu[0] = (struct vcpu*) ia64_r13;
     idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
@@ -626,7 +628,7 @@ printk("num_online_cpus=%d, max_cpus=%d\
     local_irq_disable();
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
-    smp_cpus_done(max_cpus);
+    smp_cpus_done();
 #endif
 
     initialise_gdb(); /* could be moved earlier */
--- 2010-05-04.orig/xen/arch/x86/mpparse.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/mpparse.c	2010-05-04 13:22:06.000000000 +0200
@@ -35,7 +35,6 @@
 
 /* Have we found an MP table */
 int smp_found_config;
-unsigned int __devinitdata maxcpus = NR_CPUS;
 
 /*
  * Various Linux-internal data structures created from the
@@ -66,7 +65,7 @@ unsigned int def_to_bigsmp = 0;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 /* Internal processor count */
-static unsigned int __devinitdata num_processors;
+unsigned int __devinitdata num_processors;
 
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
@@ -105,8 +104,10 @@ static int __devinit MP_processor_info (
  	int ver, apicid, cpu = 0;
 	physid_mask_t phys_cpu;
  	
-	if (!(m->mpc_cpuflag & CPU_ENABLED))
+	if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+		++disabled_cpus;
 		return -EINVAL;
+	}
 
 	apicid = mpc_apic_id(m, translation_table[mpc_record]);
 
@@ -185,9 +186,9 @@ static int __devinit MP_processor_info (
 		return -ENOSPC;
 	}
 
-	if (num_processors >= maxcpus) {
-		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-			" Processor ignored.\n", maxcpus);
+	if (max_cpus && num_processors >= max_cpus) {
+		printk(KERN_WARNING "WARNING: maxcpus limit of %u reached."
+		       " Processor ignored.\n", max_cpus);
 		return -ENOSPC;
 	}
 
--- 2010-05-04.orig/xen/arch/x86/setup.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/setup.c	2010-05-04 16:43:22.000000000 +0200
@@ -61,7 +61,7 @@ static int __initdata opt_nosmp = 0;
 boolean_param("nosmp", opt_nosmp);
 
 /* maxcpus: maximum number of CPUs to activate. */
-static unsigned int __initdata max_cpus = NR_CPUS;
+unsigned int __devinitdata max_cpus;
 integer_param("maxcpus", max_cpus);
 
 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
@@ -568,6 +568,11 @@ void __init __start_xen(unsigned long mb
     if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
         EARLY_FAIL("Misaligned CPU0 stack.\n");
 
+    if ( opt_nosmp )
+        max_cpus = prefill_possible_map(1);
+    else if ( max_cpus )
+        max_cpus = prefill_possible_map(max_cpus);
+
     if ( e820_raw_nr != 0 )
     {
         memmap_type = "Xen-e820";
@@ -978,7 +983,7 @@ void __init __start_xen(unsigned long mb
 #endif
 
     softirq_init();
-    tasklet_subsys_init();
+    tasklet_early_init();
 
     early_cpu_init();
 
@@ -1017,6 +1022,11 @@ void __init __start_xen(unsigned long mb
     zap_low_mappings();
 #endif
 
+    if ( !max_cpus )
+        max_cpus = prefill_possible_map(0);
+
+    tasklet_subsys_init();
+
     init_apic_mappings();
 
     percpu_free_unused_areas();
@@ -1049,12 +1059,9 @@ void __init __start_xen(unsigned long mb
     vesa_mtrr_init();
 #endif
 
-    if ( opt_nosmp )
-        max_cpus = 0;
-
     iommu_setup();    /* setup iommu if available */
 
-    smp_prepare_cpus(max_cpus);
+    smp_prepare_cpus(!opt_nosmp * max_cpus);
 
     spin_debug_enable();
 
@@ -1087,7 +1094,7 @@ void __init __start_xen(unsigned long mb
     }
 
     printk("Brought up %ld CPUs\n", (long)num_online_cpus());
-    smp_cpus_done(max_cpus);
+    smp_cpus_done();
 
     initialise_gdb(); /* could be moved earlier */
 
--- 2010-05-04.orig/xen/arch/x86/smpboot.c	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/arch/x86/smpboot.c	2010-05-04 13:22:06.000000000 +0200
@@ -83,10 +83,12 @@ EXPORT_SYMBOL(cpu_online_map);
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
+cpumask_t cpu_possible_map;
 EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
+unsigned int __devinitdata disabled_cpus;
+
 /* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
  * is no way to resync one AP against BP. TBD: for prescott and above, we
  * should use IA64's algorithm
@@ -829,7 +831,11 @@ int alloc_cpu_id(void)
 {
 	cpumask_t	tmp_map;
 	int cpu;
-	cpus_complement(tmp_map, cpu_present_map);
+
+	if (max_cpus)
+		cpus_andnot(tmp_map, cpu_possible_map, cpu_present_map);
+	else
+		cpus_complement(tmp_map, cpu_present_map);
 	cpu = first_cpu(tmp_map);
 	if (cpu >= NR_CPUS)
 		return -ENODEV;
@@ -1243,6 +1249,52 @@ void __devinit smp_prepare_boot_cpu(void
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 
+/*
+ * cpu_possible_mask should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_mask on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - The user can overwrite it with max_cpus=NUM
+ * - Otherwise don't reserve additional CPUs.
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
+ */
+unsigned int __init prefill_possible_map(unsigned int max_cpus)
+{
+	unsigned int i, possible;
+
+	/* no processor from mptable or madt */
+	if (!num_processors)
+		num_processors = 1;
+
+	if (!max_cpus)
+		possible = num_processors + disabled_cpus;
+	else
+		possible = max_cpus;
+
+	if (possible > NR_CPUS) {
+		printk(KERN_WARNING
+		       "%u processors exceeds NR_CPUS limit of %d\n",
+		       possible, NR_CPUS);
+		possible = NR_CPUS;
+	}
+
+	printk(KERN_INFO "SMP: Allowing %u CPUs, %d hotplug CPUs\n",
+	       possible, max_t(int, possible - num_processors, 0));
+
+	for (i = 0; i < possible; i++)
+		cpu_set(i, cpu_possible_map);
+
+	return possible;
+}
+
 static void
 remove_siblinginfo(int cpu)
 {
@@ -1568,7 +1620,7 @@ int __devinit __cpu_up(unsigned int cpu)
 }
 
 
-void __init smp_cpus_done(unsigned int max_cpus)
+void __init smp_cpus_done(void)
 {
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
--- 2010-05-04.orig/xen/common/tasklet.c	2010-04-22 14:43:25.000000000 +0200
+++ 2010-05-04/xen/common/tasklet.c	2010-05-04 16:46:03.000000000 +0200
@@ -18,7 +18,7 @@
 #include <xen/tasklet.h>
 
 /* Some subsystems call into us before we are initialised. We ignore them. */
-static bool_t tasklets_initialised;
+static unsigned int __read_mostly tasklets_initialised = UINT_MAX;
 
 /*
  * NB. Any modification to a tasklet_list requires the scheduler to run
@@ -35,7 +35,8 @@ void tasklet_schedule_on_cpu(struct task
 
     spin_lock_irqsave(&tasklet_lock, flags);
 
-    if ( tasklets_initialised && !t->is_dead )
+    if ( (tasklets_initialised == NR_CPUS || tasklets_initialised == cpu) &&
+         !t->is_dead )
     {
         t->scheduled_on = cpu;
         if ( !t->is_running )
@@ -161,14 +162,24 @@ void tasklet_init(
     t->data = data;
 }
 
+void __init tasklet_early_init(void)
+{
+    unsigned int cpu = smp_processor_id();
+
+    INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
+
+    tasklets_initialised = cpu;
+}
+
 void __init tasklet_subsys_init(void)
 {
     unsigned int cpu;
 
     for_each_possible_cpu ( cpu )
-        INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
+        if ( cpu != tasklets_initialised )
+            INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
 
-    tasklets_initialised = 1;
+    tasklets_initialised = NR_CPUS;
 }
 
 /*
--- 2010-05-04.orig/xen/common/trace.c	2010-04-22 14:43:25.000000000 +0200
+++ 2010-05-04/xen/common/trace.c	2010-05-04 16:26:50.000000000 +0200
@@ -289,7 +289,7 @@ void __init init_trace_bufs(void)
         return;
     }
 
-    for(i = 0; i < NR_CPUS; i++)
+    for_each_possible_cpu(i)
         spin_lock_init(&per_cpu(t_lock, i));
 
     for(i=0; i<T_INFO_PAGES; i++)
--- 2010-05-04.orig/xen/include/asm-x86/smp.h	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/include/asm-x86/smp.h	2010-05-04 13:22:06.000000000 +0200
@@ -34,6 +34,10 @@ extern void smp_alloc_memory(void);
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
 
+extern unsigned int max_cpus, num_processors, disabled_cpus;
+
+unsigned int prefill_possible_map(unsigned int max_cpus);
+
 void smp_send_nmi_allbutself(void);
 
 void  send_IPI_mask(const cpumask_t *mask, int vector);
--- 2010-05-04.orig/xen/include/xen/smp.h	2010-05-04 16:04:09.000000000 +0200
+++ 2010-05-04/xen/include/xen/smp.h	2010-05-04 13:22:06.000000000 +0200
@@ -26,7 +26,7 @@ extern int __cpu_up(unsigned int cpunum)
 /*
  * Final polishing of CPUs
  */
-extern void smp_cpus_done(unsigned int max_cpus);
+extern void smp_cpus_done(void);
 
 /*
  * Call a function on all other processors
--- 2010-05-04.orig/xen/include/xen/tasklet.h	2010-04-22 14:43:25.000000000 +0200
+++ 2010-05-04/xen/include/xen/tasklet.h	2010-05-04 16:42:13.000000000 +0200
@@ -32,6 +32,7 @@ void tasklet_kill(struct tasklet *t);
 void migrate_tasklets_from_cpu(unsigned int cpu);
 void tasklet_init(
     struct tasklet *t, void (*func)(unsigned long), unsigned long data);
+void tasklet_early_init(void);
 void tasklet_subsys_init(void);
 
 #endif /* __XEN_TASKLET_H__ */

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] x86: introduce and use prefill_possible_map()
  2010-05-04 16:00 [PATCH] x86: introduce and use prefill_possible_map() Jan Beulich
@ 2010-05-14 16:32 ` Keir Fraser
  0 siblings, 0 replies; 2+ messages in thread
From: Keir Fraser @ 2010-05-14 16:32 UTC (permalink / raw)
  To: Jan Beulich, xen-devel@lists.xensource.com
  Cc: xen-ia64-devel@lists.xensource.com

Jan,

If you look in the current xen-unstable staging tree, you'll see I've added
a cpu notifier chain to Xen. We can use this to dynamically
allocate/initialise/de-allocate subsystem state in response to CPU hotplug
events, which is better than static initialisation off cpu_possible_map.

My aim is to get rid of cpu_possible_map from the x86 build altogether (so
it is not used in common or x86 code) and to restrict cpu_present_map only
to the CPU hotplug subsystem.

Next thing in my sights is to improve the percpu memory area management, but
any other uses of cpu_possible_map, cpu_present_map, or bad for-loops
0...NR_CPUS, please do provide patches to switch to cpu notifiers if you
have time!

 -- Keir

On 04/05/2010 17:00, "Jan Beulich" <JBeulich@novell.com> wrote:

> With the larger default NR_CPUS config setting (and the more with
> build time settings exceeding this default) the wasting of memory (and
> potentially other resources) just because cpu_possible_map doesn't get
> set up properly increases. Use Linux' (accordingly modified to fit
> Xen) prefill_possible_map() to overcome this.
> 
> This makes necessary an adjustment to tasklet initialization (which
> must not happen before cpu_possible_map is guaranteed to be fully set
> up - according to my static code analysis this was a problem on ia64
> anyway).
> 
> Tracing code also needed a minor adjustment, as it still had a simple
> counted loop accessing per-CPU data in its body.
> 
> Signed-off-by: Jan Beulich <jbeulich@novell.com>
> 
> --- 2010-05-04.orig/xen/arch/ia64/linux-xen/smpboot.c   2010-05-04
> 16:04:09.000000000 +0200
> +++ 2010-05-04/xen/arch/ia64/linux-xen/smpboot.c        2010-05-04
> 13:22:06.000000000 +0200
> @@ -776,7 +776,7 @@ void __cpu_die(unsigned int cpu)
>  #endif /* CONFIG_HOTPLUG_CPU */
> 
>  void
> -smp_cpus_done (unsigned int dummy)
> +smp_cpus_done(void)
>  {
>         int cpu;
>         unsigned long bogosum = 0;
> --- 2010-05-04.orig/xen/arch/ia64/xen/xensetup.c        2010-05-04
> 16:04:09.000000000 +0200
> +++ 2010-05-04/xen/arch/ia64/xen/xensetup.c     2010-05-04 16:42:36.000000000
> +0200
> @@ -562,10 +562,12 @@ skip_move:
>      end_boot_allocator();
> 
>      softirq_init();
> -    tasklet_subsys_init();
> +    tasklet_early_init();
> 
>      late_setup_arch(&cmdline);
> 
> +    tasklet_subsys_init();
> +
>      scheduler_init();
>      idle_vcpu[0] = (struct vcpu*) ia64_r13;
>      idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
> @@ -626,7 +628,7 @@ printk("num_online_cpus=%d, max_cpus=%d\
>      local_irq_disable();
> 
>      printk("Brought up %ld CPUs\n", (long)num_online_cpus());
> -    smp_cpus_done(max_cpus);
> +    smp_cpus_done();
>  #endif
> 
>      initialise_gdb(); /* could be moved earlier */
> --- 2010-05-04.orig/xen/arch/x86/mpparse.c      2010-05-04 16:04:09.000000000
> +0200
> +++ 2010-05-04/xen/arch/x86/mpparse.c   2010-05-04 13:22:06.000000000 +0200
> @@ -35,7 +35,6 @@
> 
>  /* Have we found an MP table */
>  int smp_found_config;
> -unsigned int __devinitdata maxcpus = NR_CPUS;
> 
>  /*
>   * Various Linux-internal data structures created from the
> @@ -66,7 +65,7 @@ unsigned int def_to_bigsmp = 0;
>  /* Processor that is doing the boot up */
>  unsigned int boot_cpu_physical_apicid = -1U;
>  /* Internal processor count */
> -static unsigned int __devinitdata num_processors;
> +unsigned int __devinitdata num_processors;
> 
>  /* Bitmask of physically existing CPUs */
>  physid_mask_t phys_cpu_present_map;
> @@ -105,8 +104,10 @@ static int __devinit MP_processor_info (
>         int ver, apicid, cpu = 0;
>         physid_mask_t phys_cpu;
> 
> -       if (!(m->mpc_cpuflag & CPU_ENABLED))
> +       if (!(m->mpc_cpuflag & CPU_ENABLED)) {
> +               ++disabled_cpus;
>                 return -EINVAL;
> +       }
> 
>         apicid = mpc_apic_id(m, translation_table[mpc_record]);
> 
> @@ -185,9 +186,9 @@ static int __devinit MP_processor_info (
>                 return -ENOSPC;
>         }
> 
> -       if (num_processors >= maxcpus) {
> -               printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
> -                       " Processor ignored.\n", maxcpus);
> +       if (max_cpus && num_processors >= max_cpus) {
> +               printk(KERN_WARNING "WARNING: maxcpus limit of %u reached."
> +                      " Processor ignored.\n", max_cpus);
>                 return -ENOSPC;
>         }
> 
> --- 2010-05-04.orig/xen/arch/x86/setup.c        2010-05-04 16:04:09.000000000
> +0200
> +++ 2010-05-04/xen/arch/x86/setup.c     2010-05-04 16:43:22.000000000 +0200
> @@ -61,7 +61,7 @@ static int __initdata opt_nosmp = 0;
>  boolean_param("nosmp", opt_nosmp);
> 
>  /* maxcpus: maximum number of CPUs to activate. */
> -static unsigned int __initdata max_cpus = NR_CPUS;
> +unsigned int __devinitdata max_cpus;
>  integer_param("maxcpus", max_cpus);
> 
>  /* opt_watchdog: If true, run a watchdog NMI on each processor. */
> @@ -568,6 +568,11 @@ void __init __start_xen(unsigned long mb
>      if ( ((unsigned long)cpu0_stack & (STACK_SIZE-1)) != 0 )
>          EARLY_FAIL("Misaligned CPU0 stack.\n");
> 
> +    if ( opt_nosmp )
> +        max_cpus = prefill_possible_map(1);
> +    else if ( max_cpus )
> +        max_cpus = prefill_possible_map(max_cpus);
> +
>      if ( e820_raw_nr != 0 )
>      {
>          memmap_type = "Xen-e820";
> @@ -978,7 +983,7 @@ void __init __start_xen(unsigned long mb
>  #endif
> 
>      softirq_init();
> -    tasklet_subsys_init();
> +    tasklet_early_init();
> 
>      early_cpu_init();
> 
> @@ -1017,6 +1022,11 @@ void __init __start_xen(unsigned long mb
>      zap_low_mappings();
>  #endif
> 
> +    if ( !max_cpus )
> +        max_cpus = prefill_possible_map(0);
> +
> +    tasklet_subsys_init();
> +
>      init_apic_mappings();
> 
>      percpu_free_unused_areas();
> @@ -1049,12 +1059,9 @@ void __init __start_xen(unsigned long mb
>      vesa_mtrr_init();
>  #endif
> 
> -    if ( opt_nosmp )
> -        max_cpus = 0;
> -
>      iommu_setup();    /* setup iommu if available */
> 
> -    smp_prepare_cpus(max_cpus);
> +    smp_prepare_cpus(!opt_nosmp * max_cpus);
> 
>      spin_debug_enable();
> 
> @@ -1087,7 +1094,7 @@ void __init __start_xen(unsigned long mb
>      }
> 
>      printk("Brought up %ld CPUs\n", (long)num_online_cpus());
> -    smp_cpus_done(max_cpus);
> +    smp_cpus_done();
> 
>      initialise_gdb(); /* could be moved earlier */
> 
> --- 2010-05-04.orig/xen/arch/x86/smpboot.c      2010-05-04 16:04:09.000000000
> +0200
> +++ 2010-05-04/xen/arch/x86/smpboot.c   2010-05-04 13:22:06.000000000 +0200
> @@ -83,10 +83,12 @@ EXPORT_SYMBOL(cpu_online_map);
>  cpumask_t cpu_callin_map;
>  cpumask_t cpu_callout_map;
>  EXPORT_SYMBOL(cpu_callout_map);
> -cpumask_t cpu_possible_map = CPU_MASK_ALL;
> +cpumask_t cpu_possible_map;
>  EXPORT_SYMBOL(cpu_possible_map);
>  static cpumask_t smp_commenced_mask;
> 
> +unsigned int __devinitdata disabled_cpus;
> +
>  /* TSC's upper 32 bits can't be written in eariler CPU (before prescott),
> there
>   * is no way to resync one AP against BP. TBD: for prescott and above, we
>   * should use IA64's algorithm
> @@ -829,7 +831,11 @@ int alloc_cpu_id(void)
>  {
>         cpumask_t       tmp_map;
>         int cpu;
> -       cpus_complement(tmp_map, cpu_present_map);
> +
> +       if (max_cpus)
> +               cpus_andnot(tmp_map, cpu_possible_map, cpu_present_map);
> +       else
> +               cpus_complement(tmp_map, cpu_present_map);
>         cpu = first_cpu(tmp_map);
>         if (cpu >= NR_CPUS)
>                 return -ENODEV;
> @@ -1243,6 +1249,52 @@ void __devinit smp_prepare_boot_cpu(void
>         per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
>  }
> 
> +/*
> + * cpu_possible_mask should be static, it cannot change as cpu's
> + * are onlined, or offlined. The reason is per-cpu data-structures
> + * are allocated by some modules at init time, and dont expect to
> + * do this dynamically on cpu arrival/departure.
> + * cpu_present_mask on the other hand can change dynamically.
> + * In case when cpu_hotplug is not compiled, then we resort to current
> + * behaviour, which is cpu_possible == cpu_present.
> + * - Ashok Raj
> + *
> + * Three ways to find out the number of additional hotplug CPUs:
> + * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
> + * - The user can overwrite it with max_cpus=NUM
> + * - Otherwise don't reserve additional CPUs.
> + * We do this because additional CPUs waste a lot of memory.
> + * -AK
> + */
> +unsigned int __init prefill_possible_map(unsigned int max_cpus)
> +{
> +       unsigned int i, possible;
> +
> +       /* no processor from mptable or madt */
> +       if (!num_processors)
> +               num_processors = 1;
> +
> +       if (!max_cpus)
> +               possible = num_processors + disabled_cpus;
> +       else
> +               possible = max_cpus;
> +
> +       if (possible > NR_CPUS) {
> +               printk(KERN_WARNING
> +                      "%u processors exceeds NR_CPUS limit of %d\n",
> +                      possible, NR_CPUS);
> +               possible = NR_CPUS;
> +       }
> +
> +       printk(KERN_INFO "SMP: Allowing %u CPUs, %d hotplug CPUs\n",
> +              possible, max_t(int, possible - num_processors, 0));
> +
> +       for (i = 0; i < possible; i++)
> +               cpu_set(i, cpu_possible_map);
> +
> +       return possible;
> +}
> +
>  static void
>  remove_siblinginfo(int cpu)
>  {
> @@ -1568,7 +1620,7 @@ int __devinit __cpu_up(unsigned int cpu)
>  }
> 
> 
> -void __init smp_cpus_done(unsigned int max_cpus)
> +void __init smp_cpus_done(void)
>  {
>  #ifdef CONFIG_X86_IO_APIC
>         setup_ioapic_dest();
> --- 2010-05-04.orig/xen/common/tasklet.c        2010-04-22 14:43:25.000000000 +0200
> +++ 2010-05-04/xen/common/tasklet.c     2010-05-04 16:46:03.000000000 +0200
> @@ -18,7 +18,7 @@
>  #include <xen/tasklet.h>
> 
>  /* Some subsystems call into us before we are initialised. We ignore them. */
> -static bool_t tasklets_initialised;
> +static unsigned int __read_mostly tasklets_initialised = UINT_MAX;
> 
>  /*
>   * NB. Any modification to a tasklet_list requires the scheduler to run
> @@ -35,7 +35,8 @@ void tasklet_schedule_on_cpu(struct task
> 
>      spin_lock_irqsave(&tasklet_lock, flags);
> 
> -    if ( tasklets_initialised && !t->is_dead )
> +    if ( (tasklets_initialised == NR_CPUS || tasklets_initialised == cpu) &&
> +         !t->is_dead )
>      {
>          t->scheduled_on = cpu;
>          if ( !t->is_running )
> @@ -161,14 +162,24 @@ void tasklet_init(
>      t->data = data;
>  }
> 
> +void __init tasklet_early_init(void)
> +{
> +    unsigned int cpu = smp_processor_id();
> +
> +    INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
> +
> +    tasklets_initialised = cpu;
> +}
> +
>  void __init tasklet_subsys_init(void)
>  {
>      unsigned int cpu;
> 
>      for_each_possible_cpu ( cpu )
> -        INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
> +        if ( cpu != tasklets_initialised )
> +            INIT_LIST_HEAD(&per_cpu(tasklet_list, cpu));
> 
> -    tasklets_initialised = 1;
> +    tasklets_initialised = NR_CPUS;
>  }
> 
>  /*
> --- 2010-05-04.orig/xen/common/trace.c  2010-04-22 14:43:25.000000000 +0200
> +++ 2010-05-04/xen/common/trace.c       2010-05-04 16:26:50.000000000 +0200
> @@ -289,7 +289,7 @@ void __init init_trace_bufs(void)
>          return;
>      }
> 
> -    for(i = 0; i < NR_CPUS; i++)
> +    for_each_possible_cpu(i)
>          spin_lock_init(&per_cpu(t_lock, i));
> 
>      for(i=0; i<T_INFO_PAGES; i++)
> --- 2010-05-04.orig/xen/include/asm-x86/smp.h   2010-05-04 16:04:09.000000000 +0200
> +++ 2010-05-04/xen/include/asm-x86/smp.h        2010-05-04 13:22:06.000000000 +0200
> @@ -34,6 +34,10 @@ extern void smp_alloc_memory(void);
>  DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
>  DECLARE_PER_CPU(cpumask_t, cpu_core_map);
> 
> +extern unsigned int max_cpus, num_processors, disabled_cpus;
> +
> +unsigned int prefill_possible_map(unsigned int max_cpus);
> +
>  void smp_send_nmi_allbutself(void);
> 
>  void  send_IPI_mask(const cpumask_t *mask, int vector);
> --- 2010-05-04.orig/xen/include/xen/smp.h       2010-05-04 16:04:09.000000000 +0200
> +++ 2010-05-04/xen/include/xen/smp.h    2010-05-04 13:22:06.000000000 +0200
> @@ -26,7 +26,7 @@ extern int __cpu_up(unsigned int cpunum)
>  /*
>   * Final polishing of CPUs
>   */
> -extern void smp_cpus_done(unsigned int max_cpus);
> +extern void smp_cpus_done(void);
> 
>  /*
>   * Call a function on all other processors
> --- 2010-05-04.orig/xen/include/xen/tasklet.h   2010-04-22 14:43:25.000000000 +0200
> +++ 2010-05-04/xen/include/xen/tasklet.h        2010-05-04 16:42:13.000000000 +0200
> @@ -32,6 +32,7 @@ void tasklet_kill(struct tasklet *t);
>  void migrate_tasklets_from_cpu(unsigned int cpu);
>  void tasklet_init(
>      struct tasklet *t, void (*func)(unsigned long), unsigned long data);
> +void tasklet_early_init(void);
>  void tasklet_subsys_init(void);
> 
>  #endif /* __XEN_TASKLET_H__ */
> 
> 

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2010-05-14 16:32 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-05-04 16:00 [PATCH] x86: introduce and use prefill_possible_map() Jan Beulich
2010-05-14 16:32 ` Keir Fraser

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).