[PATH] Reduce per_cpu allocations to minimum needed for boot V3.

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Robin Holt <holt@sgi.com>
To: linux-ia64@vger.kernel.org
Subject: [PATH] Reduce per_cpu allocations to minimum needed for boot V3.
Date: Fri, 08 Feb 2008 22:50:15 +0000	[thread overview]
Message-ID: <20080208225015.GK3875@sgi.com> (raw)


The attached patch significantly shrinks boot memory allocation on ia64.
It does this by not allocating per_cpu areas for cpus that can never
exist.

In the case where acpi does not have any numa node description of
the cpus, I defaulted to assigning the first 4 to node 0.  For the
!CONFIG_ACPI  I used for_each_possible_cpu().


Signed-off-by: Robin Holt <holt@sgi.com>

---

I tested all the different config options.  allyesconfig fails with
or without this patch so that was the one exception.  Otherwise,
allnoconfig, allmodconfig, defconfig, and configs/* all compiled.
Additionally, I booted the sn2- and defconfig both on altix and the
defconfig on a zx2000 with 2 cpus.  I would like it if somebody with
access to a simulator could build and boot this.  That is a different
code path which I have no means of checking.

Version 3:

I reworked this patch to use a cpumask to track the cpus we have seen.
It still initializes the .nid to NUMA_NO_NODE (-1).  The introduction of
a bitmask makes the scans much cleaner.

This patch could be using the cpu_possible_map instead of our own.
I was reluctant to do that, but there is nothing that prevents it.
Does anybody have an opinion?


Version 2 fixed a port bug.  It also introduces NUMA_NO_NODE for ia64.
This is a direct copy from x86.

One comment I have received is the hard-coded 4 described above should
probably be 8 or 16 to handle larger non-NUMA machines.  I originally
set it to 4 because my recollection was that, at most, you could have
four processors per FSB, but maybe that is just an SGI limitation.

How should this be set?  Should I be using a PAL call? processor model?
Limit by current FSB spec and adjust as new processors come along?


Using a patched SuSE SLES10 kernel with both the mca patch that Jack/Russ
submitted a couple days ago and the attached.

On a 2 cpu, 6GB system, NR_CPUS=4096:
Before the patch:
Memory: 5687728k/6234784k available (5777k code, 579632k reserved, 10450k data, 672k init)
After both patches:
Memory: 6211984k/6235040k available (5552k code, 55376k reserved, 10418k data, 656k init)
90% savings on reserved.

On a 1 cpu, 1GB system, NR_CPUS=4096 before 572,464K, after 37,456k for a 93% savings.


Index: per_cpu/arch/ia64/kernel/setup.c
===================================================================
--- per_cpu.orig/arch/ia64/kernel/setup.c	2008-02-08 15:05:45.564741552 -0600
+++ per_cpu/arch/ia64/kernel/setup.c	2008-02-08 16:38:18.763202424 -0600
@@ -45,6 +45,7 @@
 #include <linux/cpufreq.h>
 #include <linux/kexec.h>
 #include <linux/crash_dump.h>
+#include <linux/numa.h>
 
 #include <asm/ia32.h>
 #include <asm/machvec.h>
@@ -494,9 +495,11 @@ setup_arch (char **cmdline_p)
 # ifdef CONFIG_ACPI_NUMA
 	acpi_numa_init();
 # endif
+	per_cpu_scan_finalize(4, additional_cpus);
 #else
 # ifdef CONFIG_SMP
 	smp_build_cpu_map();	/* happens, e.g., with the Ski simulator */
+	per_cpu_scan_finalize(num_possible_cpus(), additional_cpus);
 # endif
 #endif /* CONFIG_APCI_BOOT */
 
Index: per_cpu/arch/ia64/mm/discontig.c
===================================================================
--- per_cpu.orig/arch/ia64/mm/discontig.c	2008-02-08 15:05:45.600746284 -0600
+++ per_cpu/arch/ia64/mm/discontig.c	2008-02-08 15:06:51.653423733 -0600
@@ -104,7 +104,7 @@ static int __meminit early_nr_cpus_node(
 {
 	int cpu, n = 0;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_possible_early_cpu(cpu)
 		if (node == node_cpuid[cpu].nid)
 			n++;
 
@@ -142,7 +142,7 @@ static void *per_cpu_node_setup(void *cp
 #ifdef CONFIG_SMP
 	int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+	for_each_possible_early_cpu(cpu) {
 		if (node == node_cpuid[cpu].nid) {
 			memcpy(__va(cpu_data), __phys_per_cpu_start,
 			       __per_cpu_end - __per_cpu_start);
@@ -345,7 +345,7 @@ static void __init initialize_pernode_da
 
 #ifdef CONFIG_SMP
 	/* Set the node_data pointer for each per-cpu struct */
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+	for_each_possible_early_cpu(cpu) {
 		node = node_cpuid[cpu].nid;
 		per_cpu(cpu_info, cpu).node_data = mem_data[node].node_data;
 	}
@@ -493,13 +493,9 @@ void __cpuinit *per_cpu_init(void)
 	int cpu;
 	static int first_time = 1;
 
-
-	if (smp_processor_id() != 0)
-		return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
-
 	if (first_time) {
 		first_time = 0;
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
+		for_each_possible_early_cpu(cpu)
 			per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
 	}
 
Index: per_cpu/arch/ia64/kernel/acpi.c
===================================================================
--- per_cpu.orig/arch/ia64/kernel/acpi.c	2008-02-08 15:05:45.544738923 -0600
+++ per_cpu/arch/ia64/kernel/acpi.c	2008-02-08 15:06:51.669425834 -0600
@@ -482,6 +482,7 @@ acpi_numa_processor_affinity_init(struct
 	    (pa->apic_id << 8) | (pa->local_sapic_eid);
 	/* nid should be overridden as logical node id later */
 	node_cpuid[srat_num_cpus].nid = pxm;
+	cpu_set(srat_num_cpus, early_cpu_possible_map);
 	srat_num_cpus++;
 }
 
@@ -559,8 +560,11 @@ void __init acpi_numa_arch_fixup(void)
 	}
 
 	/* set logical node id in cpu structure */
-	for (i = 0; i < srat_num_cpus; i++)
+	for (i = 0; i < srat_num_cpus; i++) {
+		if (!cpu_isset(i, early_cpu_possible_map))
+			continue;
 		node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid);
+	}
 
 	printk(KERN_INFO "Number of logical nodes in system = %d\n",
 	       num_online_nodes());
Index: per_cpu/arch/ia64/kernel/numa.c
===================================================================
--- per_cpu.orig/arch/ia64/kernel/numa.c	2008-02-08 15:05:45.560741026 -0600
+++ per_cpu/arch/ia64/kernel/numa.c	2008-02-08 15:06:51.697429512 -0600
@@ -73,7 +73,7 @@ void __init build_cpu_to_node_map(void)
 	for(node=0; node < MAX_NUMNODES; node++)
 		cpus_clear(node_to_cpu_mask[node]);
 
-	for(cpu = 0; cpu < NR_CPUS; ++cpu) {
+	for_each_possible_early_cpu(cpu) {
 		node = -1;
 		for (i = 0; i < NR_CPUS; ++i)
 			if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
Index: per_cpu/include/asm-ia64/acpi.h
===================================================================
--- per_cpu.orig/include/asm-ia64/acpi.h	2008-02-08 15:05:53.685808838 -0600
+++ per_cpu/include/asm-ia64/acpi.h	2008-02-08 15:06:51.721432664 -0600
@@ -115,7 +115,11 @@ extern unsigned int is_cpu_cpei_target(u
 extern void set_cpei_target_cpu(unsigned int cpu);
 extern unsigned int get_cpei_target_cpu(void);
 extern void prefill_possible_map(void);
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
 extern int additional_cpus;
+#else
+#define additional_cpus 0
+#endif
 
 #ifdef CONFIG_ACPI_NUMA
 #if MAX_NUMNODES > 256
Index: per_cpu/include/asm-ia64/numa.h
===================================================================
--- per_cpu.orig/include/asm-ia64/numa.h	2008-02-08 15:05:53.697810415 -0600
+++ per_cpu/include/asm-ia64/numa.h	2008-02-08 16:42:09.086660888 -0600
@@ -22,6 +22,8 @@
 
 #include <asm/mmzone.h>
 
+#define NUMA_NO_NODE	-1
+
 extern u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned;
 extern cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
 extern pg_data_t *pgdat_list[MAX_NUMNODES];
@@ -68,6 +70,26 @@ extern int paddr_to_nid(unsigned long pa
 extern void map_cpu_to_node(int cpu, int nid);
 extern void unmap_cpu_from_node(int cpu, int nid);
 
+extern cpumask_t early_cpu_possible_map;
+#define for_each_possible_early_cpu(cpu)  \
+	for_each_cpu_mask((cpu), early_cpu_possible_map)
+
+static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus)
+{
+	int low_cpu, high_cpu;
+	int cpu;
+
+	low_cpu = cpus_weight(early_cpu_possible_map);
+
+	high_cpu = max(low_cpu, min_cpus);
+	high_cpu = min(high_cpu + reserve_cpus, NR_CPUS);
+
+	for (cpu = low_cpu; cpu < high_cpu; cpu++) {
+		cpu_set(cpu, early_cpu_possible_map);
+		if (node_cpuid[cpu].nid == NUMA_NO_NODE)
+			node_cpuid[cpu].nid = 0;
+	}
+}
 
 #else /* !CONFIG_NUMA */
 #define map_cpu_to_node(cpu, nid)	do{}while(0)
@@ -75,6 +97,7 @@ extern void unmap_cpu_from_node(int cpu,
 
 #define paddr_to_nid(addr)	0
 
+static inline void per_cpu_scan_finalize(int min_cpus, int reserve_cpus) {};
 #endif /* CONFIG_NUMA */
 
 #endif /* _ASM_IA64_NUMA_H */
Index: per_cpu/arch/ia64/mm/numa.c
===================================================================
--- per_cpu.orig/arch/ia64/mm/numa.c	2008-02-08 15:05:45.604746810 -0600
+++ per_cpu/arch/ia64/mm/numa.c	2008-02-08 16:41:43.675628259 -0600
@@ -27,7 +27,10 @@
  */
 int num_node_memblks;
 struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
-struct node_cpuid_s node_cpuid[NR_CPUS];
+struct node_cpuid_s node_cpuid[NR_CPUS] =
+	{ [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } };
+cpumask_t early_cpu_possible_map = CPU_MASK_NONE;
+
 /*
  * This is a matrix with "distances" between nodes, they should be
  * proportional to the memory access latency ratios.

next             reply	other threads:[~2008-02-08 22:50 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-02-08 22:50 Robin Holt [this message]
2008-02-08 23:10 ` [PATH] Reduce per_cpu allocations to minimum needed for boot V3 Luck, Tony
2008-02-08 23:20 ` [PATH] Reduce per_cpu allocations to minimum needed for boot Robin Holt
2008-02-09  0:09 ` [PATH] Reduce per_cpu allocations to minimum needed for bootV3 Luck, Tony
2008-02-10 14:06 ` Robin Holt
2008-02-11 18:41 ` Luck, Tony

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080208225015.GK3875@sgi.com \
    --to=holt@sgi.com \
    --cc=linux-ia64@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.