* percpu-2.5.63-bkcurr
@ 2003-03-01 5:59 William Lee Irwin III
2003-03-01 7:36 ` percpu-2.5.63-bkcurr William Lee Irwin III
0 siblings, 1 reply; 5+ messages in thread
From: William Lee Irwin III @ 2003-03-01 5:59 UTC (permalink / raw)
To: linux-kernel
Shove per-cpu areas into node-local memory for i386 discontigmem,
or at least NUMA-Q. You'll have to plop down early_cpu_to_node()
and early_node_to_cpumask() stubs to use it on, say Summit.
-- wli
===== arch/i386/mm/discontig.c 1.9 vs edited =====
--- 1.9/arch/i386/mm/discontig.c Fri Feb 28 15:08:58 2003
+++ edited/arch/i386/mm/discontig.c Fri Feb 28 21:48:54 2003
@@ -48,8 +48,6 @@
extern unsigned long totalram_pages;
extern unsigned long totalhigh_pages;
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
unsigned long node_remap_start_pfn[MAX_NUMNODES];
unsigned long node_remap_size[MAX_NUMNODES];
unsigned long node_remap_offset[MAX_NUMNODES];
@@ -67,6 +65,20 @@
node_end_pfn[nid] = max_pfn;
}
+extern char __per_cpu_start[], __per_cpu_end[];
+unsigned long __per_cpu_offset[NR_CPUS];
+
+#define PER_CPU_PAGES PFN_UP((unsigned long)(__per_cpu_end-__per_cpu_start))
+#define MEM_MAP_SIZE(n) PFN_UP((node_end_pfn[n]-node_start_pfn[n]+1)*sizeof(struct page))
+
+#ifdef CONFIG_X86_NUMAQ
+#define early_cpu_to_node(cpu) ((cpu)/4)
+#define early_node_to_cpumask(node) (0xFUL << (4*(node)))
+#else
+#define early_cpu_to_node(cpu) cpu_to_node(cpu)
+#define early_node_to_cpumask(node) node_to_cpumask(node)
+#endif
+
/*
* Allocate memory for the pg_data_t via a crude pre-bootmem method
* We ought to relocate these onto their own node later on during boot.
@@ -82,6 +94,44 @@
}
}
+static void __init allocate_one_cpu_area(int cpu)
+{
+ int cpu_in_node, node = early_cpu_to_node(cpu);
+ unsigned long nodemask = early_node_to_cpumask(node);
+ unsigned long node_vaddr = (unsigned long)node_remap_start_vaddr[node];
+
+ if (!PER_CPU_PAGES)
+ return;
+
+ if (!node) {
+ __per_cpu_offset[cpu] = min_low_pfn*PAGE_SIZE
+ + PAGE_OFFSET
+ - (unsigned long)__per_cpu_start;
+ min_low_pfn += PER_CPU_PAGES;
+ return;
+ }
+
+ cpu_in_node = hweight32(nodemask & ((1UL << cpu) - 1));
+ __per_cpu_offset[cpu] = node_vaddr + MEM_MAP_SIZE(node)*PAGE_SIZE
+ + PFN_UP(sizeof(pg_data_t))*PAGE_SIZE
+ + PER_CPU_PAGES*cpu_in_node*PAGE_SIZE
+ - (unsigned long)__per_cpu_start;
+}
+
+void __init setup_per_cpu_areas(void)
+{
+ int node, cpu;
+ for (node = 0; node < numnodes; ++node) {
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (early_cpu_to_node(cpu) == node) {
+ memcpy(RELOC_HIDE((char *)__per_cpu_start, __per_cpu_offset[cpu]),
+ __per_cpu_start,
+ PER_CPU_PAGES*PAGE_SIZE);
+ }
+ }
+ }
+}
+
/*
* Register fully available low RAM pages with the bootmem allocator.
*/
@@ -144,13 +194,11 @@
unsigned long size, reserve_pages = 0;
for (nid = 1; nid < numnodes; nid++) {
- /* calculate the size of the mem_map needed in bytes */
- size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
- * sizeof(struct page) + sizeof(pg_data_t);
- /* convert size to large (pmd size) pages, rounding up */
- size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
- /* now the roundup is correct, convert to PAGE_SIZE pages */
- size = size * PTRS_PER_PTE;
+ /* calculate the size of the mem_map needed in pages */
+ size = MEM_MAP_SIZE(nid) + PFN_UP(sizeof(pg_data_t))
+ + PER_CPU_PAGES*hweight32(early_node_to_cpumask(nid));
+ /* round up to nearest pmd boundary */
+ size = (size + PTRS_PER_PTE - 1) & ~(PTRS_PER_PTE - 1);
printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
size, nid);
node_remap_size[nid] = size;
@@ -196,9 +244,14 @@
printk("Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
for (nid = 0; nid < numnodes; nid++) {
+ int cpu;
node_remap_start_vaddr[nid] = pfn_to_kaddr(
highstart_pfn - node_remap_offset[nid]);
allocate_pgdat(nid);
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (early_cpu_to_node(cpu) == nid)
+ allocate_one_cpu_area(cpu);
+ }
printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
(ulong) node_remap_start_vaddr[nid],
(ulong) pfn_to_kaddr(highstart_pfn
===== include/asm-i386/numaq.h 1.7 vs edited =====
--- 1.7/include/asm-i386/numaq.h Fri Feb 28 15:03:59 2003
+++ edited/include/asm-i386/numaq.h Fri Feb 28 18:37:53 2003
@@ -169,9 +169,9 @@
struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */
};
-static inline unsigned long get_zholes_size(int nid)
+static inline unsigned long *get_zholes_size(int nid)
{
- return 0;
+ return NULL;
}
#endif /* CONFIG_X86_NUMAQ */
#endif /* NUMAQ_H */
===== include/asm-i386/percpu.h 1.1 vs edited =====
--- 1.1/include/asm-i386/percpu.h Fri Mar 15 04:55:35 2002
+++ edited/include/asm-i386/percpu.h Fri Feb 28 18:31:26 2003
@@ -1,6 +1,30 @@
#ifndef __ARCH_I386_PERCPU__
#define __ARCH_I386_PERCPU__
+#include <linux/config.h>
+#include <linux/compiler.h>
+
+#ifndef CONFIG_DISCONTIGMEM
#include <asm-generic/percpu.h>
+#else /* CONFIG_DISCONTIGMEM */
+
+extern unsigned long __per_cpu_offset[NR_CPUS];
+void setup_per_cpu_areas(void);
+
+/* Separate out the type, so (int[3], foo) works. */
+#ifndef MODULE
+#define DEFINE_PER_CPU(type, name) \
+ __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
+#endif
+
+/* var is in discarded region: offset to particular copy we want */
+#define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
+#define __get_cpu_var(var) per_cpu(var, smp_processor_id())
+
+#define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(var##__per_cpu)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_cpu)
+
+#endif /* CONFIG_DISCONTIGMEM */
#endif /* __ARCH_I386_PERCPU__ */
^ permalink raw reply [flat|nested] 5+ messages in thread* Re: percpu-2.5.63-bkcurr 2003-03-01 5:59 percpu-2.5.63-bkcurr William Lee Irwin III @ 2003-03-01 7:36 ` William Lee Irwin III 2003-03-01 7:40 ` percpu-2.5.63-bkcurr William Lee Irwin III 0 siblings, 1 reply; 5+ messages in thread From: William Lee Irwin III @ 2003-03-01 7:36 UTC (permalink / raw) To: linux-kernel On Fri, Feb 28, 2003 at 09:59:22PM -0800, William Lee Irwin III wrote: > Shove per-cpu areas into node-local memory for i386 discontigmem, > or at least NUMA-Q. You'll have to plop down early_cpu_to_node() > and early_node_to_cpumask() stubs to use it on, say Summit. Tentative followup #1 (thanks Zwane!) Use per-cpu rq's in the sched.c to avoid remote cache misses there. It actually means something now. Index: linux-2.5.59/kernel/sched.c =================================================================== RCS file: /build/cvsroot/linux-2.5.59/kernel/sched.c,v retrieving revision 1.1.1.1 diff -u -r1.1.1.1 sched.c --- linux-2.5.59/kernel/sched.c 17 Jan 2003 02:46:29 -0000 1.1.1.1 +++ linux-2.5.59/kernel/sched.c 17 Jan 2003 10:03:31 -0000 @@ -160,9 +160,9 @@ atomic_t nr_iowait; } ____cacheline_aligned; -static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; +static DEFINE_PER_CPU(struct runqueue, runqueues) = {{ 0 }}; -#define cpu_rq(cpu) (runqueues + (cpu)) +#define cpu_rq(cpu) (&per_cpu(runqueues, cpu)) #define this_rq() cpu_rq(smp_processor_id()) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -- function.linuxpower.ca ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: percpu-2.5.63-bkcurr 2003-03-01 7:36 ` percpu-2.5.63-bkcurr William Lee Irwin III @ 2003-03-01 7:40 ` William Lee Irwin III 2003-03-01 7:46 ` percpu-2.5.63-bkcurr William Lee Irwin III 0 siblings, 1 reply; 5+ messages in thread From: William Lee Irwin III @ 2003-03-01 7:40 UTC (permalink / raw) To: linux-kernel On Fri, Feb 28, 2003 at 09:59:22PM -0800, William Lee Irwin III wrote: >> Shove per-cpu areas into node-local memory for i386 discontigmem, >> or at least NUMA-Q. You'll have to plop down early_cpu_to_node() >> and early_node_to_cpumask() stubs to use it on, say Summit. On Fri, Feb 28, 2003 at 11:36:55PM -0800, William Lee Irwin III wrote: > Tentative followup #1 (thanks Zwane!) > Use per-cpu rq's in the sched.c to avoid remote cache misses there. > It actually means something now. Tentative followup #2 -- totally untested, at some point I have to figure out how to avoid breaking the compile for non-NUMA-Q with this. diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/arch/i386/mm/discontig.c sched-2.5/arch/i386/mm/discontig.c --- linux-2.5/arch/i386/mm/discontig.c Fri Feb 28 21:48:54 2003 +++ sched-2.5/arch/i386/mm/discontig.c Fri Feb 28 23:12:45 2003 @@ -66,10 +66,12 @@ } extern char __per_cpu_start[], __per_cpu_end[]; -unsigned long __per_cpu_offset[NR_CPUS]; +extern char __per_node_start[], __per_node_end[]; +unsigned long __per_cpu_offset[NR_CPUS], __per_node_offset[MAX_NR_NODES]; #define PER_CPU_PAGES PFN_UP((unsigned long)(__per_cpu_end-__per_cpu_start)) #define MEM_MAP_SIZE(n) PFN_UP((node_end_pfn[n]-node_start_pfn[n]+1)*sizeof(struct page)) +#define PER_NODE_PAGES PFN_UP((unsigned long)(__per_node_end-__per_node_start)) #ifdef CONFIG_X86_NUMAQ #define early_cpu_to_node(cpu) ((cpu)/4) @@ -94,6 +96,26 @@ } } +static void __init allocate_per_node_area(int node) +{ + unsigned long node_vaddr = (unsigned long)node_remap_start_vaddr[node]; + + if (!PER_CPU_PAGES) + return; + + if (!node) { + 
__per_node_offset[node] = min_low_pfn*PAGE_SIZE + + PAGE_OFFSET + - (unsigned long)__per_node_start; + min_low_pfn += PER_NODE_PAGES; + return; + } + + __per_node_offset[node] = node_vaddr + MEM_MAP_SIZE(node)*PAGE_SIZE + + PFN_UP(sizeof(pg_data_t))*PAGE_SIZE + - (unsigned long)__per_node_start; +} + static void __init allocate_one_cpu_area(int cpu) { int cpu_in_node, node = early_cpu_to_node(cpu); @@ -115,9 +137,21 @@ __per_cpu_offset[cpu] = node_vaddr + MEM_MAP_SIZE(node)*PAGE_SIZE + PFN_UP(sizeof(pg_data_t))*PAGE_SIZE + PER_CPU_PAGES*cpu_in_node*PAGE_SIZE + + PER_NODE_PAGES*PAGE_SIZE - (unsigned long)__per_cpu_start; } +void __init setup_per_node_areas(void) +{ + int node; + for (node = 0; node < numnodes; ++node) { + memcpy(RELOC_HIDE((char *)__per_node_start, + __per_node_offset[node]), + __per_node_start, + PER_NODE_PAGES*PAGE_SIZE); + } +} + void __init setup_per_cpu_areas(void) { int node, cpu; @@ -248,6 +282,7 @@ node_remap_start_vaddr[nid] = pfn_to_kaddr( highstart_pfn - node_remap_offset[nid]); allocate_pgdat(nid); + allocate_per_node_area(nid); for (cpu = 0; cpu < NR_CPUS; ++cpu) { if (early_cpu_to_node(cpu) == nid) allocate_one_cpu_area(cpu); diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/arch/i386/vmlinux.lds.S sched-2.5/arch/i386/vmlinux.lds.S --- linux-2.5/arch/i386/vmlinux.lds.S Fri Feb 28 18:32:34 2003 +++ sched-2.5/arch/i386/vmlinux.lds.S Fri Feb 28 23:02:36 2003 @@ -83,6 +83,10 @@ .data.percpu : { *(.data.percpu) } __per_cpu_end = .; . = ALIGN(4096); + __per_node_start = .; + .data.pernode : { *(.data.pernode) } + __per_node_end = .; + . 
= ALIGN(4096); __init_end = .; /* freed after init ends here */ diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/include/asm-i386/pernode.h sched-2.5/include/asm-i386/pernode.h --- linux-2.5/include/asm-i386/pernode.h Wed Dec 31 16:00:00 1969 +++ sched-2.5/include/asm-i386/pernode.h Fri Feb 28 23:08:53 2003 @@ -0,0 +1,29 @@ +#ifndef __ARCH_I386_PERNODE__ +#define __ARCH_I386_PERNODE__ + +#include <linux/config.h> +#include <linux/compiler.h> + +#ifndef CONFIG_DISCONTIGMEM +#include <asm-generic/percpu.h> +#else /* CONFIG_DISCONTIGMEM */ + +extern unsigned long __per_node_offset[NR_CPUS]; +void setup_per_node_areas(void); + +/* Separate out the type, so (int[3], foo) works. */ +#ifndef MODULE +#define DEFINE_PER_NODE(type, name) \ + __attribute__((__section__(".data.pernode"))) __typeof__(type) name##__per_node +#endif + +/* var is in discarded region: offset to particular copy we want */ +#define per_node(var, node) (*RELOC_HIDE(&var##__per_node, __per_node_offset[node])) + +#define DECLARE_PER_NODE(type, name) extern __typeof__(type) name##__per_node +#define EXPORT_PER_NODE_SYMBOL(var) EXPORT_SYMBOL(var##__per_node) +#define EXPORT_PER_NODE_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(var##__per_node) + +#endif /* CONFIG_DISCONTIGMEM */ + +#endif /* __ARCH_I386_PERNODE__ */ diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/init/main.c sched-2.5/init/main.c --- linux-2.5/init/main.c Fri Feb 28 21:49:13 2003 +++ sched-2.5/init/main.c Fri Feb 28 23:10:12 2003 @@ -38,6 +38,7 @@ #include <asm/io.h> #include <asm/bugs.h> +#include <asm/pernode.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/smp.h> @@ -317,6 +318,10 @@ } #endif /* !__GENERIC_PER_CPU */ +#ifndef CONFIG_NUMA +static inline void setup_per_node_areas(void) { } +#endif + /* Called by boot processor to activate the rest. 
*/ static void __init smp_init(void) { @@ -376,6 +381,7 @@ printk(linux_banner); setup_arch(&command_line); setup_per_cpu_areas(); + setup_per_node_areas(); /* * Mark the boot cpu "online" so that it can call console drivers in diff -Nur --exclude=RCS --exclude=CVS --exclude=SCCS --exclude=BitKeeper --exclude=ChangeSet linux-2.5/kernel/sched.c sched-2.5/kernel/sched.c --- linux-2.5/kernel/sched.c Fri Feb 28 18:33:00 2003 +++ sched-2.5/kernel/sched.c Fri Feb 28 23:17:19 2003 @@ -32,6 +32,7 @@ #include <linux/delay.h> #include <linux/timer.h> #include <linux/rcupdate.h> +#include <asm/pernode.h> /* * Convert user-nice values [ -20 ... 0 ... 19 ] @@ -166,9 +167,9 @@ atomic_t nr_iowait; } ____cacheline_aligned; -static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; +static DEFINE_PER_CPU(struct runqueue, runqueues) = {{ 0 }}; -#define cpu_rq(cpu) (runqueues + (cpu)) +#define cpu_rq(cpu) (&per_cpu(runqueues, cpu)) #define this_rq() cpu_rq(smp_processor_id()) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) @@ -189,12 +190,11 @@ * Keep track of running tasks. 
*/ -static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = - {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)}; +static DEFINE_PER_NODE(atomic_t, node_nr_running) = ATOMIC_INIT(0); static inline void nr_running_init(struct runqueue *rq) { - rq->node_nr_running = &node_nr_running[0]; + rq->node_nr_running = &per_node(node_nr_running, 0); } static inline void nr_running_inc(runqueue_t *rq) @@ -214,7 +214,7 @@ int i; for (i = 0; i < NR_CPUS; i++) - cpu_rq(i)->node_nr_running = &node_nr_running[cpu_to_node(i)]; + cpu_rq(i)->node_nr_running = &per_node(node_nr_running, cpu_to_node(i)); } #else /* !CONFIG_NUMA */ @@ -748,7 +748,7 @@ minload = 10000000; for (i = 0; i < numnodes; i++) { - load = atomic_read(&node_nr_running[i]); + load = atomic_read(&per_node(node_nr_running, i)); if (load < minload) { minload = load; node = i; @@ -790,13 +790,13 @@ int i, node = -1, load, this_load, maxload; this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1) - + atomic_read(&node_nr_running[this_node]); + + atomic_read(&per_node(node_nr_running, this_node)); this_rq()->prev_node_load[this_node] = this_load; for (i = 0; i < numnodes; i++) { if (i == this_node) continue; load = (this_rq()->prev_node_load[i] >> 1) - + atomic_read(&node_nr_running[i]); + + atomic_read(&per_node(node_nr_running, i)); this_rq()->prev_node_load[i] = load; if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) { maxload = load; ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: percpu-2.5.63-bkcurr 2003-03-01 7:40 ` percpu-2.5.63-bkcurr William Lee Irwin III @ 2003-03-01 7:46 ` William Lee Irwin III 2003-03-01 8:27 ` percpu-2.5.63-bkcurr William Lee Irwin III 0 siblings, 1 reply; 5+ messages in thread From: William Lee Irwin III @ 2003-03-01 7:46 UTC (permalink / raw) To: linux-kernel On Fri, Feb 28, 2003 at 11:36:55PM -0800, William Lee Irwin III wrote: > Tentative followup #1 (thanks Zwane!) >> Use per-cpu rq's in the sched.c to avoid remote cache misses there. >> It actually means something now. On Fri, Feb 28, 2003 at 11:40:35PM -0800, William Lee Irwin III wrote: > Tentative followup #2 -- totally untested, at some point I have to > figure out how to avoid breaking the compile for non-NUMA-Q with this. woops diff -u sched-2.5/arch/i386/mm/discontig.c sched-2.5/arch/i386/mm/discontig.c --- sched-2.5/arch/i386/mm/discontig.c Fri Feb 28 23:12:45 2003 +++ sched-2.5/arch/i386/mm/discontig.c Fri Feb 28 23:42:15 2003 @@ -230,6 +230,7 @@ for (nid = 1; nid < numnodes; nid++) { /* calculate the size of the mem_map needed in pages */ size = MEM_MAP_SIZE(nid) + PFN_UP(sizeof(pg_data_t)) + + PER_NODE_PAGES + PER_CPU_PAGES*hweight32(early_node_to_cpumask(nid)); /* round up to nearest pmd boundary */ size = (size + PTRS_PER_PTE - 1) & ~(PTRS_PER_PTE - 1); ^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: percpu-2.5.63-bkcurr 2003-03-01 7:46 ` percpu-2.5.63-bkcurr William Lee Irwin III @ 2003-03-01 8:27 ` William Lee Irwin III 0 siblings, 0 replies; 5+ messages in thread From: William Lee Irwin III @ 2003-03-01 8:27 UTC (permalink / raw) To: linux-kernel On Fri, Feb 28, 2003 at 11:36:55PM -0800, William Lee Irwin III wrote: >>> Use per-cpu rq's in the sched.c to avoid remote cache misses there. >>> It actually means something now. On Fri, Feb 28, 2003 at 11:40:35PM -0800, William Lee Irwin III wrote: >> Tentative followup #2 -- totally untested, at some point I have to >> figure out how to avoid breaking the compile for non-NUMA-Q with this. Tested. It works. -- wli ^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2003-03-01 8:16 UTC | newest] Thread overview: 5+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2003-03-01 5:59 percpu-2.5.63-bkcurr William Lee Irwin III 2003-03-01 7:36 ` percpu-2.5.63-bkcurr William Lee Irwin III 2003-03-01 7:40 ` percpu-2.5.63-bkcurr William Lee Irwin III 2003-03-01 7:46 ` percpu-2.5.63-bkcurr William Lee Irwin III 2003-03-01 8:27 ` percpu-2.5.63-bkcurr William Lee Irwin III
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox