* [Linux-ia64] discontigmem patch for 2.4.20
@ 2003-03-04 9:28 suganuma
2003-03-06 18:31 ` Bjorn Helgaas
0 siblings, 1 reply; 2+ messages in thread
From: suganuma @ 2003-03-04 9:28 UTC (permalink / raw)
To: linux-ia64
[-- Attachment #1: Type: text/plain, Size: 333 bytes --]
Hi all,
I back ported the IA64 discontigmem function in 2.5 to 2.4.20.
I tested the patch on 8 way Itanium2 NUMA server with
a NUMA kernel and an SMP kernel.
David, Bjorn, please let me know if there is any possibility
that you could take this patch into the ia64 patch for 2.4.
Regards,
Kimi
--
suganuma <suganuma@hpc.bs1.fc.nec.co.jp>
[-- Attachment #2: discontig-2.4.20-030304.patch --]
[-- Type: application/octet-stream, Size: 61197 bytes --]
diff -Nur linux-2.4.20-base/arch/ia64/config.in linux-2.4.20-dcm/arch/ia64/config.in
--- linux-2.4.20-base/arch/ia64/config.in Mon Mar 3 10:24:21 2003
+++ linux-2.4.20-dcm/arch/ia64/config.in Mon Mar 3 10:55:12 2003
@@ -66,6 +66,14 @@
fi
if [ "$CONFIG_IA64_GENERIC" = "y" -o "$CONFIG_IA64_DIG" = "y" -o "$CONFIG_IA64_HP_ZX1" = "y" ]; then
+ bool ' Enable NUMA support' CONFIG_NUMA
+ if [ "$CONFIG_NUMA" = "y" ]; then
+ define_bool CONFIG_DISCONTIGMEM y
+ choice 'Maximum Memory per NUMA Node' \
+ "16GB CONFIG_IA64_NODESIZE_16GB \
+ 64GB CONFIG_IA64_NODESIZE_64GB \
+ 256GB CONFIG_IA64_NODESIZE_256GB" 16GB
+ fi
bool ' Enable IA-64 Machine Check Abort' CONFIG_IA64_MCA
define_bool CONFIG_PM y
fi
diff -Nur linux-2.4.20-base/arch/ia64/kernel/acpi.c linux-2.4.20-dcm/arch/ia64/kernel/acpi.c
--- linux-2.4.20-base/arch/ia64/kernel/acpi.c Mon Mar 3 10:24:21 2003
+++ linux-2.4.20-dcm/arch/ia64/kernel/acpi.c Tue Mar 4 14:26:48 2003
@@ -8,6 +8,9 @@
* Copyright (C) 2000 Intel Corp.
* Copyright (C) 2000,2001 J.I. Lee <jung-ik.lee@intel.com>
* Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com>
+ * Copyright (C) 2001 Takayoshi Kochi <t-kouchi@cq.jp.nec.com>
+ * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
*
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
@@ -38,11 +41,13 @@
#include <linux/irq.h>
#include <linux/acpi.h>
#include <linux/efi.h>
+#include <linux/mm.h>
#include <asm/io.h>
#include <asm/iosapic.h>
#include <asm/machvec.h>
#include <asm/page.h>
#include <asm/system.h>
+#include <asm/numa.h>
#define PREFIX "ACPI: "
@@ -559,6 +564,191 @@
}
+#ifdef CONFIG_ACPI_NUMA
+
+#define SLIT_DEBUG
+
+#define PXM_FLAG_LEN ((MAX_PXM_DOMAINS + 1)/32)
+
+static int __initdata srat_num_cpus; /* number of cpus */
+static u32 __initdata pxm_flag[PXM_FLAG_LEN];
+#define pxm_bit_set(bit) (set_bit(bit,(void *)pxm_flag))
+#define pxm_bit_test(bit) (test_bit(bit,(void *)pxm_flag))
+/* maps to convert between proximity domain and logical node ID */
+int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
+int __initdata nid_to_pxm_map[NR_NODES];
+static struct acpi_table_slit __initdata *slit_table;
+
+/*
+ * ACPI 2.0 SLIT (System Locality Information Table)
+ * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
+ */
+void __init
+acpi_numa_slit_init (struct acpi_table_slit *slit)
+{
+ u32 len;
+
+ len = sizeof(struct acpi_table_header) + 8
+ + slit->localities * slit->localities;
+ if (slit->header.length != len) {
+ printk("ACPI 2.0 SLIT: size mismatch: %d expected, %d actual\n",
+ len, slit->header.length);
+ memset(numa_slit, 10, sizeof(numa_slit));
+ return;
+ }
+ slit_table = slit;
+}
+
+void __init
+acpi_numa_processor_affinity_init (struct acpi_table_processor_affinity *pa)
+{
+ /* record this node in proximity bitmap */
+ pxm_bit_set(pa->proximity_domain);
+
+ node_cpuid[srat_num_cpus].phys_id = (pa->apic_id << 8) | (pa->lsapic_eid);
+ /* nid should be overridden as logical node id later */
+ node_cpuid[srat_num_cpus].nid = pa->proximity_domain;
+ srat_num_cpus++;
+}
+
+void __init
+acpi_numa_memory_affinity_init (struct acpi_table_memory_affinity *ma)
+{
+ unsigned long paddr, size, hole_size, min_hole_size;
+ u8 pxm;
+ struct node_memblk_s *p, *q, *pend;
+
+ pxm = ma->proximity_domain;
+
+ /* fill node memory chunk structure */
+ paddr = ma->base_addr_hi;
+ paddr = (paddr << 32) | ma->base_addr_lo;
+ size = ma->length_hi;
+ size = (size << 32) | ma->length_lo;
+
+ if (num_memblks >= NR_MEMBLKS) {
+ printk("Too many mem chunks in SRAT. Ignoring %ld MBytes at %lx\n",
+ size/(1024*1024), paddr);
+ return;
+ }
+
+ /* Ignore disabled entries */
+ if (!ma->flags.enabled)
+ return;
+
+ /*
+ * When the chunk is not the first one in the node, check distance
+ * from the other chunks. When the hole is too huge ignore the chunk.
+ * This restriction should be removed when multiple chunks per node
+ * is supported.
+ */
+ pend = &node_memblk[num_memblks];
+ min_hole_size = 0;
+ for (p = &node_memblk[0]; p < pend; p++) {
+ if (p->nid != pxm)
+ continue;
+ if (p->start_paddr < paddr)
+ hole_size = paddr - (p->start_paddr + p->size);
+ else
+ hole_size = p->start_paddr - (paddr + size);
+
+ if (!min_hole_size || hole_size < min_hole_size)
+ min_hole_size = hole_size;
+ }
+
+#if 0 /* test */
+ if (min_hole_size) {
+ if (min_hole_size > size) {
+ printk("Too huge memory hole. Ignoring %ld MBytes at %lx\n",
+ size/(1024*1024), paddr);
+ return;
+ }
+ }
+#endif
+
+ /* record this node in proximity bitmap */
+ pxm_bit_set(pxm);
+
+ /* Insertion sort based on base address */
+ pend = &node_memblk[num_memblks];
+ for (p = &node_memblk[0]; p < pend; p++) {
+ if (paddr < p->start_paddr)
+ break;
+ }
+ if (p < pend) {
+ for (q = pend; q >= p; q--)
+ *(q + 1) = *q;
+ }
+ p->start_paddr = paddr;
+ p->size = size;
+ p->nid = pxm;
+ num_memblks++;
+}
+
+void __init
+acpi_numa_arch_fixup(void)
+{
+ int i, j, node_from, node_to;
+
+ /* calculate total number of nodes in system from PXM bitmap */
+ numnodes = 0; /* init total nodes in system */
+
+ memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map));
+ memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map));
+ for (i = 0; i < MAX_PXM_DOMAINS; i++) {
+ if (pxm_bit_test(i)) {
+ pxm_to_nid_map[i] = numnodes;
+ nid_to_pxm_map[numnodes++] = i;
+ }
+ }
+
+ /* set logical node id in memory chunk structure */
+ for (i = 0; i < num_memblks; i++)
+ node_memblk[i].nid = pxm_to_nid_map[node_memblk[i].nid];
+
+ /* assign memory bank numbers for each chunk on each node */
+ for (i = 0; i < numnodes; i++) {
+ int bank;
+
+ bank = 0;
+ for (j = 0; j < num_memblks; j++)
+ if (node_memblk[j].nid == i)
+ node_memblk[j].bank = bank++;
+ }
+
+ /* set logical node id in cpu structure */
+ for (i = 0; i < srat_num_cpus; i++)
+ node_cpuid[i].nid = pxm_to_nid_map[node_cpuid[i].nid];
+
+ printk("Number of logical nodes in system = %d\n", numnodes);
+ printk("Number of memory chunks in system = %d\n", num_memblks);
+
+ if (!slit_table) return;
+ memset(numa_slit, -1, sizeof(numa_slit));
+ for (i=0; i<slit_table->localities; i++) {
+ if (!pxm_bit_test(i))
+ continue;
+ node_from = pxm_to_nid_map[i];
+ for (j=0; j<slit_table->localities; j++) {
+ if (!pxm_bit_test(j))
+ continue;
+ node_to = pxm_to_nid_map[j];
+ node_distance(node_from, node_to) =
+ slit_table->entry[i*slit_table->localities + j];
+ }
+ }
+
+#ifdef SLIT_DEBUG
+ printk("ACPI 2.0 SLIT locality table:\n");
+ for (i = 0; i < numnodes; i++) {
+ for (j = 0; j < numnodes; j++)
+ printk("%03d ", node_distance(i,j));
+ printk("\n");
+ }
+#endif
+}
+#endif /* CONFIG_ACPI_NUMA */
+
static int __init
acpi_parse_fadt (unsigned long phys_addr, unsigned long size)
{
@@ -665,12 +855,6 @@
int __init
acpi_boot_init (char *cmdline)
{
- int result;
-
- /* Initialize the ACPI boot-time table parser */
- result = acpi_table_init(cmdline);
- if (result)
- return result;
/*
* MADT
@@ -738,6 +922,9 @@
available_cpus = 1; /* We've got at least one of these, no? */
}
smp_boot_data.cpu_count = total_cpus;
+#ifdef CONFIG_NUMA
+ build_cpu_to_node_map();
+#endif
#endif
/* Make boot-up look pretty */
printk("%d CPUs available, %d CPUs total\n", available_cpus, total_cpus);
diff -Nur linux-2.4.20-base/arch/ia64/kernel/setup.c linux-2.4.20-dcm/arch/ia64/kernel/setup.c
--- linux-2.4.20-base/arch/ia64/kernel/setup.c Fri Nov 29 08:53:09 2002
+++ linux-2.4.20-dcm/arch/ia64/kernel/setup.c Tue Mar 4 17:45:50 2003
@@ -34,6 +34,7 @@
#include <asm/ia32.h>
#include <asm/page.h>
+#include <asm/pgtable.h>
#include <asm/machvec.h>
#include <asm/processor.h>
#include <asm/sal.h>
@@ -49,16 +50,9 @@
# error "struct cpuinfo_ia64 too big!"
#endif
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-
extern char _end;
-#ifdef CONFIG_NUMA
- struct cpuinfo_ia64 *boot_cpu_data;
-#else
struct cpuinfo_ia64 _cpu_data[NR_CPUS] __attribute__ ((section ("__special_page_section")));
-#endif
unsigned long ia64_cycles_per_usec;
struct ia64_boot_param *ia64_boot_param;
@@ -110,15 +104,60 @@
#define IGNORE_PFN0 1 /* XXX fix me: ignore pfn 0 until TLB miss handler is updated... */
+#ifdef CONFIG_DISCONTIGMEM
/*
- * Free available memory based on the primitive map created from
- * the boot parameters. This routine does not assume the incoming
- * segments are sorted.
+ * efi_memmap_walk() knows nothing about layout of memory across nodes. Find
+ * out to which node a block of memory belongs. Ignore memory that we cannot
+ * identify, and split blocks that run across multiple nodes.
+ *
+ * Take this opportunity to round the start address up and the end address
+ * down to page boundaries.
*/
-static int
-free_available_memory (unsigned long start, unsigned long end, void *arg)
+void
+call_pernode_memory (unsigned long start, unsigned long end, void *arg)
+{
+ unsigned long rs, re;
+ void (*func)(unsigned long, unsigned long, int, int);
+ int i;
+
+ start = PAGE_ALIGN(start);
+ end &= PAGE_MASK;
+ if (start >= end)
+ return;
+
+ func = arg;
+
+ if (!num_memblks) {
+ /* this machine doesn't have SRAT, */
+ /* so call func with nid=0, bank=0 */
+ if (start < end)
+ (*func)(start, end - start, 0, 0);
+ return;
+ }
+
+ for (i = 0; i < num_memblks; i++) {
+ rs = max(start, node_memblk[i].start_paddr);
+ re = min(end, node_memblk[i].start_paddr+node_memblk[i].size);
+
+ if (rs < re)
+ (*func)(rs, re-rs, node_memblk[i].nid,
+ node_memblk[i].bank);
+ }
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
+/*
+ * Filter incoming memory segments based on the primitive map created from
+ * the boot parameters. Segments contained in the map are removed from the
+ * memory ranges. A caller-specified function is called with the memory
+ * ranges that remain after filtering.
+ * This routine does not assume the incoming segments are sorted.
+ */
+int
+filter_rsvd_memory (unsigned long start, unsigned long end, void *arg)
{
unsigned long range_start, range_end, prev_start;
+ void (*func)(unsigned long, unsigned long);
int i;
#if IGNORE_PFN0
@@ -132,13 +171,18 @@
* lowest possible address(walker uses virtual)
*/
prev_start = PAGE_OFFSET;
+ func = arg;
for (i = 0; i < num_rsvd_regions; ++i) {
- range_start = MAX(start, prev_start);
- range_end = MIN(end, rsvd_region[i].start);
+ range_start = max(start, prev_start);
+ range_end = min(end, rsvd_region[i].start);
if (range_start < range_end)
- free_bootmem(__pa(range_start), range_end - range_start);
+#ifdef CONFIG_DISCONTIGMEM
+ call_pernode_memory(__pa(range_start), __pa(range_end), func);
+#else
+ (*func)(__pa(range_start), range_end - range_start);
+#endif
/* nothing more available in this segment */
if (range_end == end) return 0;
@@ -150,6 +194,7 @@
}
+#ifndef CONFIG_DISCONTIGMEM
/*
* Find a place to put the bootmap and return its starting address in bootmap_start.
* This address must be page-aligned.
@@ -171,8 +216,8 @@
free_start = PAGE_OFFSET;
for (i = 0; i < num_rsvd_regions; i++) {
- range_start = MAX(start, free_start);
- range_end = MIN(end, rsvd_region[i].start & PAGE_MASK);
+ range_start = max(start, free_start);
+ range_end = min(end, rsvd_region[i].start & PAGE_MASK);
if (range_end <= range_start) continue; /* skip over empty range */
@@ -188,6 +233,7 @@
}
return 0;
}
+#endif /* CONFIG_DISCONTIGMEM */
static void
sort_regions (struct rsvd_region *rsvd_region, int max)
@@ -252,6 +298,14 @@
sort_regions(rsvd_region, num_rsvd_regions);
+#ifdef CONFIG_DISCONTIGMEM
+ {
+ extern void discontig_mem_init(void);
+ bootmap_size = max_pfn = 0; /* stop gcc warnings */
+ discontig_mem_init();
+ }
+#else /* !CONFIG_DISCONTIGMEM */
+
/* first find highest page frame number */
max_pfn = 0;
efi_memmap_walk(find_max_pfn, &max_pfn);
@@ -268,8 +322,9 @@
bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn);
/* Free all available memory, then mark bootmem-map as being in use. */
- efi_memmap_walk(free_available_memory, 0);
+ efi_memmap_walk(filter_rsvd_memory, free_bootmem);
reserve_bootmem(bootmap_start, bootmap_size);
+#endif /* !CONFIG_DISCONTIGMEM */
#ifdef CONFIG_BLK_DEV_INITRD
if (ia64_boot_param->initrd_start) {
@@ -296,6 +351,16 @@
efi_init();
+#ifdef CONFIG_ACPI_BOOT
+ /* Initialize the ACPI boot-time table parser */
+ acpi_table_init(*cmdline_p);
+
+#ifdef CONFIG_ACPI_NUMA
+ acpi_numa_init();
+#endif
+
+#endif /* CONFIG_ACPI_BOOT */
+
iomem_resource.end = ~0UL; /* FIXME probably belongs elsewhere */
find_memory();
@@ -537,40 +602,11 @@
pal_vm_info_2_u_t vmi;
unsigned int max_ctx;
struct cpuinfo_ia64 *my_cpu_data;
-#ifdef CONFIG_NUMA
- int cpu, order;
- /*
- * If NUMA is configured, the cpu_data array is not preallocated. The boot cpu
- * allocates entries for every possible cpu. As the remaining cpus come online,
- * they reallocate a new cpu_data structure on their local node. This extra work
- * is required because some boot code references all cpu_data structures
- * before the cpus are actually started.
- */
- if (!boot_cpu_data) {
- my_cpu_data = alloc_bootmem_pages_node(NODE_DATA(numa_node_id()),
- sizeof(struct cpuinfo_ia64));
- boot_cpu_data = my_cpu_data;
- my_cpu_data->cpu_data[0] = my_cpu_data;
- for (cpu = 1; cpu < NR_CPUS; ++cpu)
- my_cpu_data->cpu_data[cpu]
- = alloc_bootmem_pages_node(NODE_DATA(numa_node_id()),
- sizeof(struct cpuinfo_ia64));
- for (cpu = 1; cpu < NR_CPUS; ++cpu)
- memcpy(my_cpu_data->cpu_data[cpu]->cpu_data,
- my_cpu_data->cpu_data, sizeof(my_cpu_data->cpu_data));
- } else {
- order = get_order(sizeof(struct cpuinfo_ia64));
- my_cpu_data = page_address(alloc_pages_node(numa_node_id(), GFP_KERNEL, order));
- memcpy(my_cpu_data, boot_cpu_data->cpu_data[smp_processor_id()],
- sizeof(struct cpuinfo_ia64));
- __free_pages(virt_to_page(boot_cpu_data->cpu_data[smp_processor_id()]),
- order);
- for (cpu = 0; cpu < NR_CPUS; ++cpu)
- boot_cpu_data->cpu_data[cpu]->cpu_data[smp_processor_id()] = my_cpu_data;
- }
-#else
my_cpu_data = cpu_data(smp_processor_id());
+
+#ifdef CONFIG_DISCONTIGMEM
+ my_cpu_data->node_data = get_node_data_ptr();
#endif
/*
diff -Nur linux-2.4.20-base/arch/ia64/kernel/smpboot.c linux-2.4.20-dcm/arch/ia64/kernel/smpboot.c
--- linux-2.4.20-base/arch/ia64/kernel/smpboot.c Mon Mar 3 10:24:21 2003
+++ linux-2.4.20-dcm/arch/ia64/kernel/smpboot.c Mon Mar 3 10:55:12 2003
@@ -575,3 +575,44 @@
smp_num_cpus = 1;
}
}
+
+
+#ifdef CONFIG_NUMA
+
+/* on which node is each logical CPU (one cacheline even for 64 CPUs) */
+volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned;
+/* which logical CPUs are on which nodes */
+volatile unsigned long node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned;
+
+/*
+ * Build cpu to node mapping and initialize the per node cpu masks.
+ */
+void __init
+build_cpu_to_node_map (void)
+{
+ int cpu, i, node;
+
+ for(node=0; node<MAX_NUMNODES; node++)
+ node_to_cpu_mask[node] = 0;
+ for(cpu = 0; cpu < NR_CPUS; ++cpu) {
+ /*
+ * All Itanium NUMA platforms I know use ACPI, so maybe we
+ * can drop this ifdef completely. [EF]
+ */
+#ifdef CONFIG_ACPI_NUMA
+ node = -1;
+ for (i = 0; i < NR_CPUS; ++i)
+ if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
+ node = node_cpuid[i].nid;
+ break;
+ }
+#else
+# error Fixme: Dunno how to build CPU-to-node map.
+#endif
+ cpu_to_node_map[cpu] = node;
+ if (node >= 0)
+ node_to_cpu_mask[node] |= (1UL << cpu);
+ }
+}
+
+#endif /* CONFIG_NUMA */
diff -Nur linux-2.4.20-base/arch/ia64/mm/Makefile linux-2.4.20-dcm/arch/ia64/mm/Makefile
--- linux-2.4.20-base/arch/ia64/mm/Makefile Mon Mar 3 10:24:21 2003
+++ linux-2.4.20-dcm/arch/ia64/mm/Makefile Mon Mar 3 10:55:12 2003
@@ -12,5 +12,7 @@
export-objs := init.o
obj-y := init.o fault.o tlb.o extable.o
+obj-$(CONFIG_NUMA) += numa.o
+obj-$(CONFIG_DISCONTIGMEM) += discontig.o
include $(TOPDIR)/Rules.make
diff -Nur linux-2.4.20-base/arch/ia64/mm/discontig.c linux-2.4.20-dcm/arch/ia64/mm/discontig.c
--- linux-2.4.20-base/arch/ia64/mm/discontig.c Thu Jan 1 09:00:00 1970
+++ linux-2.4.20-dcm/arch/ia64/mm/discontig.c Tue Mar 4 17:15:37 2003
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (c) 2001 Intel Corp.
+ * Copyright (c) 2001 Tony Luck <tony.luck@intel.com>
+ * Copyright (c) 2002 NEC Corp.
+ * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
+ */
+
+/*
+ * Platform initialization for Discontig Memory
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+
+
+/*
+ * Round an address upward to the next multiple of GRANULE size.
+ */
+#define GRANULEROUNDUP(n) (((n)+IA64_GRANULE_SIZE-1) & ~(IA64_GRANULE_SIZE-1))
+
+static struct ia64_node_data *node_data[NR_NODES];
+static long boot_pg_data[8*NR_NODES+sizeof(pg_data_t)] __initdata;
+static pg_data_t *pg_data_ptr[NR_NODES] __initdata;
+static bootmem_data_t bdata[NR_NODES][NR_BANKS_PER_NODE+1] __initdata;
+
+extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg);
+
+/*
+ * Return the compact node number of this cpu. Used prior to
+ * setting up the cpu_data area.
+ * Note - not fast, intended for boot use only!!
+ */
+int
+boot_get_local_nodeid(void)
+{
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++)
+ if (node_cpuid[i].phys_id == hard_smp_processor_id())
+ return node_cpuid[i].nid;
+
+ /* node info missing, so nid should be 0.. */
+ return 0;
+}
+
+/*
+ * Return a pointer to the pg_data structure for a node.
+ * This function is used ONLY in early boot before the cpu_data
+ * structure is available.
+ */
+pg_data_t* __init
+boot_get_pg_data_ptr(long node)
+{
+ return pg_data_ptr[node];
+}
+
+
+/*
+ * Return a pointer to the node data for the current node.
+ * (boottime initialization only)
+ */
+struct ia64_node_data *
+get_node_data_ptr(void)
+{
+ return node_data[boot_get_local_nodeid()];
+}
+
+/*
+ * We allocate one of the bootmem_data_t structs for each piece of memory
+ * that we wish to treat as a contiguous block. Each such block must start
+ * on a BANKSIZE boundary. Multiple banks per node is not supported.
+ */
+static int __init
+build_maps(unsigned long pstart, unsigned long length, int node)
+{
+ bootmem_data_t *bdp;
+ unsigned long cstart, epfn;
+
+ bdp = pg_data_ptr[node]->bdata;
+ epfn = GRANULEROUNDUP(pstart + length) >> PAGE_SHIFT;
+ cstart = pstart & ~(BANKSIZE - 1);
+
+ if (!bdp->node_low_pfn) {
+ bdp->node_boot_start = cstart;
+ bdp->node_low_pfn = epfn;
+ } else {
+ bdp->node_boot_start = min(cstart, bdp->node_boot_start);
+ bdp->node_low_pfn = max(epfn, bdp->node_low_pfn);
+ }
+
+ min_low_pfn = min(min_low_pfn, bdp->node_boot_start>>PAGE_SHIFT);
+ max_low_pfn = max(max_low_pfn, bdp->node_low_pfn);
+
+ return 0;
+}
+
+/*
+ * Find space on each node for the bootmem map.
+ *
+ * Called by efi_memmap_walk to find boot memory on each node. Note that
+ * only blocks that are free are passed to this routine (currently filtered by
+ * filter_rsvd_memory).
+ */
+static int __init
+find_bootmap_space(unsigned long pstart, unsigned long length, int node)
+{
+ unsigned long mapsize, pages, epfn;
+ bootmem_data_t *bdp;
+
+ epfn = (pstart + length) >> PAGE_SHIFT;
+ bdp = &pg_data_ptr[node]->bdata[0];
+
+ if (pstart < bdp->node_boot_start || epfn > bdp->node_low_pfn)
+ return 0;
+
+ if (!bdp->node_bootmem_map) {
+ pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT);
+ mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
+ if (length > mapsize) {
+ init_bootmem_node(
+ BOOT_NODE_DATA(node),
+ pstart>>PAGE_SHIFT,
+ bdp->node_boot_start>>PAGE_SHIFT,
+ bdp->node_low_pfn);
+ }
+
+ }
+
+ return 0;
+}
+
+
+/*
+ * Free available memory to the bootmem allocator.
+ *
+ * Note that only blocks that are free are passed to this routine (currently
+ * filtered by filter_rsvd_memory).
+ *
+ */
+static int __init
+discontig_free_bootmem_node(unsigned long pstart, unsigned long length, int node)
+{
+ free_bootmem_node(BOOT_NODE_DATA(node), pstart, length);
+
+ return 0;
+}
+
+
+/*
+ * Reserve the space used by the bootmem maps.
+ */
+static void __init
+discontig_reserve_bootmem(void)
+{
+ int node;
+ unsigned long mapbase, mapsize, pages;
+ bootmem_data_t *bdp;
+
+ for (node = 0; node < numnodes; node++) {
+ bdp = BOOT_NODE_DATA(node)->bdata;
+
+ pages = bdp->node_low_pfn - (bdp->node_boot_start>>PAGE_SHIFT);
+ mapsize = bootmem_bootmap_pages(pages) << PAGE_SHIFT;
+ mapbase = __pa(bdp->node_bootmem_map);
+ reserve_bootmem_node(BOOT_NODE_DATA(node), mapbase, mapsize);
+ }
+}
+
+/*
+ * Allocate per node tables.
+ * - the pg_data structure is allocated on each node. This minimizes offnode
+ * memory references
+ * - the node data is allocated & initialized. Portions of this structure are read-only (after
+ * boot) and contain node-local pointers to useful data structures located on
+ * other nodes.
+ *
+ * We also switch to using the "real" pg_data structures at this point. Earlier in boot, we
+ * use a different structure. The only use for pg_data prior to this point in boot is to get
+ * the pointer to the bdata for the node.
+ */
+static void __init
+allocate_pernode_structures(void)
+{
+ pg_data_t *pgdat=0, *new_pgdat_list=0;
+ int node, mynode;
+
+ mynode = boot_get_local_nodeid();
+ for (node = numnodes - 1; node >= 0 ; node--) {
+ node_data[node] = alloc_bootmem_node(BOOT_NODE_DATA(node), sizeof (struct ia64_node_data));
+ pgdat = __alloc_bootmem_node(BOOT_NODE_DATA(node), sizeof(pg_data_t), SMP_CACHE_BYTES, 0);
+ pgdat->bdata = &(bdata[node][0]);
+ pg_data_ptr[node] = pgdat;
+ pgdat->node_next = new_pgdat_list;
+ new_pgdat_list = pgdat;
+ }
+
+ memcpy(node_data[mynode]->pg_data_ptrs, pg_data_ptr, sizeof(pg_data_ptr));
+ memcpy(node_data[mynode]->node_data_ptrs, node_data, sizeof(node_data));
+
+ pgdat_list = new_pgdat_list;
+}
+
+/*
+ * Called early in boot to setup the boot memory allocator, and to
+ * allocate the node-local pg_data & node-directory data structures..
+ */
+void __init
+discontig_mem_init(void)
+{
+ int node;
+
+ if (numnodes == 0) {
+ printk("node info missing!\n");
+ numnodes = 1;
+ }
+
+ for (node = 0; node < numnodes; node++) {
+ pg_data_ptr[node] = (pg_data_t*) &boot_pg_data[node];
+ pg_data_ptr[node]->bdata = &bdata[node][0];
+ }
+
+ min_low_pfn = -1;
+ max_low_pfn = 0;
+
+ efi_memmap_walk(filter_rsvd_memory, build_maps);
+ efi_memmap_walk(filter_rsvd_memory, find_bootmap_space);
+ efi_memmap_walk(filter_rsvd_memory, discontig_free_bootmem_node);
+ discontig_reserve_bootmem();
+ allocate_pernode_structures();
+}
+
+/*
+ * Initialize the paging system.
+ * - determine sizes of each node
+ * - initialize the paging system for the node
+ * - build the nodedir for the node. This contains pointers to
+ * the per-bank mem_map entries.
+ * - fix the page struct "virtual" pointers. These are bank specific
+ * values that the paging system doesn't understand.
+ * - replicate the nodedir structure to other nodes
+ */
+
+void __init
+discontig_paging_init(void)
+{
+ int node, mynode;
+ unsigned long max_dma, zones_size[MAX_NR_ZONES];
+ unsigned long kaddr, ekaddr, bid;
+ struct page *page;
+ bootmem_data_t *bdp;
+
+ max_mapnr = 0;
+ max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+ mynode = boot_get_local_nodeid();
+ for (bid = 0; bid < NR_BANKS; bid++) {
+ node_data[mynode]->node_id_map[bid] = -1;
+ node_data[mynode]->bank_mem_map_base[bid] = NULL;
+ }
+
+ for (node = 0; node < numnodes; node++) {
+ long pfn, startpfn;
+
+ memset(zones_size, 0, sizeof(zones_size));
+
+ startpfn = -1;
+ bdp = BOOT_NODE_DATA(node)->bdata;
+ pfn = bdp->node_boot_start >> PAGE_SHIFT;
+ if (startpfn == -1)
+ startpfn = pfn;
+ if (pfn > max_dma)
+ zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - pfn);
+ else if (bdp->node_low_pfn < max_dma)
+ zones_size[ZONE_DMA] += (bdp->node_low_pfn - pfn);
+ else {
+ zones_size[ZONE_DMA] += (max_dma - pfn);
+ zones_size[ZONE_NORMAL] += (bdp->node_low_pfn - max_dma);
+ }
+
+ free_area_init_node(node, NODE_DATA(node), NULL, zones_size, startpfn<<PAGE_SHIFT, 0);
+
+ page = NODE_DATA(node)->node_mem_map;
+
+ bdp = BOOT_NODE_DATA(node)->bdata;
+
+ kaddr = (unsigned long)__va(bdp->node_boot_start);
+ ekaddr = (unsigned long)__va(bdp->node_low_pfn << PAGE_SHIFT);
+ while (kaddr < ekaddr) {
+ if (paddr_to_nid(__pa(kaddr)) == node) {
+ bid = BANK_MEM_MAP_INDEX(kaddr);
+ node_data[mynode]->node_id_map[bid] = node;
+ node_data[mynode]->bank_mem_map_base[bid] = page;
+ printk("addr(%lx), bank(%d) -> node(%d), page(%lx)\n", kaddr, bid, node, page);
+ }
+ kaddr += BANKSIZE;
+ page += BANKSIZE/PAGE_SIZE;
+ }
+ max_mapnr = max(max_mapnr, page - mem_map);
+ }
+
+ /*
+ * Finish setting up the node data for this node, then copy it to the other nodes.
+ */
+ for (node=0; node < numnodes; node++)
+ if (mynode != node) {
+ memcpy(node_data[node], node_data[mynode], sizeof(struct ia64_node_data));
+ node_data[node]->node = node;
+ }
+}
+
diff -Nur linux-2.4.20-base/arch/ia64/mm/init.c linux-2.4.20-dcm/arch/ia64/mm/init.c
--- linux-2.4.20-base/arch/ia64/mm/init.c Mon Mar 3 10:24:21 2003
+++ linux-2.4.20-dcm/arch/ia64/mm/init.c Mon Mar 3 10:55:12 2003
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/efi.h>
+#include <linux/mmzone.h>
#include <asm/bitops.h>
#include <asm/dma.h>
@@ -38,7 +39,7 @@
unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
#define LARGE_GAP 0x40000000 /* Use virtual mem map if a hole is > than this */
-static unsigned long totalram_pages;
+static unsigned long totalram_pages, reserved_pages;
unsigned long vmalloc_end = VMALLOC_END_INIT;
@@ -357,6 +358,7 @@
ia64_tlb_init();
}
+#ifndef CONFIG_DISCONTIGMEM
static int
create_mem_map_page_table (u64 start, u64 end, void *arg)
{
@@ -456,6 +458,7 @@
return page_to_phys(end);
}
+#endif /* CONFIG_DISCONTIGMEM */
static int
count_dma_pages (u64 start, u64 end, void *arg)
@@ -484,7 +487,6 @@
return 0;
}
-#ifndef CONFIG_DISCONTIGMEM
static int
find_largest_hole(u64 start, u64 end, void *arg)
{
@@ -498,20 +500,27 @@
last_end = end;
return 0;
}
-#endif
/*
* Set up the page tables.
*/
+#ifdef CONFIG_DISCONTIGMEM
+void
+paging_init (void)
+{
+ extern void discontig_paging_init(void);
+
+ discontig_paging_init();
+ efi_memmap_walk(count_pages, &num_physpages);
+}
+#else /* !CONFIG_DISCONTIGMEM */
void
paging_init (void)
{
unsigned long max_dma;
unsigned long zones_size[MAX_NR_ZONES];
unsigned long zholes_size[MAX_NR_ZONES];
-#ifndef CONFIG_DISCONTIGMEM
unsigned long max_gap;
-#endif
/* initialize mem_map[] */
@@ -539,9 +548,6 @@
}
}
-#ifdef CONFIG_DISCONTIGMEM
- free_area_init_node(0, NULL, NULL, zones_size, 0, zholes_size);
-#else
max_gap = 0;
efi_memmap_walk(find_largest_hole, (u64 *)&max_gap);
@@ -562,20 +568,19 @@
free_area_init_node(0, NULL, vmem_map, zones_size, 0, zholes_size);
printk("Virtual mem_map starts at 0x%p\n", mem_map);
}
-#endif
}
+#endif /* !CONFIG_DISCONTIGMEM */
static int
-count_reserved_pages (u64 start, u64 end, void *arg)
+count_reserved_pages (u64 start, u64 end)
{
unsigned long num_reserved = 0;
- unsigned long *count = arg;
struct page *pg;
for (pg = virt_to_page((void *)start); pg < virt_to_page((void *)end); ++pg)
if (PageReserved(pg))
++num_reserved;
- *count += num_reserved;
+ reserved_pages += num_reserved;
return 0;
}
@@ -583,8 +588,11 @@
mem_init (void)
{
extern char __start_gate_section[];
- long reserved_pages, codesize, datasize, initsize;
+ long codesize, datasize, initsize;
unsigned long num_pgt_pages;
+ pg_data_t *pgdat;
+ extern int filter_rsvd_memory (unsigned long start, unsigned long end, void *arg);
+
#ifdef CONFIG_PCI
/*
@@ -595,16 +603,19 @@
platform_pci_dma_init();
#endif
+#ifndef CONFIG_DISCONTIGMEM
if (!mem_map)
BUG();
max_mapnr = max_low_pfn;
+#endif
high_memory = __va(max_low_pfn * PAGE_SIZE);
- totalram_pages += free_all_bootmem();
+ for_each_pgdat(pgdat)
+ totalram_pages += free_all_bootmem_node(pgdat);
reserved_pages = 0;
- efi_memmap_walk(count_reserved_pages, &reserved_pages);
+ efi_memmap_walk(filter_rsvd_memory, count_reserved_pages);
codesize = (unsigned long) &_etext - (unsigned long) &_stext;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
diff -Nur linux-2.4.20-base/arch/ia64/mm/numa.c linux-2.4.20-dcm/arch/ia64/mm/numa.c
--- linux-2.4.20-base/arch/ia64/mm/numa.c Thu Jan 1 09:00:00 1970
+++ linux-2.4.20-dcm/arch/ia64/mm/numa.c Mon Mar 3 10:55:12 2003
@@ -0,0 +1,46 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * This file contains NUMA specific variables and functions which can
+ * be split away from DISCONTIGMEM and are used on NUMA machines with
+ * contiguous memory.
+ *
+ * 2002/08/07 Erich Focht <efocht@ess.nec.de>
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <asm/numa.h>
+
+/*
+ * The following structures are usually initialized by ACPI or
+ * similar mechanisms and describe the NUMA characteristics of the machine.
+ */
+int num_memblks = 0;
+struct node_memblk_s node_memblk[NR_MEMBLKS];
+struct node_cpuid_s node_cpuid[NR_CPUS];
+/*
+ * This is a matrix with "distances" between nodes, they should be
+ * proportional to the memory access latency ratios.
+ */
+u8 numa_slit[NR_NODES * NR_NODES];
+
+/* Identify which cnode a physical address resides on */
+int
+paddr_to_nid(unsigned long paddr)
+{
+ int i;
+
+ for (i = 0; i < num_memblks; i++)
+ if (paddr >= node_memblk[i].start_paddr &&
+ paddr < node_memblk[i].start_paddr + node_memblk[i].size)
+ break;
+
+ return (i < num_memblks) ? node_memblk[i].nid : -1;
+}
diff -Nur linux-2.4.20-base/drivers/acpi/Config.in linux-2.4.20-dcm/drivers/acpi/Config.in
--- linux-2.4.20-base/drivers/acpi/Config.in Mon Mar 3 10:24:21 2003
+++ linux-2.4.20-dcm/drivers/acpi/Config.in Mon Mar 3 10:55:12 2003
@@ -36,6 +36,9 @@
tristate ' Fan' CONFIG_ACPI_FAN
tristate ' Processor' CONFIG_ACPI_PROCESSOR
dep_tristate ' Thermal Zone' CONFIG_ACPI_THERMAL $CONFIG_ACPI_PROCESSOR
+ if [ "$CONFIG_NUMA" = "y" ]; then
+ bool ' NUMA support' CONFIG_ACPI_NUMA
+ fi
bool ' Debug Statements' CONFIG_ACPI_DEBUG
fi
@@ -119,6 +122,9 @@
tristate ' Fan' CONFIG_ACPI_FAN
tristate ' Processor' CONFIG_ACPI_PROCESSOR
dep_tristate ' Thermal Zone' CONFIG_ACPI_THERMAL $CONFIG_ACPI_PROCESSOR
+ if [ "$CONFIG_NUMA" = "y" ]; then
+ bool ' NUMA support' CONFIG_ACPI_NUMA
+ fi
bool ' Debug Statements' CONFIG_ACPI_DEBUG
endmenu
fi
diff -Nur linux-2.4.20-base/drivers/acpi/Makefile linux-2.4.20-dcm/drivers/acpi/Makefile
--- linux-2.4.20-base/drivers/acpi/Makefile Mon Mar 3 10:24:21 2003
+++ linux-2.4.20-dcm/drivers/acpi/Makefile Mon Mar 3 10:55:12 2003
@@ -50,6 +50,7 @@
obj-$(CONFIG_ACPI_PROCESSOR) += processor.o
obj-$(CONFIG_ACPI_THERMAL) += thermal.o
obj-$(CONFIG_ACPI_SYSTEM) += system.o
+obj-$(CONFIG_ACPI_NUMA) += numa.o
endif
include $(TOPDIR)/Rules.make
diff -Nur linux-2.4.20-base/drivers/acpi/numa.c linux-2.4.20-dcm/drivers/acpi/numa.c
--- linux-2.4.20-base/drivers/acpi/numa.c Thu Jan 1 09:00:00 1970
+++ linux-2.4.20-dcm/drivers/acpi/numa.c Mon Mar 3 10:55:12 2003
@@ -0,0 +1,186 @@
+/*
+ * acpi_numa.c - ACPI NUMA support
+ *
+ * Copyright (C) 2002 Takayoshi Kochi <t-kouchi@cq.jp.nec.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/acpi.h>
+#include "acpi_bus.h"
+
+extern int __init acpi_table_parse_madt_family (enum acpi_table_id id, unsigned long madt_size, int entry_id, acpi_madt_entry_handler handler);
+
+void __init
+acpi_table_print_srat_entry (
+ acpi_table_entry_header *header)
+{
+ if (!header)
+ return;
+
+ switch (header->type) {
+
+ case ACPI_SRAT_PROCESSOR_AFFINITY:
+ {
+ struct acpi_table_processor_affinity *p =
+ (struct acpi_table_processor_affinity*) header;
+ printk(KERN_INFO "SRAT Processor (id[0x%02x] eid[0x%02x]) in proximity domain %d %s\n",
+ p->apic_id, p->lsapic_eid, p->proximity_domain,
+ p->flags.enabled?"enabled":"disabled");
+ }
+ break;
+
+ case ACPI_SRAT_MEMORY_AFFINITY:
+ {
+ struct acpi_table_memory_affinity *p =
+ (struct acpi_table_memory_affinity*) header;
+ printk(KERN_INFO "SRAT Memory (0x%08x%08x length 0x%08x%08x type 0x%x) in proximity domain %d %s%s\n",
+ p->base_addr_hi, p->base_addr_lo, p->length_hi, p->length_lo,
+ p->memory_type, p->proximity_domain,
+ p->flags.enabled ? "enabled" : "disabled",
+ p->flags.hot_pluggable ? " hot-pluggable" : "");
+ }
+ break;
+
+ default:
+ printk(KERN_WARNING "Found unsupported SRAT entry (type = 0x%x)\n",
+ header->type);
+ break;
+ }
+}
+
+
+static int __init
+acpi_parse_slit (unsigned long phys_addr, unsigned long size)
+{
+ struct acpi_table_slit *slit;
+ u32 localities;
+
+ if (!phys_addr || !size)
+ return -EINVAL;
+
+ slit = (struct acpi_table_slit *) __va(phys_addr);
+
+ /* downcast just for %llu vs %lu for i386/ia64 */
+ localities = (u32) slit->localities;
+
+ printk(KERN_INFO "SLIT localities %ux%u\n", localities, localities);
+
+ acpi_numa_slit_init(slit);
+
+ return 0;
+}
+
+
+static int __init
+acpi_parse_processor_affinity (acpi_table_entry_header *header)
+{
+ struct acpi_table_processor_affinity *processor_affinity = NULL;
+
+ processor_affinity = (struct acpi_table_processor_affinity*) header;
+ if (!processor_affinity)
+ return -EINVAL;
+
+ acpi_table_print_srat_entry(header);
+
+ /* let architecture-dependent part to do it */
+ acpi_numa_processor_affinity_init(processor_affinity);
+
+ return 0;
+}
+
+
+static int __init
+acpi_parse_memory_affinity (acpi_table_entry_header *header)
+{
+ struct acpi_table_memory_affinity *memory_affinity = NULL;
+
+ memory_affinity = (struct acpi_table_memory_affinity*) header;
+ if (!memory_affinity)
+ return -EINVAL;
+
+ acpi_table_print_srat_entry(header);
+
+ /* let architecture-dependent part to do it */
+ acpi_numa_memory_affinity_init(memory_affinity);
+
+ return 0;
+}
+
+
+static int __init
+acpi_parse_srat (unsigned long phys_addr, unsigned long size)
+{
+ struct acpi_table_srat *srat = NULL;
+
+ if (!phys_addr || !size)
+ return -EINVAL;
+
+ srat = (struct acpi_table_srat *) __va(phys_addr);
+
+ printk(KERN_INFO "SRAT revision %d\n", srat->table_revision);
+
+ return 0;
+}
+
+
+int __init
+acpi_table_parse_srat (
+ enum acpi_srat_entry_id id,
+ acpi_madt_entry_handler handler)
+{
+ return acpi_table_parse_madt_family(ACPI_SRAT, sizeof(struct acpi_table_srat),
+ id, handler);
+}
+
+
+int __init
+acpi_numa_init()
+{
+ int result;
+
+ /* SRAT: Static Resource Affinity Table */
+ result = acpi_table_parse(ACPI_SRAT, acpi_parse_srat);
+
+ if (result > 0) {
+ result = acpi_table_parse_srat(ACPI_SRAT_PROCESSOR_AFFINITY,
+ acpi_parse_processor_affinity);
+ result = acpi_table_parse_srat(ACPI_SRAT_MEMORY_AFFINITY,
+ acpi_parse_memory_affinity);
+ } else {
+ /* FIXME */
+ printk("Warning: acpi_table_parse(ACPI_SRAT) returned %d!\n",result);
+ }
+
+ /* SLIT: System Locality Information Table */
+ result = acpi_table_parse(ACPI_SLIT, acpi_parse_slit);
+ if (result < 1) {
+ /* FIXME */
+ printk("Warning: acpi_table_parse(ACPI_SLIT) returned %d!\n",result);
+ }
+
+ acpi_numa_arch_fixup();
+ return 0;
+}
diff -Nur linux-2.4.20-base/drivers/acpi/tables.c linux-2.4.20-dcm/drivers/acpi/tables.c
--- linux-2.4.20-base/drivers/acpi/tables.c Mon Mar 3 10:24:22 2003
+++ linux-2.4.20-dcm/drivers/acpi/tables.c Mon Mar 3 10:55:12 2003
@@ -224,11 +224,13 @@
int __init
-acpi_table_parse_madt (
+acpi_table_parse_madt_family (
enum acpi_table_id id,
+ unsigned long madt_size,
+ int entry_id,
acpi_madt_entry_handler handler)
{
- struct acpi_table_madt *madt = NULL;
+ void *madt = NULL;
acpi_table_entry_header *entry = NULL;
unsigned long count = 0;
unsigned long madt_end = 0;
@@ -240,19 +242,21 @@
/* Locate the MADT (if exists). There should only be one. */
for (i = 0; i < sdt.count; i++) {
- if (sdt.entry[i].id != ACPI_APIC)
+ if (sdt.entry[i].id != id)
continue;
- madt = (struct acpi_table_madt *)
+ madt = (void *)
__acpi_map_table(sdt.entry[i].pa, sdt.entry[i].size);
if (!madt) {
- printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+ printk(KERN_WARNING PREFIX "Unable to map %s\n",
+ acpi_table_signatures[id]);
return -ENODEV;
}
break;
}
if (!madt) {
- printk(KERN_WARNING PREFIX "MADT not present\n");
+ printk(KERN_WARNING PREFIX "%s not present\n",
+ acpi_table_signatures[id]);
return -ENODEV;
}
@@ -261,18 +265,28 @@
/* Parse all entries looking for a match. */
entry = (acpi_table_entry_header *)
- ((unsigned long) madt + sizeof(struct acpi_table_madt));
+ ((unsigned long) madt + madt_size);
while (((unsigned long) entry) < madt_end) {
- if (entry->type == id) {
+ if (entry->type == entry_id) {
count++;
handler(entry);
}
entry = (acpi_table_entry_header *)
- ((unsigned long) entry += entry->length);
+ ((unsigned long) entry + entry->length);
}
return count;
+}
+
+
+int __init
+acpi_table_parse_madt (
+ enum acpi_madt_entry_id id,
+ acpi_madt_entry_handler handler)
+{
+ return acpi_table_parse_madt_family(ACPI_APIC, sizeof(struct acpi_table_madt),
+ id, handler);
}
diff -Nur linux-2.4.20-base/include/asm-ia64/acpi.h linux-2.4.20-dcm/include/asm-ia64/acpi.h
--- linux-2.4.20-base/include/asm-ia64/acpi.h Mon Mar 3 10:24:23 2003
+++ linux-2.4.20-dcm/include/asm-ia64/acpi.h Tue Mar 4 17:40:04 2003
@@ -97,17 +97,18 @@
} while (0)
const char *acpi_get_sysname (void);
-int acpi_boot_init (char *cdline);
int acpi_request_vector (u32 int_type);
int acpi_get_prt (struct pci_vector_struct **vectors, int *count);
int acpi_get_interrupt_model (int *type);
int acpi_irq_to_vector (u32 irq);
-#ifdef CONFIG_DISCONTIGMEM
-#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
-#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
-#define MAX_PXM_DOMAINS (256)
-#endif /* CONFIG_DISCONTIGMEM */
+#ifdef CONFIG_ACPI_NUMA
+#include <asm/numa.h>
+/* Proximity bitmap length; _PXM is at most 255 (8 bit)*/
+#define MAX_PXM_DOMAINS (256)
+extern int __initdata pxm_to_nid_map[MAX_PXM_DOMAINS];
+extern int __initdata nid_to_pxm_map[NR_NODES];
+#endif
#endif /*__KERNEL__*/
diff -Nur linux-2.4.20-base/include/asm-ia64/mmzone.h linux-2.4.20-dcm/include/asm-ia64/mmzone.h
--- linux-2.4.20-base/include/asm-ia64/mmzone.h Thu Jan 1 09:00:00 1970
+++ linux-2.4.20-dcm/include/asm-ia64/mmzone.h Tue Mar 4 17:40:04 2003
@@ -0,0 +1,141 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (c) 2002 NEC Corp.
+ * Copyright (c) 2002 Erich Focht <efocht@ess.nec.de>
+ * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
+ */
+#ifndef _ASM_IA64_MMZONE_H
+#define _ASM_IA64_MMZONE_H
+
+#include <linux/config.h>
+#include <linux/init.h>
+
+/*
+ * Given a kaddr, find the base mem_map address for the start of the mem_map
+ * entries for the bank containing the kaddr.
+ */
+#define BANK_MEM_MAP_BASE(kaddr) local_node_data->bank_mem_map_base[BANK_MEM_MAP_INDEX(kaddr)]
+
+/*
+ * Given a kaddr, this macro return the relative map number
+ * within the bank.
+ */
+#define BANK_MAP_NR(kaddr) (BANK_OFFSET(kaddr) >> PAGE_SHIFT)
+
+/*
+ * Given a pte, this macro returns a pointer to the page struct for the pte.
+ */
+#define pte_page(pte) virt_to_page(PAGE_OFFSET | (pte_val(pte)&_PFN_MASK))
+
+/*
+ * Determine if a kaddr is a valid memory address of memory that
+ * actually exists.
+ *
+ * The check consists of 2 parts:
+ * - verify that the address is a region 7 address & does not
+ * contain any bits that preclude it from being a valid platform
+ * memory address
+ * - verify that the chunk actually exists.
+ *
+ * Note that IO addresses are NOT considered valid addresses.
+ *
+ * Note, many platforms can simply check if kaddr exceeds a specific size.
+ * (However, this won't work on SGI platforms since IO space is embedded
+ * within the range of valid memory addresses & nodes have holes in the
+ * address range between banks).
+ */
+#define kern_addr_valid(kaddr) ({long _kav=(long)(kaddr); \
+ VALID_MEM_KADDR(_kav);})
+
+/*
+ * Given a kaddr, return a pointer to the page struct for the page.
+ * If the kaddr does not represent RAM memory that potentially exists, return
+ * a pointer the page struct for max_mapnr. IO addresses will
+ * return the page for max_mapnr. Addresses in unpopulated RAM banks may
+ * return undefined results OR may panic the system.
+ *
+ */
+#define virt_to_page(kaddr) ({long _kvtp=(long)(kaddr); \
+ (VALID_MEM_KADDR(_kvtp)) \
+ ? BANK_MEM_MAP_BASE(_kvtp) + BANK_MAP_NR(_kvtp) \
+ : NULL;})
+
+/*
+ * Given a page struct entry, return the physical address that the page struct represents.
+ * Since IA64 has all memory in the DMA zone, the following works:
+ */
+#define page_to_phys(page) __pa(page_address(page))
+
+#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map)
+
+#define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn)
+
+#define pfn_to_page(pfn) (struct page *)(node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn)))
+
+#define pfn_to_nid(pfn) local_node_data->node_id_map[(pfn << PAGE_SHIFT) >> DIG_BANKSHIFT]
+
+#define page_to_pfn(page) (long)((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn)
+
+
+/*
+ * pfn_valid should be made as fast as possible, and the current definition
+ * is valid for machines that are NUMA, but still contiguous, which is what
+ * is currently supported. A more generalised, but slower definition would
+ * be something like this - mbligh:
+ * ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) )
+ */
+#define pfn_valid(pfn) (pfn < max_low_pfn)
+extern unsigned long max_low_pfn;
+
+
+#ifdef CONFIG_NUMA
+
+/*
+ * Platform definitions for DIG platform with contiguous memory.
+ */
+#define MAX_PHYSNODE_ID 8 /* Maximum node number +1 */
+#define NR_NODES 8 /* Maximum number of nodes in SSI */
+
+#define MAX_PHYS_MEMORY (1UL << 40) /* 1 TB */
+
+/*
+ * Bank definitions.
+ * Configurable settings for DIG: 512MB/bank: 16GB/node,
+ * 2048MB/bank: 64GB/node,
+ * 8192MB/bank: 256GB/node.
+ */
+#define NR_BANKS_PER_NODE 32
+#if defined(CONFIG_IA64_NODESIZE_16GB)
+# define DIG_BANKSHIFT 29
+#elif defined(CONFIG_IA64_NODESIZE_64GB)
+# define DIG_BANKSHIFT 31
+#elif defined(CONFIG_IA64_NODESIZE_256GB)
+# define DIG_BANKSHIFT 33
+#else
+# error Unsupported bank and nodesize!
+#endif
+#define BANKSIZE (1UL << DIG_BANKSHIFT)
+#define BANK_OFFSET(addr) ((unsigned long)(addr) & (BANKSIZE-1))
+#define NR_BANKS (NR_BANKS_PER_NODE * NR_NODES)
+
+/*
+ * VALID_MEM_KADDR returns a boolean to indicate if a kaddr is
+ * potentially a valid cacheable identity mapped RAM memory address.
+ * Note that the RAM may or may not actually be present!!
+ */
+ #define VALID_MEM_KADDR(kaddr) 1
+/* #define VALID_MEM_KADDR(kaddr) (BANK_MEM_MAP_BASE(kaddr) == NULL ? NULL : 1) */
+
+/*
+ * Given a nodeid & a bank number, find the address of the mem_map
+ * entry for the first page of the bank.
+ */
+#define BANK_MEM_MAP_INDEX(kaddr) \
+ (((unsigned long)(kaddr) & (MAX_PHYS_MEMORY-1)) >> DIG_BANKSHIFT)
+
+#endif /* CONFIG_NUMA */
+#endif /* _ASM_IA64_MMZONE_H */
diff -Nur linux-2.4.20-base/include/asm-ia64/nodedata.h linux-2.4.20-dcm/include/asm-ia64/nodedata.h
--- linux-2.4.20-base/include/asm-ia64/nodedata.h Thu Jan 1 09:00:00 1970
+++ linux-2.4.20-dcm/include/asm-ia64/nodedata.h Tue Mar 4 17:40:04 2003
@@ -0,0 +1,75 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (c) 2000 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (c) 2002 NEC Corp.
+ * Copyright (c) 2002 Erich Focht <efocht@ess.nec.de>
+ * Copyright (c) 2002 Kimio Suganuma <k-suganuma@da.jp.nec.com>
+ */
+
+
+#ifndef _ASM_IA64_NODEDATA_H
+#define _ASM_IA64_NODEDATA_H
+
+
+#include <asm/mmzone.h>
+
+/*
+ * Node Data. One of these structures is located on each node of a NUMA system.
+ */
+
+struct pglist_data;
+struct ia64_node_data {
+ short node;
+ struct pglist_data *pg_data_ptrs[NR_NODES];
+ struct page *bank_mem_map_base[NR_BANKS];
+ struct ia64_node_data *node_data_ptrs[NR_NODES];
+ short node_id_map[NR_BANKS];
+};
+
+
+/*
+ * Return a pointer to the node_data structure for the executing cpu.
+ */
+#define local_node_data (local_cpu_data->node_data)
+
+
+/*
+ * Return a pointer to the node_data structure for the specified node.
+ */
+#define node_data(node) (local_node_data->node_data_ptrs[node])
+
+/*
+ * Get a pointer to the node_id/node_data for the current cpu.
+ * (boot time only)
+ */
+extern int boot_get_local_nodeid(void);
+extern struct ia64_node_data *get_node_data_ptr(void);
+
+/*
+ * Given a node id, return a pointer to the pg_data_t for the node.
+ * The following 2 macros are similar.
+ *
+ * NODE_DATA - should be used in all code not related to system
+ * initialization. It uses pernode data structures to minimize
+ * offnode memory references. However, these structure are not
+ * present during boot. This macro can be used once cpu_init
+ * completes.
+ *
+ * BOOT_NODE_DATA
+ * - should be used during system initialization
+ * prior to freeing __initdata. It does not depend on the percpu
+ * area being present.
+ *
+ * NOTE: The names of these macros are misleading but are difficult to change
+ * since they are used in generic linux & on other architectures.
+ */
+#define NODE_DATA(nid) (local_node_data->pg_data_ptrs[nid])
+#define BOOT_NODE_DATA(nid) boot_get_pg_data_ptr((long)(nid))
+
+struct pglist_data;
+extern struct pglist_data * __init boot_get_pg_data_ptr(long);
+
+#endif /* _ASM_IA64_NODEDATA_H */
diff -Nur linux-2.4.20-base/include/asm-ia64/numa.h linux-2.4.20-dcm/include/asm-ia64/numa.h
--- linux-2.4.20-base/include/asm-ia64/numa.h Thu Jan 1 09:00:00 1970
+++ linux-2.4.20-dcm/include/asm-ia64/numa.h Tue Mar 4 17:40:04 2003
@@ -0,0 +1,70 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * This file contains NUMA specific prototypes and definitions.
+ *
+ * 2002/08/05 Erich Focht <efocht@ess.nec.de>
+ *
+ */
+#ifndef _ASM_IA64_NUMA_H
+#define _ASM_IA64_NUMA_H
+
+#ifdef CONFIG_NUMA
+
+#ifdef CONFIG_DISCONTIGMEM
+# include <asm/mmzone.h>
+# define NR_MEMBLKS (NR_BANKS)
+#else
+# define NR_NODES (8)
+# define NR_MEMBLKS (NR_NODES * 8)
+#endif
+
+#include <linux/cache.h>
+#include <linux/threads.h>
+extern volatile char cpu_to_node_map[NR_CPUS] __cacheline_aligned;
+extern volatile unsigned long node_to_cpu_mask[NR_NODES] __cacheline_aligned;
+
+/* Stuff below this line could be architecture independent */
+
+extern int num_memblks; /* total number of memory chunks */
+
+/*
+ * List of node memory chunks. Filled when parsing SRAT table to
+ * obtain information about memory nodes.
+*/
+
+struct node_memblk_s {
+ unsigned long start_paddr;
+ unsigned long size;
+ int nid; /* which logical node contains this chunk? */
+ int bank; /* which mem bank on this node */
+};
+
+struct node_cpuid_s {
+ u16 phys_id; /* id << 8 | eid */
+ int nid; /* logical node containing this CPU */
+};
+
+extern struct node_memblk_s node_memblk[NR_MEMBLKS];
+extern struct node_cpuid_s node_cpuid[NR_CPUS];
+
+/*
+ * ACPI 2.0 SLIT (System Locality Information Table)
+ * http://devresource.hp.com/devresource/Docs/TechPapers/IA64/slit.pdf
+ *
+ * This is a matrix with "distances" between nodes, they should be
+ * proportional to the memory access latency ratios.
+ */
+
+extern u8 numa_slit[NR_NODES * NR_NODES];
+#define node_distance(from,to) (numa_slit[from * numnodes + to])
+
+extern int paddr_to_nid(unsigned long paddr);
+
+#define local_nodeid (cpu_to_node_map[smp_processor_id()])
+
+#endif /* CONFIG_NUMA */
+
+#endif /* _ASM_IA64_NUMA_H */
diff -Nur linux-2.4.20-base/include/asm-ia64/numnodes.h linux-2.4.20-dcm/include/asm-ia64/numnodes.h
--- linux-2.4.20-base/include/asm-ia64/numnodes.h Thu Jan 1 09:00:00 1970
+++ linux-2.4.20-dcm/include/asm-ia64/numnodes.h Tue Mar 4 17:40:04 2003
@@ -0,0 +1,7 @@
+#ifndef _ASM_MAX_NUMNODES_H
+#define _ASM_MAX_NUMNODES_H
+
+#include <asm/mmzone.h>
+#define MAX_NUMNODES NR_NODES
+
+#endif /* _ASM_MAX_NUMNODES_H */
diff -Nur linux-2.4.20-base/include/asm-ia64/page.h linux-2.4.20-dcm/include/asm-ia64/page.h
--- linux-2.4.20-base/include/asm-ia64/page.h Mon Mar 3 10:24:23 2003
+++ linux-2.4.20-dcm/include/asm-ia64/page.h Tue Mar 4 17:40:04 2003
@@ -56,16 +56,10 @@
# include <asm/machvec.h>
# define virt_to_page(kaddr) (mem_map + platform_map_nr(kaddr))
# define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT)
-#elif defined (CONFIG_IA64_SGI_SN1)
-# ifndef CONFIG_DISCONTIGMEM
-# define virt_to_page(kaddr) (mem_map + MAP_NR_DENSE(kaddr))
-# define page_to_phys(page) XXX fix me
-# endif
-#else
+#elif !defined (CONFIG_DISCONTIGMEM)
# define virt_to_page(kaddr) (mem_map + MAP_NR_DENSE(kaddr))
# define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT)
#endif
-
struct page;
extern int ia64_page_valid (struct page *);
#define VALID_PAGE(page) (((page - mem_map) < max_mapnr) && ia64_page_valid(page))
diff -Nur linux-2.4.20-base/include/asm-ia64/pgtable.h linux-2.4.20-dcm/include/asm-ia64/pgtable.h
--- linux-2.4.20-base/include/asm-ia64/pgtable.h Mon Mar 3 10:24:23 2003
+++ linux-2.4.20-dcm/include/asm-ia64/pgtable.h Tue Mar 4 17:40:07 2003
@@ -206,6 +206,15 @@
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*/
+#ifdef CONFIG_DISCONTIGMEM
+#define mk_pte(page,pgprot) \
+({ \
+ pte_t __pte; \
+ \
+ pte_val(__pte) = (unsigned long)page_address(page) - PAGE_OFFSET + pgprot_val(pgprot); \
+ __pte; \
+})
+#else
#define mk_pte(page,pgprot) \
({ \
pte_t __pte; \
@@ -213,6 +222,7 @@
pte_val(__pte) = ((page - mem_map) << PAGE_SHIFT) | pgprot_val(pgprot); \
__pte; \
})
+#endif
/* This takes a physical page address that is used by the remapping functions */
#define mk_pte_phys(physpage, pgprot) \
@@ -440,6 +450,7 @@
*/
#define pgtable_cache_init() do { } while (0)
+#ifndef CONFIG_DISCONTIGMEM
/* arch mem_map init routines are needed due to holes in a virtual mem_map */
#define HAVE_ARCH_MEMMAP_INIT
@@ -449,7 +460,7 @@
extern unsigned long arch_memmap_init (memmap_init_callback_t *callback,
struct page *start, struct page *end, int zone,
unsigned long start_paddr, int highmem);
-
+#endif /* CONFIG_DISCONTIGMEM */
# endif /* !__ASSEMBLY__ */
/*
diff -Nur linux-2.4.20-base/include/asm-ia64/processor.h linux-2.4.20-dcm/include/asm-ia64/processor.h
--- linux-2.4.20-base/include/asm-ia64/processor.h Mon Mar 3 10:24:23 2003
+++ linux-2.4.20-dcm/include/asm-ia64/processor.h Tue Mar 4 17:40:04 2003
@@ -87,6 +87,9 @@
#include <asm/rse.h>
#include <asm/unwind.h>
#include <asm/atomic.h>
+#ifdef CONFIG_NUMA
+#include <asm/nodedata.h>
+#endif
/* like above but expressed as bitfields for more efficient access: */
struct ia64_psr {
@@ -187,9 +190,8 @@
} ipi;
#endif
#ifdef CONFIG_NUMA
- void *node_directory;
- int numa_node_id;
- struct cpuinfo_ia64 *cpu_data[NR_CPUS];
+ struct ia64_node_data *node_data;
+ int nodeid;
#endif
/* Platform specific word. MUST BE LAST IN STRUCT */
__u64 platform_specific;
@@ -201,23 +203,12 @@
*/
#define local_cpu_data ((struct cpuinfo_ia64 *) PERCPU_ADDR)
-/*
- * On NUMA systems, cpu_data for each cpu is allocated during cpu_init() & is allocated on
- * the node that contains the cpu. This minimizes off-node memory references. cpu_data
- * for each cpu contains an array of pointers to the cpu_data structures of each of the
- * other cpus.
- *
- * On non-NUMA systems, cpu_data is a static array allocated at compile time. References
- * to the cpu_data of another cpu is done by direct references to the appropriate entry of
- * the array.
- */
#ifdef CONFIG_NUMA
-# define cpu_data(cpu) local_cpu_data->cpu_data[cpu]
-# define numa_node_id() (local_cpu_data->numa_node_id)
-#else
- extern struct cpuinfo_ia64 _cpu_data[NR_CPUS];
-# define cpu_data(cpu) (&_cpu_data[cpu])
+#define numa_node_id() (local_cpu_data->nodeid)
#endif
+
+extern struct cpuinfo_ia64 _cpu_data[NR_CPUS];
+#define cpu_data(cpu) (&_cpu_data[cpu])
extern void identify_cpu (struct cpuinfo_ia64 *);
extern void print_cpu_info (struct cpuinfo_ia64 *);
diff -Nur linux-2.4.20-base/include/asm-ia64/topology.h linux-2.4.20-dcm/include/asm-ia64/topology.h
--- linux-2.4.20-base/include/asm-ia64/topology.h Thu Jan 1 09:00:00 1970
+++ linux-2.4.20-dcm/include/asm-ia64/topology.h Tue Mar 4 17:40:04 2003
@@ -0,0 +1,63 @@
+/*
+ * linux/include/asm-ia64/topology.h
+ *
+ * Copyright (C) 2002, Erich Focht, NEC
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef _ASM_IA64_TOPOLOGY_H
+#define _ASM_IA64_TOPOLOGY_H
+
+#include <asm/acpi.h>
+#include <asm/numa.h>
+#include <asm/smp.h>
+
+#ifdef CONFIG_NUMA
+/*
+ * Returns the number of the node containing CPU 'cpu'
+ */
+#define __cpu_to_node(cpu) (int)(cpu_to_node_map[cpu])
+
+/*
+ * Returns a bitmask of CPUs on Node 'node'.
+ */
+#define __node_to_cpu_mask(node) (node_to_cpu_mask[node])
+
+#else
+#define __cpu_to_node(cpu) (0)
+#define __node_to_cpu_mask(node) (phys_cpu_present_map)
+#endif
+
+/*
+ * Returns the number of the node containing MemBlk 'memblk'
+ */
+#ifdef CONFIG_ACPI_NUMA
+#define __memblk_to_node(memblk) (node_memblk[memblk].nid)
+#else
+#define __memblk_to_node(memblk) (memblk)
+#endif
+
+/*
+ * Returns the number of the node containing Node 'nid'.
+ * Not implemented here. Multi-level hierarchies detected with
+ * the help of node_distance().
+ */
+#define __parent_node(nid) (nid)
+
+/*
+ * Returns the number of the first CPU on Node 'node'.
+ */
+#define __node_to_first_cpu(node) (__ffs(__node_to_cpu_mask(node)))
+
+/*
+ * Returns the number of the first MemBlk on Node 'node'
+ * Should be fixed when IA64 discontigmem goes in.
+ */
+#define __node_to_memblk(node) (node)
+
+#endif /* _ASM_IA64_TOPOLOGY_H */
diff -Nur linux-2.4.20-base/include/linux/mmzone.h linux-2.4.20-dcm/include/linux/mmzone.h
--- linux-2.4.20-base/include/linux/mmzone.h Mon Mar 3 10:24:23 2003
+++ linux-2.4.20-dcm/include/linux/mmzone.h Tue Mar 4 17:40:04 2003
@@ -8,6 +8,12 @@
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/wait.h>
+#ifdef CONFIG_DISCONTIGMEM
+#include <asm/numnodes.h>
+#endif
+#ifndef MAX_NUMNODES
+#define MAX_NUMNODES 1
+#endif
/*
* Free memory management - zoned buddy allocator.
@@ -212,6 +218,15 @@
#define for_each_zone(zone) \
for(zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
+#ifdef CONFIG_NUMA
+#define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */
+#else /* !CONFIG_NUMA */
+#define MAX_NR_MEMBLKS 1
+#endif /* CONFIG_NUMA */
+
+#include <asm/topology.h>
+/* Returns the number of the current Node. */
+#define numa_node_id() (__cpu_to_node(smp_processor_id()))
#ifndef CONFIG_DISCONTIGMEM
diff -Nur linux-2.4.20-base/init/main.c linux-2.4.20-dcm/init/main.c
--- linux-2.4.20-base/init/main.c Mon Mar 3 10:24:23 2003
+++ linux-2.4.20-dcm/init/main.c Mon Mar 3 10:55:12 2003
@@ -360,6 +360,7 @@
lock_kernel();
printk(linux_banner);
setup_arch(&command_line);
+ build_all_zonelists();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
diff -Nur linux-2.4.20-base/mm/page_alloc.c linux-2.4.20-dcm/mm/page_alloc.c
--- linux-2.4.20-base/mm/page_alloc.c Mon Mar 3 10:24:23 2003
+++ linux-2.4.20-dcm/mm/page_alloc.c Mon Mar 3 10:55:12 2003
@@ -586,13 +586,41 @@
/*
* Builds allocation fallback zone lists.
*/
-static inline void build_zonelists(pg_data_t *pgdat)
+static int __init build_zonelists_node(pg_data_t *pgdat, zonelist_t *zonelist, int j, int k)
{
- int i, j, k;
+ switch (k) {
+ zone_t *zone;
+ default:
+ BUG();
+ case ZONE_HIGHMEM:
+ zone = pgdat->node_zones + ZONE_HIGHMEM;
+ if (zone->memsize) {
+#ifndef CONFIG_HIGHMEM
+ BUG();
+#endif
+ zonelist->zones[j++] = zone;
+ }
+ case ZONE_NORMAL:
+ zone = pgdat->node_zones + ZONE_NORMAL;
+ if (zone->memsize)
+ zonelist->zones[j++] = zone;
+ case ZONE_DMA:
+ zone = pgdat->node_zones + ZONE_DMA;
+ if (zone->memsize)
+ zonelist->zones[j++] = zone;
+ }
+ return j;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+ int i, j, k, node, local_node;
+
+ local_node = pgdat->node_id;
+ printk("Building zonelist for node : %d\n", local_node);
for (i = 0; i <= GFP_ZONEMASK; i++) {
zonelist_t *zonelist;
- zone_t *zone;
zonelist = pgdat->node_zonelists + i;
memset(zonelist, 0, sizeof(*zonelist));
@@ -604,31 +632,30 @@
if (i & __GFP_DMA)
k = ZONE_DMA;
- switch (k) {
- default:
- BUG();
- /*
- * fallthrough:
- */
- case ZONE_HIGHMEM:
- zone = pgdat->node_zones + ZONE_HIGHMEM;
- if (zone->memsize) {
-#ifndef CONFIG_HIGHMEM
- BUG();
-#endif
- zonelist->zones[j++] = zone;
- }
- case ZONE_NORMAL:
- zone = pgdat->node_zones + ZONE_NORMAL;
- if (zone->memsize)
- zonelist->zones[j++] = zone;
- case ZONE_DMA:
- zone = pgdat->node_zones + ZONE_DMA;
- if (zone->memsize)
- zonelist->zones[j++] = zone;
- }
+ j = build_zonelists_node(pgdat, zonelist, j, k);
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (modulo N)
+ */
+ for (node = local_node + 1; node < numnodes; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ for (node = 0; node < local_node; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+
zonelist->zones[j++] = NULL;
- }
+ }
+}
+
+void __init build_all_zonelists(void)
+{
+ int i;
+
+ for(i = 0 ; i < numnodes ; i++)
+ build_zonelists(NODE_DATA(i));
}
/*
@@ -806,6 +833,7 @@
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
+
zone_start_paddr = MEMMAP_INIT(mem_map + offset,
mem_map + offset + size,
nid * MAX_NR_ZONES + j, zone_start_paddr,
@@ -850,7 +878,6 @@
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
- build_zonelists(pgdat);
}
void __init free_area_init(unsigned long *zones_size)
^ permalink raw reply [flat|nested] 2+ messages in thread* Re: [Linux-ia64] discontigmem patch for 2.4.20
2003-03-04 9:28 [Linux-ia64] discontigmem patch for 2.4.20 suganuma
@ 2003-03-06 18:31 ` Bjorn Helgaas
0 siblings, 0 replies; 2+ messages in thread
From: Bjorn Helgaas @ 2003-03-06 18:31 UTC (permalink / raw)
To: linux-ia64
> I back ported the IA64 discontigmem function in 2.5 to 2.4.20.
> I tested the patch on 8 way Itanium2 NUMA server with
> a NUMA kernel and an SMP kernel.
>
> David, Bjorn, please let me know is there any possibility
> that you take this patch into the ia64-patch for 2.4.
I'm in the process of merging this patch, but when I build a
"generic" kernel (with CONFIG_NUMA=y and CONFIG_DISCONTIGMEM=y),
I get many new warnings:
/home/helgaas/bk/testing/include/asm/mmzone.h:62:21: warning: "virt_to_page" redefined
/home/helgaas/bk/testing/include/asm/page.h:57:1: warning: this is the location of the previous definition
/home/helgaas/bk/testing/include/asm/mmzone.h:71:21: warning: "page_to_phys" redefined
/home/helgaas/bk/testing/include/asm/page.h:58:1: warning: this is the location of the previous definition
/home/helgaas/bk/testing/include/linux/mmzone.h:229:21: warning: "numa_node_id" redefined
/home/helgaas/bk/testing/include/asm/processor.h:206:1: warning: this is the location of the previous definition
Could you look into these and send a new patch to correct them?
Also, the resulting kernel doesn't boot (it MCAs) on HP rx2600
and zx2000. I'll look into it in my spare time, but you can
probably do so more efficiently.
A small patch that applies on top of the previous patch would be
easiest.
Bjorn
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2003-03-06 18:31 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2003-03-04 9:28 [Linux-ia64] discontigmem patch for 2.4.20 suganuma
2003-03-06 18:31 ` Bjorn Helgaas
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox