public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH] avoid creating empty nodes [0/2]
@ 2006-03-29  2:09 KAMEZAWA Hiroyuki
  2006-03-29  2:11 ` [RFC][PATCH] avoid creating empty nodes [1/2] move reserve memory KAMEZAWA Hiroyuki
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-03-29  2:09 UTC (permalink / raw)
  To: linux-ia64

When I tested new firmware, which supports NUMA cpu/memory hot-add
(with Windows), I found Linux/ia64 creates nodes with no cpus, no memory.

This is because of acpi_numa_arch_fixup(). It onlines a node for every pxm it finds.
Because SRAT can contain resources which don't exist at boot,
the kernel can create a node for an empty pxm which includes only non-existing cpus
and non-existing memory.

Now, I think this is a bug. These patches will fix it.
These patches are against 2.6.16-mm1.

--Kame
 


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [RFC][PATCH] avoid creating empty nodes [1/2] move reserve memory
  2006-03-29  2:09 [RFC][PATCH] avoid creating empty nodes [0/2] KAMEZAWA Hiroyuki
@ 2006-03-29  2:11 ` KAMEZAWA Hiroyuki
  2006-03-29  2:12 ` [RFC][PATCH] avoid creating empty nodes [2/2] ignore empty pxms KAMEZAWA Hiroyuki
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-03-29  2:11 UTC (permalink / raw)
  To: linux-ia64

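Move the reserve_memory() call out of contig.c/discontig.c and into setup.c,
ahead of ACPI table initialization, so that efi_memmap_walk() can be used
in acpi_numa_arch_fixup().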

Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Index: linux-2.6.16-mm1/arch/ia64/kernel/setup.c
===================================================================
--- linux-2.6.16-mm1.orig/arch/ia64/kernel/setup.c	2006-03-28 11:16:13.000000000 +0900
+++ linux-2.6.16-mm1/arch/ia64/kernel/setup.c	2006-03-28 15:20:52.000000000 +0900
@@ -431,6 +431,8 @@
 		mark_bsp_online();
 
 	parse_early_param();
+
+	reserve_memory();
 #ifdef CONFIG_ACPI
 	/* Initialize the ACPI boot-time table parser */
 	acpi_table_init();
Index: linux-2.6.16-mm1/arch/ia64/mm/contig.c
===================================================================
--- linux-2.6.16-mm1.orig/arch/ia64/mm/contig.c	2006-03-28 11:16:13.000000000 +0900
+++ linux-2.6.16-mm1/arch/ia64/mm/contig.c	2006-03-28 15:20:52.000000000 +0900
@@ -146,8 +146,6 @@
 {
 	unsigned long bootmap_size;
 
-	reserve_memory();
-
 	/* first find highest page frame number */
 	max_pfn = 0;
 	efi_memmap_walk(find_max_pfn, &max_pfn);
Index: linux-2.6.16-mm1/arch/ia64/mm/discontig.c
===================================================================
--- linux-2.6.16-mm1.orig/arch/ia64/mm/discontig.c	2006-03-28 11:16:13.000000000 +0900
+++ linux-2.6.16-mm1/arch/ia64/mm/discontig.c	2006-03-28 15:20:52.000000000 +0900
@@ -438,8 +438,6 @@
 {
 	int node;
 
-	reserve_memory();
-
 	if (num_online_nodes() == 0) {
 		printk(KERN_ERR "node info missing!\n");
 		node_set_online(0);

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [RFC][PATCH] avoid creating empty nodes [2/2] ignore empty pxms.
  2006-03-29  2:09 [RFC][PATCH] avoid creating empty nodes [0/2] KAMEZAWA Hiroyuki
  2006-03-29  2:11 ` [RFC][PATCH] avoid creating empty nodes [1/2] move reserve memory KAMEZAWA Hiroyuki
@ 2006-03-29  2:12 ` KAMEZAWA Hiroyuki
  2006-03-29 14:37 ` [RFC][PATCH] avoid creating empty nodes [0/2] Jack Steiner
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-03-29  2:12 UTC (permalink / raw)
  To: linux-ia64

Compare SRAT information with other information to
avoid onlining nodes which don't have any cpus or memory.
For cpus, compare SRAT and lsapic.
For memory, compare SRAT and EFI.

Signed-Off-By: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Index: linux-2.6.16-mm1/arch/ia64/kernel/acpi.c
===================================================================
--- linux-2.6.16-mm1.orig/arch/ia64/kernel/acpi.c	2006-03-28 15:04:31.000000000 +0900
+++ linux-2.6.16-mm1/arch/ia64/kernel/acpi.c	2006-03-28 16:53:39.000000000 +0900
@@ -413,8 +413,11 @@
 
 static int __initdata srat_num_cpus;	/* number of cpus */
 static u32 __devinitdata pxm_flag[PXM_FLAG_LEN];
+static u32 __devinitdata pxm_online_flag[PXM_FLAG_LEN];
 #define pxm_bit_set(bit)	(set_bit(bit,(void *)pxm_flag))
 #define pxm_bit_test(bit)	(test_bit(bit,(void *)pxm_flag))
+#define pxm_online_bit_set(bit)	(set_bit(bit, (void *)pxm_online_flag))
+#define pxm_online_bit_test(bit) (test_bit(bit, (void *)pxm_online_flag))
 static struct acpi_table_slit __initdata *slit_table;
 
 /*
@@ -450,6 +453,24 @@
 	srat_num_cpus++;
 }
 
+/*
+ * compare SRAT and lsapic entries.
+ */
+void __init confirm_srat_available_cpu(void)
+{
+	int i, j;
+	unsigned long phys_id;
+	for (i = 0; i < available_cpus; i++) {
+		phys_id = smp_boot_data.cpu_phys_id[i];
+		for (j = 0; j < srat_num_cpus; j++) {
+			if (node_cpuid[j].phys_id == phys_id) {
+				pxm_online_bit_set(node_cpuid[j].nid);
+				printk("set pxm %d online\n", node_cpuid[j].nid);
+			}
+		}
+	}
+}
+
 void __init
 acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 {
@@ -488,9 +509,31 @@
 	num_node_memblks++;
 }
 
+int confirm_srat_efi_memmap(unsigned long start, unsigned long end, void *arg)
+{
+	unsigned long pstart, pend;
+	struct node_memblk_s *p;
+	int i;
+	printk(KERN_INFO "print efi memamp %p %p\n", (void*)start, (void *)end);
+	pstart = __pa(start);
+	pend = __pa(end);
+	/* check intersection with SRAT's memdata */
+	for (i = 0; i < num_node_memblks; i++) {
+		p = &node_memblk[i];
+		if (((p->start_paddr <= pstart) &&
+	             (pstart < p->start_paddr + p->size)) ||
+		    ((pstart <= p->start_paddr) &&
+	             (p->start_paddr + p->size <= pend))) {
+			pxm_online_bit_set(p->nid);
+			printk(KERN_INFO "set pxm %d online\n", p->nid);
+		}
+	}
+	return 0;
+}
+
 void __init acpi_numa_arch_fixup(void)
 {
-	int i, j, node_from, node_to;
+	int i, j, node, node_from, node_to;
 
 	/* If there's no SRAT, fix the phys_id and mark node 0 online */
 	if (srat_num_cpus == 0) {
@@ -498,6 +541,8 @@
 		node_cpuid[0].phys_id = hard_smp_processor_id();
 		return;
 	}
+	confirm_srat_available_cpu();
+	efi_memmap_walk(confirm_srat_efi_memmap, NULL);
 
 	/*
 	 * MCD - This can probably be dropped now.  No need for pxm ID to node ID
@@ -507,8 +552,8 @@
 	for (i = 0; i < MAX_PXM_DOMAINS; i++) {
 		if (pxm_bit_test(i)) {
 			int nid = acpi_map_pxm_to_node(i);
-			pxm_bit_set(i);
-			node_set_online(nid);
+			if (pxm_online_bit_test(i))
+				node_set_online(nid);
 		}
 	}
 
@@ -527,8 +572,13 @@
 	}
 
 	/* set logical node id in cpu structure */
-	for (i = 0; i < srat_num_cpus; i++)
-		node_cpuid[i].nid = pxm_to_node(node_cpuid[i].nid);
+	for (i = 0; i < srat_num_cpus; i++) {
+		node = pxm_to_node(node_cpuid[i].nid);
+		if (node_online(node))
+			node_cpuid[i].nid = node;
+		else
+			node_cpuid[i].nid = 0; /* for possible cpus */
+	}
 
 	printk(KERN_INFO "Number of logical nodes in system = %d\n",
 	       num_online_nodes());

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC][PATCH] avoid creating empty nodes [0/2]
  2006-03-29  2:09 [RFC][PATCH] avoid creating empty nodes [0/2] KAMEZAWA Hiroyuki
  2006-03-29  2:11 ` [RFC][PATCH] avoid creating empty nodes [1/2] move reserve memory KAMEZAWA Hiroyuki
  2006-03-29  2:12 ` [RFC][PATCH] avoid creating empty nodes [2/2] ignore empty pxms KAMEZAWA Hiroyuki
@ 2006-03-29 14:37 ` Jack Steiner
  2006-03-29 23:31 ` KAMEZAWA Hiroyuki
  2006-03-30  1:38 ` Jack Steiner
  4 siblings, 0 replies; 6+ messages in thread
From: Jack Steiner @ 2006-03-29 14:37 UTC (permalink / raw)
  To: linux-ia64

On Wed, Mar 29, 2006 at 11:09:03AM +0900, KAMEZAWA Hiroyuki wrote:
> When I tested new firmware, which supports NUMA cpu/memory hot-add
> (with Windows), I found Linux/ia64 creates nodes with no cpus, no memory.

I don't see any problems with your patch in the short term. However, long
term, we may need to make changes & support nodes with no memory or cpus.

An SGI Altix SSI system consists of a collection of nodes connected via a
high speed interconnect. Nodes come in several flavors:

        - memory, IO and cpus
        - memory & cpus
        - memory only
        - IO only

You can think of the last 3 types of nodes as nodes that have been partially
depopulated. This isn't entirely accurate but is good enough to gain an
understanding of the problem.

Currently, IO nodes are not defined in the SRAT tables because there is no
way to describe them.  Most of the kernel is unaware of IO nodes. There are
hacks in the SN code that have extended the definition of nodes (cnodes) to
include IO nodes. However, this is just an interim solution & we intend to
fix it. 

ACPI3.0 will have a way to describe these IO nodes. We have not done the
design but when ACPI3.0 is available, we intend to make the kernel fully
aware of IO-only nodes. 


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC][PATCH] avoid creating empty nodes [0/2]
  2006-03-29  2:09 [RFC][PATCH] avoid creating empty nodes [0/2] KAMEZAWA Hiroyuki
                   ` (2 preceding siblings ...)
  2006-03-29 14:37 ` [RFC][PATCH] avoid creating empty nodes [0/2] Jack Steiner
@ 2006-03-29 23:31 ` KAMEZAWA Hiroyuki
  2006-03-30  1:38 ` Jack Steiner
  4 siblings, 0 replies; 6+ messages in thread
From: KAMEZAWA Hiroyuki @ 2006-03-29 23:31 UTC (permalink / raw)
  To: linux-ia64

On Wed, 29 Mar 2006 08:37:34 -0600
Jack Steiner <steiner@sgi.com> wrote:
> ACPI3.0 will have a way to describe these IO nodes. We have not done the
> design but when ACPI3.0 is available, we intend to make the kernel fully
> aware of IO-only nodes. 
> 

Then, we should set the I/O nodes' pxms in pxm_online_flag (which I added in the patch) in
acpi_numa_arch_fixup(), in addition to cpus and memory. But there is no code
to find available I/O nodes now.
Correct?

BTW, I heard ACPI3.0 can define pxm > 256. If we support it, the entire code should
be rewritten ;(.

-- Kame


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC][PATCH] avoid creating empty nodes [0/2]
  2006-03-29  2:09 [RFC][PATCH] avoid creating empty nodes [0/2] KAMEZAWA Hiroyuki
                   ` (3 preceding siblings ...)
  2006-03-29 23:31 ` KAMEZAWA Hiroyuki
@ 2006-03-30  1:38 ` Jack Steiner
  4 siblings, 0 replies; 6+ messages in thread
From: Jack Steiner @ 2006-03-30  1:38 UTC (permalink / raw)
  To: linux-ia64

On Thu, Mar 30, 2006 at 08:31:02AM +0900, KAMEZAWA Hiroyuki wrote:
> On Wed, 29 Mar 2006 08:37:34 -0600
> Jack Steiner <steiner@sgi.com> wrote:
> > ACPI3.0 will have a way to describe these IO nodes. We have not done the
> > design but when ACPI3.0 is available, we intend to make the kernel fully
> > aware of IO-only nodes. 
> > 
> 
> Then, we should set the I/O nodes' pxms in pxm_online_flag (which I added in the patch) in
> acpi_numa_arch_fixup(), in addition to cpus and memory. But there is no code
> to find available I/O nodes now.
> Correct?

Correct for ACPI2.0. ACPI3.0 has the PXM for IO buses (at least I think it was
at the bus level - I don't have the spec right now) in one of the IO tables.

> 
> BTW, I heard ACPI3.0 can define pxm > 256. If we support it, the entire code should
> be rewritten ;(.

Yes. IIRC, PXM is 32 bits in some tables and 24 bits in other tables.


> 
> -- Kame

---
Jack

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2006-03-30  1:38 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-03-29  2:09 [RFC][PATCH] avoid creating empty nodes [0/2] KAMEZAWA Hiroyuki
2006-03-29  2:11 ` [RFC][PATCH] avoid creating empty nodes [1/2] move reserve memory KAMEZAWA Hiroyuki
2006-03-29  2:12 ` [RFC][PATCH] avoid creating empty nodes [2/2] ignore empty pxms KAMEZAWA Hiroyuki
2006-03-29 14:37 ` [RFC][PATCH] avoid creating empty nodes [0/2] Jack Steiner
2006-03-29 23:31 ` KAMEZAWA Hiroyuki
2006-03-30  1:38 ` Jack Steiner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox