Linux Hotplug development

Linux Hotplug development
 help / color / mirror / Atom feed

* [v2,6/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:42 UTC (permalink / raw)
  To: linux-hotplug

From: Shaohui Zheng <shaohui.zheng@intel.com>
Subject: hotplug emulator: Fake CPU socket with logical CPU on x86

When hotplugging a CPU with the emulator, we use a logical CPU to emulate the
CPU hotplug process. On CPUs that support SMT, several logical CPUs share the
same socket, but with the emulator they may be placed in different NUMA nodes.
This misleads the scheduler into building an incorrect domain hierarchy, and
causes the following call trace when rebalancing the scheduling domains:

divide error: 0000 [#1] SMP 
last sysfs file: /sys/devices/system/cpu/cpu8/online
CPU 0 
Modules linked in: fbcon tileblit font bitblit softcursor radeon ttm drm_kms_helper e1000e usbhid via_rhine mii drm i2c_algo_bit igb dca
Pid: 0, comm: swapper Not tainted 2.6.32hpe #78 X8DTN
RIP: 0010:[<ffffffff81051da5>]  [<ffffffff81051da5>] find_busiest_group+0x6c5/0xa10
RSP: 0018:ffff880028203c30  EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000015ac0 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff880277e8cfa0 RDI: 0000000000000000
RBP: ffff880028203dc0 R08: ffff880277e8cfa0 R09: 0000000000000040
R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffff880028200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00007f16cfc85770 CR3: 0000000001001000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81822000, task ffffffff8184a600)
Stack:
 ffff880028203d60 ffff880028203cd0 ffff8801c204ff08 ffff880028203e38
<0> 0101ffff81018c59 ffff880028203e44 00000001810806bd ffff8801c204fe00
<0> 0000000528200000 ffffffff00000000 0000000000000018 0000000000015ac0
Call Trace:
 <IRQ> 
 [<ffffffff81088ee0>] ? tick_dev_program_event+0x40/0xd0
 [<ffffffff81053b2c>] rebalance_domains+0x17c/0x570
 [<ffffffff81018c89>] ? read_tsc+0x9/0x20
 [<ffffffff81088ee0>] ? tick_dev_program_event+0x40/0xd0
 [<ffffffff810569ed>] run_rebalance_domains+0xbd/0xf0
 [<ffffffff8106471f>] __do_softirq+0xaf/0x1e0
 [<ffffffff810b7d18>] ? handle_IRQ_event+0x58/0x160
 [<ffffffff810130ac>] call_softirq+0x1c/0x30
 [<ffffffff81014a85>] do_softirq+0x65/0xa0
 [<ffffffff810645cd>] irq_exit+0x7d/0x90
 [<ffffffff81013ff0>] do_IRQ+0x70/0xe0
 [<ffffffff810128d3>] ret_from_intr+0x0/0x11
 <EOI> 
 [<ffffffff8133387f>] ? acpi_idle_enter_bm+0x281/0x2b5
 [<ffffffff81333878>] ? acpi_idle_enter_bm+0x27a/0x2b5
 [<ffffffff8145dc8f>] ? cpuidle_idle_call+0x9f/0x130
 [<ffffffff81010e2b>] ? cpu_idle+0xab/0x100
 [<ffffffff8158aee6>] ? rest_init+0x66/0x70
 [<ffffffff81905d90>] ? start_kernel+0x3e3/0x3ef
 [<ffffffff8190533a>] ? x86_64_start_reservations+0x125/0x129
 [<ffffffff81905438>] ? x86_64_start_kernel+0xfa/0x109
Code: 00 00 e9 4c fb ff ff 0f 1f 80 00 00 00 00 48 8b b5 d8 fe ff ff 48 8b 45 a8 4d 29 ef 8b 56 08 48 c1 e0 0a 49 89 f0 48 89 d7 31 d2 <48> f7 f7 31 d2 48 89 45 a0 8b 76 08 4c 89 f0 48 c1 e0 0a 48 f7 
RIP  [<ffffffff81051da5>] find_busiest_group+0x6c5/0xa10
 RSP <ffff880028203c30>

Solution:

We put the logical CPU into a fake CPU socket and assign it a unique
 phys_proc_id. Each fake socket holds only one logical CPU. This
method fixes the above bug.

Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
---
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 325b7bd..9a2088c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -113,6 +113,15 @@ struct cpuinfo_x86 {
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 #endif
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	/*
+	 * Use a logic cpu to emulate a physical cpu's hotplug. We put the
+	 * logical cpu into a fake socket, assign a fake physical id to it,
+	 * and create a fake core.
+	 */
+	__u8		cpu_probe_on; /* A flag to enable cpu probe/release */
+#endif
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL	0
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 170d9b9..1d4dc67 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,6 +97,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
  */
 static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
 
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
 void cpu_hotplug_driver_lock()
 {
         mutex_lock(&x86_cpu_hotplug_driver_mutex);
@@ -106,6 +107,7 @@ void cpu_hotplug_driver_unlock()
 {
         mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 }
+#endif
 
 #else
 static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
@@ -198,6 +200,8 @@ static void __cpuinit smp_callin(void)
 {
 	int cpuid, phys_id;
 	unsigned long timeout;
+	u8 cpu_probe_on = 0;
+	struct cpuinfo_x86 *c;
 
 	/*
 	 * If waken up by an INIT in an 82489DX configuration
@@ -277,7 +281,20 @@ static void __cpuinit smp_callin(void)
 	/*
 	 * Save our processor parameters
 	 */
+	c = &cpu_data(cpuid);
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	cpu_probe_on = c->cpu_probe_on;
+	phys_id = c->phys_proc_id;
+#endif
+
 	smp_store_cpu_info(cpuid);
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	if (cpu_probe_on) {
+		c->phys_proc_id = phys_id; /* restore the fake phys_proc_id */
+		c->cpu_core_id = 0; /* force the logical cpu to core 0 */
+		c->cpu_probe_on = cpu_probe_on;
+	}
+#endif
 
 	notify_cpu_starting(cpuid);
 
@@ -400,6 +417,11 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 {
 	int i;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	int cpu_probe_on = 0;
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	cpu_probe_on = c->cpu_probe_on;
+#endif
 
 	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
@@ -431,7 +453,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
+		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
+			cpu_probe_on == 0) {
 			cpumask_set_cpu(i, c->llc_shared_map);
 			cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
 		}
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 515f08a..98a9b45 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -90,6 +90,36 @@ void arch_unregister_cpu(int num)
 }
 EXPORT_SYMBOL(arch_unregister_cpu);
 
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+/*
+ * Put the logical cpu into a new socket, and encapsulate it into core 0.
+ */
+static void fake_cpu_socket_info(int cpu)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	int i, phys_id = 0;
+
+	/* calculate the max phys_id */
+	for_each_present_cpu(i) {
+		struct cpuinfo_x86 *c = &cpu_data(i);
+		if (phys_id < c->phys_proc_id)
+			phys_id = c->phys_proc_id;
+	}
+
+	c->phys_proc_id = phys_id + 1; /* pick up a unused phys_proc_id */
+	c->cpu_core_id = 0; /* always put the logical cpu to core 0 */
+	c->cpu_probe_on = 1;
+}
+
+static void clear_cpu_socket_info(int cpu)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	c->phys_proc_id = 0;
+	c->cpu_core_id = 0;
+	c->cpu_probe_on = 0;
+}
+
+
 ssize_t arch_cpu_probe(const char *buf, size_t count)
 {
 	int nid = 0;
@@ -129,6 +159,7 @@ ssize_t arch_cpu_probe(const char *buf, size_t count)
 	/* register cpu */
 	arch_register_cpu_emu(selected, nid);
 	acpi_map_lsapic_emu(selected, nid);
+	fake_cpu_socket_info(selected);
 
 	return count;
 }
@@ -152,10 +183,13 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
 
 	arch_unregister_cpu(cpu);
 	acpi_unmap_lsapic(cpu);
+	clear_cpu_socket_info(cpu);
+	set_cpu_present(cpu, true);
 
 	return count;
 }
 EXPORT_SYMBOL(arch_cpu_release);
+#endif CONFIG_ARCH_CPU_PROBE_RELEASE
 
 #else /* CONFIG_HOTPLUG_CPU */
 

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* [v2,7/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:43 UTC (permalink / raw)
  To: linux-hotplug

From: Shaohui Zheng <shaohui.zheng@intel.com>
Subject: hotplug emulator:extend memory probe interface to support NUMA

Extend the memory probe interface to support an extra parameter, nid;
the reserved memory is added to this node if the node exists.

Add a memory section(128M) to node 3(boots with mem=1024m)

	echo 0x40000000,3 > memory/probe

To make it friendlier, sizes with unit suffixes are also accepted, e.g.

	echo 3g > memory/probe
	echo 1024m,3 > memory/probe

It maintains backwards compatibility.

Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@intel.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dcb0593..0750409 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1273,10 +1273,6 @@ config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 	depends on ARCH_SPARSEMEM_ENABLE
 
-config ARCH_MEMORY_PROBE
-	def_bool X86_64
-	depends on MEMORY_HOTPLUG
-
 config ILLEGAL_POINTER_VALUE
        hex
        default 0 if X86_32
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 933442f..32c16a2 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -329,6 +329,9 @@ static int block_size_init(void)
  * will not need to do it from userspace.  The fake hot-add code
  * as well as ppc64 will do all of their discovery in userspace
  * and will require this interface.
+ *
+ * Parameter format 1: physical_address,numa_node
+ * Parameter format 2: physical_address=0x40000000 numa_node=3
  */
 #ifdef CONFIG_ARCH_MEMORY_PROBE
 static ssize_t
@@ -336,13 +339,53 @@ memory_probe_store(struct class *class, struct class_attribute *attr,
 		   const char *buf, size_t count)
 {
 	u64 phys_addr;
-	int nid;
+	int nid = 0;
 	int ret;
+	char *p = NULL, *q = NULL;
+	/* format: physical_address=0x40000000 numa_node=3 */
+	p = strchr(buf, '=');
+	if (p != NULL) {
+		*p = '\0';
+		q = strchr(buf, ' ');
+		if (q == NULL) {
+			if (strcmp(buf, "physical_address") != 0)
+				ret = -EPERM;
+			else
+				phys_addr = memparse(p+1, NULL);
+		} else {
+			*q++ = '\0';
+			p = strchr(q, '=');
+			if (strcmp(buf, "physical_address") == 0)
+				phys_addr = memparse(p+1, NULL);
+			if (strcmp(buf, "numa_node") == 0)
+				nid = simple_strtoul(p+1, NULL, 0);
+			if (strcmp(q, "physical_address") == 0)
+				phys_addr = memparse(p+1, NULL);
+			if (strcmp(q, "numa_node") == 0)
+				nid = simple_strtoul(p+1, NULL, 0);
+		}
+	} else { /* physical_address,numa_node */
+		p = strchr(buf, ',');
+		if (p != NULL && strlen(p+1) > 0) {
+			/* nid specified */
+			*p++ = '\0';
+			nid = simple_strtoul(p, NULL, 0);
+			phys_addr = memparse(buf, NULL);
+		} else {
+			phys_addr = memparse(buf, NULL);
+			nid = memory_add_physaddr_to_nid(phys_addr);
+		}
+	}
 
-	phys_addr = simple_strtoull(buf, NULL, 0);
-
-	nid = memory_add_physaddr_to_nid(phys_addr);
-	ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+	if (nid < 0 || nid > nr_node_ids - 1) {
+		printk(KERN_ERR "Invalid node id %d(0<=nid<%d).\n", nid, nr_node_ids);
+		ret = -EPERM;
+	} else {
+		printk(KERN_INFO "Add a memory section to node: %d.\n", nid);
+		ret = add_memory(nid, phys_addr, PAGES_PER_SECTION << PAGE_SHIFT);
+		if (ret)
+			count = ret;
+	}
 
 	if (ret)
 		count = ret;
diff --git a/mm/Kconfig b/mm/Kconfig
index 78bb447..a0b5c3d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -170,6 +170,17 @@ config ARCH_CPU_PROBE_RELEASE
 	  is for cpu hot-add/hot-remove to specified node in software method.
 	  This is for debuging and testing purpose
 
+config ARCH_MEMORY_PROBE
+	def_bool y
+	bool "Memory hotplug emulation"
+	depends on NUMA_HOTPLUG_EMU
+	---help---
+	  Enable memory hotplug emulation. Reserve memory with grub parameter
+	  "mem=N"(such as mem=1024M), where N is the initial memory size, the
+	  rest physical memory will be removed from e820 table; the memory probe
+	  interface is for memory hot-add to specified node in software method.
+	  This is for debugging and testing purposes.
+
 #
 # If we have space for more page flags then we can enable additional
 # optimizations and functionality.

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* [v2,8/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:45 UTC (permalink / raw)
  To: linux-hotplug

From: Shaohui Zheng <shaohui.zheng@intel.com>
Subject: Doc/x86_64: documentation of NUMA hotplug emulator

add a text file Documentation/x86/x86_64/numa_hotplug_emulator.txt
to explain the usage for the hotplug emulator.

Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
---
diff --git a/Documentation/x86/x86_64/numa_hotplug_emulator.txt b/Documentation/x86/x86_64/numa_hotplug_emulator.txt
new file mode 100644
index 0000000..c6bdb2c
--- /dev/null
+++ b/Documentation/x86/x86_64/numa_hotplug_emulator.txt
@@ -0,0 +1,87 @@
+NUMA Hotplug Emulator for x86
+---------------------------------------------------
+
+NUMA hotplug emulator is able to emulate NUMA Node Hotplug
+thru a pure software way. It intends to help people easily debug
+and test node/cpu/memory hotplug related stuff on a
+none-numa-hotplug-support machine, even a UMA machine and virtual
+environment.
+
+1) Node hotplug emulation:
+
+The emulator firstly hides RAM via E820 table, and then it can
+fake offlined nodes with the hidden RAM.
+
+After system bootup, user is able to hotplug-add these offlined
+nodes, which is just similar to a real hotplug hardware behavior.
+
+Using boot option "numa=hide=N*size" to fake offlined nodes:
+	- N is the number of hidden nodes
+	- size is the memory size (in MB) per hidden node.
+
+There is a sysfs entry "probe" under /sys/devices/system/node/ for user
+to hotplug the fake offlined nodes:
+
+ - to show all fake offlined nodes:
+    $ cat /sys/devices/system/node/probe
+
+ - to hotadd a fake offlined node, e.g. nodeid is N:
+    $ echo N > /sys/devices/system/node/probe
+
+2) CPU hotplug emulation:
+
+The emulator reserves CPUs through a grub parameter; the reserved CPUs can be
+hot-added/hot-removed in software, which emulates the process of physical
+cpu hotplug.
+
+ - to hide CPUs
+	- Using boot option "maxcpus=N" hide CPUs
+	  N is the number of initialize CPUs
+	- Using boot option "cpu_hpe=on" to enable cpu hotplug emulation
+      when cpu_hpe is enabled, the rest CPUs will not be initialized
+
+ - to hot-add CPU to node
+	$ echo nid > cpu/probe
+
+ - to hot-remove CPU
+	$ echo nid > cpu/release
+
+3) Memory hotplug emulation:
+
+The emulator reserves memory before the OS boots; the reserved memory region
+is removed from the e820 table and can be hot-added via the probe interface.
+This interface was extended to support adding memory to a specified node; it
+maintains backwards compatibility.
+
+The difficulty of Memory Release is well-known, we have no plan for it until now.
+
+ - reserve memory through a grub parameter
+ 	mem=1024m
+
+ - add a memory section to node 3
+    $ echo 0x40000000,3 > memory/probe
+	OR
+    $ echo 1024m,3 > memory/probe
+	OR
+    $ echo "physical_address=0x40000000 numa_node=3" > memory/probe
+
+4) Script for hotplug testing
+
+These scripts provides convenience when we hot-add memory/cpu in batch.
+
+- Online all pages:
+for m in /sys/devices/system/memory/memory*;
+do
+	echo online > $m/state;
+done
+
+- CPU Online:
+for c in /sys/devices/system/cpu/cpu*;
+do
+	echo 1 > $c/online;
+done
+
+- Haicheng Li <haicheng.li@linux.intel.com>
+- Shaohui Zheng <shaohui.zheng@intel.com>
+  May 2010
+

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* Re: [BUG 2.6.27-rc1] find_busiest_group() LOCKUP
From: Peter Zijlstra @ 2010-11-13 10:30 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: LKML, Ingo Molnar, Nikanth Karthikesan, Yinghai Lu,
	David Rientjes, Zheng, Shaohui, Andrew Morton, linux-hotplug
In-Reply-To: <20101113084018.GA23098@localhost>

On Sat, 2010-11-13 at 16:40 +0800, Wu Fengguang wrote:
> > Will try and figure out how the heck that's happening, Ingo any clue?
> 
> It's back to normal on 2.6.37-rc1 when reverting commit 50f2d7f682f9
> ("x86, numa: Assign CPUs to nodes in round-robin manner on fake NUMA").
> 
> The interesting part is, the commit was introduced in 
> 2.6.36-rc7..2.6.36, however 2.6.36 boots OK, while 2.6.37-rc1 panics.

Argh, that commit again..

Does this fix it: http://lkml.org/lkml/2010/11/12/8



^ permalink raw reply

* Re: [BUG 2.6.27-rc1] find_busiest_group() LOCKUP
From: Wu Fengguang @ 2010-11-13 12:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Nikanth Karthikesan, Yinghai Lu,
	David Rientjes, Zheng, Shaohui, Andrew Morton,
	linux-hotplug@vger.kernel.org, Eric Dumazet, Bjorn Helgaas,
	Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <1289644224.2084.521.camel@laptop>

On Sat, Nov 13, 2010 at 06:30:24PM +0800, Peter Zijlstra wrote:
> On Sat, 2010-11-13 at 16:40 +0800, Wu Fengguang wrote:
> > > Will try and figure out how the heck that's happening, Ingo any clue?
> > 
> > It's back to normal on 2.6.37-rc1 when reverting commit 50f2d7f682f9
> > ("x86, numa: Assign CPUs to nodes in round-robin manner on fake NUMA").
> > 
> > The interesting part is, the commit was introduced in 
> > 2.6.36-rc7..2.6.36, however 2.6.36 boots OK, while 2.6.37-rc1 panics.
> 
> Argh, that commit again..
> 
> Does this fix it: http://lkml.org/lkml/2010/11/12/8

No it still panics. Here is the dmesg.

Thanks,
Fengguang
---

[    0.000000] console [ttyS0] enabled, bootconsole disabled
[    0.000000] Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
[    0.000000] ... MAX_LOCKDEP_SUBCLASSES:  8
[    0.000000] ... MAX_LOCK_DEPTH:          48
[    0.000000] ... MAX_LOCKDEP_KEYS:        8191
[    0.000000] ... CLASSHASH_SIZE:          4096
[    0.000000] ... MAX_LOCKDEP_ENTRIES:     16384
[    0.000000] ... MAX_LOCKDEP_CHAINS:      32768
[    0.000000] ... CHAINHASH_SIZE:          16384
[    0.000000]  memory used by lock dependency info: 6367 kB
[    0.000000]  per task-struct memory footprint: 2688 bytes
[    0.000000] allocated 62914560 bytes of page_cgroup
[    0.000000] please try 'cgroup_disable=memory' option if you don't want memory cgroups
[    0.000000] ODEBUG: 15 of 15 active objects replaced
[    0.000000] hpet clockevent registered
[    0.001000] Fast TSC calibration using PIT
[    0.002000] Detected 2666.733 MHz processor.
[    0.000009] Calibrating delay loop (skipped), value calculated using timer frequency.. 5333.46 BogoMIPS (lpj=2666733)
[    0.010813] pid_max: default: 32768 minimum: 301
[    0.018252] Dentry cache hash table entries: 1048576 (order: 11, 8388608 bytes)
[    0.028528] Inode-cache hash table entries: 524288 (order: 10, 4194304 bytes)
[    0.036421] Mount-cache hash table entries: 256
[    0.041300] Initializing cgroup subsys debug
[    0.045664] Initializing cgroup subsys ns
[    0.049767] ns_cgroup deprecated: consider using the 'clone_children' flag without the ns_cgroup.
[    0.058788] Initializing cgroup subsys cpuacct
[    0.063328] Initializing cgroup subsys memory
[    0.067805] Initializing cgroup subsys devices
[    0.072340] Initializing cgroup subsys freezer
[    0.076910] CPU: Physical Processor ID: 0
[    0.081008] CPU: Processor Core ID: 0
[    0.084761] mce: CPU supports 9 MCE banks
[    0.088876] CPU0: Thermal monitoring enabled (TM1)
[    0.093767] using mwait in idle threads.
[    0.097777] Performance Events: PEBS fmt1+, Nehalem events, Intel PMU driver.
[    0.105138] ... version:                3
[    0.109239] ... bit width:              48
[    0.113423] ... generic registers:      4
[    0.117521] ... value mask:             0000ffffffffffff
[    0.122918] ... max period:             000000007fffffff
[    0.128319] ... fixed-purpose events:   3
[    0.132415] ... event mask:             000000070000000f
[    0.138807] ACPI: Core revision 20101013
[    0.162629] ftrace: allocating 24175 entries in 95 pages
[    0.177831] Setting APIC routing to flat
[    0.182351] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[    0.198414] CPU0: Genuine Intel(R) CPU             000  @ 2.67GHz stepping 04
[    0.312081] lockdep: fixing up alternatives.
[    0.317087] Booting Node   0, Processors  #1lockdep: fixing up alternatives.
[    0.416915]  #2lockdep: fixing up alternatives.
[    0.513688]  #3lockdep: fixing up alternatives.
[    0.610394]  #4lockdep: fixing up alternatives.
[    0.707133]  Ok.
[    0.709070] Booting Node   1, Processors  #5lockdep: fixing up alternatives.
[    0.808855]  Ok.
[    0.810787] Booting Node   0, Processors  #6lockdep: fixing up alternatives.
[    0.910602]  Ok.
[    0.912532] Booting Node   1, Processors  #7 Ok.
[    1.007347] Brought up 8 CPUs
[    1.010412] Total of 8 processors activated (42661.40 BogoMIPS).
[    1.016551] Testing NMI watchdog ... OK.
[    1.044508] CPU0 attaching sched-domain:
[    1.048524]  domain 0: span 0-3 level MC
[    1.052578]   groups: 0 1 2 3
[    1.055836]   domain 1: span 0-4,6 level CPU
[    1.060235]    groups: 0-3 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.066875] ERROR: repeated CPUs
[    1.070189]
[    1.071778] ERROR: groups don't span domain->span
[    1.076564]    domain 2: span 0-7 level NODE
[    1.080966]     groups: 0-4,6 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.087884] CPU1 attaching sched-domain:
[    1.091899]  domain 0: span 0-3 level MC
[    1.095957]   groups: 1 2 3 0
[    1.099201]   domain 1: span 0-4,6 level CPU
[    1.103608]    groups: 0-3 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.110273] ERROR: repeated CPUs
[    1.113594]
[    1.115177] ERROR: groups don't span domain->span
[    1.119966]    domain 2: span 0-7 level NODE
[    1.124371]     groups: 0-4,6 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.131280] CPU2 attaching sched-domain:
[    1.135295]  domain 0: span 0-3 level MC
[    1.139353]   groups: 2 3 0 1
[    1.142609]   domain 1: span 0-4,6 level CPU
[    1.147008]    groups: 0-3 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.153664] ERROR: repeated CPUs
[    1.156979]
[    1.158567] ERROR: groups don't span domain->span
[    1.163357]    domain 2: span 0-7 level NODE
[    1.167759]     groups: 0-4,6 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.174681] CPU3 attaching sched-domain:
[    1.178688]  domain 0: span 0-3 level MC
[    1.182746]   groups: 3 0 1 2
[    1.185997]   domain 1: span 0-4,6 level CPU
[    1.190400]    groups: 0-3 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.197059] ERROR: repeated CPUs
[    1.200377]
[    1.201959] ERROR: groups don't span domain->span
[    1.206747]    domain 2: span 0-7 level NODE
[    1.211140]     groups: 0-4,6 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.218050] CPU4 attaching sched-domain:
[    1.222055]  domain 0: span 4-7 level MC
[    1.226112]   groups: 4 5 6 7
[    1.229358] ERROR: parent span is not a superset of domain->span
[    1.235452]   domain 1: span 0-4,6 level CPU
[    1.239858] ERROR: domain->groups does not contain CPU4
[    1.245163]    groups: 5,7 (cpu_power = 4096)
[    1.249742] ERROR: groups don't span domain->span
[    1.254535]    domain 2: span 0-7 level NODE
[    1.258935]     groups: 0-4,6 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.265836] CPU5 attaching sched-domain:
[    1.269841]  domain 0: span 4-7 level MC
[    1.273899]   groups: 5 6 7 4
[    1.277142] ERROR: parent span is not a superset of domain->span
[    1.283227]   domain 1: span 5,7 level CPU
[    1.287458]    groups: 5,7 (cpu_power = 4096)
[    1.292026]    domain 2: span 0-7 level NODE
[    1.296429]     groups: 5,7 (cpu_power = 4096) 0-4,6 (cpu_power = 4096)
[    1.304915] CPU6 attaching sched-domain:
[    1.308922]  domain 0: span 4-7 level MC
[    1.312979]   groups: 6 7 4 5
[    1.316248] ERROR: parent span is not a superset of domain->span
[    1.322344]   domain 1: span 0-4,6 level CPU
[    1.326742] ERROR: domain->groups does not contain CPU6
[    1.332048]    groups: 5,7 (cpu_power = 4096)
[    1.336623] ERROR: groups don't span domain->span
[    1.341437]    domain 2: span 0-7 level NODE
[    1.345841]     groups: 0-4,6 (cpu_power = 4096) 5,7 (cpu_power = 4096)
[    1.352755] CPU7 attaching sched-domain:
[    1.356764]  domain 0: span 4-7 level MC
[    1.360820]   groups: 7 4 5 6
[    1.364078] ERROR: parent span is not a superset of domain->span
[    1.370165]   domain 1: span 5,7 level CPU
[    1.374398]    groups: 5,7 (cpu_power = 4096)
[    1.378964]    domain 2: span 0-7 level NODE
[    1.383372]     groups: 5,7 (cpu_power = 4096) 0-4,6 (cpu_power = 4096)
[    6.526802] BUG: NMI Watchdog detected LOCKUP on CPU0, ip ffffffff810a9dc1, registers:
[    6.534902] CPU 0
[    6.536767] Modules linked in:
[    6.540213]
[    6.541792] Pid: 1, comm: swapper Tainted: G        W   2.6.37-rc1+ #111 X8DTN/X8DTN
[    6.549675] RIP: 0010:[<ffffffff810a9dc1>]  [<ffffffff810a9dc1>] find_busiest_group+0x761/0x1480
[    6.558650] RSP: 0018:ffff8801b966d870  EFLAGS: 00000012
[    6.564039] RAX: 0000000000000000 RBX: ffff8801b966daec RCX: 0000000000000000
[    6.571245] RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff8800bac0e410
[    6.578455] RBP: ffff8801b966da30 R08: ffff8800bac0e410 R09: ffff8800bac0e400
[    6.585664] R10: 0000000000000003 R11: 0000000000000000 R12: 00000000001d2d00
[    6.592873] R13: 00000000001d2d00 R14: 00000000001d2d00 R15: 0000000000000008
[    6.600083] FS:  0000000000000000(0000) GS:ffff8800ba400000(0000) knlGS:0000000000000000
[    6.608312] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    6.614134] CR2: 0000000000000000 CR3: 0000000001ee1000 CR4: 00000000000006f0
[    6.621348] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    6.628558] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[    6.635767] Process swapper (pid: 1, threadinfo ffff8801b966c000, task ffff8800b3778000)
[    6.643994] Stack:
[    6.646095]  ffff8801b966d890 ffff8801b966d9d0 0000000000000007 ffff8801bfdd2d00
[    6.653793]  0000000000000000 00000000001d2d00 ffff8801b966dae0 00000002b966d910
[    6.661476]  ffff8801b966d801 ffffffff810929ed ffff8800ba40de48 00000000000b306a
[    6.669171] Call Trace:
[    6.671706]  [<ffffffff810929ed>] ? __phys_addr+0x5d/0x120
[    6.677270]  [<ffffffff810b2614>] load_balance+0xe4/0xcb0
[    6.682747]  [<ffffffff810b0b54>] ? dequeue_task_fair+0x1f4/0x250
[    6.688926]  [<ffffffff8199be5d>] schedule+0xb0d/0x14b0
[    6.694235]  [<ffffffff810cc60e>] ? __sysctl_head_next+0x19e/0x1a0
[    6.700499]  [<ffffffff8199d2dd>] schedule_timeout+0x50d/0x570
[    6.706409]  [<ffffffff8110b9bc>] ? print_lock_contention_bug+0x2c/0x110
[    6.713187]  [<ffffffff810af7a1>] ? get_parent_ip+0x11/0x90
[    6.718843]  [<ffffffff819a7cbd>] ? sub_preempt_count+0x12d/0x1f0
[    6.725020]  [<ffffffff8199b10b>] wait_for_common+0x16b/0x290
[    6.730853]  [<ffffffff810b4950>] ? default_wake_function+0x0/0x20
[    6.737113]  [<ffffffff8199b34d>] wait_for_completion+0x1d/0x20
[    6.743112]  [<ffffffff810efdfb>] kthread_create+0x9b/0x150
[    6.748764]  [<ffffffff810e8310>] ? rescuer_thread+0x0/0x2a0
[    6.754506]  [<ffffffff81202078>] ? __kmalloc_node+0x2b8/0x340
[    6.760419]  [<ffffffff810e7d5a>] __alloc_workqueue_key+0x27a/0x830
[    6.766765]  [<ffffffff8263b23f>] cpuset_init_smp+0x56/0x8c
[    6.772417]  [<ffffffff8261d148>] kernel_init+0x17a/0x27c
[    6.777899]  [<ffffffff81051a24>] kernel_thread_helper+0x4/0x10
[    6.783899]  [<ffffffff819a2c14>] ? restore_args+0x0/0x30
[    6.789377]  [<ffffffff8261cfce>] ? kernel_init+0x0/0x27c
[    6.794859]  [<ffffffff81051a20>] ? kernel_thread_helper+0x0/0x10
[    6.801028] Code: ff 8b 42 08 48 05 00 02 00 00 48 c1 f8 0a 48 85 c0 48 89 45 c0 0f 94 c0 0f b6 c0 48 63 d0 48 83 c2 02 48 83 04 d5 58 21 09 82 01 <85> c0 0f 84 07 02 00 00 48 8b bd a8 fe ff ff 31 d2 83 7f 50 01
[    6.822637] ---[ end trace 4eaa2a86a8e2da23 ]---
[    6.827330] Kernel panic - not syncing: Non maskable interrupt
[    6.833236] Pid: 1, comm: swapper Tainted: G      D W   2.6.37-rc1+ #111
[    6.840018] Call Trace:
[    6.842548]  <NMI>  [<ffffffff810a9dc1>] ? find_busiest_group+0x761/0x1480
[    6.849539]  [<ffffffff8199acb0>] panic+0xb1/0x222
[    6.854414]  [<ffffffff810a9dc1>] ? find_busiest_group+0x761/0x1480
[    6.860763]  [<ffffffff819a4403>] die_nmi+0x153/0x180
[    6.865895]  [<ffffffff819a5049>] nmi_watchdog_tick+0x219/0x270
[    6.871902]  [<ffffffff819a38fa>] do_nmi+0x2fa/0x490
[    6.876955]  [<ffffffff819a3170>] nmi+0x20/0x39
[    6.881566]  [<ffffffff810a9dc1>] ? find_busiest_group+0x761/0x1480
[    6.887916]  <<EOE>>  [<ffffffff810929ed>] ? __phys_addr+0x5d/0x120
[    6.894301]  [<ffffffff810b2614>] load_balance+0xe4/0xcb0
[    6.899783]  [<ffffffff810b0b54>] ? dequeue_task_fair+0x1f4/0x250
[    6.905960]  [<ffffffff8199be5d>] schedule+0xb0d/0x14b0
[    6.911271]  [<ffffffff810cc60e>] ? __sysctl_head_next+0x19e/0x1a0
[    6.917533]  [<ffffffff8199d2dd>] schedule_timeout+0x50d/0x570
[    6.923443]  [<ffffffff8110b9bc>] ? print_lock_contention_bug+0x2c/0x110
[    6.930222]  [<ffffffff810af7a1>] ? get_parent_ip+0x11/0x90
[    6.935872]  [<ffffffff819a7cbd>] ? sub_preempt_count+0x12d/0x1f0
[    6.942051]  [<ffffffff8199b10b>] wait_for_common+0x16b/0x290
[    6.947881]  [<ffffffff810b4950>] ? default_wake_function+0x0/0x20
[    6.954140]  [<ffffffff8199b34d>] wait_for_completion+0x1d/0x20
[    6.960140]  [<ffffffff810efdfb>] kthread_create+0x9b/0x150
[    6.965792]  [<ffffffff810e8310>] ? rescuer_thread+0x0/0x2a0
[    6.971533]  [<ffffffff81202078>] ? __kmalloc_node+0x2b8/0x340
[    6.977445]  [<ffffffff810e7d5a>] __alloc_workqueue_key+0x27a/0x830
[    6.983793]  [<ffffffff8263b23f>] cpuset_init_smp+0x56/0x8c
[    6.989443]  [<ffffffff8261d148>] kernel_init+0x17a/0x27c
[    6.994924]  [<ffffffff81051a24>] kernel_thread_helper+0x4/0x10
[    7.000924]  [<ffffffff819a2c14>] ? restore_args+0x0/0x30
[    7.006402]  [<ffffffff8261cfce>] ? kernel_init+0x0/0x27c
[    7.011883]  [<ffffffff81051a20>] ? kernel_thread_helper+0x0/0x10
[    8.097122] Rebooting in 10 seconds..

^ permalink raw reply

* Re: [BUG 2.6.27-rc1] find_busiest_group() LOCKUP
From: Peter Zijlstra @ 2010-11-13 12:57 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: LKML, Ingo Molnar, Nikanth Karthikesan, Yinghai Lu,
	David Rientjes, Zheng, Shaohui, Andrew Morton,
	linux-hotplug@vger.kernel.org, Eric Dumazet, Bjorn Helgaas,
	Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <20101113120030.GA31517@localhost>

On Sat, 2010-11-13 at 20:00 +0800, Wu Fengguang wrote:
> On Sat, Nov 13, 2010 at 06:30:24PM +0800, Peter Zijlstra wrote:
> > On Sat, 2010-11-13 at 16:40 +0800, Wu Fengguang wrote:
> > > > Will try and figure out how the heck that's happening, Ingo any clue?
> > > 
> > > It's back to normal on 2.6.37-rc1 when reverting commit 50f2d7f682f9
> > > ("x86, numa: Assign CPUs to nodes in round-robin manner on fake NUMA").
> > > 
> > > The interesting part is, the commit was introduced in 
> > > 2.6.36-rc7..2.6.36, however 2.6.36 boots OK, while 2.6.37-rc1 panics.
> > 
> > Argh, that commit again..
> > 
> > Does this fix it: http://lkml.org/lkml/2010/11/12/8
> 
> No it still panics. Here is the dmesg.

OK, I'll let Nikanth have a look, if all else fails we can always revert
that patch.

^ permalink raw reply

* Re: [BUG 2.6.27-rc1] find_busiest_group() LOCKUP
From: Wu Fengguang @ 2010-11-13 13:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, Ingo Molnar, Nikanth Karthikesan, Yinghai Lu,
	David Rientjes, Zheng, Shaohui, Andrew Morton,
	linux-hotplug@vger.kernel.org, Eric Dumazet, Bjorn Helgaas,
	Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <1289653078.2084.675.camel@laptop>

On Sat, Nov 13, 2010 at 08:57:58PM +0800, Peter Zijlstra wrote:
> On Sat, 2010-11-13 at 20:00 +0800, Wu Fengguang wrote:
> > On Sat, Nov 13, 2010 at 06:30:24PM +0800, Peter Zijlstra wrote:
> > > On Sat, 2010-11-13 at 16:40 +0800, Wu Fengguang wrote:
> > > > > Will try and figure out how the heck that's happening, Ingo any clue?
> > > > 
> > > > It's back to normal on 2.6.37-rc1 when reverting commit 50f2d7f682f9
> > > > ("x86, numa: Assign CPUs to nodes in round-robin manner on fake NUMA").
> > > > 
> > > > The interesting part is, the commit was introduced in 
> > > > 2.6.36-rc7..2.6.36, however 2.6.36 boots OK, while 2.6.37-rc1 panics.
> > > 
> > > Argh, that commit again..
> > > 
> > > Does this fix it: http://lkml.org/lkml/2010/11/12/8
> > 
> > No it still panics. Here is the dmesg.
> 
> OK, I'll let Nikanth have a look, if all else fails we can always
> revert that patch.

It's the same bug.

Just tried another machine, I get the same divide error.  The patch
posted in lkml/2010/11/12/8 does not fix it. But after reverting
commit 50f2d7f682f9, it boots OK.

Thanks,
Fengguang
---
PS. dmesg with divide error

[    0.000000] console [ttyS0] enabled, bootconsole disabled
[    0.000000] Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
[    0.000000] ... MAX_LOCKDEP_SUBCLASSES:  8
[    0.000000] ... MAX_LOCK_DEPTH:          48
[    0.000000] ... MAX_LOCKDEP_KEYS:        8191
[    0.000000] ... CLASSHASH_SIZE:          4096
[    0.000000] ... MAX_LOCKDEP_ENTRIES:     16384
[    0.000000] ... MAX_LOCKDEP_CHAINS:      32768
[    0.000000] ... CHAINHASH_SIZE:          16384
[    0.000000]  memory used by lock dependency info: 6367 kB
[    0.000000]  per task-struct memory footprint: 2688 bytes
[    0.000000] allocated 167772160 bytes of page_cgroup
[    0.000000] please try 'cgroup_disable=memory' option if you don't want memory cgroups
[    0.000000] ODEBUG: 15 of 15 active objects replaced
[    0.000000] hpet clockevent registered
[    0.001000] Fast TSC calibration using PIT
[    0.002000] Detected 2800.469 MHz processor.
[    0.000010] Calibrating delay loop (skipped), value calculated using timer frequency.. 5600.93 BogoMIPS (lpj=2800469)
[    0.010818] pid_max: default: 32768 minimum: 301
[    0.021745] Dentry cache hash table entries: 2097152 (order: 12, 16777216 bytes)
[    0.035657] Inode-cache hash table entries: 1048576 (order: 11, 8388608 bytes)
[    0.044553] Mount-cache hash table entries: 256
[    0.049469] Initializing cgroup subsys debug
[    0.053834] Initializing cgroup subsys ns
[    0.057940] ns_cgroup deprecated: consider using the 'clone_children' flag without the ns_cgroup.
[    0.066968] Initializing cgroup subsys cpuacct
[    0.071511] Initializing cgroup subsys memory
[    0.075988] Initializing cgroup subsys devices
[    0.080527] Initializing cgroup subsys freezer
[    0.085107] CPU: Physical Processor ID: 0
[    0.089209] CPU: Processor Core ID: 0
[    0.092974] mce: CPU supports 9 MCE banks
[    0.097095] CPU0: Thermal monitoring enabled (TM1)
[    0.101990] using mwait in idle threads.
[    0.106006] Performance Events: PEBS fmt1+, Westmere events, Intel PMU driver.
[    0.113535] ... version:                3
[    0.117641] ... bit width:              48
[    0.121828] ... generic registers:      4
[    0.125926] ... value mask:             0000ffffffffffff
[    0.131328] ... max period:             000000007fffffff
[    0.136734] ... fixed-purpose events:   3
[    0.140839] ... event mask:             000000070000000f
[    0.147297] ACPI: Core revision 20101013
[    0.175646] ftrace: allocating 24175 entries in 95 pages
[    0.190912] Setting APIC routing to flat
[    0.195562] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[    0.211643] CPU0: Intel(R) Xeon(R) CPU           X5660  @ 2.80GHz stepping 01
[    0.325243] lockdep: fixing up alternatives.
[    0.330242] Booting Node   0, Processors  #1lockdep: fixing up alternatives.
[    0.430140]  #2lockdep: fixing up alternatives.
[    0.526962]  #3lockdep: fixing up alternatives.
[    0.623755]  #4lockdep: fixing up alternatives.
[    0.720588]  Ok.
[    0.722525] Booting Node   1, Processors  #5lockdep: fixing up alternatives.
[    0.822389]  Ok.
[    0.824327] Booting Node   0, Processors  #6
[    0.919089] TSC synchronization [CPU#0 -> CPU#6]:
[    0.924155] Measured 296 cycles TSC warp between CPUs, turning off TSC clock.
[    0.003999] Marking TSC unstable due to check_tsc_sync_source failed
[    0.557048] lockdep: fixing up alternatives.
[    0.558041]  Ok.
[    0.559004] Booting Node   1, Processors  #7 Ok.
[    0.632157] Brought up 8 CPUs
[    0.633006] Total of 8 processors activated (44799.46 BogoMIPS).
[    0.634048] Testing NMI watchdog ... OK.
[    0.658054] divide error: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[    0.658999] last sysfs file:
[    0.658999] CPU 0
[    0.658999] Modules linked in:
[    0.658999]
[    0.658999] Pid: 1, comm: swapper Tainted: G        W   2.6.37-rc1+ #111 X8DTN/X8DTN
[    0.658999] RIP: 0010:[<ffffffff810a9d18>]  [<ffffffff810a9d18>] find_busiest_group+0x6b8/0x1480
[    0.658999] RSP: 0018:ffff88022f965870  EFLAGS: 00010006
[    0.658999] RAX: 0000000000100000 RBX: ffff88022f965aec RCX: 0000000000000000
[    0.658999] RDX: 0000000000000000 RSI: 0000000000000400 RDI: 0000000000000008
[    0.658999] RBP: ffff88022f965a30 R08: ffff88022fa00278 R09: ffff88022fa00268
[    0.658999] R10: 0000000000000003 R11: 0000000000000001 R12: 00000000001d2d00
[    0.658999] R13: 00000000001d2d00 R14: 00000000001d2d00 R15: 0000000000000008
[    0.658999] FS:  0000000000000000(0000) GS:ffff8800bc400000(0000) knlGS:0000000000000000
[    0.658999] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    0.658999] CR2: 0000000000000000 CR3: 0000000001ee1000 CR4: 00000000000006f0
[    0.658999] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    0.658999] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[    0.658999] Process swapper (pid: 1, threadinfo ffff88022f964000, task ffff88042f45c000)
[    0.658999] Stack:
[    0.658999]  ffff88022f965890 ffff88022f9659d0 0000000000000006 ffff8800bcfd2d00
[    0.658999]  0000000000000000 00000000001d2d00 ffff88022f965ae0 000000022f965910
[    0.658999]  ffff88022f965801 ffffffff810929ed ffff8800bc40de48 000000000042f47c
[    0.658999] Call Trace:
[    0.658999]  [<ffffffff810929ed>] ? __phys_addr+0x5d/0x120
[    0.658999]  [<ffffffff810b2614>] load_balance+0xe4/0xcb0
[    0.658999]  [<ffffffff810b0b54>] ? dequeue_task_fair+0x1f4/0x250
[    0.658999]  [<ffffffff8199be5d>] schedule+0xb0d/0x14b0
[    0.658999]  [<ffffffff810cc60e>] ? __sysctl_head_next+0x19e/0x1a0
[    0.658999]  [<ffffffff8199d2dd>] schedule_timeout+0x50d/0x570
[    0.658999]  [<ffffffff8110b9bc>] ? print_lock_contention_bug+0x2c/0x110
[    0.658999]  [<ffffffff810af7a1>] ? get_parent_ip+0x11/0x90
[    0.658999]  [<ffffffff819a7cbd>] ? sub_preempt_count+0x12d/0x1f0
[    0.658999]  [<ffffffff8199b10b>] wait_for_common+0x16b/0x290
[    0.658999]  [<ffffffff810b4950>] ? default_wake_function+0x0/0x20
[    0.658999]  [<ffffffff8199b34d>] wait_for_completion+0x1d/0x20
[    0.658999]  [<ffffffff810efdfb>] kthread_create+0x9b/0x150
[    0.658999]  [<ffffffff810e8310>] ? rescuer_thread+0x0/0x2a0
[    0.658999]  [<ffffffff81202078>] ? __kmalloc_node+0x2b8/0x340
[    0.658999]  [<ffffffff810e7d5a>] __alloc_workqueue_key+0x27a/0x830
[    0.658999]  [<ffffffff8263b23f>] cpuset_init_smp+0x56/0x8c
[    0.658999]  [<ffffffff8261d148>] kernel_init+0x17a/0x27c
[    0.658999]  [<ffffffff81051a24>] kernel_thread_helper+0x4/0x10
[    0.658999]  [<ffffffff819a2c14>] ? restore_args+0x0/0x30
[    0.658999]  [<ffffffff8261cfce>] ? kernel_init+0x0/0x27c
[    0.658999]  [<ffffffff81051a20>] ? kernel_thread_helper+0x0/0x10
[    0.658999] Code: 04 f5 20 87 43 82 48 89 94 07 80 08 00 00 41 89 4f 08 90 4c 8b 8d e0 fe ff ff 48 8b 75 a8 31 d2 41 8b 49 08 48 89 f0 48 c1 e0 0a <48> f7 f1 48 8b 4d b0 31 d2 48 85 c9 0f 95 c2 48 89 45 a0 48 63
[    0.658999] RIP  [<ffffffff810a9d18>] find_busiest_group+0x6b8/0x1480
[    0.658999]  RSP <ffff88022f965870>
[    0.658999] ---[ end trace 4eaa2a86a8e2da23 ]---
[    0.658999] divide error: 0000 [#2] PREEMPT SMP DEBUG_PAGEALLOC
[    0.658999] last sysfs file:
[    0.658999] CPU 1
[    0.658999] Modules linked in:
[    0.658999]
[    0.658999] Pid: 2, comm: kthreadd Tainted: G      D W   2.6.37-rc1+ #111 X8DTN/X8DTN
[    0.658999] RIP: 0010:[<ffffffff810a3321>]  [<ffffffff810a3321>] select_task_rq_fair+0x691/0x9a0
[    0.658999] RSP: 0000:ffff88022f967c30  EFLAGS: 00010002
[    0.658999] RAX: 0000000000100000 RBX: 0000000000000400 RCX: 0000000000000000
[    0.658999] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 0000000000000008
[    0.658999] RBP: ffff88022f967cf0 R08: ffff88022fa00278 R09: 0000000000000000
[    0.658999] R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001
[    0.658999] R13: ffff88022fa00278 R14: ffff88022fa00268 R15: 0000000000000003
[    0.658999] FS:  0000000000000000(0000) GS:ffff8800bc600000(0000) knlGS:0000000000000000
[    0.658999] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    0.658999] CR2: 0000000000000000 CR3: 0000000001ee1000 CR4: 00000000000006e0
[    0.658999] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    0.658999] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[    0.658999] Process kthreadd (pid: 2, threadinfo ffff88022f966000, task ffff88022f968000)
[    0.658999] Stack:
[    0.658999]  000000002f967c50 ffff8800bc7d2d18 ffff880200000006 00000000001d2d00
[    0.658999]  00000000001d2d00 0000000000000000 000000000000007d 0000000000000000
[    0.658999]  0000000800000001 ffff88022faa41b0 00000001810a59ea 0000000000000000
[    0.658999] Call Trace:
[    0.658999]  [<ffffffff810b4a01>] wake_up_new_task+0x51/0x2d0
[    0.658999]  [<ffffffff810eb83c>] ? __task_pid_nr_ns+0x10c/0x130
[    0.658999]  [<ffffffff810eb730>] ? __task_pid_nr_ns+0x0/0x130
[    0.658999]  [<ffffffff810bb0e3>] do_fork+0x693/0x7b0
[    0.658999]  [<ffffffff819a22e8>] ? _raw_spin_unlock_irq+0x68/0x90
[    0.658999]  [<ffffffff810a7a98>] ? finish_task_switch+0x118/0x1d0
[    0.658999]  [<ffffffff810af7a1>] ? get_parent_ip+0x11/0x90
[    0.658999]  [<ffffffff819a7cbd>] ? sub_preempt_count+0x12d/0x1f0
[    0.658999]  [<ffffffff8105c276>] kernel_thread+0x76/0x80
[    0.658999]  [<ffffffff810ef9a0>] ? kthread+0x0/0xd0
[    0.658999]  [<ffffffff81051a20>] ? kernel_thread_helper+0x0/0x10
[    0.658999]  [<ffffffff810f0006>] kthreadd+0x136/0x1a0
[    0.658999]  [<ffffffff8110d629>] ? trace_hardirqs_on_caller+0x29/0x210
[    0.658999]  [<ffffffff81051a24>] kernel_thread_helper+0x4/0x10
[    0.658999]  [<ffffffff819a2c14>] ? restore_args+0x0/0x30
[    0.658999]  [<ffffffff810efed0>] ? kthreadd+0x0/0x1a0
[    0.658999]  [<ffffffff81051a20>] ? kernel_thread_helper+0x0/0x10
[    0.658999] Code: 95 50 ff ff ff eb 99 0f 1f 00 45 89 f4 4c 8b 75 b0 48 89 d8 48 c1 e0 0a 31 d2 49 83 c7 02 41 8b 4e 08 4a 83 04 fd d8 ec 08 82 01 <48> f7 f1 45 85 e4 0f 85 33 01 00 00 31 d2 48 3b 45 a0 0f 92 c2
[    0.658999] RIP  [<ffffffff810a3321>] select_task_rq_fair+0x691/0x9a0
[    0.658999]  RSP <ffff88022f967c30>
[    0.658999] ---[ end trace 4eaa2a86a8e2da24 ]---
[    0.658999] note: kthreadd[2] exited with preempt_count 2
[    0.658999] note: swapper[1] exited with preempt_count 1
[    0.659015] swapper used greatest stack depth: 3680 bytes left
[    0.660011] Kernel panic - not syncing: Attempted to kill init!
[    0.661008] Pid: 1, comm: swapper Tainted: G      D W   2.6.37-rc1+ #111
[    0.662005] Call Trace:
[    0.663012]  [<ffffffff8199acb0>] panic+0xb1/0x222
[    0.664011]  [<ffffffff810c2ff0>] do_exit+0xd10/0xdb0
[    0.665009]  [<ffffffff819a2398>] ? _raw_spin_unlock_irqrestore+0x88/0xd0
[    0.666011]  [<ffffffff819a414c>] oops_end+0x10c/0x150
[    0.667011]  [<ffffffff8105623a>] die+0x8a/0xc0
[    0.668011]  [<ffffffff819a337c>] do_trap+0x11c/0x1c0
[    0.669011]  [<ffffffff81051bee>] do_divide_error+0xbe/0xe0
[    0.670011]  [<ffffffff810a9d18>] ? find_busiest_group+0x6b8/0x1480
[    0.671011]  [<ffffffff8110ae39>] ? trace_hardirqs_off_caller+0x29/0x150
[    0.672009]  [<ffffffff819a1028>] ? trace_hardirqs_off_thunk+0x3a/0x3c
[    0.673017]  [<ffffffff819a2c44>] ? irq_return+0x0/0xc
[    0.674012]  [<ffffffff8105183b>] divide_error+0x1b/0x20
[    0.675013]  [<ffffffff810a9d18>] ? find_busiest_group+0x6b8/0x1480
[    0.676013]  [<ffffffff810929ed>] ? __phys_addr+0x5d/0x120
[    0.677018]  [<ffffffff810b2614>] load_balance+0xe4/0xcb0
[    0.678012]  [<ffffffff810b0b54>] ? dequeue_task_fair+0x1f4/0x250
[    0.679015]  [<ffffffff8199be5d>] schedule+0xb0d/0x14b0
[    0.680009]  [<ffffffff810cc60e>] ? __sysctl_head_next+0x19e/0x1a0
[    0.681015]  [<ffffffff8199d2dd>] schedule_timeout+0x50d/0x570
[    0.682009]  [<ffffffff8110b9bc>] ? print_lock_contention_bug+0x2c/0x110
[    0.683012]  [<ffffffff810af7a1>] ? get_parent_ip+0x11/0x90
[    0.684009]  [<ffffffff819a7cbd>] ? sub_preempt_count+0x12d/0x1f0
[    0.685010]  [<ffffffff8199b10b>] wait_for_common+0x16b/0x290
[    0.686010]  [<ffffffff810b4950>] ? default_wake_function+0x0/0x20
[    0.687012]  [<ffffffff8199b34d>] wait_for_completion+0x1d/0x20
[    0.688009]  [<ffffffff810efdfb>] kthread_create+0x9b/0x150
[    0.689008]  [<ffffffff810e8310>] ? rescuer_thread+0x0/0x2a0
[    0.690012]  [<ffffffff81202078>] ? __kmalloc_node+0x2b8/0x340
[    0.691021]  [<ffffffff810e7d5a>] __alloc_workqueue_key+0x27a/0x830
[    0.692012]  [<ffffffff8263b23f>] cpuset_init_smp+0x56/0x8c
[    0.693010]  [<ffffffff8261d148>] kernel_init+0x17a/0x27c
[    0.694009]  [<ffffffff81051a24>] kernel_thread_helper+0x4/0x10
[    0.695012]  [<ffffffff819a2c14>] ? restore_args+0x0/0x30
[    0.696010]  [<ffffffff8261cfce>] ? kernel_init+0x0/0x27c
[    0.697009]  [<ffffffff81051a20>] ? kernel_thread_helper+0x0/0x10
[    2.074478] Rebooting in 10 seconds..


^ permalink raw reply

* Re: [v2, 0/8] NUMA Hotplug emulator
From: Greg KH @ 2010-11-13 14:42 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <20101113053714.GA32501@shaohui>

On Sat, Nov 13, 2010 at 01:37:14PM +0800, Shaohui Zheng wrote:
> Hi, All
> 
> 	This patchset introduces NUMA hotplug emulator for x86. we already sent out
> an early version in LKML (http ://lwn.net/Articles/387571/). This is 4th version
> in internal, and 2nd time to sent to LKML. 

Please send this to lkml, the linux-hotplug mailing list is primarily
for userspace hotplug tools (like udev) and not for kernel stuff like
this where you want the core kernel developers to review your patches.

Please use the script, scripts/get_maintainer.pl to determine who best
to send your patches to, it will tell you the mailing list as well as
the people involved.

good luck,

greg k-h

^ permalink raw reply

* Re: Touchpad toggle mess
From: Julien Cristau @ 2010-11-13 18:10 UTC (permalink / raw)
  To: Bastien Nocera; +Cc: linux-input, linux-hotplug, Martin Pitt, xorg
In-Reply-To: <1289059101.5418.93.camel@novo.hadess.net>

On Sat, Nov  6, 2010 at 15:58:21 +0000, Bastien Nocera wrote:

> On Sat, 2010-11-06 at 11:30 -0400, Martin Pitt wrote:
> > Hello Bastien,
> > 
> > Bastien Nocera [2010-11-05  0:00 +0000]:
> > > The patches in https://bugs.freedesktop.org/show_bug.cgi?id=31300
> > > implement this for the kernel, and for X.org.
> > > 
> > > Then we have the problem that udev's keymaps seem to use different
> > > function keys depending on the hardware [1], when X.org (because of the
> > > limitations of XKB) standardised on F22 for "XF86TouchpadToggle".
> > 
> > Thanks for bringing this up, indeed there's some cleanup in order
> > here.
> > 
> > > So we'll need to standardise on the keys used. I selected F21 for
> > > XF86TouchpadToggle, F22 for XF86TouchpadOn and F23 for XF86TouchpadOff.
> > > See the patch in:
> > > https://bugs.freedesktop.org/show_bug.cgi?id=31333
> > > 
> > > The remaining fixes would need to be in udev's keymaps, to set hardware
> > > handled keys to F22 and F23, and software ones to F21. Patch is attached
> > > for that.
> > 
> > Indeed the maps which don't currently use f22 should be fixed,
> > regardless of what happens in X/kernel/userspace. My main concern is
> > that F22 has acted as touchpad toggle key for years
> 
> No, for about a year:
> http://cgit.freedesktop.org/xkeyboard-config/commit/?id=1d05eda8dfc706d6450cab5883120e0d5e1100c0
> 
That seems long enough that moving touchpad toggle from f22 to something
else for no particular reason will break stuff...

Cheers,
Julien

^ permalink raw reply

* Re: [BUG 2.6.27-rc1] find_busiest_group() LOCKUP
From: Yinghai Lu @ 2010-11-13 19:12 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Peter Zijlstra, LKML, Ingo Molnar, Nikanth Karthikesan,
	David Rientjes, Zheng, Shaohui, Andrew Morton,
	linux-hotplug@vger.kernel.org, Eric Dumazet, Bjorn Helgaas,
	Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <20101113131042.GA5522@localhost>

On 11/13/2010 05:10 AM, Wu Fengguang wrote:
> On Sat, Nov 13, 2010 at 08:57:58PM +0800, Peter Zijlstra wrote:
>> On Sat, 2010-11-13 at 20:00 +0800, Wu Fengguang wrote:
>>> On Sat, Nov 13, 2010 at 06:30:24PM +0800, Peter Zijlstra wrote:
>>>> On Sat, 2010-11-13 at 16:40 +0800, Wu Fengguang wrote:
>>>>>> Will try and figure out how the heck that's happening, Ingo any clue?
>>>>>
>>>>> It's back to normal on 2.6.37-rc1 when reverting commit 50f2d7f682f9
>>>>> ("x86, numa: Assign CPUs to nodes in round-robin manner on fake NUMA").
>>>>>
>>>>> The interesting part is, the commit was introduced in 
>>>>> 2.6.36-rc7..2.6.36, however 2.6.36 boots OK, while 2.6.37-rc1 panics.
>>>>
>>>> Argh, that commit again..
>>>>
>>>> Does this fix it: http://lkml.org/lkml/2010/11/12/8
>>>
>>> No it still panics. Here is the dmesg.
>>
>> OK, I'll let Nikanth have a look, if all else fails we can always
>> revert that patch.
> 
> It's the same bug.
> 
> Just tried another machine, I get the same divide error.  The patch
> posted in lkml/2010/11/12/8 does not fix it. But after reverting
> commit 50f2d7f682f9, it boots OK.
> 
> Thanks,
> Fengguang
> ---
> PS. dmesg with divide error
> 
> [    0.000000] console [ttyS0] enabled, bootconsole disabled
> [    0.000000] Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
> [    0.000000] ... MAX_LOCKDEP_SUBCLASSES:  8
> [    0.000000] ... MAX_LOCK_DEPTH:          48
> [    0.000000] ... MAX_LOCKDEP_KEYS:        8191
> [    0.000000] ... CLASSHASH_SIZE:          4096
> [    0.000000] ... MAX_LOCKDEP_ENTRIES:     16384
> [    0.000000] ... MAX_LOCKDEP_CHAINS:      32768
> [    0.000000] ... CHAINHASH_SIZE:          16384
> [    0.000000]  memory used by lock dependency info: 6367 kB
> [    0.000000]  per task-struct memory footprint: 2688 bytes
> [    0.000000] allocated 167772160 bytes of page_cgroup
> [    0.000000] please try 'cgroup_disable=memory' option if you don't want memory cgroups
> [    0.000000] ODEBUG: 15 of 15 active objects replaced
> [    0.000000] hpet clockevent registered
> [    0.001000] Fast TSC calibration using PIT
> [    0.002000] Detected 2800.469 MHz processor.
> [    0.000010] Calibrating delay loop (skipped), value calculated using timer frequency.. 5600.93 BogoMIPS (lpj=2800469)
> [    0.010818] pid_max: default: 32768 minimum: 301
> [    0.021745] Dentry cache hash table entries: 2097152 (order: 12, 16777216 bytes)
> [    0.035657] Inode-cache hash table entries: 1048576 (order: 11, 8388608 bytes)
> [    0.044553] Mount-cache hash table entries: 256
> [    0.049469] Initializing cgroup subsys debug
> [    0.053834] Initializing cgroup subsys ns
> [    0.057940] ns_cgroup deprecated: consider using the 'clone_children' flag without the ns_cgroup.
> [    0.066968] Initializing cgroup subsys cpuacct
> [    0.071511] Initializing cgroup subsys memory
> [    0.075988] Initializing cgroup subsys devices
> [    0.080527] Initializing cgroup subsys freezer
> [    0.085107] CPU: Physical Processor ID: 0
> [    0.089209] CPU: Processor Core ID: 0
> [    0.092974] mce: CPU supports 9 MCE banks
> [    0.097095] CPU0: Thermal monitoring enabled (TM1)
> [    0.101990] using mwait in idle threads.
> [    0.106006] Performance Events: PEBS fmt1+, Westmere events, Intel PMU driver.
> [    0.113535] ... version:                3
> [    0.117641] ... bit width:              48
> [    0.121828] ... generic registers:      4
> [    0.125926] ... value mask:             0000ffffffffffff
> [    0.131328] ... max period:             000000007fffffff
> [    0.136734] ... fixed-purpose events:   3
> [    0.140839] ... event mask:             000000070000000f
> [    0.147297] ACPI: Core revision 20101013
> [    0.175646] ftrace: allocating 24175 entries in 95 pages
> [    0.190912] Setting APIC routing to flat
> [    0.195562] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
> [    0.211643] CPU0: Intel(R) Xeon(R) CPU           X5660  @ 2.80GHz stepping 01
> [    0.325243] lockdep: fixing up alternatives.
> [    0.330242] Booting Node   0, Processors  #1lockdep: fixing up alternatives.
> [    0.430140]  #2lockdep: fixing up alternatives.
> [    0.526962]  #3lockdep: fixing up alternatives.
> [    0.623755]  #4lockdep: fixing up alternatives.
> [    0.720588]  Ok.
> [    0.722525] Booting Node   1, Processors  #5lockdep: fixing up alternatives.
> [    0.822389]  Ok.
> [    0.824327] Booting Node   0, Processors  #6
> [    0.919089] TSC synchronization [CPU#0 -> CPU#6]:
> [    0.924155] Measured 296 cycles TSC warp between CPUs, turning off TSC clock.
> [    0.003999] Marking TSC unstable due to check_tsc_sync_source failed
> [    0.557048] lockdep: fixing up alternatives.
> [    0.558041]  Ok.
> [    0.559004] Booting Node   1, Processors  #7 Ok.
> [    0.632157] Brought up 8 CPUs
> [    0.633006] Total of 8 processors activated (44799.46 BogoMIPS).

assume that when you have 
CONFIG_NR_CPUS=16
instead of
CONFIG_NR_CPUS=8

it will boot ok?

Thanks

	Yinghai

^ permalink raw reply

* Re: [BUG 2.6.27-rc1] find_busiest_group() LOCKUP
From: Peter Zijlstra @ 2010-11-13 19:41 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Wu Fengguang, LKML, Ingo Molnar, Nikanth Karthikesan,
	David Rientjes, Zheng, Shaohui, Andrew Morton,
	linux-hotplug@vger.kernel.org, Eric Dumazet, Bjorn Helgaas,
	Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <4CDEE314.6090107@kernel.org>

On Sat, 2010-11-13 at 11:12 -0800, Yinghai Lu wrote:
> > [    0.633006] Total of 8 processors activated (44799.46 BogoMIPS).
> 
> assume that when you have 
> CONFIG_NR_CPUS=16
> instead of
> CONFIG_NR_CPUS=8
> 
> it will boot ok? 

If it would that'd still be a bug.

^ permalink raw reply

* Re: [v2, 0/8] NUMA Hotplug emulator
From: Wu Fengguang @ 2010-11-13 23:43 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <20101113053714.GA32501@shaohui>

On Sat, Nov 13, 2010 at 10:42:52PM +0800, Greg KH wrote:
> On Sat, Nov 13, 2010 at 01:37:14PM +0800, Shaohui Zheng wrote:
> > Hi, All
> > 
> > 	This patchset introduces NUMA hotplug emulator for x86. we already sent out
> > an early version in LKML (http ://lwn.net/Articles/387571/). This is 4th version
> > in internal, and 2nd time to sent to LKML. 
> 
> Please send this to lkml, the linux-hotplug mailing list is primarily
> for userspace hotplug tools (like udev) and not for kernel stuff like
> this where you want the core kernel developers to review your patches.

And try the git/quilt tools when resending the emails to LKML.

> Please use the script, scripts/get_maintainer.pl to determine who best
> to send your patches to, it will tell you the mailing list as well as
> the people involved.

I run a handy script (based on your git-authors) to help me create the CC
list for given source files:

        [ $# -lt "1" ] && { echo 'usage: git authors <files...>'; exit -1; }

        git log $@ | grep Author: | cut -d: -f2 | sort | uniq -c | sort -n | tail -5
        echo
        git log $@ | grep Author: | cut -d: -f2 | sort | uniq -c | sort -n | tail -5 \
                | sed 's/ *[0-9]\+ /CC:/' \
                | sed 's/.*clameter@sgi.com.*/CC: Christoph Lameter <cl@linux-foundation.org>/' \
                | sed 's/.*hugh@veritas.com.*/CC: Hugh Dickins <hugh.dickins@tiscali.co.uk>/' \
                | sed 's/.*npiggin@suse.de.*/CC: Nick Piggin <npiggin@kernel.dk>/' \
                | sed 's/.*nickpiggin@yahoo.com.au.*/CC: Nick Piggin <npiggin@kernel.dk>/' \
                | sed 's/.*jens.axboe@oracle.com.*/CC: Jens Axboe <axboe@kernel.dk>/' \
                | sed 's/.*dgc@sgi.com.*/CC: Dave Chinner <david@fromorbit.com>/'

The output looks like

        wfg@bee ~/cc/linux-2.6% git-authors-email mm/memory_hotplug.c 
              3  Keith Mannthey <kmannth@us.ibm.com>
              3  Minchan Kim <minchan.kim@gmail.com>
              4  Dave Hansen <haveblue@us.ibm.com>
             14  KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
             14  Yasunori Goto <y-goto@jp.fujitsu.com>

        CC: Keith Mannthey <kmannth@us.ibm.com>
        CC: Minchan Kim <minchan.kim@gmail.com>
        CC: Dave Hansen <haveblue@us.ibm.com>
        CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
        CC: Yasunori Goto <y-goto@jp.fujitsu.com>

Thanks,
Fengguang

^ permalink raw reply

* Re: [BUG 2.6.27-rc1] find_busiest_group() LOCKUP
From: Yinghai Lu @ 2010-11-14  0:18 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Peter Zijlstra, LKML, Ingo Molnar, Nikanth Karthikesan,
	David Rientjes, Zheng, Shaohui, Andrew Morton,
	linux-hotplug@vger.kernel.org, Eric Dumazet, Bjorn Helgaas,
	Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <20101113235746.GA9458@localhost>

On 11/13/2010 03:57 PM, Wu Fengguang wrote:
> On Sun, Nov 14, 2010 at 03:12:20AM +0800, Yinghai Lu wrote:
>> On 11/13/2010 05:10 AM, Wu Fengguang wrote:
>>> On Sat, Nov 13, 2010 at 08:57:58PM +0800, Peter Zijlstra wrote:
>>>> On Sat, 2010-11-13 at 20:00 +0800, Wu Fengguang wrote:
>>>>> On Sat, Nov 13, 2010 at 06:30:24PM +0800, Peter Zijlstra wrote:
>>>>>> On Sat, 2010-11-13 at 16:40 +0800, Wu Fengguang wrote:
>>>>>>>> Will try and figure out how the heck that's happening, Ingo any clue?
>>>>>>>
>>>>>>> It's back to normal on 2.6.37-rc1 when reverting commit 50f2d7f682f9
>>>>>>> ("x86, numa: Assign CPUs to nodes in round-robin manner on fake NUMA").
>>>>>>>
>>>>>>> The interesting part is, the commit was introduced in 
>>>>>>> 2.6.36-rc7..2.6.36, however 2.6.36 boots OK, while 2.6.37-rc1 panics.
>>>>>>
>>>>>> Argh, that commit again..
>>>>>>
>>>>>> Does this fix it: http://lkml.org/lkml/2010/11/12/8
>>>>>
>>>>> No it still panics. Here is the dmesg.
>>>>
>>>> OK, I'll let Nikanth have a look, if all else fails we can always
>>>> revert that patch.
>>>
>>> It's the same bug.
>>>
>>> Just tried another machine, I get the same divide error.  The patch
>>> posted in lkml/2010/11/12/8 does not fix it. But after reverting
>>> commit 50f2d7f682f9, it boots OK.
>>>
>>> Thanks,
>>> Fengguang
>>> ---
>>> PS. dmesg with divide error
>>>
>>> [    0.000000] console [ttyS0] enabled, bootconsole disabled
>>> [    0.000000] Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
>>> [    0.000000] ... MAX_LOCKDEP_SUBCLASSES:  8
>>> [    0.000000] ... MAX_LOCK_DEPTH:          48
>>> [    0.000000] ... MAX_LOCKDEP_KEYS:        8191
>>> [    0.000000] ... CLASSHASH_SIZE:          4096
>>> [    0.000000] ... MAX_LOCKDEP_ENTRIES:     16384
>>> [    0.000000] ... MAX_LOCKDEP_CHAINS:      32768
>>> [    0.000000] ... CHAINHASH_SIZE:          16384
>>> [    0.000000]  memory used by lock dependency info: 6367 kB
>>> [    0.000000]  per task-struct memory footprint: 2688 bytes
>>> [    0.000000] allocated 167772160 bytes of page_cgroup
>>> [    0.000000] please try 'cgroup_disable=memory' option if you don't want memory cgroups
>>> [    0.000000] ODEBUG: 15 of 15 active objects replaced
>>> [    0.000000] hpet clockevent registered
>>> [    0.001000] Fast TSC calibration using PIT
>>> [    0.002000] Detected 2800.469 MHz processor.
>>> [    0.000010] Calibrating delay loop (skipped), value calculated using timer frequency.. 5600.93 BogoMIPS (lpj=2800469)
>>> [    0.010818] pid_max: default: 32768 minimum: 301
>>> [    0.021745] Dentry cache hash table entries: 2097152 (order: 12, 16777216 bytes)
>>> [    0.035657] Inode-cache hash table entries: 1048576 (order: 11, 8388608 bytes)
>>> [    0.044553] Mount-cache hash table entries: 256
>>> [    0.049469] Initializing cgroup subsys debug
>>> [    0.053834] Initializing cgroup subsys ns
>>> [    0.057940] ns_cgroup deprecated: consider using the 'clone_children' flag without the ns_cgroup.
>>> [    0.066968] Initializing cgroup subsys cpuacct
>>> [    0.071511] Initializing cgroup subsys memory
>>> [    0.075988] Initializing cgroup subsys devices
>>> [    0.080527] Initializing cgroup subsys freezer
>>> [    0.085107] CPU: Physical Processor ID: 0
>>> [    0.089209] CPU: Processor Core ID: 0
>>> [    0.092974] mce: CPU supports 9 MCE banks
>>> [    0.097095] CPU0: Thermal monitoring enabled (TM1)
>>> [    0.101990] using mwait in idle threads.
>>> [    0.106006] Performance Events: PEBS fmt1+, Westmere events, Intel PMU driver.
>>> [    0.113535] ... version:                3
>>> [    0.117641] ... bit width:              48
>>> [    0.121828] ... generic registers:      4
>>> [    0.125926] ... value mask:             0000ffffffffffff
>>> [    0.131328] ... max period:             000000007fffffff
>>> [    0.136734] ... fixed-purpose events:   3
>>> [    0.140839] ... event mask:             000000070000000f
>>> [    0.147297] ACPI: Core revision 20101013
>>> [    0.175646] ftrace: allocating 24175 entries in 95 pages
>>> [    0.190912] Setting APIC routing to flat
>>> [    0.195562] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
>>> [    0.211643] CPU0: Intel(R) Xeon(R) CPU           X5660  @ 2.80GHz stepping 01
>>> [    0.325243] lockdep: fixing up alternatives.
>>> [    0.330242] Booting Node   0, Processors  #1lockdep: fixing up alternatives.
>>> [    0.430140]  #2lockdep: fixing up alternatives.
>>> [    0.526962]  #3lockdep: fixing up alternatives.
>>> [    0.623755]  #4lockdep: fixing up alternatives.
>>> [    0.720588]  Ok.
>>> [    0.722525] Booting Node   1, Processors  #5lockdep: fixing up alternatives.
>>> [    0.822389]  Ok.
>>> [    0.824327] Booting Node   0, Processors  #6
>>> [    0.919089] TSC synchronization [CPU#0 -> CPU#6]:
>>> [    0.924155] Measured 296 cycles TSC warp between CPUs, turning off TSC clock.
>>> [    0.003999] Marking TSC unstable due to check_tsc_sync_source failed
>>> [    0.557048] lockdep: fixing up alternatives.
>>> [    0.558041]  Ok.
>>> [    0.559004] Booting Node   1, Processors  #7 Ok.
>>> [    0.632157] Brought up 8 CPUs
>>> [    0.633006] Total of 8 processors activated (44799.46 BogoMIPS).
>>
>> assume that when you have 
>> CONFIG_NR_CPUS=16
>> instead of
>> CONFIG_NR_CPUS=8
>>
>> it will boot ok?
> 
> No. But it boots OK with CONFIG_NR_CPUS=64: it actually has 24 CPUs, a bit more
> than your expectation :)
> 
> This also boots the other 16 CPU box that used to lockup in find_busiest_group().

please check attached patch, it should fix the problem.

Thanks

	Yinghai

[PATCH] x86, acpi: Handle all SRAT cpu entries even have cpu num limitaion

Recent Intel new system have different order in MADT, aka will list all thread0
at first, then all thread1.
But SRAT table still old order, it will list cpus in one socket all together.

If the user have compiled limited NR_CPUS or boot with nr_cpus=, could have missed
to put some cpus apic id to node mapping into apicid_to_node[].

for example for 4 sockets system with 64 cpus with nr_cpus=32 will get crash...

[    9.106288] Total of 32 processors activated (136190.88 BogoMIPS).
[    9.235021] divide error: 0000 [#1] SMP 
[    9.235315] last sysfs file: 
[    9.235481] CPU 1 
[    9.235592] Modules linked in:
[    9.245398] 
[    9.245478] Pid: 2, comm: kthreadd Not tainted 2.6.37-rc1-tip-yh-01782-ge92ef79-dirty #274      /Sun Fire x4800
[    9.265415] RIP: 0010:[<ffffffff81075a8f>]  [<ffffffff81075a8f>] select_task_rq_fair+0x4f0/0x623
[    9.265835] RSP: 0000:ffff88103f8d1c40  EFLAGS: 00010046
[    9.285550] RAX: 0000000000000000 RBX: ffff88103f887de0 RCX: 0000000000000000
[    9.305356] RDX: 0000000000000000 RSI: 0000000000000200 RDI: 0000000000000200
[    9.305711] RBP: ffff88103f8d1d10 R08: 0000000000000200 R09: ffff88103f887e38
[    9.325709] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[    9.326038] R13: ffff88107e80dfb0 R14: 0000000000000001 R15: ffff88103f887e40
[    9.345655] FS:  0000000000000000(0000) GS:ffff88107e800000(0000) knlGS:0000000000000000
[    9.365503] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    9.365776] CR2: 0000000000000000 CR3: 0000000002417000 CR4: 00000000000006e0
[    9.385583] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    9.405368] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[    9.405713] Process kthreadd (pid: 2, threadinfo ffff88103f8d0000, task ffff88305c8aa2d0)
[    9.425563] Stack:
[    9.425668]  ffff88103f8d1cb0 0000000000000046 0000000000000000 0000000200000000
[    9.445509]  0000000000000000 0000000100000000 0000000000000046 ffffffff82bd1ce0
[    9.465350]  000000015c8aa2d0 00000000001d2540 00000000001d2540 0000007d3f8d1d28
[    9.465763] Call Trace:
[    9.465875]  [<ffffffff810747c3>] wake_up_new_task+0x3c/0x10e
[    9.485486]  [<ffffffff8107b2e3>] do_fork+0x28c/0x35f
[    9.485753]  [<ffffffff810ab832>] ? __lock_acquire+0x1801/0x1813
[    9.505474]  [<ffffffff8106f2bd>] ? finish_task_switch+0x80/0xf4
[    9.525264]  [<ffffffff8106f286>] ? finish_task_switch+0x49/0xf4
[    9.525575]  [<ffffffff8109da72>] ? local_clock+0x2b/0x3c
[    9.545281]  [<ffffffff8103da76>] kernel_thread+0x70/0x72
[    9.545544]  [<ffffffff81097c83>] ? kthread+0x0/0xa8
[    9.545797]  [<ffffffff81037990>] ? kernel_thread_helper+0x0/0x10
[    9.565519]  [<ffffffff81098099>] kthreadd+0xe8/0x12b
[    9.585185]  [<ffffffff81037994>] kernel_thread_helper+0x4/0x10
[    9.585485]  [<ffffffff81cd793c>] ? restore_args+0x0/0x30
[    9.605192]  [<ffffffff81097fb1>] ? kthreadd+0x0/0x12b
[    9.605479]  [<ffffffff81037990>] ? kernel_thread_helper+0x0/0x10
[    9.625295] Code: a0 be 00 02 00 00 ff c2 48 63 d2 e8 f8 67 3b 00 3b 05 86 8e 52 01 48 89 c7 89 45 c8 7c c1 48 8b 45 b0 8b 4b 08 31 d2 48 c1 e0 0a <48> f7 f1 45 85 e4 75 08 48 3b 45 b8 72 08 eb 0d 48 89 45 a8 eb 
[    9.645938] RIP  [<ffffffff81075a8f>] select_task_rq_fair+0x4f0/0x623
[    9.665356]  RSP <ffff88103f8d1c40>
[    9.665568] ---[ end trace 2296156d35fdfc87 ]---

So let just parse all cpu entries in SRAT.

Also add apicid checking with MAX_LOCAL_APIC, in case We could out of boundaries of
apicid_to_node[].

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/kernel/acpi/boot.c |    7 +++++++
 arch/x86/mm/srat_64.c       |    8 ++++++++
 drivers/acpi/numa.c         |   14 ++++++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)

Index: linux-2.6/arch/x86/kernel/acpi/boot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/acpi/boot.c
+++ linux-2.6/arch/x86/kernel/acpi/boot.c
@@ -198,6 +198,13 @@ static void __cpuinit acpi_register_lapi
 {
 	unsigned int ver = 0;
 
+#ifdef CONFIG_X86_64
+	if (id >= (MAX_APICS-1)) {
+		printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+		return;
+	}
+#endif
+
 	if (!enabled) {
 		++disabled_cpus;
 		return;
Index: linux-2.6/arch/x86/mm/srat_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/srat_64.c
+++ linux-2.6/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct ac
 	}
 
 	apic_id = pa->apic_id;
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped that apicid too big\n", pxm, apic_id, node);
+		return;
+	}
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
@@ -168,6 +172,10 @@ acpi_numa_processor_affinity_init(struct
 		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
 	else
 		apic_id = pa->apic_id;
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+		return;
+	}
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
Index: linux-2.6/drivers/acpi/numa.c
===================================================================
--- linux-2.6.orig/drivers/acpi/numa.c
+++ linux-2.6/drivers/acpi/numa.c
@@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_typ
 int __init acpi_numa_init(void)
 {
 	int ret = 0;
+	int nr_cpu_entries = nr_cpu_ids;
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Should not limit number with cpu num that will handle,
+	 * SRAT cpu entries could have different order with that in MADT.
+	 * So go over all cpu entries in SRAT to get apicid to node mapping.
+	 */
+	nr_cpu_entries = MAX_LOCAL_APIC;
+#endif
 
 	/* SRAT: Static Resource Affinity Table */
 	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
-				     acpi_parse_x2apic_affinity, nr_cpu_ids);
+				     acpi_parse_x2apic_affinity, nr_cpu_entries);
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-				     acpi_parse_processor_affinity, nr_cpu_ids);
+				     acpi_parse_processor_affinity, nr_cpu_entries);
 		ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
 					    acpi_parse_memory_affinity,
 					    NR_NODE_MEMBLKS);

^ permalink raw reply

* [PATCH] x86, acpi: Handle all SRAT cpu entries even have cpu num
From: Yinghai Lu @ 2010-11-14  1:38 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton, Thomas Gleixner, H. Peter Anvin
  Cc: Wu Fengguang, Peter Zijlstra, LKML, Nikanth Karthikesan,
	David Rientjes, Zheng, Shaohui, linux-hotplug@vger.kernel.org,
	Eric Dumazet, Bjorn Helgaas, Venkatesh Pallipadi, Nikhil Rao,
	Takuya Yoshikawa
In-Reply-To: <20101113235746.GA9458@localhost>


Recent Intel new system have different order in MADT, aka will list all thread0
at first, then all thread1.
But SRAT table still old order, it will list cpus in one socket all together.

If the user have compiled limited NR_CPUS or boot with nr_cpus=, could have missed
to put some cpus apic id to node mapping into apicid_to_node[].

for example for 4 sockets system with 64 cpus with nr_cpus=32 will get crash...

[    9.106288] Total of 32 processors activated (136190.88 BogoMIPS).
[    9.235021] divide error: 0000 [#1] SMP 
[    9.235315] last sysfs file: 
[    9.235481] CPU 1 
[    9.235592] Modules linked in:
[    9.245398] 
[    9.245478] Pid: 2, comm: kthreadd Not tainted 2.6.37-rc1-tip-yh-01782-ge92ef79-dirty #274      /Sun Fire x4800
[    9.265415] RIP: 0010:[<ffffffff81075a8f>]  [<ffffffff81075a8f>] select_task_rq_fair+0x4f0/0x623
[    9.265835] RSP: 0000:ffff88103f8d1c40  EFLAGS: 00010046
[    9.285550] RAX: 0000000000000000 RBX: ffff88103f887de0 RCX: 0000000000000000
[    9.305356] RDX: 0000000000000000 RSI: 0000000000000200 RDI: 0000000000000200
[    9.305711] RBP: ffff88103f8d1d10 R08: 0000000000000200 R09: ffff88103f887e38
[    9.325709] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
[    9.326038] R13: ffff88107e80dfb0 R14: 0000000000000001 R15: ffff88103f887e40
[    9.345655] FS:  0000000000000000(0000) GS:ffff88107e800000(0000) knlGS:0000000000000000
[    9.365503] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[    9.365776] CR2: 0000000000000000 CR3: 0000000002417000 CR4: 00000000000006e0
[    9.385583] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    9.405368] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[    9.405713] Process kthreadd (pid: 2, threadinfo ffff88103f8d0000, task ffff88305c8aa2d0)
[    9.425563] Stack:
[    9.425668]  ffff88103f8d1cb0 0000000000000046 0000000000000000 0000000200000000
[    9.445509]  0000000000000000 0000000100000000 0000000000000046 ffffffff82bd1ce0
[    9.465350]  000000015c8aa2d0 00000000001d2540 00000000001d2540 0000007d3f8d1d28
[    9.465763] Call Trace:
[    9.465875]  [<ffffffff810747c3>] wake_up_new_task+0x3c/0x10e
[    9.485486]  [<ffffffff8107b2e3>] do_fork+0x28c/0x35f
[    9.485753]  [<ffffffff810ab832>] ? __lock_acquire+0x1801/0x1813
[    9.505474]  [<ffffffff8106f2bd>] ? finish_task_switch+0x80/0xf4
[    9.525264]  [<ffffffff8106f286>] ? finish_task_switch+0x49/0xf4
[    9.525575]  [<ffffffff8109da72>] ? local_clock+0x2b/0x3c
[    9.545281]  [<ffffffff8103da76>] kernel_thread+0x70/0x72
[    9.545544]  [<ffffffff81097c83>] ? kthread+0x0/0xa8
[    9.545797]  [<ffffffff81037990>] ? kernel_thread_helper+0x0/0x10
[    9.565519]  [<ffffffff81098099>] kthreadd+0xe8/0x12b
[    9.585185]  [<ffffffff81037994>] kernel_thread_helper+0x4/0x10
[    9.585485]  [<ffffffff81cd793c>] ? restore_args+0x0/0x30
[    9.605192]  [<ffffffff81097fb1>] ? kthreadd+0x0/0x12b
[    9.605479]  [<ffffffff81037990>] ? kernel_thread_helper+0x0/0x10
[    9.625295] Code: a0 be 00 02 00 00 ff c2 48 63 d2 e8 f8 67 3b 00 3b 05 86 8e 52 01 48 89 c7 89 45 c8 7c c1 48 8b 45 b0 8b 4b 08 31 d2 48 c1 e0 0a <48> f7 f1 45 85 e4 75 08 48 3b 45 b8 72 08 eb 0d 48 89 45 a8 eb 
[    9.645938] RIP  [<ffffffff81075a8f>] select_task_rq_fair+0x4f0/0x623
[    9.665356]  RSP <ffff88103f8d1c40>
[    9.665568] ---[ end trace 2296156d35fdfc87 ]---

So let just parse all cpu entries in SRAT.

Also add apicid checking with MAX_LOCAL_APIC, in case We could out of boundaries of
apicid_to_node[].

it should fix following bug too.
https://bugzilla.kernel.org/show_bug.cgi?id=22662

Reported-and-Tested-by: Wu Fengguang <fengguang.wu@intel.com>
Reported-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/kernel/acpi/boot.c |    7 +++++++
 arch/x86/mm/srat_64.c       |    8 ++++++++
 drivers/acpi/numa.c         |   14 ++++++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)

Index: linux-2.6/arch/x86/kernel/acpi/boot.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/acpi/boot.c
+++ linux-2.6/arch/x86/kernel/acpi/boot.c
@@ -198,6 +198,13 @@ static void __cpuinit acpi_register_lapi
 {
 	unsigned int ver = 0;
 
+#ifdef CONFIG_X86_64
+	if (id >= (MAX_APICS-1)) {
+		printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+		return;
+	}
+#endif
+
 	if (!enabled) {
 		++disabled_cpus;
 		return;
Index: linux-2.6/arch/x86/mm/srat_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/srat_64.c
+++ linux-2.6/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct ac
 	}
 
 	apic_id = pa->apic_id;
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped that apicid too big\n", pxm, apic_id, node);
+		return;
+	}
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
@@ -168,6 +172,10 @@ acpi_numa_processor_affinity_init(struct
 		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
 	else
 		apic_id = pa->apic_id;
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+		return;
+	}
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
Index: linux-2.6/drivers/acpi/numa.c
===================================================================
--- linux-2.6.orig/drivers/acpi/numa.c
+++ linux-2.6/drivers/acpi/numa.c
@@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_typ
 int __init acpi_numa_init(void)
 {
 	int ret = 0;
+	int nr_cpu_entries = nr_cpu_ids;
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Should not limit number with cpu num that will handle,
+	 * SRAT cpu entries could have different order with that in MADT.
+	 * So go over all cpu entries in SRAT to get apicid to node mapping.
+	 */
+	nr_cpu_entries = MAX_LOCAL_APIC;
+#endif
 
 	/* SRAT: Static Resource Affinity Table */
 	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
-				     acpi_parse_x2apic_affinity, nr_cpu_ids);
+				     acpi_parse_x2apic_affinity, nr_cpu_entries);
 		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
-				     acpi_parse_processor_affinity, nr_cpu_ids);
+				     acpi_parse_processor_affinity, nr_cpu_entries);
 		ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
 					    acpi_parse_memory_affinity,
 					    NR_NODE_MEMBLKS);

^ permalink raw reply

* Re: [PATCH] x86, acpi: Handle all SRAT cpu entries even have cpu
From: Wu Fengguang @ 2010-11-14 17:32 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, LKML, Nikanth Karthikesan, David Rientjes,
	Zheng, Shaohui, linux-hotplug@vger.kernel.org, Eric Dumazet,
	Bjorn Helgaas, Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <4CDF3DA1.2090806@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 7348 bytes --]

Hi,

I just found another problem. When passing "mem=256" to 2.6.37-rc1,
it dies hard early (not able to print any boot log). With this patch
applied, it's a bit better: it shows a kernel panic, but still dies
hard (not able to reboot with "panic=10").

Attached is the screenshot in kvm (it's not specific to kvm, it dies
hard on two more physical boxes). The screenshot shows that it panics
inside reserve_trampoline_memory().

Thanks,
Fengguang

On Sun, Nov 14, 2010 at 09:38:41AM +0800, Yinghai Lu wrote:
> 
> Recent Intel new system have different order in MADT, aka will list all thread0
> at first, then all thread1.
> But SRAT table still old order, it will list cpus in one socket all together.
> 
> If the user have compiled limited NR_CPUS or boot with nr_cpus=, could have missed
> to put some cpus apic id to node mapping into apicid_to_node[].
> 
> for example for 4 sockets system with 64 cpus with nr_cpus=32 will get crash...
> 
> [    9.106288] Total of 32 processors activated (136190.88 BogoMIPS).
> [    9.235021] divide error: 0000 [#1] SMP 
> [    9.235315] last sysfs file: 
> [    9.235481] CPU 1 
> [    9.235592] Modules linked in:
> [    9.245398] 
> [    9.245478] Pid: 2, comm: kthreadd Not tainted 2.6.37-rc1-tip-yh-01782-ge92ef79-dirty #274      /Sun Fire x4800
> [    9.265415] RIP: 0010:[<ffffffff81075a8f>]  [<ffffffff81075a8f>] select_task_rq_fair+0x4f0/0x623
> [    9.265835] RSP: 0000:ffff88103f8d1c40  EFLAGS: 00010046
> [    9.285550] RAX: 0000000000000000 RBX: ffff88103f887de0 RCX: 0000000000000000
> [    9.305356] RDX: 0000000000000000 RSI: 0000000000000200 RDI: 0000000000000200
> [    9.305711] RBP: ffff88103f8d1d10 R08: 0000000000000200 R09: ffff88103f887e38
> [    9.325709] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
> [    9.326038] R13: ffff88107e80dfb0 R14: 0000000000000001 R15: ffff88103f887e40
> [    9.345655] FS:  0000000000000000(0000) GS:ffff88107e800000(0000) knlGS:0000000000000000
> [    9.365503] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [    9.365776] CR2: 0000000000000000 CR3: 0000000002417000 CR4: 00000000000006e0
> [    9.385583] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [    9.405368] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> [    9.405713] Process kthreadd (pid: 2, threadinfo ffff88103f8d0000, task ffff88305c8aa2d0)
> [    9.425563] Stack:
> [    9.425668]  ffff88103f8d1cb0 0000000000000046 0000000000000000 0000000200000000
> [    9.445509]  0000000000000000 0000000100000000 0000000000000046 ffffffff82bd1ce0
> [    9.465350]  000000015c8aa2d0 00000000001d2540 00000000001d2540 0000007d3f8d1d28
> [    9.465763] Call Trace:
> [    9.465875]  [<ffffffff810747c3>] wake_up_new_task+0x3c/0x10e
> [    9.485486]  [<ffffffff8107b2e3>] do_fork+0x28c/0x35f
> [    9.485753]  [<ffffffff810ab832>] ? __lock_acquire+0x1801/0x1813
> [    9.505474]  [<ffffffff8106f2bd>] ? finish_task_switch+0x80/0xf4
> [    9.525264]  [<ffffffff8106f286>] ? finish_task_switch+0x49/0xf4
> [    9.525575]  [<ffffffff8109da72>] ? local_clock+0x2b/0x3c
> [    9.545281]  [<ffffffff8103da76>] kernel_thread+0x70/0x72
> [    9.545544]  [<ffffffff81097c83>] ? kthread+0x0/0xa8
> [    9.545797]  [<ffffffff81037990>] ? kernel_thread_helper+0x0/0x10
> [    9.565519]  [<ffffffff81098099>] kthreadd+0xe8/0x12b
> [    9.585185]  [<ffffffff81037994>] kernel_thread_helper+0x4/0x10
> [    9.585485]  [<ffffffff81cd793c>] ? restore_args+0x0/0x30
> [    9.605192]  [<ffffffff81097fb1>] ? kthreadd+0x0/0x12b
> [    9.605479]  [<ffffffff81037990>] ? kernel_thread_helper+0x0/0x10
> [    9.625295] Code: a0 be 00 02 00 00 ff c2 48 63 d2 e8 f8 67 3b 00 3b 05 86 8e 52 01 48 89 c7 89 45 c8 7c c1 48 8b 45 b0 8b 4b 08 31 d2 48 c1 e0 0a <48> f7 f1 45 85 e4 75 08 48 3b 45 b8 72 08 eb 0d 48 89 45 a8 eb 
> [    9.645938] RIP  [<ffffffff81075a8f>] select_task_rq_fair+0x4f0/0x623
> [    9.665356]  RSP <ffff88103f8d1c40>
> [    9.665568] ---[ end trace 2296156d35fdfc87 ]---
> 
> So let just parse all cpu entries in SRAT.
> 
> Also add apicid checking with MAX_LOCAL_APIC, in case We could out of boundaries of
> apicid_to_node[].
> 
> it should fix following bug too.
> https://bugzilla.kernel.org/show_bug.cgi?id=22662
> 
> Reported-and-Tested-by: Wu Fengguang <fengguang.wu@intel.com>
> Reported-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
> Signed-off-by: Yinghai Lu <yinghai@kernel.org>
> 
> ---
>  arch/x86/kernel/acpi/boot.c |    7 +++++++
>  arch/x86/mm/srat_64.c       |    8 ++++++++
>  drivers/acpi/numa.c         |   14 ++++++++++++--
>  3 files changed, 27 insertions(+), 2 deletions(-)
> 
> Index: linux-2.6/arch/x86/kernel/acpi/boot.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/acpi/boot.c
> +++ linux-2.6/arch/x86/kernel/acpi/boot.c
> @@ -198,6 +198,13 @@ static void __cpuinit acpi_register_lapi
>  {
>  	unsigned int ver = 0;
>  
> +#ifdef CONFIG_X86_64
> +	if (id >= (MAX_APICS-1)) {
> +		printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
> +		return;
> +	}
> +#endif
> +
>  	if (!enabled) {
>  		++disabled_cpus;
>  		return;
> Index: linux-2.6/arch/x86/mm/srat_64.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/mm/srat_64.c
> +++ linux-2.6/arch/x86/mm/srat_64.c
> @@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct ac
>  	}
>  
>  	apic_id = pa->apic_id;
> +	if (apic_id >= MAX_LOCAL_APIC) {
> +		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped that apicid too big\n", pxm, apic_id, node);
> +		return;
> +	}
>  	apicid_to_node[apic_id] = node;
>  	node_set(node, cpu_nodes_parsed);
>  	acpi_numa = 1;
> @@ -168,6 +172,10 @@ acpi_numa_processor_affinity_init(struct
>  		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
>  	else
>  		apic_id = pa->apic_id;
> +	if (apic_id >= MAX_LOCAL_APIC) {
> +		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
> +		return;
> +	}
>  	apicid_to_node[apic_id] = node;
>  	node_set(node, cpu_nodes_parsed);
>  	acpi_numa = 1;
> Index: linux-2.6/drivers/acpi/numa.c
> ===================================================================
> --- linux-2.6.orig/drivers/acpi/numa.c
> +++ linux-2.6/drivers/acpi/numa.c
> @@ -275,13 +275,23 @@ acpi_table_parse_srat(enum acpi_srat_typ
>  int __init acpi_numa_init(void)
>  {
>  	int ret = 0;
> +	int nr_cpu_entries = nr_cpu_ids;
> +
> +#ifdef CONFIG_X86_64
> +	/*
> +	 * Should not limit number with cpu num that will handle,
> +	 * SRAT cpu entries could have different order with that in MADT.
> +	 * So go over all cpu entries in SRAT to get apicid to node mapping.
> +	 */
> +	nr_cpu_entries = MAX_LOCAL_APIC;
> +#endif
>  
>  	/* SRAT: Static Resource Affinity Table */
>  	if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
>  		acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
> -				     acpi_parse_x2apic_affinity, nr_cpu_ids);
> +				     acpi_parse_x2apic_affinity, nr_cpu_entries);
>  		acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
> -				     acpi_parse_processor_affinity, nr_cpu_ids);
> +				     acpi_parse_processor_affinity, nr_cpu_entries);
>  		ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
>  					    acpi_parse_memory_affinity,
>  					    NR_NODE_MEMBLKS);

[-- Attachment #2: panic-reserve_trampoline_memory.png --]
[-- Type: image/png, Size: 18830 bytes --]

^ permalink raw reply

* Re: [PATCH] x86, acpi: Handle all SRAT cpu entries even have cpu
From: Yinghai Lu @ 2010-11-14 18:02 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, LKML, Nikanth Karthikesan, David Rientjes,
	Zheng, Shaohui, linux-hotplug@vger.kernel.org, Eric Dumazet,
	Bjorn Helgaas, Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <20101114173208.GA23017@localhost>

On 11/14/2010 09:32 AM, Wu Fengguang wrote:
> Hi,
> 
> I just found another problem. When passing "mem=256" to 2.6.37-rc1,
> it dies hard early (not able to print any boot log). With this patch
> applied, it's a bit better: it shows a kernel panic, but still dies
> hard (not able to reboot with "panic=10").

do you mean mem=256M ?

Thanks
	Yinghai

^ permalink raw reply

* Re: [PATCH] x86, acpi: Handle all SRAT cpu entries even have cpu
From: Yinghai Lu @ 2010-11-14 18:19 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, LKML, Nikanth Karthikesan, David Rientjes,
	Zheng, Shaohui, linux-hotplug@vger.kernel.org, Eric Dumazet,
	Bjorn Helgaas, Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <20101114173208.GA23017@localhost>

On 11/14/2010 09:32 AM, Wu Fengguang wrote:
> Hi,
> 
> I just found another problem. When passing "mem=256" to 2.6.37-rc1,
> it dies hard early (not able to print any boot log). With this patch
> applied, it's a bit better: it shows a kernel panic, but still dies
> hard (not able to reboot with "panic=10").
> 

if you did use "mem=256", you should get panic.

early console in setup code
Probing EDD (edd=off to disable)... ok
early console in decompress_kernel
decompress_kernel:
  input: [0x2431269-0x2e7fec6], output: 0x1000000, heap: [0x2e856c0-0x2e8c6bf]

Decompressing Linux... Parsing ELF... done.
Booting the kernel.
[    0.000000] bootconsole [uart0] enabled
[    0.000000] Kernel Layout:
[    0.000000]   .text: [0x01000000-0x01ce0e48]
[    0.000000] .rodata: [0x01ce6000-0x0240ffff]
[    0.000000]   .data: [0x02410000-0x025a50bf]
[    0.000000]   .init: [0x025a7000-0x0286afff]
[    0.000000]    .bss: [0x02875000-0x0348a6d7]
[    0.000000]    .brk: [0x0348b000-0x034aafff]
[    0.000000]     memblock_x86_reserve_range: [0x01000000-0x0348a6d7]    TEXT DATA BSS
[    0.000000]     memblock_x86_reserve_range: [0x7c59d000-0x7fffefff]          RAMDISK
[    0.000000]     memblock_x86_reserve_range: [0x0009fc00-0x000fffff]  * BIOS reserved
[    0.000000] Initializing cgroup subsys cpuset
[    0.000000] Initializing cgroup subsys cpu
[    0.000000] Linux version 2.6.37-rc1-tip-yh-02102-g776a022-dirty (yhlu@linux-siqj.site) (gcc version 4.5.0 20100604 [gcc-4_5-branch revision 160292] (SUSE Linux) ) #276 SMP Sat Nov 13 15:34:15 PST 2010
[    0.000000] Command line: BOOT_IMAGE=linux debug apic=debug ramdisk_size=262144 root=/dev/ram0 rw ip=dhcp mem=256 console=uart8250,io,0x3f8,115200 initrd=initrd.img
[    0.000000] KERNEL supported cpus:
[    0.000000]   Intel GenuineIntel
[    0.000000]   AMD AuthenticAMD
[    0.000000]   Centaur CentaurHauls
[    0.000000] BIOS-provided physical RAM map:
[    0.000000]  BIOS-e820: [0x00000000000000-0x0000000009fbff] (usable)
[    0.000000]  BIOS-e820: [0x0000000009fc00-0x0000000009ffff] (reserved)
[    0.000000]  BIOS-e820: [0x000000000f0000-0x000000000fffff] (reserved)
[    0.000000]  BIOS-e820: [0x00000000100000-0x0000002ffeffff] (usable)
[    0.000000]  BIOS-e820: [0x0000002fff0000-0x0000002fffffff] (ACPI data)
[    0.000000]  BIOS-e820: [0x000000fffc0000-0x000000ffffffff] (reserved)
[    0.000000] e820 remove range: [0x00000000000100-0xfffffffffffffffe] (usable)
[    0.000000] NX (Execute Disable) protection: active
[    0.000000] user-defined physical RAM map:
[    0.000000]  user: [0x00000000000000-0x000000000000ff] (usable)
[    0.000000]  user: [0x0000000009fc00-0x0000000009ffff] (reserved)
[    0.000000]  user: [0x000000000f0000-0x000000000fffff] (reserved)
[    0.000000]  user: [0x0000002fff0000-0x0000002fffffff] (ACPI data)
[    0.000000]  user: [0x000000fffc0000-0x000000ffffffff] (reserved)
[    0.000000] e820 update range: [0x00000000000000-0x000000000000ff] (usable) => (reserved)
[    0.000000] aligned physical RAM map:
[    0.000000]  aligned: [0x00000000000000-0x000000000000ff] (reserved)
[    0.000000]  aligned: [0x0000000009fc00-0x0000000009ffff] (reserved)
[    0.000000]  aligned: [0x000000000f0000-0x000000000fffff] (reserved)
[    0.000000]  aligned: [0x0000002fff0000-0x0000002fffffff] (ACPI data)
[    0.000000]  aligned: [0x000000fffc0000-0x000000ffffffff] (reserved)
[    0.000000] DMI 2.5 present.
[    0.000000] DMI: /VirtualBox, BIOS VirtualBox 12/01/2006
[    0.000000] e820 update range: [0x00000000000000-0x0000000000ffff] (usable) => (reserved)
[    0.000000] e820 remove range: [0x000000000a0000-0x000000000fffff] (usable)
[    0.000000] No AGP bridge found
[    0.000000] last_pfn = 0x0 max_arch_pfn = 0x400000000
[    0.000000] MTRR default type: uncachable
[    0.000000] MTRR variable ranges disabled:
[    0.000000] x86 PAT enabled: cpu 0, old 0x7040600070406, new 0x7010600070106
[    0.000000] CPU MTRRs all blank - virtualized system.
[    0.000000] Scan SMP from ffff880000000000 for 1024 bytes.
[    0.000000] Scan SMP from ffff88000009fc00 for 1024 bytes.
[    0.000000] found SMP MP-table at [ffff88000009fff0] 9fff0
[    0.000000]     memblock_x86_reserve_range: [0x0009fff0-0x0009ffff]   * MP-table mpf
[    0.000000]   mpc: e1160-e1254
[    0.000000]     memblock_x86_reserve_range: [0x000e1160-0x000e1253]   * MP-table mpc
[    0.000000]     memblock_x86_reserve_range: [0x0348b000-0x0348b070]              BRK
[    0.000000] MEMBLOCK configuration:
[    0.000000]  memory size = 0x0
[    0.000000]  memory.cnt  = 0x1
[    0.000000]  memory[0x0]     [0x00000000000000-0xffffffffffffffff], 0x0 bytes
[    0.000000]  reserved.cnt  = 0x6
[    0.000000]  reserved[0x0]   [0x0000000009fc00-0x000000000fffff], 0x60400 bytes
[    0.000000]  reserved[0x1]   [0x0000000009fff0-0x0000000009ffff], 0x10 bytes
[    0.000000]  reserved[0x2]   [0x000000000e1160-0x000000000e1253], 0xf4 bytes
[    0.000000]  reserved[0x3]   [0x00000001000000-0x0000000348a6d7], 0x248a6d8 bytes
[    0.000000]  reserved[0x4]   [0x0000000348b000-0x0000000348b070], 0x71 bytes
[    0.000000]  reserved[0x5]   [0x0000007c59d000-0x0000007fffefff], 0x3a62000 bytes
[    0.000000] initial memory mapped : 0 - 20000000
[    0.000000] Kernel panic - not syncing: Cannot allocate trampoline
[    0.000000] 
[    0.000000] Pid: 0, comm: swapper Not tainted 2.6.37-rc1-tip-yh-02102-g776a022-dirty #276
[    0.000000] Call Trace:
[    0.000000]  [<ffffffff81cd3f14>] panic+0x91/0x1a3
[    0.000000]  [<ffffffff82783490>] reserve_trampoline_memory+0x46/0x72
[    0.000000]  [<ffffffff827801fe>] setup_arch+0x5b0/0xae3
[    0.000000]  [<ffffffff827cb3a0>] ? boot_command_line+0x0/0x800
[    0.000000]  [<ffffffff81cd4067>] ? printk+0x41/0x43
[    0.000000]  [<ffffffff827cb3a0>] ? boot_command_line+0x0/0x800
[    0.000000]  [<ffffffff8277bb0d>] start_kernel+0xd7/0x3e8
[    0.000000]  [<ffffffff8277b2cc>] x86_64_start_reservations+0x9c/0xa0
[    0.000000]  [<ffffffff8277b3e4>] x86_64_start_kernel+0x114/0x11b
[    0.000000] ------------[ cut here ]------------
[    0.000000] WARNING: at kernel/lockdep.c:2322 trace_hardirqs_on_caller+0xc3/0x178()
[    0.000000] Hardware name: VirtualBox
[    0.000000] Modules linked in:
[    0.000000] Pid: 0, comm: swapper Not tainted 2.6.37-rc1-tip-yh-02102-g776a022-dirty #276
[    0.000000] Call Trace:
[    0.000000]  [<ffffffff8107bda0>] warn_slowpath_common+0x85/0x9d
[    0.000000]  [<ffffffff81cd3fe1>] ? panic+0x15e/0x1a3
[    0.000000]  [<ffffffff8107bdd2>] warn_slowpath_null+0x1a/0x1c
[    0.000000]  [<ffffffff810a8e59>] trace_hardirqs_on_caller+0xc3/0x178
[    0.000000]  [<ffffffff810a8f1b>] trace_hardirqs_on+0xd/0xf
[    0.000000]  [<ffffffff81cd3fe1>] panic+0x15e/0x1a3
[    0.000000]  [<ffffffff82783490>] reserve_trampoline_memory+0x46/0x72
[    0.000000]  [<ffffffff827801fe>] setup_arch+0x5b0/0xae3
[    0.000000]  [<ffffffff827cb3a0>] ? boot_command_line+0x0/0x800
[    0.000000]  [<ffffffff81cd4067>] ? printk+0x41/0x43
[    0.000000]  [<ffffffff827cb3a0>] ? boot_command_line+0x0/0x800
[    0.000000]  [<ffffffff8277bb0d>] start_kernel+0xd7/0x3e8
[    0.000000]  [<ffffffff8277b2cc>] x86_64_start_reservations+0x9c/0xa0
[    0.000000]  [<ffffffff8277b3e4>] x86_64_start_kernel+0x114/0x11b
[    0.000000] ---[ end trace a7919e7f17c0a725 ]---

^ permalink raw reply

* Re: [v2, 0/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-14 23:14 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <20101113053714.GA32501@shaohui>

On Sat, Nov 13, 2010 at 06:42:52AM -0800, Greg KH wrote:
> On Sat, Nov 13, 2010 at 01:37:14PM +0800, Shaohui Zheng wrote:
> 
> Please use the script, scripts/get_maintainer.pl to determine who best
> to send your patches to, it will tell you the mailing list as well as
> the people involved.
> 
thanks greg, I misunderstand the linux-hotplug mailing list is relate to
 CPU/Memory hotplug. I will recheck it with the script, and send them
to the correct list.

> good luck,
> 
> greg k-h

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply

* Re: [PATCH] x86, acpi: Handle all SRAT cpu entries even have cpu
From: Wu Fengguang @ 2010-11-15  1:22 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Ingo Molnar, Andrew Morton, Thomas Gleixner, H. Peter Anvin,
	Peter Zijlstra, LKML, Nikanth Karthikesan, David Rientjes,
	Zheng, Shaohui, linux-hotplug@vger.kernel.org, Eric Dumazet,
	Bjorn Helgaas, Venkatesh Pallipadi, Nikhil Rao, Takuya Yoshikawa
In-Reply-To: <4CE02832.3020402@kernel.org>

On Mon, Nov 15, 2010 at 02:19:30AM +0800, Yinghai Lu wrote:
> On 11/14/2010 09:32 AM, Wu Fengguang wrote:
> > Hi,
> > 
> > I just found another problem. When passing "mem=256" to 2.6.37-rc1,
> > it dies hard early (not able to print any boot log). With this patch
> > applied, it's a bit better: it shows a kernel panic, but still dies
> > hard (not able to reboot with "panic=10").
> > 
> 
> if you did use "mem=256", you should get panic.

Oops, "256" is accepted as 256 bytes..
Sorry for the noise! It boots OK with "mem=256M".

Thanks,
Fengguang

> early console in setup code
> Probing EDD (edd=off to disable)... ok
> early console in decompress_kernel
> decompress_kernel:
>   input: [0x2431269-0x2e7fec6], output: 0x1000000, heap: [0x2e856c0-0x2e8c6bf]
> 
> Decompressing Linux... Parsing ELF... done.
> Booting the kernel.
> [    0.000000] bootconsole [uart0] enabled
> [    0.000000] Kernel Layout:
> [    0.000000]   .text: [0x01000000-0x01ce0e48]
> [    0.000000] .rodata: [0x01ce6000-0x0240ffff]
> [    0.000000]   .data: [0x02410000-0x025a50bf]
> [    0.000000]   .init: [0x025a7000-0x0286afff]
> [    0.000000]    .bss: [0x02875000-0x0348a6d7]
> [    0.000000]    .brk: [0x0348b000-0x034aafff]
> [    0.000000]     memblock_x86_reserve_range: [0x01000000-0x0348a6d7]    TEXT DATA BSS
> [    0.000000]     memblock_x86_reserve_range: [0x7c59d000-0x7fffefff]          RAMDISK
> [    0.000000]     memblock_x86_reserve_range: [0x0009fc00-0x000fffff]  * BIOS reserved
> [    0.000000] Initializing cgroup subsys cpuset
> [    0.000000] Initializing cgroup subsys cpu
> [    0.000000] Linux version 2.6.37-rc1-tip-yh-02102-g776a022-dirty (yhlu@linux-siqj.site) (gcc version 4.5.0 20100604 [gcc-4_5-branch revision 160292] (SUSE Linux) ) #276 SMP Sat Nov 13 15:34:15 PST 2010
> [    0.000000] Command line: BOOT_IMAGE=linux debug apic=debug ramdisk_size=262144 root=/dev/ram0 rw ip=dhcp mem=256 console=uart8250,io,0x3f8,115200 initrd=initrd.img
> [    0.000000] KERNEL supported cpus:
> [    0.000000]   Intel GenuineIntel
> [    0.000000]   AMD AuthenticAMD
> [    0.000000]   Centaur CentaurHauls
> [    0.000000] BIOS-provided physical RAM map:
> [    0.000000]  BIOS-e820: [0x00000000000000-0x0000000009fbff] (usable)
> [    0.000000]  BIOS-e820: [0x0000000009fc00-0x0000000009ffff] (reserved)
> [    0.000000]  BIOS-e820: [0x000000000f0000-0x000000000fffff] (reserved)
> [    0.000000]  BIOS-e820: [0x00000000100000-0x0000002ffeffff] (usable)
> [    0.000000]  BIOS-e820: [0x0000002fff0000-0x0000002fffffff] (ACPI data)
> [    0.000000]  BIOS-e820: [0x000000fffc0000-0x000000ffffffff] (reserved)
> [    0.000000] e820 remove range: [0x00000000000100-0xfffffffffffffffe] (usable)
> [    0.000000] NX (Execute Disable) protection: active
> [    0.000000] user-defined physical RAM map:
> [    0.000000]  user: [0x00000000000000-0x000000000000ff] (usable)
> [    0.000000]  user: [0x0000000009fc00-0x0000000009ffff] (reserved)
> [    0.000000]  user: [0x000000000f0000-0x000000000fffff] (reserved)
> [    0.000000]  user: [0x0000002fff0000-0x0000002fffffff] (ACPI data)
> [    0.000000]  user: [0x000000fffc0000-0x000000ffffffff] (reserved)
> [    0.000000] e820 update range: [0x00000000000000-0x000000000000ff] (usable) => (reserved)
> [    0.000000] aligned physical RAM map:
> [    0.000000]  aligned: [0x00000000000000-0x000000000000ff] (reserved)
> [    0.000000]  aligned: [0x0000000009fc00-0x0000000009ffff] (reserved)
> [    0.000000]  aligned: [0x000000000f0000-0x000000000fffff] (reserved)
> [    0.000000]  aligned: [0x0000002fff0000-0x0000002fffffff] (ACPI data)
> [    0.000000]  aligned: [0x000000fffc0000-0x000000ffffffff] (reserved)
> [    0.000000] DMI 2.5 present.
> [    0.000000] DMI: /VirtualBox, BIOS VirtualBox 12/01/2006
> [    0.000000] e820 update range: [0x00000000000000-0x0000000000ffff] (usable) => (reserved)
> [    0.000000] e820 remove range: [0x000000000a0000-0x000000000fffff] (usable)
> [    0.000000] No AGP bridge found
> [    0.000000] last_pfn = 0x0 max_arch_pfn = 0x400000000
> [    0.000000] MTRR default type: uncachable
> [    0.000000] MTRR variable ranges disabled:
> [    0.000000] x86 PAT enabled: cpu 0, old 0x7040600070406, new 0x7010600070106
> [    0.000000] CPU MTRRs all blank - virtualized system.
> [    0.000000] Scan SMP from ffff880000000000 for 1024 bytes.
> [    0.000000] Scan SMP from ffff88000009fc00 for 1024 bytes.
> [    0.000000] found SMP MP-table at [ffff88000009fff0] 9fff0
> [    0.000000]     memblock_x86_reserve_range: [0x0009fff0-0x0009ffff]   * MP-table mpf
> [    0.000000]   mpc: e1160-e1254
> [    0.000000]     memblock_x86_reserve_range: [0x000e1160-0x000e1253]   * MP-table mpc
> [    0.000000]     memblock_x86_reserve_range: [0x0348b000-0x0348b070]              BRK
> [    0.000000] MEMBLOCK configuration:
> [    0.000000]  memory size = 0x0
> [    0.000000]  memory.cnt  = 0x1
> [    0.000000]  memory[0x0]     [0x00000000000000-0xffffffffffffffff], 0x0 bytes
> [    0.000000]  reserved.cnt  = 0x6
> [    0.000000]  reserved[0x0]   [0x0000000009fc00-0x000000000fffff], 0x60400 bytes
> [    0.000000]  reserved[0x1]   [0x0000000009fff0-0x0000000009ffff], 0x10 bytes
> [    0.000000]  reserved[0x2]   [0x000000000e1160-0x000000000e1253], 0xf4 bytes
> [    0.000000]  reserved[0x3]   [0x00000001000000-0x0000000348a6d7], 0x248a6d8 bytes
> [    0.000000]  reserved[0x4]   [0x0000000348b000-0x0000000348b070], 0x71 bytes
> [    0.000000]  reserved[0x5]   [0x0000007c59d000-0x0000007fffefff], 0x3a62000 bytes
> [    0.000000] initial memory mapped : 0 - 20000000
> [    0.000000] Kernel panic - not syncing: Cannot allocate trampoline
> [    0.000000] 
> [    0.000000] Pid: 0, comm: swapper Not tainted 2.6.37-rc1-tip-yh-02102-g776a022-dirty #276
> [    0.000000] Call Trace:
> [    0.000000]  [<ffffffff81cd3f14>] panic+0x91/0x1a3
> [    0.000000]  [<ffffffff82783490>] reserve_trampoline_memory+0x46/0x72
> [    0.000000]  [<ffffffff827801fe>] setup_arch+0x5b0/0xae3
> [    0.000000]  [<ffffffff827cb3a0>] ? boot_command_line+0x0/0x800
> [    0.000000]  [<ffffffff81cd4067>] ? printk+0x41/0x43
> [    0.000000]  [<ffffffff827cb3a0>] ? boot_command_line+0x0/0x800
> [    0.000000]  [<ffffffff8277bb0d>] start_kernel+0xd7/0x3e8
> [    0.000000]  [<ffffffff8277b2cc>] x86_64_start_reservations+0x9c/0xa0
> [    0.000000]  [<ffffffff8277b3e4>] x86_64_start_kernel+0x114/0x11b
> [    0.000000] ------------[ cut here ]------------
> [    0.000000] WARNING: at kernel/lockdep.c:2322 trace_hardirqs_on_caller+0xc3/0x178()
> [    0.000000] Hardware name: VirtualBox
> [    0.000000] Modules linked in:
> [    0.000000] Pid: 0, comm: swapper Not tainted 2.6.37-rc1-tip-yh-02102-g776a022-dirty #276
> [    0.000000] Call Trace:
> [    0.000000]  [<ffffffff8107bda0>] warn_slowpath_common+0x85/0x9d
> [    0.000000]  [<ffffffff81cd3fe1>] ? panic+0x15e/0x1a3
> [    0.000000]  [<ffffffff8107bdd2>] warn_slowpath_null+0x1a/0x1c
> [    0.000000]  [<ffffffff810a8e59>] trace_hardirqs_on_caller+0xc3/0x178
> [    0.000000]  [<ffffffff810a8f1b>] trace_hardirqs_on+0xd/0xf
> [    0.000000]  [<ffffffff81cd3fe1>] panic+0x15e/0x1a3
> [    0.000000]  [<ffffffff82783490>] reserve_trampoline_memory+0x46/0x72
> [    0.000000]  [<ffffffff827801fe>] setup_arch+0x5b0/0xae3
> [    0.000000]  [<ffffffff827cb3a0>] ? boot_command_line+0x0/0x800
> [    0.000000]  [<ffffffff81cd4067>] ? printk+0x41/0x43
> [    0.000000]  [<ffffffff827cb3a0>] ? boot_command_line+0x0/0x800
> [    0.000000]  [<ffffffff8277bb0d>] start_kernel+0xd7/0x3e8
> [    0.000000]  [<ffffffff8277b2cc>] x86_64_start_reservations+0x9c/0xa0
> [    0.000000]  [<ffffffff8277b3e4>] x86_64_start_kernel+0x114/0x11b
> [    0.000000] ---[ end trace a7919e7f17c0a725 ]---

^ permalink raw reply

* Re: [patch|rfc] add support for I/O scheduler tuning
From: Vivek Goyal @ 2010-11-15 14:57 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <x498w119lc5.fsf@segfault.boston.devel.redhat.com>

On Fri, Nov 12, 2010 at 03:36:47PM +0100, Kay Sievers wrote:
> On Thu, Nov 11, 2010 at 21:07, Jeff Moyer <jmoyer@redhat.com> wrote:
> > Jens Axboe <axboe@kernel.dk> writes:
> >> On 2010-11-10 21:03, Vivek Goyal wrote:
> >>> On Wed, Nov 10, 2010 at 01:26:21PM -0500, David Zeuthen wrote:
> >>>> On Wed, Nov 10, 2010 at 11:47 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
> >>>>> From within the block layer in the kernel, it is difficult to
> >>>>> automatically detect the performance characteristics of the underlying
> >>>>> storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
> >>>>> rule to tune the I/O scheduler properly for most cases.  The basic
> >>>>> approach is to leave CFQ's default tunings alone for SATA disks.  For
> >>>>> everything else, turn off slice idling and bump the quantum in order to
> >>>>> drive higher queue depths.  This patch is an attempt to implement this.
> >>>>>
> >>>>> I've tested it in a variety of configurations:
> >>>>> - cciss devices
> >>>>> - sata disks
> >>>>> - sata ssds
> >>>>> - enterprise storage (single path)
> >>>>> - enterprise storage (multi-path)
> >>>>> - multiple paths to a sata disk (yes, you can actually do that!)
> >>>>>
> >>>>> The tuning works as expected in all of those scenarios.  I look forward
> >>>>> to your comments.
> >>>>
> >>>> This looks useful, but I really think the kernel driver creating the
> >>>> block device should choose/change the defaults for the created block
> >>>> device - it seems really backwards to do this in user-space as an
> >>>> afterthought.
> >>>
> >>> I think it just becomes little easier to implement in user space so that
> >>> if things don't work as expected, somebody can easily disable the rules
> >>> or somebody can easily refine the rule further to better suite their
> >>> needs instead of driver hardcoding this decision.
> >>
> >> That's the primary reason why I suggested doing this in user space. Plus
> >> we don't always know in the kernel, at least this provides an easier way
> >> to auto-tune things.
> >
> > Right, so given the above, is there still opposition to doing this in
> > udev?
> 
> Not in general. Udev can do such things, that's what it's there for.
> It can do quirks, custom setups, and support tweaked configs that way.
> 
> But it's usually not meant to set common defaults for every box. The
> last time we got into this business, and set timeouts for scsi devices
> from udev, we broke more recent kernels that did not like the
> specified values anymore, and we needed to remove all that in released
> versions, to be able to safely run newer kernels. And we've been told
> not to do such a thing in the future.
> 
> And all what your rules are doing is to unconditionally apply
> kernel-internal knowledge to kernel devices -- which if you look at it
> from one step back -- is a bit weird.
> 
> So I guess, this should be done from the multipath package, the dm
> setup, some 'tweak.rpm', ...  I'm not sure, if we can do that for
> everybody from the main udev sources, for the same reasons the scsi
> timeout was wrong to do from udev. The time we added it, it seemed to
> be the right thing, but 2 years later it wasn't, because the kernel
> evolved, and we got into its way.

Hi Kay,

I can understand the issue of a rule no longer being valid if the kernel
evolves. But the question is, what's wrong with that? Why can't we keep
on updating the udev rules as the kernel and hardware evolve? Are they
supposed to be set in stone once a rule has been written?

Even if we move the rule to some other user space package, that
package will face the same issue of the rule no longer being valid if the
kernel evolves. So that would be equivalent to just shifting the problem
from one user space package to another.

To me, the key question here is whether udev should try to set up some
reasonable IO scheduler defaults for the system, or whether that should be
left entirely to the kernel.

"Deadline" IO scheduler generally works very well with enterprise storage.
CFQ primarily cuts down seeks for very seeky media like SATA drives. The
kernel by default keeps CFQ as the default for all devices, and we are trying
to improve the out-of-the-box experience for the user instead of imposing CFQ
on everybody and expecting them to change it later to deadline where
appropriate.

Because rules are still not very clear yet and we are not sure how well
this notion of CFQ for SATA is going to play with everybody, to me it
still might not be a bad idea to initially write a udev rule and if
this works reasonably well or kernel evolves, we can modify the rule
accordingly.

Thanks
Vivek

^ permalink raw reply

* Re: [patch|rfc] add support for I/O scheduler tuning
From: Kay Sievers @ 2010-11-15 15:43 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <x498w119lc5.fsf@segfault.boston.devel.redhat.com>

On Mon, Nov 15, 2010 at 15:57, Vivek Goyal <vgoyal@redhat.com> wrote:
> On Fri, Nov 12, 2010 at 03:36:47PM +0100, Kay Sievers wrote:
>> On Thu, Nov 11, 2010 at 21:07, Jeff Moyer <jmoyer@redhat.com> wrote:
>> > Jens Axboe <axboe@kernel.dk> writes:
>> >> On 2010-11-10 21:03, Vivek Goyal wrote:
>> >>> On Wed, Nov 10, 2010 at 01:26:21PM -0500, David Zeuthen wrote:
>> >>>> On Wed, Nov 10, 2010 at 11:47 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
>> >>>>> From within the block layer in the kernel, it is difficult to
>> >>>>> automatically detect the performance characteristics of the underlying
>> >>>>> storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
>> >>>>> rule to tune the I/O scheduler properly for most cases.  The basic
>> >>>>> approach is to leave CFQ's default tunings alone for SATA disks.  For
>> >>>>> everything else, turn off slice idling and bump the quantum in order to
>> >>>>> drive higher queue depths.  This patch is an attempt to implement this.
>> >>>>>
>> >>>>> I've tested it in a variety of configurations:
>> >>>>> - cciss devices
>> >>>>> - sata disks
>> >>>>> - sata ssds
>> >>>>> - enterprise storage (single path)
>> >>>>> - enterprise storage (multi-path)
>> >>>>> - multiple paths to a sata disk (yes, you can actually do that!)
>> >>>>>
>> >>>>> The tuning works as expected in all of those scenarios.  I look forward
>> >>>>> to your comments.
>> >>>>
>> >>>> This looks useful, but I really think the kernel driver creating the
>> >>>> block device should choose/change the defaults for the created block
>> >>>> device - it seems really backwards to do this in user-space as an
>> >>>> afterthought.
>> >>>
>> >>> I think it just becomes little easier to implement in user space so that
>> >>> if things don't work as expected, somebody can easily disable the rules
>> >>> or somebody can easily refine the rule further to better suite their
>> >>> needs instead of driver hardcoding this decision.
>> >>
>> >> That's the primary reason why I suggested doing this in user space. Plus
>> >> we don't always know in the kernel, at least this provides an easier way
>> >> to auto-tune things.
>> >
>> > Right, so given the above, is there still opposition to doing this in
>> > udev?
>>
>> Not in general. Udev can do such things, that's what it's there for.
>> It can do quirks, custom setups, and support tweaked configs that way.
>>
>> But it's usually not meant to set common defaults for every box. The
>> last time we got into this business, and set timeouts for scsi devices
>> from udev, we broke more recent kernels that did not like the
>> specified values anymore, and we needed to remove all that in released
>> versions, to be able to safely run newer kernels. And we've been told
>> not to do such a thing in the future.
>>
>> And all what your rules are doing is to unconditionally apply
>> kernel-internal knowledge to kernel devices -- which if you look at it
>> from one step back -- is a bit weird.
>>
>> So I guess, this should be done from the multipath package, the dm
>> setup, some 'tweak.rpm', ...  I'm not sure, if we can do that for
>> everybody from the main udev sources, for the same reasons the scsi
>> timeout was wrong to do from udev. The time we added it, it seemed to
>> be the right thing, but 2 years later it wasn't, because the kernel
>> evolved, and we got into its way.
>
> Hi Kay,
>
> I can understand the issue of a rule being not valid anymore if kernel
> evolves. But the question is what's wrong with that? Why can't we keep
> on updating the udev rules as kernel and hardware evolves. Are they
> supposed to be set in stone once a rule has been written?
>
> Even if we move the rule to some other user space package, then that
> package will face the same issue of rule being not valid any more if
> kernel evolves. So that will be equivalent just shifting the problem
> from one user space package to other.
>
> To me key thing here is whether udev should try to set up some reasonable
> IO scheduler defaults for system or not or it should be left entirely
> to kernel.
>
> "Deadline" IO scheduler generally works very well with enterprise storage.
> CFQ primarly cuts down seeks for very seeky media like SATA drive. Kernel
> by default keeps CFQ as default for all the devices and we are trying to
> improve out of the box experience for the user instead of imposing CFQ
> on everybody and expecting them to change it later to deadline where
> appropriate.
>
> Because rules are still not very clear yet and we are not sure how well
> this notion of CFQ for SATA is going to play with everybody, to me it
> still might not be a bad idea to initially write a udev rule and if
> this works reasonably well or kernel evolves, we can modify the rule
> accordingly.

Udev can be the engine to change stuff on demand, but it should not
ship *common defaults* which are only gathered from kernel
information. If that's the goal, and it should be done for all
systems, please change the kernel to that directly, and don't put that
into udev.

It would be a different picture if userspace would be involved in some
sense, like persistently storing results of 'disk tests', or something
similar, and applying calculated values based on these earlier
results, to the actual disk when it is re-discovered. It would
probably also involve permanent monitoring, and updating these values.

Retrieving simple kernel values and re-applying them to the kernel does
not make much sense in general, not for block devices, not for other
subsystems, and things like that should not go into the udev
repository for the reasons mentioned in the earlier mail.

Kay

^ permalink raw reply

* Re: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Matt Domsch @ 2010-11-15 16:47 UTC (permalink / raw)
  To: Greg KH
  Cc: K, Narendra, linux-hotplug@vger.kernel.org,
	netdev@vger.kernel.org, Hargrave, Jordan, Rose, Charles
In-Reply-To: <20101105025848.GA14021@pws490.domsch.com>

On Thu, Nov 04, 2010 at 09:58:48PM -0500, Matt Domsch wrote:
> On Wed, Nov 03, 2010 at 11:05:00AM -0700, Greg KH wrote:
> > On Wed, Nov 03, 2010 at 10:25:25PM +0530, Narendra_K@Dell.com wrote:
> > > Hello,
> > > 
> > > This patch allows users to specify if they want the onboard network
> > > interfaces to be renamed to lomN by implementing a command line param
> > > 'udevlom'.
> > 
> > Ick ick ick.
> > 
> > Why not do this in some other configuration file?  Don't rely on udev
> > being started with a different option, that is only ripe for abuse by
> > everyone else who wants their pet-project to get into the udev
> > environment.
> > 
> > Please, surely there's a different way to do this.
> 
> At Linux Plumbers Conference today, this problem space was discussed
> once again, and I believe concensus on approach was reached.  Here
> goes:
> 
> * If a 70-persistent-net.rules file sets a name, honor that.  This
>   preserves existing installs.
> 
> * If BIOS provides indexes for onboard devices, honor that.
> ** Rename onboard NICs "lom[1-N]" as BIOS reports (# matches chassis labels)
> ** No rename for all others "ethX" (no change for NICs in PCI slots/USB/others)

I'm getting a lot of pushback from Dell customers on our
linux-poweredge mailing list (thread starts [1]) that the choice of
name "lomX" is poor, due to HP's extensive use of LOM meaning Lights
Out Management, rather than my intended meaning of "LAN on
Motherboard".  Gotta hate TLA collisions.

So, I'm open to new ideas for naming these.  At LPC, Ted noted that
2- and 3-letter names are expected.  "nic[1234]" or "en[1234]" ?

And yes, they'd prefer that we keep "eth[0123]" for the onboard
devices, but I simply don't see how to do that without kernel changes,
due to the races in both naming them in the kernel vs udev renaming,
and simple races between two udev processes.

Thanks,
Matt

[1] http://lists.us.dell.com/pipermail/linux-poweredge/2010-November/043576.html

-- 
Matt Domsch
Technology Strategist
Dell | Office of the CTO

^ permalink raw reply

* Re: [PATCH 1/1] UDEV - Add 'udevlom' command line param to
From: Ben Hutchings @ 2010-11-15 17:16 UTC (permalink / raw)
  To: Matt Domsch
  Cc: Greg KH, K, Narendra, linux-hotplug@vger.kernel.org,
	netdev@vger.kernel.org, Hargrave, Jordan, Rose, Charles
In-Reply-To: <20101115164714.GB7030@auslistsprd01.us.dell.com>

On Mon, 2010-11-15 at 10:47 -0600, Matt Domsch wrote:
> On Thu, Nov 04, 2010 at 09:58:48PM -0500, Matt Domsch wrote:
> > On Wed, Nov 03, 2010 at 11:05:00AM -0700, Greg KH wrote:
> > > On Wed, Nov 03, 2010 at 10:25:25PM +0530, Narendra_K@Dell.com wrote:
> > > > Hello,
> > > > 
> > > > This patch allows users to specify if they want the onboard network
> > > > interfaces to be renamed to lomN by implementing a command line param
> > > > 'udevlom'.
> > > 
> > > Ick ick ick.
> > > 
> > > Why not do this in some other configuration file?  Don't rely on udev
> > > being started with a different option, that is only ripe for abuse by
> > > everyone else who wants their pet-project to get into the udev
> > > environment.
> > > 
> > > Please, surely there's a different way to do this.
> > 
> > At Linux Plumbers Conference today, this problem space was discussed
> > once again, and I believe concensus on approach was reached.  Here
> > goes:
> > 
> > * If a 70-persistent-net.rules file sets a name, honor that.  This
> >   preserves existing installs.
> > 
> > * If BIOS provides indexes for onboard devices, honor that.
> > ** Rename onboard NICs "lom[1-N]" as BIOS reports (# matches chassis labels)
> > ** No rename for all others "ethX" (no change for NICs in PCI slots/USB/others)
> 
> I'm getting a lot of pushback from Dell customers on our
> linux-poweredge mailing list (thread starts [1]) that the choice of
> name "lomX" is poor, due to HP's extensive use of LOM meaning Lights
> Out Management, rather than my intended meaning of "LAN on
> Motherboard".  Gotta hate TLA collisions.
> 
> So, I'm open to new ideas for naming these.  At LPC, Ted noted that
> 2- and 3-letter names are expected.  "nic[1234]" or "en[1234]" ?
[...]

I would suggest avoiding "nic" since some people use "NIC" to mean
specifically an add-in card rather than LOM.  In addition there is some
ambiguity with multi-port cards/controllers of whether NIC means a
controller or a port.

Other options for the prefix:
- "lan".  Maybe too generic.
- "mbe" = MotherBoard Ethernet. Looks a bit like "GbE" as some OEMs put
on the port labels.
- "eom" = Ethernet On Motherboard

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Rick Jones @ 2010-11-15 19:32 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Matt Domsch, Greg KH, K, Narendra, linux-hotplug@vger.kernel.org,
	netdev@vger.kernel.org, Hargrave, Jordan, Rose, Charles
In-Reply-To: <1289841399.2586.18.camel@bwh-desktop>

>>I'm getting a lot of pushback from Dell customers on our
>>linux-poweredge mailing list (thread starts [1]) that the choice of
>>name "lomX" is poor, due to HP's extensive use of LOM meaning Lights
>>Out Management, rather than my intended meaning of "LAN on
>>Motherboard".  Gotta hate TLA collisions.

I think Sun (sorry, Oracle) push LOM for Lights-Out Management quite a lot - 
calling their service processor an iLOM IIRC.

>>So, I'm open to new ideas for naming these.  At LPC, Ted noted that
>>2- and 3-letter names are expected.  "nic[1234]" or "en[1234]" ?
> 
> [...]
> 
> I would suggest avoiding "nic" since some people use "NIC" to mean
> specifically an add-in card rather than LOM.  In addition there is some
> ambiguity with multi-port cards/controllers of whether NIC means a
> controller or a port.
> 
> Other options for the prefix:
> - "lan".  Maybe too generic.

yes and no - that is the prefix for "ethernet" network interface names in HP-UX, 
going back decades.  so, there is precedent for that, and given the way HP-UX 
device name persistence works, 99 times out of ten, the "built-in" or "core" LAN 
interfaces ended-up being enumerated starting from zero - lan0, lan1, etc. 
(There are exceptions relating to certain models of systems and a full 
re-install of the OS with add-on cards present but that is a story for another 
thread).

> - "mbe" = MotherBoard Ethernet. Looks a bit like "GbE" as some OEMs put
> on the port labels.

Collides with Multi-Bit Error.

> - "eom" = Ethernet On Motherboard

Collides with End of Message.

If there is indeed *no* way to get them named eth[1-N], and "lan" doesn't 
resonate well-enough, then my contribution to the bikeshed would be "cor" simply 
because I don't know the TLA with which that collides :)

Are folks sufficiently confident that using anything other than "eth" won't cause 
some unpleasant "our app always ass-u-me-d interfaces started with 'eth'" 
situations?

rick jones

^ permalink raw reply

* extended netdevice naming proposal
From: Matt Domsch @ 2010-11-17 22:06 UTC (permalink / raw)
  To: linux-hotplug, netdev, narendra_k; +Cc: jcm, notting

While this _is_ the original bikeshedding problem, as long as I'm
going to use biosdevname to change names for embedded NICs, perhaps I
can be so bold as to change them for USB add-in cards too?

There are quite a few dimensions to the problem:
* device location (onboard, PCI, other bus)
* multiple ports on a single add-in card
* with Network Partitioning (NPAR) and SR-IOV, the OS sees multiple
  network interfaces (physical or virtual interfaces) but a single external port
* the suffix .1234 currently used for vlans (ala vconfig)
* A single PCI device may drive multiple external ports

As such, here is a naming proposal, aimed to keep within 15
characters for most configurations.

(location)(slot)#(port)/(instance).(vlan)

location := NIC on Motherboard = net1, net2, net3, net4
   (note: people hated the TLA collision with 'lom', so avoiding that here).
         := PCI slot = pci1, pci2, pci3, pci4
 these correspond to chassis labels, information is available in
 $PIRQ, SMBIOS or ACPI, which biosdevname retrieves and uses.

For single- or multi-port cards in PCI slots, append #(port):
   pci1#1, pci1#2, pci1#3, pci1#4  for 4 ports on a card in PCI slot 1

There is currently no way to get this port info from BIOS.  Several people
have suggested adding a PCI capabilities field to expose this
info in a standard way, but that's a ways off. Until then, biosdevname
can guess (assume ascending MAC order on the single card).

For NPAR/SR-IOV where the physical port is shared by several
instances, append /(instance):
   net1/1, net1/2 pci1#1/1, pci1#1/2,
   pci1#1/2, pci1#1/3, ...

For each of the above where vconfig sets up a vlan:
   pci1#1/1.1000, pci1#1/2.1001, pci1#1/2.1003, pci1#1/3.1004, ...
vconfig simply appends .{vlan#} to the already named device when creating a new vlan netdevice.

BIOS definitely doesn't know about these, as they aren't exposed until
after the OS is running, so the mechanism that creates them (such as
following modprobe ixgbevf) would have to, and I think that can be
done with a udev rule, if we can somehow expose the port number of the
underlying PF when we throw the message to udev on creation of the VF.

And of course, BIOS knows nothing about vlans, so vconfig would add that.

If we have no more than 99 PCI slots, no more than 99 ports on a
single card, no more than 999 instances/virtual functions (we need at
least 128, perhaps more than 256 at some point, so hex vs decimal
doesn't buy us much here), no more than 4k VLANs, we get:

len("pci99#99/999.4095") = 17

If we really have that many, we're in trouble in other ways, so let's
hope that's good enough.

Thoughts?  Overkill?

Thanks,
Matt

-- 
Matt Domsch
Technology Strategist
Dell | Office of the CTO

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox