Linux Hotplug development

Linux Hotplug development
 help / color / mirror / Atom feed

* [v2,6/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:42 UTC (permalink / raw)
  To: linux-hotplug

From: Shaohui Zheng <shaohui.zheng@intel.com>
Subject: hotplug emulator: Fake CPU socket with logical CPU on x86

When hot-plugging a CPU with the emulator, we use a logical CPU to emulate
the CPU hotplug process. On CPUs that support SMT, several logical CPUs share
the same socket, but with the emulator they may end up in different NUMA nodes.
This misleads the scheduler into building an incorrect domain hierarchy, and
causes the following call trace when the scheduling domains are rebalanced:

divide error: 0000 [#1] SMP 
last sysfs file: /sys/devices/system/cpu/cpu8/online
CPU 0 
Modules linked in: fbcon tileblit font bitblit softcursor radeon ttm drm_kms_helper e1000e usbhid via_rhine mii drm i2c_algo_bit igb dca
Pid: 0, comm: swapper Not tainted 2.6.32hpe #78 X8DTN
RIP: 0010:[<ffffffff81051da5>]  [<ffffffff81051da5>] find_busiest_group+0x6c5/0xa10
RSP: 0018:ffff880028203c30  EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000015ac0 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff880277e8cfa0 RDI: 0000000000000000
RBP: ffff880028203dc0 R08: ffff880277e8cfa0 R09: 0000000000000040
R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
FS:  0000000000000000(0000) GS:ffff880028200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 00007f16cfc85770 CR3: 0000000001001000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 0, threadinfo ffffffff81822000, task ffffffff8184a600)
Stack:
 ffff880028203d60 ffff880028203cd0 ffff8801c204ff08 ffff880028203e38
<0> 0101ffff81018c59 ffff880028203e44 00000001810806bd ffff8801c204fe00
<0> 0000000528200000 ffffffff00000000 0000000000000018 0000000000015ac0
Call Trace:
 <IRQ> 
 [<ffffffff81088ee0>] ? tick_dev_program_event+0x40/0xd0
 [<ffffffff81053b2c>] rebalance_domains+0x17c/0x570
 [<ffffffff81018c89>] ? read_tsc+0x9/0x20
 [<ffffffff81088ee0>] ? tick_dev_program_event+0x40/0xd0
 [<ffffffff810569ed>] run_rebalance_domains+0xbd/0xf0
 [<ffffffff8106471f>] __do_softirq+0xaf/0x1e0
 [<ffffffff810b7d18>] ? handle_IRQ_event+0x58/0x160
 [<ffffffff810130ac>] call_softirq+0x1c/0x30
 [<ffffffff81014a85>] do_softirq+0x65/0xa0
 [<ffffffff810645cd>] irq_exit+0x7d/0x90
 [<ffffffff81013ff0>] do_IRQ+0x70/0xe0
 [<ffffffff810128d3>] ret_from_intr+0x0/0x11
 <EOI> 
 [<ffffffff8133387f>] ? acpi_idle_enter_bm+0x281/0x2b5
 [<ffffffff81333878>] ? acpi_idle_enter_bm+0x27a/0x2b5
 [<ffffffff8145dc8f>] ? cpuidle_idle_call+0x9f/0x130
 [<ffffffff81010e2b>] ? cpu_idle+0xab/0x100
 [<ffffffff8158aee6>] ? rest_init+0x66/0x70
 [<ffffffff81905d90>] ? start_kernel+0x3e3/0x3ef
 [<ffffffff8190533a>] ? x86_64_start_reservations+0x125/0x129
 [<ffffffff81905438>] ? x86_64_start_kernel+0xfa/0x109
Code: 00 00 e9 4c fb ff ff 0f 1f 80 00 00 00 00 48 8b b5 d8 fe ff ff 48 8b 45 a8 4d 29 ef 8b 56 08 48 c1 e0 0a 49 89 f0 48 89 d7 31 d2 <48> f7 f7 31 d2 48 89 45 a0 8b 76 08 4c 89 f0 48 c1 e0 0a 48 f7 
RIP  [<ffffffff81051da5>] find_busiest_group+0x6c5/0xa10
 RSP <ffff880028203c30>

Solution:

We put the logical CPU into a fake CPU socket, and assign it a unique
phys_proc_id. Each fake socket contains exactly one logical CPU. This
method fixes the above bug.

Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
---
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 325b7bd..9a2088c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -113,6 +113,15 @@ struct cpuinfo_x86 {
 	/* Index into per_cpu list: */
 	u16			cpu_index;
 #endif
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	/*
+	 * Use a logic cpu to emulate a physical cpu's hotplug. We put the
+	 * logical cpu into a fake socket, assign a fake physical id to it,
+	 * and create a fake core.
+	 */
+	__u8		cpu_probe_on; /* A flag to enable cpu probe/release */
+#endif
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define X86_VENDOR_INTEL	0
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 170d9b9..1d4dc67 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,6 +97,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
  */
 static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
 
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
 void cpu_hotplug_driver_lock()
 {
         mutex_lock(&x86_cpu_hotplug_driver_mutex);
@@ -106,6 +107,7 @@ void cpu_hotplug_driver_unlock()
 {
         mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 }
+#endif
 
 #else
 static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
@@ -198,6 +200,8 @@ static void __cpuinit smp_callin(void)
 {
 	int cpuid, phys_id;
 	unsigned long timeout;
+	u8 cpu_probe_on = 0;
+	struct cpuinfo_x86 *c;
 
 	/*
 	 * If waken up by an INIT in an 82489DX configuration
@@ -277,7 +281,20 @@ static void __cpuinit smp_callin(void)
 	/*
 	 * Save our processor parameters
 	 */
+	c = &cpu_data(cpuid);
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	cpu_probe_on = c->cpu_probe_on;
+	phys_id = c->phys_proc_id;
+#endif
+
 	smp_store_cpu_info(cpuid);
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	if (cpu_probe_on) {
+		c->phys_proc_id = phys_id; /* restore the fake phys_proc_id */
+		c->cpu_core_id = 0; /* force the logical cpu to core 0 */
+		c->cpu_probe_on = cpu_probe_on;
+	}
+#endif
 
 	notify_cpu_starting(cpuid);
 
@@ -400,6 +417,11 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 {
 	int i;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	int cpu_probe_on = 0;
+
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	cpu_probe_on = c->cpu_probe_on;
+#endif
 
 	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
@@ -431,7 +453,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
-		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
+		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
+			cpu_probe_on == 0) {
 			cpumask_set_cpu(i, c->llc_shared_map);
 			cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map);
 		}
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 515f08a..98a9b45 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -90,6 +90,36 @@ void arch_unregister_cpu(int num)
 }
 EXPORT_SYMBOL(arch_unregister_cpu);
 
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+/*
+ * Put the logical cpu into a new socket, and encapsulate it into core 0.
+ */
+static void fake_cpu_socket_info(int cpu)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	int i, phys_id = 0;
+
+	/* find the highest phys_proc_id among the present cpus */
+	for_each_present_cpu(i) {
+		struct cpuinfo_x86 *ci = &cpu_data(i);
+		if (phys_id < ci->phys_proc_id)
+			phys_id = ci->phys_proc_id;
+	}
+
+	c->phys_proc_id = phys_id + 1; /* pick up an unused phys_proc_id */
+	c->cpu_core_id = 0; /* always put the logical cpu to core 0 */
+	c->cpu_probe_on = 1; /* mark the cpu as emulated */
+}
+
+static void clear_cpu_socket_info(int cpu)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	c->phys_proc_id = 0;
+	c->cpu_core_id = 0;
+	c->cpu_probe_on = 0;
+}
+
+
 ssize_t arch_cpu_probe(const char *buf, size_t count)
 {
 	int nid = 0;
@@ -129,6 +159,7 @@ ssize_t arch_cpu_probe(const char *buf, size_t count)
 	/* register cpu */
 	arch_register_cpu_emu(selected, nid);
 	acpi_map_lsapic_emu(selected, nid);
+	fake_cpu_socket_info(selected);
 
 	return count;
 }
@@ -152,10 +183,13 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
 
 	arch_unregister_cpu(cpu);
 	acpi_unmap_lsapic(cpu);
+	clear_cpu_socket_info(cpu);
+	set_cpu_present(cpu, true);
 
 	return count;
 }
 EXPORT_SYMBOL(arch_cpu_release);
+#endif CONFIG_ARCH_CPU_PROBE_RELEASE
 
 #else /* CONFIG_HOTPLUG_CPU */
 

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* [v2,5/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:42 UTC (permalink / raw)
  To: linux-hotplug

From: Shaohui Zheng <shaohui.zheng@intel.com>
Subject: hotplug emulator: support cpu probe/release in x86

Add a cpu probe/release interface under sysfs for x86. Users can use this
interface to emulate the cpu hot-add process; it is intended for cpu hotplug
testing. Add a kernel option CONFIG_ARCH_CPU_PROBE_RELEASE for this
feature.

This interface provides a mechanism to emulate cpu hotplug in software,
which makes it possible to do cpu hotplug automation and stress
testing.

Directions:
*) Reserve CPUs through a grub parameter like:
	maxcpus=4

the remaining CPUs will not be initialized.

*) Probe CPU
we can use the probe interface to hot-add new CPUs:
	echo nid > /sys/devices/system/cpu/probe

*) Release a CPU
	echo cpu > /sys/devices/system/cpu/release

A reserved CPU will be hot-added to the specified node.
1) nid = 0, the CPU will be added to the real node which the CPU
should be in
2) nid != 0, add the CPU to node nid even though it is a fake node.

Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@intel.com>
---
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b185091..339ac2d 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -28,6 +28,9 @@ struct x86_cpu {
 #ifdef CONFIG_HOTPLUG_CPU
 extern int arch_register_cpu(int num);
 extern void arch_unregister_cpu(int);
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+extern int arch_register_cpu_emu(int num, int nid);
+#endif
 #endif
 
 DECLARE_PER_CPU(int, cpu_state);
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c05872a..28f052c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -647,8 +647,44 @@ int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
 }
 EXPORT_SYMBOL(acpi_map_lsapic);
 
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+static void acpi_map_cpu2node_emu(int cpu, int physid, int nid)
+{
+#ifdef CONFIG_ACPI_NUMA
+#ifdef CONFIG_X86_64
+	apicid_to_node[physid] = nid;
+	numa_set_node(cpu, nid);
+#else /* CONFIG_X86_32 */
+	apicid_2_node[physid] = nid;
+	cpu_to_node_map[cpu] = nid;
+#endif
+#endif
+}
+
+static u16 cpu_to_apicid_saved[CONFIG_NR_CPUS];
+int __ref acpi_map_lsapic_emu(int pcpu, int nid)
+{
+	/* back up the cpu's apicid the first time a valid one is seen */
+	if (cpu_to_apicid_saved[pcpu] == 0 &&
+		per_cpu(x86_cpu_to_apicid, pcpu) != BAD_APICID)
+		cpu_to_apicid_saved[pcpu] = per_cpu(x86_cpu_to_apicid, pcpu);
+
+	per_cpu(x86_cpu_to_apicid, pcpu) = cpu_to_apicid_saved[pcpu];
+	acpi_map_cpu2node_emu(pcpu, per_cpu(x86_cpu_to_apicid, pcpu), nid);
+
+	return pcpu;
+}
+EXPORT_SYMBOL(acpi_map_lsapic_emu);
+#endif
+
 int acpi_unmap_lsapic(int cpu)
 {
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+	/* back up the cpu's apicid before it is invalidated below */
+	if (cpu_to_apicid_saved[cpu] == 0 &&
+		per_cpu(x86_cpu_to_apicid, cpu) != BAD_APICID)
+		cpu_to_apicid_saved[cpu] = per_cpu(x86_cpu_to_apicid, cpu);
+#endif
 	per_cpu(x86_cpu_to_apicid, cpu) = -1;
 	set_cpu_present(cpu, false);
 	num_processors--;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4..170d9b9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -107,8 +107,6 @@ void cpu_hotplug_driver_unlock()
         mutex_unlock(&x86_cpu_hotplug_driver_mutex);
 }
 
-ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
-ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
 #else
 static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
 #define get_idle_for_cpu(x)      (idle_thread_array[(x)])
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index f716cd9..515f08a 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -29,6 +29,9 @@
 #include <linux/mmzone.h>
 #include <linux/init.h>
 #include <linux/smp.h>
+#include <linux/cpu.h>
+#include <linux/topology.h>
+#include <linux/acpi.h>
 #include <asm/cpu.h>
 
 static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
@@ -37,6 +40,11 @@ static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
 /*
  * Add nid(NUMA node id) as parameter for cpu hotplug emulation. It supports
  * to register a CPU to any nodes.
+ *
+ * nid is a special parameter, it has 2 different branches:
+ * 1) when nid = NUMA_NO_NODE, the CPU will be registered into the normal node
+ * which it should be in.
+ * 2) nid != NUMA_NO_NODE, it will be registered into the specified node.
  */
 static int __ref __arch_register_cpu(int num, int nid)
 {
@@ -52,8 +60,23 @@ static int __ref __arch_register_cpu(int num, int nid)
 	if (num)
 		per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
 
-	return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+	if (nid == NUMA_NO_NODE)
+		return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
+	else
+		return register_cpu_emu(&per_cpu(cpu_devices, num).cpu, num, nid);
+}
+
+/*
+ * Emulated version of function arch_register_cpu
+ * Parameter:
+ *	  num: cpu_id
+ *	  nid: emulated numa id
+ */
+int __ref arch_register_cpu_emu(int num, int nid)
+{
+	return __arch_register_cpu(num, nid);
 }
+EXPORT_SYMBOL(arch_register_cpu_emu);
 
 int __ref arch_register_cpu(int num)
 {
@@ -66,6 +89,74 @@ void arch_unregister_cpu(int num)
 	unregister_cpu(&per_cpu(cpu_devices, num).cpu);
 }
 EXPORT_SYMBOL(arch_unregister_cpu);
+
+ssize_t arch_cpu_probe(const char *buf, size_t count)
+{
+	int nid = 0;
+	int num = 0, selected = -1;
+
+	/* check parameters */
+	if (!buf || count < 2)
+		return -EPERM;
+
+	nid = simple_strtoul(buf, NULL, 0);
+	printk(KERN_DEBUG "Add a cpu to node : %d\n", nid);
+
+	if (nid < 0 || nid > nr_node_ids - 1) {
+		printk(KERN_ERR "Invalid NUMA node id: %d (0 <= nid < %d).\n",
+			nid, nr_node_ids);
+		return -EPERM;
+	}
+
+	if (!node_online(nid)) {
+		printk(KERN_ERR "NUMA node %d is not online, give up.\n", nid);
+		return -EPERM;
+	}
+
+	/* find the first present cpu that has no sysdev registered yet */
+	for_each_present_cpu(num) {
+		if (per_cpu(cpu_sys_devices, num) == NULL) {
+			selected = num;
+			break;
+		}
+	}
+
+	if (selected < 0) { /* nothing found: do not fall back to cpu 0 */
+		printk(KERN_ERR "No free cpu, give up cpu probing.\n");
+		return -EPERM;
+	}
+
+	/* register cpu */
+	arch_register_cpu_emu(selected, nid);
+	acpi_map_lsapic_emu(selected, nid);
+
+	return count;
+}
+EXPORT_SYMBOL(arch_cpu_probe);
+
+ssize_t arch_cpu_release(const char *buf, size_t count)
+{
+	int cpu = 0;
+
+	cpu = simple_strtoul(buf, NULL, 0);
+	/* cpu 0 is not hotpluggable */
+	if (cpu == 0) {
+		printk(KERN_ERR "can not release cpu 0.\n");
+		return -EPERM;
+	}
+
+	if (cpu_online(cpu)) {
+		printk(KERN_DEBUG "offline cpu %d.\n", cpu);
+		cpu_down(cpu);
+	}
+
+	arch_unregister_cpu(cpu);
+	acpi_unmap_lsapic(cpu);
+
+	return count;
+}
+EXPORT_SYMBOL(arch_cpu_release);
+
 #else /* CONFIG_HOTPLUG_CPU */
 
 static int __init arch_register_cpu(int num)
@@ -83,8 +174,14 @@ static int __init topology_init(void)
 		register_one_node(i);
 #endif
 
-	for_each_present_cpu(i)
-		arch_register_cpu(i);
+	/*
+	 * when cpu hotplug emulation enabled, register the online cpu only,
+	 * the rests are reserved for cpu probe.
+	 */
+	for_each_present_cpu(i) {
+		if ((cpu_hpe_on && cpu_online(i)) || !cpu_hpe_on)
+			arch_register_cpu(i);
+	}
 
 	return 0;
 }
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2485fd2..fff27ae 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/nodemask.h>
 #include <linux/sched.h>
+#include <linux/cpu.h>
 
 #include <asm/e820.h>
 #include <asm/proto.h>
@@ -914,6 +915,19 @@ void __init init_cpu_to_node(void)
 }
 #endif
 
+#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+static __init int cpu_hpe_setup(char *opt)
+{
+	if (!opt)
+		return -EINVAL;
+
+	if (!strncmp(opt, "on", 2) || !strncmp(opt, "1", 1))
+		cpu_hpe_on = 1;
+
+	return 0;
+}
+early_param("cpu_hpe", cpu_hpe_setup);
+#endif  /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 
 void __cpuinit numa_set_node(int cpu, int node)
 {
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 347eb21..39b4026 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -530,6 +530,14 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
 		goto err_free_cpumask;
 
 	sysdev = get_cpu_sysdev(pr->id);
+	/*
+	 * Reserve cpu for hotplug emulation, the reserved cpu can be hot-added
+	 * through the cpu probe interface. Return directly.
+	 */
+	if (sysdev == NULL) {
+		goto out;
+	}
+
 	if (sysfs_create_link(&device->dev.kobj, &sysdev->kobj, "sysdev")) {
 		result = -EFAULT;
 		goto err_remove_fs;
@@ -570,6 +578,7 @@ static int __cpuinit acpi_processor_add(struct acpi_device *device)
 		goto err_remove_sysfs;
 	}
 
+out:
 	return 0;
 
 err_remove_sysfs:
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index adbfd04..a113d01 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -22,9 +22,15 @@ struct sysdev_class cpu_sysdev_class = {
 };
 EXPORT_SYMBOL(cpu_sysdev_class);
 
-static DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
+DEFINE_PER_CPU(struct sys_device *, cpu_sys_devices);
 
 #ifdef CONFIG_HOTPLUG_CPU
+/*
+ * cpu_hpe_on is a switch to enable/disable cpu hotplug emulation. It is
+ * disabled by default; enable it with the boot parameter cpu_hpe=on.
+ */
+int cpu_hpe_on;
+
 static ssize_t show_online(struct sys_device *dev, struct sysdev_attribute *attr,
 			   char *buf)
 {
@@ -80,6 +86,7 @@ void unregister_cpu(struct cpu *cpu)
 }
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
+
 static ssize_t cpu_probe_store(struct sysdev_class *class,
 			       struct sysdev_class_attribute *attr,
 			       const char *buf,
@@ -250,6 +257,18 @@ int __cpuinit register_cpu(struct cpu *cpu, int num)
 	return __register_cpu(cpu, num, cpu_to_node(num));
 }
 
+/*
+ * Register cpu to the specified NUMA node
+ *
+ * emulated version of function register_cpu, but is more flexible. it supports
+ * an extra parameter nid, We can register a CPU to any specified node throu
+ * this function.
+ */
+int __cpuinit register_cpu_emu(struct cpu *cpu, int num, int nid)
+{
+	return __register_cpu(cpu, num, nid);
+}
+
 struct sys_device *get_cpu_sysdev(unsigned cpu)
 {
 	if (cpu < nr_cpu_ids && cpu_possible(cpu))
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c227757..858c980 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -102,6 +102,7 @@ void acpi_numa_arch_fixup(void);
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 /* Arch dependent functions for cpu hotplug support */
 int acpi_map_lsapic(acpi_handle handle, int *pcpu);
+int acpi_map_lsapic_emu(int pcpu, int nid);
 int acpi_unmap_lsapic(int cpu);
 #endif /* CONFIG_ACPI_HOTPLUG_CPU */
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4823af6..8447a81 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -30,7 +30,10 @@ struct cpu {
 	struct sys_device sysdev;
 };
 
+DECLARE_PER_CPU(struct sys_device *, cpu_sys_devices);
+
 extern int register_cpu(struct cpu *cpu, int num);
+extern int register_cpu_emu(struct cpu *cpu, int num, int nid);
 extern struct sys_device *get_cpu_sysdev(unsigned cpu);
 
 extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr);
@@ -143,6 +146,7 @@ extern void put_online_cpus(void);
 #define register_hotcpu_notifier(nb)	register_cpu_notifier(nb)
 #define unregister_hotcpu_notifier(nb)	unregister_cpu_notifier(nb)
 int cpu_down(unsigned int cpu);
+extern int cpu_hpe_on;
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
 extern void cpu_hotplug_driver_lock(void);
@@ -165,6 +169,7 @@ static inline void cpu_hotplug_driver_unlock(void)
 /* These aren't inline functions due to a GCC bug. */
 #define register_hotcpu_notifier(nb)	({ (void)(nb); 0; })
 #define unregister_hotcpu_notifier(nb)	({ (void)(nb); })
+static int cpu_hpe_on;
 #endif		/* CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_PM_SLEEP_SMP
diff --git a/mm/Kconfig b/mm/Kconfig
index a01a679..f88690c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -162,6 +162,17 @@ config NODE_HOTPLUG_EMU
 	  N is the number of hidden nodes, size is the memory size per
 	  hidden node. This is only useful for debugging.
 
+config ARCH_CPU_PROBE_RELEASE
+	def_bool y
+	bool "CPU hotplug emulation"
+	depends on NUMA_HOTPLUG_EMU
+	---help---
+	  Enable CPU hotplug emulation. Reserve CPUs with the boot parameter
+	  "maxcpus=N", where N is the initial number of CPUs; the remaining
+	  physical CPUs will not be initialized. A probe/release interface
+	  allows CPUs to be hot-added to or hot-removed from a specified node
+	  in software. This is for debugging and testing purposes.
+
 #
 # If we have space for more page flags then we can enable additional
 # optimizations and functionality.

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* [v2,4/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:41 UTC (permalink / raw)
  To: linux-hotplug

From: Shaohui Zheng <shaohui.zheng@intel.com>
Subject: hotplug emulator: Abstract cpu register functions

Abstract the functions arch_register_cpu and register_cpu, moving the
implementation details into sub-functions with the prefix "__".

Each sub-function takes an extra parameter nid, which can be used to register
a CPU under a fake NUMA node; it is an interface reserved for cpu hotplug
emulation (CPU PROBE/RELEASE) on x86.

Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@intel.com>
---
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e45159..f716cd9 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -34,7 +34,11 @@
 static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
 
 #ifdef CONFIG_HOTPLUG_CPU
-int __ref arch_register_cpu(int num)
+/*
+ * Add nid(NUMA node id) as parameter for cpu hotplug emulation. It supports
+ * to register a CPU to any nodes.
+ */
+static int __ref __arch_register_cpu(int num, int nid)
 {
 	/*
 	 * CPU0 cannot be offlined due to several
@@ -50,6 +54,11 @@ int __ref arch_register_cpu(int num)
 
 	return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
 }
+
+int __ref arch_register_cpu(int num)
+{
+	return __arch_register_cpu(num, NUMA_NO_NODE);
+}
 EXPORT_SYMBOL(arch_register_cpu);
 
 void arch_unregister_cpu(int num)
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index f35719a..4aca9e3 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -208,17 +208,20 @@ static ssize_t print_cpus_offline(struct sysdev_class *class,
 static SYSDEV_CLASS_ATTR(offline, 0444, print_cpus_offline, NULL);
 
 /*
- * register_cpu - Setup a sysfs device for a CPU.
+ * __register_cpu -Initialize and register the CPU device.
+ *
  * @cpu - cpu->hotpluggable field set to 1 will generate a control file in
  *	  sysfs for this CPU.
  * @num - CPU number to use when creating the device.
+ * @nid - numa node id
  *
- * Initialize and register the CPU device.
+ * We do not calculate nid by funciton cpu_to_node(), and change it as a
+ * parameter, it is an reserved interface for CPU hotplug emulation.
  */
-int __cpuinit register_cpu(struct cpu *cpu, int num)
+static int __cpuinit __register_cpu(struct cpu *cpu, int num, int nid)
 {
 	int error;
-	cpu->node_id = cpu_to_node(num);
+	cpu->node_id = nid;
 	cpu->sysdev.id = num;
 	cpu->sysdev.cls = &cpu_sysdev_class;
 
@@ -229,7 +232,7 @@ int __cpuinit register_cpu(struct cpu *cpu, int num)
 	if (!error)
 		per_cpu(cpu_sys_devices, num) = &cpu->sysdev;
 	if (!error)
-		register_cpu_under_node(num, cpu_to_node(num));
+		register_cpu_under_node(num, nid);
 
 #ifdef CONFIG_KEXEC
 	if (!error)
@@ -238,6 +241,15 @@ int __cpuinit register_cpu(struct cpu *cpu, int num)
 	return error;
 }
 
+/*
+ * register_cpu - Setup a sysfs device for a CPU.
+ * Initialize and register the CPU device.
+ */
+int __cpuinit register_cpu(struct cpu *cpu, int num)
+{
+	return __register_cpu(cpu, num, cpu_to_node(num));
+}
+
 struct sys_device *get_cpu_sysdev(unsigned cpu)
 {
 	if (cpu < nr_cpu_ids && cpu_possible(cpu))

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* Subject: [v2,3/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:40 UTC (permalink / raw)
  To: linux-hotplug

From: Haicheng Li <haicheng.li@linux.intel.com>
Subject: hotplug emulator:  Userland interface to hotplug-add fake offlined nodes.

Add a sysfs entry "probe" under /sys/devices/system/node/:

 - to show all fake offlined nodes:
    $ cat /sys/devices/system/node/probe

 - to hotadd a fake offlined node, e.g. nodeid is N:
    $ echo N > /sys/devices/system/node/probe

Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
---
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 2872e86..d21bea2 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -538,6 +538,25 @@ void unregister_one_node(int nid)
 	unregister_node(&node_devices[nid]);
 }
 
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+static ssize_t store_nodes_probe(struct sysdev_class *class,
+				  struct sysdev_class_attribute *attr,
+				  const char *buf, size_t count)
+{
+	long nid;
+
+	if (strict_strtol(buf, 0, &nid) || /* reject unparsable input */
+	    nid < 0 || nid > nr_node_ids - 1) {
+		printk(KERN_ERR "Invalid NUMA node id: %ld (0 <= nid < %d).\n",
+			nid, nr_node_ids);
+		return -EPERM;
+	}
+	hotadd_hidden_nodes(nid);
+
+	return count;
+}
+#endif
+
 /*
  * node states attributes
  */
@@ -566,26 +585,35 @@ static ssize_t show_node_state(struct sysdev_class *class,
 	return print_nodes_state(na->state, buf);
 }
 
-#define _NODE_ATTR(name, state) \
+#define _NODE_ATTR_RO(name, state) \
 	{ _SYSDEV_CLASS_ATTR(name, 0444, show_node_state, NULL), state }
 
+#define _NODE_ATTR_RW(name, store_func, state) \
+	{ _SYSDEV_CLASS_ATTR(name, 0644, show_node_state, store_func), state }
+
 static struct node_attr node_state_attr[] = {
-	_NODE_ATTR(possible, N_POSSIBLE),
-	_NODE_ATTR(online, N_ONLINE),
-	_NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
-	_NODE_ATTR(has_cpu, N_CPU),
+	[N_POSSIBLE] = _NODE_ATTR_RO(possible, N_POSSIBLE),
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+	[N_HIDDEN] = _NODE_ATTR_RW(probe, store_nodes_probe, N_HIDDEN),
+#endif
+	[N_ONLINE] = _NODE_ATTR_RO(online, N_ONLINE),
+	[N_NORMAL_MEMORY] = _NODE_ATTR_RO(has_normal_memory, N_NORMAL_MEMORY),
 #ifdef CONFIG_HIGHMEM
-	_NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
+	[N_HIGH_MEMORY] = _NODE_ATTR_RO(has_high_memory, N_HIGH_MEMORY),
 #endif
+	[N_CPU] = _NODE_ATTR_RO(has_cpu, N_CPU),
 };
 
 static struct sysdev_class_attribute *node_state_attrs[] = {
-	&node_state_attr[0].attr,
-	&node_state_attr[1].attr,
-	&node_state_attr[2].attr,
-	&node_state_attr[3].attr,
+	&node_state_attr[N_POSSIBLE].attr,
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+	&node_state_attr[N_HIDDEN].attr,
+#endif
+	&node_state_attr[N_ONLINE].attr,
+	&node_state_attr[N_NORMAL_MEMORY].attr,
+	&node_state_attr[N_CPU].attr,
 #ifdef CONFIG_HIGHMEM
-	&node_state_attr[4].attr,
+	&node_state_attr[N_HIGH_MEMORY].attr,
 #endif
 	NULL
 };
diff --git a/mm/Kconfig b/mm/Kconfig
index f0fb912..a01a679 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -147,6 +147,21 @@ config MEMORY_HOTREMOVE
 	depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
 	depends on MIGRATION
 
+config NUMA_HOTPLUG_EMU
+	bool "NUMA hotplug emulator"
+	depends on X86_64 && NUMA && MEMORY_HOTPLUG
+
+	---help---
+
+config NODE_HOTPLUG_EMU
+	bool "Node hotplug emulation"
+	depends on NUMA_HOTPLUG_EMU && MEMORY_HOTPLUG
+	---help---
+	  Enable Node hotplug emulation. The machine will be setup with
+	  hidden virtual nodes when booted with "numa=hide=N*size", where
+	  N is the number of hidden nodes, size is the memory size per
+	  hidden node. This is only useful for debugging.
+
 #
 # If we have space for more page flags then we can enable additional
 # optimizations and functionality.

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* Subject: [v2,2/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:39 UTC (permalink / raw)
  To: linux-hotplug

From: Haicheng Li <haicheng.li@linux.intel.com>
Subject: hotplug emulator: infrastructure of NUMA hotplug emulation

NUMA hotplug emulator introduces a new node state N_HIDDEN to
identify the fake offlined node. It firstly hides RAM via E820
table and then emulates fake offlined nodes with the hidden RAM.

After system bootup, user is able to hotplug-add these offlined
nodes, which is just similar to a real hardware hotplug behavior.

Using boot option "numa=hide=N*size" to fake offlined nodes:
	- N is the number of hidden nodes
	- size is the memory size (in MB) per hidden node.

OPEN: Kernel might use part of hidden memory region as RAM buffer,
      now emulator directly hide 128M extra space to workaround
      this issue.  Any better way to avoid this conflict?

Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
---
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 823e070..0bb8b37 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -37,7 +37,7 @@ extern void __cpuinit numa_clear_node(int cpu);
 extern void __cpuinit numa_add_cpu(int cpu);
 extern void __cpuinit numa_remove_cpu(int cpu);
 
-#ifdef CONFIG_NUMA_EMU
+#if defined(CONFIG_NUMA_EMU) || defined(CONFIG_NODE_HOTPLUG_EMU)
 #define FAKE_NODE_MIN_SIZE	((u64)64 << 20)
 #define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
 #endif /* CONFIG_NUMA_EMU */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23..a985cb0 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -304,6 +304,123 @@ void __init numa_init_array(void)
 	}
 }
 
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+static char *hp_cmdline __initdata;
+static struct bootnode *hidden_nodes;
+static u64 hp_start;
+static long hidden_num, hp_size;
+static u64 nodes_size[MAX_NUMNODES] __initdata;
+
+int hotadd_hidden_nodes(int nid)
+{
+	int ret;
+
+	if (!node_hidden(nid))
+		return -EINVAL;
+
+	ret = add_memory(nid, hidden_nodes[nid].start,
+			 hidden_nodes[nid].end - hidden_nodes[nid].start);
+	if (!ret) {
+		node_clear_hidden(nid);
+		return 0;
+	} else {
+		return -EEXIST;
+	}
+}
+
+/* parse the command line for numa=hide; returns total bytes to hide */
+static long __init parse_hide_nodes(char *hp_cmdline)
+{
+	int coef = 1, nid = 0;
+	u64 size = 0;
+	long total = 0;
+	char buf[512], *p;
+
+	/* parse numa=hide command-line */
+	hidden_num = 0;
+	p = buf;
+	while (1) {
+		if (*hp_cmdline == ',' || *hp_cmdline == '\0') {
+			*p = '\0';
+			size = simple_strtoul(buf, NULL, 0);
+			printk(KERN_ERR "size: %dM buf:%s coef: %d.\n", (int)size, buf, coef);
+			if (!((size<<20) & FAKE_NODE_MIN_HASH_MASK))
+				printk(KERN_ERR "%d M is less than minimum node size, ignore it.\n", (int)size);
+
+			size <<= 20;
+			/* Round down to nearest FAKE_NODE_MIN_SIZE. */
+			size &= FAKE_NODE_MIN_HASH_MASK;
+
+			if (size) {
+				int i;
+				total += size * coef;
+				for (i = 0; i < coef; i++)
+					nodes_size[nid++] = size;
+				hidden_num += coef;
+			}
+
+			coef = 1;
+			p = buf;
+			if (*hp_cmdline == '\0')
+				break;
+			hp_cmdline++;
+		} else if (*hp_cmdline == '*') {
+			*p++ = '\0';
+			coef = simple_strtoul(buf, NULL, 0);
+			p = buf;
+			hp_cmdline++;
+		} else if (!isdigit(*hp_cmdline)) {
+			break;
+		}
+
+		*p++ = *hp_cmdline++;
+	}
+
+	return total;
+}
+
+static void __init numa_hide_nodes(void)
+{
+	hp_size = parse_hide_nodes(hp_cmdline);
+
+	hp_start = e820_hide_mem(hp_size);
+	if (hp_start <= 0) {
+		printk(KERN_ERR "Hide too much memory, disable node hotplug emualtion.");
+		hidden_num = 0;
+		return;
+	}
+
+	/* leave 128M space for possible RAM buffer usage later
+	 any other better way to avoid this conflict?*/
+
+	e820_hide_mem(128*1024*1024);
+}
+
+static void __init numa_hotplug_emulation(void)
+{
+	int i, num_nodes = 0, nid;
+
+	for_each_online_node(i)
+		if (i > num_nodes)
+			num_nodes = i;
+
+	i = num_nodes + hidden_num + 1;	/* highest nid used below, plus one */
+	if (!hidden_nodes) {
+		hidden_nodes = alloc_bootmem(sizeof(struct bootnode) * i);
+		memset(hidden_nodes, 0, sizeof(struct bootnode) * i);
+	}
+
+	nid = num_nodes + 1;
+	for (i = 0; i < hidden_num; i++) {
+		node_set(nid, node_possible_map);
+		hidden_nodes[nid].start = hp_start;
+		hidden_nodes[nid].end = hp_start + (nodes_size[i]);
+		hp_start = hidden_nodes[nid].end;
+		node_set_hidden(nid++);
+	}
+}
+#endif /* CONFIG_NODE_HOTPLUG_EMU */
+
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
@@ -658,7 +775,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 
 #ifdef CONFIG_NUMA_EMU
 	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
-		return;
+		goto done;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
@@ -666,14 +783,14 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 #ifdef CONFIG_ACPI_NUMA
 	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 						  last_pfn << PAGE_SHIFT))
-		return;
+		goto done;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
 	if (!numa_off && k8 && !k8_scan_nodes())
-		return;
+		goto done;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
@@ -693,6 +810,13 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 		numa_set_node(i, 0);
 	e820_register_active_regions(0, start_pfn, last_pfn);
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
+
+done:
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+	if (hidden_num)
+		numa_hotplug_emulation();
+#endif
+	return;
 }
 
 unsigned long __init numa_free_all_bootmem(void)
@@ -720,6 +844,12 @@ static __init int numa_setup(char *opt)
 	if (!strncmp(opt, "fake=", 5))
 		cmdline = opt + 5;
 #endif
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+	if (!strncmp(opt, "hide=", 5)) {
+		hp_cmdline = opt + 5;
+		numa_hide_nodes();
+	}
+#endif
 #ifdef CONFIG_ACPI_NUMA
 	if (!strncmp(opt, "noacpi", 6))
 		acpi_numa = -1;
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index dba35e4..ba0f82d 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -371,6 +371,10 @@ static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
  */
 enum node_states {
 	N_POSSIBLE,		/* The node could become online at some point */
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+	N_HIDDEN,		/* The node is hidden at booting time, could be
+				 * onlined in run time */
+#endif
 	N_ONLINE,		/* The node is online */
 	N_NORMAL_MEMORY,	/* The node has regular memory */
 #ifdef CONFIG_HIGHMEM
@@ -470,6 +474,13 @@ static inline int num_node_state(enum node_states state)
 #define node_online(node)	node_state((node), N_ONLINE)
 #define node_possible(node)	node_state((node), N_POSSIBLE)
 
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+/* Set, clear and test the N_HIDDEN state of a node. */
+#define node_set_hidden(node)	   node_set_state((node), N_HIDDEN)
+#define node_clear_hidden(node)	   node_clear_state((node), N_HIDDEN)
+#define node_hidden(node)	node_state((node), N_HIDDEN)
+/* NOTE(review): defined elsewhere; presumably hot-adds hidden node nid - confirm. */
+extern int hotadd_hidden_nodes(int nid);
+#endif
+
 #define for_each_node(node)	   for_each_node_state(node, N_POSSIBLE)
 #define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
 

-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* [v2,1/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:38 UTC (permalink / raw)
  To: linux-hotplug

From: Haicheng Li <haicheng.li@linux.intel.com>
Subject: hotplug emulator: add function to hide memory region via e820 table.

NUMA hotplug emulator needs to hide memory regions at the very
beginning of kernel booting. Then emulator will use these
memory regions to fake offlined numa nodes.

Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
---
 arch/x86/include/asm/e820.h |    1 +
 arch/x86/kernel/e820.c      |   19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 0e22296..027bbb1 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -124,6 +124,7 @@ extern int e820_find_active_region(const struct e820entry *ei,
 extern void e820_register_active_regions(int nid, unsigned long start_pfn,
 					 unsigned long end_pfn);
 extern u64 e820_hole_size(u64 start, u64 end);
+extern u64 e820_hide_mem(u64 mem_size);
 extern void finish_e820_parsing(void);
 extern void e820_reserve_resources(void);
 extern void e820_reserve_resources_late(void);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7bca3c6..1993275 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -971,6 +971,7 @@ static void early_panic(char *msg)
 }
 
 static int userdef __initdata;
+static u64 max_mem_size __initdata = ULLONG_MAX;
 
 /* "mem=nopentium" disables the 4MB page tables. */
 static int __init parse_memopt(char *p)
@@ -989,12 +990,28 @@ static int __init parse_memopt(char *p)
 
 	userdef = 1;
 	mem_size = memparse(p, &p);
-	e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
+	e820_remove_range(mem_size, max_mem_size - mem_size, E820_RAM, 1);
+	max_mem_size = mem_size;
 
 	return 0;
 }
 early_param("mem", parse_memopt);
 
+#ifdef CONFIG_NODE_HOTPLUG_EMU
+/*
+ * Hide the last mem_size bytes of RAM by removing them from the e820 map.
+ *
+ * Returns the start address of the hidden region, or 0 (leaving the map
+ * untouched) when mem_size is not smaller than the current end of RAM.
+ */
+u64 __init e820_hide_mem(u64 mem_size)
+{
+	u64 start, end;
+
+	end = (u64)e820_end_of_ram_pfn() << PAGE_SHIFT;
+
+	/* end - mem_size would wrap around (or hide all RAM): refuse. */
+	if (mem_size >= end)
+		return 0;
+
+	userdef = 1;
+	start = end - mem_size;
+	/* Same bookkeeping as parse_memopt(): drop RAM from start up to the current limit. */
+	e820_remove_range(start, max_mem_size - start, E820_RAM, 1);
+	max_mem_size = start;
+
+	return start;
+}
+#endif
+
 static int __init parse_memmap_opt(char *p)
 {
 	char *oldp;
-- 
1.6.0.rc1


-- 
Thanks & Regards,
Shaohui


^ permalink raw reply related

* [v2, 0/8] NUMA Hotplug emulator
From: Shaohui Zheng @ 2010-11-13  5:37 UTC (permalink / raw)
  To: linux-hotplug

Hi, All

	This patchset introduces a NUMA hotplug emulator for x86. We already sent out
an early version to LKML (http://lwn.net/Articles/387571/). This is the 4th version
internally, and the 2nd time it is sent to LKML.

Compared with the last version, we accepted the comments and feedback from the
mailing list, and did more testing and bug fixing on different hardware. It is a
relatively stable version.

* WHAT IS HOTPLUG EMULATOR 

NUMA hotplug emulator is the collective name for the hotplug emulation; it is
able to emulate NUMA node hotplug in a pure software way. It intends to help
people easily debug and test node/cpu/memory hotplug related stuff on a
machine without NUMA hotplug support, even a UMA machine.

The emulator provides a mechanism to emulate the process of physical cpu/mem
hotadd; it makes it possible for kernel developers to debug CPU and memory
hotplug on machines without NUMA support. It offers an interface for cpu and
memory hotplug testing.

* WHY DO WE USE HOTPLUG EMULATOR

We have been focusing on hotplug emulation for a few months. The emulator helps
the team to reproduce all the major hotplug bugs. It plays an important role in
hotplug code quality assurance. Because of the hotplug emulator, we have already
moved most of the debugging work to a virtual environment.

* Principles & Usages 

The NUMA hotplug emulator includes 3 different parts. We add a menu item to
menuconfig to enable/disable them.

1) Node hotplug emulation:

The emulator firstly hides RAM via E820 table, and then it can
fake offlined nodes with the hidden RAM.

After system bootup, user is able to hotplug-add these offlined
nodes, which is just similar to a real hotplug hardware behavior.

Using boot option "numa=hide=N*size" to fake offlined nodes:
	- N is the number of hidden nodes
	- size is the memory size (in MB) per hidden node.

There is a sysfs entry "probe" under /sys/devices/system/node/ for user
to hotplug the fake offlined nodes:

 - to show all fake offlined nodes:
    $ cat /sys/devices/system/node/probe

 - to hotadd a fake offlined node, e.g. nodeid is N:
    $ echo N > /sys/devices/system/node/probe

2) CPU hotplug emulation:

The emulator reserves CPUs through a grub parameter; the reserved CPUs can be
hot-added/hot-removed by software.

When hotplugging a CPU with the emulator, we use a logical CPU to emulate the CPU
hotplug process. For CPUs that support SMT, some logical CPUs are in the same
socket, but they may be located in different NUMA nodes once the emulator is in
use. We put the logical CPU into a fake CPU socket, and assign it a unique
phys_proc_id. Each fake socket holds only one logical CPU.

 - to hide CPUs
	- Use boot option "maxcpus=N" to hide CPUs;
	  N is the number of CPUs to initialize
	- Use boot option "cpu_hpe=on" to enable cpu hotplug emulation;
	  when cpu_hpe is enabled, the remaining CPUs will not be initialized

 - to hot-add CPU to node
	$ echo nid > cpu/probe

 - to hot-remove CPU
	$ echo nid > cpu/release

3) Memory hotplug emulation:

The emulator reserves memory before OS boot; the reserved memory region
is removed from the e820 table, and it can be hot-added via the probe interface.
This interface was extended to support adding memory to a specified node; it
maintains backwards compatibility.

The difficulty of memory release is well known; we have no plan for it so far.

 - reserve memory through a grub parameter
 	mem=1024m

 - add a memory section to node 3
    $ echo 0x40000000,3 > memory/probe
	OR
    $ echo 1024m,3 > memory/probe

* ACKNOWLEDGMENT 

hotplug emulator includes a team's efforts, thanks all of them.
They are:
Andi Kleen, Haicheng Li, Shaohui Zheng, Fengguang Wu and Yongkang You

-- 
Thanks & Regards,
Shaohui

^ permalink raw reply

* Re: [patch|rfc] add support for I/O scheduler tuning
From: Kay Sievers @ 2010-11-12 14:36 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <x498w119lc5.fsf@segfault.boston.devel.redhat.com>

On Thu, Nov 11, 2010 at 21:07, Jeff Moyer <jmoyer@redhat.com> wrote:
> Jens Axboe <axboe@kernel.dk> writes:
>> On 2010-11-10 21:03, Vivek Goyal wrote:
>>> On Wed, Nov 10, 2010 at 01:26:21PM -0500, David Zeuthen wrote:
>>>> On Wed, Nov 10, 2010 at 11:47 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
>>>>> From within the block layer in the kernel, it is difficult to
>>>>> automatically detect the performance characteristics of the underlying
>>>>> storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
>>>>> rule to tune the I/O scheduler properly for most cases.  The basic
>>>>> approach is to leave CFQ's default tunings alone for SATA disks.  For
>>>>> everything else, turn off slice idling and bump the quantum in order to
>>>>> drive higher queue depths.  This patch is an attempt to implement this.
>>>>>
>>>>> I've tested it in a variety of configurations:
>>>>> - cciss devices
>>>>> - sata disks
>>>>> - sata ssds
>>>>> - enterprise storage (single path)
>>>>> - enterprise storage (multi-path)
>>>>> - multiple paths to a sata disk (yes, you can actually do that!)
>>>>>
>>>>> The tuning works as expected in all of those scenarios.  I look forward
>>>>> to your comments.
>>>>
>>>> This looks useful, but I really think the kernel driver creating the
>>>> block device should choose/change the defaults for the created block
>>>> device - it seems really backwards to do this in user-space as an
>>>> afterthought.
>>>
>>> I think it just becomes little easier to implement in user space so that
>>> if things don't work as expected, somebody can easily disable the rules
>>> or somebody can easily refine the rule further to better suite their
>>> needs instead of driver hardcoding this decision.
>>
>> That's the primary reason why I suggested doing this in user space. Plus
>> we don't always know in the kernel, at least this provides an easier way
>> to auto-tune things.
>
> Right, so given the above, is there still opposition to doing this in
> udev?

Not in general. Udev can do such things, that's what it's there for.
It can do quirks, custom setups, and support tweaked configs that way.

But it's usually not meant to set common defaults for every box. The
last time we got into this business, and set timeouts for scsi devices
from udev, we broke more recent kernels that did not like the
specified values anymore, and we needed to remove all that in released
versions, to be able to safely run newer kernels. And we've been told
not to do such a thing in the future.

And all what your rules are doing is to unconditionally apply
kernel-internal knowledge to kernel devices -- which if you look at it
from one step back -- is a bit weird.

So I guess, this should be done from the multipath package, the dm
setup, some 'tweak.rpm', ...  I'm not sure, if we can do that for
everybody from the main udev sources, for the same reasons the scsi
timeout was wrong to do from udev. The time we added it, it seemed to
be the right thing, but 2 years later it wasn't, because the kernel
evolved, and we got into its way.

Kay

^ permalink raw reply

* Re: [patch|rfc] add support for I/O scheduler tuning
From: Jeff Moyer @ 2010-11-11 20:07 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <x498w119lc5.fsf@segfault.boston.devel.redhat.com>

Jens Axboe <axboe@kernel.dk> writes:

> On 2010-11-10 21:03, Vivek Goyal wrote:
>> On Wed, Nov 10, 2010 at 01:26:21PM -0500, David Zeuthen wrote:
>>> Hi,
>>>
>>> On Wed, Nov 10, 2010 at 11:47 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
>>>> Hi,
>>>>
>>>> From within the block layer in the kernel, it is difficult to
>>>> automatically detect the performance characteristics of the underlying
>>>> storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
>>>> rule to tune the I/O scheduler properly for most cases.  The basic
>>>> approach is to leave CFQ's default tunings alone for SATA disks.  For
>>>> everything else, turn off slice idling and bump the quantum in order to
>>>> drive higher queue depths.  This patch is an attempt to implement this.
>>>>
>>>> I've tested it in a variety of configurations:
>>>> - cciss devices
>>>> - sata disks
>>>> - sata ssds
>>>> - enterprise storage (single path)
>>>> - enterprise storage (multi-path)
>>>> - multiple paths to a sata disk (yes, you can actually do that!)
>>>>
>>>> The tuning works as expected in all of those scenarios.  I look forward
>>>> to your comments.
>>>
>>> This looks useful, but I really think the kernel driver creating the
>>> block device should choose/change the defaults for the created block
>>> device - it seems really backwards to do this in user-space as an
>>> afterthought.
>> 
>> I think it just becomes little easier to implement in user space so that
>> if things don't work as expected, somebody can easily disable the rules
>> or somebody can easily refine the rule further to better suite their
>> needs instead of driver hardcoding this decision.
>
> That's the primary reason why I suggested doing this in user space. Plus
> we don't always know in the kernel, at least this provides an easier way
> to auto-tune things.

Right, so given the above, is there still opposition to doing this in
udev?

Thanks!
Jeff

^ permalink raw reply

* Re: [patch|rfc] add support for I/O scheduler tuning
From: Jens Axboe @ 2010-11-10 20:08 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <x498w119lc5.fsf@segfault.boston.devel.redhat.com>

On 2010-11-10 21:03, Vivek Goyal wrote:
> On Wed, Nov 10, 2010 at 01:26:21PM -0500, David Zeuthen wrote:
>> Hi,
>>
>> On Wed, Nov 10, 2010 at 11:47 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
>>> Hi,
>>>
>>> From within the block layer in the kernel, it is difficult to
>>> automatically detect the performance characteristics of the underlying
>>> storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
>>> rule to tune the I/O scheduler properly for most cases.  The basic
>>> approach is to leave CFQ's default tunings alone for SATA disks.  For
>>> everything else, turn off slice idling and bump the quantum in order to
>>> drive higher queue depths.  This patch is an attempt to implement this.
>>>
>>> I've tested it in a variety of configurations:
>>> - cciss devices
>>> - sata disks
>>> - sata ssds
>>> - enterprise storage (single path)
>>> - enterprise storage (multi-path)
>>> - multiple paths to a sata disk (yes, you can actually do that!)
>>>
>>> The tuning works as expected in all of those scenarios.  I look forward
>>> to your comments.
>>
>> This looks useful, but I really think the kernel driver creating the
>> block device should choose/change the defaults for the created block
>> device - it seems really backwards to do this in user-space as an
>> afterthought.
> 
> I think it just becomes little easier to implement in user space so that
> if things don't work as expected, somebody can easily disable the rules
> or somebody can easily refine the rule further to better suite their
> needs instead of driver hardcoding this decision.

That's the primary reason why I suggested doing this in user space. Plus
we don't always know in the kernel, at least this provides an easier way
to auto-tune things.

-- 
Jens Axboe


^ permalink raw reply

* Re: [patch|rfc] add support for I/O scheduler tuning
From: Vivek Goyal @ 2010-11-10 20:03 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <x498w119lc5.fsf@segfault.boston.devel.redhat.com>

On Wed, Nov 10, 2010 at 01:26:21PM -0500, David Zeuthen wrote:
> Hi,
> 
> On Wed, Nov 10, 2010 at 11:47 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
> > Hi,
> >
> > From within the block layer in the kernel, it is difficult to
> > automatically detect the performance characteristics of the underlying
> > storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
> > rule to tune the I/O scheduler properly for most cases.  The basic
> > approach is to leave CFQ's default tunings alone for SATA disks.  For
> > everything else, turn off slice idling and bump the quantum in order to
> > drive higher queue depths.  This patch is an attempt to implement this.
> >
> > I've tested it in a variety of configurations:
> > - cciss devices
> > - sata disks
> > - sata ssds
> > - enterprise storage (single path)
> > - enterprise storage (multi-path)
> > - multiple paths to a sata disk (yes, you can actually do that!)
> >
> > The tuning works as expected in all of those scenarios.  I look forward
> > to your comments.
> 
> This looks useful, but I really think the kernel driver creating the
> block device should choose/change the defaults for the created block
> device - it seems really backwards to do this in user-space as an
> afterthought.

I think it just becomes little easier to implement in user space so that
if things don't work as expected, somebody can easily disable the rules
or somebody can easily refine the rule further to better suite their
needs instead of driver hardcoding this decision.

Thanks
Vivek

^ permalink raw reply

* Re: [patch|rfc] add support for I/O scheduler tuning
From: David Zeuthen @ 2010-11-10 18:26 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <x498w119lc5.fsf@segfault.boston.devel.redhat.com>

Hi,

On Wed, Nov 10, 2010 at 11:47 AM, Jeff Moyer <jmoyer@redhat.com> wrote:
> Hi,
>
> From within the block layer in the kernel, it is difficult to
> automatically detect the performance characteristics of the underlying
> storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
> rule to tune the I/O scheduler properly for most cases.  The basic
> approach is to leave CFQ's default tunings alone for SATA disks.  For
> everything else, turn off slice idling and bump the quantum in order to
> drive higher queue depths.  This patch is an attempt to implement this.
>
> I've tested it in a variety of configurations:
> - cciss devices
> - sata disks
> - sata ssds
> - enterprise storage (single path)
> - enterprise storage (multi-path)
> - multiple paths to a sata disk (yes, you can actually do that!)
>
> The tuning works as expected in all of those scenarios.  I look forward
> to your comments.

This looks useful, but I really think the kernel driver creating the
block device should choose/change the defaults for the created block
device - it seems really backwards to do this in user-space as an
afterthought.

     David

^ permalink raw reply

* Re: [patch|rfc] add support for I/O scheduler tuning
From: Jeff Moyer @ 2010-11-10 17:03 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <x498w119lc5.fsf@segfault.boston.devel.redhat.com>

Jeff Moyer <jmoyer@redhat.com> writes:

> Hi,
>
> From within the block layer in the kernel, it is difficult to
> automatically detect the performance characteristics of the underlying
> storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
> rule to tune the I/O scheduler properly for most cases.  The basic
> approach is to leave CFQ's default tunings alone for SATA disks.  For
> everything else, turn off slice idling and bump the quantum in order to
> drive higher queue depths.  This patch is an attempt to implement this.
>
> I've tested it in a variety of configurations:
> - cciss devices
> - sata disks
> - sata ssds
> - enterprise storage (single path)
> - enterprise storage (multi-path)
> - multiple paths to a sata disk (yes, you can actually do that!)
>
> The tuning works as expected in all of those scenarios.  I look forward
> to your comments.
>

I forgot to mention that Harald Hoyer provided a great deal of help in
getting me up to speed on udev, so thanks are indeed due to him.

Cheers,
Jeff

^ permalink raw reply

* [patch|rfc] add support for I/O scheduler tuning
From: Jeff Moyer @ 2010-11-10 16:47 UTC (permalink / raw)
  To: linux-hotplug

Hi,

From within the block layer in the kernel, it is difficult to
automatically detect the performance characteristics of the underlying
storage.  It was suggested by Jens Axboe at LSF2010 that we write a udev
rule to tune the I/O scheduler properly for most cases.  The basic
approach is to leave CFQ's default tunings alone for SATA disks.  For
everything else, turn off slice idling and bump the quantum in order to
drive higher queue depths.  This patch is an attempt to implement this.

I've tested it in a variety of configurations:
- cciss devices
- sata disks
- sata ssds
- enterprise storage (single path)
- enterprise storage (multi-path)
- multiple paths to a sata disk (yes, you can actually do that!)

The tuning works as expected in all of those scenarios.  I look forward
to your comments.

Thanks in advance!

-Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>

diff --git a/Makefile.am b/Makefile.am
index 032eb28..673c371 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -622,6 +622,16 @@ keymaps-distcheck-hook: extras/keymap/keys.txt
 	$(top_srcdir)/extras/keymap/check-keymaps.sh $(top_srcdir) $^
 DISTCHECK_HOOKS += keymaps-distcheck-hook
 
+# ------------------------------------------------------------------------------
+# iosched - optimize I/O scheduler tunings
+# ------------------------------------------------------------------------------
+EXTRA_DIST += extras/iosched/80-iosched.rules \
+	extras/iosched/80-mpath-iosched.rules extras/iosched/mpath-iosched.sh
+dist_udevrules_DATA += extras/iosched/80-iosched.rules \
+	extras/iosched/80-mpath-iosched.rules
+dist_libexec_SCRIPTS += extras/iosched/mpath-iosched.sh
+
+
 endif # ENABLE_EXTRAS
 
 # ------------------------------------------------------------------------------
diff --git a/extras/iosched/80-iosched.rules b/extras/iosched/80-iosched.rules
new file mode 100644
index 0000000..163f240
--- /dev/null
+++ b/extras/iosched/80-iosched.rules
@@ -0,0 +1,14 @@
+#
+# CFQ's default tunings are geared towards slow SATA disks.  If we detect
+# anything else, we change the tunings to drive deeper queue depths and
+# keep the device busy.
+#
+SUBSYSTEM!="block", GOTO="end_iosched"
+KERNEL=="dm-*", GOTO="end_iosched"
+ENV{DEVTYPE}=="partition", GOTO="end_iosched"
+ACTION!="add|change", GOTO="end_iosched"
+ENV{ID_BUS}=="ata", GOTO="end_iosched"
+ATTR{queue/scheduler}!="*\[cfq\]", GOTO="end_iosched"
+ATTR{queue/iosched/slice_idle}="0"
+ATTR{queue/iosched/quantum}="32"
+LABEL="end_iosched"
diff --git a/extras/iosched/80-mpath-iosched.rules b/extras/iosched/80-mpath-iosched.rules
new file mode 100644
index 0000000..ece9e78
--- /dev/null
+++ b/extras/iosched/80-mpath-iosched.rules
@@ -0,0 +1,9 @@
+SUBSYSTEM!="block", GOTO="end_mpath_iosched"
+ENV{DEVTYPE}=="partition", GOTO="end_mpath_iosched"
+KERNEL!="dm-*", GOTO="end_mpath_iosched"
+ACTION!="change", GOTO="end_mpath_iosched"
+ATTR{queue/scheduler}!="*\[cfq\]", GOTO="end_mpath_iosched"
+ENV{DM_UUID}!="mpath-?*", GOTO="end_mpath_iosched"
+ENV{DM_ACTION}=="PATH_FAILED", GOTO="end_mpath_iosched"
+RUN+="mpath-iosched.sh"
+LABEL="end_mpath_iosched"
diff --git a/extras/iosched/mpath-iosched.sh b/extras/iosched/mpath-iosched.sh
new file mode 100755
index 0000000..51fb292
--- /dev/null
+++ b/extras/iosched/mpath-iosched.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+#
+# For the request-based multipath driver, the I/O scheduler runs on the
+# multipath device, not the underlying "slave" devices.  This script
+# checks the ID_BUS property of each of the slave devices.  If it finds
+# an ata device, it leaves the I/O scheduler tunings alone.  For any other
+# device, we tune the I/O scheduler to try to keep the device busy.
+#
+PATH=/sbin:$PATH
+
+needs_tuning=1
+for slave in "/sys${DEVPATH}"/slaves/*; do
+	bus_type=$(udevadm info --query=property --path="$slave" | awk -F= '$1 == "ID_BUS" {print $2}')
+	if [ "$bus_type" = "ata" ]; then
+		needs_tuning=0
+		break
+	fi
+done
+
+if [ "$needs_tuning" -eq 1 ]; then
+	echo 0 > "/sys${DEVPATH}/queue/iosched/slice_idle"
+	echo 32 > "/sys${DEVPATH}/queue/iosched/quantum"
+fi
+
+exit 0

^ permalink raw reply related

* Re: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Harald Hoyer @ 2010-11-10 16:37 UTC (permalink / raw)
  To: Narendra_K
  Cc: linux-hotplug, netdev, Matt_Domsch, Jordan_Hargrave, Charles_Rose
In-Reply-To: <4CDAC930.4010801@redhat.com>

On 11/10/2010 05:32 PM, Harald Hoyer wrote:
> On 11/03/2010 05:55 PM, Narendra_K@Dell.com wrote:
>> Hello,
>>
>> This patch allows users to specify if they want the onboard network
>> interfaces to be renamed to lomN by implementing a command line param
>> 'udevlom'.
>>
>> From: Narendra K<narendra_k@dell.com>
>> Subject: [PATCH] UDEV - Add 'udevlom' command line param to start_udev
>>
>> This patch implements 'udevlom' command line parameter, which
>> when passed, results in onboard network interfaces getting
>> renamed to lomN.
>>
>> Signed-off-by: Narendra K<narendra_k@dell.com>
>> ---
>> start_udev | 5 +++++
>> 1 files changed, 5 insertions(+), 0 deletions(-)
>>
>> diff --git a/start_udev b/start_udev
>> index 49fc286..57d60c9 100755
>> --- a/start_udev
>> +++ b/start_udev
>> @@ -32,6 +32,7 @@ export TZ=/etc/localtime
>> . /etc/init.d/functions
>>
>> prog=udev
>> +cmdline=`cat /proc/cmdline`
>>
>> touch_recursive() {
>> ( cd $1;
>> @@ -60,6 +61,10 @@ fi
>>
>> ret=$[$ret + $?]
>>
>> +if strstr "$cmdline" udevlom; then
>> + /sbin/udevadm control --env=UDEVLOM="y"
>> +fi
>> +
>> /sbin/udevadm trigger --type=subsystems --action=add
>> /sbin/udevadm trigger --type=devices --action=add
>> /sbin/udevadm settle
>
> start_udev is obsolete with the use of systemd service files anyway in Fedora>=15

not saying that we really should use "udevlom" on the kernel command line, but 
you could use:

IMPORT{cmdline}="udevlom"
KERNEL=="eth*", ENV{udevlom}==1, ....

^ permalink raw reply

* Re: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Harald Hoyer @ 2010-11-10 16:32 UTC (permalink / raw)
  To: Narendra_K
  Cc: linux-hotplug, netdev, Matt_Domsch, Jordan_Hargrave, Charles_Rose
In-Reply-To: <20101103165505.GA3281@fedora-14-r710.oslab.blr.amer.dell.com>

On 11/03/2010 05:55 PM, Narendra_K@Dell.com wrote:
> Hello,
>
> This patch allows users to specify if they want the onboard network
> interfaces to be renamed to lomN by implementing a command line param
> 'udevlom'.
>
> From: Narendra K<narendra_k@dell.com>
> Subject: [PATCH] UDEV - Add 'udevlom' command line param to start_udev
>
> This patch implements 'udevlom' command line parameter, which
> when passed, results in onboard network interfaces getting
> renamed to lomN.
>
> Signed-off-by: Narendra K<narendra_k@dell.com>
> ---
>   start_udev |    5 +++++
>   1 files changed, 5 insertions(+), 0 deletions(-)
>
> diff --git a/start_udev b/start_udev
> index 49fc286..57d60c9 100755
> --- a/start_udev
> +++ b/start_udev
> @@ -32,6 +32,7 @@ export TZ=/etc/localtime
>   . /etc/init.d/functions
>
>   prog=udev
> +cmdline=`cat /proc/cmdline`
>
>   touch_recursive() {
>   	( cd $1;
> @@ -60,6 +61,10 @@ fi
>
>   ret=$[$ret + $?]
>
> +if strstr "$cmdline" udevlom; then
> +	/sbin/udevadm control --env=UDEVLOM="y"
> +fi
> +
>   /sbin/udevadm trigger --type=subsystems --action=add
>   /sbin/udevadm trigger --type=devices --action=add
>   /sbin/udevadm settle

start_udev is obsolete with the use of systemd service files anyway in Fedora>=15

^ permalink raw reply

* pciehp: Setting maxpayload after hot_plug ?
From: Xavier Bru @ 2010-11-09 17:10 UTC (permalink / raw)
  To: linux-hotplug

Hello,

I have a concern around hot-plugging a pci adapter that sits behind a 
pci switch port, using the pciehp driver:
     . When I boot with the adapter plugged, the maxpayload is set to
256 (as the pci switch upstream port is connected to a port that has the
capability maxpayload=256).
     . In case  I hot-replace this adapter, after hot-plugging the 
adapter, the maxpayload value is set to the default value (128).
     . In case there was no adapter plugged at boot time, the
corresponding pci port has a default maxpayload=128, and after
hot-plugging the board, the pci port and the board stay with maxpayload=128.

So, here is my question: should the pciehp driver be aware of the
maxpayload, and re-initialize it in the pci bus hierarchy and on the
board (as it is done, I suppose, by BIOS initialization when the board is
present)? Else, who is in charge of reinitializing the maxpayload to the
right value?

Thanks in advance for your help.

Xavier



^ permalink raw reply

* Re: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Matt Domsch @ 2010-11-08 18:17 UTC (permalink / raw)
  To: Sujit K M
  Cc: Greg KH, K, Narendra, linux-hotplug@vger.kernel.org,
	netdev@vger.kernel.org, Hargrave, Jordan, Rose, Charles
In-Reply-To: <AANLkTin_rswTZn5RxH375u4xsoftQzMSPMRfUvy77fdX@mail.gmail.com>

On Mon, Nov 08, 2010 at 02:12:56PM +0530, Sujit K M wrote:
> > At Linux Plumbers Conference today, this problem space was discussed
> > once again, and I believe concensus on approach was reached.  Here
> > goes:
> 
> Was the patch a starting point for the discussion.

The discussion has been ongoing for 3 years.  This patch was posted
just prior to the conversation at LPC, where aspects of this patch
(reading values from sysfs and using them, if present) were discussed
at length.  The patch itself will have to undergo some changes based
on the outcome of that discussion.

 
> > * If a 70-persistent-net.rules file sets a name, honor that.  This
> >  preserves existing installs.
> >
> > * If BIOS provides indexes for onboard devices, honor that.
> > ** Rename onboard NICs "lom[1-N]" as BIOS reports (# matches chassis labels)
> > ** No rename for all others "ethX" (no change for NICs in PCI slots/USB/others)
> >
> > * If neither are true, do not rename at all.
> 
> I would like to know what is the difference in the nomenclature for this.

LOM = "LAN on Motherboard", aka "Embedded NIC".  I'm not wedded to
using "lomX", but it can't be ethX, and it must be short, and that's
as good as anything.

 
> > * Implementation will be:
> > ** Udev rules to be included in upstream udev will read the index
> >   value from sysfs (provided by SMBIOS 2.6 info on kernels >= 2.6.36,
> >   PCI DSM info at some future point) if present, and rename LOMs
> >   based on that index value.  Distros will use these rules by default
> >   (Ubuntu and Fedora maintainers on board with the concept; I have
> >   not spoken with other distros yet.)
> > ** Legacy distros with older udev rules will invoke biosdevname on
> >   kernels < 2.6.36 to get the same information, if present, and will
> >   rename LOMs based on index value.
> 
> How will you manage these scenarios.

I've had conversations with the relevant maintainers of the subsystems
of each of the major distributions.  What else are you looking for please?

Thanks,
Matt

-- 
Matt Domsch
Technology Strategist
Dell | Office of the CTO

^ permalink raw reply

* Re: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Sujit K M @ 2010-11-08  8:54 UTC (permalink / raw)
  To: Matt Domsch
  Cc: Greg KH, K, Narendra, linux-hotplug@vger.kernel.org,
	netdev@vger.kernel.org, Hargrave, Jordan, Rose, Charles
In-Reply-To: <20101105025848.GA14021@pws490.domsch.com>

> At Linux Plumbers Conference today, this problem space was discussed
> once again, and I believe consensus on approach was reached.  Here
> goes:

Was the patch a starting point for the discussion.

> * If a 70-persistent-net.rules file sets a name, honor that.  This
>  preserves existing installs.
>
> * If BIOS provides indexes for onboard devices, honor that.
> ** Rename onboard NICs "lom[1-N]" as BIOS reports (# matches chassis labels)
> ** No rename for all others "ethX" (no change for NICs in PCI slots/USB/others)
>
> * If neither are true, do not rename at all.

I would like to know what is the difference in the nomenclature for this.

>
> * Implementation will be:
> ** Udev rules to be included in upstream udev will read the index
>   value from sysfs (provided by SMBIOS 2.6 info on kernels >= 2.6.36,
>   PCI DSM info at some future point) if present, and rename LOMs
>   based on that index value.  Distros will use these rules by default
>   (Ubuntu and Fedora maintainers on board with the concept; I have
>   not spoken with other distros yet.)
> ** Legacy distros with older udev rules will invoke biosdevname on
>   kernels < 2.6.36 to get the same information, if present, and will
>   rename LOMs based on index value.

How will you manage these scenarios.

^ permalink raw reply

* udev(7) Crashes Palm Z22 on Ubuntu, but NOT on Debian
From: Hendrickson, Kenneth @ 2010-11-07  2:59 UTC (permalink / raw)
  To: linux-hotplug

EXECUTIVE SUMMARY

udev(7) Crashes Palm Z22 on Ubuntu, but NOT on Debian.

SYMPTOMS

On Ubuntu, (but not on Debian), very shortly after I plug in the USB cable to the Palm Z22, the Z22 dies.  It must be reset (rebooted).  This happens whether or not I have time to tap the HotSync icon, and also whether or not I have time to execute the pilot-xfer(1) command to start the hot sync process.  (This makes me think the problem might be udev(7).)

So far, no data has been lost, but it is very frustrating.

----------8<------------------------------------------------------------
Data from Ubuntu box:

kjh@Lemur:~$ uname -a
Linux Lemur 2.6.32-25-generic #45-Ubuntu SMP Sat Oct 16 19:52:42 UTC 2010 x86_64 GNU/Linux

kjh@Lemur:~$ pilot-xfer --version
   DEPRECATED: The application is calling print_splash()
   .--------------------------------------------.
   | (c) Copyright 1996-2006, pilot-link team   |
   |   Join the pilot-link lists to help out.   |
   `--------------------------------------------'
   This is pilot-xfer, from pilot-link version 0.12.4

   Build target..: x86_64-unknown-linux-gnu
   Build date....: Jan  6 2010 08:12:01

   pilot-link 0.12.4 is covered under the GPL/LGPL
   See the file COPYING under docs for more info.

   Please use --help for more detailed options.

kjh@Lemur:~$ aptitude show udev
Package: udev
State: installed
Automatically installed: no
Version: 151-12.1
Priority: required
Section: admin
Maintainer: Scott James Remnant <scott@ubuntu.com>
Uncompressed Size: 1626k
Depends: libacl1 (>= 2.2.11-1), libc6 (>= 2.9), libglib2.0-0 (>= 2.16.0),
         libselinux1 (>= 1.32), libusb-0.1-4 (>= 2:0.1.12), upstart-job,
         module-init-tools (>= 3.2.1-0ubuntu3), initramfs-tools (>         0.92bubuntu63), procps, adduser, util-linux (> 2.15~rc2)
Suggests: watershed
Conflicts: hotplug, ifrename, libdevmapper1.02 (< 2:1.02.08-1ubuntu7),
           udev-extras (<= 20090618)
Breaks: casper (< 1.174), consolekit (<= 0.4.1), dmsetup (<        2:1.02.27-4ubuntu5), initramfs-tools (< 0.92bubuntu30), lvm2 (<        2.02.39-0ubuntu9), mdadm (<= 2.6.7.1-1ubuntu8)
Replaces: hotplug, ifrename, initramfs-tools (< 0.040ubuntu1), udev-extras (<          20090618)

Description: rule-based device node and kernel event manager
 udev is a collection of tools and a daemon to manage events received from the
 kernel and deal with them in user-space.  Primarily this involves creating and
 removing device nodes in /dev when hardware is discovered or removed from the
 system.

 Events are received via kernel netlink messaged and processed according to
 rules in /etc/udev/rules.d and /lib/udev/rules.d, altering the name of the
 device node, creating additional symlinks or calling other tools and programs
 including those to load kernel modules and initialise the device.

----------8<------------------------------------------------------------
Data from Debian box:

108 Tuxedo$ uname -a
Linux Tuxedo 2.6.26-2-686 #1 SMP Sat Dec 26 09:01:51 UTC 2009 i686 GNU/Linux

109 Tuxedo$ pilot-xfer --version
   DEPRECATED: The application is calling print_splash()
   .--------------------------------------------.
   | (c) Copyright 1996-2006, pilot-link team   |
   |   Join the pilot-link lists to help out.   |
   `--------------------------------------------'
   This is pilot-xfer, from pilot-link version 0.12.3

   Build target..: i686-pc-linux-gnu
   Build date....: Jun 15 2008 17:26:02

   pilot-link 0.12.3 is covered under the GPL/LGPL
   See the file COPYING under docs for more info.

   Please use --help for more detailed options.

111 Tuxedo$ aptitude show udev
Package: udev
State: installed
Automatically installed: no
Version: 0.125-7+lenny3
Priority: optional
Section: admin
Maintainer: Marco d'Itri <md@linux.it>
Uncompressed Size: 827k

Depends: libc6 (>= 2.7-1), libselinux1 (>= 2.0.59), libvolume-id0 (>= 0.113-1~),
         lsb-base (>= 3.0-6)
PreDepends: debconf (>= 1.4.69) | debconf-2.0
Conflicts: hal (< 0.5.6-2), hotplug, initramfs-tools (< 0.39), initscripts (<
           2.85-16), klibc-utils (<= 1.4.19-1), lvm-common (< 1.5.13), makedev
           (< 2.3.1-80), module-init-tools (< 3.2.2-1), multipath-tools (<
           0.4.7-2)
Replaces: initramfs-tools (<= 0.41)
Description: /dev/ and hotplug management daemon
 udev is a daemon which dynamically creates and removes device nodes from /dev/,
 handles hotplug events and loads drivers at boot time. It replaces the hotplug
 package and requires a 2.6.18 or newer kernel version.

----------8<------------------------------------------------------------
I am using the following pilot-xfer(1) command:

pilot-xfer -p /dev/pilot -b ~/.jpilot/backup$(date +%Y-%m-%d)

----------8<------------------------------------------------------------

I am not on some of these mailing lists, so please include me directly in any replies.  I am open to suggestions.

Thank you,
Ken Hendrickson

^ permalink raw reply

* Re: keyboard udev rules
From: Martin Pitt @ 2010-11-06 20:06 UTC (permalink / raw)
  To: linux-hotplug

Hello Daniel,

MONDON Daniel [2010-10-29 16:35 +0200]:
> ACTION="add", NAME="%keyboard", OPTION+="ignore_device last_rule"

These options were removed long ago, since they are just plainly
wrong. udev isn't creating devices, it just gives you information and
notifications. Thus udev is also the wrong place to try and disable
them.

Better options are:
 - Disconnect the keyboard :)
 - Use your xorg.conf to disable keyboard
 - Build a kernel without any keyboard drivers, or rmmod them during
   boot (the latter will probably not work with usbhid, though)

Martin

-- 
Martin Pitt                        | http://www.piware.de
Ubuntu Developer (www.ubuntu.com)  | Debian Developer  (www.debian.org)

^ permalink raw reply

* Re: Touchpad toggle mess
From: Bastien Nocera @ 2010-11-06 17:17 UTC (permalink / raw)
  To: Martin Pitt
  Cc: linux-input, linux-hotplug, xorg, Peter Hutterer, Matthew Garrett
In-Reply-To: <20101106163339.GE2215@piware.de>

On Sat, 2010-11-06 at 12:33 -0400, Martin Pitt wrote:
> Hello Bastien,
> 
> Bastien Nocera [2010-11-06 15:58 +0000]:
> > No, for about a year:
> > http://cgit.freedesktop.org/xkeyboard-config/commit/?id=1d05eda8dfc706d6450cab5883120e0d5e1100c0
> 
> Ah, right, I missed that it didn't actually work all the way up to the
> session, only at the evdev level.
> 
> > You don't need to update gnome-settings-daemon. gnome-settings-daemon
> > doesn't see F21, F22 or F23, it'll see XF86TouchpadOn, etc.
> 
> Ah, ok.
> 
> So, that makes a lot of sense then, I committed your patch with a
> little more verbiage in the changelog. It would be nice if we could
> get the X.org counterpart in as well, so that the udev keymaps and
> X.org don't run out of sync.

We actually need 2 patches into X, one for adding the keysyms, one for
setting the keysyms to the Function keys.


^ permalink raw reply

* Re: Touchpad toggle mess
From: Martin Pitt @ 2010-11-06 16:33 UTC (permalink / raw)
  To: Bastien Nocera
  Cc: linux-input, linux-hotplug, xorg, Peter Hutterer, Matthew Garrett
In-Reply-To: <1289059101.5418.93.camel@novo.hadess.net>

[-- Attachment #1: Type: text/plain, Size: 840 bytes --]

Hello Bastien,

Bastien Nocera [2010-11-06 15:58 +0000]:
> No, for about a year:
> http://cgit.freedesktop.org/xkeyboard-config/commit/?id=1d05eda8dfc706d6450cab5883120e0d5e1100c0

Ah, right, I missed that it didn't actually work all the way up to the
session, only at the evdev level.

> You don't need to update gnome-settings-daemon. gnome-settings-daemon
> doesn't see F21, F22 or F23, it'll see XF86TouchpadOn, etc.

Ah, ok.

So, that makes a lot of sense then, I committed your patch with a
little more verbiage in the changelog. It would be nice if we could
get the X.org counterpart in as well, so that the udev keymaps and
X.org don't run out of sync.

Thank you!

Martin
-- 
Martin Pitt                        | http://www.piware.de
Ubuntu Developer (www.ubuntu.com)  | Debian Developer  (www.debian.org)

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: Fix touchpad toggle on HP laptops breaking the keyboard
From: Bastien Nocera @ 2010-11-06 16:00 UTC (permalink / raw)
  To: linux-hotplug
In-Reply-To: <1288633460.2771.24.camel@novo.hadess.net>

On Sat, 2010-11-06 at 09:17 -0400, Martin Pitt wrote:
> Hello Bastien,
> 
> Bastien Nocera [2010-11-01 17:44 +0000]:
> > Subject: [PATCH] keymap: Add force release for HP touchpad off
> 
> Thanks! Committed with adding the new file to Makefile.am.

Oops, missed that. Thanks!


^ permalink raw reply

* Re: Touchpad toggle mess
From: Bastien Nocera @ 2010-11-06 15:58 UTC (permalink / raw)
  To: Martin Pitt
  Cc: linux-input, linux-hotplug, xorg, Peter Hutterer, Matthew Garrett
In-Reply-To: <20101106153031.GA2215@piware.de>

On Sat, 2010-11-06 at 11:30 -0400, Martin Pitt wrote:
> Hello Bastien,
> 
> Bastien Nocera [2010-11-05  0:00 +0000]:
> > The patches in https://bugs.freedesktop.org/show_bug.cgi?id=31300
> > implement this for the kernel, and for X.org.
> > 
> > Then we have the problem that udev's keymaps seem to use different
> > function keys depending on the hardware [1], when X.org (because of the
> > limitations of XKB) standardised on F22 for "XF86TouchpadToggle".
> 
> Thanks for bringing this up, indeed there's some cleanup in order
> here.
> 
> > So we'll need to standardise on the keys used. I selected F21 for
> > XF86TouchpadToggle, F22 for XF86TouchpadOn and F23 for XF86TouchpadOff.
> > See the patch in:
> > https://bugs.freedesktop.org/show_bug.cgi?id=31333
> > 
> > The remaining fixes would need to be in udev's keymaps, to set hardware
> > handled keys to F22 and F23, and software ones to F21. Patch is attached
> > for that.
> 
> Indeed the maps which don't currently use f22 should be fixed,
> regardless of what happens in X/kernel/userspace. My main concern is
> that F22 has acted as touchpad toggle key for years

No, for about a year:
http://cgit.freedesktop.org/xkeyboard-config/commit/?id=1d05eda8dfc706d6450cab5883120e0d5e1100c0

>  (many of those
> were imported from hal-info originally), so if we change everything to
> f21 now without updating userspace (things like gnome-settings-daemon)
> in lockstep, then we break existing functionality. 

You don't need to update gnome-settings-daemon. gnome-settings-daemon
doesn't see F21, F22 or F23, it'll see XF86TouchpadOn, etc.

> It would look slightly weird, but perhaps it might be better to keep
> using F22 for toggling, and F1->on, F23->off?

That sounds like a bit of a mess having those 2 keys not next to each other.
F22 for toggling, F23 for on, F24 for off would do me fine, if that's
really required.

FWIW, the keys mapped in udev are the only ones that mapped F22. There's
nothing in the kernel using F22 for touchpad toggles of any kind.

Cheers


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox