[patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-04-30 17:14 [patch 0/2] x2apic optimization, round 4 Cyrill Gorcunov
@ 2011-04-30 17:15 ` Cyrill Gorcunov
  2011-05-01 17:18   ` Ingo Molnar
  0 siblings, 1 reply; 13+ messages in thread
From: Cyrill Gorcunov @ 2011-04-30 17:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: H. Peter Anvin, Thomas Gleixner, Suresh Siddha, LKML,
	Cyrill Gorcunov

[-- Attachment #1: x86-x2apic-optimise-cluster-mode-v4 --]
[-- Type: text/plain, Size: 9461 bytes --]

In the case of x2apic cluster mode we can group
IPI register writes based on the cluster group
instead of individual per-cpu destination messages.

This reduces the apic register writes and reduces
the amount of IPI messages (in the best case we can
reduce it by a factor of 16).

With this change, a microbenchmark measuring the cost
of flush_tlb_others(), with the flush tlb IPI being
sent from a cpu in the socket-1 to all the logical
cpus in socket-2 (on a Westmere-EX system that has
20 logical cpus in a socket) is 3x better now
(compared to the former 'send one-by-one' algorithm).

v3: Address Ingo's concerns on code style; also a note
    added so we do not forget that we need to merge
    probe_64/32 into some common structure.

v4: Suresh discovered (and fixed) that cluster
    information must be updated at CPU_UP_PREPARE
    state, otherwise if an IPI happens too early we
    will be in trouble having an incomplete cluster
    sibling map.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/include/asm/apic.h           |    2 
 arch/x86/kernel/apic/probe_64.c       |   13 ++
 arch/x86/kernel/apic/x2apic_cluster.c |  167 ++++++++++++++++++++++++----------
 3 files changed, 136 insertions(+), 46 deletions(-)

Index: tip-linux-2.6/arch/x86/include/asm/apic.h
===================================================================
--- tip-linux-2.6.orig/arch/x86/include/asm/apic.h
+++ tip-linux-2.6/arch/x86/include/asm/apic.h
@@ -178,6 +178,8 @@ extern int x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
+extern void x2apic_init_cpu_notifier(void);
+
 static inline int x2apic_enabled(void)
 {
 	u64 msr;
Index: tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/probe_64.c
+++ tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
@@ -55,6 +55,15 @@ static int apicid_phys_pkg_id(int initia
 void __init default_setup_apic_routing(void)
 {
 
+	/*
+	 * FIXME:
+	 *
+	 * Cleanup the apic routing selection by having an apic driver specific
+	 * selection routine. Then all we need to do here is iterate through
+	 * them to finalize the apic selection. That would get rid of the
+	 * ifdef mess and most of the code here.
+	 */
+
 	enable_IR_x2apic();
 
 #ifdef CONFIG_X86_X2APIC
@@ -71,7 +80,9 @@ void __init default_setup_apic_routing(v
 #endif
 
 	if (apic == &apic_flat && num_possible_cpus() > 8)
-			apic = &apic_physflat;
+		apic = &apic_physflat;
+	else if (apic == &apic_x2apic_cluster)
+		x2apic_init_cpu_notifier();
 
 	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 
Index: tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/x2apic_cluster.c
+++ tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,12 +5,15 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/dmar.h>
+#include <linux/cpu.h>
 
 #include <asm/smp.h>
 #include <asm/apic.h>
 #include <asm/ipi.h>
 
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -36,7 +39,7 @@ static void x2apic_vector_allocation_dom
 }
 
 static void
- __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
 {
 	unsigned long cfg;
 
@@ -48,70 +51,80 @@ static void
 	native_x2apic_icr_write(cfg, apicid);
 }
 
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static inline u32 x2apic_cluster(int cpu)
 {
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		__x2apic_send_IPI_dest(
-			per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-			vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+	return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
 }
 
 static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int exclude_self)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
+	struct cpumask *cpus_in_cluster_ptr;
+	struct cpumask *ipi_mask_ptr;
+	unsigned int cpu, this_cpu;
 	unsigned long flags;
+	u32 dest;
 
 	x2apic_wrmsr_fence();
 
 	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		if (query_cpu == this_cpu)
+	this_cpu = smp_processor_id();
+
+	/*
+	 * We are to modify mask, so we need an own copy
+	 * and be sure it's manipulated with irq off.
+	 */
+	ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+	cpumask_copy(ipi_mask_ptr, mask);
+
+	/*
+	 * The idea is to send one IPI per cluster.
+	 */
+	for_each_cpu(cpu, ipi_mask_ptr) {
+		unsigned long i;
+
+		cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+		dest = 0;
+
+		/* Collect cpus in cluster. */
+		for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
+			if (!exclude_self || i != this_cpu)
+				dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+		}
+
+		if (!dest)
 			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
+
+		__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+		/*
+		 * Cluster sibling cpus should be discarded now so
+		 * we do not send them the IPI a second time.
+		 */
+		cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
 	}
+
 	local_irq_restore(flags);
 }
 
-static void x2apic_send_IPI_allbutself(int vector)
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
+	__x2apic_send_IPI_mask(mask, vector, 1);
+}
 
-	x2apic_wrmsr_fence();
+static void x2apic_send_IPI_allbutself(int vector)
+{
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 1);
+}
 
-	local_irq_save(flags);
-	for_each_online_cpu(query_cpu) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+	__x2apic_send_IPI_mask(mask, vector, 0);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-	x2apic_send_IPI_mask(cpu_online_mask, vector);
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 0);
 }
 
 static int x2apic_apic_id_registered(void)
@@ -151,6 +164,7 @@ x2apic_cpu_mask_to_apicid_and(const stru
 	return per_cpu(x86_cpu_to_logical_apicid, cpu);
 }
 
+
 static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
 {
 	unsigned int id;
@@ -179,13 +193,76 @@ static void x2apic_send_IPI_self(int vec
 
 static void init_x2apic_ldr(void)
 {
+	unsigned int this_cpu = smp_processor_id();
+	unsigned int cpu;
+
+	per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+	__cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+	for_each_online_cpu(cpu) {
+		if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+			continue;
+		__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+		__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+	}
+}
+
+/*
+ * At CPU state changes, update the x2apic cluster sibling info.
+ */
+static int __cpuinit
+update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	unsigned int this_cpu = (unsigned long)hcpu;
+	unsigned int cpu;
+	int err = 0;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
+					GFP_KERNEL)) {
+			err = -ENOMEM;
+		} else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
+					       GFP_KERNEL)) {
+			free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+			err = -ENOMEM;
+		}
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+		for_each_online_cpu(cpu) {
+			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+				continue;
+			__cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		}
+		free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+		free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+		break;
+	}
+
+	return notifier_from_errno(err);
+}
+
+static struct notifier_block __refdata x2apic_cpu_notifier = {
+	.notifier_call = update_clusterinfo,
+};
+
+void x2apic_init_cpu_notifier(void)
+{
 	int cpu = smp_processor_id();
 
-	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+
+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+
+	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+	register_hotcpu_notifier(&x2apic_cpu_notifier);
 }
 
 struct apic apic_x2apic_cluster = {
-
 	.name				= "cluster x2apic",
 	.probe				= NULL,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-04-30 17:15 ` [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4 Cyrill Gorcunov
@ 2011-05-01 17:18   ` Ingo Molnar
  0 siblings, 0 replies; 13+ messages in thread
From: Ingo Molnar @ 2011-05-01 17:18 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: H. Peter Anvin, Thomas Gleixner, Suresh Siddha, LKML


* Cyrill Gorcunov <gorcunov@openvz.org> wrote:

> Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>

This SOB section is wrong, the last SOB must be the person sending the patch. 
Please see section "13)" in Documentation/SubmittingPatches.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 0/2] x2apic optimization, v4 log updated
@ 2011-05-02 11:34 Cyrill Gorcunov
  2011-05-02 11:34 ` [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4 Cyrill Gorcunov
  2011-05-02 11:34 ` [patch 2/2] x86, x2apic: Move the common bits of physical and cluster modes to x2apic.h v4 Cyrill Gorcunov
  0 siblings, 2 replies; 13+ messages in thread
From: Cyrill Gorcunov @ 2011-05-02 11:34 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Suresh Siddha, LKML

  Hi Ingo, this series consists of updated commit messages,
so I hope this time I got everything right.

  Please ping me if something still does not conform to the
patch submission guide.

  Suresh, I've added your ack and tested-by for the sake of consistency
with the patch submission guide.

Thanks,
  Cyrill

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-02 11:34 [patch 0/2] x2apic optimization, v4 log updated Cyrill Gorcunov
@ 2011-05-02 11:34 ` Cyrill Gorcunov
  2011-05-02 13:22   ` Ingo Molnar
  2011-05-02 11:34 ` [patch 2/2] x86, x2apic: Move the common bits of physical and cluster modes to x2apic.h v4 Cyrill Gorcunov
  1 sibling, 1 reply; 13+ messages in thread
From: Cyrill Gorcunov @ 2011-05-02 11:34 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Suresh Siddha, LKML, Cyrill Gorcunov

[-- Attachment #1: x86-x2apic-optimise-cluster-mode-v4 --]
[-- Type: text/plain, Size: 9646 bytes --]

In the case of x2apic cluster mode we can group
IPI register writes based on the cluster group
instead of individual per-cpu destiantion messages.

This reduces the apic register writes and reduces
the amount of IPI messages (in the best case we can
reduce it by a factor of 16).

With this change, a microbenchmark measuring the cost
of flush_tlb_others(), with the flush tlb IPI being
sent from a cpu in the socket-1 to all the logical
cpus in socket-2 (on a Westmere-EX system that has
20 logical cpus in a socket) is 3x better now
(compared to the former 'send one-by-one' algorithm).

v2: Suresh fixed cpumask allocation to be dynamic;
    stack allocation is not acceptable for such things
    because the stack can be exhausted.

v3: Address Ingo's concerns on code style; also a note
    added so we do not forget that we need to merge
    probe_64/32 into some common structure.

v4: Suresh discovered (and fixed) that cluster
    information must be updated at CPU_UP_PREPARE
    state, otherwise if an IPI happens too early we
    will be in trouble having an incomplete cluster
    sibling map.

Tested-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/include/asm/apic.h           |    2 
 arch/x86/kernel/apic/probe_64.c       |   13 ++
 arch/x86/kernel/apic/x2apic_cluster.c |  167 ++++++++++++++++++++++++----------
 3 files changed, 136 insertions(+), 46 deletions(-)

Index: tip-linux-2.6/arch/x86/include/asm/apic.h
===================================================================
--- tip-linux-2.6.orig/arch/x86/include/asm/apic.h
+++ tip-linux-2.6/arch/x86/include/asm/apic.h
@@ -178,6 +178,8 @@ extern int x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
+extern void x2apic_init_cpu_notifier(void);
+
 static inline int x2apic_enabled(void)
 {
 	u64 msr;
Index: tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/probe_64.c
+++ tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
@@ -55,6 +55,15 @@ static int apicid_phys_pkg_id(int initia
 void __init default_setup_apic_routing(void)
 {
 
+	/*
+	 * FIXME:
+	 *
+	 * Cleanup the apic routing selection by having an apic driver specific
+	 * selection routine. Then all we need to do here is iterate through
+	 * them to finalize the apic selection. That would get rid of the
+	 * ifdef mess and most of the code here.
+	 */
+
 	enable_IR_x2apic();
 
 #ifdef CONFIG_X86_X2APIC
@@ -71,7 +80,9 @@ void __init default_setup_apic_routing(v
 #endif
 
 	if (apic == &apic_flat && num_possible_cpus() > 8)
-			apic = &apic_physflat;
+		apic = &apic_physflat;
+	else if (apic == &apic_x2apic_cluster)
+		x2apic_init_cpu_notifier();
 
 	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 
Index: tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/x2apic_cluster.c
+++ tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,12 +5,15 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/dmar.h>
+#include <linux/cpu.h>
 
 #include <asm/smp.h>
 #include <asm/apic.h>
 #include <asm/ipi.h>
 
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -36,7 +39,7 @@ static void x2apic_vector_allocation_dom
 }
 
 static void
- __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
 {
 	unsigned long cfg;
 
@@ -48,70 +51,80 @@ static void
 	native_x2apic_icr_write(cfg, apicid);
 }
 
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static inline u32 x2apic_cluster(int cpu)
 {
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		__x2apic_send_IPI_dest(
-			per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-			vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+	return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
 }
 
 static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int exclude_self)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
+	struct cpumask *cpus_in_cluster_ptr;
+	struct cpumask *ipi_mask_ptr;
+	unsigned int cpu, this_cpu;
 	unsigned long flags;
+	u32 dest;
 
 	x2apic_wrmsr_fence();
 
 	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		if (query_cpu == this_cpu)
+	this_cpu = smp_processor_id();
+
+	/*
+	 * We are to modify mask, so we need an own copy
+	 * and be sure it's manipulated with irq off.
+	 */
+	ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+	cpumask_copy(ipi_mask_ptr, mask);
+
+	/*
+	 * The idea is to send one IPI per cluster.
+	 */
+	for_each_cpu(cpu, ipi_mask_ptr) {
+		unsigned long i;
+
+		cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+		dest = 0;
+
+		/* Collect cpus in cluster. */
+		for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
+			if (!exclude_self || i != this_cpu)
+				dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+		}
+
+		if (!dest)
 			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
+
+		__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+		/*
+		 * Cluster sibling cpus should be discarded now so
+		 * we do not send them the IPI a second time.
+		 */
+		cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
 	}
+
 	local_irq_restore(flags);
 }
 
-static void x2apic_send_IPI_allbutself(int vector)
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
+	__x2apic_send_IPI_mask(mask, vector, 1);
+}
 
-	x2apic_wrmsr_fence();
+static void x2apic_send_IPI_allbutself(int vector)
+{
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 1);
+}
 
-	local_irq_save(flags);
-	for_each_online_cpu(query_cpu) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+	__x2apic_send_IPI_mask(mask, vector, 0);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-	x2apic_send_IPI_mask(cpu_online_mask, vector);
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 0);
 }
 
 static int x2apic_apic_id_registered(void)
@@ -151,6 +164,7 @@ x2apic_cpu_mask_to_apicid_and(const stru
 	return per_cpu(x86_cpu_to_logical_apicid, cpu);
 }
 
+
 static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
 {
 	unsigned int id;
@@ -179,13 +193,76 @@ static void x2apic_send_IPI_self(int vec
 
 static void init_x2apic_ldr(void)
 {
+	unsigned int this_cpu = smp_processor_id();
+	unsigned int cpu;
+
+	per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+	__cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+	for_each_online_cpu(cpu) {
+		if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+			continue;
+		__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+		__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+	}
+}
+
+/*
+ * At CPU state changes, update the x2apic cluster sibling info.
+ */
+static int __cpuinit
+update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	unsigned int this_cpu = (unsigned long)hcpu;
+	unsigned int cpu;
+	int err = 0;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
+					GFP_KERNEL)) {
+			err = -ENOMEM;
+		} else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
+					       GFP_KERNEL)) {
+			free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+			err = -ENOMEM;
+		}
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+		for_each_online_cpu(cpu) {
+			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+				continue;
+			__cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		}
+		free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+		free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+		break;
+	}
+
+	return notifier_from_errno(err);
+}
+
+static struct notifier_block __refdata x2apic_cpu_notifier = {
+	.notifier_call = update_clusterinfo,
+};
+
+void x2apic_init_cpu_notifier(void)
+{
 	int cpu = smp_processor_id();
 
-	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+
+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+
+	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+	register_hotcpu_notifier(&x2apic_cpu_notifier);
 }
 
 struct apic apic_x2apic_cluster = {
-
 	.name				= "cluster x2apic",
 	.probe				= NULL,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 2/2] x86, x2apic: Move the common bits of physical and cluster modes to x2apic.h v4
  2011-05-02 11:34 [patch 0/2] x2apic optimization, v4 log updated Cyrill Gorcunov
  2011-05-02 11:34 ` [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4 Cyrill Gorcunov
@ 2011-05-02 11:34 ` Cyrill Gorcunov
  1 sibling, 0 replies; 13+ messages in thread
From: Cyrill Gorcunov @ 2011-05-02 11:34 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Suresh Siddha, LKML, Cyrill Gorcunov

[-- Attachment #1: x86-x2apic-merge-phys-cluster-v4 --]
[-- Type: text/plain, Size: 7794 bytes --]

To eliminate code duplication.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/include/asm/x2apic.h         |   64 ++++++++++++++++++++++++++++++
 arch/x86/kernel/apic/x2apic_cluster.c |   71 +---------------------------------
 arch/x86/kernel/apic/x2apic_phys.c    |   71 +++-------------------------------
 3 files changed, 76 insertions(+), 130 deletions(-)

Index: tip-linux-2.6/arch/x86/include/asm/x2apic.h
===================================================================
--- /dev/null
+++ tip-linux-2.6/arch/x86/include/asm/x2apic.h
@@ -0,0 +1,64 @@
+/*
+ * Common bits for X2APIC cluster/physical modes.
+ */
+
+#ifndef _ASM_X86_X2APIC_H
+#define _ASM_X86_X2APIC_H
+
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <linux/cpumask.h>
+
+/*
+ * Need to use more than cpu 0, because we need more vectors
+ * when MSI-X are used.
+ */
+static const struct cpumask *x2apic_target_cpus(void)
+{
+	return cpu_online_mask;
+}
+
+static int x2apic_apic_id_registered(void)
+{
+	return 1;
+}
+
+/*
+ * For now each logical cpu is in its own vector allocation domain.
+ */
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+	cpumask_clear(retmask);
+	cpumask_set_cpu(cpu, retmask);
+}
+
+static void
+__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+{
+	unsigned long cfg = __prepare_ICR(0, vector, dest);
+	native_x2apic_icr_write(cfg, apicid);
+}
+
+static unsigned int x2apic_get_apic_id(unsigned long id)
+{
+	return id;
+}
+
+static unsigned long x2apic_set_apic_id(unsigned int id)
+{
+	return id;
+}
+
+static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+{
+	return initial_apicid >> index_msb;
+}
+
+static void x2apic_send_IPI_self(int vector)
+{
+	apic_write(APIC_SELF_IPI, vector);
+}
+
+void x2apic_init_cpu_notifier(void);
+
+#endif /* _ASM_X86_X2APIC_H */
Index: tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/x2apic_cluster.c
+++ tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
@@ -8,8 +8,7 @@
 #include <linux/cpu.h>
 
 #include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include <asm/x2apic.h>
 
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
@@ -20,37 +19,6 @@ static int x2apic_acpi_madt_oem_check(ch
 	return x2apic_enabled();
 }
 
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
-{
-	return cpu_online_mask;
-}
-
-/*
- * for now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
-static void
-__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
-{
-	unsigned long cfg;
-
-	cfg = __prepare_ICR(0, vector, dest);
-
-	/*
-	 * send the IPI.
-	 */
-	native_x2apic_icr_write(cfg, apicid);
-}
-
 static inline u32 x2apic_cluster(int cpu)
 {
 	return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
@@ -127,11 +95,6 @@ static void x2apic_send_IPI_all(int vect
 	__x2apic_send_IPI_mask(cpu_online_mask, vector, 0);
 }
 
-static int x2apic_apic_id_registered(void)
-{
-	return 1;
-}
-
 static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
 	/*
@@ -165,32 +128,6 @@ x2apic_cpu_mask_to_apicid_and(const stru
 }
 
 
-static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
-{
-	unsigned int id;
-
-	id = x;
-	return id;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
-	unsigned long x;
-
-	x = id;
-	return x;
-}
-
-static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
-{
-	return initial_apicid >> index_msb;
-}
-
-static void x2apic_send_IPI_self(int vector)
-{
-	apic_write(APIC_SELF_IPI, vector);
-}
-
 static void init_x2apic_ldr(void)
 {
 	unsigned int this_cpu = smp_processor_id();
@@ -288,11 +225,11 @@ struct apic apic_x2apic_cluster = {
 	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
 	.enable_apic_mode		= NULL,
-	.phys_pkg_id			= x2apic_cluster_phys_pkg_id,
+	.phys_pkg_id			= x2apic_phys_pkg_id,
 	.mps_oem_check			= NULL,
 
-	.get_apic_id			= x2apic_cluster_phys_get_apic_id,
-	.set_apic_id			= set_apic_id,
+	.get_apic_id			= x2apic_get_apic_id,
+	.set_apic_id			= x2apic_set_apic_id,
 	.apic_id_mask			= 0xFFFFFFFFu,
 
 	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid,
Index: tip-linux-2.6/arch/x86/kernel/apic/x2apic_phys.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/x2apic_phys.c
+++ tip-linux-2.6/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,8 +7,7 @@
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include <asm/x2apic.h>
 
 int x2apic_phys;
 
@@ -27,34 +26,6 @@ static int x2apic_acpi_madt_oem_check(ch
 		return 0;
 }
 
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
-{
-	return cpu_online_mask;
-}
-
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
-				   unsigned int dest)
-{
-	unsigned long cfg;
-
-	cfg = __prepare_ICR(0, vector, dest);
-
-	/*
-	 * send the IPI.
-	 */
-	native_x2apic_icr_write(cfg, apicid);
-}
-
 static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 {
 	unsigned long query_cpu;
@@ -71,7 +42,7 @@ static void x2apic_send_IPI_mask(const s
 }
 
 static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
 	unsigned long this_cpu = smp_processor_id();
 	unsigned long query_cpu;
@@ -81,10 +52,10 @@ static void
 
 	local_irq_save(flags);
 	for_each_cpu(query_cpu, mask) {
-		if (query_cpu != this_cpu)
-			__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_apicid, query_cpu),
-				vector, APIC_DEST_PHYSICAL);
+		if (query_cpu == this_cpu)
+			continue;
+		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
+				       vector, APIC_DEST_PHYSICAL);
 	}
 	local_irq_restore(flags);
 }
@@ -112,11 +83,6 @@ static void x2apic_send_IPI_all(int vect
 	x2apic_send_IPI_mask(cpu_online_mask, vector);
 }
 
-static int x2apic_apic_id_registered(void)
-{
-	return 1;
-}
-
 static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
 	/*
@@ -149,32 +115,11 @@ x2apic_cpu_mask_to_apicid_and(const stru
 	return per_cpu(x86_cpu_to_apicid, cpu);
 }
 
-static unsigned int x2apic_phys_get_apic_id(unsigned long x)
-{
-	return x;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
-	return id;
-}
-
-static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
-{
-	return initial_apicid >> index_msb;
-}
-
-static void x2apic_send_IPI_self(int vector)
-{
-	apic_write(APIC_SELF_IPI, vector);
-}
-
 static void init_x2apic_ldr(void)
 {
 }
 
 struct apic apic_x2apic_phys = {
-
 	.name				= "physical x2apic",
 	.probe				= NULL,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
@@ -203,8 +148,8 @@ struct apic apic_x2apic_phys = {
 	.phys_pkg_id			= x2apic_phys_pkg_id,
 	.mps_oem_check			= NULL,
 
-	.get_apic_id			= x2apic_phys_get_apic_id,
-	.set_apic_id			= set_apic_id,
+	.get_apic_id			= x2apic_get_apic_id,
+	.set_apic_id			= x2apic_set_apic_id,
 	.apic_id_mask			= 0xFFFFFFFFu,
 
 	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid,


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-02 11:34 ` [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4 Cyrill Gorcunov
@ 2011-05-02 13:22   ` Ingo Molnar
  2011-05-02 14:02     ` Cyrill Gorcunov
  0 siblings, 1 reply; 13+ messages in thread
From: Ingo Molnar @ 2011-05-02 13:22 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: Suresh Siddha, LKML


* Cyrill Gorcunov <gorcunov@openvz.org> wrote:

> In the case of x2apic cluster mode we can group
> IPI register writes based on the cluster group
> instead of individual per-cpu destiantion messages.

typo.

> This reduces the apic register writes and reduces
> the amount of IPI messages (in the best case we can
> reduce it by a factor of 16).
> 
> With this change, microbenchmark measuring the cost
> of flush_tlb_others(), with the flush tlb IPI being
> sent from a cpu in the socket-1 to all the logical
> cpus in socket-2 (on a Westmere-EX system that has
> 20 logical cpus in a socket) is 3x times better now
> (compared to the former 'send one-by-one' algorithm).

What kind of microbenchmark was this, could the actual results and measurement 
methods be shared as well?

> v2: Suresh fixed cpumask allocation to be dynamic,
>     stack allocation is not acceptable for such things
>     because can be exhausted.
> 
> v3: Address Ingo concerns on code style, also a note
>     added just to not forget that we need merge
>     probe_64/32 into some common structure.
> 
> v4: Suresh discovered (and fixed) that cluster
>     infomation must be updated at CPU_UP_PREPARE
>     state otherwise if IPI happens too early we
>     will be in touble having incomplete cluster
>     sibling map.
> 
> Tested-by: Suresh Siddha <suresh.b.siddha@intel.com>
> Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
> Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
> ---
>  arch/x86/include/asm/apic.h           |    2 
>  arch/x86/kernel/apic/probe_64.c       |   13 ++
>  arch/x86/kernel/apic/x2apic_cluster.c |  167 ++++++++++++++++++++++++----------
>  3 files changed, 136 insertions(+), 46 deletions(-)
> 
> Index: tip-linux-2.6/arch/x86/include/asm/apic.h
> ===================================================================
> --- tip-linux-2.6.orig/arch/x86/include/asm/apic.h
> +++ tip-linux-2.6/arch/x86/include/asm/apic.h
> @@ -178,6 +178,8 @@ extern int x2apic_phys;
>  extern void check_x2apic(void);
>  extern void enable_x2apic(void);
>  extern void x2apic_icr_write(u32 low, u32 id);
> +extern void x2apic_init_cpu_notifier(void);
> +
>  static inline int x2apic_enabled(void)
>  {
>  	u64 msr;
> Index: tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
> ===================================================================
> --- tip-linux-2.6.orig/arch/x86/kernel/apic/probe_64.c
> +++ tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
> @@ -55,6 +55,15 @@ static int apicid_phys_pkg_id(int initia
>  void __init default_setup_apic_routing(void)
>  {
>  
> +	/*
> +	 * FIXME:
> +	 *
> +	 * Cleanup the apic routing selection by having an apic driver specific
> +	 * selection routine. Then all we need to do here is iterate through
> +	 * them to finalize the apic selection. That would get rid of the
> +	 * ifdef mess and most of the code here.
> +	 */
> +
>  	enable_IR_x2apic();
>  
>  #ifdef CONFIG_X86_X2APIC
> @@ -71,7 +80,9 @@ void __init default_setup_apic_routing(v
>  #endif
>  
>  	if (apic == &apic_flat && num_possible_cpus() > 8)
> -			apic = &apic_physflat;
> +		apic = &apic_physflat;
> +	else if (apic == &apic_x2apic_cluster)
> +		x2apic_init_cpu_notifier();


Why is there an x2apic specific function in the generic 
default_setup_apic_routing() function?

Instead of that it would be cleaner to extend the apic driver functions with an 
init method, which would be filled in for x2apic and left NULL for the others.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-02 13:22   ` Ingo Molnar
@ 2011-05-02 14:02     ` Cyrill Gorcunov
  2011-05-02 14:23       ` Cyrill Gorcunov
  2011-05-02 18:27       ` Suresh Siddha
  0 siblings, 2 replies; 13+ messages in thread
From: Cyrill Gorcunov @ 2011-05-02 14:02 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Suresh Siddha, LKML

On 05/02/2011 05:22 PM, Ingo Molnar wrote:
> 
> * Cyrill Gorcunov <gorcunov@openvz.org> wrote:
> 
>> In the case of x2apic cluster mode we can group
>> IPI register writes based on the cluster group
>> instead of individual per-cpu destiantion messages.
> 
> typo.
> 

ok, will fix, thanks.

>> This reduces the apic register writes and reduces
>> the amount of IPI messages (in the best case we can
>> reduce it by a factor of 16).
>>
>> With this change, microbenchmark measuring the cost
>> of flush_tlb_others(), with the flush tlb IPI being
>> sent from a cpu in the socket-1 to all the logical
>> cpus in socket-2 (on a Westmere-EX system that has
>> 20 logical cpus in a socket) is 3x times better now
>> (compared to the former 'send one-by-one' algorithm).
> 
> What kind of microbenchmark was this, could the actual results and measurement 
> methods be shared as well?

Suresh, could you please post the microbenchmark?
...
>> Index: tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
>> ===================================================================
>> --- tip-linux-2.6.orig/arch/x86/kernel/apic/probe_64.c
>> +++ tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
>> @@ -55,6 +55,15 @@ static int apicid_phys_pkg_id(int initia
>>  void __init default_setup_apic_routing(void)
>>  {
>>  
>> +	/*
>> +	 * FIXME:
>> +	 *
>> +	 * Cleanup the apic routing selection by having an apic driver specific
>> +	 * selection routine. Then all we need to do here is iterate through
>> +	 * them to finalize the apic selection. That would get rid of the
>> +	 * ifdef mess and most of the code here.
>> +	 */
>> +
>>  	enable_IR_x2apic();
>>  
>>  #ifdef CONFIG_X86_X2APIC
>> @@ -71,7 +80,9 @@ void __init default_setup_apic_routing(v
>>  #endif
>>  
>>  	if (apic == &apic_flat && num_possible_cpus() > 8)
>> -			apic = &apic_physflat;
>> +		apic = &apic_physflat;
>> +	else if (apic == &apic_x2apic_cluster)
>> +		x2apic_init_cpu_notifier();
> 
> 
> Why is there an x2apic specific function in the generic 
> default_setup_apic_routing() function?
> 
> Instead of that it would be cleaner to extend the apic driver functions with an 
> init method, which would be filled in for x2apic and left NULL for the others.
> 
> Thanks,
> 
> 	Ingo

  Ingo, the idea was to merge probe_x.c completely, and put all this not into init()
but rather into apic->probe() or something like that. I don't have a clear picture
in mind yet of what the best way would be, so instead of a hastily designed method I
thought I'd leave it open-coded with a FIXME note.

  So let's wait until Suresh posts the benchmark, and I will make apic->init() meanwhile.

-- 
    Cyrill

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-02 14:02     ` Cyrill Gorcunov
@ 2011-05-02 14:23       ` Cyrill Gorcunov
  2011-05-02 15:05         ` Ingo Molnar
  2011-05-02 18:27       ` Suresh Siddha
  1 sibling, 1 reply; 13+ messages in thread
From: Cyrill Gorcunov @ 2011-05-02 14:23 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Suresh Siddha, LKML

On 05/02/2011 06:02 PM, Cyrill Gorcunov wrote:
...
> 
>   Ingo, the idea was to merge probe_x.c completely, and put all this not into init()
> but rather into apic->probe() or something like that. I don't have a clear picture
> in mind yet what the best way would be, so instead of fast designed method I thought
> to leave it opencoded with fixme note.
> 
>   So lets wait until Suresh post the benchmark and I will make apic->init() meanwhile.
> 

  Ingo, would it be fine to make apic->init() either _before_ this series or on
top of it (because if I introduce it inside this particular patch, the patch would
contain some unrelated code snippets such as .init = NULL for all apic declarations)?

-- 
Thanks,
  Cyrill

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-02 14:23       ` Cyrill Gorcunov
@ 2011-05-02 15:05         ` Ingo Molnar
  2011-05-02 15:16           ` Cyrill Gorcunov
  0 siblings, 1 reply; 13+ messages in thread
From: Ingo Molnar @ 2011-05-02 15:05 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: Suresh Siddha, LKML


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On 05/02/2011 06:02 PM, Cyrill Gorcunov wrote:
> ...
> > 
> >   Ingo, the idea was to merge probe_x.c completely, and put all this not into init()
> > but rather into apic->probe() or something like that. I don't have a clear picture
> > in mind yet what the best way would be, so instead of fast designed method I thought
> > to leave it opencoded with fixme note.
> > 
> >   So lets wait until Suresh post the benchmark and I will make apic->init() meanwhile.
> > 
> 
>   Ingo, would it be fine to make apic->init() either _before_ this series or 
> on top of them (because if I introduce it inside this particular patch it 
> would contain some unrelated code snippets such as .init = NULL for all apics 
> declaration).

Of course it should be a separate patch - even this patch looks a bit large - 
any way to split it up further?

And yes, if it fits it should be in apic->probe() instead of introducing a new 
->init() method.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-02 15:05         ` Ingo Molnar
@ 2011-05-02 15:16           ` Cyrill Gorcunov
  2011-05-03  6:31             ` Ingo Molnar
  0 siblings, 1 reply; 13+ messages in thread
From: Cyrill Gorcunov @ 2011-05-02 15:16 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Suresh Siddha, LKML

On 05/02/2011 07:05 PM, Ingo Molnar wrote:
...
>>
>>   Ingo, would it be fine to make apic->init() either _before_ this series or 
>> on top of them (because if I introduce it inside this particular patch it 
>> would contain some unrelated code snippets such as .init = NULL for all apics 
>> declaration).
> 
> Of course it should be a separate patch - even this patch looks a bit large - 
> any way to split it up further?

  Well, for this particular patch only the minimum is used, so I fear there is no
way to split it; probably I could drop some 'cleanup' bits from it and make them
a separate one. Gimme some time.

> 
> And yes, if it fits it should be in apic->probe() instead of introducing a new 
> ->init() method.

  I'll recheck. Thanks for all the comments, Ingo; drop this series for a while then.

> 
> Thanks,
> 
> 	Ingo


-- 
Thanks,
  Cyrill

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-02 14:02     ` Cyrill Gorcunov
  2011-05-02 14:23       ` Cyrill Gorcunov
@ 2011-05-02 18:27       ` Suresh Siddha
  1 sibling, 0 replies; 13+ messages in thread
From: Suresh Siddha @ 2011-05-02 18:27 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: Ingo Molnar, LKML

On Mon, 2011-05-02 at 07:02 -0700, Cyrill Gorcunov wrote:
> On 05/02/2011 05:22 PM, Ingo Molnar wrote:
> > 
> > * Cyrill Gorcunov <gorcunov@openvz.org> wrote:
> > 
> >> With this change, microbenchmark measuring the cost
> >> of flush_tlb_others(), with the flush tlb IPI being
> >> sent from a cpu in the socket-1 to all the logical
> >> cpus in socket-2 (on a Westmere-EX system that has
> >> 20 logical cpus in a socket) is 3x times better now
> >> (compared to the former 'send one-by-one' algorithm).
> > 
> > What kind of microbenchmark was this, could the actual results and measurement 
> > methods be shared as well?
> 
> Suresh, could you please post the microbenchmark?

It is a simple kernel hack to measure the TSC cost of flush_tlb_others()
with and without this change. The 3x improvement was specifically for the test
condition where we called flush_tlb_others() on a logical cpu in
socket-1, which sent the flush tlb IPI to all the logical cpus in
another socket.

This is done on WSM-EX, which has 20 logical cpus in one socket. The 20
logical cpus in that socket fall under two cluster groups. So 2
batches of grouped IPIs vs 20 serialized (at least the sending part)
IPIs.

thanks,
suresh

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-02 15:16           ` Cyrill Gorcunov
@ 2011-05-03  6:31             ` Ingo Molnar
  2011-05-03  6:59               ` Cyrill Gorcunov
  0 siblings, 1 reply; 13+ messages in thread
From: Ingo Molnar @ 2011-05-03  6:31 UTC (permalink / raw)
  To: Cyrill Gorcunov; +Cc: Suresh Siddha, LKML

* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On 05/02/2011 07:05 PM, Ingo Molnar wrote:
> ...
> >>
> >>   Ingo, would it be fine to make apic->init() either _before_ this series or 
> >> on top of them (because if I introduce it inside this particular patch it 
> >> would contain some unrelated code snippets such as .init = NULL for all apics 
> >> declaration).
> > 
> > Of course it should be a separate patch - even this patch looks a bit large - 
> > any way to split it up further?
> 
>   Well, for this particular path the only minimum is used, so i fear there is no
> way to split it, probably I could drop some 'cleanup' bits from it and make it
> a separate one. Gimme some time.

Well, first try to do *all* preparatory and cleanup changes that have low 
regression risk.

*Then* keep the most dangerous part to the end of it - so that it's easily 
reverted, should the need arise. Preferably the dangerous part should be much 
smaller than:

 3 files changed, 136 insertions(+), 46 deletions(-)

And no, it is not at all true that there is 'no way' to split the patch up any 
further: you could certainly add the data structures, init methods and such 
support code (which is low regression risk) in a separate patch than the 
changes that modify the existing x2apic_send_IPI_mask_allbutself() function and 
such.

Also, the loop body in the new __x2apic_send_IPI_mask() function could 
certainly be split out into a helper inline, making the code flow clearer.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
  2011-05-03  6:31             ` Ingo Molnar
@ 2011-05-03  6:59               ` Cyrill Gorcunov
  0 siblings, 0 replies; 13+ messages in thread
From: Cyrill Gorcunov @ 2011-05-03  6:59 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Suresh Siddha, LKML

On 05/03/2011 10:31 AM, Ingo Molnar wrote:
> 
> * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
>> On 05/02/2011 07:05 PM, Ingo Molnar wrote:
>> ...
>>>>
>>>>   Ingo, would it be fine to make apic->init() either _before_ this series or 
>>>> on top of them (because if I introduce it inside this particular patch it 
>>>> would contain some unrelated code snippets such as .init = NULL for all apics 
>>>> declaration).
>>>
>>> Of course it should be a separate patch - even this patch looks a bit large - 
>>> any way to split it up further?
>>
>>   Well, for this particular path the only minimum is used, so i fear there is no
>> way to split it, probably I could drop some 'cleanup' bits from it and make it
>> a separate one. Gimme some time.
> 
> Well, first try to do *all* preparatory and cleanup changes that have low 
> regression risk.

OK

> 
> *Then* keep the most dangerous part to the end of it - so that it's easily 
> reverted, should the need arise. Preferably the dangerous part should be much 
> smaller than:
> 
>  3 files changed, 136 insertions(+), 46 deletions(-)
> 
> And no, it is not at all true that there is 'no way' to split the patch up any 
> further: you could certainly add the data structures, init methods and such 
> support code (which is low regression risk) in a separate patch than the 
> changes that modify the existing x2apic_send_IPI_mask_allbutself() function and 
> such.

OK

> 
> Also, the loop body in the new __x2apic_send_IPI_mask() function could 
> certainly be split out into a helper inline, making the code flow clearer.

OK, will try so, thanks Ingo!

> 
> Thanks,
> 
> 	Ingo


-- 
Thanks,
  Cyrill

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2011-05-03  6:59 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-05-02 11:34 [patch 0/2] x2apic optimization, v4 log updated Cyrill Gorcunov
2011-05-02 11:34 ` [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4 Cyrill Gorcunov
2011-05-02 13:22   ` Ingo Molnar
2011-05-02 14:02     ` Cyrill Gorcunov
2011-05-02 14:23       ` Cyrill Gorcunov
2011-05-02 15:05         ` Ingo Molnar
2011-05-02 15:16           ` Cyrill Gorcunov
2011-05-03  6:31             ` Ingo Molnar
2011-05-03  6:59               ` Cyrill Gorcunov
2011-05-02 18:27       ` Suresh Siddha
2011-05-02 11:34 ` [patch 2/2] x86, x2apic: Move the common bits of physical and cluster modes to x2apic.h v4 Cyrill Gorcunov
  -- strict thread matches above, loose matches on Subject: below --
2011-04-30 17:14 [patch 0/2] x2apic optimization, round 4 Cyrill Gorcunov
2011-04-30 17:15 ` [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4 Cyrill Gorcunov
2011-05-01 17:18   ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox