[patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Cyrill Gorcunov <gorcunov@openvz.org>
To: Ingo Molnar <mingo@elte.hu>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>,
	LKML <linux-kernel@vger.kernel.org>,
	Cyrill Gorcunov <gorcunov@openvz.org>
Subject: [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4
Date: Mon, 02 May 2011 15:34:46 +0400	[thread overview]
Message-ID: <20110502114024.222582172@openvz.org> (raw)
In-Reply-To: 20110502113445.751391656@openvz.org

[-- Attachment #1: x86-x2apic-optimise-cluster-mode-v4 --]
[-- Type: text/plain, Size: 9646 bytes --]

In the case of x2apic cluster mode we can group
IPI register writes based on the cluster group
instead of individual per-cpu destiantion messages.

This reduces the apic register writes and reduces
the amount of IPI messages (in the best case we can
reduce it by a factor of 16).

With this change, microbenchmark measuring the cost
of flush_tlb_others(), with the flush tlb IPI being
sent from a cpu in the socket-1 to all the logical
cpus in socket-2 (on a Westmere-EX system that has
20 logical cpus in a socket) is 3x times better now
(compared to the former 'send one-by-one' algorithm).

v2: Suresh fixed cpumask allocation to be dynamic,
    stack allocation is not acceptable for such things
    because can be exhausted.

v3: Address Ingo concerns on code style, also a note
    added just to not forget that we need merge
    probe_64/32 into some common structure.

v4: Suresh discovered (and fixed) that cluster
    infomation must be updated at CPU_UP_PREPARE
    state otherwise if IPI happens too early we
    will be in touble having incomplete cluster
    sibling map.

Tested-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/include/asm/apic.h           |    2 
 arch/x86/kernel/apic/probe_64.c       |   13 ++
 arch/x86/kernel/apic/x2apic_cluster.c |  167 ++++++++++++++++++++++++----------
 3 files changed, 136 insertions(+), 46 deletions(-)

Index: tip-linux-2.6/arch/x86/include/asm/apic.h
===================================================================
--- tip-linux-2.6.orig/arch/x86/include/asm/apic.h
+++ tip-linux-2.6/arch/x86/include/asm/apic.h
@@ -178,6 +178,8 @@ extern int x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
+extern void x2apic_init_cpu_notifier(void);
+
 static inline int x2apic_enabled(void)
 {
 	u64 msr;
Index: tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/probe_64.c
+++ tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
@@ -55,6 +55,15 @@ static int apicid_phys_pkg_id(int initia
 void __init default_setup_apic_routing(void)
 {
 
+	/*
+	 * FIXME:
+	 *
+	 * Cleanup the apic routing selection by having an apic driver specific
+	 * selection routine. Then all we need to do here is iterate through
+	 * them to finalize the apic selection. That would get rid of the
+	 * ifdef mess and most of the code here.
+	 */
+
 	enable_IR_x2apic();
 
 #ifdef CONFIG_X86_X2APIC
@@ -71,7 +80,9 @@ void __init default_setup_apic_routing(v
 #endif
 
 	if (apic == &apic_flat && num_possible_cpus() > 8)
-			apic = &apic_physflat;
+		apic = &apic_physflat;
+	else if (apic == &apic_x2apic_cluster)
+		x2apic_init_cpu_notifier();
 
 	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 
Index: tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/x2apic_cluster.c
+++ tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,12 +5,15 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/dmar.h>
+#include <linux/cpu.h>
 
 #include <asm/smp.h>
 #include <asm/apic.h>
 #include <asm/ipi.h>
 
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -36,7 +39,7 @@ static void x2apic_vector_allocation_dom
 }
 
 static void
- __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
 {
 	unsigned long cfg;
 
@@ -48,70 +51,80 @@ static void
 	native_x2apic_icr_write(cfg, apicid);
 }
 
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static inline u32 x2apic_cluster(int cpu)
 {
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		__x2apic_send_IPI_dest(
-			per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-			vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+	return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
 }
 
 static void
- x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int exclude_self)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
+	struct cpumask *cpus_in_cluster_ptr;
+	struct cpumask *ipi_mask_ptr;
+	unsigned int cpu, this_cpu;
 	unsigned long flags;
+	u32 dest;
 
 	x2apic_wrmsr_fence();
 
 	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		if (query_cpu == this_cpu)
+	this_cpu = smp_processor_id();
+
+	/*
+	 * We are to modify mask, so we need an own copy
+	 * and be sure it's manipulated with irq off.
+	 */
+	ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+	cpumask_copy(ipi_mask_ptr, mask);
+
+	/*
+	 * The idea is to send one IPI per cluster.
+	 */
+	for_each_cpu(cpu, ipi_mask_ptr) {
+		unsigned long i;
+
+		cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+		dest = 0;
+
+		/* Collect cpus in cluster. */
+		for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
+			if (!exclude_self || i != this_cpu)
+				dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+		}
+
+		if (!dest)
 			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
+
+		__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+		/*
+		 * Cluster sibling cpus should be discared now so
+		 * we would not send IPI them second time.
+		 */
+		cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
 	}
+
 	local_irq_restore(flags);
 }
 
-static void x2apic_send_IPI_allbutself(int vector)
+static void
+x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
+	__x2apic_send_IPI_mask(mask, vector, 1);
+}
 
-	x2apic_wrmsr_fence();
+static void x2apic_send_IPI_allbutself(int vector)
+{
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 1);
+}
 
-	local_irq_save(flags);
-	for_each_online_cpu(query_cpu) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+	__x2apic_send_IPI_mask(mask, vector, 0);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-	x2apic_send_IPI_mask(cpu_online_mask, vector);
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 0);
 }
 
 static int x2apic_apic_id_registered(void)
@@ -151,6 +164,7 @@ x2apic_cpu_mask_to_apicid_and(const stru
 	return per_cpu(x86_cpu_to_logical_apicid, cpu);
 }
 
+
 static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
 {
 	unsigned int id;
@@ -179,13 +193,76 @@ static void x2apic_send_IPI_self(int vec
 
 static void init_x2apic_ldr(void)
 {
+	unsigned int this_cpu = smp_processor_id();
+	unsigned int cpu;
+
+	per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+	__cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+	for_each_online_cpu(cpu) {
+		if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+			continue;
+		__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+		__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+	}
+}
+
+/*
+ * At CPU state changes, update the x2apic cluster sibling info.
+ */
+static int __cpuinit
+update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	unsigned int this_cpu = (unsigned long)hcpu;
+	unsigned int cpu;
+	int err = 0;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
+					GFP_KERNEL)) {
+			err = -ENOMEM;
+		} else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
+					       GFP_KERNEL)) {
+			free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+			err = -ENOMEM;
+		}
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+		for_each_online_cpu(cpu) {
+			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+				continue;
+			__cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		}
+		free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+		free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+		break;
+	}
+
+	return notifier_from_errno(err);
+}
+
+static struct notifier_block __refdata x2apic_cpu_notifier = {
+	.notifier_call = update_clusterinfo,
+};
+
+void x2apic_init_cpu_notifier(void)
+{
 	int cpu = smp_processor_id();
 
-	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+
+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+
+	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+	register_hotcpu_notifier(&x2apic_cpu_notifier);
 }
 
 struct apic apic_x2apic_cluster = {
-
 	.name				= "cluster x2apic",
 	.probe				= NULL,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,

next prev parent reply	other threads:[~2011-05-02 11:40 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-05-02 11:34 [patch 0/2] x2apic optimization, v4 log updated Cyrill Gorcunov
2011-05-02 11:34 ` Cyrill Gorcunov [this message]
2011-05-02 13:22   ` [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4 Ingo Molnar
2011-05-02 14:02     ` Cyrill Gorcunov
2011-05-02 14:23       ` Cyrill Gorcunov
2011-05-02 15:05         ` Ingo Molnar
2011-05-02 15:16           ` Cyrill Gorcunov
2011-05-03  6:31             ` Ingo Molnar
2011-05-03  6:59               ` Cyrill Gorcunov
2011-05-02 18:27       ` Suresh Siddha
2011-05-02 11:34 ` [patch 2/2] x86, x2apic: Move the common bits of physical and cluster modes to x2apic.h v4 Cyrill Gorcunov
  -- strict thread matches above, loose matches on Subject: below --
2011-04-30 17:14 [patch 0/2] x2apic optimization, round 4 Cyrill Gorcunov
2011-04-30 17:15 ` [patch 1/2] x86, x2apic: minimize IPI register writes using cluster groups v4 Cyrill Gorcunov
2011-05-01 17:18   ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20110502114024.222582172@openvz.org \
    --to=gorcunov@openvz.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=suresh.b.siddha@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.