All of lore.kernel.org
 help / color / mirror / Atom feed
From: Cyrill Gorcunov <gorcunov@gmail.com>
To: Ingo Molnar <mingo@elte.hu>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>,
	Yinghai Lu <yhlu.kernel@gmail.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	"H. Peter Anvin" <hpa@zytor.com>,
	lkml <linux-kernel@vger.kernel.org>
Subject: [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups
Date: Fri, 04 Feb 2011 00:03:49 +0300	[thread overview]
Message-ID: <4D4B1835.10606@gmail.com> (raw)

In the case of x2apic cluster mode we can group IPI register writes based
on the cluster group instead of individual per-cpu destination messages.
This reduces the apic register writes and reduces the amount of IPI messages
(in the best case we can reduce it by a factor of 16).

With this change, a microbenchmark measuring the cost of flush_tlb_others(),
with the flush tlb IPI being sent from a cpu in socket-1 to all the logical
cpus in socket-2 (on a Westmere-EX system that has 20 logical cpus in a socket),
is 3x better now (compared to the former 'send one-by-one' algorithm).

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/include/asm/apic.h           |    2
 arch/x86/kernel/apic/probe_64.c       |    4
 arch/x86/kernel/apic/x2apic_cluster.c |  169 +++++++++++++++++++++++++---------
 3 files changed, 131 insertions(+), 44 deletions(-)

Index: tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/probe_64.c
+++ tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
@@ -71,7 +71,9 @@ void __init default_setup_apic_routing(v
 #endif

 	if (apic == &apic_flat && num_possible_cpus() > 8)
-			apic = &apic_physflat;
+		apic = &apic_physflat;
+	else if (apic == &apic_x2apic_cluster)
+		x2apic_init_cpu_notifier();

 	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);

Index: tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/x2apic_cluster.c
+++ tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,12 +5,15 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/dmar.h>
+#include <linux/cpu.h>

 #include <asm/smp.h>
 #include <asm/apic.h>
 #include <asm/ipi.h>

 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);

 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -48,24 +51,53 @@ static void
 	native_x2apic_icr_write(cfg, apicid);
 }

-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static inline u32 x2apic_cluster(int cpu)
 {
-	unsigned long query_cpu;
+	return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
+}
+
+static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector,
+				   int exclude_self)
+{
+	unsigned long cpu;
 	unsigned long flags;
+	struct cpumask *cpus_in_cluster_ptr, *ipi_mask_ptr;
+	u32 dest, this_cpu;

 	x2apic_wrmsr_fence();

 	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		__x2apic_send_IPI_dest(
-			per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-			vector, apic->dest_logical);
+	this_cpu = smp_processor_id();
+
+	/*
+	 * we are to modify mask, so we need an own copy
+	 * and be sure it's manipulated with irq off
+	 */
+	ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+	cpumask_copy(ipi_mask_ptr, mask);
+
+	/*
+	 * the idea is to send one IPI per cluster
+	 */
+	for_each_cpu(cpu, ipi_mask_ptr) {
+		unsigned long i;
+		dest = 0;
+		cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+
+		/* only cpus in cluster involved */
+		for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr)
+			if (!exclude_self || i != this_cpu)
+				dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+
+		if (!dest)
+			continue;
+
+		__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+		/*
+		 * cluster sibling cpus should be discared now so
+		 * we would not send IPI them second time
+		 */
+		cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
 	}
 	local_irq_restore(flags);
 }
@@ -73,45 +105,22 @@ static void x2apic_send_IPI_mask(const s
 static void
  x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+	__x2apic_send_IPI_mask(mask, vector, 1);
 }

 static void x2apic_send_IPI_allbutself(int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 1);
+}

-	local_irq_save(flags);
-	for_each_online_cpu(query_cpu) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+	__x2apic_send_IPI_mask(mask, vector, 0);
 }

 static void x2apic_send_IPI_all(int vector)
 {
-	x2apic_send_IPI_mask(cpu_online_mask, vector);
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 0);
 }

 static int x2apic_apic_id_registered(void)
@@ -151,6 +160,34 @@ x2apic_cpu_mask_to_apicid_and(const stru
 	return per_cpu(x86_cpu_to_logical_apicid, cpu);
 }

+#define x2apic_propagate_cpu_cluster_status_online(cpu)		\
+	x2apic_propagate_cpu_cluster_status(cpu, 1)
+
+#define x2apic_propagate_cpu_cluster_status_offline(cpu)	\
+	x2apic_propagate_cpu_cluster_status(cpu, 0)
+
+/* kind of 'fill cluster cpu siblings' map */
+static void x2apic_propagate_cpu_cluster_status(int this_cpu, int online)
+{
+	int cpu;
+
+	if (online) {
+		for_each_online_cpu(cpu) {
+			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+				continue;
+			__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		}
+	} else {
+		for_each_online_cpu(cpu) {
+			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+				continue;
+			__cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		}
+	}
+}
+
 static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
 {
 	unsigned int id;
@@ -184,8 +221,54 @@ static void init_x2apic_ldr(void)
 	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
 }

-struct apic apic_x2apic_cluster = {
+static int __cpuinit
+cluster_setup(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	int err = 0;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+		if (!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)) {
+			free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
+			free_cpumask_var(per_cpu(ipi_mask, cpu));
+			err = -ENOMEM;
+		}
+		break;
+	case CPU_ONLINE:
+		x2apic_propagate_cpu_cluster_status_online(cpu);
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+		x2apic_propagate_cpu_cluster_status_offline(cpu);
+		free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
+		free_cpumask_var(per_cpu(ipi_mask, cpu));
+		break;
+	}
+
+	return notifier_from_errno(err);
+}
+
+static struct notifier_block __refdata x2apic_cpu_notifier =
+{
+	.notifier_call = cluster_setup,
+};
+
+void x2apic_init_cpu_notifier(void)
+{
+	int cpu = smp_processor_id();

+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+	register_hotcpu_notifier(&x2apic_cpu_notifier);
+}
+
+struct apic apic_x2apic_cluster = {
 	.name				= "cluster x2apic",
 	.probe				= NULL,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
Index: tip-linux-2.6/arch/x86/include/asm/apic.h
===================================================================
--- tip-linux-2.6.orig/arch/x86/include/asm/apic.h
+++ tip-linux-2.6/arch/x86/include/asm/apic.h
@@ -179,6 +179,8 @@ extern int x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
+extern void x2apic_init_cpu_notifier(void);
+
 static inline int x2apic_enabled(void)
 {
 	u64 msr;

             reply	other threads:[~2011-02-03 21:03 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-02-03 21:03 Cyrill Gorcunov [this message]
2011-02-14 11:45 ` [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups Ingo Molnar
2011-02-14 15:10   ` Cyrill Gorcunov
2011-02-15  3:22     ` Ingo Molnar
2011-02-15  8:39       ` Cyrill Gorcunov
2011-02-16  9:23         ` Ingo Molnar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4D4B1835.10606@gmail.com \
    --to=gorcunov@gmail.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=suresh.b.siddha@intel.com \
    --cc=tglx@linutronix.de \
    --cc=yhlu.kernel@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.