* [PATCH] - Optional method to purge the TLB on SN systems
From: Jack Steiner @ 2007-03-27 19:39 UTC (permalink / raw)
To: linux-ia64
This patch adds an optional method for purging the TLB on SN IA64 systems.
The change should not affect any non-SN system.
Signed-off-by: Jack Steiner <steiner@sgi.com>
---
Instead of using the chipset (SHUB) MMRs for issuing PTC flushes, the new
code sends IPIs to all affected nodes. Each node then issues local PTC flushes.
The purpose of this change is to work around performance issues that have
been seen on very large SSI systems. Small to medium size systems will continue
to use the original algorithm.
I would like to make the selection of the algorithm automatic but unfortunately
the optimum algorithm is workload specific. The intent is that only sites that
encounter problems will use the new algorithm.
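At its core, the new path is a snapshot-and-wait completion protocol: snapshot
each target cpu's flush count, send the IPIs, then spin until every count moves
past its snapshot. A rough standalone C sketch of just that protocol (the names
here are illustrative only; the real code in the diffs below uses per-cpu
counts, platform_send_ipi() and udelay()):

#include <stdatomic.h>

#define MAX_CPUS 64

static atomic_uint flush_count[MAX_CPUS];	/* bumped by each target cpu */

/* Runs on a target cpu when its flush IPI arrives. */
static void handle_flush_ipi(int cpu)
{
	/* local_flush_tlb_all() would run here in the kernel */
	atomic_fetch_add(&flush_count[cpu], 1);
}

static void send_flush_ipi(int cpu)
{
	(void)cpu;	/* platform_send_ipi() in the kernel; a no-op in this sketch */
}

/* Initiator side: a moved count proves the target ran its local flush. */
static void flush_cpus(const int *cpus, int ncpus)
{
	unsigned int snap[MAX_CPUS];
	int i;

	for (i = 0; i < ncpus; i++)
		snap[cpus[i]] = atomic_load(&flush_count[cpus[i]]);
	for (i = 0; i < ncpus; i++)
		send_flush_ipi(cpus[i]);
	for (i = 0; i < ncpus; i++)
		while (atomic_load(&flush_count[cpus[i]]) == snap[cpus[i]])
			;	/* count unchanged => flush not done yet */
}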
Index: linux/arch/ia64/kernel/irq_ia64.c
===================================================================
--- linux.orig/arch/ia64/kernel/irq_ia64.c 2007-03-27 10:08:06.935647573 -0500
+++ linux/arch/ia64/kernel/irq_ia64.c 2007-03-27 10:15:41.859634603 -0500
@@ -39,6 +39,7 @@
#include <asm/machvec.h>
#include <asm/pgtable.h>
#include <asm/system.h>
+#include <asm/tlbflush.h>
#ifdef CONFIG_PERFMON
# include <asm/perfmon.h>
@@ -127,8 +128,10 @@ void destroy_irq(unsigned int irq)
#ifdef CONFIG_SMP
# define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE)
+# define IS_LOCAL_TLB_FLUSH(vec) (vec == IA64_IPI_LOCAL_TLB_FLUSH)
#else
# define IS_RESCHEDULE(vec) (0)
+# define IS_LOCAL_TLB_FLUSH(vec) (0)
#endif
/*
* That's where the IVT branches when we get an external
@@ -182,7 +185,10 @@ ia64_handle_irq (ia64_vector vector, str
while (vector != IA64_SPURIOUS_INT_VECTOR) {
if (unlikely(IS_RESCHEDULE(vector)))
kstat_this_cpu.irqs[vector]++;
- else {
+ else if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
+ smp_local_flush_tlb();
+ kstat_this_cpu.irqs[vector]++;
+ } else {
ia64_setreg(_IA64_REG_CR_TPR, vector);
ia64_srlz_d();
@@ -229,7 +235,10 @@ void ia64_process_pending_intr(void)
while (vector != IA64_SPURIOUS_INT_VECTOR) {
if (unlikely(IS_RESCHEDULE(vector)))
kstat_this_cpu.irqs[vector]++;
- else {
+ else if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
+ smp_local_flush_tlb();
+ kstat_this_cpu.irqs[vector]++;
+ } else {
struct pt_regs *old_regs = set_irq_regs(NULL);
ia64_setreg(_IA64_REG_CR_TPR, vector);
@@ -267,6 +276,12 @@ static irqreturn_t dummy_handler (int ir
BUG();
}
+static struct irqaction tlb_irqaction = {
+ .handler = dummy_handler,
+ .flags = SA_INTERRUPT,
+ .name = "tlb_flush"
+};
+
static struct irqaction ipi_irqaction = {
.handler = handle_IPI,
.flags = IRQF_DISABLED,
@@ -303,6 +318,7 @@ init_IRQ (void)
#ifdef CONFIG_SMP
register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction);
+ register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, &tlb_irqaction);
#endif
#ifdef CONFIG_PERFMON
pfm_init_percpu();
Index: linux/arch/ia64/kernel/smp.c
===================================================================
--- linux.orig/arch/ia64/kernel/smp.c 2007-03-27 10:08:06.935647573 -0500
+++ linux/arch/ia64/kernel/smp.c 2007-03-27 10:15:41.859634603 -0500
@@ -50,6 +50,12 @@
#include <asm/mca.h>
/*
+ * Per-cpu counts of the number of local TLB flushes that are done via an IPI.
+ * Note: keep structure small - local copies are made on the task's stack.
+ */
+static DEFINE_PER_CPU(unsigned short, local_flush_count) ____cacheline_aligned;
+
+/*
* Structure and data for smp_call_function(). This is designed to minimise static memory
* requirements. It also looks cleaner.
*/
@@ -248,6 +254,64 @@ smp_send_reschedule (int cpu)
platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0);
}
+/*
+ * Called with preemption disabled.
+ */
+static void
+smp_send_local_flush_tlb (int cpu)
+{
+ platform_send_ipi(cpu, IA64_IPI_LOCAL_TLB_FLUSH, IA64_IPI_DM_INT, 0);
+}
+
+void
+smp_local_flush_tlb(void)
+{
+ __ia64_per_cpu_var(local_flush_count)++;
+ local_flush_tlb_all();
+}
+
+/*
+ * Flush counts are kept in a "short" to preserve stack space. It is possible (but
+ * highly unlikely) that a count could wrap & the flush would not be seen as complete.
+ * Retry the flush IPI after a long time...
+ */
+#define FLUSH_RETRY_COUNT 10000
+
+void
+smp_flush_tlb_cpumask (cpumask_t xcpumask)
+{
+ unsigned short counts[NR_CPUS];
+ cpumask_t cpumask = xcpumask;
+ int count, mycpu, cpu, flush_mycpu = 0;
+
+ preempt_disable();
+ mycpu = smp_processor_id();
+
+ for_each_cpu_mask(cpu, cpumask) {
+ counts[cpu] = per_cpu(local_flush_count, cpu);
+ mb();
+ if (cpu == mycpu)
+ flush_mycpu = 1;
+ else
+ smp_send_local_flush_tlb(cpu);
+ }
+
+ if (flush_mycpu)
+ smp_local_flush_tlb();
+
+ for_each_cpu_mask(cpu, cpumask) {
+ count = 0;
+ while(counts[cpu] == per_cpu(local_flush_count, cpu)) {
+ udelay(1);
+ if (count++ >= FLUSH_RETRY_COUNT) {
+ count = 0;
+ smp_send_local_flush_tlb(cpu);
+ }
+ }
+ }
+ preempt_enable();
+}
+
void
smp_flush_tlb_all (void)
{
Index: linux/arch/ia64/sn/kernel/sn2/sn2_smp.c
===================================================================
--- linux.orig/arch/ia64/sn/kernel/sn2/sn2_smp.c 2007-03-27 10:08:06.935647573 -0500
+++ linux/arch/ia64/sn/kernel/sn2/sn2_smp.c 2007-03-27 11:21:53.285791962 -0500
@@ -46,6 +46,9 @@ DECLARE_PER_CPU(struct ptc_stats, ptcsta
static __cacheline_aligned DEFINE_SPINLOCK(sn2_global_ptc_lock);
+/* 0 = old algorithm (no IPI flushes), 1 = ipi deadlock flush, 2 = ipi instead of SHUB ptc, >2 = always ipi */
+static int sn2_flush_opt = 0;
+
extern unsigned long
sn2_ptc_deadlock_recovery_core(volatile unsigned long *, unsigned long,
volatile unsigned long *, unsigned long,
@@ -76,6 +79,8 @@ struct ptc_stats {
unsigned long shub_itc_clocks;
unsigned long shub_itc_clocks_max;
unsigned long shub_ptc_flushes_not_my_mm;
+ unsigned long shub_ipi_flushes;
+ unsigned long shub_ipi_flushes_itc_clocks;
};
#define sn2_ptctest 0
@@ -121,6 +126,18 @@ void sn_tlb_migrate_finish(struct mm_str
flush_tlb_mm(mm);
}
+static void
+sn2_ipi_flush_all_tlb(struct mm_struct *mm)
+{
+ unsigned long itc;
+
+ itc = ia64_get_itc();
+ smp_flush_tlb_cpumask(mm->cpu_vm_mask);
+ itc = ia64_get_itc() - itc;
+ __get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc;
+ __get_cpu_var(ptcstats).shub_ipi_flushes++;
+}
+
/**
* sn2_global_tlb_purge - globally purge translation cache of virtual address range
* @mm: mm_struct containing virtual address range
@@ -154,7 +171,12 @@ sn2_global_tlb_purge(struct mm_struct *m
unsigned long itc, itc2, flags, data0 = 0, data1 = 0, rr_value, old_rr = 0;
short nasids[MAX_NUMNODES], nix;
nodemask_t nodes_flushed;
- int active, max_active, deadlock;
+ int active, max_active, deadlock, flush_opt = sn2_flush_opt;
+
+ if (flush_opt > 2) {
+ sn2_ipi_flush_all_tlb(mm);
+ return;
+ }
nodes_clear(nodes_flushed);
i = 0;
@@ -189,6 +211,12 @@ sn2_global_tlb_purge(struct mm_struct *m
return;
}
+ if (flush_opt == 2) {
+ sn2_ipi_flush_all_tlb(mm);
+ preempt_enable();
+ return;
+ }
+
itc = ia64_get_itc();
nix = 0;
for_each_node_mask(cnode, nodes_flushed)
@@ -256,6 +284,8 @@ sn2_global_tlb_purge(struct mm_struct *m
}
if (active >= max_active || i == (nix - 1)) {
if ((deadlock = wait_piowc())) {
+ if (flush_opt == 1)
+ goto done;
sn2_ptc_deadlock_recovery(nasids, ibegin, i, mynasid, ptc0, data0, ptc1, data1);
if (reset_max_active_on_deadlock())
max_active = 1;
@@ -267,6 +297,7 @@ sn2_global_tlb_purge(struct mm_struct *m
start += (1UL << nbits);
} while (start < end);
+done:
itc2 = ia64_get_itc() - itc2;
__get_cpu_var(ptcstats).shub_itc_clocks += itc2;
if (itc2 > __get_cpu_var(ptcstats).shub_itc_clocks_max)
@@ -279,6 +310,11 @@ sn2_global_tlb_purge(struct mm_struct *m
spin_unlock_irqrestore(PTC_LOCK(shub1), flags);
+ if (flush_opt == 1 && deadlock) {
+ __get_cpu_var(ptcstats).deadlocks++;
+ sn2_ipi_flush_all_tlb(mm);
+ }
+
preempt_enable();
}
@@ -425,24 +461,42 @@ static int sn2_ptc_seq_show(struct seq_f
if (!cpu) {
seq_printf(file,
- "# cpu ptc_l newrid ptc_flushes nodes_flushed deadlocks lock_nsec shub_nsec shub_nsec_max not_my_mm deadlock2\n");
- seq_printf(file, "# ptctest %d\n", sn2_ptctest);
+ "# cpu ptc_l newrid ptc_flushes nodes_flushed deadlocks lock_nsec shub_nsec shub_nsec_max not_my_mm deadlock2 ipi_fluches ipi_nsec\n");
+ seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt);
}
if (cpu < NR_CPUS && cpu_online(cpu)) {
stat = &per_cpu(ptcstats, cpu);
- seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
+ seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed,
stat->deadlocks,
1000 * stat->lock_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
1000 * stat->shub_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
1000 * stat->shub_itc_clocks_max / per_cpu(cpu_info, cpu).cyc_per_usec,
stat->shub_ptc_flushes_not_my_mm,
- stat->deadlocks2);
+ stat->deadlocks2,
+ stat->shub_ipi_flushes,
+ 1000 * stat->shub_ipi_flushes_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec);
}
return 0;
}
+static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, size_t count, loff_t *data)
+{
+ int cpu;
+ char optstr[64];
+
+ if (copy_from_user(optstr, user, count))
+ return -EFAULT;
+ optstr[count - 1] = '\0';
+ sn2_flush_opt = simple_strtoul(optstr, NULL, 0);
+
+ for_each_online_cpu(cpu)
+ memset(&per_cpu(ptcstats, cpu), 0, sizeof(struct ptc_stats));
+
+ return count;
+}
+
static struct seq_operations sn2_ptc_seq_ops = {
.start = sn2_ptc_seq_start,
.next = sn2_ptc_seq_next,
@@ -458,6 +512,7 @@ static int sn2_ptc_proc_open(struct inod
static const struct file_operations proc_sn2_ptc_operations = {
.open = sn2_ptc_proc_open,
.read = seq_read,
+ .write = sn2_ptc_proc_write,
.llseek = seq_lseek,
.release = seq_release,
};
Index: linux/include/asm-ia64/hw_irq.h
===================================================================
--- linux.orig/include/asm-ia64/hw_irq.h 2007-03-27 10:08:06.935647573 -0500
+++ linux/include/asm-ia64/hw_irq.h 2007-03-27 10:15:41.875636572 -0500
@@ -66,6 +66,7 @@ extern int ia64_last_device_vector;
#define IA64_PERFMON_VECTOR 0xee /* performanc monitor interrupt vector */
#define IA64_TIMER_VECTOR 0xef /* use highest-prio group 15 interrupt for timer */
#define IA64_MCA_WAKEUP_VECTOR 0xf0 /* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */
+#define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP reschedule */
#define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */
#define IA64_IPI_VECTOR 0xfe /* inter-processor interrupt vector */
Index: linux/include/asm-ia64/tlbflush.h
===================================================================
--- linux.orig/include/asm-ia64/tlbflush.h 2007-03-27 10:08:06.939648066 -0500
+++ linux/include/asm-ia64/tlbflush.h 2007-03-27 10:15:41.879637065 -0500
@@ -27,9 +27,11 @@ extern void local_flush_tlb_all (void);
#ifdef CONFIG_SMP
extern void smp_flush_tlb_all (void);
extern void smp_flush_tlb_mm (struct mm_struct *mm);
+ extern void smp_flush_tlb_cpumask (cpumask_t xcpumask);
# define flush_tlb_all() smp_flush_tlb_all()
#else
# define flush_tlb_all() local_flush_tlb_all()
+# define smp_flush_tlb_cpumask() local_flush_tlb_all()
#endif
static inline void
@@ -94,6 +96,15 @@ flush_tlb_pgtables (struct mm_struct *mm
*/
}
+/*
+ * Flush the local TLB. Invoked from another cpu using an IPI.
+ */
+#ifdef CONFIG_SMP
+void smp_local_flush_tlb(void);
+#else
+#define smp_local_flush_tlb()
+#endif
+
#define flush_tlb_kernel_range(start, end) flush_tlb_all() /* XXX fix me */
#endif /* _ASM_IA64_TLBFLUSH_H */
* RE: [PATCH] - Optional method to purge the TLB on SN systems
From: Luck, Tony @ 2007-03-27 20:24 UTC (permalink / raw)
To: linux-ia64
+smp_flush_tlb_cpumask (cpumask_t xcpumask)
+{
+ unsigned short counts[NR_CPUS];
Even as a "short" ... that is still a lot of stack. Possibly
a new place to fail when NR_CPUS gets bumped up again?
+#define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP reschedule */
#define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */
Cut & pasted the comment when duping this line?
-Tony
* Re: [PATCH] - Optional method to purge the TLB on SN systems
From: Jack Steiner @ 2007-03-27 20:33 UTC (permalink / raw)
To: linux-ia64
On Tue, Mar 27, 2007 at 01:24:27PM -0700, Luck, Tony wrote:
> +smp_flush_tlb_cpumask (cpumask_t xcpumask)
> +{
> + unsigned short counts[NR_CPUS];
>
> Even as a "short" ... that is still a lot of stack. Possibly
> a new place to fail when NR_CPUS gets bumped up again?
I was somewhat concerned about that, too. At NR_CPUS=1024, it
uses 2k bytes which is big but I think ok. Even at NR_CPUS=2048,
I think it is still ok but I'm getting a little nervous.
FWIW, slab.c has a "struct array_cache *new[NR_CPUS]" on the
stack.
Also, net/core/utils.c has "unsigned long seed[NR_CPUS]".
I'm only adding 1/4 as much space. Can I get away with it or
should I allocate a node-local chunk of memory & keep a pointer to
it in per-cpu data?
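For illustration, a rough sketch of that alternative (hypothetical names,
untested, not part of any posted patch):

/* Allocate each cpu's shadow array from node-local memory once, and
 * remember it in per-cpu data instead of declaring it on the stack. */
static DEFINE_PER_CPU(unsigned short *, flush_shadow);

static int alloc_flush_shadow(int cpu)
{
	unsigned short *p;

	p = kmalloc_node(NR_CPUS * sizeof(*p), GFP_KERNEL, cpu_to_node(cpu));
	if (!p)
		return -ENOMEM;
	per_cpu(flush_shadow, cpu) = p;
	return 0;
}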
>
> +#define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP reschedule */
> #define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */
>
> Cut & pasted the comment when duping this line?
Whoops :-)
Fixed....
* RE: [PATCH] - Optional method to purge the TLB on SN systems
From: Luck, Tony @ 2007-03-27 22:32 UTC (permalink / raw)
To: linux-ia64
> FWIW, slab.c has a "struct array_cache *new[NR_CPUS]" on the
> stack.
I don't think so. That line is part of the "struct ccupdate_struct"
definition. We only declare a pointer to one of those on the stack.
> Also, net/core/utils.c has "unsigned long seed[NR_CPUS]".
I don't see this one either (in Linus 2.6.21-rc5). Looking at
the history Suresh Siddha zapped it last August specifically
because of worries about stack overflow on 1024 cpu systems.
I do see:
kernel/sched.c: int ints[NR_CPUS], i;
> I'm only adding 1/4 as much space. Can I get away with it or
> should I allocate a node-local chunk of memory & keep a pointer to
> it in per-cpu data?
Debugging kernel stack overflow is no fun at all, so you might
want to take the better-safe-than-sorry approach here.
-Tony
* Re: [PATCH] - Optional method to purge the TLB on SN systems
From: Jack Steiner @ 2007-03-27 22:46 UTC (permalink / raw)
To: linux-ia64
On Tue, Mar 27, 2007 at 03:32:16PM -0700, Luck, Tony wrote:
> > FWIW, slab.c has a "struct array_cache *new[NR_CPUS]" on the
> > stack.
>
> I don't think so. That line is part of the "struct ccupdate_struct"
> definition. We only declare a pointer to one of those on the stack.
You are correct (of course). I was looking in a tree based on
SLES10. That is also true for the other reference in the network
code.
> Debugging kernel stack overflow is no fun at all, so you might
> want to take the better-safe-than-sorry approach here.
Probably true. I'll repost a modified patch (hopefully) tomorrow.
Does anyone see any other issues?
* Re: [PATCH] - Optional method to purge the TLB on SN systems
From: Zou Nan hai @ 2007-03-28 0:46 UTC (permalink / raw)
To: linux-ia64
On Wed, 2007-03-28 at 03:39, Jack Steiner wrote:
> This patch adds an optional method for purging the TLB on SN IA64 systems.
> The change should not affect any non-SN system.
>
> Signed-off-by: Jack Steiner <steiner@sgi.com>
>
> ---
>
> +void
> +smp_flush_tlb_cpumask (cpumask_t xcpumask)
> +{
> + unsigned short counts[NR_CPUS];
> + cpumask_t cpumask = xcpumask;
> + int count, mycpu, cpu, flush_mycpu = 0;
> +
> + preempt_disable();
> + mycpu = smp_processor_id();
> +
> + for_each_cpu_mask(cpu, cpumask) {
> + counts[cpu] = per_cpu(local_flush_count, cpu);
> + mb();
> + if (cpu == mycpu)
> + flush_mycpu = 1;
> + else
> + smp_send_local_flush_tlb(cpu);
> + }
> +
> + if (flush_mycpu)
> + smp_local_flush_tlb();
> +
> + for_each_cpu_mask(cpu, cpumask) {
> + count = 0;
> + while(counts[cpu] == per_cpu(local_flush_count, cpu)) {
Due to the 64k offset of percpu data, the same percpu variable on different
CPUs is very likely to be on the same cacheline of some levels of
cache.
So I think the operation on local_flush_count may be very cache
unfriendly...
Zou Nan hai
* Re: [PATCH] - Optional method to purge the TLB on SN systems
From: Jack Steiner @ 2007-03-28 1:53 UTC (permalink / raw)
To: linux-ia64
On Wed, Mar 28, 2007 at 08:46:44AM +0800, Zou Nan hai wrote:
> On Wed, 2007-03-28 at 03:39, Jack Steiner wrote:
>
> > This patch adds an optional method for purging the TLB on SN IA64 systems.
> > The change should not affect any non-SN system.
> >
> > Signed-off-by: Jack Steiner <steiner@sgi.com>
> >
> > ---
> >
> > +void
> > +smp_flush_tlb_cpumask (cpumask_t xcpumask)
> > +{
> > + unsigned short counts[NR_CPUS];
> > + cpumask_t cpumask = xcpumask;
> > + int count, mycpu, cpu, flush_mycpu = 0;
> > +
> > + preempt_disable();
> > + mycpu = smp_processor_id();
> > +
> > + for_each_cpu_mask(cpu, cpumask) {
> > + counts[cpu] = per_cpu(local_flush_count, cpu);
> > + mb();
> > + if (cpu == mycpu)
> > + flush_mycpu = 1;
> > + else
> > + smp_send_local_flush_tlb(cpu);
> > + }
> > +
> > + if (flush_mycpu)
> > + smp_local_flush_tlb();
> > +
> > + for_each_cpu_mask(cpu, cpumask) {
> > + count = 0;
> > + while(counts[cpu] == per_cpu(local_flush_count, cpu)) {
>
> Due to the 64k offset of percpu data, the same percpu variable on different
> CPUs is very likely to be on the same cacheline of some levels of
> cache.
>
> So I think the operation on local_flush_count may be very cache
> unfriendly...
I was concerned about that, too, but testing finally convinced me that
it was not an issue. I think the reason is that it takes a few hundred
nanoseconds per cpu to send an IPI. So rather than a contended cache
line, we have a line that is serially read by multiple cpus. Although
contention can occur, typically multiple cpus are not trying to read
the same line at the same time.
For example (oversimplified), IPI sent to cpu 0 at time 0, to cpu 1 at
time ~100, cpu 2 at time ~200, etc. The IPI requires a chipset access
that takes order-of-memory-access time. Assume it takes N usec for a
cpu to recognize the IPI & call the TLB flushing code. Cpu 0 reads
local_flush_count at time N, cpu 1 reads local_flush_count at time
100+N, etc. Very little contention, just serial access.
--
I tried a second algorithm where the local_flush_count was kept in
node-local percpu data. That scheme was significantly slower. Most
likely because the cpu that initiates the flush will take N (# of
cpus) cache misses to get an initial snapshot of the counts, then
another N cache misses to check for completion. This assumes that
a cpu doing a flush is not the most-recent cpu to do a flush.
I believe this is typical.
Keeping the counts in a single array (64cpus/cache line)
significantly reduces the number of cache misses.
Another disadvantage of keeping counts in per-cpu data is that
scanning the counts trashes the TLB for large NR_CPUS. The counts will
be located in different 16MB granules. Each reference to cpu's percpu
data will require a different TLB entry to map the address used to
reference the count. To scan N cpus, there will be ~2*N TLB misses
plus at the end of the flush, the contents of the TLB are useless
for most kernel or user use.
--
I tried a third algorithm where the counts were kept in a single array
but each count was cacheline aligned to eliminate any possibility
of contention. This was better than the second method that trashed
the TLB. 1 TLB entry will cover the entire array. Unfortunately,
this algorithm still incurs 2*N cache misses & is slower than
the current algorithm.
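For reference, the three count layouts as declarations (a sketch with
illustrative names; note the follow-up messages below, which point out that
the patch as posted actually declares the percpu variant):

/* 1) Packed array: 64 two-byte counts per 128-byte cacheline, one TLB
 *    entry covers the whole array; fewest misses of the three. */
static unsigned short flush_counts_packed[NR_CPUS];

/* 2) Count in each cpu's node-local percpu data: ~2*N TLB misses to
 *    scan at large NR_CPUS. */
static DEFINE_PER_CPU(unsigned short, flush_count_percpu);

/* 3) One count per cacheline: no sharing and one TLB entry, but still
 *    ~2*N cache misses for the initiator. */
static struct {
	unsigned short count;
} ____cacheline_aligned flush_counts_padded[NR_CPUS];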
Does this explanation make sense... If anyone has an alternate
algorithm, I'd be glad to try it.
-- jack
* RE: [PATCH] - Optional method to purge the TLB on SN systems
From: Zou, Nanhai @ 2007-03-28 3:03 UTC (permalink / raw)
To: linux-ia64
> -----Original Message-----
> From: Jack Steiner [mailto:steiner@sgi.com]
> Sent: 2007-03-28 9:53
> To: Zou, Nanhai
> Cc: Luck, Tony; Linux-IA64
> Subject: Re: [PATCH] - Optional method to purge the TLB on SN systems
>
> On Wed, Mar 28, 2007 at 08:46:44AM +0800, Zou Nan hai wrote:
> > On Wed, 2007-03-28 at 03:39, Jack Steiner wrote:
> >
> > > This patch adds an optional method for purging the TLB on SN IA64 systems.
> > > The change should not affect any non-SN system.
> > >
> > > Signed-off-by: Jack Steiner <steiner@sgi.com>
> > >
> > > ---
> > >
> > > +void
> > > +smp_flush_tlb_cpumask (cpumask_t xcpumask)
> > > +{
> > > + unsigned short counts[NR_CPUS];
> > > + cpumask_t cpumask = xcpumask;
> > > + int count, mycpu, cpu, flush_mycpu = 0;
> > > +
> > > + preempt_disable();
> > > + mycpu = smp_processor_id();
> > > +
> > > + for_each_cpu_mask(cpu, cpumask) {
> > > + counts[cpu] = per_cpu(local_flush_count, cpu);
> > > + mb();
> > > + if (cpu == mycpu)
> > > + flush_mycpu = 1;
> > > + else
> > > + smp_send_local_flush_tlb(cpu);
> > > + }
> > > +
> > > + if (flush_mycpu)
> > > + smp_local_flush_tlb();
> > > +
> > > + for_each_cpu_mask(cpu, cpumask) {
> > > + count = 0;
> > > + while(counts[cpu] == per_cpu(local_flush_count, cpu)) {
> >
> > Due to 64k offset of percpu data, the same percpu variable on different
> > CPUs are very likely to be on the same cacheline of some levels of
> > cache.
> >
> > So I think the operation on local_flush_count may be very cache
> > unfriendly...
>
> I was concerned about that, too, but testing finally convinced me that
> it was not an issue. I think the reason is that it takes a few hundred
> nanoseconds per cpu to send an IPI. So rather than a contended cache
> line, we have a line that is serially read by multiple cpus. Although
> contention can occur, typically multiple cpus are not trying to read
> the same line at the same time.
>
> For example (oversimplified), IPI sent to cpu 0 at time 0, to cpu 1 at
> time ~100, cpu 2 at time ~200, etc. The IPI requires a chipset access
> that takes order-of-memory-access time. Assume it takes N usec for a
> cpu to recognize the IPI & call the TLB flushing code. Cpu 0 reads
> local_flush_count at time N, cpu 1 reads local_flush_count at time
> 100+N, etc. Very little contention, just serial access.
>
> --
>
> I tried a second algorithm where the local_flush_count was kept in
> node-local percpu data. That scheme was significantly slower. Most
> likely because the cpu that initiates the flush will take N (# of
> cpus) cache misses to get an initial snapshot of the counts, then
> another N cache misses to check for completion. This assumes that
> a cpu doing a flush is not the most-recent cpu to do a flush.
> I believe this is typical.
>
> Keeping the counts in a single array (64cpus/cache line)
> significantly reduces the number of cache misses.
>
> Another disadvantage of keeping counts in per-cpu data is that
> scanning the counts trashes the TLB for large NR_CPUS. The counts will
> be located in different 16MB granules. Each reference to cpu's percpu
> data will require a different TLB entry to map the address used to
> reference the count. To scan N cpus, there will be ~2*N TLB misses
> plus at the end of the flush, the contents of the TLB are useless
> for most kernel or user use.
>
> --
>
> I tried a third algorithm where the counts were kept in a single array
> but each count was cacheline aligned to eliminate any possibility
> of contention. This was better than the second method that trashed
> the TLB. 1 TLB entry will cover the entire array. Unfortunately,
> this algorithm still incurs 2*N cache misses & is slower than
> the current algorithm.
>
>
> Does this explanation make sense... If anyone has an alternate
> algorithm, I'd be glad to try it.
Yes, putting the counts in a tight array could be better.
But your original patch is using the second algorithm?
Zou Nan hai
>
>
> -- jack
* Re: [PATCH] - Optional method to purge the TLB on SN systems
From: Jack Steiner @ 2007-03-28 3:26 UTC (permalink / raw)
To: linux-ia64
On Wed, Mar 28, 2007 at 11:03:50AM +0800, Zou, Nanhai wrote:
> > -----Original Message-----
> > From: Jack Steiner [mailto:steiner@sgi.com]
> > Sent: 2007-03-28 9:53
> > To: Zou, Nanhai
> > Cc: Luck, Tony; Linux-IA64
> > Subject: Re: [PATCH] - Optional method to purge the TLB on SN systems
> >
> > On Wed, Mar 28, 2007 at 08:46:44AM +0800, Zou Nan hai wrote:
> > > On Wed, 2007-03-28 at 03:39, Jack Steiner wrote:
> > >
> > > > This patch adds an optional method for purging the TLB on SN IA64 systems.
> > > > The change should not affect any non-SN system.
> > > >
> > > > Signed-off-by: Jack Steiner <steiner@sgi.com>
> > > >
> > > > ---
> > > >
> > > > +void
> > > > +smp_flush_tlb_cpumask (cpumask_t xcpumask)
> > > > +{
> > > > + unsigned short counts[NR_CPUS];
> > > > + cpumask_t cpumask = xcpumask;
> > > > + int count, mycpu, cpu, flush_mycpu = 0;
> > > > +
> > > > + preempt_disable();
> > > > + mycpu = smp_processor_id();
> > > > +
> > > > + for_each_cpu_mask(cpu, cpumask) {
> > > > + counts[cpu] = per_cpu(local_flush_count, cpu);
> > > > + mb();
> > > > + if (cpu == mycpu)
> > > > + flush_mycpu = 1;
> > > > + else
> > > > + smp_send_local_flush_tlb(cpu);
> > > > + }
> > > > +
> > > > + if (flush_mycpu)
> > > > + smp_local_flush_tlb();
> > > > +
> > > > + for_each_cpu_mask(cpu, cpumask) {
> > > > + count = 0;
> > > > + while(counts[cpu] == per_cpu(local_flush_count, cpu)) {
> > >
> > > Due to the 64k offset of percpu data, the same percpu variable on different
> > > CPUs is very likely to be on the same cacheline of some levels of
> > > cache.
> > >
> > > So I think the operation on local_flush_count may be very cache
> > > unfriendly...
> >
> > I was concerned about that, too, but testing finally convinced me that
> > it was not an issue. I think the reason is that it takes a few hundred
> > nanoseconds per cpu to send an IPI. So rather than a contended cache
> > line, we have a line that is serially read by multiple cpus. Although
> > contention can occur, typically multiple cpus are not trying to read
> > the same line at the same time.
> >
> > For example (oversimplified), IPI sent to cpu 0 at time 0, to cpu 1 at
> > time ~100, cpu 2 at time ~200, etc. The IPI requires a chipset access
> > that takes order-of-memory-access time. Assume it takes N usec for a
> > cpu to recognize the IPI & call the TLB flushing code. Cpu 0 reads
> > local_flush_count at time N, cpu 1 reads local_flush_count at time
> > 100+N, etc. Very little contention, just serial access.
> >
> > --
> >
> > I tried a second algorithm where the local_flush_count was kept in
> > node-local percpu data. That scheme was significantly slower. Most
> > likely because the cpu that initiates the flush will take N (# of
> > cpus) cache misses to get an initial snapshot of the counts, then
> > another N cache misses to check for completion. This assumes that
> > a cpu doing a flush is not the most-recent cpu to do a flush.
> > I believe this is typical.
> >
> > Keeping the counts in a single array (64cpus/cache line)
> > significantly reduces the number of cache misses.
>
> >
> > Another disadvantage of keeping counts in per-cpu data is that
> > scanning the counts trashes the TLB for large NR_CPUS. The counts will
> > be located in different 16MB granules. Each reference to cpu's percpu
> > data will require a different TLB entry to map the address used to
> > reference the count. To scan N cpus, there will be ~2*N TLB misses
> > plus at the end of the flush, the contents of the TLB are useless
> > for most kernel or user use.
> >
> > --
> >
> > I tried a third algorithm where the counts were kept in a single array
> > but each count was cacheline aligned to eliminate any possibility
> > of contention. This was better than the second method that trashed
> > the TLB. 1 TLB entry will cover the entire array. Unfortunately,
> > this algorithm still incurs 2*N cache misses & is slower than
> > the current algorithm.
> >
> >
> > Does this explanation make sense... If anyone has an alternate
> > algorithm, I'd be glad to try it.
>
> Yes, put count in a tight array could be better.
> But your original patch is using the second algorithm?
That's embarrassing.
I had several variants of the patch & did a lot of testing with each.
The only difference was in the "counts". Arrays, sizes, alignment,
percpu, etc. It looks like I grabbed the wrong patch.
I want to review my notes & possibly retest to make sure that what I
said was correct about the differences between the patches & the
performance of each.
Stay tuned & thanks for the careful review.
-- jack
* [PATCH] - Optional method to purge the TLB on SN systems
From: Jack Steiner @ 2007-04-05 21:39 UTC (permalink / raw)
To: linux-ia64
This patch adds an optional method for purging the TLB on SN IA64 systems.
The change should not affect any non-SN system.
Signed-off-by: Jack Steiner <steiner@sgi.com>
---
This version of the patch uses the algorithm that I _thought_ I
used in the original patch (thanks Zou). It also moves the
shadow array out of the stack as suggested by Tony.
Instead of using the chipset (SHUB) MMRs for issuing PTC flushes, the new
code sends IPIs to all affected nodes. Each node then issues local PTC flushes.
The purpose of this change is to work around performance issues that have
been seen on very large SSI systems. Small to medium size systems will continue
to use the original algorithm.
I would like to make the selection of the algorithm automatic but unfortunately
the optimum algorithm is workload specific. The intent is that only sites that
encounter problems will use the new algorithm.
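One note on the new layout before the diffs: with __attribute__((__aligned__(32))),
each count occupies 32 bytes, so one 128-byte ia64 cacheline (assumed line size)
holds 128/32 = 4 counts. That is the "4 entries/cacheline" tradeoff the comment
in the new code refers to, a middle ground between the fully packed and fully
padded layouts discussed earlier in the thread.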
Index: linux/arch/ia64/kernel/irq_ia64.c
===================================================================
--- linux.orig/arch/ia64/kernel/irq_ia64.c 2007-04-04 17:02:18.476865852 -0500
+++ linux/arch/ia64/kernel/irq_ia64.c 2007-04-04 17:03:51.960395197 -0500
@@ -39,6 +39,7 @@
#include <asm/machvec.h>
#include <asm/pgtable.h>
#include <asm/system.h>
+#include <asm/tlbflush.h>
#ifdef CONFIG_PERFMON
# include <asm/perfmon.h>
@@ -127,8 +128,10 @@ void destroy_irq(unsigned int irq)
#ifdef CONFIG_SMP
# define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE)
+# define IS_LOCAL_TLB_FLUSH(vec) (vec == IA64_IPI_LOCAL_TLB_FLUSH)
#else
# define IS_RESCHEDULE(vec) (0)
+# define IS_LOCAL_TLB_FLUSH(vec) (0)
#endif
/*
* That's where the IVT branches when we get an external
@@ -180,8 +183,11 @@ ia64_handle_irq (ia64_vector vector, str
saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
ia64_srlz_d();
while (vector != IA64_SPURIOUS_INT_VECTOR) {
- if (unlikely(IS_RESCHEDULE(vector)))
- kstat_this_cpu.irqs[vector]++;
+ if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
+ smp_local_flush_tlb();
+ kstat_this_cpu.irqs[vector]++;
+ } else if (unlikely(IS_RESCHEDULE(vector)))
+ kstat_this_cpu.irqs[vector]++;
else {
ia64_setreg(_IA64_REG_CR_TPR, vector);
ia64_srlz_d();
@@ -227,8 +233,11 @@ void ia64_process_pending_intr(void)
* Perform normal interrupt style processing
*/
while (vector != IA64_SPURIOUS_INT_VECTOR) {
- if (unlikely(IS_RESCHEDULE(vector)))
- kstat_this_cpu.irqs[vector]++;
+ if (unlikely(IS_LOCAL_TLB_FLUSH(vector))) {
+ smp_local_flush_tlb();
+ kstat_this_cpu.irqs[vector]++;
+ } else if (unlikely(IS_RESCHEDULE(vector)))
+ kstat_this_cpu.irqs[vector]++;
else {
struct pt_regs *old_regs = set_irq_regs(NULL);
@@ -260,12 +269,12 @@ void ia64_process_pending_intr(void)
#ifdef CONFIG_SMP
-extern irqreturn_t handle_IPI (int irq, void *dev_id);
static irqreturn_t dummy_handler (int irq, void *dev_id)
{
BUG();
}
+extern irqreturn_t handle_IPI (int irq, void *dev_id);
static struct irqaction ipi_irqaction = {
.handler = handle_IPI,
@@ -278,6 +287,13 @@ static struct irqaction resched_irqactio
.flags = IRQF_DISABLED,
.name = "resched"
};
+
+static struct irqaction tlb_irqaction = {
+ .handler = dummy_handler,
+ .flags = SA_INTERRUPT,
+ .name = "tlb_flush"
+};
+
#endif
void
@@ -303,6 +319,7 @@ init_IRQ (void)
#ifdef CONFIG_SMP
register_percpu_irq(IA64_IPI_VECTOR, &ipi_irqaction);
register_percpu_irq(IA64_IPI_RESCHEDULE, &resched_irqaction);
+ register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, &tlb_irqaction);
#endif
#ifdef CONFIG_PERFMON
pfm_init_percpu();
Index: linux/arch/ia64/kernel/smp.c
===================================================================
--- linux.orig/arch/ia64/kernel/smp.c 2007-04-04 17:02:18.476865852 -0500
+++ linux/arch/ia64/kernel/smp.c 2007-04-04 17:02:39.871505895 -0500
@@ -50,6 +50,18 @@
#include <asm/mca.h>
/*
+ * Note: alignment of 4 entries/cacheline was empirically determined
+ * to be a good tradeoff between hot cachelines & spreading the array
+ * across too many cachelines.
+ */
+static struct local_tlb_flush_counts {
+ unsigned int count;
+} __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS];
+
+static DEFINE_PER_CPU(unsigned int, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned;
+
+
+/*
* Structure and data for smp_call_function(). This is designed to minimise static memory
* requirements. It also looks cleaner.
*/
@@ -248,6 +260,62 @@ smp_send_reschedule (int cpu)
platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0);
}
+/*
+ * Called with preemption disabled.
+ */
+static void
+smp_send_local_flush_tlb (int cpu)
+{
+ platform_send_ipi(cpu, IA64_IPI_LOCAL_TLB_FLUSH, IA64_IPI_DM_INT, 0);
+}
+
+void
+smp_local_flush_tlb(void)
+{
+ /*
+ * Use atomic ops. Otherwise, the load/increment/store sequence from
+ * a "++" operation can have the line stolen between the load & store.
+ * The overhead of the atomic op is negligible in this case & offers
+ * significant benefit for the brief periods where lots of cpus
+ * are simultaneously flushing TLBs.
+ */
+ ia64_fetchadd(1, &local_tlb_flush_counts[smp_processor_id()].count, acq);
+ local_flush_tlb_all();
+}
+
+#define FLUSH_DELAY 5 /* Usec backoff to eliminate excessive cacheline bouncing */
+
+void
+smp_flush_tlb_cpumask(cpumask_t xcpumask)
+{
+ unsigned int *counts = __ia64_per_cpu_var(shadow_flush_counts);
+ cpumask_t cpumask = xcpumask;
+ int mycpu, cpu, flush_mycpu = 0;
+
+ preempt_disable();
+ mycpu = smp_processor_id();
+
+ for_each_cpu_mask(cpu, cpumask)
+ counts[cpu] = local_tlb_flush_counts[cpu].count;
+
+ mb();
+ for_each_cpu_mask(cpu, cpumask) {
+ if (cpu == mycpu)
+ flush_mycpu = 1;
+ else
+ smp_send_local_flush_tlb(cpu);
+ }
+
+ if (flush_mycpu)
+ smp_local_flush_tlb();
+
+ for_each_cpu_mask(cpu, cpumask)
+ while(counts[cpu] == local_tlb_flush_counts[cpu].count)
+ udelay(FLUSH_DELAY);
+
+ preempt_enable();
+}
+
void
smp_flush_tlb_all (void)
{
Index: linux/arch/ia64/sn/kernel/sn2/sn2_smp.c
===================================================================
--- linux.orig/arch/ia64/sn/kernel/sn2/sn2_smp.c 2007-04-04 17:02:18.476865852 -0500
+++ linux/arch/ia64/sn/kernel/sn2/sn2_smp.c 2007-04-05 14:46:06.733549011 -0500
@@ -46,6 +46,9 @@ DECLARE_PER_CPU(struct ptc_stats, ptcsta
static __cacheline_aligned DEFINE_SPINLOCK(sn2_global_ptc_lock);
+/* 0 = old algorithm (no IPI flushes), 1 = ipi deadlock flush, 2 = ipi instead of SHUB ptc, >2 = always ipi */
+static int sn2_flush_opt = 0;
+
extern unsigned long
sn2_ptc_deadlock_recovery_core(volatile unsigned long *, unsigned long,
volatile unsigned long *, unsigned long,
@@ -76,6 +79,8 @@ struct ptc_stats {
unsigned long shub_itc_clocks;
unsigned long shub_itc_clocks_max;
unsigned long shub_ptc_flushes_not_my_mm;
+ unsigned long shub_ipi_flushes;
+ unsigned long shub_ipi_flushes_itc_clocks;
};
#define sn2_ptctest 0
@@ -121,6 +126,18 @@ void sn_tlb_migrate_finish(struct mm_str
flush_tlb_mm(mm);
}
+static void
+sn2_ipi_flush_all_tlb(struct mm_struct *mm)
+{
+ unsigned long itc;
+
+ itc = ia64_get_itc();
+ smp_flush_tlb_cpumask(mm->cpu_vm_mask);
+ itc = ia64_get_itc() - itc;
+ __get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc;
+ __get_cpu_var(ptcstats).shub_ipi_flushes++;
+}
+
/**
* sn2_global_tlb_purge - globally purge translation cache of virtual address range
* @mm: mm_struct containing virtual address range
@@ -154,7 +171,12 @@ sn2_global_tlb_purge(struct mm_struct *m
unsigned long itc, itc2, flags, data0 = 0, data1 = 0, rr_value, old_rr = 0;
short nasids[MAX_NUMNODES], nix;
nodemask_t nodes_flushed;
- int active, max_active, deadlock;
+ int active, max_active, deadlock, flush_opt = sn2_flush_opt;
+
+ if (flush_opt > 2) {
+ sn2_ipi_flush_all_tlb(mm);
+ return;
+ }
nodes_clear(nodes_flushed);
i = 0;
@@ -189,6 +211,12 @@ sn2_global_tlb_purge(struct mm_struct *m
return;
}
+ if (flush_opt == 2) {
+ sn2_ipi_flush_all_tlb(mm);
+ preempt_enable();
+ return;
+ }
+
itc = ia64_get_itc();
nix = 0;
for_each_node_mask(cnode, nodes_flushed)
@@ -256,6 +284,8 @@ sn2_global_tlb_purge(struct mm_struct *m
}
if (active >= max_active || i == (nix - 1)) {
if ((deadlock = wait_piowc())) {
+ if (flush_opt == 1)
+ goto done;
sn2_ptc_deadlock_recovery(nasids, ibegin, i, mynasid, ptc0, data0, ptc1, data1);
if (reset_max_active_on_deadlock())
max_active = 1;
@@ -267,6 +297,7 @@ sn2_global_tlb_purge(struct mm_struct *m
start += (1UL << nbits);
} while (start < end);
+done:
itc2 = ia64_get_itc() - itc2;
__get_cpu_var(ptcstats).shub_itc_clocks += itc2;
if (itc2 > __get_cpu_var(ptcstats).shub_itc_clocks_max)
@@ -279,6 +310,11 @@ sn2_global_tlb_purge(struct mm_struct *m
spin_unlock_irqrestore(PTC_LOCK(shub1), flags);
+ if (flush_opt == 1 && deadlock) {
+ __get_cpu_var(ptcstats).deadlocks++;
+ sn2_ipi_flush_all_tlb(mm);
+ }
+
preempt_enable();
}
@@ -425,24 +461,42 @@ static int sn2_ptc_seq_show(struct seq_f
if (!cpu) {
seq_printf(file,
- "# cpu ptc_l newrid ptc_flushes nodes_flushed deadlocks lock_nsec shub_nsec shub_nsec_max not_my_mm deadlock2\n");
- seq_printf(file, "# ptctest %d\n", sn2_ptctest);
+ "# cpu ptc_l newrid ptc_flushes nodes_flushed deadlocks lock_nsec shub_nsec shub_nsec_max not_my_mm deadlock2 ipi_fluches ipi_nsec\n");
+ seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt);
}
if (cpu < NR_CPUS && cpu_online(cpu)) {
stat = &per_cpu(ptcstats, cpu);
- seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
+ seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed,
stat->deadlocks,
1000 * stat->lock_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
1000 * stat->shub_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec,
1000 * stat->shub_itc_clocks_max / per_cpu(cpu_info, cpu).cyc_per_usec,
stat->shub_ptc_flushes_not_my_mm,
- stat->deadlocks2);
+ stat->deadlocks2,
+ stat->shub_ipi_flushes,
+ 1000 * stat->shub_ipi_flushes_itc_clocks / per_cpu(cpu_info, cpu).cyc_per_usec);
}
return 0;
}
+static ssize_t sn2_ptc_proc_write(struct file *file, const char __user *user, size_t count, loff_t *data)
+{
+ int cpu;
+ char optstr[64];
+
+ if (copy_from_user(optstr, user, count))
+ return -EFAULT;
+ optstr[count - 1] = '\0';
+ sn2_flush_opt = simple_strtoul(optstr, NULL, 0);
+
+ for_each_online_cpu(cpu)
+ memset(&per_cpu(ptcstats, cpu), 0, sizeof(struct ptc_stats));
+
+ return count;
+}
+
static struct seq_operations sn2_ptc_seq_ops = {
.start = sn2_ptc_seq_start,
.next = sn2_ptc_seq_next,
@@ -458,6 +512,7 @@ static int sn2_ptc_proc_open(struct inod
static const struct file_operations proc_sn2_ptc_operations = {
.open = sn2_ptc_proc_open,
.read = seq_read,
+ .write = sn2_ptc_proc_write,
.llseek = seq_lseek,
.release = seq_release,
};
Index: linux/include/asm-ia64/hw_irq.h
===================================================================
--- linux.orig/include/asm-ia64/hw_irq.h 2007-04-04 17:02:18.476865852 -0500
+++ linux/include/asm-ia64/hw_irq.h 2007-04-04 17:02:39.887507868 -0500
@@ -66,6 +66,7 @@ extern int ia64_last_device_vector;
#define IA64_PERFMON_VECTOR 0xee /* performanc monitor interrupt vector */
#define IA64_TIMER_VECTOR 0xef /* use highest-prio group 15 interrupt for timer */
#define IA64_MCA_WAKEUP_VECTOR 0xf0 /* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */
+#define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP flush local TLB */
#define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */
#define IA64_IPI_VECTOR 0xfe /* inter-processor interrupt vector */
Index: linux/include/asm-ia64/tlbflush.h
===================================================================
--- linux.orig/include/asm-ia64/tlbflush.h 2007-04-04 17:02:18.476865852 -0500
+++ linux/include/asm-ia64/tlbflush.h 2007-04-04 17:02:39.887507868 -0500
@@ -27,9 +27,11 @@ extern void local_flush_tlb_all (void);
#ifdef CONFIG_SMP
extern void smp_flush_tlb_all (void);
extern void smp_flush_tlb_mm (struct mm_struct *mm);
+ extern void smp_flush_tlb_cpumask (cpumask_t xcpumask);
# define flush_tlb_all() smp_flush_tlb_all()
#else
# define flush_tlb_all() local_flush_tlb_all()
+# define smp_flush_tlb_cpumask() local_flush_tlb_all()
#endif
static inline void
@@ -94,6 +96,15 @@ flush_tlb_pgtables (struct mm_struct *mm
*/
}
+/*
+ * Flush the local TLB. Invoked from another cpu using an IPI.
+ */
+#ifdef CONFIG_SMP
+void smp_local_flush_tlb(void);
+#else
+#define smp_local_flush_tlb()
+#endif
+
#define flush_tlb_kernel_range(start, end) flush_tlb_all() /* XXX fix me */
#endif /* _ASM_IA64_TLBFLUSH_H */