From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jack Steiner Date: Wed, 26 Nov 2003 15:43:01 +0000 Subject: TLB flushing on SGI platforms Message-Id: List-Id: MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit To: linux-ia64@vger.kernel.org The SGI NUMA platform does not use the hardware "ptc" instruction to flush TLBs. Instead, it has to write an MMR on the chipset on each node to cause a TLB flush transaction to be placed on the bus. On large systems, the overhead to broadcast the TLB flush to every node in the system is one of the hot spots in the kernel. In most cases, the TLB context being flushed has been loaded into a small subset of the nodes. Flushing every node is unnecessary. I'm looking for suggestions on the best way to limit TLB flushing so that only the necessary nodes are flushed. Here is a patch that I believe will work. I added a bitmask to the mm_context_t to track nodes where the context has been loaded. The TLB flush routine issues the TLB flush requests only to these nodes. Are there other/better ways that I can do this?? 
------------------------------------------------------------------------------ diff -Naur linux_base/arch/ia64/mm/tlb.c linux/arch/ia64/mm/tlb.c --- linux_base/arch/ia64/mm/tlb.c Tue Nov 25 10:03:46 2003 +++ linux/arch/ia64/mm/tlb.c Tue Nov 25 10:41:25 2003 @@ -59,7 +59,7 @@ for_each_process(tsk) { if (!tsk->mm) continue; - tsk_context = tsk->mm->context; + tsk_context = tsk->mm->context.ctx; if (tsk_context = ia64_ctx.next) { if (++ia64_ctx.next >= ia64_ctx.limit) { /* empty range: reset the range limit and start over */ diff -Naur linux_base/arch/ia64/sn/kernel/sn2/sn2_smp.c linux/arch/ia64/sn/kernel/sn2/sn2_smp.c --- linux_base/arch/ia64/sn/kernel/sn2/sn2_smp.c Tue Nov 25 10:03:46 2003 +++ linux/arch/ia64/sn/kernel/sn2/sn2_smp.c Tue Nov 25 10:42:34 2003 @@ -98,6 +99,7 @@ int cnode, mycnode, nasid, flushed=0; volatile unsigned long *ptc0, *ptc1; unsigned long flags=0, data0, data1; + struct mm_struct *mm=current->active_mm; data0 = (1UL<context.node_history, numnodes); cnode < numnodes; + cnode=find_next_bit(&mm->context.node_history, numnodes, ++cnode)) { if (cnode = mycnode) { asm volatile ("ptc.ga %0,%1;;srlz.i;;" :: "r"(start), "r"(nbits<<2) : "memory"); } else { diff -Naur linux_base/include/asm-ia64/mmu.h linux/include/asm-ia64/mmu.h --- linux_base/include/asm-ia64/mmu.h Tue Nov 25 10:03:47 2003 +++ linux/include/asm-ia64/mmu.h Tue Nov 25 10:45:19 2003 @@ -1,11 +1,20 @@ #ifndef __MMU_H #define __MMU_H +#ifdef CONFIG_NUMA +#include +#endif + /* * Type for a context number. We declare it volatile to ensure proper ordering when it's * accessed outside of spinlock'd critical sections (e.g., as done in activate_mm() and * init_new_context()). 
*/ -typedef volatile unsigned long mm_context_t; +typedef struct { + volatile unsigned long ctx; +#ifdef CONFIG_NUMA + cpumask_t node_history; /* ZZZ change to nodemask_t when avail */ +#endif +} mm_context_t; #endif diff -Naur linux_base/include/asm-ia64/mmu_context.h linux/include/asm-ia64/mmu_context.h --- linux_base/include/asm-ia64/mmu_context.h Tue Nov 25 10:03:47 2003 +++ linux/include/asm-ia64/mmu_context.h Tue Nov 25 11:03:41 2003 @@ -75,6 +75,12 @@ { } +static inline void +clear_mm_context(struct mm_struct *mm) +{ + memset(&mm->context, 0, sizeof(mm->context)); +} + /* * When the context counter wraps around all TLBs need to be flushed because an old * context number might have been reused. This is signalled by the ia64_need_tlb_flush @@ -92,26 +98,27 @@ } } -static inline mm_context_t +static inline unsigned long get_mmu_context (struct mm_struct *mm) { - mm_context_t context = mm->context; + mm_context_t *context = &mm->context; + unsigned long ctx = context->ctx; - if (context) - return context; + if (ctx) + return ctx; spin_lock(&ia64_ctx.lock); { /* re-check, now that we've got the lock: */ - context = mm->context; - if (context = 0) { + ctx = context->ctx; + if (ctx = 0) { if (ia64_ctx.next >= ia64_ctx.limit) wrap_mmu_context(mm); - mm->context = context = ia64_ctx.next++; + context->ctx = ctx = ia64_ctx.next++; } } spin_unlock(&ia64_ctx.lock); - return context; + return ctx; } /* @@ -122,7 +129,7 @@ init_new_context (struct task_struct *p, struct mm_struct *mm) { MMU_TRACE('N', smp_processor_id(), mm, 0); - mm->context = 0; + clear_mm_context(mm); return 0; } @@ -134,7 +141,7 @@ } static inline void -reload_context (mm_context_t context) +reload_context (unsigned int context) { unsigned long rid; unsigned long rid_incr = 0; @@ -164,15 +171,18 @@ static inline void activate_context (struct mm_struct *mm) { - mm_context_t context; + unsigned long ctx; +#ifdef CONFIG_NUMA + set_bit(numa_node_id(), &mm->context.node_history); +#endif do { - context = 
get_mmu_context(mm); + ctx = get_mmu_context(mm); MMU_TRACE('A', smp_processor_id(), mm, context); - reload_context(context); + reload_context(ctx); MMU_TRACE('a', smp_processor_id(), mm, context); /* in the unlikely event of a TLB-flush by another thread, redo the load: */ - } while (unlikely(context != mm->context)); + } while (unlikely(ctx != mm->context.ctx)); } #define deactivate_mm(tsk,mm) \ diff -Naur linux_base/include/asm-ia64/tlbflush.h linux/include/asm-ia64/tlbflush.h --- linux_base/include/asm-ia64/tlbflush.h Tue Nov 25 10:03:47 2003 +++ linux/include/asm-ia64/tlbflush.h Tue Nov 25 10:47:48 2003 @@ -52,7 +52,7 @@ if (!mm) goto out; - mm->context = 0; + clear_mm_context(mm); if (atomic_read(&mm->mm_users) = 0) goto out; /* happens as a result of exit_mmap() */ diff -Naur linux_base/mm/memory.c linux/mm/memory.c --- linux_base/mm/memory.c Tue Nov 25 10:03:50 2003 +++ linux/mm/memory.c Tue Nov 25 10:55:00 2003 @@ -572,9 +572,10 @@ if ((long)zap_bytes > 0) continue; if (need_resched()) { + int fullmm = (*tlbp)->fullmm; tlb_finish_mmu(*tlbp, tlb_start, start); cond_resched_lock(&mm->page_table_lock); - *tlbp = tlb_gather_mmu(mm, 0); + *tlbp = tlb_gather_mmu(mm, fullmm); tlb_start_valid = 0; } zap_bytes = ZAP_BLOCK_SIZE; -- Thanks Jack Steiner (steiner@sgi.com) 651-683-5302 Principal Engineer SGI - Silicon Graphics, Inc.