public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
* ia64 get_mmu_context patch
@ 2005-10-27 17:28 Peter Keilty
  2005-10-28  2:54 ` Chen, Kenneth W
                   ` (9 more replies)
  0 siblings, 10 replies; 11+ messages in thread
From: Peter Keilty @ 2005-10-27 17:28 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 129 bytes --]

 
 
Gentleman,

Please find attached the IA64 context_id patch and supporting data for your
review and consideration.

Regards,
pete

[-- Attachment #2: ctx_bitmap.patch3 --]
[-- Type: text/plain, Size: 5011 bytes --]



Signed-off-by:

--- linux-2.6.14-rc3/include/asm-ia64/mmu_context.h	2005-10-26 10:54:53.000000000 -0400
+++ linux-2.6.14-rc3pmk/include/asm-ia64/mmu_context.h	2005-10-06 16:39:27.000000000 -0400
@@ -34,11 +34,13 @@
 	unsigned int next;	/* next context number to use */
 	unsigned int limit;	/* next >= limit => must call wrap_mmu_context() */
 	unsigned int max_ctx;	/* max. context value supported by all CPUs */
+	unsigned long *bitmap;	/* bitmap size is max_ctx+1 */
 };
 
 extern struct ia64_ctx ia64_ctx;
 DECLARE_PER_CPU(u8, ia64_need_tlb_flush);
 
+extern void mmu_context_init (void);
 extern void wrap_mmu_context (struct mm_struct *mm);
 
 static inline void
@@ -86,6 +88,7 @@
 				if (ia64_ctx.next >= ia64_ctx.limit)
 					wrap_mmu_context(mm);
 				mm->context = context = ia64_ctx.next++;
+	 			set_bit(context, ia64_ctx.bitmap);
 			}
 		}
 		spin_unlock_irqrestore(&ia64_ctx.lock, flags);
--- linux-2.6.14-rc3/include/asm-ia64/tlbflush.h	2005-08-28 19:41:01.000000000 -0400
+++ linux-2.6.14-rc3pmk/include/asm-ia64/tlbflush.h	2005-10-06 16:15:47.000000000 -0400
@@ -51,6 +51,7 @@
 	if (!mm)
 		return;
 
+	clear_bit(mm->context, ia64_ctx.bitmap);
 	mm->context = 0;
 
 	if (atomic_read(&mm->mm_users) == 0)
--- linux-2.6.14-rc3/arch/ia64/mm/tlb.c	2005-08-28 19:41:01.000000000 -0400
+++ linux-2.6.14-rc3pmk/arch/ia64/mm/tlb.c	2005-10-25 14:55:25.000000000 -0400
@@ -16,12 +16,14 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>
 
 #include <asm/delay.h>
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 #include <asm/pal.h>
 #include <asm/tlbflush.h>
+#include <asm/dma.h>
 
 static struct {
 	unsigned long mask;	/* mask of supported purge page-sizes */
@@ -38,13 +40,32 @@
 DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
 
 /*
+ * Initializes the ia64_ctx.bitmap array based on max_ctx+1.
+ * Called after cpu_init() has setup ia64_ctx.max_ctx based on
+ * maximum RID that is supported by all CPUs.
+ */
+void __init
+mmu_context_init (void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ia64_ctx.lock, flags);
+	if (ia64_ctx.bitmap == NULL) {
+		ia64_ctx.bitmap = (unsigned long *)__alloc_bootmem(
+							(ia64_ctx.max_ctx+1)>>3,
+							PAGE_SIZE,
+							__pa(MAX_DMA_ADDRESS));
+	}
+	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
+}
+
+/*
  * Acquire the ia64_ctx.lock before calling this function!
  */
 void
 wrap_mmu_context (struct mm_struct *mm)
 {
-	unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
-	struct task_struct *tsk;
+	unsigned int next_ctx, max_ctx = ia64_ctx.max_ctx;
 	int i;
 
 	if (ia64_ctx.next > max_ctx)
@@ -52,28 +73,23 @@
 	ia64_ctx.limit = max_ctx + 1;
 
 	/*
-	 * Scan all the task's mm->context and set proper safe range
+	 * Scan the ia64_ctx bitmap and set proper safe range
 	 */
+repeat:
+	next_ctx = find_next_zero_bit(ia64_ctx.bitmap, ia64_ctx.limit, ia64_ctx.next);
+	if (next_ctx >= ia64_ctx.limit) {
+		smp_mb();
+		ia64_ctx.next = 300;	/* skip daemons */
+		goto repeat;
+	}
+	ia64_ctx.next = next_ctx;
 
-	read_lock(&tasklist_lock);
-  repeat:
-	for_each_process(tsk) {
-		if (!tsk->mm)
-			continue;
-		tsk_context = tsk->mm->context;
-		if (tsk_context == ia64_ctx.next) {
-			if (++ia64_ctx.next >= ia64_ctx.limit) {
-				/* empty range: reset the range limit and start over */
-				if (ia64_ctx.next > max_ctx)
-					ia64_ctx.next = 300;
-				ia64_ctx.limit = max_ctx + 1;
-				goto repeat;
-			}
-		}
-		if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
-			ia64_ctx.limit = tsk_context;
+	next_ctx = find_next_bit(ia64_ctx.bitmap, ia64_ctx.limit, ia64_ctx.next);
+	if (next_ctx >= ia64_ctx.limit) {
+		next_ctx = ia64_ctx.limit;
 	}
-	read_unlock(&tasklist_lock);
+	ia64_ctx.limit = next_ctx;
+
 	/* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */
 	{
 		int cpu = get_cpu(); /* prevent preemption/migration */
--- linux-2.6.14-rc3/arch/ia64/kernel/setup.c	2005-10-26 10:54:06.000000000 -0400
+++ linux-2.6.14-rc3pmk/arch/ia64/kernel/setup.c	2005-10-25 14:53:59.000000000 -0400
@@ -419,6 +419,7 @@
 #endif
 
 	cpu_init();	/* initialize the bootstrap CPU */
+	mmu_context_init();	/* initialize context_id bitmap */
 
 #ifdef CONFIG_ACPI
 	acpi_boot_init();
@@ -798,9 +799,13 @@
 #endif
 
 	/* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */
-	if (ia64_pal_vm_summary(NULL, &vmi) == 0)
+	if (ia64_pal_vm_summary(NULL, &vmi) == 0) {
 		max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1;
-	else {
+		if (max_ctx > (1U << 21)) {
+			max_ctx = (1U << 21) - 1;	/* limit to 2^21  */
+			printk(KERN_WARNING "cpu_init: max_ctx limited to 21 RID bits for bitmap size\n");
+		}
+	} else {
 		printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n");
 		max_ctx = (1U << 15) - 1;	/* use architected minimum */
 	}

[-- Attachment #3: Type: message/rfc822, Size: 39858 bytes --]

[-- Attachment #3.1.1: Type: text/plain, Size: 3967 bytes --]

 

Hi Tony,

I have attached a patch for an issue with get_mmu_context() seen with
RHEL 2.6.9-15 base code.
I would like you to take a look, review, and comment on its merits.
Bob Picco has reviewed the patch prior to me sending it to you.
 
Data on problem:

Here are the results of modifying the code to use a bitmap for looking up a new
context_id.

Updated png showing AIM7 shared run with before, after and clm data.   

Lockstat Data:
There are 4 sets of lockstat data, one each for loads of 40K, 30K, 20K and
40K with no fork test. The lockstat data shows that as loading increases the
lock contention on the task lock with wrap_mmu_context and higher
utilization of the ia64_ctx lock and the ia64_global_tlb_purge lock. 

get_mmu_context() is called to get a new process context id number to
uniquely identify its address space and TLB entries. If the limit is
reached, wrap_mmu_context() is called to reset and flush the TLBs.
 
wrap_mmu_context() uses ia64_ctx.max_ctx, which is set based on the ia64 arch.
rid size minus the 3 bits used for the region number. For Itanium the size is 24
bits, so there are 21 bits used as an increment number. This number and the
region register number (3 bits) are used as the process address space number
and are used in the TLB to identify entries associated with this process. So
about 2^21 new processes can run before context_id wrapping occurs and the
system's TLBs are flushed.

The difference in the number of times the ia64_ctx  was acquired between
running the fork test and not at 40K load is: 
Lock                         Fork              No Fork
ia64_ctx	                   9,111,760        1,162,555
ia64_global_tlb_purge        101,371,392       33,888,711

Read Task Lock per second for wrap_mmu_context()
Load       Locks/Sec  Contention
20,000         5.8           20%
30,000        10.2           31%
40,000        14.6           39%
40,000          .0001         0%  No Fork or Exec test

Notice the utilization percent for the ia64_ctx and tasklist_lock locks for
the 4 runs.
20K    10%
30K    27%
40K    54%
40K     1%  nofork  
This utilization is based on the number of cycles the lock was busy for the
measurement period.  

Modified the number of bits used in the region id reg for context increment
from 21 to 20 bits. This would cause wrap_mmu_context() to be done sooner
than at 2^21.

Aim7 shared with fork:
Load  21bits   20bits
20K    66K      58K  jobs/min
30K    58K      38K  jobs/min
40K    44K      25K  jobs/min
So now a load of 20K acts like 30K at 21bits and a load of 30K acts like 40K
at 21bits.

The cost of having to search the entire task_list to find a free context
number increases with the number of processes in the tests. The following are
the measured times for the wrap_mmu_context() function.

Before:
Aim7 shared with fork/exec tests time spent in wrap_mmu_context() walking
task loop.
Load  Jobs/min    Calls  Maxprocs  Cumulative Time  Time/call  Total Run
Time
30K      53279    33828     38198        864.0 sec    25.5 ms     62 min   
40K      42378    79840     52496       2667.4 sec    33.4 ms    108 min
50K      31955   141000     75472       6011.5 sec    42.6 ms    180 min

Modified patch:
Aim7 shared with fork/exec tests time spent in wrap_mmu_context() using 128K
bitmap.
Load  Jobs/min     Calls  Maxprocs  Cumulative Time  Time/call  Total Run
Time      
30K	   61350   1764278     na	     .312 sec      177 ns      48
min    
40K	   60858   3203092     na	     .561 sec	 175 ns      64 min
50K	   60826   5110677     na	     .887 sec      174 ns      80 min

Making the bitmap 512K reduced the tlb flush substantially but the jobs/min
only increased by about 100-150. The 512k size is 1/4 of the max it could
be, 2^21. The patch uses the total 2^21 size.

Lockstat data shows 0% contention of the rwlock task_list in
wrap_mmu_context() and the utilization goes from 54% to 1.6%.
The ia64_ctx lock utilization goes from 54% to 0.1% and the contention down
to 0.3%.

Regards,
pete

[-- Attachment #3.1.2: my-ctx_bitmap.patch2 --]
[-- Type: application/octet-stream, Size: 4521 bytes --]

--- linux-2.6.9_base/include/asm-ia64/mmu_context.h	2005-09-02 14:56:31.000000000 -0400
+++ linux-2.6.9/include/asm-ia64/mmu_context.h	2005-10-05 10:10:36.000000000 -0400
@@ -33,11 +33,13 @@
 	unsigned int next;	/* next context number to use */
 	unsigned int limit;	/* next >= limit => must call wrap_mmu_context() */
 	unsigned int max_ctx;	/* max. context value supported by all CPUs */
+	unsigned long *bitmap;	/* bitmap size is max_ctx+1 */
 };
 
 extern struct ia64_ctx ia64_ctx;
 DECLARE_PER_CPU(u8, ia64_need_tlb_flush);
 
+extern void mmu_context_init (void);
 extern void wrap_mmu_context (struct mm_struct *mm);
 
 static inline void
@@ -80,6 +82,7 @@
 			if (ia64_ctx.next >= ia64_ctx.limit)
 				wrap_mmu_context(mm);
 			mm->context = context = ia64_ctx.next++;
+			set_bit(context, ia64_ctx.bitmap);
 		}
 	}
 	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
--- linux-2.6.9_base/include/asm-ia64/tlbflush.h	2005-09-02 14:56:31.000000000 -0400
+++ linux-2.6.9/include/asm-ia64/tlbflush.h	2005-09-30 11:11:11.000000000 -0400
@@ -51,6 +51,7 @@
 	if (!mm)
 		return;
 
+	clear_bit(mm->context, ia64_ctx.bitmap);
 	mm->context = 0;
 
 	if (atomic_read(&mm->mm_users) == 0)
--- linux-2.6.9_base/arch/ia64/mm/tlb.c	2005-09-02 14:56:53.000000000 -0400
+++ linux-2.6.9/arch/ia64/mm/tlb.c	2005-10-05 10:10:48.000000000 -0400
@@ -16,12 +16,14 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>
 
 #include <asm/delay.h>
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 #include <asm/pal.h>
 #include <asm/tlbflush.h>
+#include <asm/dma.h>
 
 static struct {
 	unsigned long mask;	/* mask of supported purge page-sizes */
@@ -32,19 +34,39 @@
 	.lock =		SPIN_LOCK_UNLOCKED,
 	.next =		1,
 	.limit =	(1 << 15) - 1,		/* start out with the safe (architected) limit */
-	.max_ctx =	~0U
+	.max_ctx =	~0U,
+	.bitmap = 	NULL
 };
 
 DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
 
 /*
+ * Initializes the ia64_ctx.bitmap array based on max_ctx+1.
+ * Called after cpu_init() has setup ia64_ctx.max_ctx based on
+ * maximum RID that is supported by all CPUs.
+ */
+void __init
+mmu_context_init (void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ia64_ctx.lock, flags);
+	if (ia64_ctx.bitmap == NULL) {
+		ia64_ctx.bitmap = (unsigned long *)__alloc_bootmem(
+							ia64_ctx.max_ctx+1,
+							PAGE_SIZE,
+							__pa(MAX_DMA_ADDRESS));
+	}
+	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
+}
+
+/*
  * Acquire the ia64_ctx.lock before calling this function!
  */
 void
 wrap_mmu_context (struct mm_struct *mm)
 {
-	unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
-	struct task_struct *tsk;
+	unsigned int next_ctx, max_ctx = ia64_ctx.max_ctx;
 	int i;
 
 	if (ia64_ctx.next > max_ctx)
@@ -52,28 +74,23 @@
 	ia64_ctx.limit = max_ctx + 1;
 
 	/*
-	 * Scan all the task's mm->context and set proper safe range
+	 * Scan the ia64_ctx bitmap and set proper safe range
 	 */
+repeat:
+	next_ctx = find_next_zero_bit(ia64_ctx.bitmap, ia64_ctx.limit, ia64_ctx.next);
+	if (next_ctx >= ia64_ctx.limit) {
+		smp_mb();
+		ia64_ctx.next = 300;	/* skip daemons */
+		goto repeat;
+	}
+	ia64_ctx.next = next_ctx;
 
-	read_lock(&tasklist_lock);
-  repeat:
-	for_each_process(tsk) {
-		if (!tsk->mm)
-			continue;
-		tsk_context = tsk->mm->context;
-		if (tsk_context == ia64_ctx.next) {
-			if (++ia64_ctx.next >= ia64_ctx.limit) {
-				/* empty range: reset the range limit and start over */
-				if (ia64_ctx.next > max_ctx)
-					ia64_ctx.next = 300;
-				ia64_ctx.limit = max_ctx + 1;
-				goto repeat;
-			}
-		}
-		if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
-			ia64_ctx.limit = tsk_context;
+	next_ctx = find_next_bit(ia64_ctx.bitmap, ia64_ctx.limit, ia64_ctx.next);
+	if (next_ctx >= ia64_ctx.limit) {
+		next_ctx = ia64_ctx.limit;
 	}
-	read_unlock(&tasklist_lock);
+	ia64_ctx.limit = next_ctx;
+
 	/* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */
 	{
 		int cpu = get_cpu(); /* prevent preemption/migration */
--- linux-2.6.9_base/arch/ia64/kernel/setup.c	2005-09-02 14:56:49.000000000 -0400
+++ linux-2.6.9/arch/ia64/kernel/setup.c	2005-09-30 11:07:46.000000000 -0400
@@ -345,6 +345,7 @@
 #endif
 
 	cpu_init();	/* initialize the bootstrap CPU */
+	mmu_context_init();	/* initialize context_id bitmap */
 
 #ifdef CONFIG_ACPI_BOOT
 	acpi_boot_init();

[-- Attachment #3.1.3: shared_ctx_bitmap_2meg.png --]
[-- Type: image/png, Size: 9383 bytes --]

[-- Attachment #3.1.4: shared_ctx_bitmap_all.png --]
[-- Type: image/png, Size: 12899 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2005-10-28 18:49 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-10-27 17:28 ia64 get_mmu_context patch Peter Keilty
2005-10-28  2:54 ` Chen, Kenneth W
2005-10-28  3:09 ` Chen, Kenneth W
2005-10-28  3:23 ` Chen, Kenneth W
2005-10-28 14:49 ` Peter Keilty
2005-10-28 14:50 ` Peter Keilty
2005-10-28 17:56 ` Chen, Kenneth W
2005-10-28 17:59 ` Chen, Kenneth W
2005-10-28 18:06 ` Chen, Kenneth W
2005-10-28 18:40 ` Chen, Kenneth W
2005-10-28 18:49 ` Peter Keilty

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox