public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
* ia64 get_mmu_context patch
@ 2005-10-27 17:28 Peter Keilty
  2005-10-28  2:54 ` Chen, Kenneth W
                   ` (9 more replies)
  0 siblings, 10 replies; 11+ messages in thread
From: Peter Keilty @ 2005-10-27 17:28 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 129 bytes --]

 
 
Gentleman,

Please find attached the IA64 context_id patch and supporting data for your
review and consideration.

Regards,
pete

[-- Attachment #2: ctx_bitmap.patch3 --]
[-- Type: text/plain, Size: 5011 bytes --]



Signed-off-by:

--- linux-2.6.14-rc3/include/asm-ia64/mmu_context.h	2005-10-26 10:54:53.000000000 -0400
+++ linux-2.6.14-rc3pmk/include/asm-ia64/mmu_context.h	2005-10-06 16:39:27.000000000 -0400
@@ -34,11 +34,13 @@
 	unsigned int next;	/* next context number to use */
 	unsigned int limit;	/* next >= limit => must call wrap_mmu_context() */
 	unsigned int max_ctx;	/* max. context value supported by all CPUs */
+	unsigned long *bitmap;	/* bitmap size is max_ctx+1 */
 };
 
 extern struct ia64_ctx ia64_ctx;
 DECLARE_PER_CPU(u8, ia64_need_tlb_flush);
 
+extern void mmu_context_init (void);
 extern void wrap_mmu_context (struct mm_struct *mm);
 
 static inline void
@@ -86,6 +88,7 @@
 				if (ia64_ctx.next >= ia64_ctx.limit)
 					wrap_mmu_context(mm);
 				mm->context = context = ia64_ctx.next++;
+	 			set_bit(context, ia64_ctx.bitmap);
 			}
 		}
 		spin_unlock_irqrestore(&ia64_ctx.lock, flags);
--- linux-2.6.14-rc3/include/asm-ia64/tlbflush.h	2005-08-28 19:41:01.000000000 -0400
+++ linux-2.6.14-rc3pmk/include/asm-ia64/tlbflush.h	2005-10-06 16:15:47.000000000 -0400
@@ -51,6 +51,7 @@
 	if (!mm)
 		return;
 
+	clear_bit(mm->context, ia64_ctx.bitmap);
 	mm->context = 0;
 
 	if (atomic_read(&mm->mm_users) == 0)
--- linux-2.6.14-rc3/arch/ia64/mm/tlb.c	2005-08-28 19:41:01.000000000 -0400
+++ linux-2.6.14-rc3pmk/arch/ia64/mm/tlb.c	2005-10-25 14:55:25.000000000 -0400
@@ -16,12 +16,14 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>
 
 #include <asm/delay.h>
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 #include <asm/pal.h>
 #include <asm/tlbflush.h>
+#include <asm/dma.h>
 
 static struct {
 	unsigned long mask;	/* mask of supported purge page-sizes */
@@ -38,13 +40,32 @@
 DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
 
 /*
+ * Initializes the ia64_ctx.bitmap array based on max_ctx+1.
+ * Called after cpu_init() has setup ia64_ctx.max_ctx based on
+ * maximum RID that is supported by all CPUs.
+ */
+void __init
+mmu_context_init (void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ia64_ctx.lock, flags);
+	if (ia64_ctx.bitmap == NULL) {
+		ia64_ctx.bitmap = (unsigned long *)__alloc_bootmem(
+							(ia64_ctx.max_ctx+1)>>3,
+							PAGE_SIZE,
+							__pa(MAX_DMA_ADDRESS));
+	}
+	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
+}
+
+/*
  * Acquire the ia64_ctx.lock before calling this function!
  */
 void
 wrap_mmu_context (struct mm_struct *mm)
 {
-	unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
-	struct task_struct *tsk;
+	unsigned int next_ctx, max_ctx = ia64_ctx.max_ctx;
 	int i;
 
 	if (ia64_ctx.next > max_ctx)
@@ -52,28 +73,23 @@
 	ia64_ctx.limit = max_ctx + 1;
 
 	/*
-	 * Scan all the task's mm->context and set proper safe range
+	 * Scan the ia64_ctx bitmap and set proper safe range
 	 */
+repeat:
+	next_ctx = find_next_zero_bit(ia64_ctx.bitmap, ia64_ctx.limit, ia64_ctx.next);
+	if (next_ctx >= ia64_ctx.limit) {
+		smp_mb();
+		ia64_ctx.next = 300;	/* skip daemons */
+		goto repeat;
+	}
+	ia64_ctx.next = next_ctx;
 
-	read_lock(&tasklist_lock);
-  repeat:
-	for_each_process(tsk) {
-		if (!tsk->mm)
-			continue;
-		tsk_context = tsk->mm->context;
-		if (tsk_context == ia64_ctx.next) {
-			if (++ia64_ctx.next >= ia64_ctx.limit) {
-				/* empty range: reset the range limit and start over */
-				if (ia64_ctx.next > max_ctx)
-					ia64_ctx.next = 300;
-				ia64_ctx.limit = max_ctx + 1;
-				goto repeat;
-			}
-		}
-		if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
-			ia64_ctx.limit = tsk_context;
+	next_ctx = find_next_bit(ia64_ctx.bitmap, ia64_ctx.limit, ia64_ctx.next);
+	if (next_ctx >= ia64_ctx.limit) {
+		next_ctx = ia64_ctx.limit;
 	}
-	read_unlock(&tasklist_lock);
+	ia64_ctx.limit = next_ctx;
+
 	/* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */
 	{
 		int cpu = get_cpu(); /* prevent preemption/migration */
--- linux-2.6.14-rc3/arch/ia64/kernel/setup.c	2005-10-26 10:54:06.000000000 -0400
+++ linux-2.6.14-rc3pmk/arch/ia64/kernel/setup.c	2005-10-25 14:53:59.000000000 -0400
@@ -419,6 +419,7 @@
 #endif
 
 	cpu_init();	/* initialize the bootstrap CPU */
+	mmu_context_init();	/* initialize context_id bitmap */
 
 #ifdef CONFIG_ACPI
 	acpi_boot_init();
@@ -798,9 +799,13 @@
 #endif
 
 	/* set ia64_ctx.max_rid to the maximum RID that is supported by all CPUs: */
-	if (ia64_pal_vm_summary(NULL, &vmi) == 0)
+	if (ia64_pal_vm_summary(NULL, &vmi) == 0) {
 		max_ctx = (1U << (vmi.pal_vm_info_2_s.rid_size - 3)) - 1;
-	else {
+		if (max_ctx > (1U << 21)) {
+			max_ctx = (1U << 21) - 1;	/* limit to 2^21  */
+			printk(KERN_WARNING "cpu_init: max_ctx limited to 21 RID bits for bitmap size\n");
+		}
+	} else {
 		printk(KERN_WARNING "cpu_init: PAL VM summary failed, assuming 18 RID bits\n");
 		max_ctx = (1U << 15) - 1;	/* use architected minimum */
 	}

[-- Attachment #3: Type: message/rfc822, Size: 39858 bytes --]

[-- Attachment #3.1.1: Type: text/plain, Size: 3967 bytes --]

 

Hi Tony,

I have attached a patch for an issue with get_mmu_context() seen with
RHEL 2.6.9-15 base code.
I would like you to take a look, review, and comment on its merits.
Bob Picco has reviewed the patch prior to me sending it to you.
 
Data on problem:

Here are the results of modifying the code to use a bitmap for looking up a new
context_id.

Updated png showing AIM7 shared run with before, after and clm data.   

Lockstat Data:
There are 4 sets of lockstat data, one each for loads of 40K, 30K, 20K and
40K with no fork test. The lockstat data shows that as loading increases the
lock contention on the task lock with wrap_mmu_context and higher
utilization of the ia64_ctx lock and the ia64_global_tlb_purge lock. 

get_mmu_context() is called to get a new process context id number to
uniquely identify its address space and TLB entries. If the limit is
reached, wrap_mmu_context() is called to reset and flush the TLBs.
 
wrap_mmu_context() uses ia64_ctx.max_ctx, which is set based on the ia64 arch.
rid size minus the 3 bits used for the region number. For Itanium the size is 24
bits, so there are 21 bits used as an increment number. This number and the
region register number (3 bits) are used as the process address space number
and are used in the TLB to identify entries associated with this process. So
about 2^21 new processes can run before context_id wrapping occurs and the
system's TLBs are flushed.

The difference in the number of times the ia64_ctx  was acquired between
running the fork test and not at 40K load is: 
Lock                         Fork              No Fork
ia64_ctx	                   9,111,760        1,162,555
ia64_global_tlb_purge        101,371,392       33,888,711

Read Task Lock per second for wrap_mmu_context()
Load       Locks/Sec  Contention
20,000         5.8           20%
30,000        10.2           31%
40,000        14.6           39%
40,000          .0001         0%  No Fork or Exec test

Notice the utilization percent for the ia64_ctx and tasklist_lock locks for
the 4 runs.
20K    10%
30K    27%
40K    54%
40K     1%  nofork  
This utilization is based on the number of cycles the lock was busy for the
measurement period.  

Modified the number of bits used in the region id reg for context increment
from 21 to 20 bits. This would cause wrap_mmu_context() to be done sooner
than at 2^21.

Aim7 shared with fork:
Load  21bits   20bits
20K    66K      58K  jobs/min
30K    58K      38K  jobs/min
40K    44K      25K  jobs/min
So now a load of 20K acts like 30K at 21bits and a load of 30K acts like 40K
at 21bits.

The cost of having to search the entire task_list to find a free context
number increases with the number of processes in the tests. The following are
the measured times for the wrap_mmu_context() function.

Before:
Aim7 shared with fork/exec tests time spent in wrap_mmu_context() walking
task loop.
Load  Jobs/min    Calls  Maxprocs  Cumulative Time  Time/call  Total Run
Time
30K      53279    33828     38198        864.0 sec    25.5 ms     62 min   
40K      42378    79840     52496       2667.4 sec    33.4 ms    108 min
50K      31955   141000     75472       6011.5 sec    42.6 ms    180 min

Modified patch:
Aim7 shared with fork/exec tests time spent in wrap_mmu_context() using 128K
bitmap.
Load  Jobs/min     Calls  Maxprocs  Cumulative Time  Time/call  Total Run
Time      
30K	   61350   1764278     na	     .312 sec      177 ns      48
min    
40K	   60858   3203092     na	     .561 sec	 175 ns      64 min
50K	   60826   5110677     na	     .887 sec      174 ns      80 min

Making the bitmap 512K reduced the tlb flush substantially but the jobs/min
only increased by about 100-150. The 512k size is 1/4 of the max it could
be, 2^21. The patch uses the total 2^21 size.

Lockstat data shows 0% contention of the rwlock task_list in
wrap_mmu_context() and the utilization goes from 54% to 1.6%.
The ia64_ctx lock utilization goes from 54% to 0.1% and the contention down
to 0.3%.

Regards,
pete

[-- Attachment #3.1.2: my-ctx_bitmap.patch2 --]
[-- Type: application/octet-stream, Size: 4521 bytes --]

--- linux-2.6.9_base/include/asm-ia64/mmu_context.h	2005-09-02 14:56:31.000000000 -0400
+++ linux-2.6.9/include/asm-ia64/mmu_context.h	2005-10-05 10:10:36.000000000 -0400
@@ -33,11 +33,13 @@
 	unsigned int next;	/* next context number to use */
 	unsigned int limit;	/* next >= limit => must call wrap_mmu_context() */
 	unsigned int max_ctx;	/* max. context value supported by all CPUs */
+	unsigned long *bitmap;	/* bitmap size is max_ctx+1 */
 };
 
 extern struct ia64_ctx ia64_ctx;
 DECLARE_PER_CPU(u8, ia64_need_tlb_flush);
 
+extern void mmu_context_init (void);
 extern void wrap_mmu_context (struct mm_struct *mm);
 
 static inline void
@@ -80,6 +82,7 @@
 			if (ia64_ctx.next >= ia64_ctx.limit)
 				wrap_mmu_context(mm);
 			mm->context = context = ia64_ctx.next++;
+			set_bit(context, ia64_ctx.bitmap);
 		}
 	}
 	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
--- linux-2.6.9_base/include/asm-ia64/tlbflush.h	2005-09-02 14:56:31.000000000 -0400
+++ linux-2.6.9/include/asm-ia64/tlbflush.h	2005-09-30 11:11:11.000000000 -0400
@@ -51,6 +51,7 @@
 	if (!mm)
 		return;
 
+	clear_bit(mm->context, ia64_ctx.bitmap);
 	mm->context = 0;
 
 	if (atomic_read(&mm->mm_users) == 0)
--- linux-2.6.9_base/arch/ia64/mm/tlb.c	2005-09-02 14:56:53.000000000 -0400
+++ linux-2.6.9/arch/ia64/mm/tlb.c	2005-10-05 10:10:48.000000000 -0400
@@ -16,12 +16,14 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/bootmem.h>
 
 #include <asm/delay.h>
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 #include <asm/pal.h>
 #include <asm/tlbflush.h>
+#include <asm/dma.h>
 
 static struct {
 	unsigned long mask;	/* mask of supported purge page-sizes */
@@ -32,19 +34,39 @@
 	.lock =		SPIN_LOCK_UNLOCKED,
 	.next =		1,
 	.limit =	(1 << 15) - 1,		/* start out with the safe (architected) limit */
-	.max_ctx =	~0U
+	.max_ctx =	~0U,
+	.bitmap = 	NULL
 };
 
 DEFINE_PER_CPU(u8, ia64_need_tlb_flush);
 
 /*
+ * Initializes the ia64_ctx.bitmap array based on max_ctx+1.
+ * Called after cpu_init() has setup ia64_ctx.max_ctx based on
+ * maximum RID that is supported by all CPUs.
+ */
+void __init
+mmu_context_init (void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ia64_ctx.lock, flags);
+	if (ia64_ctx.bitmap == NULL) {
+		ia64_ctx.bitmap = (unsigned long *)__alloc_bootmem(
+							ia64_ctx.max_ctx+1,
+							PAGE_SIZE,
+							__pa(MAX_DMA_ADDRESS));
+	}
+	spin_unlock_irqrestore(&ia64_ctx.lock, flags);
+}
+
+/*
  * Acquire the ia64_ctx.lock before calling this function!
  */
 void
 wrap_mmu_context (struct mm_struct *mm)
 {
-	unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx;
-	struct task_struct *tsk;
+	unsigned int next_ctx, max_ctx = ia64_ctx.max_ctx;
 	int i;
 
 	if (ia64_ctx.next > max_ctx)
@@ -52,28 +74,23 @@
 	ia64_ctx.limit = max_ctx + 1;
 
 	/*
-	 * Scan all the task's mm->context and set proper safe range
+	 * Scan the ia64_ctx bitmap and set proper safe range
 	 */
+repeat:
+	next_ctx = find_next_zero_bit(ia64_ctx.bitmap, ia64_ctx.limit, ia64_ctx.next);
+	if (next_ctx >= ia64_ctx.limit) {
+		smp_mb();
+		ia64_ctx.next = 300;	/* skip daemons */
+		goto repeat;
+	}
+	ia64_ctx.next = next_ctx;
 
-	read_lock(&tasklist_lock);
-  repeat:
-	for_each_process(tsk) {
-		if (!tsk->mm)
-			continue;
-		tsk_context = tsk->mm->context;
-		if (tsk_context == ia64_ctx.next) {
-			if (++ia64_ctx.next >= ia64_ctx.limit) {
-				/* empty range: reset the range limit and start over */
-				if (ia64_ctx.next > max_ctx)
-					ia64_ctx.next = 300;
-				ia64_ctx.limit = max_ctx + 1;
-				goto repeat;
-			}
-		}
-		if ((tsk_context > ia64_ctx.next) && (tsk_context < ia64_ctx.limit))
-			ia64_ctx.limit = tsk_context;
+	next_ctx = find_next_bit(ia64_ctx.bitmap, ia64_ctx.limit, ia64_ctx.next);
+	if (next_ctx >= ia64_ctx.limit) {
+		next_ctx = ia64_ctx.limit;
 	}
-	read_unlock(&tasklist_lock);
+	ia64_ctx.limit = next_ctx;
+
 	/* can't call flush_tlb_all() here because of race condition with O(1) scheduler [EF] */
 	{
 		int cpu = get_cpu(); /* prevent preemption/migration */
--- linux-2.6.9_base/arch/ia64/kernel/setup.c	2005-09-02 14:56:49.000000000 -0400
+++ linux-2.6.9/arch/ia64/kernel/setup.c	2005-09-30 11:07:46.000000000 -0400
@@ -345,6 +345,7 @@
 #endif
 
 	cpu_init();	/* initialize the bootstrap CPU */
+	mmu_context_init();	/* initialize context_id bitmap */
 
 #ifdef CONFIG_ACPI_BOOT
 	acpi_boot_init();

[-- Attachment #3.1.3: shared_ctx_bitmap_2meg.png --]
[-- Type: image/png, Size: 9383 bytes --]

[-- Attachment #3.1.4: shared_ctx_bitmap_all.png --]
[-- Type: image/png, Size: 12899 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2005-10-28 18:49 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-10-27 17:28 ia64 get_mmu_context patch Peter Keilty
2005-10-28  2:54 ` Chen, Kenneth W
2005-10-28  3:09 ` Chen, Kenneth W
2005-10-28  3:23 ` Chen, Kenneth W
2005-10-28 14:49 ` Peter Keilty
2005-10-28 14:50 ` Peter Keilty
2005-10-28 17:56 ` Chen, Kenneth W
2005-10-28 17:59 ` Chen, Kenneth W
2005-10-28 18:06 ` Chen, Kenneth W
2005-10-28 18:40 ` Chen, Kenneth W
2005-10-28 18:49 ` Peter Keilty

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox