public inbox for linux-ia64@vger.kernel.org
* [PATCH] Convert pgtable cache to slab
@ 2004-10-18 22:52 Martin K. Petersen
  2004-10-19  4:08 ` Luck, Tony
                   ` (21 more replies)
  0 siblings, 22 replies; 23+ messages in thread
From: Martin K. Petersen @ 2004-10-18 22:52 UTC (permalink / raw)
  To: linux-ia64


Tony,

Below is an updated version of the ia64 page table slabification
patch.

I have got a few other fires to fight right now so I'm still using
pgtable_cache_init() for setting up the slab.  I'll look into a
general purpose zeroed page cache later.

I have made setup_gate() an initcall.  That seemed like the best
compromise and other archs do this too.  kmem_cache_init() is called
after mem_init() in init/main.c, so we'd need to introduce another
stub there to set up the gate page even if we had a general purpose
slab cache in place.

Comments?

Signed-off-by: Martin K. Petersen <mkp@wildopensource.com>

-- 
Martin K. Petersen	Wild Open Source, Inc.
mkp@wildopensource.com	http://www.wildopensource.com/

 arch/ia64/mm/contig.c      |    1 
 arch/ia64/mm/discontig.c   |    1 
 arch/ia64/mm/init.c        |   45 ++++++++-------
 include/asm-ia64/pgalloc.h |  127 ++++++++++-----------------------------------
 include/asm-ia64/pgtable.h |    6 --
 5 files changed, 53 insertions(+), 127 deletions(-)

diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/arch/ia64/mm/contig.c pgtable-slab/arch/ia64/mm/contig.c
--- linux-pristine/arch/ia64/mm/contig.c	2004-10-11 14:57:16.000000000 -0700
+++ pgtable-slab/arch/ia64/mm/contig.c	2004-10-11 14:59:09.000000000 -0700
@@ -60,7 +60,6 @@
 	printk("%d reserved pages\n", reserved);
 	printk("%d pages shared\n", shared);
 	printk("%d pages swap cached\n", cached);
-	printk("%ld pages in page table cache\n", pgtable_cache_size);
 }
 
 /* physical address where the bootmem map is located */
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/arch/ia64/mm/discontig.c pgtable-slab/arch/ia64/mm/discontig.c
--- linux-pristine/arch/ia64/mm/discontig.c	2004-10-11 14:57:16.000000000 -0700
+++ pgtable-slab/arch/ia64/mm/discontig.c	2004-10-11 15:10:05.000000000 -0700
@@ -540,7 +540,6 @@
 	printk("%d reserved pages\n", total_reserved);
 	printk("%d pages shared\n", total_shared);
 	printk("%d pages swap cached\n", total_cached);
-	printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
 	printk("%d free buffer pages\n", nr_free_buffer_pages());
 }
 
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/arch/ia64/mm/init.c pgtable-slab/arch/ia64/mm/init.c
--- linux-pristine/arch/ia64/mm/init.c	2004-10-11 14:57:16.000000000 -0700
+++ pgtable-slab/arch/ia64/mm/init.c	2004-10-18 15:34:23.000000000 -0700
@@ -56,26 +56,6 @@
 EXPORT_SYMBOL(zero_page_memmap_ptr);
 
 void
-check_pgt_cache (void)
-{
-	int low, high;
-
-	low = pgt_cache_water[0];
-	high = pgt_cache_water[1];
-
-	preempt_disable();
-	if (pgtable_cache_size > (u64) high) {
-		do {
-			if (pgd_quicklist)
-				free_page((unsigned long)pgd_alloc_one_fast(NULL));
-			if (pmd_quicklist)
-				free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
-		} while (pgtable_cache_size > (u64) low);
-	}
-	preempt_enable();
-}
-
-void
 update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte)
 {
 	unsigned long addr;
@@ -254,7 +234,7 @@
 	return page;
 }
 
-static void
+static int __init
 setup_gate (void)
 {
 	struct page *page;
@@ -272,8 +252,11 @@
 	put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
 #endif
 	ia64_patch_gate();
+	return 0;
 }
 
+core_initcall(setup_gate);
+
 void __devinit
 ia64_mmu_init (void *my_cpu_data)
 {
@@ -584,9 +567,27 @@
 		if (!fsyscall_table[i] || nolwsys)
 			fsyscall_table[i] = sys_call_table[i] | 1;
 	}
-	setup_gate();
 
 #ifdef CONFIG_IA32_SUPPORT
 	ia32_mem_init();
 #endif
 }
+
+kmem_cache_t *pgtable_cache;
+
+static void pgtable_cache_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+{
+	memset(pte, 0, PAGE_SIZE);
+}
+
+void pgtable_cache_init(void)
+{
+	pgtable_cache = kmem_cache_create("pgtable_cache",
+				       PAGE_SIZE,
+				       0,
+				       SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+				       pgtable_cache_ctor,
+				       NULL);
+	if (!pgtable_cache)
+		panic("pgtable_cache_init(): could not create pgtable_cache!\n");
+}
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/include/asm-ia64/pgalloc.h pgtable-slab/include/asm-ia64/pgalloc.h
--- linux-pristine/include/asm-ia64/pgalloc.h	2004-08-13 22:36:12.000000000 -0700
+++ pgtable-slab/include/asm-ia64/pgalloc.h	2004-10-18 14:22:50.000000000 -0700
@@ -17,65 +17,26 @@
 
 #include <linux/compiler.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/page-flags.h>
 #include <linux/threads.h>
 
 #include <asm/mmu_context.h>
 #include <asm/processor.h>
+#include <asm/pgtable.h>
 
-/*
- * Very stupidly, we used to get new pgd's and pmd's, init their contents
- * to point to the NULL versions of the next level page table, later on
- * completely re-init them the same way, then free them up.  This wasted
- * a lot of work and caused unnecessary memory traffic.  How broken...
- * We fix this by caching them.
- */
-#define pgd_quicklist		(local_cpu_data->pgd_quick)
-#define pmd_quicklist		(local_cpu_data->pmd_quick)
-#define pgtable_cache_size	(local_cpu_data->pgtable_cache_sz)
-
-static inline pgd_t*
-pgd_alloc_one_fast (struct mm_struct *mm)
-{
-	unsigned long *ret = NULL;
-
-	preempt_disable();
-
-	ret = pgd_quicklist;
-	if (likely(ret != NULL)) {
-		pgd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		--pgtable_cache_size;
-	} else
-		ret = NULL;
-
-	preempt_enable();
-
-	return (pgd_t *) ret;
-}
+extern kmem_cache_t *pgtable_cache;
 
 static inline pgd_t*
 pgd_alloc (struct mm_struct *mm)
 {
-	/* the VM system never calls pgd_alloc_one_fast(), so we do it here. */
-	pgd_t *pgd = pgd_alloc_one_fast(mm);
-
-	if (unlikely(pgd == NULL)) {
-		pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
-		if (likely(pgd != NULL))
-			clear_page(pgd);
-	}
-	return pgd;
+	return kmem_cache_alloc(pgtable_cache, GFP_KERNEL);
 }
 
 static inline void
 pgd_free (pgd_t *pgd)
 {
-	preempt_disable();
-	*(unsigned long *)pgd = (unsigned long) pgd_quicklist;
-	pgd_quicklist = (unsigned long *) pgd;
-	++pgtable_cache_size;
-	preempt_enable();
+	kmem_cache_free(pgtable_cache, pgd);
 }
 
 static inline void
@@ -86,92 +47,62 @@
 
 
 static inline pmd_t*
-pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
-{
-	unsigned long *ret = NULL;
-
-	preempt_disable();
-
-	ret = (unsigned long *)pmd_quicklist;
-	if (likely(ret != NULL)) {
-		pmd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		--pgtable_cache_size;
-	}
-
-	preempt_enable();
-
-	return (pmd_t *)ret;
-}
-
-static inline pmd_t*
 pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
 {
-	pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
-
-	if (likely(pmd != NULL))
-		clear_page(pmd);
-	return pmd;
+	return kmem_cache_alloc(pgtable_cache, GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void
 pmd_free (pmd_t *pmd)
 {
-	preempt_disable();
-	*(unsigned long *)pmd = (unsigned long) pmd_quicklist;
-	pmd_quicklist = (unsigned long *) pmd;
-	++pgtable_cache_size;
-	preempt_enable();
+	kmem_cache_free(pgtable_cache, pmd);
 }
 
 #define __pmd_free_tlb(tlb, pmd)	pmd_free(pmd)
 
 static inline void
-pmd_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
-{
-	pmd_val(*pmd_entry) = page_to_phys(pte);
-}
-
-static inline void
 pmd_populate_kernel (struct mm_struct *mm, pmd_t *pmd_entry, pte_t *pte)
 {
 	pmd_val(*pmd_entry) = __pa(pte);
 }
 
-static inline struct page *
-pte_alloc_one (struct mm_struct *mm, unsigned long addr)
+static inline void
+pmd_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
 {
-	struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
-
-	if (likely(pte != NULL))
-		clear_page(page_address(pte));
-	return pte;
+	pmd_val(*pmd_entry) = page_to_phys(pte);
 }
 
+
 static inline pte_t *
 pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr)
 {
-	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
-
-	if (likely(pte != NULL))
-		clear_page(pte);
-	return pte;
+	return kmem_cache_alloc(pgtable_cache, GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void
-pte_free (struct page *pte)
+pte_free_kernel (pte_t *pte)
 {
-	__free_page(pte);
+	kmem_cache_free(pgtable_cache, pte);
 }
 
-static inline void
-pte_free_kernel (pte_t *pte)
+static inline struct page *
+pte_alloc_one (struct mm_struct *mm, unsigned long addr)
 {
-	free_page((unsigned long) pte);
+	pte_t *pte = pte_alloc_one_kernel(mm, addr);
+
+	if (pte)
+		return virt_to_page(pte);
+
+	return NULL;
 }
 
-#define __pte_free_tlb(tlb, pte)	tlb_remove_page((tlb), (pte))
+static inline void
+pte_free (struct page *pte)
+{
+	pte_free_kernel(page_address(pte));
+}
 
-extern void check_pgt_cache (void);
+#define __pte_free_tlb(tlb, pte)	pte_free(pte)
+#define check_pgt_cache()		do { } while (0)
 
 #endif /* _ASM_IA64_PGALLOC_H */
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/include/asm-ia64/pgtable.h pgtable-slab/include/asm-ia64/pgtable.h
--- linux-pristine/include/asm-ia64/pgtable.h	2004-10-11 14:57:19.000000000 -0700
+++ pgtable-slab/include/asm-ia64/pgtable.h	2004-10-18 14:38:34.000000000 -0700
@@ -422,6 +422,7 @@
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 extern void paging_init (void);
+extern void pgtable_cache_init(void);
 
 /*
  * Note: The macros below rely on the fact that MAX_SWAPFILES_SHIFT <= number of
@@ -542,11 +543,6 @@
 #define KERNEL_TR_PAGE_SHIFT	_PAGE_SIZE_64M
 #define KERNEL_TR_PAGE_SIZE	(1 << KERNEL_TR_PAGE_SHIFT)
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 /* These tell get_user_pages() that the first gate page is accessible from user-level.  */
 #define FIXADDR_USER_START	GATE_ADDR
 #define FIXADDR_USER_END	(GATE_ADDR + 2*PERCPU_PAGE_SIZE)


* RE: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
@ 2004-10-19  4:08 ` Luck, Tony
  2004-10-19 15:59 ` Martin K. Petersen
                   ` (20 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2004-10-19  4:08 UTC (permalink / raw)
  To: linux-ia64

>I have made setup_gate() an initcall.

Did you build with CONFIG_IA32_SUPPORT=y ... the ia32_mem_init()
call sets up some gate pages too, won't it run into the same issue
that setup_gate() does when called before you've set up the
pgtable_cache()? [Just guessing, I didn't try this either]

>                                         That seemed like the best
>compromise and other archs do this too.  kmem_cache_init() is called
>after mem_init() in init/main.c, so we'd need to introduce another
>stub there to set up the gate page even if we had a general purpose
>slab cache in place.
>
>Comments?

Otherwise this looks pretty good ... but likely to collide with
an almighty thud with Andi Kleen's 4-level page table patch that
is making its way through LKML at the moment.  I'm not sure how
much momentum that patch has built up, or whether it is going to
go live in the -mm tree.

I'm going to hold off on your patch while I see what happens to
the 4-level patch.

-Tony


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
  2004-10-19  4:08 ` Luck, Tony
@ 2004-10-19 15:59 ` Martin K. Petersen
  2005-02-10 20:29 ` Robin Holt
                   ` (19 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Martin K. Petersen @ 2004-10-19 15:59 UTC (permalink / raw)
  To: linux-ia64

>>>>> "Tony" = Luck, Tony <tony.luck@intel.com> writes:

>> I have made setup_gate() an initcall.

Tony> Did you build with CONFIG_IA32_SUPPORT=y 

Yes I did.


Tony> ... the ia32_mem_init() call sets up some gate pages too, won't
Tony> it run into the same issue that setup_gate() does when called
Tony> before you've set up the pgtable_cache()? [Just guessing, I
Tony> didn't try this either]

No, because it doesn't use the pgd/pmd/pte calls like setup_gate()
does.


Tony> I'm going to hold off on your patch while I see what happens to
Tony> the 4-level patch.

Okie.

-- 
Martin K. Petersen	Wild Open Source, Inc.
mkp@wildopensource.com	http://www.wildopensource.com/


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
  2004-10-19  4:08 ` Luck, Tony
  2004-10-19 15:59 ` Martin K. Petersen
@ 2005-02-10 20:29 ` Robin Holt
  2005-02-10 20:38 ` Luck, Tony
                   ` (18 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-10 20:29 UTC (permalink / raw)
  To: linux-ia64


Tony,

I have resurrected the ia64 page table slabification patch that Martin
Petersen proposed earlier.  This applies against the 2.6.11-rc3 tarball
and builds for allyesconfig, allmodconfig, and defconfig.  Additionally,
defconfig was booted on an SN2 machine.

I am deferring the general purpose zeroed page cache work for later, as
Martin did.

Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Martin K. Petersen <mkp@wildopensource.com>


 arch/ia64/mm/contig.c      |    1 
 arch/ia64/mm/discontig.c   |    1 
 arch/ia64/mm/init.c        |   45 ++++++++---------
 include/asm-ia64/pgalloc.h |  118 ++++++++++-----------------------------------
 include/asm-ia64/pgtable.h |    6 --
 5 files changed, 52 insertions(+), 119 deletions(-)


Index: linux-2.6/arch/ia64/mm/contig.c
===================================================================
--- linux-2.6.orig/arch/ia64/mm/contig.c	2005-02-10 12:14:31.248130419 -0600
+++ linux-2.6/arch/ia64/mm/contig.c	2005-02-10 12:49:41.734267769 -0600
@@ -61,7 +61,6 @@
 	printk("%d reserved pages\n", reserved);
 	printk("%d pages shared\n", shared);
 	printk("%d pages swap cached\n", cached);
-	printk("%ld pages in page table cache\n", pgtable_cache_size);
 }
 
 /* physical address where the bootmem map is located */
Index: linux-2.6/arch/ia64/mm/discontig.c
===================================================================
--- linux-2.6.orig/arch/ia64/mm/discontig.c	2005-02-10 12:15:04.474319994 -0600
+++ linux-2.6/arch/ia64/mm/discontig.c	2005-02-10 12:49:41.735244321 -0600
@@ -582,7 +582,6 @@
 	printk("%d reserved pages\n", total_reserved);
 	printk("%d pages shared\n", total_shared);
 	printk("%d pages swap cached\n", total_cached);
-	printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
 	printk("%d free buffer pages\n", nr_free_buffer_pages());
 }
 
Index: linux-2.6/arch/ia64/mm/init.c
===================================================================
--- linux-2.6.orig/arch/ia64/mm/init.c	2005-02-10 12:14:35.318397233 -0600
+++ linux-2.6/arch/ia64/mm/init.c	2005-02-10 12:49:41.737197424 -0600
@@ -56,26 +56,6 @@
 EXPORT_SYMBOL(zero_page_memmap_ptr);
 
 void
-check_pgt_cache (void)
-{
-	int low, high;
-
-	low = pgt_cache_water[0];
-	high = pgt_cache_water[1];
-
-	preempt_disable();
-	if (pgtable_cache_size > (u64) high) {
-		do {
-			if (pgd_quicklist)
-				free_page((unsigned long)pgd_alloc_one_fast(NULL));
-			if (pmd_quicklist)
-				free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
-		} while (pgtable_cache_size > (u64) low);
-	}
-	preempt_enable();
-}
-
-void
 update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte)
 {
 	unsigned long addr;
@@ -271,7 +251,7 @@
 	return page;
 }
 
-static void
+static int __init
 setup_gate (void)
 {
 	struct page *page;
@@ -289,8 +269,11 @@
 	put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
 #endif
 	ia64_patch_gate();
+	return 0;
 }
 
+core_initcall(setup_gate);
+
 void __devinit
 ia64_mmu_init (void *my_cpu_data)
 {
@@ -590,9 +573,27 @@
 		if (!fsyscall_table[i] || nolwsys)
 			fsyscall_table[i] = sys_call_table[i] | 1;
 	}
-	setup_gate();
 
 #ifdef CONFIG_IA32_SUPPORT
 	ia32_mem_init();
 #endif
 }
+
+kmem_cache_t *pgtable_cache;
+
+static void pgtable_cache_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+{
+	memset(pte, 0, PAGE_SIZE);
+}
+
+void pgtable_cache_init(void)
+{
+	pgtable_cache = kmem_cache_create("pgtable_cache",
+				       PAGE_SIZE,
+				       0,
+				       SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+				       pgtable_cache_ctor,
+				       NULL);
+	if (!pgtable_cache)
+		panic("pgtable_cache_init(): could not create pgtable_cache!\n");
+}
Index: linux-2.6/include/asm-ia64/pgalloc.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/pgalloc.h	2005-02-10 12:14:10.711251555 -0600
+++ linux-2.6/include/asm-ia64/pgalloc.h	2005-02-10 13:06:17.960526962 -0600
@@ -17,63 +17,26 @@
 
 #include <linux/compiler.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/page-flags.h>
 #include <linux/threads.h>
 
 #include <asm/mmu_context.h>
 #include <asm/processor.h>
+#include <asm/pgtable.h>
 
-/*
- * Very stupidly, we used to get new pgd's and pmd's, init their contents
- * to point to the NULL versions of the next level page table, later on
- * completely re-init them the same way, then free them up.  This wasted
- * a lot of work and caused unnecessary memory traffic.  How broken...
- * We fix this by caching them.
- */
-#define pgd_quicklist		(local_cpu_data->pgd_quick)
-#define pmd_quicklist		(local_cpu_data->pmd_quick)
-#define pgtable_cache_size	(local_cpu_data->pgtable_cache_sz)
-
-static inline pgd_t*
-pgd_alloc_one_fast (struct mm_struct *mm)
-{
-	unsigned long *ret = NULL;
-
-	preempt_disable();
-
-	ret = pgd_quicklist;
-	if (likely(ret != NULL)) {
-		pgd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		--pgtable_cache_size;
-	} else
-		ret = NULL;
-
-	preempt_enable();
-
-	return (pgd_t *) ret;
-}
+extern kmem_cache_t *pgtable_cache;
 
 static inline pgd_t*
 pgd_alloc (struct mm_struct *mm)
 {
-	/* the VM system never calls pgd_alloc_one_fast(), so we do it here. */
-	pgd_t *pgd = pgd_alloc_one_fast(mm);
-
-	if (unlikely(pgd == NULL)) {
-		pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
-	}
-	return pgd;
+	return kmem_cache_alloc(pgtable_cache, GFP_KERNEL);
 }
 
 static inline void
 pgd_free (pgd_t *pgd)
 {
-	preempt_disable();
-	*(unsigned long *)pgd = (unsigned long) pgd_quicklist;
-	pgd_quicklist = (unsigned long *) pgd;
-	++pgtable_cache_size;
-	preempt_enable();
+	kmem_cache_free(pgtable_cache, pgd);
 }
 
 static inline void
@@ -83,86 +46,61 @@
 }
 
 static inline pmd_t*
-pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
-{
-	unsigned long *ret = NULL;
-
-	preempt_disable();
-
-	ret = (unsigned long *)pmd_quicklist;
-	if (likely(ret != NULL)) {
-		pmd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		--pgtable_cache_size;
-	}
-
-	preempt_enable();
-
-	return (pmd_t *)ret;
-}
-
-static inline pmd_t*
 pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
 {
-	pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-
-	return pmd;
+	return kmem_cache_alloc(pgtable_cache, GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void
 pmd_free (pmd_t *pmd)
 {
-	preempt_disable();
-	*(unsigned long *)pmd = (unsigned long) pmd_quicklist;
-	pmd_quicklist = (unsigned long *) pmd;
-	++pgtable_cache_size;
-	preempt_enable();
+	kmem_cache_free(pgtable_cache, pmd);
 }
 
 #define __pmd_free_tlb(tlb, pmd)	pmd_free(pmd)
 
 static inline void
-pmd_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
-{
-	pmd_val(*pmd_entry) = page_to_phys(pte);
-}
-
-static inline void
 pmd_populate_kernel (struct mm_struct *mm, pmd_t *pmd_entry, pte_t *pte)
 {
 	pmd_val(*pmd_entry) = __pa(pte);
 }
 
-static inline struct page *
-pte_alloc_one (struct mm_struct *mm, unsigned long addr)
+static inline void
+pmd_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
 {
-	struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-
-	return pte;
+	pmd_val(*pmd_entry) = page_to_phys(pte);
 }
 
 static inline pte_t *
 pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr)
 {
-	pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-
-	return pte;
+	return kmem_cache_alloc(pgtable_cache, GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void
-pte_free (struct page *pte)
+pte_free_kernel (pte_t *pte)
 {
-	__free_page(pte);
+	kmem_cache_free(pgtable_cache, pte);
 }
 
-static inline void
-pte_free_kernel (pte_t *pte)
+static inline struct page *
+pte_alloc_one (struct mm_struct *mm, unsigned long addr)
 {
-	free_page((unsigned long) pte);
+	pte_t *pte = pte_alloc_one_kernel(mm, addr);
+
+	if (pte)
+		return virt_to_page(pte);
+
+	return NULL;
 }
 
-#define __pte_free_tlb(tlb, pte)	tlb_remove_page((tlb), (pte))
+static inline void
+pte_free (struct page *pte)
+{
+	pte_free_kernel(page_address(pte));
+}
 
-extern void check_pgt_cache (void);
+#define __pte_free_tlb(tlb, pte)	pte_free(pte)
+#define check_pgt_cache()		do { } while (0)
 
 #endif /* _ASM_IA64_PGALLOC_H */
Index: linux-2.6/include/asm-ia64/pgtable.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/pgtable.h	2005-02-10 12:14:29.641703137 -0600
+++ linux-2.6/include/asm-ia64/pgtable.h	2005-02-10 12:49:41.745009837 -0600
@@ -423,6 +423,7 @@
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 extern void paging_init (void);
+extern void pgtable_cache_init(void);
 
 /*
  * Note: The macros below rely on the fact that MAX_SWAPFILES_SHIFT <= number of
@@ -545,11 +546,6 @@
 #define KERNEL_TR_PAGE_SHIFT	_PAGE_SIZE_64M
 #define KERNEL_TR_PAGE_SIZE	(1 << KERNEL_TR_PAGE_SHIFT)
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 /* These tell get_user_pages() that the first gate page is accessible from user-level.  */
 #define FIXADDR_USER_START	GATE_ADDR
 #define FIXADDR_USER_END	(GATE_ADDR + 2*PERCPU_PAGE_SIZE)


* RE: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (2 preceding siblings ...)
  2005-02-10 20:29 ` Robin Holt
@ 2005-02-10 20:38 ` Luck, Tony
  2005-02-11 18:35 ` Robin Holt
                   ` (17 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2005-02-10 20:38 UTC (permalink / raw)
  To: linux-ia64

Can pgtable_cache_init() be "__init"?

Do you have some before/after numbers from lmbench fork overhead
(I wouldn't expect much impact from this, but it would be nice to
make sure).

-Tony


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (3 preceding siblings ...)
  2005-02-10 20:38 ` Luck, Tony
@ 2005-02-11 18:35 ` Robin Holt
  2005-02-11 18:51 ` Luck, Tony
                   ` (16 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-11 18:35 UTC (permalink / raw)
  To: linux-ia64

On Thu, Feb 10, 2005 at 12:38:06PM -0800, Luck, Tony wrote:
> Can pgtable_cache_init() be "__init"?

I have that changed in my work area now.

> 
> Do you have some before/after numbers from lmbench fork overhead
> (I wouldn't expect much impact from this, but it would be nice to
> make sure).

I am running lmbench even as we speak.  Is there a way to run lmbench
and only get the fork overhead information?

I started with this patch because it goes part of the way to where I
need to be.  I am trying to get 4 level page table directories working
efficiently on SGI boxes.  I see this as three problems.

1) This patch to join quicklists together and simplify the cleanup
   of the list to cases where memory is really needed (the slab does this
   nicely).
2) Ensure the slabs are made node aware.  We have seen significant
   impact when the page tables are allocated off node.
3) Implement the pud_alloc pud_free stuff so 4-level directories
   work.

After looking at some of the performance numbers, I am beginning to
think #1 and #2 need to be done as one.  I assumed there would be no
significant impact from using the zeroed slab versus the quicklists and
that appears to be true.  I have not considered all the ramifications
like the default sizes of the slab, etc.  There does not appear to be
much performance difference on lmbench between the two.  For a page-fault
test, the fault rate with the zeroed slab appears to be slightly lower,
but that is as I would expect due to the added overhead of the slab.

Right now, neither the quicklists nor slabs are node local.  I think
that is a larger issue.  I will start a separate discussion about node
locality soon.

Thanks,
Robin


* RE: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (4 preceding siblings ...)
  2005-02-11 18:35 ` Robin Holt
@ 2005-02-11 18:51 ` Luck, Tony
  2005-02-11 19:33 ` Robin Holt
                   ` (15 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2005-02-11 18:51 UTC (permalink / raw)
  To: linux-ia64

>> Do you have some before/after numbers from lmbench fork overhead
>> (I wouldn't expect much impact from this, but it would be nice to
>> make sure).
>
>I am running lmbench even as we speak.  Is there a way to run lmbench
>and only get the fork overhead information?

You can run the "lat_proc" binary with the "fork" argument.
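
For example, running just

	lat_proc fork

should report only the fork+exit latency (assuming the lmbench
binaries are in your path).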

>I started with this patch because it goes part of the way to where I
>need to be.  I am trying to get 4 level page table directories working
>efficiently on SGI boxes.  I see this as three problems.

Do you mean full four-level page tables, or just fixing the quirks
in the current implementation that has an empty PUD level (and seems
to have caused a slight drop in performance for reasons that are not
yet completely understood)?

>Right now, neither the quicklists nor slabs are node local.  I think
>that is a larger issue.  I will start a seperate discussion about node
>locality soon.

Making the slab node aware is probably the right thing to do, but
making quicklists node aware is less invasive.
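
A rough sketch of the node-aware slab direction, using the existing
kmem_cache_alloc_node() interface (illustrative only; the slab
allocator itself is not yet truly node-aware internally, so this
alone would not guarantee node-local memory):

	static inline pgd_t*
	pgd_alloc (struct mm_struct *mm)
	{
		/* ask the slab for a page-table page on the local node */
		return kmem_cache_alloc_node(pgtable_cache, GFP_KERNEL,
					     numa_node_id());
	}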

-Tony


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (5 preceding siblings ...)
  2005-02-11 18:51 ` Luck, Tony
@ 2005-02-11 19:33 ` Robin Holt
  2005-02-14 16:33 ` Robin Holt
                   ` (14 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-11 19:33 UTC (permalink / raw)
  To: linux-ia64

On Fri, Feb 11, 2005 at 10:51:12AM -0800, Luck, Tony wrote:
> >> Do you have some before/after numbers from lmbench fork overhead
> >> (I wouldn't expect much impact from this, but it would be nice to
> >> make sure).
> >
> >I am running lmbench even as we speak.  Is there a way to run lmbench
> >and only get the fork overhead information?
> 
> You can run the "lat_proc" binary with the "fork" argument.
> 
> >I started with this patch because it goes part of the way to where I
> >need to be.  I am trying to get 4 level page table directories working
> >efficiently on SGI boxes.  I see this as three problems.
> 
> Do you mean full four-level page tables, or just fixing the quirks
> in the current implementation that has an empty PUD level (and seems
> to have caused a slight drop in performance for reasons that are not
> yet completely understood)?

Full four-level.

> 
> >Right now, neither the quicklists nor slabs are node local.  I think
> >that is a larger issue.  I will start a seperate discussion about node
> >locality soon.
> 
> Making the slab node aware is probably the right thing to do, but
> making quicklists node aware is less invasive.

Does anybody have a preference?

I sort of like the quicklists because they are more clear.  I think the
node-aware slab cache may be quicker for some workloads.  On the SGI
kernel based upon 2.4, we collapsed the quicklists into a single list
and then just checked to see if the page being added to the list was for
this node.  It prevented some problems, but resulted in some of our MPI
loads keeping the quicklist completely full on most nodes and constantly
draining/allocating from a single node.  My preference leans towards the
slab cache, but I can be easily swayed.

Thanks,
Robin


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (6 preceding siblings ...)
  2005-02-11 19:33 ` Robin Holt
@ 2005-02-14 16:33 ` Robin Holt
  2005-02-14 19:18 ` Luck, Tony
                   ` (13 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-14 16:33 UTC (permalink / raw)
  To: linux-ia64

On Fri, Feb 11, 2005 at 01:33:51PM -0600, Robin Holt wrote:
> On Fri, Feb 11, 2005 at 10:51:12AM -0800, Luck, Tony wrote:
...
> > Making the slab node aware is probably the right thing to do, but
> > making quicklists node aware is less invasive.
> 
> Does anybody have a preference?
> 
> I sort of like the quicklists because they are more clear.  I think the
> node-aware slab cache may be quicker for some workloads.  On the SGI
> kernel based upon 2.4, we collapsed the quicklists into a single list
> and then just checked to see if the page being added to the list was for
> this node.  It prevented some problems, but resulted in some of our MPI
> loads keeping the quicklist completely full on most nodes and constantly
> draining/allocating from a single node.  My preference leans towards the
> slab cache, but I can be easily swayed.

I have given this some more thought over the weekend and think I am
leaning more towards the quicklists.  Its main attraction to me seems
to be the simplicity of design.  The quicklists remain consistent with
what we are already familiar with.  To make pte, pmd, and pgd entries
equivalent, I think I am going to propose a single quicklist for all
the different types of entries with a single add/remove function.

The single quicklist is probably going to prove helpful for the trim
functions.  Right now, the trim function has a strong bias towards the
first cpu that processes the tick, followed by a weaker bias for the
pte entries remaining on the list.  While the second bias seems
reasonable, collapsing to a single list eliminates it.  Making the trim
run only when the node is in a low memory situation, and trim based
upon some factor of the memory available on the node, would
significantly reduce the bias towards the first cpu to receive the tick.
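
Roughly, the single add/remove pair could look like the following
sketch (hypothetical names, not a tested patch; pgtable_quicklist and
pgtable_quicklist_size stand for per-cpu fields like the old
pgd_quick/pgtable_cache_sz):

	static inline void*
	pgtable_quicklist_alloc (void)
	{
		unsigned long *ret;

		preempt_disable();
		ret = pgtable_quicklist;
		if (ret != NULL) {
			pgtable_quicklist = (unsigned long *)(*ret);
			ret[0] = 0;	/* re-zero the link word */
			--pgtable_quicklist_size;
			preempt_enable();
		} else {
			preempt_enable();
			ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		}
		return ret;
	}

	static inline void
	pgtable_quicklist_free (void *pgtable_entry)
	{
		preempt_disable();
		*(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist;
		pgtable_quicklist = (unsigned long *)pgtable_entry;
		++pgtable_quicklist_size;
		preempt_enable();
	}

pgd_alloc(), pmd_alloc_one() and pte_alloc_one_kernel() would then all
collapse into calls to the same pair.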

Sorry for the confusing ramble, but I am in full Monday morning mode
right now.

Thanks,
Robin


* RE: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (7 preceding siblings ...)
  2005-02-14 16:33 ` Robin Holt
@ 2005-02-14 19:18 ` Luck, Tony
  2005-02-15 12:02 ` Robin Holt
                   ` (12 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2005-02-14 19:18 UTC (permalink / raw)
  To: linux-ia64

>I have given this some more thought over the weekend and think I am
>leaning more towards the quicklists.  Its main attraction to me seems
>to be the simplicity of design.  The quicklists remain consistent with
>what we are already familiar with.  To make pte, pmd, and pgd entries
>equivalent, I think I am going to propose a single quicklist for all
>the different types of entries with a single add/remove function.

What about the 4-level case?  With 64K pages you end up with a surplus
of bits if you use a full page for all pgd/pud/pmd/pte levels.  Are
you planning on just leaving unused space in the PGD so that everything
is simple?  It might be more efficient to slim down the number of bits
in the pte level?  Then the slab starts looking attractive again to
handle the sub-page allocations.
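
For instance, a dedicated cache of 16KB, 16KB-aligned objects would
carve four page-table nodes out of each 64KB page (a sketch with
made-up names, not from any posted patch):

	static void pgtable_node_ctor(void *node, kmem_cache_t *cache,
				      unsigned long flags)
	{
		memset(node, 0, 16384);
	}

	kmem_cache_t *pgtable_node_cache;

	void __init pgtable_node_cache_init(void)
	{
		/* 16KB objects, 16KB aligned: four per 64KB page */
		pgtable_node_cache = kmem_cache_create("pgtable_node_cache",
						       16384, 16384, 0,
						       pgtable_node_ctor, NULL);
		if (!pgtable_node_cache)
			panic("could not create pgtable_node_cache!\n");
	}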

-Tony


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (8 preceding siblings ...)
  2005-02-14 19:18 ` Luck, Tony
@ 2005-02-15 12:02 ` Robin Holt
  2005-02-15 18:07 ` David Mosberger
                   ` (11 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-15 12:02 UTC (permalink / raw)
  To: linux-ia64

On Mon, Feb 14, 2005 at 11:18:21AM -0800, Luck, Tony wrote:
> >I have given this some more thought over the weekend and think I am
> >leaning more towards the quicklists.  Its main attraction to me seems
> >to be the simplicity of design.  The quicklists remain consistent with
> >what we are already familiar with.  To make pte, pmd, and pgd entries
> >equivalent, I think I am going to propose a single quicklist for all
> >the different types of entries with a single add/remove function.
> 
> What about the 4-level case?  With 64K pages you end up with a surplus
> of bits if you use a full page for all pgd/pud/pmd/pte levels.  Are
> you planning on just leaving unused space in the PGD so that everything
> is simple?  It might be more efficient to slim down the number of bits
> in the pte level?  Then the slab starts looking attractive again to
> handle the sub-page allocations.

For the 4-level case with 64k pages, what about just using 16k page
tables?  That leaves us with 60 bits of addressable space which is fairly
close to the full space.  It would make the tables consistently sized,
but I am not sure that is of much value.  As you point out, the slab
would make that completely unnecessary.

Jack Steiner is trying to determine how much more expensive the off-node
page tables are than on node.  Once we know that, we will know if a
per-cpu or per-node quicklist type arrangement is truly beneficial or
if a general use slab without node awareness will be adequate.

Thanks,
Robin


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (9 preceding siblings ...)
  2005-02-15 12:02 ` Robin Holt
@ 2005-02-15 18:07 ` David Mosberger
  2005-02-15 18:29 ` Luck, Tony
                   ` (10 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: David Mosberger @ 2005-02-15 18:07 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Tue, 15 Feb 2005 06:02:13 -0600, Robin Holt <holt@sgi.com> said:

  Robin> For the 4-level case with 64k pages, what about just using
  Robin> 16k page tables?  That leaves us with 60 bits of addressable
  Robin> space which is fairly close to the full space.

You're kidding, right?  If getting "fairly close" to 64 bits was good
enough, there would be no point for 64KB pages.

	--david


* RE: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (10 preceding siblings ...)
  2005-02-15 18:07 ` David Mosberger
@ 2005-02-15 18:29 ` Luck, Tony
  2005-02-15 19:31 ` Robin Holt
                   ` (9 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2005-02-15 18:29 UTC (permalink / raw)
  To: linux-ia64

Robin> For the 4-level case with 64k pages, what about just using
Robin> 16k page tables?  That leaves us with 60 bits of addressable
Robin> space which is fairly close to the full space.

David> You're kidding, right?  If getting "fairly close" to 64 bits was good
David> enough, there would be no point for 64KB pages.

User space will only end up with just over 63.5 bits since the kernel
has grabbed regions 5, 6, 7 for itself.  Unless someone comes up with
a 16EB+16EB patch analogous to the 4G+4G ia32 patch :-)

We also lose some space out of each region for the VHPT.

Region 4 is reserved for hugetlb pages.

Page 0 of region 0 is a NaT page.

So we fall short of a full, flat 64-bit address space for a variety
of reasons.

Nonetheless, I agree with David that reducing available virtual
space by a factor of 16 sounds like a poor idea.

-Tony


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (11 preceding siblings ...)
  2005-02-15 18:29 ` Luck, Tony
@ 2005-02-15 19:31 ` Robin Holt
  2005-02-15 19:46 ` David Mosberger
                   ` (8 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-15 19:31 UTC (permalink / raw)
  To: linux-ia64

On Tue, Feb 15, 2005 at 10:29:51AM -0800, Luck, Tony wrote:
> Robin> For the 4-level case with 64k pages, what about just using
> Robin> 16k page tables?  That leaves us with 60 bits of addressable
> Robin> space which is fairly close to the full space.
> 
> David> You're kidding, right?  If getting "fairly close" to 64 bits was good
> David> enough, there would be no point for 64KB pages.
> 
> User space will only end up with just over 63.5 bits since the kernel
> has grabbed regions 5, 6, 7 for itself.  Unless someone comes up with
> a 16EB+16EB patch analogous to the 4G+4G ia32 patch :-)
> 
> We also lose some space out of each region for the VHPT.
> 
> Region 4 is reserved for hugetlb pages.
> 
> Page 0 of region 0 is a NaT page.
> 
> So we fall short of a full, flat 64-bit address space for a variety
> of reasons.
> 
> Nonetheless, I agree with David that reducing available virtual
> space by a factor of 16 sounds like a poor idea.

We do not currently foresee a need to go beyond 4 levels of 16k page
tables with 16k pages.  If we do, jumping to 4 levels of 16k page tables
with 64k pages gets our address space to 60 bits.
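
(The arithmetic: a 16k table holds 16k/8 = 2^11 eight-byte entries, so
four levels resolve 4*11 = 44 bits; add 16 bits of offset within a 64k
page for 60 bits total, or 14 bits with 16k pages for 58 bits.)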

With short format page tables, spanning regions is not possible.  We do
not see that as an issue since a 60 bit virtual address space should
meet our customer needs for the long-term future.

Is there a strong objection to implementing 4 levels of 16k page tables
for now and then, if someone else sees a need, converting to the long
page table format and adjusting page tables as needed at that point in time?

At this point in time, I think the consistent 16k allocations would be
better for page table reuse than having 1k allocations intermingled with
64k allocations.

Am I off target on this completely?  Have I missed something important?

Thanks,
Robin


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (12 preceding siblings ...)
  2005-02-15 19:31 ` Robin Holt
@ 2005-02-15 19:46 ` David Mosberger
  2005-02-15 19:57 ` Robin Holt
                   ` (7 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: David Mosberger @ 2005-02-15 19:46 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Tue, 15 Feb 2005 13:31:48 -0600, Robin Holt <holt@sgi.com> said:

  Robin> Is there a strong objection to implementing 4 levels of 16k
  Robin> page tables for now and then, if someone else sees a need,
  Robin> converting to the long page table format and adjusting page
  Robin> tables as needed at that point in time?

I certainly don't want to switch my machines to 4-levels.  There is
zero need for that on the machines I use, so why pay the overhead of
an extra level?

BTW: why do you want 4-level/16KB over 3-level/64KB?

	--david


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (13 preceding siblings ...)
  2005-02-15 19:46 ` David Mosberger
@ 2005-02-15 19:57 ` Robin Holt
  2005-02-15 19:59 ` Robin Holt
                   ` (6 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-15 19:57 UTC (permalink / raw)
  To: linux-ia64

On Tue, Feb 15, 2005 at 11:46:21AM -0800, David Mosberger wrote:
> >>>>> On Tue, 15 Feb 2005 13:31:48 -0600, Robin Holt <holt@sgi.com> said:
> 
>   Robin> Is there a strong objection to implementing 4 levels of 16k
>   Robin> page tables for now and then, if someone else sees a need,
>   Robin> convert to using long page table format and adjust page
>   Robin> tables as needed at that point in time.
> 
> I certainly don't want to switch my machines to 4-levels.  There is
> zero need for that on the machines I use, so why pay the overhead of
> an extra level?
> 
> BTW: why do you want 4-level/16KB over 3-level/64KB?

There are two reasons that have been given to me.

The first is that most of the software vendors are only certifying
their applications to work on 16kb pages.  This seems to follow what
the distributions are doing as well.

Additionally, we have seen that performance on certain file server
benchmarks are better with 16kb pages and expect that there will be a
continued demand by some of our customers to stay with 16kb pages.

Thanks,
Robin


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (14 preceding siblings ...)
  2005-02-15 19:57 ` Robin Holt
@ 2005-02-15 19:59 ` Robin Holt
  2005-02-15 20:03 ` David Mosberger
                   ` (5 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-15 19:59 UTC (permalink / raw)
  To: linux-ia64

On Tue, Feb 15, 2005 at 11:46:21AM -0800, David Mosberger wrote:
> >>>>> On Tue, 15 Feb 2005 13:31:48 -0600, Robin Holt <holt@sgi.com> said:
> 
>   Robin> Is there a strong objection to implementing 4 levels of 16k
>   Robin> page tables for now and then, if someone else sees a need,
>   Robin> convert to using long page table format and adjust page
>   Robin> tables as needed at that point in time.
> 
> I certainly don't want to switch my machines to 4-levels.  There is
> zero need for that on the machines I use, so why pay the overhead of
> an extra level?

I missed the "why the extra level" question.  That part is motivated by
some large MPI jobs that need a fourth level to get a large enough single
mapping to cover their entire dataset.  We have already tripped this
limitation for a couple of our really large customers.

Thanks,
Robin


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (15 preceding siblings ...)
  2005-02-15 19:59 ` Robin Holt
@ 2005-02-15 20:03 ` David Mosberger
  2005-02-15 20:08 ` Robin Holt
                   ` (4 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: David Mosberger @ 2005-02-15 20:03 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Tue, 15 Feb 2005 13:59:45 -0600, Robin Holt <holt@sgi.com> said:

  >>  I certainly don't want to switch my machines to 4-levels.  There
  >> is zero need for that on the machines I use, so why pay the
  >> overhead of an extra level?

  Robin> I missed the "why the extra level" question.  That part is
  Robin> motivated by some large MPI jobs that need a fourth level to get a
  Robin> large enough single mapping to cover their entire dataset.
  Robin> We have already tripped this limitation for a couple of our
  Robin> really large customers.

What I'm saying is that the default probably should stay at 3-levels.
Optimizing ia64 linux _just_ for "really large customers" would be
a bad direction to go in.


	--david


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (16 preceding siblings ...)
  2005-02-15 20:03 ` David Mosberger
@ 2005-02-15 20:08 ` Robin Holt
  2005-02-15 20:15 ` William Lee Irwin III
                   ` (3 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: Robin Holt @ 2005-02-15 20:08 UTC (permalink / raw)
  To: linux-ia64

On Tue, Feb 15, 2005 at 12:03:22PM -0800, David Mosberger wrote:
> >>>>> On Tue, 15 Feb 2005 13:59:45 -0600, Robin Holt <holt@sgi.com> said:
> 
>   >>  I certainly don't want to switch my machines to 4-levels.  There
>   >> is zero need for that on the machines I use, so why pay the
>   >> overhead of an extra level?
> 
>   Robin> I missed the "why the extra level" question.  That part is
>   Robin> motivated by some large MPI jobs that need a fourth level to get a
>   Robin> large enough single mapping to cover their entire dataset.
>   Robin> We have already tripped this limitation for a couple of our
>   Robin> really large customers.
> 
> What I'm saying is that the default probably should stay at 3-levels.
> Optimizing ia64 linux _just_ for "really large customers" would be
> a bad direction to go in.

I agree.  What I am asking is if we turn on the 4th level of page tables,
does anybody have an objection to 4/16k levels as opposed to 3/PAGE_SIZE
plus a 4th cleanup to cover the entire address space?

Thanks,
Robin


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (17 preceding siblings ...)
  2005-02-15 20:08 ` Robin Holt
@ 2005-02-15 20:15 ` William Lee Irwin III
  2005-02-15 20:25 ` Luck, Tony
                   ` (2 subsequent siblings)
  21 siblings, 0 replies; 23+ messages in thread
From: William Lee Irwin III @ 2005-02-15 20:15 UTC (permalink / raw)
  To: linux-ia64

On Tue, 15 Feb 2005 13:31:48 -0600, Robin Holt <holt@sgi.com> said:
Robin> Is there a strong objection to implementing 4 levels of 16k
Robin> page tables for now and then, if someone else sees a need,
Robin> converting to the long page table format and adjusting page
Robin> tables as needed at that point in time?

On Tue, Feb 15, 2005 at 11:46:21AM -0800, David Mosberger wrote:
> I certainly don't want to switch my machines to 4-levels.  There is
> zero need for that on the machines I use, so why pay the overhead of
> an extra level?
> BTW: why do you want 4-level/16KB over 3-level/64KB?

It may be worth noticing that things like fault latency and small
writes degrade with increased pagesize, though I seem to recall hearing
that this effect was less pronounced or did not set in at 64KB on ia64.


-- wli


* RE: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (18 preceding siblings ...)
  2005-02-15 20:15 ` William Lee Irwin III
@ 2005-02-15 20:25 ` Luck, Tony
  2005-02-15 20:26 ` David Mosberger
  2005-02-17 17:22 ` Jack Steiner
  21 siblings, 0 replies; 23+ messages in thread
From: Luck, Tony @ 2005-02-15 20:25 UTC (permalink / raw)
  To: linux-ia64

>I agree.  What I am asking is if we turn on the 4th level of 
>page tables, does anybody have an objection to 4/16k levels as opposed to 
>3/PAGE_SIZE plus a 4th cleanup to cover the entire address space?

Are there any other big system users now (or coming real soon) for
whom 2**60 (with PAGE_SIZE=64k, 2**58 with PAGE_SIZE=16k) would
be a major pain?

You'd have to do some fancy footwork documenting and enforcing
the PAGE_SIZE vs. PAGE_TABLE_LEVELS interactions (it would be
bad to configure a 4K or 8K pagesize with a 4-level option that
expected to allocate 16K for page tables).

The PAGE_SIZE=64k 4/16k case requires you to deal with partial pages
on the quicklists (or in the slab if you go that route).  The slab
code knows how to coalesce and free such sub-pages under memory
pressure (though fragmentation may make it sometimes ineffective).
The quicklist approach doesn't have any easy way to return partial
pages to general use.

-Tony


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (19 preceding siblings ...)
  2005-02-15 20:25 ` Luck, Tony
@ 2005-02-15 20:26 ` David Mosberger
  2005-02-17 17:22 ` Jack Steiner
  21 siblings, 0 replies; 23+ messages in thread
From: David Mosberger @ 2005-02-15 20:26 UTC (permalink / raw)
  To: linux-ia64

>>>>> On Tue, 15 Feb 2005 12:15:19 -0800, William Lee Irwin III <wli@holomorphy.com> said:

  >> I certainly don't want to switch my machines to 4-levels.  There
  >> is zero need for that on the machines I use, so why pay the
  >> overhead of an extra level?  BTW: why do you want 4-level/16KB
  >> over 3-level/64KB?

  William> It may be worth noticing that things like fault latency and
  William> small writes degrade with increased pagesize, though I seem
  William> to recall hearing that this effect was less pronounced or
  William> did not set in at 64KB on ia64.

Certainly, but fault latency also increases with the number of levels.
And there are real customers who care about page-fault latency a lot
and run apps where the probability of the page-table entries being
in the cache is close to zero...

I'm not advocating one over the other --- I'm pro choice... ;-)

	--david


* Re: [PATCH] Convert pgtable cache to slab
  2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
                   ` (20 preceding siblings ...)
  2005-02-15 20:26 ` David Mosberger
@ 2005-02-17 17:22 ` Jack Steiner
  21 siblings, 0 replies; 23+ messages in thread
From: Jack Steiner @ 2005-02-17 17:22 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 2625 bytes --]

> 
> Jack Steiner is trying to determine how much more expensive the off-node
> page tables are than on node.  Once we know that, we will know if a
> per-cpu or per-node quicklist type arrangement is truly beneficial or
> if a general use slab without node awareness will be adequate.
> 
> Thanks,
> Robin




Here are the results of a test that measures the benefit of
node-local page tables vs page tables located on a remote
node. This was run on a 64p Altix system using 900MHz/1.5M L3
cpus.

The test is very likely a ~worst-case test. It was contrived to
maximize the number of TLB misses to page table entries that
are not in the cache of the processor. The test references
memory with a 32M+128 byte stride. This maximizes data cache
hits & minimizes PT cache hits. The page table pages were all 
allocated on 256K boundaries to ensure maximum cache conflicts
for PT references by the VHPT.

Granted, this is a worst-case test, but I wanted to see
if I could measure the effect. Although this test is atypical of
most apps, it is similar to some very large pointer chasing
applications in the real world.

The test essentially follows a linked list of pointers. When the number of 
entries in the list exceeds the size of the TLB, TLB misses occur &
the processor must make references to the in-memory page table.
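
A minimal sketch of the pointer-chasing loop (illustrative only;
STRIDE, build_list and chase are made-up names, and allocation,
timing and page-table placement are omitted):

	#define STRIDE	((32UL << 20) + 128)	/* 32M+128 byte stride */

	/* Link n locations into a cycle, one per stride. */
	static void **
	build_list (char *base, long n)
	{
		long i;

		for (i = 0; i < n; i++)
			*(void **)(base + i*STRIDE) = base + ((i + 1) % n)*STRIDE;
		return (void **)base;
	}

	/*
	 * Each load fetches the next pointer; once n exceeds the TLB
	 * capacity, nearly every load also takes a TLB miss and an
	 * in-memory page table reference via the VHPT.
	 */
	static void *
	chase (void **p, long loads)
	{
		while (loads-- > 0)
			p = (void **)*p;
		return (void *)p;	/* defeat dead-code elimination */
	}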

      Memory Reference Time (ns/reference) for Pointer Chasing Test

 POINTERS    PT_LOCAL    PT_REMOTE
       1        5.621        5.621
      10        4.226        4.226
      20        3.224        3.224
      30        2.890        2.890
      40       11.418       11.418
      50       11.357       11.357
      60       11.316       11.316
      70       11.287       11.286
      80       11.329       11.302
      90       11.292       12.082
     100       11.272       12.763
     110       11.974       16.075
     120       51.758      132.389  << exceeds TLB capacity here
     130      119.311      382.279
     140      143.726      466.469
     150      158.851      495.946
     160      164.415      515.392
     170      168.822      525.382
     180      168.057      537.250
     190      173.043      536.515
     300      192.804      632.459
     400      204.916      666.561
     500      230.489      693.896
     600      256.716      725.338
     700      286.068      731.309
     800      293.407      740.035
     900      306.296      747.635


I've also attached a graph. It is prettier but may not be friendly
to all mail readers.
-- 
Thanks

Jack Steiner (steiner@sgi.com)          651-683-5302
Principal Engineer                      SGI - Silicon Graphics, Inc.



[-- Attachment #2: z.png --]
[-- Type: image/png, Size: 5523 bytes --]


end of thread

Thread overview: 23+ messages
2004-10-18 22:52 [PATCH] Convert pgtable cache to slab Martin K. Petersen
2004-10-19  4:08 ` Luck, Tony
2004-10-19 15:59 ` Martin K. Petersen
2005-02-10 20:29 ` Robin Holt
2005-02-10 20:38 ` Luck, Tony
2005-02-11 18:35 ` Robin Holt
2005-02-11 18:51 ` Luck, Tony
2005-02-11 19:33 ` Robin Holt
2005-02-14 16:33 ` Robin Holt
2005-02-14 19:18 ` Luck, Tony
2005-02-15 12:02 ` Robin Holt
2005-02-15 18:07 ` David Mosberger
2005-02-15 18:29 ` Luck, Tony
2005-02-15 19:31 ` Robin Holt
2005-02-15 19:46 ` David Mosberger
2005-02-15 19:57 ` Robin Holt
2005-02-15 19:59 ` Robin Holt
2005-02-15 20:03 ` David Mosberger
2005-02-15 20:08 ` Robin Holt
2005-02-15 20:15 ` William Lee Irwin III
2005-02-15 20:25 ` Luck, Tony
2005-02-15 20:26 ` David Mosberger
2005-02-17 17:22 ` Jack Steiner
