* [RFC] Convert pgtable cache to slab
@ 2004-10-13 18:47 Martin K. Petersen
2004-10-13 18:57 ` Jesse Barnes
` (16 more replies)
0 siblings, 17 replies; 18+ messages in thread
From: Martin K. Petersen @ 2004-10-13 18:47 UTC (permalink / raw)
To: linux-ia64
This patch makes the page table cache on IA-64 use the slab instead of
standard page allocations. It's based upon Bill Irwin's code for
ppc64.
With this patch I got a significant improvement in page fault time.
Standard 2.4.x was about 700ns on average. Generic 2.6.9rc4 is 3-4
usec, whereas the slabified pgtcache drops us back down to 600-700 ns.
Tested on zx1 and sn2.
The biggest caveat is that I've had to postpone setting up the gate
page until the pgt slab has been initialized. That wasn't an issue
with the existing page-based allocation scheme. David - how do you
prefer I handle this?
Comments?
--
Martin K. Petersen Wild Open Source, Inc.
mkp@wildopensource.com http://www.wildopensource.com/
arch/ia64/mm/contig.c | 1
arch/ia64/mm/discontig.c | 1
arch/ia64/mm/init.c | 41 +++++++-------
include/asm-ia64/pgalloc.h | 127 ++++++++++-----------------------------------
include/asm-ia64/pgtable.h | 5 -
5 files changed, 50 insertions(+), 125 deletions(-)
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/arch/ia64/mm/contig.c pgtable-slab/arch/ia64/mm/contig.c
--- linux-pristine/arch/ia64/mm/contig.c 2004-10-11 14:57:16.000000000 -0700
+++ pgtable-slab/arch/ia64/mm/contig.c 2004-10-11 14:59:09.000000000 -0700
@@ -60,7 +60,6 @@
printk("%d reserved pages\n", reserved);
printk("%d pages shared\n", shared);
printk("%d pages swap cached\n", cached);
- printk("%ld pages in page table cache\n", pgtable_cache_size);
}
/* physical address where the bootmem map is located */
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/arch/ia64/mm/discontig.c pgtable-slab/arch/ia64/mm/discontig.c
--- linux-pristine/arch/ia64/mm/discontig.c 2004-10-11 14:57:16.000000000 -0700
+++ pgtable-slab/arch/ia64/mm/discontig.c 2004-10-11 15:10:05.000000000 -0700
@@ -540,7 +540,6 @@
printk("%d reserved pages\n", total_reserved);
printk("%d pages shared\n", total_shared);
printk("%d pages swap cached\n", total_cached);
- printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
printk("%d free buffer pages\n", nr_free_buffer_pages());
}
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/arch/ia64/mm/init.c pgtable-slab/arch/ia64/mm/init.c
--- linux-pristine/arch/ia64/mm/init.c 2004-10-11 14:57:16.000000000 -0700
+++ pgtable-slab/arch/ia64/mm/init.c 2004-10-11 15:04:55.000000000 -0700
@@ -56,26 +56,6 @@
EXPORT_SYMBOL(zero_page_memmap_ptr);
void
-check_pgt_cache (void)
-{
- int low, high;
-
- low = pgt_cache_water[0];
- high = pgt_cache_water[1];
-
- preempt_disable();
- if (pgtable_cache_size > (u64) high) {
- do {
- if (pgd_quicklist)
- free_page((unsigned long)pgd_alloc_one_fast(NULL));
- if (pmd_quicklist)
- free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
- } while (pgtable_cache_size > (u64) low);
- }
- preempt_enable();
-}
-
-void
update_mmu_cache (struct vm_area_struct *vma, unsigned long vaddr, pte_t pte)
{
unsigned long addr;
@@ -584,6 +564,27 @@
if (!fsyscall_table[i] || nolwsys)
fsyscall_table[i] = sys_call_table[i] | 1;
}
+}
+
+kmem_cache_t *zero_cache;
+
+static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
+{
+ memset(pte, 0, PAGE_SIZE);
+}
+
+void pgtable_cache_init(void)
+{
+ printk(KERN_INFO "pgtable_cache_init()\n");
+ zero_cache = kmem_cache_create("zero",
+ PAGE_SIZE,
+ 0,
+ SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+ zero_ctor,
+ NULL);
+ if (!zero_cache)
+ panic("pgtable_cache_init(): could not create zero_cache!\n");
+
setup_gate();
#ifdef CONFIG_IA32_SUPPORT
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/include/asm-ia64/pgalloc.h pgtable-slab/include/asm-ia64/pgalloc.h
--- linux-pristine/include/asm-ia64/pgalloc.h 2004-08-13 22:36:12.000000000 -0700
+++ pgtable-slab/include/asm-ia64/pgalloc.h 2004-10-11 14:59:14.000000000 -0700
@@ -17,65 +17,26 @@
#include <linux/compiler.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/page-flags.h>
#include <linux/threads.h>
#include <asm/mmu_context.h>
#include <asm/processor.h>
+#include <asm/pgtable.h>
-/*
- * Very stupidly, we used to get new pgd's and pmd's, init their contents
- * to point to the NULL versions of the next level page table, later on
- * completely re-init them the same way, then free them up. This wasted
- * a lot of work and caused unnecessary memory traffic. How broken...
- * We fix this by caching them.
- */
-#define pgd_quicklist (local_cpu_data->pgd_quick)
-#define pmd_quicklist (local_cpu_data->pmd_quick)
-#define pgtable_cache_size (local_cpu_data->pgtable_cache_sz)
-
-static inline pgd_t*
-pgd_alloc_one_fast (struct mm_struct *mm)
-{
- unsigned long *ret = NULL;
-
- preempt_disable();
-
- ret = pgd_quicklist;
- if (likely(ret != NULL)) {
- pgd_quicklist = (unsigned long *)(*ret);
- ret[0] = 0;
- --pgtable_cache_size;
- } else
- ret = NULL;
-
- preempt_enable();
-
- return (pgd_t *) ret;
-}
+extern kmem_cache_t *zero_cache;
static inline pgd_t*
pgd_alloc (struct mm_struct *mm)
{
- /* the VM system never calls pgd_alloc_one_fast(), so we do it here. */
- pgd_t *pgd = pgd_alloc_one_fast(mm);
-
- if (unlikely(pgd == NULL)) {
- pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
- if (likely(pgd != NULL))
- clear_page(pgd);
- }
- return pgd;
+ return kmem_cache_alloc(zero_cache, GFP_KERNEL);
}
static inline void
pgd_free (pgd_t *pgd)
{
- preempt_disable();
- *(unsigned long *)pgd = (unsigned long) pgd_quicklist;
- pgd_quicklist = (unsigned long *) pgd;
- ++pgtable_cache_size;
- preempt_enable();
+ kmem_cache_free(zero_cache, pgd);
}
static inline void
@@ -86,92 +47,62 @@
static inline pmd_t*
-pmd_alloc_one_fast (struct mm_struct *mm, unsigned long addr)
-{
- unsigned long *ret = NULL;
-
- preempt_disable();
-
- ret = (unsigned long *)pmd_quicklist;
- if (likely(ret != NULL)) {
- pmd_quicklist = (unsigned long *)(*ret);
- ret[0] = 0;
- --pgtable_cache_size;
- }
-
- preempt_enable();
-
- return (pmd_t *)ret;
-}
-
-static inline pmd_t*
pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
{
- pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
-
- if (likely(pmd != NULL))
- clear_page(pmd);
- return pmd;
+ return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
}
static inline void
pmd_free (pmd_t *pmd)
{
- preempt_disable();
- *(unsigned long *)pmd = (unsigned long) pmd_quicklist;
- pmd_quicklist = (unsigned long *) pmd;
- ++pgtable_cache_size;
- preempt_enable();
+ kmem_cache_free(zero_cache, pmd);
}
#define __pmd_free_tlb(tlb, pmd) pmd_free(pmd)
static inline void
-pmd_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
-{
- pmd_val(*pmd_entry) = page_to_phys(pte);
-}
-
-static inline void
pmd_populate_kernel (struct mm_struct *mm, pmd_t *pmd_entry, pte_t *pte)
{
pmd_val(*pmd_entry) = __pa(pte);
}
-static inline struct page *
-pte_alloc_one (struct mm_struct *mm, unsigned long addr)
+static inline void
+pmd_populate (struct mm_struct *mm, pmd_t *pmd_entry, struct page *pte)
{
- struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
-
- if (likely(pte != NULL))
- clear_page(page_address(pte));
- return pte;
+ pmd_val(*pmd_entry) = page_to_phys(pte);
}
+
static inline pte_t *
pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr)
{
- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
-
- if (likely(pte != NULL))
- clear_page(pte);
- return pte;
+ return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
}
static inline void
-pte_free (struct page *pte)
+pte_free_kernel (pte_t *pte)
{
- __free_page(pte);
+ kmem_cache_free(zero_cache, pte);
}
-static inline void
-pte_free_kernel (pte_t *pte)
+static inline struct page *
+pte_alloc_one (struct mm_struct *mm, unsigned long addr)
{
- free_page((unsigned long) pte);
+ pte_t *pte = pte_alloc_one_kernel(mm, addr);
+
+ if (pte)
+ return virt_to_page(pte);
+
+ return NULL;
}
-#define __pte_free_tlb(tlb, pte) tlb_remove_page((tlb), (pte))
+static inline void
+pte_free (struct page *pte)
+{
+ pte_free_kernel(page_address(pte));
+}
-extern void check_pgt_cache (void);
+#define __pte_free_tlb(tlb, pte) pte_free(pte)
+#define check_pgt_cache() do { } while (0)
#endif /* _ASM_IA64_PGALLOC_H */
diff -urN -X /usr/people/mkp/bin/dontdiff linux-pristine/include/asm-ia64/pgtable.h pgtable-slab/include/asm-ia64/pgtable.h
--- linux-pristine/include/asm-ia64/pgtable.h 2004-10-11 14:57:19.000000000 -0700
+++ pgtable-slab/include/asm-ia64/pgtable.h 2004-10-11 14:59:14.000000000 -0700
@@ -542,11 +542,6 @@
#define KERNEL_TR_PAGE_SHIFT _PAGE_SIZE_64M
#define KERNEL_TR_PAGE_SIZE (1 << KERNEL_TR_PAGE_SHIFT)
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
/* These tell get_user_pages() that the first gate page is accessible from user-level. */
#define FIXADDR_USER_START GATE_ADDR
#define FIXADDR_USER_END (GATE_ADDR + 2*PERCPU_PAGE_SIZE)
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
@ 2004-10-13 18:57 ` Jesse Barnes
2004-10-13 18:59 ` William Lee Irwin III
` (15 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Jesse Barnes @ 2004-10-13 18:57 UTC (permalink / raw)
To: linux-ia64
On Wednesday, October 13, 2004 1:47 pm, Martin K. Petersen wrote:
> This patch makes the page table cache on IA-64 use the slab instead of
> standard page allocations. It's based upon Bill Irwin's code for
> ppc64.
>
> With this patch I got a significant improvement in page fault time.
> Standard 2.4.x was about 700ns on average. Generic 2.6.9rc4 is 3-4
> usec, whereas the slabified pgtcache drops us back down to 600-700 ns.
>
> Tested on zx1 and sn2.
>
> The biggest caveat is that I've had to postpone setting up the gate
> page until the pgt slab has been initialized. That wasn't an issue
> with the existing page-based allocation scheme. David - how do you
> prefer I handle this?
>
> Comments?
Looks really nice, it's good to see a diffstat with "5 files changed, 50
insertions(+), 125 deletions(-)" that actually speeds things up and makes the
code more sensible.
Jesse
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
2004-10-13 18:57 ` Jesse Barnes
@ 2004-10-13 18:59 ` William Lee Irwin III
2004-10-13 19:07 ` Luck, Tony
` (14 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: William Lee Irwin III @ 2004-10-13 18:59 UTC (permalink / raw)
To: linux-ia64
On Wed, Oct 13, 2004 at 02:47:43PM -0400, Martin K. Petersen wrote:
> This patch makes the page table cache on IA-64 use the slab instead of
> standard page allocations. It's based upon Bill Irwin's code for
> ppc64.
> With this patch I got a significant improvement in page fault time.
> Standard 2.4.x was about 700ns on average. Generic 2.6.9rc4 is 3-4
> usec, whereas the slabified pgtcache drops us back down to 600-700 ns.
> Tested on zx1 and sn2.
> The biggest caveat is that I've had to postpone setting up the gate
> page until the pgt slab has been initialized. That wasn't an issue
> with the existing page-based allocation scheme. David - how do you
> prefer I handle this?
Nice! I got reports that it would not be beneficial when I thought
about going over this earlier. I suppose it's a small vindication
of my methods to see the original objection contradicted here. =)
-- wli
^ permalink raw reply [flat|nested] 18+ messages in thread
* RE: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
2004-10-13 18:57 ` Jesse Barnes
2004-10-13 18:59 ` William Lee Irwin III
@ 2004-10-13 19:07 ` Luck, Tony
2004-10-13 19:28 ` Jesse Barnes
` (13 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Luck, Tony @ 2004-10-13 19:07 UTC (permalink / raw)
To: linux-ia64
>With this patch I got a significant improvement in page fault time.
>Standard 2.4.x was about 700ns on average. Generic 2.6.9rc4 is 3-4
>usec, whereas the slabified pgtcache drops us back down to 600-700 ns.
What was the benchmark you were running when you measured these?
>Tested on zx1 and sn2.
How does this scale on big sn2 systems? I assume that the answers are
going to be just fine as the slab allocator per-cpu lists should be
just as nice as local quicklists, but it would be nice to see some
data.
>The biggest caveat is that I've had to postpone setting up the gate
>page until the pgt slab has been initialized. That wasn't an issue
>with the existing page-based allocation scheme. David - how do you
>prefer I handle this?
I don't see the code for this postponement in this patch.
>Comments?
Overall looks nice ... less code, and goes faster too, what more
could we ask for!
Perhaps "zero_cache" isn't as descriptive a name as it might be (not
that I have any better suggestions :-(
-Tony
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (2 preceding siblings ...)
2004-10-13 19:07 ` Luck, Tony
@ 2004-10-13 19:28 ` Jesse Barnes
2004-10-13 19:32 ` Martin K. Petersen
` (12 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Jesse Barnes @ 2004-10-13 19:28 UTC (permalink / raw)
To: linux-ia64
On Wednesday, October 13, 2004 2:07 pm, Luck, Tony wrote:
> >With this patch I got a significant improvement in page fault time.
> >Standard 2.4.x was about 700ns on average. Generic 2.6.9rc4 is 3-4
> >usec, whereas the slabified pgtcache drops us back down to 600-700 ns.
>
> What was the benchmark you were running when you measured these?
>
> >Tested on zx1 and sn2.
>
> How does this scale on big sn2 systems? I assume that the answers are
> going to be just fine as the slab allocator per-cpu lists should be
> just as nice as local quicklists, but it would be nice to see some
> data.
Yeah, it would be interesting, though page faults in general don't scale very
well due to the way page_table_lock is used. Christoph had some patches for
that, but I don't think he's had time to work on them lately.
> Overall looks nice ... less code, and goes faster too, what more
> could we ask for!
>
> Perhaps "zero_cache" isn't as descriptive a name as it might be (not
> that I have any better suggestions :-(
Maybe pgtable_zero_cache or something?
Jesse
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (3 preceding siblings ...)
2004-10-13 19:28 ` Jesse Barnes
@ 2004-10-13 19:32 ` Martin K. Petersen
2004-10-13 19:32 ` Luck, Tony
` (11 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Martin K. Petersen @ 2004-10-13 19:32 UTC (permalink / raw)
To: linux-ia64
>>>>> "Tony" = Luck, Tony <tony.luck@intel.com> writes:
Tony,
Tony> What was the benchmark you were running when you measured these?
I'm going to let Robin send you his test case.
Tony> How does this scale on big sn2 systems? I assume that the
Tony> answers are going to be just fine as the slab allocator per-cpu
Tony> lists should be just as nice as local quicklists, but it would
Tony> be nice to see some data.
The tests above are from a 64-way Altix.
I'm still looking into the issue of freed data ending up in other
nodes' slabs. Don't know if anybody else has been working on this?
>> The biggest caveat is that I've had to postpone setting up the gate
>> page until the pgt slab has been initialized. That wasn't an issue
>> with the existing page-based allocation scheme. David - how do you
>> prefer I handle this?
Tony> I don't see the code for this postponement in this patch.
It's there. End of the mm/init.c hunk.
Tony> Perhaps "zero_cache" isn't as descriptive a name as it might be
Tony> (not that I have any better suggestions :-(
Yeah, this is what Bill called it on ppc64. pgtable_cache perhaps?
--
Martin K. Petersen Wild Open Source, Inc.
mkp@wildopensource.com http://www.wildopensource.com/
^ permalink raw reply [flat|nested] 18+ messages in thread
* RE: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (4 preceding siblings ...)
2004-10-13 19:32 ` Martin K. Petersen
@ 2004-10-13 19:32 ` Luck, Tony
2004-10-13 19:42 ` Martin K. Petersen
` (10 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Luck, Tony @ 2004-10-13 19:32 UTC (permalink / raw)
To: linux-ia64
>> Perhaps "zero_cache" isn't as descriptive a name as it might be (not
>> that I have any better suggestions :-(
>
>Maybe pgtable_zero_cache or something?
I thought along those lines at first, but actually this cache would be
useful for any allocation that needed a page that has been pre-zeroed.
So I don't think that the "pgtable" part is helpful. Some sense of the
fact that the objects are one page big is what I was looking for.
-Tony
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (5 preceding siblings ...)
2004-10-13 19:32 ` Luck, Tony
@ 2004-10-13 19:42 ` Martin K. Petersen
2004-10-13 19:49 ` Robin Holt
` (9 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Martin K. Petersen @ 2004-10-13 19:42 UTC (permalink / raw)
To: linux-ia64
>>>>> "Tony" = Luck, Tony <tony.luck@intel.com> writes:
>>> Perhaps "zero_cache" isn't as descriptive a name as it might be
>>> (not that I have any better suggestions :-(
>> Maybe pgtable_zero_cache or something?
Tony> I thought along those lines at first, but actually this cache
Tony> would be useful for any allocation that needed a page that has
Tony> been pre-zeroed. So I don't think that the "pgtable" part is
Tony> helpful. Some sense of the fact that the objects are one page
Tony> big is what I was looking for.
Yeah. I suspect that's why Bill named it that in the first place.
What about zeroed_page_cache?
Now, this opens up another can of worms in terms of where to
initialize that cache in the first place. pgtable_cache_init() might
not be the best place if it's generally used. Only requirement is
that it needs to be early.
--
Martin K. Petersen Silicon Graphics, Inc.
mkp@sgi.com http://www.sgi.com/
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (6 preceding siblings ...)
2004-10-13 19:42 ` Martin K. Petersen
@ 2004-10-13 19:49 ` Robin Holt
2004-10-13 19:50 ` Robin Holt
` (8 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Robin Holt @ 2004-10-13 19:49 UTC (permalink / raw)
To: linux-ia64
On Wed, Oct 13, 2004 at 03:32:43PM -0400, Martin K. Petersen wrote:
> >>>>> "Tony" = Luck, Tony <tony.luck@intel.com> writes:
>
> Tony,
>
> Tony> What was the benchmark you were running when you measured these?
>
> I'm going to let Robin send you his test case.
Martin, can you send your updated test case. I believe yours is just mine
with a higher resolution timer. If you are uncomfortable with that, let
me know.
>
>
> Tony> How does this scale on big sn2 systems? I assume that the
> Tony> answers are going to be just fine as the slab allocator per-cpu
> Tony> lists should be just as nice as local quicklists, but it would
> Tony> be nice to see some data.
>
> The tests above are from a 64-way Altix.
>
> I'm still looking into the issue of freed data ending up in other
> nodes' slabs. Don't know if anybody else have been working on this?
When I tried to measure these on a large multi-threaded job, the difference
got lost in the noise.
>
>
> >> The biggest caveat is that I've had to postpone setting up the gate
> >> page until the pgt slab has been initialized. That wasn't an issue
> >> with the existing page-based allocation scheme. David - how do you
> >> prefer I handle this?
>
> Tony> I don't see the code for this postponement in this patch.
>
> It's there. End of the mm/init.c hunk.
>
>
> Tony> Perhaps "zero_cache" isn't as descriptive a name as it might be
> Tony> (not that I have any better suggestions :-(
>
> Yeah, this is what Bill called it on ppc64. pgtable_cache perhaps?
If we are renaming things, can we rename zero_ctor to something a little
more sensible.
>
> --
> Martin K. Petersen Wild Open Source, Inc.
> mkp@wildopensource.com http://www.wildopensource.com/
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (7 preceding siblings ...)
2004-10-13 19:49 ` Robin Holt
@ 2004-10-13 19:50 ` Robin Holt
2004-10-13 20:23 ` Luck, Tony
` (7 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Robin Holt @ 2004-10-13 19:50 UTC (permalink / raw)
To: linux-ia64
On Wed, Oct 13, 2004 at 03:42:48PM -0400, Martin K. Petersen wrote:
> >>>>> "Tony" = Luck, Tony <tony.luck@intel.com> writes:
>
> >>> Perhaps "zero_cache" isn't as descriptive a name as it might be
> >>> (not that I have any better suggestions :-(
> >> Maybe pgtable_zero_cache or something?
>
> Tony> I thought along those lines at first, but actually this cache
> Tony> would be useful for any allocation that needed a page that has
> Tony> been pre-zeroed. So I don't think that the "pgtable" part is
> Tony> helpful. Some sense of the fact that the objects are one page
> Tony> big is what I was looking for.
>
> Yeah. I suspect that's why Bill named it that in the first place.
>
> What about zeroed_page_cache?
>
> Now, this opens up another can of worms in terms of where to
> initialize that cache in the first place. pgtable_cache_init() might
> not be the best place if it's generally used. Only requirement is
> that it needs to be early.
Why don't we start with it here and move the init when we know who
the other users will be. Just my 2 cents.
Robin
>
> --
> Martin K. Petersen Silicon Graphics, Inc.
> mkp@sgi.com http://www.sgi.com/
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 18+ messages in thread
* RE: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (8 preceding siblings ...)
2004-10-13 19:50 ` Robin Holt
@ 2004-10-13 20:23 ` Luck, Tony
2004-10-13 20:34 ` Martin K. Petersen
` (6 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Luck, Tony @ 2004-10-13 20:23 UTC (permalink / raw)
To: linux-ia64
>Tony> I don't see the code for this postponement in this patch.
>
>It's there. End of the mm/init.c hunk.
Ok, I see it now ... the perils of reading the patch rather than
applying it and looking at the code that you end up with. I see
that ia32_mem_init() got dragged along too.
This is pretty messy in terms of code legibility. I think that
the right thing to do is get zero_cache (or whatever we call it)
initialized way, way early ... so we don't have these ugly code
paths where gate pages are being setup in pgtable_cache_init()!
Think you can sell LKML on a generic change to initialize this
cache in mm/slab.c??? It might not be too hard, this patch started
out on PPC, other architectures might also be able to to the same
thing for their page tables (as well as potential other uses in
generic code ... of course concrete examples would be way better
than my handwaving about how nice this might be).
-Tony
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (9 preceding siblings ...)
2004-10-13 20:23 ` Luck, Tony
@ 2004-10-13 20:34 ` Martin K. Petersen
2004-10-13 20:37 ` Martin K. Petersen
` (5 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Martin K. Petersen @ 2004-10-13 20:34 UTC (permalink / raw)
To: linux-ia64
>>>>> "Robin" = Robin Holt <holt@sgi.com> writes:
Robin> Martin, can you send your updated test case. I believe yours
Robin> is just mine with a higher resolution timer.
Sure. Included below.
--
Martin K. Petersen Silicon Graphics, Inc.
mkp@sgi.com http://www.sgi.com/
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
/*
 * Page-fault latency microbenchmark (from Robin Holt / Martin K. Petersen).
 *
 * Creates a PROT_READ anonymous mapping and, in a freshly forked child,
 * touches one byte every STRIDE bytes so each access faults in a new pte
 * page (one fault per pmd's worth of address space).  The child reports
 * the average nanoseconds per fault; the parent repeats LOOPS_TO_TIME
 * times so every iteration starts with cold page tables.
 */
#define PAGE_SIZE getpagesize()
#define PTES_PER_PMD (PAGE_SIZE / 8)            /* 8-byte PTEs per page */
#define STRIDE (PTES_PER_PMD * PAGE_SIZE)       /* one fault per pmd page */
#define FAULTS_TO_CAUSE 32
#define MAPPING_SIZE ((size_t) FAULTS_TO_CAUSE * STRIDE)
#define LOOPS_TO_TIME 128

int main(int argc, char **argv)
{
	long offset, i, j;
	char *mapping;
	volatile char z;        /* volatile forces the read that triggers the fault */
	pid_t child;
	int child_status;
	long long start, end;
	struct timespec ts;

	(void) argc;
	(void) argv;

	/*
	 * MAP_ANONYMOUS ignores the fd, but POSIX convention (and some
	 * systems' requirement) is to pass -1, not 0.
	 */
	mapping = mmap(NULL, MAPPING_SIZE, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* Was "mapping = -1UL": an assignment, so failure was never detected
	 * and the pointer was clobbered.  Compare against MAP_FAILED. */
	if (mapping == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}

	for (j = 0; j < LOOPS_TO_TIME; j++) {
		child = fork();
		if (child > 0) {
			wait(&child_status);
		} else if (child == 0) {  /* was "child = 0": branch never taken */
			clock_gettime(CLOCK_REALTIME, &ts);
			start = ts.tv_sec * 1000000000LL + ts.tv_nsec;

			for (i = 0; i < FAULTS_TO_CAUSE; i++) {
				offset = i * STRIDE;
				z = mapping[offset];  /* first touch => page fault */
			}

			clock_gettime(CLOCK_REALTIME, &ts);
			end = ts.tv_sec * 1000000000LL + ts.tv_nsec;

			printf("Took %lld nanoseconds per fault\n",
			       (end - start) / FAULTS_TO_CAUSE);
			exit(0);
		} else {
			perror("fork");
		}
	}

	munmap(mapping, MAPPING_SIZE);
	return 0;
}
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (10 preceding siblings ...)
2004-10-13 20:34 ` Martin K. Petersen
@ 2004-10-13 20:37 ` Martin K. Petersen
2004-10-14 7:44 ` David Mosberger
` (4 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: Martin K. Petersen @ 2004-10-13 20:37 UTC (permalink / raw)
To: linux-ia64
>>>>> "Tony" = Luck, Tony <tony.luck@intel.com> writes:
Tony> Ok, I see it now ... the perils of reading the patch rather than
Tony> applying it and looking at the code that you end up with. I see
Tony> that ia32_mem_init() got dragged along too.
*nod*
Tony> This is pretty messy in terms of code legibility.
I agree completely. I never intended for them to permanently live
there. I was just looking for suggestions for a more appropriate
location.
Tony> Think you can sell LKML on a generic change to initialize this
Tony> cache in mm/slab.c???
I'll try.
Thanks!
--
Martin K. Petersen Silicon Graphics, Inc.
mkp@sgi.com http://www.sgi.com/
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (11 preceding siblings ...)
2004-10-13 20:37 ` Martin K. Petersen
@ 2004-10-14 7:44 ` David Mosberger
2004-10-14 10:27 ` William Lee Irwin III
` (3 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: David Mosberger @ 2004-10-14 7:44 UTC (permalink / raw)
To: linux-ia64
>>>>> On Wed, 13 Oct 2004 11:59:56 -0700, William Lee Irwin III <wli@holomorphy.com> said:
William> On Wed, Oct 13, 2004 at 02:47:43PM -0400, Martin
William> K. Petersen wrote:
>> This patch makes the page table cache on IA-64 use the slab
>> instead of standard page allocations. It's based upon Bill
>> Irwin's code for ppc64. With this patch I got a significant
>> improvement in page fault time. Standard 2.4.x was about 700ns
>> on average. Generic 2.6.9rc4 is 3-4 usec, whereas the slabified
>> pgtcache drops us back down to 600-700 ns. Tested on zx1 and
>> sn2. The biggest caveat is that I've had to postpone setting up
>> the gate page until the pgt slab has been initialized. That
>> wasn't an issue with the existing page-based allocation scheme.
>> David - how do you prefer I handle this?
William> Nice! I got reports that it would not be beneficial when I
William> thought about going over this earlier. I suppose it's a
William> small vindication of my methods to see the original
William> objection contradicted here. =)
What was the original comment relative to? 2.4 or 2.6? In 2.6 we
lost the PTE quick-list, which I think is where the big overhead came
from. Right, Martin?
--david
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (12 preceding siblings ...)
2004-10-14 7:44 ` David Mosberger
@ 2004-10-14 10:27 ` William Lee Irwin III
2004-10-14 11:00 ` David Mosberger
` (2 subsequent siblings)
16 siblings, 0 replies; 18+ messages in thread
From: William Lee Irwin III @ 2004-10-14 10:27 UTC (permalink / raw)
To: linux-ia64
At some point in the comment, I wrote:
William> Nice! I got reports that it would not be beneficial when I
William> thought about going over this earlier. I suppose it's a
William> small vindication of my methods to see the original
William> objection contradicted here. =)
On Thu, Oct 14, 2004 at 12:44:59AM -0700, David Mosberger wrote:
> What was the original comment relative to? 2.4 or 2.6? In 2.6 we
> lost the PTE quick-list, which I think is where the big overhead came
> from. Right, Martin?
I don't recall much of the original comment, apart from that someone
said it had too much overhead. I may have presumed too much when I
presumed it was 2.6
-- wli
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (13 preceding siblings ...)
2004-10-14 10:27 ` William Lee Irwin III
@ 2004-10-14 11:00 ` David Mosberger
2004-10-14 11:07 ` Robin Holt
2004-10-14 11:10 ` David Mosberger
16 siblings, 0 replies; 18+ messages in thread
From: David Mosberger @ 2004-10-14 11:00 UTC (permalink / raw)
To: linux-ia64
>>>>> On Thu, 14 Oct 2004 03:27:42 -0700, William Lee Irwin III <wli@holomorphy.com> said:
wli> At some point in the comment, I wrote:
William> Nice! I got reports that it would not be beneficial when I
William> thought about going over this earlier. I suppose it's a
William> small vindication of my methods to see the original
William> objection contradicted here. =)
wli> On Thu, Oct 14, 2004 at 12:44:59AM -0700, David Mosberger
wli> wrote:
>> What was the original comment relative to? 2.4 or 2.6? In 2.6
>> we lost the PTE quick-list, which I think is where the big
>> overhead came from. Right, Martin?
wli> I don't recall much of the original comment, apart from that
wli> someone said it had too much overhead. I may have presumed too
wli> much when I presumed it was 2.6
OK, I don't think it was me, then.
I do suspect the quick-list would be faster than slab but the
difference is most likely in the noise and you can't argue with the
code-size reduction.
--david
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (14 preceding siblings ...)
2004-10-14 11:00 ` David Mosberger
@ 2004-10-14 11:07 ` Robin Holt
2004-10-14 11:10 ` David Mosberger
16 siblings, 0 replies; 18+ messages in thread
From: Robin Holt @ 2004-10-14 11:07 UTC (permalink / raw)
To: linux-ia64
On Thu, Oct 14, 2004 at 12:44:59AM -0700, David Mosberger wrote:
> >>>>> On Wed, 13 Oct 2004 11:59:56 -0700, William Lee Irwin III <wli@holomorphy.com> said:
>
> William> On Wed, Oct 13, 2004 at 02:47:43PM -0400, Martin
> William> K. Petersen wrote:
> >> This patch makes the page table cache on IA-64 use the slab
> >> instead of standard page allocations. It's based upon Bill
> >> Irwin's code for ppc64. With this patch I got a significant
> >> improvement in page fault time. Standard 2.4.x was about 700ns
> >> on average. Generic 2.6.9rc4 is 3-4 usec, whereas the slabified
> >> pgtcache drops us back down to 600-700 ns. Tested on zx1 and
> >> sn2. The biggest caveat is that I've had to postpone setting up
> >> the gate page until the pgt slab has been initialized. That
> >> wasn't an issue with the existing page-based allocation scheme.
> >> David - how do you prefer I handle this?
>
> William> Nice! I got reports that it would not be beneficial when I
> William> thought about going over this earlier. I suppose it's a
> William> small vindication of my methods to see the original
> William> objection contradicted here. =)
>
> What was the original comment relative to? 2.4 or 2.6? In 2.6 we
> lost the PTE quick-list, which I think is where the big overhead came
> from. Right, Martin?
And the pmd quicklist. We add entries to the pmd quicklist, but never
take them back off. This has a two fold effect in that it causes the
pgd quicklist to be drained unnecessarily when the pmd quicklist has
many unusable entries on it.
Essentially, we were allocating and clearing pages for both pmd and pte
entries every time.
Robin
>
> --david
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [RFC] Convert pgtable cache to slab
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
` (15 preceding siblings ...)
2004-10-14 11:07 ` Robin Holt
@ 2004-10-14 11:10 ` David Mosberger
16 siblings, 0 replies; 18+ messages in thread
From: David Mosberger @ 2004-10-14 11:10 UTC (permalink / raw)
To: linux-ia64
>>>>> On Thu, 14 Oct 2004 06:07:43 -0500, Robin Holt <holt@sgi.com> said:
Robin> Essentially, we were allocating and clearing pages for both
Robin> pmd and pte entries every time.
Yes, you discovered that several months ago, didn't you? Glad the fix
came through, in the end.
--david
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2004-10-14 11:10 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-10-13 18:47 [RFC] Convert pgtable cache to slab Martin K. Petersen
2004-10-13 18:57 ` Jesse Barnes
2004-10-13 18:59 ` William Lee Irwin III
2004-10-13 19:07 ` Luck, Tony
2004-10-13 19:28 ` Jesse Barnes
2004-10-13 19:32 ` Martin K. Petersen
2004-10-13 19:32 ` Luck, Tony
2004-10-13 19:42 ` Martin K. Petersen
2004-10-13 19:49 ` Robin Holt
2004-10-13 19:50 ` Robin Holt
2004-10-13 20:23 ` Luck, Tony
2004-10-13 20:34 ` Martin K. Petersen
2004-10-13 20:37 ` Martin K. Petersen
2004-10-14 7:44 ` David Mosberger
2004-10-14 10:27 ` William Lee Irwin III
2004-10-14 11:00 ` David Mosberger
2004-10-14 11:07 ` Robin Holt
2004-10-14 11:10 ` David Mosberger
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox