[RFC] 4-level page table directories.

public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC] 4-level page table directories.
@ 2005-10-27  4:17 Robin Holt
  2005-10-28  5:19 ` Ian Wienand
                   ` (48 more replies)
  0 siblings, 49 replies; 50+ messages in thread
From: Robin Holt @ 2005-10-27  4:17 UTC (permalink / raw)
  To: linux-ia64


I have started to work on 4-level page tables.  This boots.  I
make no further claims than that.

At one point, I discussed 4-level page tables on the ia64 mailing
list but did not find that discussion in my quick search from
marc.

David, I think it was you who expressed concern with introducing
the fourth level.  I have done some quick benchmarking and found
little difference (well within noise).  How had you envisioned
introducing a 3 or 4 level page tables?  Were you envisioning
a compile-time or run-time selection?

Thanks,
Robin

Index: linux-2.6/include/asm-ia64/pgtable.h
=================================--- linux-2.6.orig/include/asm-ia64/pgtable.h	2005-10-26 18:59:21.253268550 -0500
+++ linux-2.6/include/asm-ia64/pgtable.h	2005-10-26 23:01:34.572838463 -0500
@@ -84,32 +84,48 @@
 #define __DIRTY_BITS		_PAGE_ED | __DIRTY_BITS_NO_ED
 
 /*
- * Definitions for first level:
- *
- * PGDIR_SHIFT determines what a first-level page table entry can map.
+ * How many pointers will a page table level hold expressed in shift
  */
-#define PGDIR_SHIFT		(PAGE_SHIFT + 2*(PAGE_SHIFT-3))
-#define PGDIR_SIZE		(__IA64_UL(1) << PGDIR_SHIFT)
-#define PGDIR_MASK		(~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD		(1UL << (PAGE_SHIFT-3))
-#define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/8)	/* regions 0-4 are user regions */
-#define FIRST_USER_ADDRESS	0
+#define PTRS_PER_PTD_SHIFT	(PAGE_SHIFT-3)
 
 /*
- * Definitions for second level:
+ * Definitions for fourth level:
+ */
+#define PTRS_PER_PTE	(__IA64_UL(1) << (PTRS_PER_PTD_SHIFT))
+
+/*
+ * Definitions for third level:
  *
- * PMD_SHIFT determines the size of the area a second-level page table
+ * PMD_SHIFT determines the size of the area a third-level page table
  * can map.
  */
-#define PMD_SHIFT	(PAGE_SHIFT + (PAGE_SHIFT-3))
+#define PMD_SHIFT	(PAGE_SHIFT + (PTRS_PER_PTD_SHIFT))
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
-#define PTRS_PER_PMD	(1UL << (PAGE_SHIFT-3))
+#define PTRS_PER_PMD	(1UL << (PTRS_PER_PTD_SHIFT))
 
 /*
- * Definitions for third level:
+ * Definitions for second level:
+ *
+ * PUD_SHIFT determines the size of the area a second-level page table
+ * can map.
+ */
+#define PUD_SHIFT	(PMD_SHIFT + (PTRS_PER_PTD_SHIFT))
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+#define PTRS_PER_PUD	(1UL << (PTRS_PER_PTD_SHIFT))
+
+/*
+ * Definitions for first level:
+ *
+ * PGDIR_SHIFT determines what a first-level page table entry can map.
  */
-#define PTRS_PER_PTE	(__IA64_UL(1) << (PAGE_SHIFT-3))
+#define PGDIR_SHIFT		(PUD_SHIFT + (PTRS_PER_PTD_SHIFT))
+#define PGDIR_SIZE		(__IA64_UL(1) << PGDIR_SHIFT)
+#define PGDIR_MASK		(~(PGDIR_SIZE-1))
+#define PTRS_PER_PGD		(1UL << (PTRS_PER_PTD_SHIFT))
+#define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/8)	/* regions 0-4 are user regions */
+#define FIRST_USER_ADDRESS	0
 
 /*
  * All the normal masks have the "page accessed" bits on, as any time
@@ -160,6 +176,7 @@
 #define __S111	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX)
 
 #define pgd_ERROR(e)	printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e))
+#define pud_ERROR(e)	printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pmd_ERROR(e)	printk("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
 #define pte_ERROR(e)	printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e))
 
@@ -256,9 +273,14 @@ ia64_phys_addr_valid (unsigned long addr
 #define pud_bad(pud)			(!ia64_phys_addr_valid(pud_val(pud)))
 #define pud_present(pud)		(pud_val(pud) != 0UL)
 #define pud_clear(pudp)			(pud_val(*(pudp)) = 0UL)
-
 #define pud_page(pud)			((unsigned long) __va(pud_val(pud) & _PFN_MASK))
 
+#define pgd_none(pgd)			(!pgd_val(pgd))
+#define pgd_bad(pgd)			(!ia64_phys_addr_valid(pgd_val(pgd)))
+#define pgd_present(pgd)		(pgd_val(pgd) != 0UL)
+#define pgd_clear(pgdp)			(pgd_val(*(pgdp)) = 0UL)
+#define pgd_page(pgd)			((unsigned long) __va(pgd_val(pgd) & _PFN_MASK))
+
 /*
  * The following have defined behavior only work if pte_present() is true.
  */
@@ -327,6 +349,10 @@ pgd_offset (struct mm_struct *mm, unsign
 #define pgd_offset_gate(mm, addr)	pgd_offset_k(addr)
 
 /* Find an entry in the second-level page table.. */
+#define pud_offset(dir,addr) \
+	((pud_t *) pgd_page(*(dir)) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
+
+/* Find an entry in the third-level page table.. */
 #define pmd_offset(dir,addr) \
 	((pmd_t *) pud_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
 
@@ -559,7 +585,6 @@ do {											\
 #define __HAVE_ARCH_PGD_OFFSET_GATE
 #define __HAVE_ARCH_LAZY_MMU_PROT_UPDATE
 
-#include <asm-generic/pgtable-nopud.h>
 #include <asm-generic/pgtable.h>
 
 #endif /* _ASM_IA64_PGTABLE_H */
Index: linux-2.6/include/asm-ia64/pgalloc.h
=================================--- linux-2.6.orig/include/asm-ia64/pgalloc.h	2005-10-26 18:59:21.254245014 -0500
+++ linux-2.6/include/asm-ia64/pgalloc.h	2005-10-26 19:08:46.598882737 -0500
@@ -87,6 +87,23 @@ static inline void pgd_free(pgd_t * pgd)
 }
 
 static inline void
+pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
+{
+	pgd_val(*pgd_entry) = __pa(pud);
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	return pgtable_quicklist_alloc();
+}
+
+static inline void pud_free(pud_t * pud)
+{
+	pgtable_quicklist_free(pud);
+}
+#define __pud_free_tlb(tlb, pud)	pud_free(pud)
+
+static inline void
 pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
 {
 	pud_val(*pud_entry) = __pa(pmd);
Index: linux-2.6/include/asm-ia64/page.h
=================================--- linux-2.6.orig/include/asm-ia64/page.h	2005-10-26 18:59:21.254245014 -0500
+++ linux-2.6/include/asm-ia64/page.h	2005-10-26 19:08:46.604741525 -0500
@@ -174,11 +174,13 @@ get_order (unsigned long size)
    */
   typedef struct { unsigned long pte; } pte_t;
   typedef struct { unsigned long pmd; } pmd_t;
+  typedef struct { unsigned long pud; } pud_t;
   typedef struct { unsigned long pgd; } pgd_t;
   typedef struct { unsigned long pgprot; } pgprot_t;
 
 # define pte_val(x)	((x).pte)
 # define pmd_val(x)	((x).pmd)
+# define pud_val(x)	((x).pud)
 # define pgd_val(x)	((x).pgd)
 # define pgprot_val(x)	((x).pgprot)
 
Index: linux-2.6/arch/ia64/kernel/ivt.S
=================================--- linux-2.6.orig/arch/ia64/kernel/ivt.S	2005-10-26 18:59:21.278656627 -0500
+++ linux-2.6/arch/ia64/kernel/ivt.S	2005-10-26 22:36:41.939866135 -0500
@@ -140,20 +140,26 @@ ENTRY(vhpt_miss)
 (p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=PTA + IFA(33,42)*8
 (p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
 	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
-	shr.u r18=r22,PMD_SHIFT			// shift L2 index into position
+	shr.u r19=r22,PUD_SHIFT			// shift L2 index into position
 	;;
 	ld8 r17=[r17]				// fetch the L1 entry (may be 0)
+	shr.u r18=r22,PMD_SHIFT			// shift L3 index into position
 	;;
 (p7)	cmp.eq p6,p7=r17,r0			// was L1 entry NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
+	dep r28=r19,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
 	;;
-(p7)	ld8 r20=[r17]				// fetch the L2 entry (may be 0)
-	shr.u r19=r22,PAGE_SHIFT		// shift L3 index into position
+(p7)	ld8 r29=[r28]				// fetch the L2 entry (may be 0)
+	shr.u r19=r22,PAGE_SHIFT		// shift L4 index into position
 	;;
-(p7)	cmp.eq.or.andcm p6,p7=r20,r0		// was L2 entry NULL?
-	dep r21=r19,r20,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
+(p7)	cmp.eq p6,p7=r29,r0			// was L2 entry NULL?
+	dep r17=r18,r29,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
 	;;
-(p7)	ld8 r18=[r21]				// read the L3 PTE
+(p7)	ld8 r20=[r17]				// fetch the L3 entry (may be 0)
+	;;
+(p7)	cmp.eq.or.andcm p6,p7=r20,r0		// was L3 entry NULL?
+	dep r21=r19,r20,3,(PAGE_SHIFT-3)	// compute address of L4 page table entry
+	;;
+(p7)	ld8 r18=[r21]				// read the L4 PTE
 	mov r19=cr.isr				// cr.isr bit 0 tells us if this is an insn miss
 	;;
 (p7)	tbit.z p6,p7=r18,_PAGE_P_BIT		// page present bit cleared?
@@ -192,12 +198,15 @@ ENTRY(vhpt_miss)
 	 * between reading the pagetable and the "itc".  If so, flush the entry we
 	 * inserted and retry.
 	 */
-	ld8 r25=[r21]				// read L3 PTE again
-	ld8 r26=[r17]				// read L2 entry again
+	ld8 r25=[r21]				// read L4 PTE again
+	ld8 r26=[r17]				// read L3 entry again
+	ld8 r30=[r28]				// read L2 entry again
 	;;
-	cmp.ne p6,p7=r26,r20			// did L2 entry change
+	cmp.ne p6,p7=r26,r20			// did L3 entry change
 	mov r27=PAGE_SHIFT<<2
 	;;
+(p7)	cmp.ne.or.andcm p6,p7=r30,r29		// did L2 entry change
+	;;
 (p6)	ptc.l r22,r27				// purge PTE page translation
 (p7)	cmp.ne.or.andcm p6,p7=r25,r18		// did L3 PTE change
 	;;
@@ -432,18 +441,24 @@ ENTRY(nested_dtlb_miss)
 (p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=PTA + IFA(33,42)*8
 (p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
 	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
-	shr.u r18=r22,PMD_SHIFT			// shift L2 index into position
+	shr.u r19=r22,PUD_SHIFT			// shift L2 index into position
 	;;
 	ld8 r17=[r17]				// fetch the L1 entry (may be 0)
+	shr.u r18=r22,PMD_SHIFT			// shift L3 index into position
 	;;
 (p7)	cmp.eq p6,p7=r17,r0			// was L1 entry NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
+	dep r17=r19,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
 	;;
 (p7)	ld8 r17=[r17]				// fetch the L2 entry (may be 0)
-	shr.u r19=r22,PAGE_SHIFT		// shift L3 index into position
+	shr.u r19=r22,PAGE_SHIFT		// shift L4 index into position
+	;;
+(p7)	cmp.eq p6,p7=r17,r0			// was L2 entry NULL?
+	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
+	;;
+(p7)	ld8 r17=[r17]				// fetch the L3 entry (may be 0)
 	;;
-(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was L2 entry NULL?
-	dep r17=r19,r17,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
+(p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was L3 entry NULL?
+	dep r17=r19,r17,3,(PAGE_SHIFT-3)	// compute address of L4 page table entry
 (p6)	br.cond.spnt page_fault
 	mov b0=r30
 	br.sptk.many b0				// return to continuation point

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
@ 2005-10-28  5:19 ` Ian Wienand
  2005-10-28 11:19 ` Robin Holt
                   ` (47 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Ian Wienand @ 2005-10-28  5:19 UTC (permalink / raw)
  To: linux-ia64

On Wed, Oct 26, 2005 at 11:17:09PM -0500, Robin Holt wrote:
> I have started to work on 4-level page tables.  This boots.  I
> make no further claims than that.

Do you need 4 level page tables for something?  I seem to remember it
coming up before too, I'd certainly be interested in seeing any
numbers you have.

Something I would really like is a small abstraction of some of the
page table access macros to make it a little clearer as to what is
happening.  Besides which using '-3' in many places assumes the size
of entries (of particular interest is the size of a PTE, which you
might want to increase; I can't see why you would increase the upper
levels).

I'd like to see something like this go in, or possibly wrap it up with
your changes.  I also tried to make comments a little more explicit.

> +#define PTRS_PER_PTD_SHIFT	(PAGE_SHIFT-3)

Is having a page of PTE's called at PTD a standard thing?  It has to
be better than PTRS_PER_PTE which is a little confusing.

-i
ianw@gelato.unsw.edu.au
http://www.gelato.unsw.edu.au

 arch/ia64/kernel/ivt.S     |   36 ++++++++++++++++-----------------
 include/asm-ia64/pgtable.h |   48 ++++++++++++++++++++++++++++++---------------
 2 files changed, 50 insertions(+), 34 deletions(-)
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -111,7 +111,7 @@ ENTRY(vhpt_miss)
 	rsm psr.dt				// use physical addressing for data
 	mov r31=pr				// save the predicate registers
 	mov r19=IA64_KR(PT_BASE)		// get page table base address
-	shl r21=r16,3				// shift bit 60 into sign bit
+	shl r21=r16,3				// shift out region number
 	shr.u r17=r16,61			// get the region number into r17
 	;;
 	shr r22=r21,3
@@ -125,33 +125,33 @@ ENTRY(vhpt_miss)
 (p8)	shr r22=r22,r27
 #endif
 	;;
-	cmp.eq p6,p7=5,r17			// is IFA pointing into to region 5?
+	cmp.eq p6,p7=5,r17			// is faulting address in region 5?
 	shr.u r18=r22,PGDIR_SHIFT		// get bits 33-63 of the faulting address
 	;;
-(p7)	dep r17=r17,r19,(PAGE_SHIFT-3),3	// put region number bits in place
+(p7)	dep r17=r17,r19,PGD_INDEX_BITS,PGD_ENTRY_BITS	// put region number bits in place
 
 	srlz.d
 	LOAD_PHYSICAL(p6, r19, swapper_pg_dir)	// region 5 is rooted at swapper_pg_dir
 
 	.pred.rel "mutex", p6, p7
-(p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
-(p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
+(p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT	// shift out r21 to make sure unused bits zero
+(p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3	// for !r5 we already shifted out the top 3 bits
 	;;
-(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=PTA + IFA(33,42)*8
-(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
+(p6)	dep r17=r18,r19,PGD_ENTRY_BITS,PGD_INDEX_BITS	// find the PGD offset from the page table base
+(p7)	dep r17=r18,r17,PGD_ENTRY_BITS,PGD_INDEX_BITS-3 // for !r5 we already have region bits
 	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
 	shr.u r18=r22,PMD_SHIFT			// shift L2 index into position
 	;;
 	ld8 r17=[r17]				// fetch the L1 entry (may be 0)
 	;;
 (p7)	cmp.eq p6,p7=r17,r0			// was L1 entry NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
+	dep r17=r18,r17,PMD_ENTRY_BITS,PMD_INDEX_BITS	// compute address of L2 page table entry
 	;;
 (p7)	ld8 r20=[r17]				// fetch the L2 entry (may be 0)
 	shr.u r19=r22,PAGE_SHIFT		// shift L3 index into position
 	;;
 (p7)	cmp.eq.or.andcm p6,p7=r20,r0		// was L2 entry NULL?
-	dep r21=r19,r20,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
+	dep r21=r19,r20,PTD_ENTRY_BITS,PTD_INDEX_BITS	// compute address of L3 page table entry
 	;;
 (p7)	ld8 r18=[r21]				// read the L3 PTE
 	mov r19=cr.isr				// cr.isr bit 0 tells us if this is an insn miss
@@ -408,7 +408,7 @@ ENTRY(nested_dtlb_miss)
 	 */
 	rsm psr.dt				// switch to using physical data addressing
 	mov r19=IA64_KR(PT_BASE)		// get the page table base address
-	shl r21=r16,3				// shift bit 60 into sign bit
+	shl r21=r16,3				// shift out region number
 	mov r18=cr.itir
 	;;
 	shr.u r17=r16,61			// get the region number into r17
@@ -420,30 +420,30 @@ ENTRY(nested_dtlb_miss)
 	;;
 	shr.u r22=r16,r22
 	shr.u r18=r16,r18
-(p7)	dep r17=r17,r19,(PAGE_SHIFT-3),3	// put region number bits in place
+(p7)	dep r17=r17,r19,PGD_INDEX_BITS,PGD_ENTRY_BITS	// put region number bits in place
 
 	srlz.d
 	LOAD_PHYSICAL(p6, r19, swapper_pg_dir)	// region 5 is rooted at swapper_pg_dir
 
 	.pred.rel "mutex", p6, p7
-(p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT
-(p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3
+(p6)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT	// shift out r21 to make sure unused bits zero
+(p7)	shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3  // for !r5 we already shifted out the top 3 bits
 	;;
-(p6)	dep r17=r18,r19,3,(PAGE_SHIFT-3)	// r17=PTA + IFA(33,42)*8
-(p7)	dep r17=r18,r17,3,(PAGE_SHIFT-6)	// r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8)
-	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?
+(p6)	dep r17=r18,r19,PGD_ENTRY_BITS,PGD_INDEX_BITS	// find the PGD offset from the page table base
+(p7)	dep r17=r18,r17,PGD_ENTRY_BITS,PGD_INDEX_BITS-3 // for !r5 we already have region bits
+	cmp.eq p7,p6=0,r21			// unused address bits all zeroes?  if not something wrong
 	shr.u r18=r22,PMD_SHIFT			// shift L2 index into position
 	;;
 	ld8 r17=[r17]				// fetch the L1 entry (may be 0)
 	;;
 (p7)	cmp.eq p6,p7=r17,r0			// was L1 entry NULL?
-	dep r17=r18,r17,3,(PAGE_SHIFT-3)	// compute address of L2 page table entry
+	dep r17=r18,r17,PMD_ENTRY_BITS,PMD_INDEX_BITS	// compute address of L2 page table entry
 	;;
 (p7)	ld8 r17=[r17]				// fetch the L2 entry (may be 0)
 	shr.u r19=r22,PAGE_SHIFT		// shift L3 index into position
 	;;
 (p7)	cmp.eq.or.andcm p6,p7=r17,r0		// was L2 entry NULL?
-	dep r17=r19,r17,3,(PAGE_SHIFT-3)	// compute address of L3 page table entry
+	dep r17=r19,r17,PTD_ENTRY_BITS,PTD_INDEX_BITS	// compute address of L3 page table entry
 (p6)	br.cond.spnt page_fault
 	mov b0=r30
 	br.sptk.many b0				// return to continuation point
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -84,16 +84,16 @@
 #define __DIRTY_BITS		_PAGE_ED | __DIRTY_BITS_NO_ED
 
 /*
- * Definitions for first level:
+ * Definitions for third level:
  *
- * PGDIR_SHIFT determines what a first-level page table entry can map.
+ * A PTD is a page full of PTE entries
  */
-#define PGDIR_SHIFT		(PAGE_SHIFT + 2*(PAGE_SHIFT-3))
-#define PGDIR_SIZE		(__IA64_UL(1) << PGDIR_SHIFT)
-#define PGDIR_MASK		(~(PGDIR_SIZE-1))
-#define PTRS_PER_PGD		(1UL << (PAGE_SHIFT-3))
-#define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/8)	/* regions 0-4 are user regions */
-#define FIRST_USER_ADDRESS	0
+#define PTD_ENTRY_BITS	3
+#define PTD_INDEX_BITS	(PAGE_SHIFT - PTD_ENTRY_BITS)
+#define PTRS_PER_PTD	(__IA64_UL(1) << PTD_INDEX_BITS)
+#define PTRS_PER_PTE	PTRS_PER_PTD
+ /* one entry maps one page */
+#define PTD_SHIFT	PAGE_SHIFT
 
 /*
  * Definitions for second level:
@@ -101,15 +101,27 @@
  * PMD_SHIFT determines the size of the area a second-level page table
  * can map.
  */
-#define PMD_SHIFT	(PAGE_SHIFT + (PAGE_SHIFT-3))
-#define PMD_SIZE	(1UL << PMD_SHIFT)
+#define PMD_ENTRY_BITS	3
+#define PMD_SHIFT	((PAGE_SHIFT - PMD_ENTRY_BITS) + PTD_SHIFT)
+#define PMD_SIZE	(__IA64_UL(1) << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
-#define PTRS_PER_PMD	(1UL << (PAGE_SHIFT-3))
+#define PMD_INDEX_BITS	(PAGE_SHIFT - PMD_ENTRY_BITS)
+#define PTRS_PER_PMD	(__IA64_UL(1) << PMD_INDEX_BITS)
 
 /*
- * Definitions for third level:
+ * Definitions for first level:
+ *
+ * PGDIR_SHIFT determines what a first-level page table entry can map.
  */
-#define PTRS_PER_PTE	(__IA64_UL(1) << (PAGE_SHIFT-3))
+#define PGD_ENTRY_BITS		3
+#define PGDIR_SHIFT		((PAGE_SHIFT - PGD_ENTRY_BITS) + PMD_SHIFT)
+#define PGDIR_SIZE		(__IA64_UL(1) << PGDIR_SHIFT)
+#define PGDIR_MASK		(~(PGDIR_SIZE-1))
+#define PGD_INDEX_BITS		(PAGE_SHIFT - PGD_ENTRY_BITS)
+#define PTRS_PER_PGD		(__IA64_UL(1) << PGD_INDEX_BITS)
+/* regions 0-4 are user regions */
+#define USER_PTRS_PER_PGD	(5*PTRS_PER_PGD/(__IA64_UL(1) << PGD_ENTRY_BITS))
+#define FIRST_USER_ADDRESS	0
 
 /*
  * All the normal masks have the "page accessed" bits on, as any time
@@ -206,11 +218,15 @@ ia64_phys_addr_valid (unsigned long addr
 
 #define VMALLOC_START		(RGN_BASE(RGN_GATE) + 0x200000000UL)
 #ifdef CONFIG_VIRTUAL_MEM_MAP
-# define VMALLOC_END_INIT	(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
+/*
+ * PGDIR_SHIFT is the size one PGD maps, so we need to account for a
+ * whole page of them.
+ */
+# define VMALLOC_END_INIT	(RGN_BASE(RGN_GATE) + (1UL << (PGDIR_SHIFT + PGD_INDEX_BITS)))
 # define VMALLOC_END		vmalloc_end
   extern unsigned long vmalloc_end;
 #else
-# define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
+# define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (PGDIR_SHIFT + PGD_INDEX_BITS)))
 #endif
 
 /* fs/proc/kcore.c */
@@ -334,7 +350,7 @@ pgd_offset (struct mm_struct *mm, unsign
  * Find an entry in the third-level page table.  This looks more complicated than it
  * should be because some platforms place page tables in high memory.
  */
-#define pte_index(addr)	 	(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+#define pte_index(addr)	 	(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTD - 1))
 #define pte_offset_kernel(dir,addr)	((pte_t *) pmd_page_kernel(*(dir)) + pte_index(addr))
 #define pte_offset_map(dir,addr)	pte_offset_kernel(dir, addr)
 #define pte_offset_map_nested(dir,addr)	pte_offset_map(dir, addr)

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
  2005-10-28  5:19 ` Ian Wienand
@ 2005-10-28 11:19 ` Robin Holt
  2005-10-28 23:23 ` Luck, Tony
                   ` (46 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-10-28 11:19 UTC (permalink / raw)
  To: linux-ia64

On Fri, Oct 28, 2005 at 03:19:57PM +1000, Ian Wienand wrote:
> On Wed, Oct 26, 2005 at 11:17:09PM -0500, Robin Holt wrote:
> > I have started to work on 4-level page tables.  This boots.  I
> > make no further claims than that.
> 
> Do you need 4 level page tables for something?  I seem to remember it
> coming up before too, I'd certainly be interested in seeing any
> numbers you have.

We have an mpi library which makes memory available for multiple
processors.  We have squezzed as hard as we can to reduce the size of
that block.  Right now, the most processors we can service hits about
3192 (IIRC) which is more than enough for current hardware (limits to
2048 cpus).  In the next version of our hardware due out next spring,
that limit is raised to 16384 (IIRC).  To make the library work with
that size, we need to be able to mmap a larger region.

Unfortunately, we still want this to work with SUSE and RHEL
distributions.  Those distributors strongly favor a 16k page size.
To be able to use their standard kernel, we need to either convince the
entire community that 64k pages is right or we need to implement 4-level
page tables.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
  2005-10-28  5:19 ` Ian Wienand
  2005-10-28 11:19 ` Robin Holt
@ 2005-10-28 23:23 ` Luck, Tony
  2005-10-28 23:55 ` Chen, Kenneth W
                   ` (45 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Luck, Tony @ 2005-10-28 23:23 UTC (permalink / raw)
  To: linux-ia64

> At one point, I discussed 4-level page tables on the ia64 mailing
> list but did not find that discussion in my quick search from
> marc.

IIRC the previous discussion foundered on the general usefulness
of 4-level page tables.  3-level table with a 16K page size provide
a virtual address space that is plenty big enough for the majority
of users (640K should be enough for anyone, so 128T is just over the
top :-).  So 4-level tables are definitely targetted at a small niche.

Making this a config option would be one approach ... but would
almost certainly leave you in the same predicament with the OSDs
that you are in now (if they won't ship a 64K pagesize configured
kernel, I doubt that they will jump for joy about a 4-level page
table config).

Run-time switching between 3-level and 4-level would quite possibly
have even more overhead than just running a 4-level table.

Thanks for supplying the kernbench numbers on this.  I'll see
if I can get Ken interested in running his favorite online
transaction processing benchmark on a 4-level kernel to see
what happens to it.

The worst-case loser from this might be a benchmark that runs
oodles of small processes (partly from the overhead of the extra
page, and partly because I suspect that fork/exec/exit might see
the most impact).  So I'd like to see some AIM7 numbers too.

But overall ... it looks likely that the only possible direction
for the benchmarks is towards worse performance.  Maybe it is a
small to insignificant amount, but there doesn't appear to be
any upside (performance-wise).

Your other potential hope would be long format VHPT.  At least
that has some usage models where performance is better.

-Tony

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (2 preceding siblings ...)
  2005-10-28 23:23 ` Luck, Tony
@ 2005-10-28 23:55 ` Chen, Kenneth W
  2005-10-29  0:49 ` Grant Grundler
                   ` (44 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-10-28 23:55 UTC (permalink / raw)
  To: linux-ia64

Luck, Tony wrote on Friday, October 28, 2005 4:23 PM
> Thanks for supplying the kernbench numbers on this.  I'll see
> if I can get Ken interested in running his favorite online
> transaction processing benchmark on a 4-level kernel to see
> what happens to it.

I suppose so.  I started a few smaller benchmarks earlier in the
afternoon that I want to check out first.

- Ken


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (3 preceding siblings ...)
  2005-10-28 23:55 ` Chen, Kenneth W
@ 2005-10-29  0:49 ` Grant Grundler
  2005-10-29  2:18 ` David Mosberger-Tang
                   ` (43 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Grant Grundler @ 2005-10-29  0:49 UTC (permalink / raw)
  To: linux-ia64

On Fri, Oct 28, 2005 at 04:23:27PM -0700, Luck, Tony wrote:
...
> The worst-case loser from this might be a benchmark that runs
> oodles of small processes (partly from the overhead of the extra
> page, and partly because I suspect that fork/exec/exit might see
> the most impact).  So I'd like to see some AIM7 numbers too.

Just curious...is osdl-aim7 the nearest thing we have to SDET?
Anyone know?

thanks,
grant

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (4 preceding siblings ...)
  2005-10-29  0:49 ` Grant Grundler
@ 2005-10-29  2:18 ` David Mosberger-Tang
  2005-11-01 12:13 ` Robin Holt
                   ` (42 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: David Mosberger-Tang @ 2005-10-29  2:18 UTC (permalink / raw)
  To: linux-ia64

On 10/28/05, Luck, Tony <tony.luck@intel.com> wrote:

> The worst-case loser from this might be a benchmark that runs
> oodles of small processes (partly from the overhead of the extra
> page, and partly because I suspect that fork/exec/exit might see
> the most impact).  So I'd like to see some AIM7 numbers too.

And I would want to see numbers for the "RANDOM" benchmark (from the
HPCC benchmark suite) for huge data sets (multi-gigabyte; something
big enough such that not even the page tables fit in the caches).

  --david

--
Mosberger Consulting LLC, voice/fax: 510-744-9372,
http://www.mosberger-consulting.com/
35706 Runckel Lane, Fremont, CA 94536

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (5 preceding siblings ...)
  2005-10-29  2:18 ` David Mosberger-Tang
@ 2005-11-01 12:13 ` Robin Holt
  2005-11-01 15:41 ` David Mosberger-Tang
                   ` (41 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-01 12:13 UTC (permalink / raw)
  To: linux-ia64

On Fri, Oct 28, 2005 at 07:18:16PM -0700, David Mosberger-Tang wrote:
> On 10/28/05, Luck, Tony <tony.luck@intel.com> wrote:
> 
> > The worst-case loser from this might be a benchmark that runs
> > oodles of small processes (partly from the overhead of the extra
> > page, and partly because I suspect that fork/exec/exit might see
> > the most impact).  So I'd like to see some AIM7 numbers too.
> 
> And I would want to see numbers for the "RANDOM" benchmark (from the
> HPCC benchmark suite) for huge data sets (multi-gigabyte; something
> big enough such that not even the page tables fit in the caches).

I can't seem to find a single benchmark which is showing an appreciable
(actually, any) difference.  I finally sat down with Jack yesterday and we
ran what he thought would be a worst-case benchmark.  His test would map
a page at a strided offset throughout the address space and time how long
it would take to access all the pages.  We found absolutely no difference.

We then started discussing this.  For a normal application with the
same virtual address requirements run on a 4 versus a 3 level page table,
we would end up with, at most five additional pages of page tables with
a single cache-line used in each.  Those cachelines would be frequently
used and therefore remain active.  This would essentially eliminate the
second point in ivt.S where you would expect a stall.  Jack guessed we
would be introducing an additional delay of 2 to 5 clock cycles.

I had started to work up a patch which would have allowed CONFIG of
2 to 4 levels of page tables, but I continue to see that as futile.
Jack thought it might be a good idea to at least allow the config of 3
or 4 to make it easier to sort out any delays we may see in the future,
but neither of us could come up with a worst-case scenario which actually
shows a difference.

I am trying to get time on one of our larger machines today to run the
RandomAccess benchmark (as well as some help from somebody that has run
these before).  Is there a certain number of cpus you would like this
run on or is a 64p box adequate?

Given the benchmark results I have seen so far, when I introduce the
CONFIG for levels, does anybody have any objection to setting it to 4
by default?

Thanks,
Robin Holt

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (6 preceding siblings ...)
  2005-11-01 12:13 ` Robin Holt
@ 2005-11-01 15:41 ` David Mosberger-Tang
  2005-11-02 10:35 ` Robin Holt
                   ` (40 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: David Mosberger-Tang @ 2005-11-01 15:41 UTC (permalink / raw)
  To: linux-ia64

On 11/1/05, Robin Holt <holt@sgi.com> wrote:

> I am trying to get time on one of our larger machines today to run the
> RandomAccess benchmark (as well as some help from somebody that has run
> these before).  Is there a certain number of cpus you would like this
> run on or is a 64p box adequate?

Oh, even a single CPU should be fine.  Just use a large working set. 
IIRC, about 16GB should ensure that not even the page tables fit in
the cache (depending on your cache-size, of course).

Thanks,

  --david
--
Mosberger Consulting LLC, voice/fax: 510-744-9372,
http://www.mosberger-consulting.com/
35706 Runckel Lane, Fremont, CA 94536

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (7 preceding siblings ...)
  2005-11-01 15:41 ` David Mosberger-Tang
@ 2005-11-02 10:35 ` Robin Holt
  2005-11-02 13:26 ` Robin Holt
                   ` (39 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-02 10:35 UTC (permalink / raw)
  To: linux-ia64

On Tue, Nov 01, 2005 at 07:41:26AM -0800, David Mosberger-Tang wrote:
> On 11/1/05, Robin Holt <holt@sgi.com> wrote:
> 
> > I am trying to get time on one of our larger machines today to run the
> > RandomAccess benchmark (as well as some help from somebody that has run
> > these before).  Is there a certain number of cpus you would like this
> > run on or is a 64p box adequate?
> 
> Oh, even a single CPU should be fine.  Just use a large working set. 
> IIRC, about 16GB should ensure that not even the page tables fit in
> the cache (depending on your cache-size, of course).

I have not been able to see any difference.  I am not sure what part of
the hpccoutf.txt file.  I think I should be looking at:

Begin of SingleRandomAccess section.
Main table size   = 2^27 = 134217728 words
Number of updates = 536870912
CPU time used  = 107.997328 seconds
Real time used = 108.077732 seconds
0.004967452 Billion(10^9) Updates    per second [GUP/s]
Found 0 errors in 134217728 locations (passed).
Node(s) with error 0
Node selected 0
Single GUP/s 0.004967
Current time (1130893857) is Tue Nov  1 19:10:57 2005

End of SingleRandomAccess section.


If so, this benchmark seems to have some unstable results.  I ran 10
iterations on a 3-level kernel and got:

CPU time used  = 128.912032 seconds
CPU time used  = 109.921024 seconds
CPU time used  = 170.240752 seconds
CPU time used  = 118.555696 seconds
CPU time used  = 107.986592 seconds
CPU time used  = 126.163616 seconds
CPU time used  = 110.157216 seconds
CPU time used  = 115.738960 seconds
CPU time used  = 107.997328 seconds
CPU time used  = 108.786912 seconds

Is this expected?

Thanks,
Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (8 preceding siblings ...)
  2005-11-02 10:35 ` Robin Holt
@ 2005-11-02 13:26 ` Robin Holt
  2005-11-02 16:11 ` Luck, Tony
                   ` (38 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-02 13:26 UTC (permalink / raw)
  To: linux-ia64

Assuming I am using the right numbers as indicated below, I have seen
no measurable performance change with 4-levels.

> Begin of SingleRandomAccess section.
> Main table size   = 2^27 = 134217728 words
> Number of updates = 536870912
> CPU time used  = 107.997328 seconds
> Real time used = 108.077732 seconds
> 0.004967452 Billion(10^9) Updates    per second [GUP/s]
> Found 0 errors in 134217728 locations (passed).
> Node(s) with error 0
> Node selected 0
> Single GUP/s 0.004967
> Current time (1130893857) is Tue Nov  1 19:10:57 2005
> 
> End of SingleRandomAccess section.

For my real testing, I doubled the dataset size.  The person who helped
me setup the first benchmark had assumed the system only had 1GB per cpu.
I changed that to 2GB.  I was not sure which "time used" was the one
of concern, but neither showed any difference outside the noise range.
I repeated 10 runs, each takes about 6 minutes.  I will attach the whole
information below.

I also created a tweak on Jack's vhpt_miss timing test.  I changed it
so it drags twice the cache-size worth of data through the processor
between each reference to a group of pages spaced through the users
address space at PAGE_SIZE * 2048 * 2048 steps.  This was intended to
show the cost of the stall while loading the extra page table level.
This has likewise showed the cost in the noise range.  The min-to-max
spread of 100 timings of 16,000 references in the loop with large
memset in the middle was 681 mSec for 3-level and 682 mSec for 4-level.
Average time was 2 mSec higher which places it easily within the noise.

I am not sure what other tests people would want run.  I have thrown every
benchmark I know how to run against this.  The more I think through it,
the less concerned I am with adding the extra page table level.  For the
vast majority of applications I think we are talking about consuming an
extra three cache lines.

I base this upon the assertion that the majority of applications only
reference stuff in regions 1,2, and 3.  Since one PGD entry will cover
the entire portion of the address space, we will simply add a single,
frequently used, cacheline to the lookup chain for the vhpt_miss and
page_fault code path.  The only time that will change is when a larger
virtual address space is used and then it is the desired behavior.

Does anybody have any objections to making 4 level the default?

The timings I promised to attached were on a machine which was just
imaged again so I lost my data file.  I will attach those once another
pass is complete.  Sorry for the delay.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (9 preceding siblings ...)
  2005-11-02 13:26 ` Robin Holt
@ 2005-11-02 16:11 ` Luck, Tony
  2005-11-02 16:23 ` Robin Holt
                   ` (37 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Luck, Tony @ 2005-11-02 16:11 UTC (permalink / raw)
  To: linux-ia64

>We then started discussing this.  For a normal application with the
>same virtual address requirements run on a 4 versus a 3 level page table,
>we would end up with, at most five additional pages of page tables with
>a single cache-line used in each.

What about AIM7?  The extra pages would make a difference on a small
memory machine running thousands of processes.  Is that a concern for
anyone?

-Tony

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (10 preceding siblings ...)
  2005-11-02 16:11 ` Luck, Tony
@ 2005-11-02 16:23 ` Robin Holt
  2005-11-02 16:30 ` Luck, Tony
                   ` (36 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-02 16:23 UTC (permalink / raw)
  To: linux-ia64

On Wed, Nov 02, 2005 at 08:11:10AM -0800, Luck, Tony wrote:
> >We then started discussing this.  For a normal application with the
> >same virtual address requirements run on a 4 versus a 3 level page table,
> >we would end up with, at most five additional pages of page tables with
> >a single cache-line used in each.
> 
> What about AIM7?  The extra pages would make a difference on a small
> memory machine running thousands of processes.  Is that a concern for
> anyone?

On the aim7 runs I did on a 4 cpu, 4GB machine it was in the noise
range with the 4-level kernel outperforming on nearly an equal number
of points to the 3-level.

Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (11 preceding siblings ...)
  2005-11-02 16:23 ` Robin Holt
@ 2005-11-02 16:30 ` Luck, Tony
  2005-11-02 17:16 ` Robin Holt
                   ` (35 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Luck, Tony @ 2005-11-02 16:30 UTC (permalink / raw)
  To: linux-ia64

>On the aim7 runs I did on a 4 cpu, 4GB machine it was in the noise
>range with the 4-level kernel outperforming on nearly an equal number
>of points to the 3-level.

Good to hear that (though I must remember to specify what I mean
by "small memory" when talking to SGI ... there are ia64 boxes
with only 1G :-)

Ken: Any predictions on when a 4-level run might get to the
top of your queue for a run with an industry standard transaction
benchmark?

-Tony

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (12 preceding siblings ...)
  2005-11-02 16:30 ` Luck, Tony
@ 2005-11-02 17:16 ` Robin Holt
  2005-11-02 18:59 ` David Mosberger-Tang
                   ` (34 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-02 17:16 UTC (permalink / raw)
  To: linux-ia64


I think I will hold off on doing this for now.  I think it is better done
as a seperate patch.  I will happily try to tackle this once the direction
of 4-level PTDs is set, but until then, I think I want to skip this.

Thanks,
Robin

On Fri, Oct 28, 2005 at 03:19:57PM +1000, Ian Wienand wrote:
> Something I would really like is a small abstraction of some of the
> page table access macros to make it a little clearer as to what is
> happening.  Besides which using '-3' in many places assumes the size
> of entries (of particular interest is the size of a PTE, which you
> might want to increase; I can't see why you would increase the upper
> levels).
> 
> I'd like to see something like this go in, or possibly wrap it up with
> your changes.  I also tried to make comments a little more explicit.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (13 preceding siblings ...)
  2005-11-02 17:16 ` Robin Holt
@ 2005-11-02 18:59 ` David Mosberger-Tang
  2005-11-02 22:26 ` Ian Wienand
                   ` (33 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: David Mosberger-Tang @ 2005-11-02 18:59 UTC (permalink / raw)
  To: linux-ia64

Robin,

Thanks for running all those benchmarks.

On 11/2/05, Robin Holt <holt@sgi.com> wrote:

> Does anybody have any objections to making 4 level the default?

My concern is that 4-level PT really isn't needed on ia64 for all but
the largest machines and adding a 4-th level cannot possibly help
performance and definitely has a (small) cost.  I see that you have a
CONFIG option now, which is good.  I don't have a strong opinion what
should be the default.  Longer term, it would be good to settle on one
implementation, so perhaps it's not a bad idea to make 4-level the
default and see if anyone complains about performance regressions.

  --david
--
Mosberger Consulting LLC, voice/fax: 510-744-9372,
http://www.mosberger-consulting.com/
35706 Runckel Lane, Fremont, CA 94536

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (14 preceding siblings ...)
  2005-11-02 18:59 ` David Mosberger-Tang
@ 2005-11-02 22:26 ` Ian Wienand
  2005-11-03  1:36 ` Gerald Pfeifer
                   ` (32 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Ian Wienand @ 2005-11-02 22:26 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 750 bytes --]

On Wed, Nov 02, 2005 at 11:16:11AM -0600, Robin Holt wrote:
> I think I will hold off on doing this for now.  I think it is better done
> as a seperate patch.  I will happily try to tackle this once the direction
> of 4-level PTDs is set, but until then, I think I want to skip this.

Ok, cool.  I'll try again later (once I've finished fixing all our
patches to account for 4 levels ;).

I must admit to be a bit perplexed however.  I would have thought that
a customer who just spent (what I assume is a lot of) money on a
machine to map huge areas of contiguous memory would really want to
evaluate the probable benefits of larger pages, despite what Redhat
ships.  I'm probably just extremley naive as to what customers really
want, however.

-i

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (15 preceding siblings ...)
  2005-11-02 22:26 ` Ian Wienand
@ 2005-11-03  1:36 ` Gerald Pfeifer
  2005-11-03  1:53 ` Chen, Kenneth W
                   ` (31 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Gerald Pfeifer @ 2005-11-03  1:36 UTC (permalink / raw)
  To: linux-ia64

On Thu, 3 Nov 2005, Ian Wienand wrote:
> On Wed, Nov 02, 2005 at 11:16:11AM -0600, Robin Holt wrote:
> I must admit to be a bit perplexed however.  I would have thought that
> a customer who just spent (what I assume is a lot of) money on a
> machine to map huge areas of contiguous memory would really want to
> evaluate the probable benefits of larger pages, despite what Redhat
> ships.

I'll note that SUSE has been shipping a 64k-pagesize kernel for more
than a year now as part of SUSE LINUX Enterprise Server 9, and I have
not seen a single L3 support call for this kernel.

Which means that it's either completely bug free, or nobody uses it
in production. ;-)

> I'm probably just extremley naive as to what customers really
> want, however.

The reason customers generally prefer to go with default kernels,
as far as I can tell, is software certification.  If you're only
running your special applications, you may not care, but for such
huge memory configs a DBMS like Oracle or DB2 is often part of the
picture, for example.

Gerald

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (16 preceding siblings ...)
  2005-11-03  1:36 ` Gerald Pfeifer
@ 2005-11-03  1:53 ` Chen, Kenneth W
  2005-11-03  3:55 ` Jack Steiner
                   ` (30 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-11-03  1:53 UTC (permalink / raw)
  To: linux-ia64

David Mosberger-Tang wrote on Wednesday, November 02, 2005 10:59 AM
> On 11/2/05, Robin Holt <holt@sgi.com> wrote:
> 
> > Does anybody have any objections to making 4 level the default?
> 
> My concern is that 4-level PT really isn't needed on ia64 for all but
> the largest machines

No kidding. People envy that I have a machine with 64 GB, 128 GB and I
envy Robin has access to machine with more than 16 TB.  How many average
joe is going to be able to afford machine like that?  Actually, how many
installation will there be in the entire world (100?, 1000?)

> so perhaps it's not a bad idea to make 4-level the
> default and see if anyone complains about performance regressions.

He he. What a nice way to kick people around for doing performance
measurement :-)

I just did a quick measurement with I/O submission path.  get_user_pages
is my biggest concern that extra level means more code and cache foot
print for each I/O submission.  It cost about 1.5% more in I/O path
length on a micro-benchmark (from block layer and Up).  This could be
all buried as noise in a larger scheme for industry database benchmark.

Another thing: has people tested 4 level page table with 4GB hugetlb page
size?  Looks like pud is already falling short on bits and entire pgd bits
will be falling off the 64-bit.  128-bit computing anybody? (just kidding).

- Ken

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (17 preceding siblings ...)
  2005-11-03  1:53 ` Chen, Kenneth W
@ 2005-11-03  3:55 ` Jack Steiner
  2005-11-03 16:36 ` Robin Holt
                   ` (29 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Jack Steiner @ 2005-11-03  3:55 UTC (permalink / raw)
  To: linux-ia64

On Thu, Nov 03, 2005 at 02:36:18AM +0100, Gerald Pfeifer wrote:
> On Thu, 3 Nov 2005, Ian Wienand wrote:
> > On Wed, Nov 02, 2005 at 11:16:11AM -0600, Robin Holt wrote:
> > I must admit to be a bit perplexed however.  I would have thought that
> > a customer who just spent (what I assume is a lot of) money on a
> > machine to map huge areas of contiguous memory would really want to
> > evaluate the probable benefits of larger pages, despite what Redhat
> > ships.
> 
> I'll note that SUSE has been shipping a 64k-pagesize kernel for more
> than a year now as part of SUSE LINUX Enterprise Server 9, and I have
> not seen a single L3 support call for this kernel.
> 
> Which means that it's either completely bug free, or nobody uses it
> in production. ;-)

You are probably correct that very few sites use it. Part of the reason is 
that the 64k-pagesize kernel is built with NR_CPUS\x128 - not 512. I suspect
that many of the big-memory sites have more than 128p. That makes the 64K page
kernel unusable for those sites.

In addition, the 64k-pagesize kernel does not have KDB configured. That
makes support more difficult. Many sites use the "arch-kdb" commands
to take quick dumps after system failures.

(and of course it really is bug free :-)


> 
> > I'm probably just extremley naive as to what customers really
> > want, however.
> 
> The reason customers generally prefer to go with default kernels,
> as far as I can tell, is software certification.  If you're only
> running your special applications, you may not care, but for such
> huge memory configs a DBMS like Oracle or DB2 is often part of the
> picture, for example.

True...

> 
> Gerald
> -
> To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Thanks

.Jack Steiner (steiner@sgi.com)          651-683-5302
Principal Engineer                      SGI - Silicon Graphics, Inc.



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (18 preceding siblings ...)
  2005-11-03  3:55 ` Jack Steiner
@ 2005-11-03 16:36 ` Robin Holt
  2005-11-03 19:59 ` Chen, Kenneth W
                   ` (28 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-03 16:36 UTC (permalink / raw)
  To: linux-ia64

On Wed, Nov 02, 2005 at 05:53:10PM -0800, Chen, Kenneth W wrote:
> I just did a quick measurement with I/O submission path.  get_user_pages
> is my biggest concern that extra level means more code and cache foot
> print for each I/O submission.  It cost about 1.5% more in I/O path
> length on a micro-benchmark (from block layer and Up).  This could be
> all buried as noise in a larger scheme for industry database benchmark.

Can you point me at the benchmark?

Thanks,
Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (19 preceding siblings ...)
  2005-11-03 16:36 ` Robin Holt
@ 2005-11-03 19:59 ` Chen, Kenneth W
  2005-11-04 17:58 ` Luck, Tony
                   ` (27 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-11-03 19:59 UTC (permalink / raw)
  To: linux-ia64

Robin Holt wrote on Thursday, November 03, 2005 8:37 AM
> On Wed, Nov 02, 2005 at 05:53:10PM -0800, Chen, Kenneth W wrote:
> > I just did a quick measurement with I/O submission path.  get_user_pages
> > is my biggest concern that extra level means more code and cache foot
> > print for each I/O submission.  It cost about 1.5% more in I/O path
> > length on a micro-benchmark (from block layer and Up).  This could be
> > all buried as noise in a larger scheme for industry database benchmark.
> 
> Can you point me at the benchmark?

Here, kernel null blk driver:
http://marc.theaimsgroup.com/?l=linux-kernel&m\x111033439400836&w=2


And here: user space micro-benchmark:
http://marc.theaimsgroup.com/?l=linux-kernel&m\x111033404219628&w=2


- Ken


^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (20 preceding siblings ...)
  2005-11-03 19:59 ` Chen, Kenneth W
@ 2005-11-04 17:58 ` Luck, Tony
  2005-11-04 21:37 ` Robin Holt
                   ` (26 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Luck, Tony @ 2005-11-04 17:58 UTC (permalink / raw)
  To: linux-ia64

>It turns out that we have measurable performance regression with
>4-level page table running industry database benchmark, the penalty
>is ~0.2% (I thought the number would be smaller than 0.2%, but ...).
>Our resolution with this workload is typically 0.05%.

That makes it tough to make 4-level the default.  If there were a
lot more users that needed 4-level, I might tell Ken he'd have to
just live with it ... but this is only needed for a few HPC jobs
on very, very high end machines.

I'll put the patches into the test tree as they are (with 4-level
as the default) so that we'll get some test exposure in -mm.  But
if they move into the base it will be with 3-level as the default.

-Tony

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (21 preceding siblings ...)
  2005-11-04 17:58 ` Luck, Tony
@ 2005-11-04 21:37 ` Robin Holt
  2005-11-04 21:42 ` Chen, Kenneth W
                   ` (25 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-04 21:37 UTC (permalink / raw)
  To: linux-ia64

On Fri, Nov 04, 2005 at 10:51:07AM -0800, Chen, Kenneth W wrote:
> Luck, Tony wrote on Wednesday, November 02, 2005 8:31 AM
> > Good to hear that (though I must remember to specify what I mean
> > by "small memory" when talking to SGI ... there are ia64 boxes
> > with only 1G :-)
> > 
> > Ken: Any predictions on when a 4-level run might get to the
> > top of your queue for a run with an industry standard transaction
> > benchmark?
> 
> It turns out that we have measurable performance regression with
> 4-level page table running industry database benchmark, the penalty
> is ~0.2% (I thought the number would be smaller than 0.2%, but ...).
> Our resolution with this workload is typically 0.05%.

Can you point me at this benchmark?

Thanks,
Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (22 preceding siblings ...)
  2005-11-04 21:37 ` Robin Holt
@ 2005-11-04 21:42 ` Chen, Kenneth W
  2005-11-04 22:50 ` Chen, Kenneth W
                   ` (24 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-11-04 21:42 UTC (permalink / raw)
  To: linux-ia64

Robin Holt wrote on Friday, November 04, 2005 1:38 PM
> On Fri, Nov 04, 2005 at 10:51:07AM -0800, Chen, Kenneth W wrote:
> > Luck, Tony wrote on Wednesday, November 02, 2005 8:31 AM
> > > Good to hear that (though I must remember to specify what I mean
> > > by "small memory" when talking to SGI ... there are ia64 boxes
> > > with only 1G :-)
> > > 
> > > Ken: Any predictions on when a 4-level run might get to the
> > > top of your queue for a run with an industry standard transaction
> > > benchmark?
> > 
> > It turns out that we have measurable performance regression with
> > 4-level page table running industry database benchmark, the penalty
> > is ~0.2% (I thought the number would be smaller than 0.2%, but ...).
> > Our resolution with this workload is typically 0.05%.
> 
> Can you point me at this benchmark?

I hope everyone knows by now what this "industry standard database
transaction processing benchmark" is.  If not, please google it.

Sorry, this is the best I can do without breaking any U.S. law.

- Ken


^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (23 preceding siblings ...)
  2005-11-04 21:42 ` Chen, Kenneth W
@ 2005-11-04 22:50 ` Chen, Kenneth W
  2005-11-07 21:18 ` Luck, Tony
                   ` (23 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-11-04 22:50 UTC (permalink / raw)
  To: linux-ia64

David Mosberger-Tang wrote on Tuesday, November 01, 2005 7:41 AM
> On 11/1/05, Robin Holt <holt@sgi.com> wrote:
> > I am trying to get time on one of our larger machines today to run the
> > RandomAccess benchmark (as well as some help from somebody that has run
> > these before).  Is there a certain number of cpus you would like this
> > run on or is a 64p box adequate?
> 
> Oh, even a single CPU should be fine.  Just use a large working set. 
> IIRC, about 16GB should ensure that not even the page tables fit in
> the cache (depending on your cache-size, of course).


Robin, here, something as silly as this test program [*] will show you
the performance regression with 4-level page table:

#include <sys/mman.h>

#define SIZE	(16*1024*1024*1024UL)

int main()
{
	char* addr;
	unsigned long i, j, sum;
	unsigned long start, end;

	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);

	/* fault in all the pages */
	for (i=0; i<SIZE; i+\x16384)
		addr[i] = 0;

	asm volatile ("mov %0=ar.itc" : "=r"(start));

	for (j=0; j<100000; j++)
		for (i=0; i<SIZE; i+= (1UL << 25))
			sum += addr[i];

	asm volatile ("mov %0=ar.itc" : "=r"(end));

	printf("time is %ld\n", end - start);
}

With 3-level page table kernel: time is 16405345406
With 4-level page table kernel: time is 26768668506

- Ken


[*] disclaimer: this code can not be even called as a benchmark
    since it does not meet basic benchmark criteria and definitions.
    I did it with maybe 2 minutes or so.  However, given its
    simplicity, such program can be used as an illustrative purpose.


^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (24 preceding siblings ...)
  2005-11-04 22:50 ` Chen, Kenneth W
@ 2005-11-07 21:18 ` Luck, Tony
  2005-11-08  0:22 ` Rohit Seth
                   ` (22 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Luck, Tony @ 2005-11-07 21:18 UTC (permalink / raw)
  To: linux-ia64

>Another thing: has people tested 4 level page table with 4GB hugetlb page
>size?  Looks like pud is already falling short on bits and entire pgd bits
>will be falling off the 64-bit.  128-bit computing anybody? (just kidding).

I didn't try 4GB huge page ... but I did just try to build with
a 64K normal page, and the build failed with fatal errors in ivt.S
because of attempts to shift by more than 64.  There are also
a gazillion[1] warnings about "left shift count >= width of type"
from all over the build.

Perhaps arch/ia64/Kconfig shouldn't let you choose 4-level tables
with a 64K pagesize?

-Tony

[1] Actually "just" 1368 warnings.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (25 preceding siblings ...)
  2005-11-07 21:18 ` Luck, Tony
@ 2005-11-08  0:22 ` Rohit Seth
  2005-11-08 12:43 ` Robin Holt
                   ` (21 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Rohit Seth @ 2005-11-08  0:22 UTC (permalink / raw)
  To: linux-ia64

On Wed, 2005-11-02 at 17:36 -0800, Gerald Pfeifer wrote:
> On Thu, 3 Nov 2005, Ian Wienand wrote: 
> > On Wed, Nov 02, 2005 at 11:16:11AM -0600, Robin Holt wrote: 
> > I must admit to be a bit perplexed however.  I would have thought
> that 
> > a customer who just spent (what I assume is a lot of) money on a 
> > machine to map huge areas of contiguous memory would really want to 
> > evaluate the probable benefits of larger pages, despite what Redhat 
> > ships.

Well, huge pages are not that easy to use...particularly if the app
source code can not be changed.

> 
> I'll note that SUSE has been shipping a 64k-pagesize kernel for more 
> than a year now as part of SUSE LINUX Enterprise Server 9, and I have 
> not seen a single L3 support call for this kernel.
> 
> Which means that it's either completely bug free, or nobody uses it 
> in production. ;-)
> 

I think using a 64K page size (may be make that default for IA-64 or
distribute as another kernel the way SuSE has done) is preferred over
4-level page tables....particularly for big memory machines.

There is always at least couple of percentage points that an application
can gain with even smaller memory foot print (like few gig!) by using
64K page size for normal pages.

-rohit


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (26 preceding siblings ...)
  2005-11-08  0:22 ` Rohit Seth
@ 2005-11-08 12:43 ` Robin Holt
  2005-11-08 18:23 ` Boehm, Hans
                   ` (20 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-08 12:43 UTC (permalink / raw)
  To: linux-ia64

On Mon, Nov 07, 2005 at 04:22:32PM -0800, Rohit Seth wrote:
> I think using a 64K page size (may be make that default for IA-64 or
> distribute as another kernel the way SuSE has done) is preferred over
> 4-level page tables....particularly for big memory machines.

For your particular application, that may be the case.  For approx half
of our customers, they _REQUIRE_ their application be certified by the
software vendor.  The vendors usually try to limit their exposure by
certifiying on the smallest set kernel/modules/libraries possible.
We don't control that.

> There is always at least couple of percentage points that an application
> can gain with even smaller memory foot print (like few gig!) by using
> 64K page size for normal pages.

There is also the possibility that the app may be using the pages sparsely
and therefore wasting a larger percentage of time zeroing memory which
is never needed (smaller percent of page fill).

Thanks,
Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (27 preceding siblings ...)
  2005-11-08 12:43 ` Robin Holt
@ 2005-11-08 18:23 ` Boehm, Hans
  2005-11-08 18:52 ` Magenheimer, Dan (HP Labs Fort Collins)
                   ` (19 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Boehm, Hans @ 2005-11-08 18:23 UTC (permalink / raw)
  To: linux-ia64

> -----Original Message-----
> From: Robin Holt
> 
> There is also the possibility that the app may be using the 
> pages sparsely and therefore wasting a larger percentage of 
> time zeroing memory which is never needed (smaller percent of 
> page fill).
> 
And, perhaps less significantly, there is a small set of applications
that try to use page protection to track accesses, e.g. for incremental
GC, faster checkpointing, or software DSM.  I expect those would
generally do considerably worse with 64K pages.

Hans

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (28 preceding siblings ...)
  2005-11-08 18:23 ` Boehm, Hans
@ 2005-11-08 18:52 ` Magenheimer, Dan (HP Labs Fort Collins)
  2005-11-08 18:56 ` Rohit Seth
                   ` (18 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Magenheimer, Dan (HP Labs Fort Collins) @ 2005-11-08 18:52 UTC (permalink / raw)
  To: linux-ia64

Just a thought... shouldn't it be possible (at least in theory)
for all the page table macros to be made a bit more dynamically
flexible so that PAGESIZE can (at least optionally) be specified
as a boottime parameter?  For example (very loosely):

#define pgd_macro(x,y,z) \
( \
if (unlikely(boottime_pagesize)) _pgd_macro(boottime_pagesize,x,y,z); \
else _pgd_macro(PAGESIZE,x,y,z); \
)

The cost is of course a global (or cpu) variable access for
every pud/pgd/pmd/pte macro usage, but one would expect the
global would always be in cache/TLB so the performance impact
should be near zero.  Boottime_pagesize could even be optionally
define'd to 0 so, in the case where "near zero" is not good
enough, cpp could make the extra variable access go away.

The same trick could potentially be used to determine whether
to use a 4-level or 3-level page table at runtime.

A big advantage of this is that distros only need deliver a
single kernel (even if it is still prudent to test that binary
with multiple boottime_pagesize's).  It also is much more flexible
for customers who know how their machine is going to be used but
do not want to build their own kernel.  (E.g. in Hans' example,
maybe the best PAGESIZE choice is 4K or in some database app
maybe the best PAGESIZE choice is 1MB... distros will probably
never ship either of these.)

Seems like this would be useful for a number of arch's so even
if it requires some common changes, it could fly.

Comments?

Dan Magenheimer
HP Labs

> -----Original Message-----
> From: linux-ia64-owner@vger.kernel.org 
> [mailto:linux-ia64-owner@vger.kernel.org] On Behalf Of Boehm, Hans
> Sent: Tuesday, November 08, 2005 11:24 AM
> To: Robin Holt; Rohit Seth
> Cc: Gerald Pfeifer; Ian Wienand; linux-ia64@vger.kernel.org; 
> david.mosberger@acm.org
> Subject: RE: [RFC] 4-level page table directories.
> 
> > -----Original Message-----
> > From: Robin Holt
> > 
> > There is also the possibility that the app may be using the 
> > pages sparsely and therefore wasting a larger percentage of 
> > time zeroing memory which is never needed (smaller percent of 
> > page fill).
> > 
> And, perhaps less significantly, there is a small set of applications
> that try to use page protection to track accesses, e.g. for 
> incremental
> GC, faster checkpointing, or software DSM.  I expect those would
> generally do considerably worse with 64K pages.
> 
> Hans
> -
> To unsubscribe from this list: send the line "unsubscribe 
> linux-ia64" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (29 preceding siblings ...)
  2005-11-08 18:52 ` Magenheimer, Dan (HP Labs Fort Collins)
@ 2005-11-08 18:56 ` Rohit Seth
  2005-11-08 19:36 ` Robin Holt
                   ` (17 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Rohit Seth @ 2005-11-08 18:56 UTC (permalink / raw)
  To: linux-ia64

On Tue, 2005-11-08 at 06:43 -0600, Robin Holt wrote:
> On Mon, Nov 07, 2005 at 04:22:32PM -0800, Rohit Seth wrote:
> > I think using a 64K page size (may be make that default for IA-64 or
> > distribute as another kernel the way SuSE has done) is preferred over
> > 4-level page tables....particularly for big memory machines.
> 
> For your particular application, that may be the case.  For approx half
> of our customers, they _REQUIRE_ their application be certified by the
> software vendor.  The vendors usually try to limit their exposure by
> certifiying on the smallest set kernel/modules/libraries possible.
> We don't control that.
> 

I agree with you completely about OSV certification part.  And SuSE is
again a good example here, they have a released kernel with 64K page
size.  There is no reason why end customer should not be using this
bigger page kernel when desired.

> > There is always at least couple of percentage points that an application
> > can gain with even smaller memory foot print (like few gig!) by using
> > 64K page size for normal pages.
> 
> There is also the possibility that the app may be using the pages sparsely
> and therefore wasting a larger percentage of time zeroing memory which
> is never needed (smaller percent of page fill).
> 

You are right that there is extra setup cost (+ some additional bloat)
coming with default 64K page size.  But there is additional cost
associated with 4-level page tables too(some of it in the critical
low-level fault handlers as well). 

I just think that we should validate the 64K page size more rigorously.
So as to have the OSVs gain more confidence.  There is such a wide range
of system configurations...having a single kernel configuration may not
be the optimal solution.

Thanks,
-rohit

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (30 preceding siblings ...)
  2005-11-08 18:56 ` Rohit Seth
@ 2005-11-08 19:36 ` Robin Holt
  2005-11-08 20:07 ` Chen, Kenneth W
                   ` (16 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-08 19:36 UTC (permalink / raw)
  To: linux-ia64

Ken,

Can I get you to reproduce this?  I have tried many times and
your test is giving me numbers that are very close between 3
and 4 level page tables.  For 25 runs, I got:

With 3-level page table kernel: Average of 25 is 24612659771.96
With 4-level page table kernel: Average of 25 is 24686556792.96

Which is showing that a vhtp_miss test is adding a 0.30% overhead
which can also be expressed as an average 1.44 clock cycles per
miss.  As of this writing, the loops have run over 250 times and
the min reading to this point is 23946196482.  This is nowhere
close to your min reported.

> With 3-level page table kernel: time is 16405345406
> With 4-level page table kernel: time is 26768668506

Thanks,
Robin Holt

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (31 preceding siblings ...)
  2005-11-08 19:36 ` Robin Holt
@ 2005-11-08 20:07 ` Chen, Kenneth W
  2005-11-08 20:27 ` Chen, Kenneth W
                   ` (15 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-11-08 20:07 UTC (permalink / raw)
  To: linux-ia64

Magenheimer, Dan wrote on Tuesday, November 08, 2005 10:53 AM
> The cost is of course a global (or cpu) variable access for
> every pud/pgd/pmd/pte macro usage, but one would expect the
> global would always be in cache/TLB so the performance impact
> should be near zero.

That is only true when your entire working set fits into the
cache (both kernel and user data).  Here you burn an extra cache
line for global page size variable, it will displace one cache
line for user application.  Enterprise workloads all have working
set much bigger than CPU's last level cache.  In the end, both
kernel and user side hurt with more cache miss rate.

- Ken

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (32 preceding siblings ...)
  2005-11-08 20:07 ` Chen, Kenneth W
@ 2005-11-08 20:27 ` Chen, Kenneth W
  2005-11-08 22:09 ` Ian Wienand
                   ` (14 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-11-08 20:27 UTC (permalink / raw)
  To: linux-ia64

Robin Holt wrote on Tuesday, November 08, 2005 11:37 AM
> Can I get you to reproduce this?  I have tried many times and
> your test is giving me numbers that are very close between 3
> and 4 level page tables.  For 25 runs, I got:
> 
> With 3-level page table kernel: Average of 25 is 24612659771.96
> With 4-level page table kernel: Average of 25 is 24686556792.96
> 
> Which is showing that a vhtp_miss test is adding a 0.30% overhead
> which can also be expressed as an average 1.44 clock cycles per
> miss.  As of this writing, the loops have run over 250 times and
> the min reading to this point is 23946196482.  This is nowhere
> close to your min reported.

The other option is to instrument vhpt_miss handler and measure
average clock ticks spend in that handler.  I had that instrumented
and measured with 3-level/4-level page table configurations.  I
measured 221 clocks with 3-level page table, versus 298 clocks with
4-level page table [*].  This measurement is certainly depends on
system chipset/platform.  But the point is that penalty with 4-level
page table is certainly visible.  Not only the low level hander has
to Walk the extra level, it also incurs additional cache misses while
walking the table, and it has damaging side effect of evicting other
Working set data resides in the cache.

- Ken

[*] measured on: 1.6 GHz Itainum2 processor, 9M L3, Intel server platform
    SR870BN4.  32GB PC2100 DDR memory.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (33 preceding siblings ...)
  2005-11-08 20:27 ` Chen, Kenneth W
@ 2005-11-08 22:09 ` Ian Wienand
  2005-11-08 23:58 ` Gerald Pfeifer
                   ` (13 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Ian Wienand @ 2005-11-08 22:09 UTC (permalink / raw)
  To: linux-ia64

[-- Attachment #1: Type: text/plain, Size: 968 bytes --]

(CC trimmed)

On Tue, Nov 08, 2005 at 10:52:38AM -0800, Magenheimer, Dan (HP Labs Fort Collins) wrote:
> Just a thought... shouldn't it be possible (at least in theory)
> for all the page table macros to be made a bit more dynamically
> flexible so that PAGESIZE can (at least optionally) be specified
> as a boottime parameter?  For example (very loosely):

Well, this suggests to me that maybe we could start working towards a
proper page table abstraction where we can implement totally different
underlying page table implementations with relative ease.

Paul Davies has been looking into this area with some success with an
initial implementation of a guarded page table underneath his
abstraction interface.  The abstraction stuff was posted to linux-mm

http://thread.gmane.org/gmane.linux.kernel.mm/7847

Obviously it isn't something that will happen overnight, but certainly
if a few people could agree it is a useful long term goal that might
be useful.

-i

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (34 preceding siblings ...)
  2005-11-08 22:09 ` Ian Wienand
@ 2005-11-08 23:58 ` Gerald Pfeifer
  2005-11-09  0:08 ` David Mosberger-Tang
                   ` (12 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Gerald Pfeifer @ 2005-11-08 23:58 UTC (permalink / raw)
  To: linux-ia64

On Tue, 8 Nov 2005, Rohit Seth wrote:
> I agree with you completely about OSV certification part.  And SuSE is
> again a good example here, they have a released kernel with 64K page
> size.  There is no reason why end customer should not be using this
> bigger page kernel when desired.

Unfortuantely, I'm pretty sure that if you run such a kernel and call
$BIGISV for support you'll find yourself in troubles. :-(

For example, at least one important ISV even requires a specific
version of the distro kernel RPMs (excluding security updates), so
I pretty much doubt they'll support an alternative kernel such as
one with 64K page size.

> I just think that we should validate the 64K page size more rigorously. 
> So as to have the OSVs gain more confidence.  There is such a wide range 
> of system configurations...having a single kernel configuration may not 
> be the optimal solution.

As an OSV, we are facing combinatorial explosion.  For example when
we also consider CPU counts we get 32p vs 512p maximum on one dimension,
and 16K vs 64K on another, and there may be further ones lurking in the
dark right now (like Xen Dom0 versus Xen Dom1).

OSVs and ISVs, on the other hand, usually prefer to only test and harden 
and qualify/certify a single kernel (which is why Dan's approach to Xen/ia64
looks quite appealing).

Gerald

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (35 preceding siblings ...)
  2005-11-08 23:58 ` Gerald Pfeifer
@ 2005-11-09  0:08 ` David Mosberger-Tang
  2005-11-09  0:22 ` Rohit Seth
                   ` (11 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: David Mosberger-Tang @ 2005-11-09  0:08 UTC (permalink / raw)
  To: linux-ia64

On 11/8/05, Gerald Pfeifer <gp@suse.de> wrote:

> OSVs and ISVs, on the other hand, usually prefer to only test and harden
> and qualify/certify a single kernel (which is why Dan's approach to Xen/ia64
> looks quite appealing).

That's a completely bogus argument though.  Perhaps the ISV is stupid
enough not to realize what happens when you boot with pgsizedK but
it certainly doesn't avoid combinatorial explosion (e.g., programs
that have 16KB page size hardcoded still may fail...).

  --david
--
Mosberger Consulting LLC, voice/fax: 510-744-9372,
http://www.mosberger-consulting.com/
35706 Runckel Lane, Fremont, CA 94536

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (36 preceding siblings ...)
  2005-11-09  0:08 ` David Mosberger-Tang
@ 2005-11-09  0:22 ` Rohit Seth
  2005-11-09  0:46 ` Magenheimer, Dan (HP Labs Fort Collins)
                   ` (10 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Rohit Seth @ 2005-11-09  0:22 UTC (permalink / raw)
  To: linux-ia64

On Wed, 2005-11-09 at 00:58 +0100, Gerald Pfeifer wrote:
> On Tue, 8 Nov 2005, Rohit Seth wrote:
> > I agree with you completely about OSV certification part.  And SuSE is
> > again a good example here, they have a released kernel with 64K page
> > size.  There is no reason why end customer should not be using this
> > bigger page kernel when desired.
> 
> Unfortuantely, I'm pretty sure that if you run such a kernel and call
> $BIGISV for support you'll find yourself in troubles. :-(
> 

> OSVs and ISVs, on the other hand, usually prefer to only test and harden 
> and qualify/certify a single kernel (which is why Dan's approach to Xen/ia64
> looks quite appealing).
> 

Going by your first argument, the first thing ISV/OSV will in this case
ask the end customer to knock of all the extra/special command line
options used...

-rohit


^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (37 preceding siblings ...)
  2005-11-09  0:22 ` Rohit Seth
@ 2005-11-09  0:46 ` Magenheimer, Dan (HP Labs Fort Collins)
  2005-11-09  1:18 ` Chen, Kenneth W
                   ` (9 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Magenheimer, Dan (HP Labs Fort Collins) @ 2005-11-09  0:46 UTC (permalink / raw)
  To: linux-ia64

> On 11/8/05, Gerald Pfeifer <gp@suse.de> wrote:
> 
> > OSVs and ISVs, on the other hand, usually prefer to only 
> test and harden
> > and qualify/certify a single kernel (which is why Dan's 
> approach to Xen/ia64
> > looks quite appealing).
> 
> That's a completely bogus argument though.  Perhaps the ISV is stupid
> enough not to realize what happens when you boot with pgsizedK but
> it certainly doesn't avoid combinatorial explosion (e.g., programs
> that have 16KB page size hardcoded still may fail...).
> 
>   --david

Not completely bogus.  Flexibility always leads to combinatorial
explosion, yet OSVs still support a wide variety of devices,
filesystems, GUIs, etc.  Exposing the ability to more easily
change pagesize is just one more degree of flexibility.  Why
restrict it to "must rebuild kernel"?  And in that case, why not
restrict it to 4K pages only and remove the flexibility from
the ia64 kernel entirely?  (after all, programs that have 4KB page
size hardcoded may fail on ia64 because a small group of kernel
designers decided that 16KB is a better default than 4KB :-)

So, the question becomes who should bear the burden of pagesize
flexibility?  Why should kernel designers decide this (rather
than pass the buck upstream as CPU designers have :-)?

My two cents,
Dan

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (38 preceding siblings ...)
  2005-11-09  0:46 ` Magenheimer, Dan (HP Labs Fort Collins)
@ 2005-11-09  1:18 ` Chen, Kenneth W
  2005-11-09 12:11 ` Robin Holt
                   ` (8 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-11-09  1:18 UTC (permalink / raw)
  To: linux-ia64

Magenheimer, Dan wrote on Tuesday, November 08, 2005 4:46 PM
> Not completely bogus.  Flexibility always leads to combinatorial
> explosion, yet OSVs still support a wide variety of devices,
> filesystems, GUIs, etc.  Exposing the ability to more easily
> change pagesize is just one more degree of flexibility.

I think you are mixing the argument here.  Single kernel image does
not equate to better/easier OSD/ISV certification.  I boot this
fancy-one-image-handle-all-cases kernel with half a dozen special
boot time arguments that changes page-size, xen, numa, cpuset,
scheduler parameters.  And then call up the ISV and ask for support,
are they going to support that special configuration?  Referring
to Rohit Seth's earlier point: ISV is going to say: "we don't support
that configuration, boot without any boot time argument and call back".

I don't see how ISV/OSD's certification job is any easier.  Perhaps,
even harder because unless one covers the entire permutation of all
the options, it will never be complete.

Having a flexibility does not automatically grant you certification
for all configurations unless someone spend the time and effort in
validating them all.  There is no shortcut here.

- Ken

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (39 preceding siblings ...)
  2005-11-09  1:18 ` Chen, Kenneth W
@ 2005-11-09 12:11 ` Robin Holt
  2005-11-09 14:29 ` Robin Holt
                   ` (7 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-09 12:11 UTC (permalink / raw)
  To: linux-ia64

Have we all agreed by now that the performance hit is relatively minor
(unmeasurable for most cases due to it being less than the noise of the
samples) from 4-level page tables?  Do I need to do any more performance
measurements?

> measured 221 clocks with 3-level page table, versus 298 clocks with

This looks like approx the amount of stall you would see from a cacheline
fill.  Is this value the average of a large number of samples a single
sample.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (40 preceding siblings ...)
  2005-11-09 12:11 ` Robin Holt
@ 2005-11-09 14:29 ` Robin Holt
  2005-11-09 18:22 ` Chen, Kenneth W
                   ` (6 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-09 14:29 UTC (permalink / raw)
  To: linux-ia64

On Mon, Nov 07, 2005 at 01:18:37PM -0800, Luck, Tony wrote:
> >Another thing: has people tested 4 level page table with 4GB hugetlb page
> >size?  Looks like pud is already falling short on bits and entire pgd bits
> >will be falling off the 64-bit.  128-bit computing anybody? (just kidding).
> 
> I didn't try 4GB huge page ... but I did just try to build with
> a 64K normal page, and the build failed with fatal errors in ivt.S
> because of attempts to shift by more than 64.  There are also
> a gazillion[1] warnings about "left shift count >= width of type"
> from all over the build.
> 
> Perhaps arch/ia64/Kconfig shouldn't let you choose 4-level tables
> with a 64K pagesize?

I have this compiling now with 3 or 4 page table levels.  Given that
4 levels with 64k pages results in only having 1/8 of the page used,
I think I am going to get rid of my desire to not have this configurable
(I will merge the two patches together into one).

I still think that 4 level is probably a reasonable default.  With all the
benchmarks I ran, the difference remains in the noise range.  Worst case
change is approx 0.3% change in microbenchmarks with approx 2% noise.
Many have less that 0.1% change.  That seems reasonable.

As for larger page sizes, those config options seem to be removed.
How would you normally go about doing a 4GB huge page?  Is there more
to it than adding the config option back in?

With the latest changes I made to ivt.S, my 3-level selection no longer
boots so I will delay posting until I have that working.

Thanks,
Robin Holt

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (41 preceding siblings ...)
  2005-11-09 14:29 ` Robin Holt
@ 2005-11-09 18:22 ` Chen, Kenneth W
  2005-11-09 18:39 ` Luck, Tony
                   ` (5 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Chen, Kenneth W @ 2005-11-09 18:22 UTC (permalink / raw)
  To: linux-ia64

Robin Holt wrote on Wednesday, November 09, 2005 4:12 AM
> > measured 221 clocks with 3-level page table, versus 298 clocks with
> 
> This looks like approx the amount of stall you would see from a cacheline
> fill.  Is this value the average of a large number of samples a single
> sample.

The value was averaged over 500,000 samples.

- Ken


^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (42 preceding siblings ...)
  2005-11-09 18:22 ` Chen, Kenneth W
@ 2005-11-09 18:39 ` Luck, Tony
  2005-11-10  0:03 ` Gerald Pfeifer
                   ` (4 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Luck, Tony @ 2005-11-09 18:39 UTC (permalink / raw)
  To: linux-ia64

>Have we all agreed by now that the performance hit is relatively minor
>(unmeasurable for most cases due to it being less than the noise of the
>samples) from 4-level page tables?  Do I need to do any more performance
>measurements?

The transaction processing benchmark hit, although small isn't minor.
Ken has had to fight for every 0.1% increment ... so setting him back
by 0.2% with a patch that is only needed for some very large systems
(>3192cpus according to your October 28th post) is a tough thing to do.

Once you've worked out the remaining kinks in the patch so that
64K page (and 4GB hugetlb page) work, I can take it with the
configuration option (with the default set to 3-level).  I realize
that is only of marginal use, as you have the issue of getting
OSDs to ship with a kernel configured this way, and getting ISVs
to qualify their applications to run on this configuration.  But
don't you already have that issue with persuading them to ship
with CONFIG_NR_CPUS\x16384?  Surely there are a whole boatload of
tuneables and patches needed to get this sized system up and running?
4-level pagetables are just one piece of the puzzle.

-Tony

^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (43 preceding siblings ...)
  2005-11-09 18:39 ` Luck, Tony
@ 2005-11-10  0:03 ` Gerald Pfeifer
  2005-11-10  0:23 ` Jack Steiner
                   ` (3 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Gerald Pfeifer @ 2005-11-10  0:03 UTC (permalink / raw)
  To: linux-ia64

On Wed, 9 Nov 2005, Luck, Tony wrote:
> But don't you already have that issue with persuading them to ship with 
> CONFIG_NR_CPUS\x16384?  Surely there are a whole boatload of tuneables 
> and patches needed to get this sized system up and running? 4-level 
> pagetables are just one piece of the puzzle.

Is it possible you are mixing up MAX_NUMALINK_NODES (the maximum
system size, i.e., the maximum number of nodes in a numalink domain)
with CONFIG_NR_CPUS (the maximum SSI size)?

I'm pretty confident SGI has not proposed setting CONFIG_NR_CPUS\x16384
to any OSV. ;-)

Gerald


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (44 preceding siblings ...)
  2005-11-10  0:03 ` Gerald Pfeifer
@ 2005-11-10  0:23 ` Jack Steiner
  2005-11-10  0:27 ` Luck, Tony
                   ` (2 subsequent siblings)
  48 siblings, 0 replies; 50+ messages in thread
From: Jack Steiner @ 2005-11-10  0:23 UTC (permalink / raw)
  To: linux-ia64

On Thu, Nov 10, 2005 at 01:03:44AM +0100, Gerald Pfeifer wrote:
> On Wed, 9 Nov 2005, Luck, Tony wrote:
> > But don't you already have that issue with persuading them to ship with 
> > CONFIG_NR_CPUS\x16384?  Surely there are a whole boatload of tuneables 
> > and patches needed to get this sized system up and running? 4-level 
> > pagetables are just one piece of the puzzle.
> 
> Is it possible you are mixing up MAX_NUMALINK_NODES (the maximum
> system size, i.e., the maximum number of nodes in a numalink domain)
> with CONFIG_NR_CPUS (the maximum SSI size)?
> 
> I'm pretty confident SGI has not proposed setting CONFIG_NR_CPUS\x16384
> to any OSV. ;-)
> 
> Gerald

True, nothing this big is proposed for inclusion in SLES. The current max
for NR_CPUS is 1024. 

MAX_NUMALINK_NODES is the number of nodes in all SSI clusters that are 
connected by NUMALINK. Single kernels are still restricted to 256 
nodes & 1024 cpus.


-- 
Thanks

Jack Steiner (steiner@sgi.com)          651-683-5302
Principal Engineer                      SGI - Silicon Graphics, Inc.



^ permalink raw reply	[flat|nested] 50+ messages in thread

* RE: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (45 preceding siblings ...)
  2005-11-10  0:23 ` Jack Steiner
@ 2005-11-10  0:27 ` Luck, Tony
  2005-11-10  2:54 ` Jack Steiner
  2005-11-10  9:13 ` Robin Holt
  48 siblings, 0 replies; 50+ messages in thread
From: Luck, Tony @ 2005-11-10  0:27 UTC (permalink / raw)
  To: linux-ia64

>Is it possible you are mixing up MAX_NUMALINK_NODES (the maximum
>system size, i.e., the maximum number of nodes in a numalink domain)
>with CONFIG_NR_CPUS (the maximum SSI size)?
>
>I'm pretty confident SGI has not proposed setting CONFIG_NR_CPUS\x16384
>to any OSV. ;-)

It's always possible that I'm confused.  Here's the message where
Robin introduced the rationale for 4-level page tables:

 http://tinyurl.com/bewsk

In that he says "current = 2048" ... which I think is how many
cpus can be in the whole box ... 512 is the current max cpus in
a coherence domain (and thus the max that a single instance of
Linux will see today).

With Montecito (dual core, two threads in each core) the number
of cpus Linux sees will be quadrupled in a system with the same
number of sockets.  Add more sockets, and the 16384 number may
not be impossible.

-Tony

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (46 preceding siblings ...)
  2005-11-10  0:27 ` Luck, Tony
@ 2005-11-10  2:54 ` Jack Steiner
  2005-11-10  9:13 ` Robin Holt
  48 siblings, 0 replies; 50+ messages in thread
From: Jack Steiner @ 2005-11-10  2:54 UTC (permalink / raw)
  To: linux-ia64

On Wed, Nov 09, 2005 at 04:27:48PM -0800, Luck, Tony wrote:
> >Is it possible you are mixing up MAX_NUMALINK_NODES (the maximum
> >system size, i.e., the maximum number of nodes in a numalink domain)
> >with CONFIG_NR_CPUS (the maximum SSI size)?
> >
> >I'm pretty confident SGI has not proposed setting CONFIG_NR_CPUS\x16384
> >to any OSV. ;-)
> 
> It's always possible that I'm confused.  Here's the message where
> Robin introduced the rationale for 4-level page tables:

You are not confused, but this is a confusing area. A lot of 
the terminology that we use to describe these configurations has
been misused or poorly defined.

Altix currently supports a max SSI of 256 nodes (1024p). However, multiple
SSIs can be interconnected via NUMALINK. 

XPMEM provides the ability for a single task running within an SSI
to access ALL the memory thruout the NUMALINK fabric, ie. memory
can be exported from a task running within one SSI to a task 
running in another SSI. 


For example, if you have:

	128 GB per SSI 
	16 SSIs connected via NUMALINK

a task running within an SSI can access all 2TB of memory (excluding memory
used by the kernel)

	Note: SSI = fat node = cluster 

> 
>  http://tinyurl.com/bewsk
> 
> In that he says "current = 2048" ... which I think is how many
> cpus can be in the whole box ... 512 is the current max cpus in
> a coherence domain (and thus the max that a single instance of
> Linux will see today).
> 
> With Montecito (dual core, two threads in each core) the number
> of cpus Linux sees will be quadrupled in a system with the same
> number of sockets.  Add more sockets, and the 16384 number may
> not be impossible.
> 
> -Tony

-- 
Thanks

Jack Steiner (steiner@sgi.com)          651-683-5302
Principal Engineer                      SGI - Silicon Graphics, Inc.



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [RFC] 4-level page table directories.
  2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
                   ` (47 preceding siblings ...)
  2005-11-10  2:54 ` Jack Steiner
@ 2005-11-10  9:13 ` Robin Holt
  48 siblings, 0 replies; 50+ messages in thread
From: Robin Holt @ 2005-11-10  9:13 UTC (permalink / raw)
  To: linux-ia64

> For example, if you have:
> 
> 	128 GB per SSI 
> 	16 SSIs connected via NUMALINK
> 
> a task running within an SSI can access all 2TB of memory (excluding memory
> used by the kernel)

And here to, access may not mean exactly what you expect.  Some accesses
are simple uncached accesses.  Some are cached accesses for the owning
SSI uncached accesses for the remote side (write coalesced).  Some are
DMA reads from the remote side to cachable local side.  When Jack says
access, he truly means it in a very broad sense.

Thanks,
Robin

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2005-11-10  9:13 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-10-27  4:17 [RFC] 4-level page table directories Robin Holt
2005-10-28  5:19 ` Ian Wienand
2005-10-28 11:19 ` Robin Holt
2005-10-28 23:23 ` Luck, Tony
2005-10-28 23:55 ` Chen, Kenneth W
2005-10-29  0:49 ` Grant Grundler
2005-10-29  2:18 ` David Mosberger-Tang
2005-11-01 12:13 ` Robin Holt
2005-11-01 15:41 ` David Mosberger-Tang
2005-11-02 10:35 ` Robin Holt
2005-11-02 13:26 ` Robin Holt
2005-11-02 16:11 ` Luck, Tony
2005-11-02 16:23 ` Robin Holt
2005-11-02 16:30 ` Luck, Tony
2005-11-02 17:16 ` Robin Holt
2005-11-02 18:59 ` David Mosberger-Tang
2005-11-02 22:26 ` Ian Wienand
2005-11-03  1:36 ` Gerald Pfeifer
2005-11-03  1:53 ` Chen, Kenneth W
2005-11-03  3:55 ` Jack Steiner
2005-11-03 16:36 ` Robin Holt
2005-11-03 19:59 ` Chen, Kenneth W
2005-11-04 17:58 ` Luck, Tony
2005-11-04 21:37 ` Robin Holt
2005-11-04 21:42 ` Chen, Kenneth W
2005-11-04 22:50 ` Chen, Kenneth W
2005-11-07 21:18 ` Luck, Tony
2005-11-08  0:22 ` Rohit Seth
2005-11-08 12:43 ` Robin Holt
2005-11-08 18:23 ` Boehm, Hans
2005-11-08 18:52 ` Magenheimer, Dan (HP Labs Fort Collins)
2005-11-08 18:56 ` Rohit Seth
2005-11-08 19:36 ` Robin Holt
2005-11-08 20:07 ` Chen, Kenneth W
2005-11-08 20:27 ` Chen, Kenneth W
2005-11-08 22:09 ` Ian Wienand
2005-11-08 23:58 ` Gerald Pfeifer
2005-11-09  0:08 ` David Mosberger-Tang
2005-11-09  0:22 ` Rohit Seth
2005-11-09  0:46 ` Magenheimer, Dan (HP Labs Fort Collins)
2005-11-09  1:18 ` Chen, Kenneth W
2005-11-09 12:11 ` Robin Holt
2005-11-09 14:29 ` Robin Holt
2005-11-09 18:22 ` Chen, Kenneth W
2005-11-09 18:39 ` Luck, Tony
2005-11-10  0:03 ` Gerald Pfeifer
2005-11-10  0:23 ` Jack Steiner
2005-11-10  0:27 ` Luck, Tony
2005-11-10  2:54 ` Jack Steiner
2005-11-10  9:13 ` Robin Holt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox