linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCh v3] powerpc: add hugepagesz boot-time parameter
@ 2008-01-03 22:59 Jon Tollefson
  2008-01-03 23:34 ` Arnd Bergmann
  0 siblings, 1 reply; 6+ messages in thread
From: Jon Tollefson @ 2008-01-03 22:59 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: arnd, mel, David Gibson, linuxppc-dev, csnook

Paul, please include this in 2.6.25 if there are no objections.

This patch adds the hugepagesz boot-time parameter for ppc64.  It lets
one pick the size for huge pages. The choices available are 64K and 16M
when the base page size is 4k. It defaults to 16M (previously the only
choice) if nothing or an invalid choice is specified.

Tested 64K huge pages successfully with libhugetlbfs 1.2.

Changes from v2:
	Moved functions from header file into hugetlbpage.c where they are used.


Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
---

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 33121d6..2fc1fb8 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -685,6 +685,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			See Documentation/isdn/README.HiSax.
 
 	hugepages=	[HW,X86-32,IA-64] Maximal number of HugeTLB pages.
+	hugepagesz=	[HW,IA-64,PPC] The size of the HugeTLB pages.
 
 	i8042.direct	[HW] Put keyboard port into non-translated mode
 	i8042.dumbkbd	[HW] Pretend that controller can only read data from
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index cbbd8b0..9326a69 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -369,18 +369,11 @@ static void __init htab_init_page_sizes(void)
 	 * on what is available
 	 */
 	if (mmu_psize_defs[MMU_PAGE_16M].shift)
-		mmu_huge_psize = MMU_PAGE_16M;
+		set_huge_psize(MMU_PAGE_16M);
 	/* With 4k/4level pagetables, we can't (for now) cope with a
 	 * huge page size < PMD_SIZE */
 	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
-		mmu_huge_psize = MMU_PAGE_1M;
-
-	/* Calculate HPAGE_SHIFT and sanity check it */
-	if (mmu_psize_defs[mmu_huge_psize].shift > MIN_HUGEPTE_SHIFT &&
-	    mmu_psize_defs[mmu_huge_psize].shift < SID_SHIFT)
-		HPAGE_SHIFT = mmu_psize_defs[mmu_huge_psize].shift;
-	else
-		HPAGE_SHIFT = 0; /* No huge pages dude ! */
+		set_huge_psize(MMU_PAGE_1M);
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 71efb38..a02266d 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -24,18 +24,17 @@
 #include <asm/cputable.h>
 #include <asm/spu.h>
 
+#define HPAGE_SHIFT_64K	16
+#define HPAGE_SHIFT_16M	24
+
 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 
-#ifdef CONFIG_PPC_64K_PAGES
-#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
-#else
-#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
-#endif
-#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
-#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+unsigned int hugepte_shift;
+#define PTRS_PER_HUGEPTE	(1 << hugepte_shift)
+#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << hugepte_shift)
 
-#define HUGEPD_SHIFT		(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SHIFT		(HPAGE_SHIFT + hugepte_shift)
 #define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
 #define HUGEPD_MASK		(~(HUGEPD_SIZE-1))
 
@@ -82,11 +81,35 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 	return 0;
 }
 
+/* Base page size affects how we walk hugetlb page tables */
+#ifdef CONFIG_PPC_64K_PAGES
+#define hpmd_offset(pud, addr)		pmd_offset(pud, addr)
+#define hpmd_alloc(mm, pud, addr)	pmd_alloc(mm, pud, addr)
+#else
+static inline
+pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
+{
+	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+		return pmd_offset(pud, addr);
+	else
+		return (pmd_t *) pud;
+}
+static inline
+pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
+{
+	if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+		return pmd_alloc(mm, pud, addr);
+	else
+		return (pmd_t *) pud;
+}
+#endif
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pg;
 	pud_t *pu;
+	pmd_t *pm;
 
 	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
 
@@ -96,14 +119,9 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	if (!pgd_none(*pg)) {
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
-#ifdef CONFIG_PPC_64K_PAGES
-			pmd_t *pm;
-			pm = pmd_offset(pu, addr);
+			pm = hpmd_offset(pu, addr);
 			if (!pmd_none(*pm))
 				return hugepte_offset((hugepd_t *)pm, addr);
-#else
-			return hugepte_offset((hugepd_t *)pu, addr);
-#endif
 		}
 	}
 
@@ -114,6 +132,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pg;
 	pud_t *pu;
+	pmd_t *pm;
 	hugepd_t *hpdp = NULL;
 
 	BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
@@ -124,14 +143,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	pu = pud_alloc(mm, pg, addr);
 
 	if (pu) {
-#ifdef CONFIG_PPC_64K_PAGES
-		pmd_t *pm;
-		pm = pmd_alloc(mm, pu, addr);
+		pm = hpmd_alloc(mm, pu, addr);
 		if (pm)
 			hpdp = (hugepd_t *)pm;
-#else
-		hpdp = (hugepd_t *)pu;
-#endif
 	}
 
 	if (! hpdp)
@@ -158,7 +172,6 @@ static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
 						 PGF_CACHENUM_MASK));
 }
 
-#ifdef CONFIG_PPC_64K_PAGES
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				   unsigned long addr, unsigned long end,
 				   unsigned long floor, unsigned long ceiling)
@@ -191,7 +204,6 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	pud_clear(pud);
 	pmd_free_tlb(tlb, pmd);
 }
-#endif
 
 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 				   unsigned long addr, unsigned long end,
@@ -210,9 +222,15 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 			continue;
 		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 #else
-		if (pud_none(*pud))
-			continue;
-		free_hugepte_range(tlb, (hugepd_t *)pud);
+		if (HPAGE_SHIFT == HPAGE_SHIFT_64K) {
+			if (pud_none_or_clear_bad(pud))
+				continue;
+			hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+		} else {
+			if (pud_none(*pud))
+				continue;
+			free_hugepte_range(tlb, (hugepd_t *)pud);
+		}
 #endif
 	} while (pud++, addr = next, addr != end);
 
@@ -526,6 +544,57 @@ repeat:
 	return err;
 }
 
+void set_huge_psize(int psize)
+{
+	/* Check that it is a page size supported by the hardware and
+	 * that it fits within pagetable limits. */
+	if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT &&
+		(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
+			mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) {
+		HPAGE_SHIFT = mmu_psize_defs[psize].shift;
+		mmu_huge_psize = psize;
+#ifdef CONFIG_PPC_64K_PAGES
+		hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
+#else
+		if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+			hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
+		else
+			hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT);
+#endif
+
+	} else
+		HPAGE_SHIFT = 0;
+}
+
+static int __init hugepage_setup_sz(char *str)
+{
+	unsigned long long size;
+	int mmu_psize = -1;
+	int shift;
+
+	size = memparse(str, &str);
+
+	shift = __ffs(size);
+	switch (shift) {
+#ifndef CONFIG_PPC_64K_PAGES
+	case HPAGE_SHIFT_64K:
+		mmu_psize = MMU_PAGE_64K;
+		break;
+#endif
+	case HPAGE_SHIFT_16M:
+		mmu_psize = MMU_PAGE_16M;
+		break;
+	}
+
+	if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift)
+		set_huge_psize(mmu_psize);
+	else
+		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
+
+	return 1;
+}
+__setup("hugepagesz=", hugepage_setup_sz);
+
 static void zero_ctor(struct kmem_cache *cache, void *addr)
 {
 	memset(addr, 0, kmem_cache_size(cache));
diff --git a/include/asm-powerpc/mmu-hash64.h b/include/asm-powerpc/mmu-hash64.h
index 12e5e77..2fda33c 100644
--- a/include/asm-powerpc/mmu-hash64.h
+++ b/include/asm-powerpc/mmu-hash64.h
@@ -278,6 +278,7 @@ extern int hash_huge_page(struct mm_struct *mm, unsigned long access,
 extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 			     unsigned long pstart, unsigned long mode,
 			     int psize, int ssize);
+extern void set_huge_psize(int psize);
 
 extern void htab_initialize(void);
 extern void htab_initialize_secondary(void);

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCh v3] powerpc: add hugepagesz boot-time parameter
  2008-01-03 22:59 [PATCh v3] powerpc: add hugepagesz boot-time parameter Jon Tollefson
@ 2008-01-03 23:34 ` Arnd Bergmann
  2008-01-04 20:00   ` Jon Tollefson
  2008-01-07 12:04   ` Mel Gorman
  0 siblings, 2 replies; 6+ messages in thread
From: Arnd Bergmann @ 2008-01-03 23:34 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: mel, Jon Tollefson, csnook, Paul Mackerras, David Gibson

On Thursday 03 January 2008, Jon Tollefson wrote:
> Paul, please include this in 2.6.25 if there are no objections.

No objections to this version from me, just questions.

> This patch adds the hugepagesz boot-time parameter for ppc64.  It lets
> one pick the size for huge pages. The choices available are 64K and 16M
> when the base page size is 4k. It defaults to 16M (previously the only
> only choice) if nothing or an invalid choice is specified.

We started discussing this in v1, but the discussion got sidetracked:
Is there a technical reason why you don't also allow 1M pages, which
may be useful in certain scenarios?

On the Cell/B.E. platforms (IBM/Mercury blades, Toshiba Celleb, PS3), the
second large page size is an option that can be set in a HID SPR
to either 64KB or 1MB. Unfortunately, we can't do these two simultaneously,
but the firmware can change the default and put it into the device tree,
or you could have the kernel override the firmware settings.

Going a lot further, do you have plans for a fully dynamic hugepage size,
e.g. using a mount option for hugetlbfs? I can see that as rather useful,
but at the same time it's probably much more complicated than the boot time
option.

	Arnd <><

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCh v3] powerpc: add hugepagesz boot-time parameter
  2008-01-03 23:34 ` Arnd Bergmann
@ 2008-01-04 20:00   ` Jon Tollefson
  2008-01-04 23:03     ` Arnd Bergmann
  2008-01-07 12:04   ` Mel Gorman
  1 sibling, 1 reply; 6+ messages in thread
From: Jon Tollefson @ 2008-01-04 20:00 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Adam Litke, mel, David Gibson, linuxppc-dev, Paul Mackerras,
	csnook

Arnd Bergmann wrote:
> We started discussing this in v1, but the discussion got sidetracked:
> Is there a technical reason why you don't also allow 1M pages, which
> may be useful in certain scenarios?
>   
No, it was mostly a matter of the time I have had and machines easily
available to me for testing.  I don't know of a technical reason that
would prevent supporting 1M huge pages, but would want the tests in the
libhugetlbfs suite to pass, etc.
> On the Cell/B.E. platforms (IBM/Mercury blades, Toshiba Celleb, PS3), the
> second large page size is an option that can be set in a HID SPR
> to either 64KB or 1MB. Unfortunately, we can't do these two simultaneously,
> but the firmware can change the default and put it into the device tree,
> or you could have the kernel override the firmware settings.
>
> Going a lot further, do you have plans for a fully dynamic hugepage size,
> e.g. using a mount option for hugetlbfs? I can see that as rather useful,
> but at the same time it's probably much more complicated than the boot time
> option.
>   
Eventually we will want to support dynamic huge page sizes.  This is
already being looked into.  In the meantime we can have some flexibility
with a boot-time parameter though.

> 	Arnd <><
>   
Jon

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCh v3] powerpc: add hugepagesz boot-time parameter
  2008-01-04 20:00   ` Jon Tollefson
@ 2008-01-04 23:03     ` Arnd Bergmann
  0 siblings, 0 replies; 6+ messages in thread
From: Arnd Bergmann @ 2008-01-04 23:03 UTC (permalink / raw)
  To: linuxppc-dev
  Cc: Jon Tollefson, Adam Litke, mel, David Gibson, Paul Mackerras,
	csnook

On Friday 04 January 2008, Jon Tollefson wrote:
> Arnd Bergmann wrote:
> > We started discussing this in v1, but the discussion got sidetracked:
> > Is there a technical reason why you don't also allow 1M pages, which
> > may be useful in certain scenarios?
> >   
> No, it was mostly a matter of the time I have had and machines easily
> available to me for testing.  I don't know of a technical reason that
> would prevent supporting 1M huge pages, but would want the tests in the
> libhugetlbfs suite to pass, etc.

Ok. Do you think the kernel should be able to change the page size settings
in that case, or should we rely on whatever the firmware tells us it has
configured already?

	Arnd <><

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCh v3] powerpc: add hugepagesz boot-time parameter
  2008-01-03 23:34 ` Arnd Bergmann
  2008-01-04 20:00   ` Jon Tollefson
@ 2008-01-07 12:04   ` Mel Gorman
  2008-01-07 12:37     ` Arnd Bergmann
  1 sibling, 1 reply; 6+ messages in thread
From: Mel Gorman @ 2008-01-07 12:04 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: linuxppc-dev, Jon Tollefson, csnook, Paul Mackerras, David Gibson

On (04/01/08 00:34), Arnd Bergmann didst pronounce:
> On Thursday 03 January 2008, Jon Tollefson wrote:
> > Paul, please include this in 2.6.25 if there are no objections.
> 
> No objections to this version from me, just questions.
> 
> > This patch adds the hugepagesz boot-time parameter for ppc64.  It lets
> > one pick the size for huge pages. The choices available are 64K and 16M
> > when the base page size is 4k. It defaults to 16M (previously the only
> > only choice) if nothing or an invalid choice is specified.
> 
> We started discussing this in v1, but the discussion got sidetracked:
> Is there a technical reason why you don't also allow 1M pages, which
> may be useful in certain scenarios?
> 

I cannot see why not although the ideal would be that the necessary
information to support the pagesize would be provided by the firmware
instead of hard-coded values.

> On the Cell/B.E. platforms (IBM/Mercury blades, Toshiba Celleb, PS3), the
> second large page size is an option that can be set in a HID SPR
> to either 64KB or 1MB. Unfortunately, we can't do these two simultaneously,
> but the firmware can change the default and put it into the device tree,
> or you could have the kernel override the firmware settings.
> 
> Going a lot further, do you have plans for a fully dynamic hugepage size,
> e.g. using a mount option for hugetlbfs?

A mount option for hugetlbfs to select a hugepage size is a long-term
goal. However, a lot of changes are required to support such a thing. Selecting
the hugepage size at boot-time is a reasonable starting point and one that
might be usable with some additional work by x86-64 on AMD processors
supporting 1GiB pages.

> I can see that as rather useful,
> but at the same time it's probably much more complicated than the boot time
> option.
> 

Not probably at all. It is *certainly* much more complicated than the
boot-time option :)

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCh v3] powerpc: add hugepagesz boot-time parameter
  2008-01-07 12:04   ` Mel Gorman
@ 2008-01-07 12:37     ` Arnd Bergmann
  0 siblings, 0 replies; 6+ messages in thread
From: Arnd Bergmann @ 2008-01-07 12:37 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linuxppc-dev, Jon Tollefson, csnook, Paul Mackerras, David Gibson

On Monday 07 January 2008, Mel Gorman wrote:
> 
> > We started discussing this in v1, but the discussion got sidetracked:
> > Is there a technical reason why you don't also allow 1M pages, which
> > may be useful in certain scenarios?
> > 
> 
> I cannot see why not although the ideal would be that the necessary
> information to support the pagesize would be provided by the firmware
> instead of hard-coded values.
> 
> > On the Cell/B.E. platforms (IBM/Mercury blades, Toshiba Celleb, PS3), the
> > second large page size is an option that can be set in a HID SPR
> > to either 64KB or 1MB. Unfortunately, we can't do these two simultaneously,
> > but the firmware can change the default and put it into the device tree,
> > or you could have the kernel override the firmware settings.

The problem with the firmware choosing the page size is that it has much
less of an idea about what the user wants to do than the kernel.
It's not exclusive however. We can first teach the kernel to deal
with firmware setting the 1MB or 1GB page size, and later implement
changing the HW setting from the kernel.

	Arnd <><

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-01-07 12:39 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-03 22:59 [PATCh v3] powerpc: add hugepagesz boot-time parameter Jon Tollefson
2008-01-03 23:34 ` Arnd Bergmann
2008-01-04 20:00   ` Jon Tollefson
2008-01-04 23:03     ` Arnd Bergmann
2008-01-07 12:04   ` Mel Gorman
2008-01-07 12:37     ` Arnd Bergmann

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).