From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
To: benh@kernel.crashing.org, paulus@samba.org,
David Gibson <dwg@au1.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org,
"Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Subject: [PATCH -V6 06/27] powerpc: Switch 16GB and 16MB explicit hugepages to a different page table format
Date: Mon, 22 Apr 2013 15:30:40 +0530 [thread overview]
Message-ID: <1366624861-24948-7-git-send-email-aneesh.kumar@linux.vnet.ibm.com> (raw)
In-Reply-To: <1366624861-24948-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com>
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
We will be switching PMD_SHIFT to 24 bits to facilitate THP impmenetation.
With PMD_SHIFT set to 24, we now have 16MB huge pages allocated at PGD level.
That means with 32 bit process we cannot allocate normal pages at
all, because we cover the entire address space with one pgd entry. Fix this
by switching to a new page table format for hugepages. With the new page table
format for 16GB and 16MB hugepages we won't allocate hugepage directory. Instead
we encode the PTE information directly at the directory level. This forces 16MB
hugepage at PMD level. This will also make the page take walk much simpler later
when we add the THP support.
With the new table format we have 4 cases for pgds and pmds:
(1) invalid (all zeroes)
(2) pointer to next table, as normal; bottom 6 bits == 0
(3) leaf pte for huge page, bottom two bits != 00
(4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
arch/powerpc/include/asm/page.h | 2 +
arch/powerpc/include/asm/pgtable.h | 2 +
arch/powerpc/mm/gup.c | 18 +++-
arch/powerpc/mm/hugetlbpage.c | 176 ++++++++++++++++++++++++++++++------
4 files changed, 168 insertions(+), 30 deletions(-)
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index b309cf4..6faa416 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -352,8 +352,10 @@ static inline int hugepd_ok(hugepd_t hpd)
}
#define is_hugepd(pdep) (hugepd_ok(*((hugepd_t *)(pdep))))
+int pgd_huge(pgd_t pgd);
#else /* CONFIG_HUGETLB_PAGE */
#define is_hugepd(pdep) 0
+#define pgd_huge(pgd) 0
#endif /* CONFIG_HUGETLB_PAGE */
struct page;
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 4b52726..7aeb955 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -218,6 +218,8 @@ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *);
extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr);
+extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr);
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
index d7efdbf..4b921af 100644
--- a/arch/powerpc/mm/gup.c
+++ b/arch/powerpc/mm/gup.c
@@ -68,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
next = pmd_addr_end(addr, end);
if (pmd_none(pmd))
return 0;
- if (is_hugepd(pmdp)) {
+ if (pmd_huge(pmd)) {
+ if (!gup_hugepte((pte_t *)pmdp, PMD_SIZE, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (is_hugepd(pmdp)) {
if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
addr, next, write, pages, nr))
return 0;
@@ -92,7 +96,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (is_hugepd(pudp)) {
+ if (pud_huge(pud)) {
+ if (!gup_hugepte((pte_t *)pudp, PUD_SIZE, addr, next,
+ write, pages, nr))
+ return 0;
+ } else if (is_hugepd(pudp)) {
if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
addr, next, write, pages, nr))
return 0;
@@ -153,7 +161,11 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (is_hugepd(pgdp)) {
+ if (pgd_huge(pgd)) {
+ if (!gup_hugepte((pte_t *)pgdp, PGDIR_SIZE, addr, next,
+ write, pages, &nr))
+ goto slow;
+ } else if (is_hugepd(pgdp)) {
if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
addr, next, write, pages, &nr))
goto slow;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 9108ce7..2da8fe6 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -50,11 +50,69 @@ static unsigned nr_gpages;
#define hugepd_none(hpd) ((hpd).pd == 0)
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
+
+/*
+ * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
+ * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
+ */
+int pmd_huge(pmd_t pmd)
+{
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pmd_val(pmd) & 0x3) != 0x0);
+}
+
+int pud_huge(pud_t pud)
+{
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pud_val(pud) & 0x3) != 0x0);
+}
+
+int pgd_huge(pgd_t pgd)
+{
+ /*
+ * leaf pte for huge page, bottom two bits != 00
+ */
+ return ((pgd_val(pgd) & 0x3) != 0x0);
+}
+#else
+int pmd_huge(pmd_t pmd)
+{
+ return 0;
+}
+
+int pud_huge(pud_t pud)
+{
+ return 0;
+}
+
+int pgd_huge(pgd_t pgd)
+{
+ return 0;
+}
+#endif
+
+/*
+ * We have 4 cases for pgds and pmds:
+ * (1) invalid (all zeroes)
+ * (2) pointer to next table, as normal; bottom 6 bits == 0
+ * (3) leaf pte for huge page, bottom two bits != 00
+ * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table
+ */
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
pgd_t *pg;
pud_t *pu;
pmd_t *pm;
+ pte_t *ret_pte;
hugepd_t *hpdp = NULL;
unsigned pdshift = PGDIR_SHIFT;
@@ -62,30 +120,43 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
*shift = 0;
pg = pgdir + pgd_index(ea);
- if (is_hugepd(pg)) {
+
+ if (pgd_huge(*pg)) {
+ ret_pte = (pte_t *) pg;
+ goto out;
+ } else if (is_hugepd(pg))
hpdp = (hugepd_t *)pg;
- } else if (!pgd_none(*pg)) {
+ else if (!pgd_none(*pg)) {
pdshift = PUD_SHIFT;
pu = pud_offset(pg, ea);
- if (is_hugepd(pu))
+
+ if (pud_huge(*pu)) {
+ ret_pte = (pte_t *) pu;
+ goto out;
+ } else if (is_hugepd(pu))
hpdp = (hugepd_t *)pu;
else if (!pud_none(*pu)) {
pdshift = PMD_SHIFT;
pm = pmd_offset(pu, ea);
- if (is_hugepd(pm))
+
+ if (pmd_huge(*pm)) {
+ ret_pte = (pte_t *) pm;
+ goto out;
+ } else if (is_hugepd(pm))
hpdp = (hugepd_t *)pm;
- else if (!pmd_none(*pm)) {
+ else if (!pmd_none(*pm))
return pte_offset_kernel(pm, ea);
- }
}
}
-
if (!hpdp)
return NULL;
+ ret_pte = hugepte_offset(hpdp, ea, pdshift);
+ pdshift = hugepd_shift(*hpdp);
+out:
if (shift)
- *shift = hugepd_shift(*hpdp);
- return hugepte_offset(hpdp, ea, pdshift);
+ *shift = pdshift;
+ return ret_pte;
}
EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
@@ -160,6 +231,61 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
#define HUGEPD_PUD_SHIFT PMD_SHIFT
#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * At this point we do the placement change only for BOOK3S 64. This would
+ * possibly work on other subarchs.
+ */
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
+{
+ pgd_t *pg;
+ pud_t *pu;
+ pmd_t *pm;
+ hugepd_t *hpdp = NULL;
+ unsigned pshift = __ffs(sz);
+ unsigned pdshift = PGDIR_SHIFT;
+
+ addr &= ~(sz-1);
+ pg = pgd_offset(mm, addr);
+
+ if (pshift == PGDIR_SHIFT)
+ /* 16GB huge page */
+ return (pte_t *) pg;
+ else if (pshift > PUD_SHIFT)
+ /*
+ * We need to use hugepd table
+ */
+ hpdp = (hugepd_t *)pg;
+ else {
+ pdshift = PUD_SHIFT;
+ pu = pud_alloc(mm, pg, addr);
+ if (pshift == PUD_SHIFT)
+ return (pte_t *)pu;
+ else if (pshift > PMD_SHIFT)
+ hpdp = (hugepd_t *)pu;
+ else {
+ pdshift = PMD_SHIFT;
+ pm = pmd_alloc(mm, pu, addr);
+ if (pshift == PMD_SHIFT)
+ /* 16MB hugepage */
+ return (pte_t *)pm;
+ else
+ hpdp = (hugepd_t *)pm;
+ }
+ }
+ if (!hpdp)
+ return NULL;
+
+ BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+ if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+ return NULL;
+
+ return hugepte_offset(hpdp, addr, pdshift);
+}
+
+#else
+
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
pgd_t *pg;
@@ -197,6 +323,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
return hugepte_offset(hpdp, addr, pdshift);
}
+#endif
#ifdef CONFIG_PPC_FSL_BOOK3E
/* Build list of addresses of gigantic pages. This function is used in early
@@ -460,7 +587,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
do {
pmd = pmd_offset(pud, addr);
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd))
+ if (pmd_none_or_clear_bad(pmd))
continue;
#ifdef CONFIG_PPC_FSL_BOOK3E
/*
@@ -613,16 +740,6 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
return page;
}
-int pmd_huge(pmd_t pmd)
-{
- return 0;
-}
-
-int pud_huge(pud_t pud)
-{
- return 0;
-}
-
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
@@ -631,8 +748,8 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
return NULL;
}
-static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
unsigned long pte_end;
@@ -868,11 +985,16 @@ static int __init hugetlbpage_init(void)
pdshift = PUD_SHIFT;
else
pdshift = PGDIR_SHIFT;
-
- pgtable_cache_add(pdshift - shift, NULL);
- if (!PGT_CACHE(pdshift - shift))
- panic("hugetlbpage_init(): could not create "
- "pgtable cache for %d bit pagesize\n", shift);
+ /*
+ * if we have pdshift and shift value same, we don't
+ * use pgt cache for hugepd.
+ */
+ if (pdshift != shift) {
+ pgtable_cache_add(pdshift - shift, NULL);
+ if (!PGT_CACHE(pdshift - shift))
+ panic("hugetlbpage_init(): could not create "
+ "pgtable cache for %d bit pagesize\n", shift);
+ }
}
/* Set default large page size. Currently, we pick 16M or 1M
--
1.7.10
next prev parent reply other threads:[~2013-04-22 10:02 UTC|newest]
Thread overview: 39+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-04-22 10:00 [PATCH -V6 00/27] THP support for PPC64 Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 01/27] powerpc: Use signed formatting when printing error Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 02/27] powerpc: Save DAR and DSISR in pt_regs on MCE Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 03/27] powerpc: Don't hard code the size of pte page Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 04/27] powerpc: Don't truncate pgd_index wrongly Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 05/27] powerpc: New hugepage directory format Aneesh Kumar K.V
2013-04-23 7:01 ` Paul Mackerras
2013-04-23 8:42 ` Aneesh Kumar K.V
2013-04-24 5:47 ` Paul Mackerras
2013-04-25 6:00 ` Aneesh Kumar K.V
2013-04-22 10:00 ` Aneesh Kumar K.V [this message]
2013-04-22 10:00 ` [PATCH -V6 07/27] powerpc: Reduce the PTE_INDEX_SIZE Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 08/27] powerpc: Move the pte free routines from common header Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 09/27] powerpc: Reduce PTE table memory wastage Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 10/27] powerpc: Use encode avpn where we need only avpn values Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 11/27] powerpc: Decode the pte-lp-encoding bits correctly Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 12/27] powerpc: Fix hpte_decode to use the correct decoding for page sizes Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 13/27] powerpc: print both base and actual page size on hash failure Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 14/27] powerpc: Print page size info during boot Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 15/27] powerpc: Update tlbie/tlbiel as per ISA doc Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 16/27] mm/THP: HPAGE_SHIFT is not a #define on some arch Aneesh Kumar K.V
2013-04-22 15:43 ` Andrea Arcangeli
2013-04-22 10:00 ` [PATCH -V6 17/27] mm/THP: Add pmd args to pgtable deposit and withdraw APIs Aneesh Kumar K.V
2013-04-22 15:46 ` Andrea Arcangeli
2013-04-22 10:00 ` [PATCH -V6 18/27] mm/THP: withdraw the pgtable after pmdp related operations Aneesh Kumar K.V
2013-04-22 15:49 ` Andrea Arcangeli
2013-04-24 9:08 ` Aneesh Kumar K.V
2013-04-24 15:14 ` Andrea Arcangeli
2013-04-25 6:11 ` Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 19/27] powerpc/THP: Double the PMD table size for THP Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 20/27] powerpc/THP: Implement transparent hugepages for ppc64 Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 21/27] powerpc: move find_linux_pte_or_hugepte and gup_hugepte to common code Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 22/27] powerpc: Update find_linux_pte_or_hugepte to handle transparent hugepages Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 23/27] powerpc: Replace find_linux_pte with find_linux_pte_or_hugepte Aneesh Kumar K.V
2013-04-24 6:29 ` Paul Mackerras
2013-04-22 10:00 ` [PATCH -V6 24/27] powerpc: Update gup_pmd_range to handle transparent hugepages Aneesh Kumar K.V
2013-04-22 10:00 ` [PATCH -V6 25/27] powerpc/THP: Add code to handle HPTE faults for large pages Aneesh Kumar K.V
2013-04-22 10:01 ` [PATCH -V6 26/27] powerpc/THP: Enable THP on PPC64 Aneesh Kumar K.V
2013-04-22 10:01 ` [PATCH -V6 27/27] powerpc: Optimize hugepage invalidate Aneesh Kumar K.V
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1366624861-24948-7-git-send-email-aneesh.kumar@linux.vnet.ibm.com \
--to=aneesh.kumar@linux.vnet.ibm.com \
--cc=benh@kernel.crashing.org \
--cc=dwg@au1.ibm.com \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=paulus@samba.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).