From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. Shutemov" Subject: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Mon, 13 Mar 2017 17:33:03 +0300 Message-ID: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Return-path: Sender: owner-linux-mm@kvack.org To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" List-Id: linux-arch.vger.kernel.org Here's the first bunch of patches of 5-level patchset. Let's see if I'm on the right track addressing Ingo's feedback. :) These patches prepare x86 code to be switched from <asm-generic/5level-fixup.h> to <asm-generic/pgtable-nop4d.h>. It's a stepping stone for adding 5-level paging support. Please review and consider applying. Kirill A. Shutemov (6): x86/mm: Extend headers with basic definitions to support 5-level paging x86/mm: Convert trivial cases of page table walk to 5-level paging x86/gup: Add 5-level paging support x86/ident_map: Add 5-level paging support x86/vmalloc: Add 5-level paging support x86/power: Add 5-level paging support arch/x86/include/asm/pgtable-2level_types.h | 1 + arch/x86/include/asm/pgtable-3level_types.h | 1 + arch/x86/include/asm/pgtable.h | 26 +++++++++--- arch/x86/include/asm/pgtable_64_types.h | 1 + arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++- arch/x86/kernel/tboot.c | 6 ++- arch/x86/kernel/vm86_32.c | 6 ++- arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++---- arch/x86/mm/gup.c | 33 ++++++++++++--- arch/x86/mm/ident_map.c | 51 +++++++++++++++++++--- arch/x86/mm/init_32.c | 22 +++++++--- arch/x86/mm/ioremap.c | 3 +- arch/x86/mm/pgtable.c | 4 +- arch/x86/mm/pgtable_32.c | 8 +++- arch/x86/platform/efi/efi_64.c | 13 ++++-- arch/x86/power/hibernate_32.c | 7 ++- arch/x86/power/hibernate_64.c | 50 ++++++++++++++++------ 17 files changed, 269 insertions(+), 59 deletions(-) -- 2.11.0 -- To 
unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. Shutemov" Subject: [PATCH 1/6] x86/mm: Extend headers with basic definitions to support 5-level paging Date: Mon, 13 Mar 2017 17:33:04 +0300 Message-ID: <20170313143309.16020-2-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Return-path: Received: from mga06.intel.com ([134.134.136.31]:2017 "EHLO mga06.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752103AbdCMOdd (ORCPT ); Mon, 13 Mar 2017 10:33:33 -0400 In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" This patch extends x86 headers to enable 5-level paging support. It's still based on <asm-generic/5level-fixup.h>. We will get to the point where we can have <asm-generic/pgtable-nop4d.h> later. Signed-off-by: Kirill A. 
Shutemov --- arch/x86/include/asm/pgtable-2level_types.h | 1 + arch/x86/include/asm/pgtable-3level_types.h | 1 + arch/x86/include/asm/pgtable.h | 26 ++++++++++++++++++++----- arch/x86/include/asm/pgtable_64_types.h | 1 + arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++++++++++++++++++- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 392576433e77..373ab1de909f 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -7,6 +7,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index bcc89625ebe5..b8a4341faafa 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -7,6 +7,7 @@ typedef u64 pteval_t; typedef u64 pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64 pgdval_t; typedef u64 pgprotval_t; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cfb36b8c024..6f6f351e0a81 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -179,6 +179,17 @@ static inline unsigned long pud_pfn(pud_t pud) return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } +static inline unsigned long p4d_pfn(p4d_t p4d) +{ + return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; +} + +static inline int p4d_large(p4d_t p4d) +{ + /* No 512 GiB pages yet */ + return 0; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -770,6 +781,16 @@ static inline int pud_large(pud_t pud) } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) 
& (PTRS_PER_PUD - 1); +} + +static inline unsigned long p4d_index(unsigned long address) +{ + return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); +} + #if CONFIG_PGTABLE_LEVELS > 3 static inline int pgd_present(pgd_t pgd) { @@ -788,11 +809,6 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. */ -static inline unsigned long pud_index(unsigned long address) -{ - return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); -} - static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) { return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3a264200c62f..0b2797e5083c 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -13,6 +13,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 62484333673d..df08535f774a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -272,9 +272,20 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } -#if CONFIG_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 4 + +#error FIXME + +#else #include <asm-generic/5level-fixup.h> +static inline p4dval_t native_p4d_val(p4d_t p4d) +{ + return native_pgd_val(p4d); +} +#endif + +#if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -318,6 +329,22 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline p4dval_t p4d_pfn_mask(p4d_t p4d) +{ + /* No 512 GiB huge pages yet */ + return PTE_PFN_MASK; +} + +static inline p4dval_t p4d_flags_mask(p4d_t p4d) +{ + return 
~p4d_pfn_mask(p4d); +} + +static inline p4dval_t p4d_flags(p4d_t p4d) +{ + return native_p4d_val(p4d) & p4d_flags_mask(p4d); +} + static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) @@ -461,6 +488,7 @@ enum pg_level { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, + PG_LEVEL_512G, PG_LEVEL_NUM }; -- 2.11.0 From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. Shutemov" Subject: [PATCH 4/6] x86/ident_map: Add 5-level paging support Date: Mon, 13 Mar 2017 17:33:07 +0300 Message-ID: <20170313143309.16020-5-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Return-path: In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: owner-linux-mm@kvack.org To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" List-Id: linux-arch.vger.kernel.org Add additional page table level handling. It's mostly mechanical. The only quirk is that with p4d folded, 'pgd' is equal to 'p4d' in kernel_ident_mapping_init(). pgd entry has to point to the pud page table in this case. Signed-off-by: Kirill A. 
Shutemov --- arch/x86/mm/ident_map.c | 51 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 4473cb4f8b90..1c3f166bd8c3 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, return 0; } +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + ident_pud_init(info, pud, addr, next); + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + ident_pud_init(info, pud, addr, next); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend) { @@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); - pud_t *pud; + p4d_t *p4d; next = (addr & PGDIR_MASK) + PGDIR_SIZE; if (next > end) next = end; if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; continue; } - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + if 
(IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * pgd entry has to point pud page table in this case. + */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } } return 0; -- 2.11.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. Shutemov" Subject: [PATCH 3/6] x86/gup: Add 5-level paging support Date: Mon, 13 Mar 2017 17:33:06 +0300 Message-ID: <20170313143309.16020-4-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Return-path: In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: owner-linux-mm@kvack.org To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" List-Id: linux-arch.vger.kernel.org get_user_pages_fast() has to handle additional page table level. Signed-off-by: Kirill A. Shutemov --- arch/x86/mm/gup.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 1f3b6ef105cd..456dfdfd2249 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } /* - * 'pteval' can come from a pte, pmd or pud. We only check + * 'pteval' can come from a pte, pmd, pud or p4d. We only check * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 3 types. + * same value on all 4 types. 
*/ static inline int pte_allows_gup(unsigned long pteval, int write) { @@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, return 1; } -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&pgd, addr); + pudp = pud_offset(&p4d, addr); do { pud_t pud = *pudp; @@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp; + + p4dp = p4d_offset(&pgd, addr); + do { + p4d_t p4d = *p4dp; + + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + BUILD_BUG_ON(p4d_large(p4d)); + if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. @@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); @@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); -- 2.11.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. 
For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. Shutemov" Subject: [PATCH 6/6] x86/power: Add 5-level paging support Date: Mon, 13 Mar 2017 17:33:09 +0300 Message-ID: <20170313143309.16020-7-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Return-path: In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: owner-linux-mm@kvack.org To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" List-Id: linux-arch.vger.kernel.org set_up_temporary_text_mapping() and relocate_restore_code() require adjustments to handle additional page table level. Signed-off-by: Kirill A. Shutemov --- arch/x86/power/hibernate_64.c | 50 +++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index ded2e8272382..aa054feb1860 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -49,6 +49,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; + p4d_t *p4d; /* * The new mapping only has to cover the page containing the image @@ -63,6 +64,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * the virtual address space after switching over to the original page * tables used by the image kernel. 
*/ + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); + if (!p4d) + return -ENOMEM; + } + pud = (pud_t *)get_safe_page(GFP_ATOMIC); if (!pud) return -ENOMEM; @@ -75,8 +83,16 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - set_pgd(pgd + pgd_index(restore_jump_address), - __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_p4d(p4d + p4d_index(restore_jump_address), + __p4d(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd + pgd_index(restore_jump_address), + __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* No p4d for 4-level paging: point pgd to pud page table */ + set_pgd(pgd + pgd_index(restore_jump_address), + __pgd(__pa(pud) | _KERNPG_TABLE)); + } return 0; } @@ -124,7 +140,10 @@ static int set_up_temporary_mappings(void) static int relocate_restore_code(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; + pmd_t *pmd; + pte_t *pte; relocated_restore_code = get_safe_page(GFP_ATOMIC); if (!relocated_restore_code) @@ -134,22 +153,25 @@ static int relocate_restore_code(void) /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); - pud = pud_offset(pgd, relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); if (pud_large(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - } else { - pmd_t *pmd = pmd_offset(pud, relocated_restore_code); - - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - } else { - pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); - } + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) 
{ + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: __flush_tlb_all(); - return 0; } -- 2.11.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. Shutemov" Subject: [PATCH 5/6] x86/vmalloc: Add 5-level paging support Date: Mon, 13 Mar 2017 17:33:08 +0300 Message-ID: <20170313143309.16020-6-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Return-path: In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: owner-linux-mm@kvack.org To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" List-Id: linux-arch.vger.kernel.org Modify vmalloc_fault() to handle additional page table level. With 4-level paging, copying happens on p4d level, as we have pgd_none() always false if p4d_t is folded. Signed-off-by: Kirill A. 
Shutemov --- arch/x86/mm/fault.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 605fd5e8e048..1928ea02e182 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -435,6 +435,7 @@ void vmalloc_sync_all(void) static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; + p4d_t *p4d, *p4d_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -458,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); - } else { + } else if (CONFIG_PGTABLE_LEVELS > 4) { + /* + * With folded p4d, pgd_none() is always false. So pgd may + * point to empty page table entry and pgd_page_vaddr() + * will return garbage. + * + * We will do the correct sanity check on p4d level. + */ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } + /* With 4-level paging, copying happens on p4d level. */ + p4d = p4d_offset(pgd, address); + p4d_ref = p4d_offset(pgd_ref, address); + if (p4d_none(*p4d_ref)) + return -1; + + if (p4d_none(*p4d)) { + set_p4d(p4d, *p4d_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + } + /* * Below here mismatches are bugs because these lower tables * are shared: */ - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); + pud = pud_offset(p4d, address); + pud_ref = pud_offset(p4d_ref, address); if (pud_none(*pud_ref)) return -1; -- 2.11.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. 
Shutemov" Subject: [PATCH 2/6] x86/mm: Convert trivial cases of page table walk to 5-level paging Date: Mon, 13 Mar 2017 17:33:05 +0300 Message-ID: <20170313143309.16020-3-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Return-path: Received: from mga09.intel.com ([134.134.136.24]:2757 "EHLO mga09.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752114AbdCMOei (ORCPT ); Mon, 13 Mar 2017 10:34:38 -0400 In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" This patch only covers simple cases. Less trivial cases will be converted with separate patches. Signed-off-by: Kirill A. 
Shutemov --- arch/x86/kernel/tboot.c | 6 +++++- arch/x86/kernel/vm86_32.c | 6 +++++- arch/x86/mm/fault.c | 39 +++++++++++++++++++++++++++++++++------ arch/x86/mm/init_32.c | 22 ++++++++++++++++------ arch/x86/mm/ioremap.c | 3 ++- arch/x86/mm/pgtable.c | 4 +++- arch/x86/mm/pgtable_32.c | 8 +++++++- arch/x86/platform/efi/efi_64.c | 13 +++++++++---- arch/x86/power/hibernate_32.c | 7 +++++-- 9 files changed, 85 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..5db0f33cbf2c 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(&tboot_mm, vaddr); - pud = pud_alloc(&tboot_mm, pgd, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; pmd = pmd_alloc(&tboot_mm, pud, vaddr); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 23ee89ce59a9..62597c300d94 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) struct vm_area_struct *vma; spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm) pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; - pud = pud_offset(pgd, 0xA0000); + p4d = p4d_offset(pgd, 0xA0000); + if (p4d_none_or_clear_bad(p4d)) + goto out; + pud = pud_offset(p4d, 0xA0000); if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 428e31763cb9..605fd5e8e048 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = 
pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would - * set_pud. + * set_p4d/set_pud. */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; @@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address) if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #endif - pmd = pmd_offset(pud_offset(pgd, address), address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* @@ -526,6 +536,7 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -538,7 +549,15 @@ static void dump_pagetable(unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + printk("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; @@ -1082,6 +1101,7 @@ static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1104,7 +1124,14 
@@ spurious_fault(unsigned long error_code, unsigned long address) if (!pgd_present(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2b4b53e6793f..5ed3c141bbd5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,6 +67,7 @@ bool __read_mostly __vmalloc_start_set = false; */ static pmd_t * __init one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -75,13 +76,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); return pmd_table; } #endif - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; @@ -390,8 +393,11 @@ pte_t *kmap_pte; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - vaddr), vaddr), vaddr); + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d = p4d_offset(pgd, vaddr); + pud_t *pud = pud_offset(p4d, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + return pte_offset_kernel(pmd, vaddr); } static void __init kmap_init(void) @@ -410,6 +416,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) { unsigned long vaddr; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -418,7 +425,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, 
vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; @@ -450,6 +458,7 @@ void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -469,7 +478,8 @@ void __init native_pagetable_init(void) if (!pgd_present(*pgd)) break; - pud = pud_offset(pgd, va); + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); pmd = pmd_offset(pud, va); if (!pmd_present(*pmd)) break; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7aaa2635862d..a5e1cda85974 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -425,7 +425,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) /* Don't assume we're using swapper_pg_dir at this point */ pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(addr)]; - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6cbdff26bb96..38b6daf72deb 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -261,13 +261,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { + p4d_t *p4d; pud_t *pud; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ return; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9adce776852b..3d275a791c76 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; + p4d_t *p4d; pud_t 
*pud; pmd_t *pmd; pte_t *pte; @@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) BUG(); return; } - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) { BUG(); return; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a4695da42d77..8544dae3d1b4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -166,6 +166,7 @@ void efi_sync_low_kernel_mappings(void) { unsigned num_entries; pgd_t *pgd_k, *pgd_efi; + p4d_t *p4d_k, *p4d_efi; pud_t *pud_k, *pud_efi; if (efi_enabled(EFI_OLD_MEMMAP)) @@ -197,16 +198,20 @@ void efi_sync_low_kernel_mappings(void) BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); pgd_efi = efi_pgd + pgd_index(EFI_VA_END); - pud_efi = pud_offset(pgd_efi, 0); + p4d_efi = p4d_offset(pgd_efi, 0); + pud_efi = pud_offset(p4d_efi, 0); pgd_k = pgd_offset_k(EFI_VA_END); - pud_k = pud_offset(pgd_k, 0); + p4d_k = p4d_offset(pgd_k, 0); + pud_k = pud_offset(p4d_k, 0); num_entries = pud_index(EFI_VA_END); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); - pud_efi = pud_offset(pgd_efi, EFI_VA_START); - pud_k = pud_offset(pgd_k, EFI_VA_START); + p4d_efi = p4d_offset(pgd_efi, EFI_VA_START); + pud_efi = pud_offset(p4d_efi, EFI_VA_START); + p4d_k = p4d_offset(pgd_k, EFI_VA_START); + pud_k = pud_offset(p4d_k, EFI_VA_START); num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 9f14bd34581d..c35fdb585c68 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -32,6 +32,7 @@ pgd_t *resume_pg_dir; */ static pmd_t *resume_one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd) return NULL; set_pgd(pgd, __pgd(__pa(pmd_table) | 
_PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); #else - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); #endif -- 2.11.0 From mboxrd@z Thu Jan 1 00:00:00 1970 From: Linus Torvalds Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Mon, 13 Mar 2017 12:46:26 -0700 Message-ID: References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Return-path: In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: owner-linux-mm@kvack.org To: "Kirill A. Shutemov" Cc: Andrew Morton , the arch/x86 maintainers , Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , "linux-arch@vger.kernel.org" , linux-mm , Linux Kernel Mailing List List-Id: linux-arch.vger.kernel.org On Mon, Mar 13, 2017 at 7:33 AM, Kirill A. Shutemov wrote: > Here's the first bunch of patches of 5-level patchset. Let's see if I'm on > right track addressing Ingo's feedback. :) Considering the bug we just had with the HAVE_GENERIC_RCU_GUP code, I'm wondering if people would be willing to look at what it would take to make x86 use the generic version? The x86 version of __get_user_pages_fast() seems to be quite similar to the generic one. And it would be lovely if all the main architectures shared the same core gup code. Linus -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: Ingo Molnar Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Tue, 14 Mar 2017 08:47:29 +0100 Message-ID: <20170314074729.GA23151@gmail.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Return-path: Received: from mail-wm0-f68.google.com ([74.125.82.68]:34873 "EHLO mail-wm0-f68.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750914AbdCNHre (ORCPT ); Tue, 14 Mar 2017 03:47:34 -0400 Content-Disposition: inline In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: "Kirill A. Shutemov" Cc: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org * Kirill A. Shutemov wrote: > Here's the first bunch of patches of 5-level patchset. Let's see if I'm on > right track addressing Ingo's feedback. :) > > These patches prepare x86 code to be switched from <asm-generic/5level-fixup.h> > to <asm-generic/pgtable-nop4d.h>. It's a stepping stone for adding 5-level > paging support. > > Please review and consider applying. > > Kirill A.
Shutemov (6): > x86/mm: Extend headers with basic definitions to support 5-level > paging > x86/mm: Convert trivial cases of page table walk to 5-level paging > x86/gup: Add 5-level paging support > x86/ident_map: Add 5-level paging support > x86/vmalloc: Add 5-level paging support > x86/power: Add 5-level paging support > > arch/x86/include/asm/pgtable-2level_types.h | 1 + > arch/x86/include/asm/pgtable-3level_types.h | 1 + > arch/x86/include/asm/pgtable.h | 26 +++++++++--- > arch/x86/include/asm/pgtable_64_types.h | 1 + > arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++- > arch/x86/kernel/tboot.c | 6 ++- > arch/x86/kernel/vm86_32.c | 6 ++- > arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++---- > arch/x86/mm/gup.c | 33 ++++++++++++--- > arch/x86/mm/ident_map.c | 51 +++++++++++++++++++--- > arch/x86/mm/init_32.c | 22 +++++++--- > arch/x86/mm/ioremap.c | 3 +- > arch/x86/mm/pgtable.c | 4 +- > arch/x86/mm/pgtable_32.c | 8 +++- > arch/x86/platform/efi/efi_64.c | 13 ++++-- > arch/x86/power/hibernate_32.c | 7 ++- > arch/x86/power/hibernate_64.c | 50 ++++++++++++++++------ > 17 files changed, 269 insertions(+), 59 deletions(-) Much better! I've applied them, with (very) minor readability edits here and there, and will push them out into tip:x86/mm and tip:master after some testing - you can use that as a base for the remaining submissions. I've also applied the GUP patch, with the assumption that you'll address Linus's request to switch x86 over to the generic version. Thanks, Ingo From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. 
Shutemov" Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Tue, 14 Mar 2017 11:24:09 +0300 Message-ID: <20170314082409.gjhefteglqbfb2gy@node.shutemov.name> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Return-path: Content-Disposition: inline In-Reply-To: <20170314074729.GA23151@gmail.com> Sender: owner-linux-mm@kvack.org To: Ingo Molnar , Linus Torvalds Cc: "Kirill A. Shutemov" , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org List-Id: linux-arch.vger.kernel.org On Tue, Mar 14, 2017 at 08:47:29AM +0100, Ingo Molnar wrote: > > * Kirill A. Shutemov wrote: > > > Here's the first bunch of patches of 5-level patchset. Let's see if I'm on > > right track addressing Ingo's feedback. :) > > > > These patches prepare x86 code to be switched from > > to . It's a stepping stone for adding 5-level > > paging support. > > > > Please review and consider applying. > > > > Kirill A. 
Shutemov (6): > > x86/mm: Extend headers with basic definitions to support 5-level > > paging > > x86/mm: Convert trivial cases of page table walk to 5-level paging > > x86/gup: Add 5-level paging support > > x86/ident_map: Add 5-level paging support > > x86/vmalloc: Add 5-level paging support > > x86/power: Add 5-level paging support > > > > arch/x86/include/asm/pgtable-2level_types.h | 1 + > > arch/x86/include/asm/pgtable-3level_types.h | 1 + > > arch/x86/include/asm/pgtable.h | 26 +++++++++--- > > arch/x86/include/asm/pgtable_64_types.h | 1 + > > arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++- > > arch/x86/kernel/tboot.c | 6 ++- > > arch/x86/kernel/vm86_32.c | 6 ++- > > arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++---- > > arch/x86/mm/gup.c | 33 ++++++++++++--- > > arch/x86/mm/ident_map.c | 51 +++++++++++++++++++--- > > arch/x86/mm/init_32.c | 22 +++++++--- > > arch/x86/mm/ioremap.c | 3 +- > > arch/x86/mm/pgtable.c | 4 +- > > arch/x86/mm/pgtable_32.c | 8 +++- > > arch/x86/platform/efi/efi_64.c | 13 ++++-- > > arch/x86/power/hibernate_32.c | 7 ++- > > arch/x86/power/hibernate_64.c | 50 ++++++++++++++++------ > > 17 files changed, 269 insertions(+), 59 deletions(-) > > Much better! > > I've applied them, with (very) minor readability edits here and there, and will > push them out into tip:x86/mm and tip:master after some testing - you can use that > as a base for the remaining submissions. Thanks. > I've also applied the GUP patch, with the assumption that you'll address Linus's > request to switch x86 over to the generic version. Okay, I'll do this. I just want to make priorities clear here: is it okay to finish with the rest of 5-level paging patches first before moving to GUP_fast switch? -- Kirill A. Shutemov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: Thomas Gleixner Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Tue, 14 Mar 2017 09:33:20 +0100 (CET) Message-ID: References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> <20170314082409.gjhefteglqbfb2gy@node.shutemov.name> Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Return-path: In-Reply-To: <20170314082409.gjhefteglqbfb2gy@node.shutemov.name> Sender: owner-linux-mm@kvack.org To: "Kirill A. Shutemov" Cc: Ingo Molnar , Linus Torvalds , "Kirill A. Shutemov" , Andrew Morton , x86@kernel.org, Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org List-Id: linux-arch.vger.kernel.org On Tue, 14 Mar 2017, Kirill A. Shutemov wrote: > On Tue, Mar 14, 2017 at 08:47:29AM +0100, Ingo Molnar wrote: > > I've also applied the GUP patch, with the assumption that you'll address Linus's > > request to switch x86 over to the generic version. > > Okay, I'll do this. > > I just want to make priorities clear here: is it okay to finish with the > rest of 5-level paging patches first before moving to GUP_fast switch? I think moving it first is the preferred way to do it. Thanks, tglx -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: Linus Torvalds Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Tue, 14 Mar 2017 10:48:51 -0700 Message-ID: References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Return-path: In-Reply-To: <20170314074729.GA23151@gmail.com> Sender: owner-linux-mm@kvack.org To: Ingo Molnar Cc: "Kirill A. Shutemov" , Andrew Morton , the arch/x86 maintainers , Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , "linux-arch@vger.kernel.org" , linux-mm , Linux Kernel Mailing List List-Id: linux-arch.vger.kernel.org On Tue, Mar 14, 2017 at 12:47 AM, Ingo Molnar wrote: > > I've also applied the GUP patch, with the assumption that you'll address Linus's > request to switch x86 over to the generic version. Note that switching over to the generic version is somewhat fraught with subtle issues: (a) we need to make sure that x86 actually matches the required semantics for the generic GUP. (b) we need to make sure the atomicity of the page table reads is ok. (c) need to verify the maximum VM address properly I _think_ (a) is ok. The code (and the config option name) talks about freeing page tables using RCU, but in fact I don't think it relies on it, and it's sufficient that it disables interrupts and that that will block any IPI's. In contrast, I think (b) needs real work to make sure it's ok on 32-bit PAE with 64-bit pte entries. The generic code currently just does READ_ONCE(), while the x86 code does gup_get_pte(). And (c) means that we need to really replace that generic code that does "access_ok()": with a proper check against maximum user address (ie independent of set_fs(KERNEL_DS)). But it would be good to aim for unifying this part of the VM, considering how many bugs we've had in GUP. 
The latest 5-level typo has not been the only one. It's clearly more subtle than you'd think. So it's not quite as simple as just "switching over". I think we need to introduce that gup_get_pte() to all the generic users, and we need to introduce a "user address limit" for those architectures too. Linus -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: Michal Hocko Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Wed, 15 Mar 2017 10:23:41 +0100 Message-ID: <20170315092341.GF32620@dhcp22.suse.cz> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Return-path: Content-Disposition: inline In-Reply-To: <20170314074729.GA23151@gmail.com> Sender: owner-linux-mm@kvack.org To: Ingo Molnar Cc: "Kirill A. Shutemov" , Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org List-Id: linux-arch.vger.kernel.org On Tue 14-03-17 08:47:29, Ingo Molnar wrote: > > * Kirill A. Shutemov wrote: > > > Here's the first bunch of patches of 5-level patchset. Let's see if I'm on > > right track addressing Ingo's feedback. :) > > > > These patches prepare x86 code to be switched from > > to . It's a stepping stone for adding 5-level > > paging support. > > > > Please review and consider applying. > > > > Kirill A. 
Shutemov (6): > > x86/mm: Extend headers with basic definitions to support 5-level > > paging > > x86/mm: Convert trivial cases of page table walk to 5-level paging > > x86/gup: Add 5-level paging support > > x86/ident_map: Add 5-level paging support > > x86/vmalloc: Add 5-level paging support > > x86/power: Add 5-level paging support > > > > arch/x86/include/asm/pgtable-2level_types.h | 1 + > > arch/x86/include/asm/pgtable-3level_types.h | 1 + > > arch/x86/include/asm/pgtable.h | 26 +++++++++--- > > arch/x86/include/asm/pgtable_64_types.h | 1 + > > arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++- > > arch/x86/kernel/tboot.c | 6 ++- > > arch/x86/kernel/vm86_32.c | 6 ++- > > arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++---- > > arch/x86/mm/gup.c | 33 ++++++++++++--- > > arch/x86/mm/ident_map.c | 51 +++++++++++++++++++--- > > arch/x86/mm/init_32.c | 22 +++++++--- > > arch/x86/mm/ioremap.c | 3 +- > > arch/x86/mm/pgtable.c | 4 +- > > arch/x86/mm/pgtable_32.c | 8 +++- > > arch/x86/platform/efi/efi_64.c | 13 ++++-- > > arch/x86/power/hibernate_32.c | 7 ++- > > arch/x86/power/hibernate_64.c | 50 ++++++++++++++++------ > > 17 files changed, 269 insertions(+), 59 deletions(-) > > Much better! > > I've applied them, with (very) minor readability edits here and there, and will > push them out into tip:x86/mm and tip:master after some testing - you can use that > as a base for the remaining submissions. JFYI, I have cherry picked these and those merged via Linus tree into the mmotm git tree [1] (tag mmotm-2017-03-14-15-41) [1] git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. 
Shutemov" Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Wed, 15 Mar 2017 17:51:26 +0300 Message-ID: <20170315145126.4xgvhuavtf5icjdc@node.shutemov.name> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Return-path: Content-Disposition: inline In-Reply-To: Sender: owner-linux-mm@kvack.org To: Linus Torvalds , Andrea Arcangeli Cc: Ingo Molnar , "Kirill A. Shutemov" , Andrew Morton , the arch/x86 maintainers , Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , "linux-arch@vger.kernel.org" , linux-mm , Linux Kernel Mailing List List-Id: linux-arch.vger.kernel.org On Tue, Mar 14, 2017 at 10:48:51AM -0700, Linus Torvalds wrote: > On Tue, Mar 14, 2017 at 12:47 AM, Ingo Molnar wrote: > > > > I've also applied the GUP patch, with the assumption that you'll address Linus's > > request to switch x86 over to the generic version. > > Note that switching over to the generic version is somewhat fraught > with subtle issues: > > (a) we need to make sure that x86 actually matches the required > semantics for the generic GUP. > > (b) we need to make sure the atomicity of the page table reads is ok. > > (c) need to verify the maximum VM address properly > > I _think_ (a) is ok. The code (and the config option name) talks about > freeing page tables using RCU, but in fact I don't think it relies on > it, and it's sufficient that it disables interrupts and that that will > block any IPI's. > > In contrast, I think (b) needs real work to make sure it's ok on > 32-bit PAE with 64-bit pte entries. The generic code currently just > does READ_ONCE(), while the x86 code does gup_get_pte(). + Andrea. Looking at gup_get_pte() makes me think: why don't we need the same approach for pmd level (pud is not relevant for PAE)? Looks like a bug to me.
We have pmd_read_atomic() to address the issue in other places. The helper doesn't match the semantics required for GUP_fast(), but we clearly need to address the issue. The pgd dereference doesn't look good on PAE either. Or am I missing something? Heck, we don't even have READ_ONCE() on x86 for page table entry dereference. Looks like a bug waiting to explode. And not only on PAE. -- Kirill A. Shutemov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 From: "Kirill A. Shutemov" Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Wed, 15 Mar 2017 18:42:05 +0300 Message-ID: <20170315154205.33hvpvkbjypgkd7g@node.shutemov.name> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Return-path: Content-Disposition: inline In-Reply-To: Sender: owner-linux-mm@kvack.org To: Linus Torvalds Cc: Ingo Molnar , "Kirill A. Shutemov" , Andrew Morton , the arch/x86 maintainers , Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , "linux-arch@vger.kernel.org" , linux-mm , Linux Kernel Mailing List List-Id: linux-arch.vger.kernel.org On Tue, Mar 14, 2017 at 10:48:51AM -0700, Linus Torvalds wrote: > On Tue, Mar 14, 2017 at 12:47 AM, Ingo Molnar wrote: > > > > I've also applied the GUP patch, with the assumption that you'll address Linus's > > request to switch x86 over to the generic version. > > Note that switching over to the generic version is somewhat fraught > with subtle issues: > > (a) we need to make sure that x86 actually matches the required > semantics for the generic GUP. > > (b) we need to make sure the atomicity of the page table reads is ok.
> > (c) need to verify the maximum VM address properly There's another difference with generic version: it uses page_cache_get_speculative() instead of plain get_page() on x86. That's somewhat more expensive, but probably fine. -- Kirill A. Shutemov -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga06.intel.com ([134.134.136.31]:2017 "EHLO mga06.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753330AbdCMOde (ORCPT ); Mon, 13 Mar 2017 10:33:34 -0400 From: "Kirill A. Shutemov" Subject: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Date: Mon, 13 Mar 2017 17:33:03 +0300 Message-ID: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" Message-ID: <20170313143303.1pjTpJQgVlm38luipwIBkk-74CA-MnA2Dfq4yjYazt8@z> Here's the first bunch of patches of 5-level patchset. Let's see if I'm on right track addressing Ingo's feedback. :) These patches prepare x86 code to be switched from to . It's a stepping stone for adding 5-level paging support. Please review and consider applying. Kirill A. 
Shutemov (6): x86/mm: Extend headers with basic definitions to support 5-level paging x86/mm: Convert trivial cases of page table walk to 5-level paging x86/gup: Add 5-level paging support x86/ident_map: Add 5-level paging support x86/vmalloc: Add 5-level paging support x86/power: Add 5-level paging support arch/x86/include/asm/pgtable-2level_types.h | 1 + arch/x86/include/asm/pgtable-3level_types.h | 1 + arch/x86/include/asm/pgtable.h | 26 +++++++++--- arch/x86/include/asm/pgtable_64_types.h | 1 + arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++- arch/x86/kernel/tboot.c | 6 ++- arch/x86/kernel/vm86_32.c | 6 ++- arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++---- arch/x86/mm/gup.c | 33 ++++++++++++--- arch/x86/mm/ident_map.c | 51 +++++++++++++++++++--- arch/x86/mm/init_32.c | 22 +++++++--- arch/x86/mm/ioremap.c | 3 +- arch/x86/mm/pgtable.c | 4 +- arch/x86/mm/pgtable_32.c | 8 +++- arch/x86/platform/efi/efi_64.c | 13 ++++-- arch/x86/power/hibernate_32.c | 7 ++- arch/x86/power/hibernate_64.c | 50 ++++++++++++++++------ 17 files changed, 269 insertions(+), 59 deletions(-) -- 2.11.0 From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga01.intel.com ([192.55.52.88]:32508 "EHLO mga01.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753307AbdCMOde (ORCPT ); Mon, 13 Mar 2017 10:33:34 -0400 From: "Kirill A. Shutemov" Subject: [PATCH 3/6] x86/gup: Add 5-level paging support Date: Mon, 13 Mar 2017 17:33:06 +0300 Message-ID: <20170313143309.16020-4-kirill.shutemov@linux.intel.com> In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. 
Shutemov" Message-ID: <20170313143306.9WfyHA5pXqsQ2l-AmVrngbxtDui7ZfOhf8Csy01wOWw@z> get_user_pages_fast() has to handle additional page table level. Signed-off-by: Kirill A. Shutemov --- arch/x86/mm/gup.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 1f3b6ef105cd..456dfdfd2249 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } /* - * 'pteval' can come from a pte, pmd or pud. We only check + * 'pteval' can come from a pte, pmd, pud or p4d. We only check * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 3 types. + * same value on all 4 types. */ static inline int pte_allows_gup(unsigned long pteval, int write) { @@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, return 1; } -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&pgd, addr); + pudp = pud_offset(&p4d, addr); do { pud_t pud = *pudp; @@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp; + + p4dp = p4d_offset(&pgd, addr); + do { + p4d_t p4d = *p4dp; + + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + BUILD_BUG_ON(p4d_large(p4d)); + if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. 
@@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); @@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); -- 2.11.0 From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga11.intel.com ([192.55.52.93]:7410 "EHLO mga11.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753539AbdCMOdh (ORCPT ); Mon, 13 Mar 2017 10:33:37 -0400 From: "Kirill A. Shutemov" Subject: [PATCH 4/6] x86/ident_map: Add 5-level paging support Date: Mon, 13 Mar 2017 17:33:07 +0300 Message-ID: <20170313143309.16020-5-kirill.shutemov@linux.intel.com> In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" Message-ID: <20170313143307.shDOG0buzNEE13clsAXDpe2hHyzMPqejO-53elVnKpo@z> Add additional page table level handling. It's mostly mechanical. The only quirk is that with p4d folded, 'pgd' is equal to 'p4d' in kernel_ident_mapping_init(). pgd entry has to point pud page table in this case. Signed-off-by: Kirill A.
Shutemov --- arch/x86/mm/ident_map.c | 51 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 4473cb4f8b90..1c3f166bd8c3 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, return 0; } +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + ident_pud_init(info, pud, addr, next); + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + ident_pud_init(info, pud, addr, next); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend) { @@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); - pud_t *pud; + p4d_t *p4d; next = (addr & PGDIR_MASK) + PGDIR_SIZE; if (next > end) next = end; if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; continue; } - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + if 
(IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * pgd entry has to point pud page table in this case. + */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } } return 0; -- 2.11.0 From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga06.intel.com ([134.134.136.31]:26334 "EHLO mga06.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750860AbdCMOeS (ORCPT ); Mon, 13 Mar 2017 10:34:18 -0400 From: "Kirill A. Shutemov" Subject: [PATCH 5/6] x86/vmalloc: Add 5-level paging support Date: Mon, 13 Mar 2017 17:33:08 +0300 Message-ID: <20170313143309.16020-6-kirill.shutemov@linux.intel.com> In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" Message-ID: <20170313143308.XLwe_9BrzYzNjrBnZt2bOruRMBhGUjOMNFgf3MLJ0P0@z> Modify vmalloc_fault() to handle additional page table level. With 4-level paging, copying happens on p4d level, as we have pgd_none() always false if p4d_t is folded. Signed-off-by: Kirill A. 
Shutemov --- arch/x86/mm/fault.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 605fd5e8e048..1928ea02e182 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -435,6 +435,7 @@ void vmalloc_sync_all(void) static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; + p4d_t *p4d, *p4d_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -458,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); - } else { + } else if (CONFIG_PGTABLE_LEVELS > 4) { + /* + * With folded p4d, pgd_none() is always false. So pgd may + * point to empty page table entry and pgd_page_vaddr() + * will return garbage. + * + * We will do the correct sanity check on p4d level. + */ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } + /* With 4-level paging, copying happens on p4d level. */ + p4d = p4d_offset(pgd, address); + p4d_ref = p4d_offset(pgd_ref, address); + if (p4d_none(*p4d_ref)) + return -1; + + if (p4d_none(*p4d)) { + set_p4d(p4d, *p4d_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + } + /* * Below here mismatches are bugs because these lower tables * are shared: */ - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); + pud = pud_offset(p4d, address); + pud_ref = pud_offset(p4d_ref, address); if (pud_none(*pud_ref)) return -1; -- 2.11.0 From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mga01.intel.com ([192.55.52.88]:32508 "EHLO mga01.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753541AbdCMOdh (ORCPT ); Mon, 13 Mar 2017 10:33:37 -0400 From: "Kirill A. 
Shutemov" Subject: [PATCH 6/6] x86/power: Add 5-level paging support Date: Mon, 13 Mar 2017 17:33:09 +0300 Message-ID: <20170313143309.16020-7-kirill.shutemov@linux.intel.com> In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" Message-ID: <20170313143309.Df9hMP72axjWsLs-EPuVoVuELi3dbIOUzhNrEdnPO4o@z> set_up_temporary_text_mapping() and relocate_restore_code() require adjustments to handle additional page table level. Signed-off-by: Kirill A. Shutemov --- arch/x86/power/hibernate_64.c | 50 +++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index ded2e8272382..aa054feb1860 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -49,6 +49,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; + p4d_t *p4d; /* * The new mapping only has to cover the page containing the image @@ -63,6 +64,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * the virtual address space after switching over to the original page * tables used by the image kernel. 
*/ + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); + if (!p4d) + return -ENOMEM; + } + pud = (pud_t *)get_safe_page(GFP_ATOMIC); if (!pud) return -ENOMEM; @@ -75,8 +83,16 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - set_pgd(pgd + pgd_index(restore_jump_address), - __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_p4d(p4d + p4d_index(restore_jump_address), + __p4d(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd + pgd_index(restore_jump_address), + __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* No p4d for 4-level paging: point pgd to pud page table */ + set_pgd(pgd + pgd_index(restore_jump_address), + __pgd(__pa(pud) | _KERNPG_TABLE)); + } return 0; } @@ -124,7 +140,10 @@ static int set_up_temporary_mappings(void) static int relocate_restore_code(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; + pmd_t *pmd; + pte_t *pte; relocated_restore_code = get_safe_page(GFP_ATOMIC); if (!relocated_restore_code) @@ -134,22 +153,25 @@ static int relocate_restore_code(void) /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); - pud = pud_offset(pgd, relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); if (pud_large(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - } else { - pmd_t *pmd = pmd_offset(pud, relocated_restore_code); - - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - } else { - pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); - } + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) 
{ + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: __flush_tlb_all(); - return 0; } -- 2.11.0 From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-it0-f52.google.com ([209.85.214.52]:37700 "EHLO mail-it0-f52.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751122AbdCMTq2 (ORCPT ); Mon, 13 Mar 2017 15:46:28 -0400 MIME-Version: 1.0 In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> From: Linus Torvalds Date: Mon, 13 Mar 2017 12:46:26 -0700 Message-ID: Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Content-Type: text/plain; charset=UTF-8 Sender: linux-arch-owner@vger.kernel.org List-ID: To: "Kirill A. Shutemov" Cc: Andrew Morton , the arch/x86 maintainers , Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , "linux-arch@vger.kernel.org" , linux-mm , Linux Kernel Mailing List Message-ID: <20170313194626.wZLwlu2inuAabyQEIt_jj79ntnh1-tErNK-00mCt-cE@z> On Mon, Mar 13, 2017 at 7:33 AM, Kirill A. Shutemov wrote: > Here's the first bunch of patches of 5-level patchset. Let's see if I'm on > right track addressing Ingo's feedback. :) Considering the bug we just had with the HAVE_GENERIC_RCU_GUP code, I'm wondering if people would be willing to look at what it would take to make x86 use the generic version? The x86 version of __get_user_pages_fast() seems to be quite similar to the generic one. And it would be lovely if all the main architectures shared the same core gup code. 
Linus From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-wm0-f66.google.com ([74.125.82.66]:36121 "EHLO mail-wm0-f66.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751054AbdCNIYS (ORCPT ); Tue, 14 Mar 2017 04:24:18 -0400 Received: by mail-wm0-f66.google.com with SMTP id v190so13538120wme.3 for ; Tue, 14 Mar 2017 01:24:17 -0700 (PDT) Date: Tue, 14 Mar 2017 11:24:09 +0300 From: "Kirill A. Shutemov" Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Message-ID: <20170314082409.gjhefteglqbfb2gy@node.shutemov.name> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20170314074729.GA23151@gmail.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Ingo Molnar , Linus Torvalds Cc: "Kirill A. Shutemov" , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org Message-ID: <20170314082409.NOOYgHGJjHmiUTqNCUb4lrOh8POS6yCGXNwprVQNX40@z> On Tue, Mar 14, 2017 at 08:47:29AM +0100, Ingo Molnar wrote: > > * Kirill A. Shutemov wrote: > > > Here's the first bunch of patches of 5-level patchset. Let's see if I'm on > > right track addressing Ingo's feedback. :) > > > > These patches prepare x86 code to be switched from > > to . It's a stepping stone for adding 5-level > > paging support. > > > > Please review and consider applying. > > > > Kirill A. 
Shutemov (6): > > x86/mm: Extend headers with basic definitions to support 5-level > > paging > > x86/mm: Convert trivial cases of page table walk to 5-level paging > > x86/gup: Add 5-level paging support > > x86/ident_map: Add 5-level paging support > > x86/vmalloc: Add 5-level paging support > > x86/power: Add 5-level paging support > > > > arch/x86/include/asm/pgtable-2level_types.h | 1 + > > arch/x86/include/asm/pgtable-3level_types.h | 1 + > > arch/x86/include/asm/pgtable.h | 26 +++++++++--- > > arch/x86/include/asm/pgtable_64_types.h | 1 + > > arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++- > > arch/x86/kernel/tboot.c | 6 ++- > > arch/x86/kernel/vm86_32.c | 6 ++- > > arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++---- > > arch/x86/mm/gup.c | 33 ++++++++++++--- > > arch/x86/mm/ident_map.c | 51 +++++++++++++++++++--- > > arch/x86/mm/init_32.c | 22 +++++++--- > > arch/x86/mm/ioremap.c | 3 +- > > arch/x86/mm/pgtable.c | 4 +- > > arch/x86/mm/pgtable_32.c | 8 +++- > > arch/x86/platform/efi/efi_64.c | 13 ++++-- > > arch/x86/power/hibernate_32.c | 7 ++- > > arch/x86/power/hibernate_64.c | 50 ++++++++++++++++------ > > 17 files changed, 269 insertions(+), 59 deletions(-) > > Much better! > > I've applied them, with (very) minor readability edits here and there, and will > push them out into tip:x86/mm and tip:master after some testing - you can use that > as a base for the remaining submissions. Thanks. > I've also applied the GUP patch, with the assumption that you'll address Linus's > request to switch x86 over to the generic version. Okay, I'll do this. I just want to make priorities clear here: is it okay to finish with the rest of 5-level paging patches first before moving to GUP_fast switch? -- Kirill A. 
Shutemov From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from Galois.linutronix.de ([146.0.238.70]:40465 "EHLO Galois.linutronix.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750828AbdCNIeU (ORCPT ); Tue, 14 Mar 2017 04:34:20 -0400 Date: Tue, 14 Mar 2017 09:33:20 +0100 (CET) From: Thomas Gleixner Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 In-Reply-To: <20170314082409.gjhefteglqbfb2gy@node.shutemov.name> Message-ID: References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> <20170314082409.gjhefteglqbfb2gy@node.shutemov.name> MIME-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Sender: linux-arch-owner@vger.kernel.org List-ID: To: "Kirill A. Shutemov" Cc: Ingo Molnar , Linus Torvalds , "Kirill A. Shutemov" , Andrew Morton , x86@kernel.org, Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org Message-ID: <20170314083320.DRgJ34dXz0xHejL2n6klKKBwtd2LQa-9BQDLZvVf7m4@z> On Tue, 14 Mar 2017, Kirill A. Shutemov wrote: > On Tue, Mar 14, 2017 at 08:47:29AM +0100, Ingo Molnar wrote: > > I've also applied the GUP patch, with the assumption that you'll address Linus's > > request to switch x86 over to the generic version. > > Okay, I'll do this. > > I just want to make priorities clear here: is it okay to finish with the > rest of 5-level paging patches first before moving to GUP_fast switch? I think moving it first is the preferred way to do it. 
Thanks, tglx From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-it0-f67.google.com ([209.85.214.67]:34077 "EHLO mail-it0-f67.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751901AbdCNRsx (ORCPT ); Tue, 14 Mar 2017 13:48:53 -0400 MIME-Version: 1.0 In-Reply-To: <20170314074729.GA23151@gmail.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> From: Linus Torvalds Date: Tue, 14 Mar 2017 10:48:51 -0700 Message-ID: Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Content-Type: text/plain; charset=UTF-8 Sender: linux-arch-owner@vger.kernel.org List-ID: To: Ingo Molnar Cc: "Kirill A. Shutemov" , Andrew Morton , the arch/x86 maintainers , Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , "linux-arch@vger.kernel.org" , linux-mm , Linux Kernel Mailing List Message-ID: <20170314174851.NS9jsCF5qlfp9HI45wkkpA2oOiT8JeylhQnOqVlm4-A@z> On Tue, Mar 14, 2017 at 12:47 AM, Ingo Molnar wrote: > > I've also applied the GUP patch, with the assumption that you'll address Linus's > request to switch x86 over to the generic version. Note that switching over to the generic version is somewhat fraught with subtle issues: (a) we need to make sure that x86 actually matches the required semantics for the generic GUP. (b) we need to make sure the atomicity of the page table reads is ok. (c) need to verify the maximum VM address properly I _think_ (a) is ok. The code (and the config option name) talks about freeing page tables using RCU, but in fact I don't think it relies on it, and it's sufficient that it disables interrupts and that that will block any IPI's. In contrast, I think (b) needs real work to make sure it's ok on 32-bit PAE with 64-bit pte entries. The generic code currently just does READ_ONCE(), while the x86 code does gup_get_pte(). 
And (c) means that we need to really replace that generic code that does "access_ok()": with a proper check against maximum user address (ie independent of set_fs(KERNEL_DS)). But it would be good to aim for unifying this part of the VM, considering how many bugs we've had in GUP. The latest 5-level typo has not been the only one. It's clearly more subtle than you'd think. So it's not quite as simple as just "switching over". I think we need to introduce that gup_get_pte() to all the generic users, and we need to introduce a "user address limit" for those architectures too. Linus From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mx2.suse.de ([195.135.220.15]:57777 "EHLO mx2.suse.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750836AbdCOJXo (ORCPT ); Wed, 15 Mar 2017 05:23:44 -0400 Date: Wed, 15 Mar 2017 10:23:41 +0100 From: Michal Hocko Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Message-ID: <20170315092341.GF32620@dhcp22.suse.cz> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20170314074729.GA23151@gmail.com> Sender: linux-arch-owner@vger.kernel.org List-ID: To: Ingo Molnar Cc: "Kirill A. Shutemov" , Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org Message-ID: <20170315092341.qsE36ayriMuwuDNqBObnIKiaj035t1PnDDC7lzACk2U@z> On Tue 14-03-17 08:47:29, Ingo Molnar wrote: > > * Kirill A. Shutemov wrote: > > > Here's the first bunch of patches of 5-level patchset. Let's see if I'm on > > right track addressing Ingo's feedback. :) > > > > These patches prepare x86 code to be switched from > > to . It's a stepping stone for adding 5-level > > paging support. 
> > > > Please review and consider applying. > > > > Kirill A. Shutemov (6): > > x86/mm: Extend headers with basic definitions to support 5-level > > paging > > x86/mm: Convert trivial cases of page table walk to 5-level paging > > x86/gup: Add 5-level paging support > > x86/ident_map: Add 5-level paging support > > x86/vmalloc: Add 5-level paging support > > x86/power: Add 5-level paging support > > > > arch/x86/include/asm/pgtable-2level_types.h | 1 + > > arch/x86/include/asm/pgtable-3level_types.h | 1 + > > arch/x86/include/asm/pgtable.h | 26 +++++++++--- > > arch/x86/include/asm/pgtable_64_types.h | 1 + > > arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++- > > arch/x86/kernel/tboot.c | 6 ++- > > arch/x86/kernel/vm86_32.c | 6 ++- > > arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++---- > > arch/x86/mm/gup.c | 33 ++++++++++++--- > > arch/x86/mm/ident_map.c | 51 +++++++++++++++++++--- > > arch/x86/mm/init_32.c | 22 +++++++--- > > arch/x86/mm/ioremap.c | 3 +- > > arch/x86/mm/pgtable.c | 4 +- > > arch/x86/mm/pgtable_32.c | 8 +++- > > arch/x86/platform/efi/efi_64.c | 13 ++++-- > > arch/x86/power/hibernate_32.c | 7 ++- > > arch/x86/power/hibernate_64.c | 50 ++++++++++++++++------ > > 17 files changed, 269 insertions(+), 59 deletions(-) > > Much better! > > I've applied them, with (very) minor readability edits here and there, and will > push them out into tip:x86/mm and tip:master after some testing - you can use that > as a base for the remaining submissions. 
JFYI, I have cherry picked these and those merged via Linus tree into the mmotm git tree [1] (tag mmotm-2017-03-14-15-41) [1] git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git -- Michal Hocko SUSE Labs From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-wr0-f196.google.com ([209.85.128.196]:34162 "EHLO mail-wr0-f196.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751771AbdCRRBT (ORCPT ); Sat, 18 Mar 2017 13:01:19 -0400 Received: by mail-wr0-f196.google.com with SMTP id u48so13039614wrc.1 for ; Sat, 18 Mar 2017 10:01:18 -0700 (PDT) Date: Wed, 15 Mar 2017 18:42:05 +0300 From: "Kirill A. Shutemov" Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Message-ID: <20170315154205.33hvpvkbjypgkd7g@node.shutemov.name> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds Cc: Ingo Molnar , "Kirill A. Shutemov" , Andrew Morton , the arch/x86 maintainers , Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , "linux-arch@vger.kernel.org" , linux-mm , Linux Kernel Mailing List Message-ID: <20170315154205.uL_UmHifThBCwSlcrlxOXgh88Olr5kke7-aue1HUS0I@z> On Tue, Mar 14, 2017 at 10:48:51AM -0700, Linus Torvalds wrote: > On Tue, Mar 14, 2017 at 12:47 AM, Ingo Molnar wrote: > > > > I've also applied the GUP patch, with the assumption that you'll address Linus's > > request to switch x86 over to the generic version. > > Note that switching over to the generic version is somewhat fraught > with subtle issues: > > (a) we need to make sure that x86 actually matches the required > semantics for the generic GUP. > > (b) we need to make sure the atomicity of the page table reads is ok. 
> > (c) need to verify the maximum VM address properly There's another difference with generic version: it uses page_cache_get_speculative() instead of plain get_page() on x86. That's somewhat more expensive, but probably fine. -- Kirill A. Shutemov From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-wr0-f195.google.com ([209.85.128.195]:33773 "EHLO mail-wr0-f195.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751774AbdCRRBU (ORCPT ); Sat, 18 Mar 2017 13:01:20 -0400 Received: by mail-wr0-f195.google.com with SMTP id g10so13048672wrg.0 for ; Sat, 18 Mar 2017 10:01:19 -0700 (PDT) Date: Wed, 15 Mar 2017 17:51:26 +0300 From: "Kirill A. Shutemov" Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Message-ID: <20170315145126.4xgvhuavtf5icjdc@node.shutemov.name> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> <20170314074729.GA23151@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: Sender: linux-arch-owner@vger.kernel.org List-ID: To: Linus Torvalds , Andrea Arcangeli Cc: Ingo Molnar , "Kirill A. Shutemov" , Andrew Morton , the arch/x86 maintainers , Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , "linux-arch@vger.kernel.org" , linux-mm , Linux Kernel Mailing List Message-ID: <20170315145126.xLZt5qwgsNcqyOEm1JEnTGej5aGJ8PZ_E-OXfIQIRpw@z> On Tue, Mar 14, 2017 at 10:48:51AM -0700, Linus Torvalds wrote: > On Tue, Mar 14, 2017 at 12:47 AM, Ingo Molnar wrote: > > > > I've also applied the GUP patch, with the assumption that you'll address Linus's > > request to switch x86 over to the generic version. > > Note that switching over to the generic version is somewhat fraught > with subtle issues: > > (a) we need to make sure that x86 actually matches the required > semantics for the generic GUP. 
> > (b) we need to make sure the atomicity of the page table reads is ok. > > (c) need to verify the maximum VM address properly > > I _think_ (a) is ok. The code (and the config option name) talks about > freeing page tables using RCU, but in fact I don't think it relies on > it, and it's sufficient that it disables interrupts and that that will > block any IPI's. > > In contrast, I think (b) needs real work to make sure it's ok on > 32-bit PAE with 64-bit pte entries. The generic code currently just > does READ_ONCE(), while the x86 code does gup_get_pte(). + Andrea. Looking at gup_get_pte() makes me think: why don't we need the same approach for pmd level (pud is not relevant for PAE)? Looks like a bug to me. We have pmd_read_atomic() to address the issue in other places. The helper doesn't match the semantics required for GUP_fast(), but we clearly need to address the issue. pgd dereference doesn't look good on PAE either. Or am I missing something? Heck, we don't even have READ_ONCE() on x86 for page table entry dereference. Looks like a bug waiting to explode. And not only on PAE. -- Kirill A. Shutemov From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pf0-f200.google.com (mail-pf0-f200.google.com [209.85.192.200]) by kanga.kvack.org (Postfix) with ESMTP id 9BD636B038C for ; Mon, 13 Mar 2017 10:33:32 -0400 (EDT) Received: by mail-pf0-f200.google.com with SMTP id x63so301459730pfx.7 for ; Mon, 13 Mar 2017 07:33:32 -0700 (PDT) Received: from mga03.intel.com (mga03.intel.com. [134.134.136.65]) by mx.google.com with ESMTPS id 184si2805425pga.29.2017.03.13.07.33.31 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Mon, 13 Mar 2017 07:33:31 -0700 (PDT) From: "Kirill A.
Shutemov" Subject: [PATCH 1/6] x86/mm: Extend headers with basic definitions to support 5-level paging Date: Mon, 13 Mar 2017 17:33:04 +0300 Message-Id: <20170313143309.16020-2-kirill.shutemov@linux.intel.com> In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: owner-linux-mm@kvack.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" This patch extends x86 headers to enable 5-level paging support. It's still based on <asm-generic/5level-fixup.h>. We will get to the point where we can have <asm-generic/pgtable-nop4d.h> later. Signed-off-by: Kirill A. Shutemov --- arch/x86/include/asm/pgtable-2level_types.h | 1 + arch/x86/include/asm/pgtable-3level_types.h | 1 + arch/x86/include/asm/pgtable.h | 26 ++++++++++++++++++++----- arch/x86/include/asm/pgtable_64_types.h | 1 + arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++++++++++++++++++- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 392576433e77..373ab1de909f 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -7,6 +7,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index bcc89625ebe5..b8a4341faafa 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -7,6 +7,7 @@ typedef u64 pteval_t; typedef u64 pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64
pgdval_t; typedef u64 pgprotval_t; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cfb36b8c024..6f6f351e0a81 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -179,6 +179,17 @@ static inline unsigned long pud_pfn(pud_t pud) return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } +static inline unsigned long p4d_pfn(p4d_t p4d) +{ + return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; +} + +static inline int p4d_large(p4d_t p4d) +{ + /* No 512 GiB pages yet */ + return 0; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -770,6 +781,16 @@ static inline int pud_large(pud_t pud) } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline unsigned long p4d_index(unsigned long address) +{ + return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); +} + #if CONFIG_PGTABLE_LEVELS > 3 static inline int pgd_present(pgd_t pgd) { @@ -788,11 +809,6 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. 
*/ -static inline unsigned long pud_index(unsigned long address) -{ - return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); -} - static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) { return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3a264200c62f..0b2797e5083c 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -13,6 +13,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 62484333673d..df08535f774a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -272,9 +272,20 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } -#if CONFIG_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 4 + +#error FIXME + +#else #include <asm-generic/5level-fixup.h> +static inline p4dval_t native_p4d_val(p4d_t p4d) +{ + return native_pgd_val(p4d); +} +#endif + +#if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -318,6 +329,22 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline p4dval_t p4d_pfn_mask(p4d_t p4d) +{ + /* No 512 GiB huge pages yet */ + return PTE_PFN_MASK; +} + +static inline p4dval_t p4d_flags_mask(p4d_t p4d) +{ + return ~p4d_pfn_mask(p4d); +} + +static inline p4dval_t p4d_flags(p4d_t p4d) +{ + return native_p4d_val(p4d) & p4d_flags_mask(p4d); +} + static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) @@ -461,6 +488,7 @@ enum pg_level { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, + PG_LEVEL_512G, PG_LEVEL_NUM }; -- 2.11.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body
to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pg0-f70.google.com (mail-pg0-f70.google.com [74.125.83.70]) by kanga.kvack.org (Postfix) with ESMTP id 5E3D16B038D for ; Mon, 13 Mar 2017 10:34:47 -0400 (EDT) Received: by mail-pg0-f70.google.com with SMTP id g2so302377152pge.7 for ; Mon, 13 Mar 2017 07:34:47 -0700 (PDT) Received: from mga11.intel.com (mga11.intel.com. [192.55.52.93]) by mx.google.com with ESMTPS id b21si11603285pgg.194.2017.03.13.07.34.45 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Mon, 13 Mar 2017 07:34:46 -0700 (PDT) From: "Kirill A. Shutemov" Subject: [PATCH 2/6] x86/mm: Convert trivial cases of page table walk to 5-level paging Date: Mon, 13 Mar 2017 17:33:05 +0300 Message-Id: <20170313143309.16020-3-kirill.shutemov@linux.intel.com> In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: owner-linux-mm@kvack.org List-ID: To: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" Cc: Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org, "Kirill A. Shutemov" This patch only covers simple cases. Less trivial cases will be converted with separate patches. Signed-off-by: Kirill A. 
Shutemov --- arch/x86/kernel/tboot.c | 6 +++++- arch/x86/kernel/vm86_32.c | 6 +++++- arch/x86/mm/fault.c | 39 +++++++++++++++++++++++++++++++++------ arch/x86/mm/init_32.c | 22 ++++++++++++++++------ arch/x86/mm/ioremap.c | 3 ++- arch/x86/mm/pgtable.c | 4 +++- arch/x86/mm/pgtable_32.c | 8 +++++++- arch/x86/platform/efi/efi_64.c | 13 +++++++++---- arch/x86/power/hibernate_32.c | 7 +++++-- 9 files changed, 85 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..5db0f33cbf2c 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(&tboot_mm, vaddr); - pud = pud_alloc(&tboot_mm, pgd, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; pmd = pmd_alloc(&tboot_mm, pud, vaddr); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 23ee89ce59a9..62597c300d94 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) struct vm_area_struct *vma; spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm) pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; - pud = pud_offset(pgd, 0xA0000); + p4d = p4d_offset(pgd, 0xA0000); + if (p4d_none_or_clear_bad(p4d)) + goto out; + pud = pud_offset(p4d, 0xA0000); if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 428e31763cb9..605fd5e8e048 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = 
pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would - * set_pud. + * set_p4d/set_pud. */ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; @@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address) if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #endif - pmd = pmd_offset(pud_offset(pgd, address), address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* @@ -526,6 +536,7 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -538,7 +549,15 @@ static void dump_pagetable(unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + printk("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; @@ -1082,6 +1101,7 @@ static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1104,7 +1124,14 
@@ spurious_fault(unsigned long error_code, unsigned long address) if (!pgd_present(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2b4b53e6793f..5ed3c141bbd5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,6 +67,7 @@ bool __read_mostly __vmalloc_start_set = false; */ static pmd_t * __init one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -75,13 +76,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); return pmd_table; } #endif - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; @@ -390,8 +393,11 @@ pte_t *kmap_pte; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - vaddr), vaddr), vaddr); + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d = p4d_offset(pgd, vaddr); + pud_t *pud = pud_offset(p4d, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + return pte_offset_kernel(pmd, vaddr); } static void __init kmap_init(void) @@ -410,6 +416,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) { unsigned long vaddr; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -418,7 +425,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, 
vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; @@ -450,6 +458,7 @@ void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -469,7 +478,8 @@ void __init native_pagetable_init(void) if (!pgd_present(*pgd)) break; - pud = pud_offset(pgd, va); + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); pmd = pmd_offset(pud, va); if (!pmd_present(*pmd)) break; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7aaa2635862d..a5e1cda85974 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -425,7 +425,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) /* Don't assume we're using swapper_pg_dir at this point */ pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(addr)]; - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6cbdff26bb96..38b6daf72deb 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -261,13 +261,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { + p4d_t *p4d; pud_t *pud; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ return; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9adce776852b..3d275a791c76 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; + p4d_t *p4d; pud_t 
*pud; pmd_t *pmd; pte_t *pte; @@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) BUG(); return; } - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) { BUG(); return; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a4695da42d77..8544dae3d1b4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -166,6 +166,7 @@ void efi_sync_low_kernel_mappings(void) { unsigned num_entries; pgd_t *pgd_k, *pgd_efi; + p4d_t *p4d_k, *p4d_efi; pud_t *pud_k, *pud_efi; if (efi_enabled(EFI_OLD_MEMMAP)) @@ -197,16 +198,20 @@ void efi_sync_low_kernel_mappings(void) BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); pgd_efi = efi_pgd + pgd_index(EFI_VA_END); - pud_efi = pud_offset(pgd_efi, 0); + p4d_efi = p4d_offset(pgd_efi, 0); + pud_efi = pud_offset(p4d_efi, 0); pgd_k = pgd_offset_k(EFI_VA_END); - pud_k = pud_offset(pgd_k, 0); + p4d_k = p4d_offset(pgd_k, 0); + pud_k = pud_offset(p4d_k, 0); num_entries = pud_index(EFI_VA_END); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); - pud_efi = pud_offset(pgd_efi, EFI_VA_START); - pud_k = pud_offset(pgd_k, EFI_VA_START); + p4d_efi = p4d_offset(pgd_efi, EFI_VA_START); + pud_efi = pud_offset(p4d_efi, EFI_VA_START); + p4d_k = p4d_offset(pgd_k, EFI_VA_START); + pud_k = pud_offset(p4d_k, EFI_VA_START); num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 9f14bd34581d..c35fdb585c68 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -32,6 +32,7 @@ pgd_t *resume_pg_dir; */ static pmd_t *resume_one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd) return NULL; set_pgd(pgd, __pgd(__pa(pmd_table) | 
_PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); #else - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); #endif -- 2.11.0 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-wr0-f197.google.com (mail-wr0-f197.google.com [209.85.128.197]) by kanga.kvack.org (Postfix) with ESMTP id B59126B038A for ; Tue, 14 Mar 2017 03:47:33 -0400 (EDT) Received: by mail-wr0-f197.google.com with SMTP id v66so49260300wrc.4 for ; Tue, 14 Mar 2017 00:47:33 -0700 (PDT) Received: from mail-wm0-x242.google.com (mail-wm0-x242.google.com. [2a00:1450:400c:c09::242]) by mx.google.com with ESMTPS id z46si3968029wrz.204.2017.03.14.00.47.32 for (version=TLS1_2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Tue, 14 Mar 2017 00:47:32 -0700 (PDT) Received: by mail-wm0-x242.google.com with SMTP id v190so13394780wme.3 for ; Tue, 14 Mar 2017 00:47:32 -0700 (PDT) Date: Tue, 14 Mar 2017 08:47:29 +0100 From: Ingo Molnar Subject: Re: [PATCH 0/6] x86: 5-level paging enabling for v4.12, Part 1 Message-ID: <20170314074729.GA23151@gmail.com> References: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20170313143309.16020-1-kirill.shutemov@linux.intel.com> Sender: owner-linux-mm@kvack.org List-ID: To: "Kirill A. Shutemov" Cc: Linus Torvalds , Andrew Morton , x86@kernel.org, Thomas Gleixner , Ingo Molnar , Arnd Bergmann , "H. Peter Anvin" , Andi Kleen , Dave Hansen , Andy Lutomirski , Michal Hocko , linux-arch@vger.kernel.org, linux-mm@kvack.org, linux-kernel@vger.kernel.org * Kirill A. 
Shutemov wrote: > Here's the first bunch of patches of 5-level patchset. Let's see if I'm on > right track addressing Ingo's feedback. :) > > These patches prepare x86 code to be switched from <asm-generic/5level-fixup.h> > to <asm-generic/pgtable-nop4d.h>. It's a stepping stone for adding 5-level > paging support. > > Please review and consider applying. > > Kirill A. Shutemov (6): > x86/mm: Extend headers with basic definitions to support 5-level > paging > x86/mm: Convert trivial cases of page table walk to 5-level paging > x86/gup: Add 5-level paging support > x86/ident_map: Add 5-level paging support > x86/vmalloc: Add 5-level paging support > x86/power: Add 5-level paging support > > arch/x86/include/asm/pgtable-2level_types.h | 1 + > arch/x86/include/asm/pgtable-3level_types.h | 1 + > arch/x86/include/asm/pgtable.h | 26 +++++++++--- > arch/x86/include/asm/pgtable_64_types.h | 1 + > arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++- > arch/x86/kernel/tboot.c | 6 ++- > arch/x86/kernel/vm86_32.c | 6 ++- > arch/x86/mm/fault.c | 66 +++++++++++++++++++++++++---- > arch/x86/mm/gup.c | 33 ++++++++++++--- > arch/x86/mm/ident_map.c | 51 +++++++++++++++++++--- > arch/x86/mm/init_32.c | 22 +++++++--- > arch/x86/mm/ioremap.c | 3 +- > arch/x86/mm/pgtable.c | 4 +- > arch/x86/mm/pgtable_32.c | 8 +++- > arch/x86/platform/efi/efi_64.c | 13 ++++-- > arch/x86/power/hibernate_32.c | 7 ++- > arch/x86/power/hibernate_64.c | 50 ++++++++++++++++------ > 17 files changed, 269 insertions(+), 59 deletions(-) Much better! I've applied them, with (very) minor readability edits here and there, and will push them out into tip:x86/mm and tip:master after some testing - you can use that as a base for the remaining submissions. I've also applied the GUP patch, with the assumption that you'll address Linus's request to switch x86 over to the generic version. Thanks, Ingo -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: email@kvack.org From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1750771AbdCNJjV (ORCPT ); Tue, 14 Mar 2017 05:39:21 -0400 Received: from terminus.zytor.com ([65.50.211.136]:46100 "EHLO terminus.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750742AbdCNJjT (ORCPT ); Tue, 14 Mar 2017 05:39:19 -0400 Date: Tue, 14 Mar 2017 02:37:46 -0700 From: "tip-bot for Kirill A. Shutemov" Message-ID: Cc: mhocko@suse.com, torvalds@linux-foundation.org, arnd@arndb.de, mingo@kernel.org, dave.hansen@intel.com, kirill.shutemov@linux.intel.com, tglx@linutronix.de, luto@kernel.org, bp@alien8.de, brgerst@gmail.com, dvlasenk@redhat.com, akpm@linux-foundation.org, hpa@zytor.com, peterz@infradead.org, linux-kernel@vger.kernel.org, jpoimboe@redhat.com Reply-To: peterz@infradead.org, hpa@zytor.com, brgerst@gmail.com, dvlasenk@redhat.com, akpm@linux-foundation.org, jpoimboe@redhat.com, linux-kernel@vger.kernel.org, mingo@kernel.org, torvalds@linux-foundation.org, mhocko@suse.com, arnd@arndb.de, luto@kernel.org, bp@alien8.de, tglx@linutronix.de, dave.hansen@intel.com, kirill.shutemov@linux.intel.com In-Reply-To: <20170313143309.16020-3-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-3-kirill.shutemov@linux.intel.com> To: linux-tip-commits@vger.kernel.org Subject: [tip:x86/mm] x86/mm: Convert trivial cases of page table walk to 5-level paging Git-Commit-ID: e0c4f6750e130541cca7390739d25feb522acfff X-Mailer: tip-git-log-daemon Robot-ID: Robot-Unsubscribe: Contact to get blacklisted from these emails MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset=UTF-8 Content-Disposition: inline Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Commit-ID: e0c4f6750e130541cca7390739d25feb522acfff Gitweb: http://git.kernel.org/tip/e0c4f6750e130541cca7390739d25feb522acfff Author: Kirill A. 
Shutemov AuthorDate: Mon, 13 Mar 2017 17:33:05 +0300 Committer: Ingo Molnar CommitDate: Tue, 14 Mar 2017 08:45:08 +0100 x86/mm: Convert trivial cases of page table walk to 5-level paging This patch only covers simple cases. Less trivial cases will be converted with separate patches. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tboot.c | 6 +++++- arch/x86/kernel/vm86_32.c | 6 +++++- arch/x86/mm/fault.c | 39 +++++++++++++++++++++++++++++++++------ arch/x86/mm/init_32.c | 22 ++++++++++++++++------ arch/x86/mm/ioremap.c | 3 ++- arch/x86/mm/pgtable.c | 4 +++- arch/x86/mm/pgtable_32.c | 8 +++++++- arch/x86/platform/efi/efi_64.c | 13 +++++++++---- arch/x86/power/hibernate_32.c | 7 +++++-- 9 files changed, 85 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1..5db0f33 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(&tboot_mm, vaddr); - pud = pud_alloc(&tboot_mm, pgd, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; pmd = pmd_alloc(&tboot_mm, pud, vaddr); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 23ee89c..62597c3 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) struct vm_area_struct *vma; spinlock_t 
*ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm) pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; - pud = pud_offset(pgd, 0xA0000); + p4d = p4d_offset(pgd, 0xA0000); + if (p4d_none_or_clear_bad(p4d)) + goto out; + pud = pud_offset(p4d, 0xA0000); if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 428e3176..605fd5e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would - * set_pud. + * set_p4d/set_pud. 
*/ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; @@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address) if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #endif - pmd = pmd_offset(pud_offset(pgd, address), address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* @@ -526,6 +536,7 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -538,7 +549,15 @@ static void dump_pagetable(unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + printk("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; @@ -1082,6 +1101,7 @@ static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1104,7 +1124,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (!pgd_present(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); if 
(!pud_present(*pud)) return 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2b4b53e..5ed3c14 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,6 +67,7 @@ bool __read_mostly __vmalloc_start_set = false; */ static pmd_t * __init one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -75,13 +76,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); return pmd_table; } #endif - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; @@ -390,8 +393,11 @@ pte_t *kmap_pte; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - vaddr), vaddr), vaddr); + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d = p4d_offset(pgd, vaddr); + pud_t *pud = pud_offset(p4d, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + return pte_offset_kernel(pmd, vaddr); } static void __init kmap_init(void) @@ -410,6 +416,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) { unsigned long vaddr; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -418,7 +425,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; @@ -450,6 +458,7 @@ void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ 
-469,7 +478,8 @@ void __init native_pagetable_init(void) if (!pgd_present(*pgd)) break; - pud = pud_offset(pgd, va); + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); pmd = pmd_offset(pud, va); if (!pmd_present(*pmd)) break; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7aaa263..a5e1cda 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -425,7 +425,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) /* Don't assume we're using swapper_pg_dir at this point */ pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(addr)]; - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6cbdff2..38b6daf 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -261,13 +261,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { + p4d_t *p4d; pud_t *pud; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ return; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9adce77..3d275a7 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) BUG(); return; } - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) { BUG(); return; diff --git a/arch/x86/platform/efi/efi_64.c 
b/arch/x86/platform/efi/efi_64.c index a4695da..8544dae 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -166,6 +166,7 @@ void efi_sync_low_kernel_mappings(void) { unsigned num_entries; pgd_t *pgd_k, *pgd_efi; + p4d_t *p4d_k, *p4d_efi; pud_t *pud_k, *pud_efi; if (efi_enabled(EFI_OLD_MEMMAP)) @@ -197,16 +198,20 @@ void efi_sync_low_kernel_mappings(void) BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); pgd_efi = efi_pgd + pgd_index(EFI_VA_END); - pud_efi = pud_offset(pgd_efi, 0); + p4d_efi = p4d_offset(pgd_efi, 0); + pud_efi = pud_offset(p4d_efi, 0); pgd_k = pgd_offset_k(EFI_VA_END); - pud_k = pud_offset(pgd_k, 0); + p4d_k = p4d_offset(pgd_k, 0); + pud_k = pud_offset(p4d_k, 0); num_entries = pud_index(EFI_VA_END); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); - pud_efi = pud_offset(pgd_efi, EFI_VA_START); - pud_k = pud_offset(pgd_k, EFI_VA_START); + p4d_efi = p4d_offset(pgd_efi, EFI_VA_START); + pud_efi = pud_offset(p4d_efi, EFI_VA_START); + p4d_k = p4d_offset(pgd_k, EFI_VA_START); + pud_k = pud_offset(p4d_k, EFI_VA_START); num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 9f14bd3..c35fdb5 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -32,6 +32,7 @@ pgd_t *resume_pg_dir; */ static pmd_t *resume_one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd) return NULL; set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); #else - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); #endif From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org 
via listexpand id S1750869AbdCNJkL (ORCPT ); Tue, 14 Mar 2017 05:40:11 -0400 Received: from terminus.zytor.com ([65.50.211.136]:46226 "EHLO terminus.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750787AbdCNJkJ (ORCPT ); Tue, 14 Mar 2017 05:40:09 -0400 Date: Tue, 14 Mar 2017 02:39:25 -0700 From: "tip-bot for Kirill A. Shutemov" Message-ID: Cc: tglx@linutronix.de, akpm@linux-foundation.org, arnd@arndb.de, linux-kernel@vger.kernel.org, bp@alien8.de, kirill.shutemov@linux.intel.com, torvalds@linux-foundation.org, luto@kernel.org, hpa@zytor.com, mhocko@suse.com, dvlasenk@redhat.com, mingo@kernel.org, dave.hansen@intel.com, brgerst@gmail.com, peterz@infradead.org, jpoimboe@redhat.com Reply-To: brgerst@gmail.com, peterz@infradead.org, jpoimboe@redhat.com, dave.hansen@intel.com, luto@kernel.org, mingo@kernel.org, hpa@zytor.com, mhocko@suse.com, dvlasenk@redhat.com, arnd@arndb.de, linux-kernel@vger.kernel.org, tglx@linutronix.de, akpm@linux-foundation.org, bp@alien8.de, kirill.shutemov@linux.intel.com, torvalds@linux-foundation.org In-Reply-To: <20170313143309.16020-6-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-6-kirill.shutemov@linux.intel.com> To: linux-tip-commits@vger.kernel.org Subject: [tip:x86/mm] x86/mm/vmalloc: Add 5-level paging support Git-Commit-ID: b50858ce3e2a25a7f4638464e857853fbfc81823 X-Mailer: tip-git-log-daemon Robot-ID: Robot-Unsubscribe: Contact to get blacklisted from these emails MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset=UTF-8 Content-Disposition: inline Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Commit-ID: b50858ce3e2a25a7f4638464e857853fbfc81823 Gitweb: http://git.kernel.org/tip/b50858ce3e2a25a7f4638464e857853fbfc81823 Author: Kirill A. 
Shutemov AuthorDate: Mon, 13 Mar 2017 17:33:08 +0300 Committer: Ingo Molnar CommitDate: Tue, 14 Mar 2017 08:45:08 +0100 x86/mm/vmalloc: Add 5-level paging support Modify vmalloc_fault() to handle additional page table level. With 4-level paging, copying happens on p4d level, as we have pgd_none() always false if p4d_t is folded. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 605fd5e..8ad91a0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -435,6 +435,7 @@ void vmalloc_sync_all(void) static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; + p4d_t *p4d, *p4d_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -458,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); - } else { + } else if (CONFIG_PGTABLE_LEVELS > 4) { + /* + * With folded p4d, pgd_none() is always false, so the pgd may + * point to an empty page table entry and pgd_page_vaddr() + * will return garbage. + * + * We will do the correct sanity check on the p4d level. + */ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } + /* With 4-level paging, copying happens on the p4d level. 
*/ + p4d = p4d_offset(pgd, address); + p4d_ref = p4d_offset(pgd_ref, address); + if (p4d_none(*p4d_ref)) + return -1; + + if (p4d_none(*p4d)) { + set_p4d(p4d, *p4d_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + } + /* * Below here mismatches are bugs because these lower tables * are shared: */ - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); + pud = pud_offset(p4d, address); + pud_ref = pud_offset(p4d_ref, address); if (pud_none(*pud_ref)) return -1; From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1750927AbdCNJkS (ORCPT ); Tue, 14 Mar 2017 05:40:18 -0400 Received: from terminus.zytor.com ([65.50.211.136]:46242 "EHLO terminus.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750885AbdCNJkQ (ORCPT ); Tue, 14 Mar 2017 05:40:16 -0400 Date: Tue, 14 Mar 2017 02:38:18 -0700 From: "tip-bot for Kirill A. Shutemov" Message-ID: Cc: mhocko@suse.com, jpoimboe@redhat.com, mingo@kernel.org, luto@kernel.org, kirill.shutemov@linux.intel.com, dvlasenk@redhat.com, hpa@zytor.com, tglx@linutronix.de, dave.hansen@intel.com, akpm@linux-foundation.org, peterz@infradead.org, linux-kernel@vger.kernel.org, bp@alien8.de, torvalds@linux-foundation.org, brgerst@gmail.com, arnd@arndb.de Reply-To: brgerst@gmail.com, arnd@arndb.de, torvalds@linux-foundation.org, bp@alien8.de, linux-kernel@vger.kernel.org, peterz@infradead.org, akpm@linux-foundation.org, dave.hansen@intel.com, tglx@linutronix.de, hpa@zytor.com, dvlasenk@redhat.com, kirill.shutemov@linux.intel.com, luto@kernel.org, mingo@kernel.org, jpoimboe@redhat.com, mhocko@suse.com In-Reply-To: <20170313143309.16020-4-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-4-kirill.shutemov@linux.intel.com> To: linux-tip-commits@vger.kernel.org Subject: [tip:x86/mm] x86/mm/gup: Add 5-level paging support Git-Commit-ID: 0318e5abe1c0933b8bf6763a1a0d3caec4f0826d 
X-Mailer: tip-git-log-daemon Robot-ID: Robot-Unsubscribe: Contact to get blacklisted from these emails MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset=UTF-8 Content-Disposition: inline Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Commit-ID: 0318e5abe1c0933b8bf6763a1a0d3caec4f0826d Gitweb: http://git.kernel.org/tip/0318e5abe1c0933b8bf6763a1a0d3caec4f0826d Author: Kirill A. Shutemov AuthorDate: Mon, 13 Mar 2017 17:33:06 +0300 Committer: Ingo Molnar CommitDate: Tue, 14 Mar 2017 08:45:08 +0100 x86/mm/gup: Add 5-level paging support Extend get_user_pages_fast() to handle an additional page table level. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/gup.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 1f3b6ef..456dfdf 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } /* - * 'pteval' can come from a pte, pmd or pud. We only check + * 'pteval' can come from a pte, pmd, pud or p4d. We only check * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 3 types. + * same value on all 4 types. 
*/ static inline int pte_allows_gup(unsigned long pteval, int write) { @@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, return 1; } -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&pgd, addr); + pudp = pud_offset(&p4d, addr); do { pud_t pud = *pudp; @@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp; + + p4dp = p4d_offset(&pgd, addr); + do { + p4d_t p4d = *p4dp; + + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + BUILD_BUG_ON(p4d_large(p4d)); + if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. 
@@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); @@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751097AbdCNJlM (ORCPT ); Tue, 14 Mar 2017 05:41:12 -0400 Received: from terminus.zytor.com ([65.50.211.136]:46320 "EHLO terminus.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751061AbdCNJlK (ORCPT ); Tue, 14 Mar 2017 05:41:10 -0400 Date: Tue, 14 Mar 2017 02:37:13 -0700 From: "tip-bot for Kirill A. 
Shutemov" Message-ID: Cc: dave.hansen@intel.com, akpm@linux-foundation.org, torvalds@linux-foundation.org, brgerst@gmail.com, luto@kernel.org, mingo@kernel.org, peterz@infradead.org, kirill.shutemov@linux.intel.com, mhocko@suse.com, bp@alien8.de, jpoimboe@redhat.com, arnd@arndb.de, linux-kernel@vger.kernel.org, dvlasenk@redhat.com, tglx@linutronix.de, hpa@zytor.com Reply-To: bp@alien8.de, mhocko@suse.com, jpoimboe@redhat.com, dvlasenk@redhat.com, tglx@linutronix.de, linux-kernel@vger.kernel.org, arnd@arndb.de, hpa@zytor.com, akpm@linux-foundation.org, dave.hansen@intel.com, torvalds@linux-foundation.org, brgerst@gmail.com, kirill.shutemov@linux.intel.com, peterz@infradead.org, luto@kernel.org, mingo@kernel.org In-Reply-To: <20170313143309.16020-2-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-2-kirill.shutemov@linux.intel.com> To: linux-tip-commits@vger.kernel.org Subject: [tip:x86/mm] x86/mm: Extend headers with basic definitions to support 5-level paging Git-Commit-ID: fe1e8c3e9634071ac608172e29bf997596d17c7c X-Mailer: tip-git-log-daemon Robot-ID: Robot-Unsubscribe: Contact to get blacklisted from these emails MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset=UTF-8 Content-Disposition: inline Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Commit-ID: fe1e8c3e9634071ac608172e29bf997596d17c7c Gitweb: http://git.kernel.org/tip/fe1e8c3e9634071ac608172e29bf997596d17c7c Author: Kirill A. Shutemov AuthorDate: Mon, 13 Mar 2017 17:33:04 +0300 Committer: Ingo Molnar CommitDate: Tue, 14 Mar 2017 08:45:07 +0100 x86/mm: Extend headers with basic definitions to support 5-level paging This patch extends x86 headers to enable 5-level paging support. It's still based on <asm-generic/5level-fixup.h>. We will get to the point where we can have <asm-generic/pgtable-nop4d.h> later. Signed-off-by: Kirill A.
Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable-2level_types.h | 1 + arch/x86/include/asm/pgtable-3level_types.h | 1 + arch/x86/include/asm/pgtable.h | 26 ++++++++++++++++++++----- arch/x86/include/asm/pgtable_64_types.h | 1 + arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++++++++++++++++++- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 3925764..373ab1d 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -7,6 +7,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index bcc8962..b8a4341 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -7,6 +7,7 @@ typedef u64 pteval_t; typedef u64 pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64 pgdval_t; typedef u64 pgprotval_t; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cfb36b..6f6f351 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -179,6 +179,17 @@ static inline unsigned long pud_pfn(pud_t pud) return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } +static inline unsigned long p4d_pfn(p4d_t p4d) +{ + return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; 
+} + +static inline int p4d_large(p4d_t p4d) +{ + /* No 512 GiB pages yet */ + return 0; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -770,6 +781,16 @@ static inline int pud_large(pud_t pud) } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline unsigned long p4d_index(unsigned long address) +{ + return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); +} + #if CONFIG_PGTABLE_LEVELS > 3 static inline int pgd_present(pgd_t pgd) { @@ -788,11 +809,6 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. */ -static inline unsigned long pud_index(unsigned long address) -{ - return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); -} - static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) { return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3a26420..0b2797e 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -13,6 +13,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 6248433..df08535 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -272,9 +272,20 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } -#if CONFIG_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 4 + +#error FIXME + +#else #include <asm-generic/5level-fixup.h> +static inline p4dval_t native_p4d_val(p4d_t p4d) +{ + return native_pgd_val(p4d); +} +#endif + +#if
CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -318,6 +329,22 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline p4dval_t p4d_pfn_mask(p4d_t p4d) +{ + /* No 512 GiB huge pages yet */ + return PTE_PFN_MASK; +} + +static inline p4dval_t p4d_flags_mask(p4d_t p4d) +{ + return ~p4d_pfn_mask(p4d); +} + +static inline p4dval_t p4d_flags(p4d_t p4d) +{ + return native_p4d_val(p4d) & p4d_flags_mask(p4d); +} + static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) @@ -461,6 +488,7 @@ enum pg_level { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, + PG_LEVEL_512G, PG_LEVEL_NUM }; From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751126AbdCNJlR (ORCPT ); Tue, 14 Mar 2017 05:41:17 -0400 Received: from terminus.zytor.com ([65.50.211.136]:46326 "EHLO terminus.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751071AbdCNJlO (ORCPT ); Tue, 14 Mar 2017 05:41:14 -0400 Date: Tue, 14 Mar 2017 02:39:59 -0700 From: "tip-bot for Kirill A. 
Shutemov" Message-ID: Cc: dvlasenk@redhat.com, torvalds@linux-foundation.org, akpm@linux-foundation.org, linux-kernel@vger.kernel.org, jpoimboe@redhat.com, peterz@infradead.org, luto@kernel.org, kirill.shutemov@linux.intel.com, bp@alien8.de, mhocko@suse.com, mingo@kernel.org, arnd@arndb.de, hpa@zytor.com, brgerst@gmail.com, tglx@linutronix.de, dave.hansen@intel.com Reply-To: linux-kernel@vger.kernel.org, dvlasenk@redhat.com, akpm@linux-foundation.org, torvalds@linux-foundation.org, jpoimboe@redhat.com, luto@kernel.org, peterz@infradead.org, mingo@kernel.org, arnd@arndb.de, mhocko@suse.com, kirill.shutemov@linux.intel.com, bp@alien8.de, dave.hansen@intel.com, tglx@linutronix.de, hpa@zytor.com, brgerst@gmail.com In-Reply-To: <20170313143309.16020-7-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-7-kirill.shutemov@linux.intel.com> To: linux-tip-commits@vger.kernel.org Subject: [tip:x86/mm] x86/power: Add 5-level paging support Git-Commit-ID: 06c830a48346643e195801460dfe16d96ba4dff5 X-Mailer: tip-git-log-daemon Robot-ID: Robot-Unsubscribe: Contact to get blacklisted from these emails MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset=UTF-8 Content-Disposition: inline Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Commit-ID: 06c830a48346643e195801460dfe16d96ba4dff5 Gitweb: http://git.kernel.org/tip/06c830a48346643e195801460dfe16d96ba4dff5 Author: Kirill A. Shutemov AuthorDate: Mon, 13 Mar 2017 17:33:09 +0300 Committer: Ingo Molnar CommitDate: Tue, 14 Mar 2017 08:45:09 +0100 x86/power: Add 5-level paging support set_up_temporary_text_mapping() and relocate_restore_code() require adjustments to handle additional page table level. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-7-kirill.shutemov@linux.intel.com [ Minor readability edits. ] Signed-off-by: Ingo Molnar --- arch/x86/power/hibernate_64.c | 47 ++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index ded2e82..2a9f993 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -49,6 +49,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; + p4d_t *p4d; /* * The new mapping only has to cover the page containing the image @@ -63,6 +64,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * the virtual address space after switching over to the original page * tables used by the image kernel. */ + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); + if (!p4d) + return -ENOMEM; + } + pud = (pud_t *)get_safe_page(GFP_ATOMIC); if (!pud) return -ENOMEM; @@ -75,8 +83,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - set_pgd(pgd + pgd_index(restore_jump_address), - __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* No p4d for 4-level paging: point the pgd to the pud page table */ + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); + } return 0; } @@ -124,7 +137,10 @@ static int set_up_temporary_mappings(void) static int relocate_restore_code(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; 
+ pmd_t *pmd; + pte_t *pte; relocated_restore_code = get_safe_page(GFP_ATOMIC); if (!relocated_restore_code) @@ -134,22 +150,25 @@ static int relocate_restore_code(void) /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); - pud = pud_offset(pgd, relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); if (pud_large(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - } else { - pmd_t *pmd = pmd_offset(pud, relocated_restore_code); - - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - } else { - pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); - } + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: __flush_tlb_all(); - return 0; } From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751069AbdCNJlK (ORCPT ); Tue, 14 Mar 2017 05:41:10 -0400 Received: from terminus.zytor.com ([65.50.211.136]:46306 "EHLO terminus.zytor.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750808AbdCNJlG (ORCPT ); Tue, 14 Mar 2017 05:41:06 -0400 Date: Tue, 14 Mar 2017 02:38:51 -0700 From: "tip-bot for Kirill A. 
Shutemov" Message-ID: Cc: mingo@kernel.org, torvalds@linux-foundation.org, brgerst@gmail.com, tglx@linutronix.de, peterz@infradead.org, linux-kernel@vger.kernel.org, bp@alien8.de, arnd@arndb.de, dvlasenk@redhat.com, akpm@linux-foundation.org, kirill.shutemov@linux.intel.com, luto@kernel.org, dave.hansen@intel.com, hpa@zytor.com, mhocko@suse.com, jpoimboe@redhat.com Reply-To: dvlasenk@redhat.com, akpm@linux-foundation.org, kirill.shutemov@linux.intel.com, luto@kernel.org, dave.hansen@intel.com, hpa@zytor.com, mhocko@suse.com, jpoimboe@redhat.com, mingo@kernel.org, torvalds@linux-foundation.org, brgerst@gmail.com, tglx@linutronix.de, peterz@infradead.org, linux-kernel@vger.kernel.org, bp@alien8.de, arnd@arndb.de In-Reply-To: <20170313143309.16020-5-kirill.shutemov@linux.intel.com> References: <20170313143309.16020-5-kirill.shutemov@linux.intel.com> To: linux-tip-commits@vger.kernel.org Subject: [tip:x86/mm] x86/mm/ident_map: Add 5-level paging support Git-Commit-ID: ea3b5e60ce804403ca019039d6331368521348de X-Mailer: tip-git-log-daemon Robot-ID: Robot-Unsubscribe: Contact to get blacklisted from these emails MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset=UTF-8 Content-Disposition: inline Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Commit-ID: ea3b5e60ce804403ca019039d6331368521348de Gitweb: http://git.kernel.org/tip/ea3b5e60ce804403ca019039d6331368521348de Author: Kirill A. Shutemov AuthorDate: Mon, 13 Mar 2017 17:33:07 +0300 Committer: Ingo Molnar CommitDate: Tue, 14 Mar 2017 08:45:08 +0100 x86/mm/ident_map: Add 5-level paging support Add additional page table level handling. It's mostly mechanical. The only quirk is that with p4d folded, 'pgd' is equal to 'p4d' in kernel_ident_mapping_init(). The pgd entry has to point to the pud page table in this case. Signed-off-by: Kirill A.
Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ident_map.c | 51 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 4473cb4..04210a2 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, return 0; } +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + ident_pud_init(info, pud, addr, next); + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + ident_pud_init(info, pud, addr, next); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend) { @@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); - pud_t *pud; + p4d_t *p4d; next = (addr & PGDIR_MASK) + PGDIR_SIZE; if (next > end) next = end; if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); 
if (result) return result; continue; } - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * The pgd entry has to point to the pud page table in this case. + */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } } return 0;