* [PATCH v1 2/9] powerpc/vdso: Remove get_page() in vdso_pagelist initialization
From: Christophe Leroy @ 2020-08-25 13:54 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <df48ed76cf8a756a7f97ed42a1a39d0a404014bc.1598363608.git.christophe.leroy@csgroup.eu>
Partly copied from commit 16fb1a9bec61 ("arm64: vdso: clean up
vdso_pagelist initialization").
No need to get_page() the vdso text/data - these are part of the
kernel image.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/kernel/vdso.c | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 88a4a02ed4c4..3bc4d5b1980b 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -757,11 +757,9 @@ static int __init vdso_init(void)
if (!vdso32_pagelist)
goto alloc_failed;
- for (i = 0; i < vdso32_pages; i++) {
- struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
- get_page(pg);
- vdso32_pagelist[i] = pg;
- }
+ for (i = 0; i < vdso32_pages; i++)
+ vdso32_pagelist[i] = virt_to_page(vdso32_kbase + i * PAGE_SIZE);
+
vdso32_pagelist[i++] = virt_to_page(vdso_data);
vdso32_pagelist[i] = NULL;
#endif
@@ -772,17 +770,13 @@ static int __init vdso_init(void)
if (!vdso64_pagelist)
goto alloc_failed;
- for (i = 0; i < vdso64_pages; i++) {
- struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
- get_page(pg);
- vdso64_pagelist[i] = pg;
- }
+ for (i = 0; i < vdso64_pages; i++)
+ vdso64_pagelist[i] = virt_to_page(vdso64_kbase + i * PAGE_SIZE);
+
vdso64_pagelist[i++] = virt_to_page(vdso_data);
vdso64_pagelist[i] = NULL;
#endif /* CONFIG_PPC64 */
- get_page(virt_to_page(vdso_data));
-
smp_wmb();
vdso_ready = 1;
--
2.25.0
^ permalink raw reply related
* [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization
From: Christophe Leroy @ 2020-08-25 13:54 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <df48ed76cf8a756a7f97ed42a1a39d0a404014bc.1598363608.git.christophe.leroy@csgroup.eu>
There is no need for all those #ifdefs around the pagelist initialisation;
use IS_ENABLED() instead, as GCC will discard the unused static variables.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/kernel/vdso.c | 57 +++++++++++++++-----------------------
1 file changed, 22 insertions(+), 35 deletions(-)
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index daef14a284a3..bbb69832fd46 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -51,15 +51,13 @@ static struct page **vdso32_pagelist;
unsigned long vdso32_sigtramp;
unsigned long vdso32_rt_sigtramp;
-#ifdef CONFIG_VDSO32
extern char vdso32_start, vdso32_end;
-#endif
-#ifdef CONFIG_PPC64
extern char vdso64_start, vdso64_end;
static void *vdso64_kbase = &vdso64_start;
static unsigned int vdso64_pages;
static struct page **vdso64_pagelist;
+#ifdef CONFIG_PPC64
unsigned long vdso64_rt_sigtramp;
#endif /* CONFIG_PPC64 */
@@ -134,7 +132,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (!vdso_ready)
return 0;
-#ifdef CONFIG_PPC64
if (is_32bit_task()) {
vdso_pagelist = vdso32_pagelist;
vdso_pages = vdso32_pages;
@@ -149,11 +146,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
*/
vdso_base = 0;
}
-#else
- vdso_pagelist = vdso32_pagelist;
- vdso_pages = vdso32_pages;
- vdso_base = VDSO32_MBASE;
-#endif
current->mm->context.vdso_base = 0;
@@ -718,16 +710,14 @@ static int __init vdso_init(void)
vdso_data->icache_block_size = ppc64_caches.l1i.block_size;
vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size;
vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size;
+#endif /* CONFIG_PPC64 */
/*
* Calculate the size of the 64 bits vDSO
*/
vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages);
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_VDSO32
vdso32_kbase = &vdso32_start;
/*
@@ -735,8 +725,6 @@ static int __init vdso_init(void)
*/
vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
-#endif
-
/*
* Setup the syscall map in the vDOS
@@ -750,30 +738,30 @@ static int __init vdso_init(void)
if (vdso_setup())
goto setup_failed;
-#ifdef CONFIG_VDSO32
- /* Make sure pages are in the correct state */
- vdso32_pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *),
- GFP_KERNEL);
- if (!vdso32_pagelist)
- goto alloc_failed;
+ if (IS_ENABLED(CONFIG_VDSO32)) {
+ /* Make sure pages are in the correct state */
+ vdso32_pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!vdso32_pagelist)
+ goto alloc_failed;
- for (i = 0; i < vdso32_pages; i++)
- vdso32_pagelist[i] = virt_to_page(vdso32_kbase + i * PAGE_SIZE);
+ for (i = 0; i < vdso32_pages; i++)
+ vdso32_pagelist[i] = virt_to_page(vdso32_kbase + i * PAGE_SIZE);
- vdso32_pagelist[i] = virt_to_page(vdso_data);
-#endif
+ vdso32_pagelist[i] = virt_to_page(vdso_data);
+ }
-#ifdef CONFIG_PPC64
- vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *),
- GFP_KERNEL);
- if (!vdso64_pagelist)
- goto alloc_failed;
+ if (IS_ENABLED(CONFIG_PPC64)) {
+ vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!vdso64_pagelist)
+ goto alloc_failed;
- for (i = 0; i < vdso64_pages; i++)
- vdso64_pagelist[i] = virt_to_page(vdso64_kbase + i * PAGE_SIZE);
+ for (i = 0; i < vdso64_pages; i++)
+ vdso64_pagelist[i] = virt_to_page(vdso64_kbase + i * PAGE_SIZE);
- vdso64_pagelist[i] = virt_to_page(vdso_data);
-#endif /* CONFIG_PPC64 */
+ vdso64_pagelist[i] = virt_to_page(vdso_data);
+ }
smp_wmb();
vdso_ready = 1;
@@ -784,9 +772,8 @@ static int __init vdso_init(void)
pr_err("vDSO setup failure, not enabled !\n");
alloc_failed:
vdso32_pages = 0;
-#ifdef CONFIG_PPC64
vdso64_pages = 0;
-#endif
+
return 0;
}
arch_initcall(vdso_init);
--
2.25.0
^ permalink raw reply related
* [PATCH v1 5/9] powerpc/vdso: move to _install_special_mapping() and remove arch_vma_name()
From: Christophe Leroy @ 2020-08-25 13:54 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <df48ed76cf8a756a7f97ed42a1a39d0a404014bc.1598363608.git.christophe.leroy@csgroup.eu>
From commit 2fea7f6c98f5 ("arm64: vdso: move to
_install_special_mapping and remove arch_vma_name").
Use the new _install_special_mapping() API added by
commit a62c34bd2a8a ("x86, mm: Improve _install_special_mapping
and fix x86 vdso naming"), which obsoletes install_special_mapping().
Also remove arch_vma_name(), as the name is now handled by the new API.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/kernel/vdso.c | 59 +++++++++++++++++++-------------------
1 file changed, 30 insertions(+), 29 deletions(-)
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index bbb69832fd46..4ccfc0dc96b5 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -47,7 +47,6 @@
static unsigned int vdso32_pages;
static void *vdso32_kbase;
-static struct page **vdso32_pagelist;
unsigned long vdso32_sigtramp;
unsigned long vdso32_rt_sigtramp;
@@ -56,7 +55,6 @@ extern char vdso32_start, vdso32_end;
extern char vdso64_start, vdso64_end;
static void *vdso64_kbase = &vdso64_start;
static unsigned int vdso64_pages;
-static struct page **vdso64_pagelist;
#ifdef CONFIG_PPC64
unsigned long vdso64_rt_sigtramp;
#endif /* CONFIG_PPC64 */
@@ -117,6 +115,14 @@ struct lib64_elfinfo
};
+static struct vm_special_mapping vdso32_spec __ro_after_init = {
+ .name = "[vdso]",
+};
+
+static struct vm_special_mapping vdso64_spec __ro_after_init = {
+ .name = "[vdso]",
+};
+
/*
* This is called from binfmt_elf, we create the special vma for the
* vDSO and insert it into the mm struct tree
@@ -124,7 +130,8 @@ struct lib64_elfinfo
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
- struct page **vdso_pagelist;
+ struct vm_special_mapping *vdso_spec;
+ struct vm_area_struct *vma;
unsigned long vdso_pages;
unsigned long vdso_base;
int rc;
@@ -133,11 +140,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
return 0;
if (is_32bit_task()) {
- vdso_pagelist = vdso32_pagelist;
+ vdso_spec = &vdso32_spec;
vdso_pages = vdso32_pages;
vdso_base = VDSO32_MBASE;
} else {
- vdso_pagelist = vdso64_pagelist;
+ vdso_spec = &vdso64_spec;
vdso_pages = vdso64_pages;
/*
* On 64bit we don't have a preferred map address. This
@@ -194,12 +201,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
* It's fine to use that for setting breakpoints in the vDSO code
* pages though.
*/
- rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
- VM_READ|VM_EXEC|
- VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
- vdso_pagelist);
- if (rc) {
+ vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT,
+ VM_READ | VM_EXEC | VM_MAYREAD |
+ VM_MAYWRITE | VM_MAYEXEC, vdso_spec);
+ if (IS_ERR(vma)) {
current->mm->context.vdso_base = 0;
+ rc = PTR_ERR(vma);
goto fail_mmapsem;
}
@@ -211,15 +218,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
return rc;
}
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
- if (vma->vm_mm && vma->vm_start == vma->vm_mm->context.vdso_base)
- return "[vdso]";
- return NULL;
-}
-
-
-
#ifdef CONFIG_VDSO32
static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname,
unsigned long *size)
@@ -685,6 +683,7 @@ early_initcall(vdso_getcpu_init);
static int __init vdso_init(void)
{
int i;
+ struct page **pagelist;
#ifdef CONFIG_PPC64
/*
@@ -740,27 +739,29 @@ static int __init vdso_init(void)
if (IS_ENABLED(CONFIG_VDSO32)) {
/* Make sure pages are in the correct state */
- vdso32_pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *),
- GFP_KERNEL);
- if (!vdso32_pagelist)
+ pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *), GFP_KERNEL);
+ if (!pagelist)
goto alloc_failed;
for (i = 0; i < vdso32_pages; i++)
- vdso32_pagelist[i] = virt_to_page(vdso32_kbase + i * PAGE_SIZE);
+ pagelist[i] = virt_to_page(vdso32_kbase + i * PAGE_SIZE);
+
+ pagelist[i++] = virt_to_page(vdso_data);
- vdso32_pagelist[i] = virt_to_page(vdso_data);
+ vdso32_spec.pages = pagelist;
}
if (IS_ENABLED(CONFIG_PPC64)) {
- vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *),
- GFP_KERNEL);
- if (!vdso64_pagelist)
+ pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), GFP_KERNEL);
+ if (!pagelist)
goto alloc_failed;
for (i = 0; i < vdso64_pages; i++)
- vdso64_pagelist[i] = virt_to_page(vdso64_kbase + i * PAGE_SIZE);
+ pagelist[i] = virt_to_page(vdso64_kbase + i * PAGE_SIZE);
+
+ pagelist[i++] = virt_to_page(vdso_data);
- vdso64_pagelist[i] = virt_to_page(vdso_data);
+ vdso64_spec.pages = pagelist;
}
smp_wmb();
--
2.25.0
^ permalink raw reply related
* [PATCH v1 9/9] powerpc/vdso: Remove unused \tmp param in __get_datapage()
From: Christophe Leroy @ 2020-08-25 13:54 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <df48ed76cf8a756a7f97ed42a1a39d0a404014bc.1598363608.git.christophe.leroy@csgroup.eu>
The \tmp param is not used anymore, remove it.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/vdso/gettimeofday.h | 4 ++--
arch/powerpc/include/asm/vdso_datapage.h | 2 +-
arch/powerpc/kernel/vdso32/cacheflush.S | 2 +-
arch/powerpc/kernel/vdso32/datapage.S | 4 ++--
arch/powerpc/kernel/vdso64/cacheflush.S | 2 +-
arch/powerpc/kernel/vdso64/datapage.S | 4 ++--
6 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
index 59a609a48b63..8602f1243e8d 100644
--- a/arch/powerpc/include/asm/vdso/gettimeofday.h
+++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
@@ -22,7 +22,7 @@
#ifdef CONFIG_PPC64
PPC_STL r2, STACK_FRAME_OVERHEAD + STK_GOT(r1)
#endif
- get_datapage r5, r0
+ get_datapage r5
addi r5, r5, VDSO_DATA_OFFSET
bl \funct
PPC_LL r0, STACK_FRAME_OVERHEAD + PPC_LR_STKOFF(r1)
@@ -51,7 +51,7 @@
#ifdef CONFIG_PPC64
PPC_STL r2, STACK_FRAME_OVERHEAD + STK_GOT(r1)
#endif
- get_datapage r4, r0
+ get_datapage r4
addi r4, r4, VDSO_DATA_OFFSET
bl \funct
PPC_LL r0, STACK_FRAME_OVERHEAD + PPC_LR_STKOFF(r1)
diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index 2bc415f7714c..71f44598f392 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -102,7 +102,7 @@ extern struct vdso_arch_data *vdso_data;
#else /* __ASSEMBLY__ */
-.macro get_datapage ptr, tmp
+.macro get_datapage ptr
bcl 20, 31, .+4
999:
mflr \ptr
diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S
index 3440ddf21c8b..017843bf5382 100644
--- a/arch/powerpc/kernel/vdso32/cacheflush.S
+++ b/arch/powerpc/kernel/vdso32/cacheflush.S
@@ -27,7 +27,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
#ifdef CONFIG_PPC64
mflr r12
.cfi_register lr,r12
- get_datapage r10, r0
+ get_datapage r10
mtlr r12
#endif
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 5513a4f8253e..0513a2eabec8 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -28,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
mflr r12
.cfi_register lr,r12
mr. r4,r3
- get_datapage r3, r0
+ get_datapage r3
mtlr r12
addi r3,r3,CFG_SYSCALL_MAP32
beqlr
@@ -49,7 +49,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
.cfi_startproc
mflr r12
.cfi_register lr,r12
- get_datapage r3, r0
+ get_datapage r3
lwz r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
lwz r3,CFG_TB_TICKS_PER_SEC(r3)
mtlr r12
diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S
index cab14324242b..61985de5758f 100644
--- a/arch/powerpc/kernel/vdso64/cacheflush.S
+++ b/arch/powerpc/kernel/vdso64/cacheflush.S
@@ -25,7 +25,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
.cfi_startproc
mflr r12
.cfi_register lr,r12
- get_datapage r10, r0
+ get_datapage r10
mtlr r12
lwz r7,CFG_DCACHE_BLOCKSZ(r10)
diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
index 03bb72c440dc..00760dc69d68 100644
--- a/arch/powerpc/kernel/vdso64/datapage.S
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -28,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
mflr r12
.cfi_register lr,r12
mr r4,r3
- get_datapage r3, r0
+ get_datapage r3
mtlr r12
addi r3,r3,CFG_SYSCALL_MAP64
cmpldi cr0,r4,0
@@ -50,7 +50,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
.cfi_startproc
mflr r12
.cfi_register lr,r12
- get_datapage r3, r0
+ get_datapage r3
ld r3,CFG_TB_TICKS_PER_SEC(r3)
mtlr r12
crclr cr0*4+so
--
2.25.0
^ permalink raw reply related
* [PATCH v1 7/9] powerpc/vdso: Move vdso datapage up front
From: Christophe Leroy @ 2020-08-25 13:54 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <df48ed76cf8a756a7f97ed42a1a39d0a404014bc.1598363608.git.christophe.leroy@csgroup.eu>
Move the vdso datapage in front of the VDSO area,
before the vdso text.
This will allow removing the __kernel_datapage_offset symbol
and simplifying __get_datapage() in the following patch.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/mmu_context.h | 4 +++-
arch/powerpc/kernel/vdso.c | 22 ++++++++++------------
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 7f3658a97384..be18ad12bb54 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -262,7 +262,9 @@ extern void arch_exit_mmap(struct mm_struct *mm);
static inline void arch_unmap(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
- if (start <= mm->context.vdso_base && mm->context.vdso_base < end)
+ unsigned long vdso_base = mm->context.vdso_base - PAGE_SIZE;
+
+ if (start <= vdso_base && vdso_base < end)
mm->context.vdso_base = 0;
}
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index b9270923452e..1d72c4b7672f 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -124,7 +124,7 @@ static int vdso_mremap(unsigned long vdso_pages,
if (new_size != vdso_size)
return -EINVAL;
- current->mm->context.vdso_base = (unsigned long)new_vma->vm_start;
+ current->mm->context.vdso_base = (unsigned long)new_vma->vm_start + PAGE_SIZE;
return 0;
}
@@ -217,7 +217,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
* install_special_mapping or the perf counter mmap tracking code
* will fail to recognise it as a vDSO (since arch_vma_name fails).
*/
- current->mm->context.vdso_base = vdso_base;
+ current->mm->context.vdso_base = vdso_base + PAGE_SIZE;
/*
* our vma flags don't have VM_WRITE so by default, the process isn't
@@ -516,8 +516,7 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
return -1;
}
*((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) =
- (vdso64_pages << PAGE_SHIFT) -
- (sym64->st_value - VDSO64_LBASE);
+ (sym64->st_value - VDSO64_LBASE) - PAGE_SIZE;
#endif /* CONFIG_PPC64 */
#ifdef CONFIG_VDSO32
@@ -528,8 +527,7 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
return -1;
}
*((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) =
- (vdso32_pages << PAGE_SHIFT) -
- (sym32->st_value - VDSO32_LBASE);
+ (sym32->st_value - VDSO32_LBASE) - PAGE_SIZE;
#endif
return 0;
@@ -771,10 +769,10 @@ static int __init vdso_init(void)
if (!pagelist)
goto alloc_failed;
- for (i = 0; i < vdso32_pages; i++)
- pagelist[i] = virt_to_page(vdso32_kbase + i * PAGE_SIZE);
+ pagelist[0] = virt_to_page(vdso_data);
- pagelist[i++] = virt_to_page(vdso_data);
+ for (i = 0; i < vdso32_pages; i++)
+ pagelist[i + 1] = virt_to_page(vdso32_kbase + i * PAGE_SIZE);
vdso32_spec.pages = pagelist;
}
@@ -784,10 +782,10 @@ static int __init vdso_init(void)
if (!pagelist)
goto alloc_failed;
- for (i = 0; i < vdso64_pages; i++)
- pagelist[i] = virt_to_page(vdso64_kbase + i * PAGE_SIZE);
+ pagelist[0] = virt_to_page(vdso_data);
- pagelist[i++] = virt_to_page(vdso_data);
+ for (i = 0; i < vdso64_pages; i++)
+ pagelist[i + 1] = virt_to_page(vdso64_kbase + i * PAGE_SIZE);
vdso64_spec.pages = pagelist;
}
--
2.25.0
^ permalink raw reply related
* [PATCH v1 6/9] powerpc/vdso: Provide vdso_remap()
From: Christophe Leroy @ 2020-08-25 13:54 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <df48ed76cf8a756a7f97ed42a1a39d0a404014bc.1598363608.git.christophe.leroy@csgroup.eu>
Provide vdso_mremap() through _install_special_mapping() and
drop arch_remap().
This adds a check of the new mapping size and returns -EINVAL if the
size is not correct.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/mm-arch-hooks.h | 25 ---------------------
arch/powerpc/kernel/vdso.c | 28 ++++++++++++++++++++++++
2 files changed, 28 insertions(+), 25 deletions(-)
delete mode 100644 arch/powerpc/include/asm/mm-arch-hooks.h
diff --git a/arch/powerpc/include/asm/mm-arch-hooks.h b/arch/powerpc/include/asm/mm-arch-hooks.h
deleted file mode 100644
index dce274be824a..000000000000
--- a/arch/powerpc/include/asm/mm-arch-hooks.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Architecture specific mm hooks
- *
- * Copyright (C) 2015, IBM Corporation
- * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
- */
-
-#ifndef _ASM_POWERPC_MM_ARCH_HOOKS_H
-#define _ASM_POWERPC_MM_ARCH_HOOKS_H
-
-static inline void arch_remap(struct mm_struct *mm,
- unsigned long old_start, unsigned long old_end,
- unsigned long new_start, unsigned long new_end)
-{
- /*
- * mremap() doesn't allow moving multiple vmas so we can limit the
- * check to old_start == vdso_base.
- */
- if (old_start == mm->context.vdso_base)
- mm->context.vdso_base = new_start;
-}
-#define arch_remap arch_remap
-
-#endif /* _ASM_POWERPC_MM_ARCH_HOOKS_H */
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 4ccfc0dc96b5..b9270923452e 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -114,13 +114,41 @@ struct lib64_elfinfo
unsigned long text;
};
+static int vdso_mremap(unsigned long vdso_pages,
+ const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+ unsigned long vdso_size = (vdso_pages + 1) << PAGE_SHIFT;
+
+ if (new_size != vdso_size)
+ return -EINVAL;
+
+ current->mm->context.vdso_base = (unsigned long)new_vma->vm_start;
+
+ return 0;
+}
+
+static int vdso32_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ return vdso_mremap(vdso32_pages, sm, new_vma);
+}
+
+static int vdso64_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ return vdso_mremap(vdso64_pages, sm, new_vma);
+}
static struct vm_special_mapping vdso32_spec __ro_after_init = {
.name = "[vdso]",
+ .mremap = vdso32_mremap,
};
static struct vm_special_mapping vdso64_spec __ro_after_init = {
.name = "[vdso]",
+ .mremap = vdso64_mremap,
};
/*
--
2.25.0
^ permalink raw reply related
* [PATCH v1 8/9] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
From: Christophe Leroy @ 2020-08-25 13:54 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <df48ed76cf8a756a7f97ed42a1a39d0a404014bc.1598363608.git.christophe.leroy@csgroup.eu>
The VDSO datapage and the text pages are always located immediately
next to each other, so the datapage offset can be hardcoded without an
indirection through __kernel_datapage_offset.
Before:
clock-getres-realtime-coarse: vdso: 714 nsec/call
clock-gettime-realtime-coarse: vdso: 792 nsec/call
clock-gettime-realtime: vdso: 1243 nsec/call
After:
clock-getres-realtime-coarse: vdso: 699 nsec/call
clock-gettime-realtime-coarse: vdso: 784 nsec/call
clock-gettime-realtime: vdso: 1231 nsec/call
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
arch/powerpc/include/asm/vdso_datapage.h | 8 +++--
arch/powerpc/kernel/vdso.c | 37 ------------------------
arch/powerpc/kernel/vdso32/datapage.S | 3 --
arch/powerpc/kernel/vdso32/vdso32.lds.S | 7 ++---
arch/powerpc/kernel/vdso64/datapage.S | 3 --
arch/powerpc/kernel/vdso64/vdso64.lds.S | 7 ++---
6 files changed, 9 insertions(+), 56 deletions(-)
diff --git a/arch/powerpc/include/asm/vdso_datapage.h b/arch/powerpc/include/asm/vdso_datapage.h
index c4d320504d26..2bc415f7714c 100644
--- a/arch/powerpc/include/asm/vdso_datapage.h
+++ b/arch/powerpc/include/asm/vdso_datapage.h
@@ -104,10 +104,12 @@ extern struct vdso_arch_data *vdso_data;
.macro get_datapage ptr, tmp
bcl 20, 31, .+4
+999:
mflr \ptr
- addi \ptr, \ptr, (__kernel_datapage_offset - (.-4))@l
- lwz \tmp, 0(\ptr)
- add \ptr, \tmp, \ptr
+#if CONFIG_PPC_PAGE_SHIFT > 14
+ addis \ptr, \ptr, (_vdso_datapage - 999b)@ha
+#endif
+ addi \ptr, \ptr, (_vdso_datapage - 999b)@l
.endm
#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 1d72c4b7672f..e2568d9ecdff 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -500,40 +500,6 @@ static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32,
vdso32_rt_sigtramp = find_function32(v32, "__kernel_sigtramp_rt32");
}
-static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
- struct lib64_elfinfo *v64)
-{
-#ifdef CONFIG_VDSO32
- Elf32_Sym *sym32;
-#endif
-#ifdef CONFIG_PPC64
- Elf64_Sym *sym64;
-
- sym64 = find_symbol64(v64, "__kernel_datapage_offset");
- if (sym64 == NULL) {
- printk(KERN_ERR "vDSO64: Can't find symbol "
- "__kernel_datapage_offset !\n");
- return -1;
- }
- *((int *)(vdso64_kbase + sym64->st_value - VDSO64_LBASE)) =
- (sym64->st_value - VDSO64_LBASE) - PAGE_SIZE;
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_VDSO32
- sym32 = find_symbol32(v32, "__kernel_datapage_offset");
- if (sym32 == NULL) {
- printk(KERN_ERR "vDSO32: Can't find symbol "
- "__kernel_datapage_offset !\n");
- return -1;
- }
- *((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) =
- (sym32->st_value - VDSO32_LBASE) - PAGE_SIZE;
-#endif
-
- return 0;
-}
-
-
static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
struct lib64_elfinfo *v64)
{
@@ -634,9 +600,6 @@ static __init int vdso_setup(void)
if (vdso_do_find_sections(&v32, &v64))
return -1;
- if (vdso_fixup_datapage(&v32, &v64))
- return -1;
-
if (vdso_fixup_features(&v32, &v64))
return -1;
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 217bb630f8f9..5513a4f8253e 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -13,9 +13,6 @@
#include <asm/vdso_datapage.h>
.text
- .global __kernel_datapage_offset;
-__kernel_datapage_offset:
- .long 0
/*
* void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S
index 582c5b046cc9..25be27b47a9f 100644
--- a/arch/powerpc/kernel/vdso32/vdso32.lds.S
+++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S
@@ -4,6 +4,7 @@
* library
*/
#include <asm/vdso.h>
+#include <asm/page.h>
#ifdef __LITTLE_ENDIAN__
OUTPUT_FORMAT("elf32-powerpcle", "elf32-powerpcle", "elf32-powerpcle")
@@ -15,6 +16,7 @@ ENTRY(_start)
SECTIONS
{
+ PROVIDE(_vdso_datapage = . - PAGE_SIZE);
. = VDSO32_LBASE + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
@@ -139,11 +141,6 @@ VERSION
{
VDSO_VERSION_STRING {
global:
- /*
- * Has to be there for the kernel to find
- */
- __kernel_datapage_offset;
-
__kernel_get_syscall_map;
#ifndef CONFIG_PPC_BOOK3S_601
__kernel_gettimeofday;
diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
index 067247d3efb9..03bb72c440dc 100644
--- a/arch/powerpc/kernel/vdso64/datapage.S
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -13,9 +13,6 @@
#include <asm/vdso_datapage.h>
.text
-.global __kernel_datapage_offset;
-__kernel_datapage_offset:
- .long 0
/*
* void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S
index 4e3a8d4ee614..aaa5acf6d1b9 100644
--- a/arch/powerpc/kernel/vdso64/vdso64.lds.S
+++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S
@@ -4,6 +4,7 @@
* library
*/
#include <asm/vdso.h>
+#include <asm/page.h>
#ifdef __LITTLE_ENDIAN__
OUTPUT_FORMAT("elf64-powerpcle", "elf64-powerpcle", "elf64-powerpcle")
@@ -15,6 +16,7 @@ ENTRY(_start)
SECTIONS
{
+ PROVIDE(_vdso_datapage = . - PAGE_SIZE);
. = VDSO64_LBASE + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
@@ -139,11 +141,6 @@ VERSION
{
VDSO_VERSION_STRING {
global:
- /*
- * Has to be there for the kernel to find
- */
- __kernel_datapage_offset;
-
__kernel_get_syscall_map;
__kernel_gettimeofday;
__kernel_clock_gettime;
--
2.25.0
^ permalink raw reply related
* Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
From: Christophe Leroy @ 2020-08-25 14:15 UTC (permalink / raw)
To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras, nathanl
Cc: linux-arch, arnd, linux-kernel, luto, tglx, vincenzo.frascino,
linuxppc-dev
In-Reply-To: <2f9b7d02-9e2f-4724-2608-c5573f6507a2@csgroup.eu>
Le 04/08/2020 à 13:17, Christophe Leroy a écrit :
>
>
> On 07/16/2020 02:59 AM, Michael Ellerman wrote:
>> Christophe Leroy <christophe.leroy@c-s.fr> writes:
>>> The VDSO datapage and the text pages are always located immediately
>>> next to each other, so it can be hardcoded without an indirection
>>> through __kernel_datapage_offset
>>>
>>> In order to ease things, move the data page in front like other
>>> arches, that way there is no need to know the size of the library
>>> to locate the data page.
>>>
[...]
>>
>> I merged this but then realised it breaks the display of the vdso in
>> /proc/self/maps.
>>
>> ie. the vdso vma gets no name:
>>
>> # cat /proc/self/maps
[...]
>>
>>
>> And it's also going to break the logic in arch_unmap() to detect if
>> we're unmapping (part of) the VDSO. And it will break arch_remap() too.
>>
>> And the logic to recognise the signal trampoline in
>> arch/powerpc/perf/callchain_*.c as well.
>
> I don't think it breaks that one, because ->vdsobase is still the start
> of text.
>
>>
>> So I'm going to rebase and drop this for now.
>>
>> Basically we have a bunch of places that assume that vdso_base is == the
>> start of the VDSO vma, and also that the code starts there. So that will
>> need some work to tease out all those assumptions and make them work
>> with this change.
>
> Ok, one day I need to look at it in more details and see how other
> architectures handle it etc ...
>
I just sent out a series which switches powerpc to the new
_install_special_mapping() API, the one powerpc uses being deprecated
since commit a62c34bd2a8a ("x86, mm: Improve _install_special_mapping
and fix x86 vdso naming")
arch_remap() gets replaced by vdso_mremap().
For arch_unmap(), I'm wondering what other architectures do, because
powerpc seems to be the only one to erase the vdso context pointer when
unmapping the vdso. For now I have updated it to take the swapped order
of the data and text pages into account.
Everything else is not impacted because our vdso_base is still the base
of the text and that's what those things (signal trampoline, callchain,
...) expect.
Maybe we should change it to 'void *vdso' in the same way as other
architectures, as it is no longer the exact base of the VDSO mapping but
the start of the VDSO text.
Note that the series applies on top of the generic C VDSO implementation
series. However, all but the last commit apply cleanly without that
series. As that last commit is just a follow-up cleanup, it can come in
a second step.
Christophe
^ permalink raw reply
* Re: [PATCH 17/29] fs_enet: Avoid comma separated statements
From: David Miller @ 2020-08-25 14:55 UTC (permalink / raw)
To: joe; +Cc: trivial, netdev, linux-kernel, kuba, linuxppc-dev
In-Reply-To: <418850ae2026b293ea6ba3b8b19b8a7f8dfcaf3d.1598331149.git.joe@perches.com>
From: Joe Perches <joe@perches.com>
Date: Mon, 24 Aug 2020 21:56:14 -0700
> Use semicolons and braces.
>
> Signed-off-by: Joe Perches <joe@perches.com>
Applied.
^ permalink raw reply
* [PATCH v7 00/12] huge vmalloc mappings
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
I think it's ready to go into -mm if it gets acks for the arch
changes.
Thanks,
Nick
Since v6:
- Fixed a false positive warning introduced in patch 2, found by
kbuild test robot.
Since v5:
- Split arch changes out better and make the constant folding work
- Avoid most of the 80 column wrap, fix a reference to lib/ioremap.c
- Fix compile error on some archs
Since v4:
- Fixed an off-by-page-order bug in v4
- Several minor cleanups.
- Added page order to /proc/vmallocinfo
- Added hugepage to alloc_large_system_hash output.
- Made an architecture config option, powerpc only for now.
Since v3:
- Fixed an off-by-one bug in a loop
- Fix !CONFIG_HAVE_ARCH_HUGE_VMAP build fail
- Hopefully this time fix the arm64 vmap stack bug, thanks Jonathan
Cameron for debugging the cause of this (hopefully).
Since v2:
- Rebased on vmalloc cleanups, split series into simpler pieces.
- Fixed several compile errors and warnings
- Keep the page array and accounting in small page units because
struct vm_struct is an interface (this should fix x86 vmap stack debug
assert). [Thanks Zefan]
Nicholas Piggin (12):
mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
mm: apply_to_pte_range warn and fail if a large pte is encountered
mm/vmalloc: rename vmap_*_range vmap_pages_*_range
mm/ioremap: rename ioremap_*_range to vmap_*_range
mm: HUGE_VMAP arch support cleanup
powerpc: inline huge vmap supported functions
arm64: inline huge vmap supported functions
x86: inline huge vmap supported functions
mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c
mm/vmalloc: add vmap_range_noflush variant
mm/vmalloc: Hugepage vmalloc mappings
powerpc/64s/radix: Enable huge vmalloc mappings
.../admin-guide/kernel-parameters.txt | 2 +
arch/Kconfig | 4 +
arch/arm64/include/asm/vmalloc.h | 25 +
arch/arm64/mm/mmu.c | 26 -
arch/powerpc/Kconfig | 1 +
arch/powerpc/include/asm/vmalloc.h | 21 +
arch/powerpc/mm/book3s64/radix_pgtable.c | 21 -
arch/x86/include/asm/vmalloc.h | 23 +
arch/x86/mm/ioremap.c | 19 -
arch/x86/mm/pgtable.c | 13 -
include/linux/io.h | 9 -
include/linux/vmalloc.h | 10 +
init/main.c | 1 -
mm/ioremap.c | 225 +--------
mm/memory.c | 60 ++-
mm/page_alloc.c | 5 +-
mm/vmalloc.c | 443 +++++++++++++++---
17 files changed, 515 insertions(+), 393 deletions(-)
--
2.23.0
^ permalink raw reply
* [PATCH v7 01/12] mm/vmalloc: fix vmalloc_to_page for huge vmap mappings
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
vmalloc_to_page returns NULL for addresses mapped by larger pages[*].
Whether or not a vmap is huge depends on the architecture details,
alignments, boot options, etc., which the caller cannot be expected
to know. Therefore HUGE_VMAP is a regression for vmalloc_to_page.
This change teaches vmalloc_to_page about larger pages, and returns
the struct page that corresponds to the offset within the large page.
This makes the API agnostic to mapping implementation details.
[*] As explained by commit 029c54b095995 ("mm/vmalloc.c: huge-vmap:
fail gracefully on unexpected huge vmap mappings")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
mm/vmalloc.c | 41 ++++++++++++++++++++++++++---------------
1 file changed, 26 insertions(+), 15 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b482d240f9a2..4e9b21adc73d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -36,7 +36,7 @@
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
-
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -343,7 +343,9 @@ int is_vmalloc_or_module_addr(const void *x)
}
/*
- * Walk a vmap address to the struct page it maps.
+ * Walk a vmap address to the struct page it maps. Huge vmap mappings will
+ * return the tail page that corresponds to the base page address, which
+ * matches small vmap mappings.
*/
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
@@ -363,25 +365,33 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pgd_none(*pgd))
return NULL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return NULL; /* XXX: no allowance for huge pgd */
+ if (WARN_ON_ONCE(pgd_bad(*pgd)))
+ return NULL;
+
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return NULL;
- pud = pud_offset(p4d, addr);
+ if (p4d_leaf(*p4d))
+ return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(p4d_bad(*p4d)))
+ return NULL;
- /*
- * Don't dereference bad PUD or PMD (below) entries. This will also
- * identify huge mappings, which we may encounter on architectures
- * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
- * identified as vmalloc addresses by is_vmalloc_addr(), but are
- * not [unambiguously] associated with a struct page, so there is
- * no correct value to return for them.
- */
- WARN_ON_ONCE(pud_bad(*pud));
- if (pud_none(*pud) || pud_bad(*pud))
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return NULL;
+ if (pud_leaf(*pud))
+ return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pud_bad(*pud)))
return NULL;
+
pmd = pmd_offset(pud, addr);
- WARN_ON_ONCE(pmd_bad(*pmd));
- if (pmd_none(*pmd) || pmd_bad(*pmd))
+ if (pmd_none(*pmd))
+ return NULL;
+ if (pmd_leaf(*pmd))
+ return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (WARN_ON_ONCE(pmd_bad(*pmd)))
return NULL;
ptep = pte_offset_map(pmd, addr);
@@ -389,6 +399,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
if (pte_present(pte))
page = pte_page(pte);
pte_unmap(ptep);
+
return page;
}
EXPORT_SYMBOL(vmalloc_to_page);
--
2.23.0
^ permalink raw reply related
* [PATCH v7 02/12] mm: apply_to_pte_range warn and fail if a large pte is encountered
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
apply_to_pte_range might mistake a large pte for bad, or treat it as a
page table, resulting in a crash or corruption. Add a test to warn and
return error if large entries are found.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
mm/memory.c | 60 +++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 44 insertions(+), 16 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 602f4283122f..995b2e790b79 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2262,13 +2262,20 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
}
do {
next = pmd_addr_end(addr, end);
- if (create || !pmd_none_or_clear_bad(pmd)) {
- err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
- create);
- if (err)
- break;
+ if (pmd_none(*pmd) && !create)
+ continue;
+ if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+ return -EINVAL;
+ if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+ if (!create)
+ continue;
+ pmd_clear_bad(pmd);
}
+ err = apply_to_pte_range(mm, pmd, addr, next, fn, data, create);
+ if (err)
+ break;
} while (pmd++, addr = next, addr != end);
+
return err;
}
@@ -2289,13 +2296,20 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
}
do {
next = pud_addr_end(addr, end);
- if (create || !pud_none_or_clear_bad(pud)) {
- err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
- create);
- if (err)
- break;
+ if (pud_none(*pud) && !create)
+ continue;
+ if (WARN_ON_ONCE(pud_leaf(*pud)))
+ return -EINVAL;
+ if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+ if (!create)
+ continue;
+ pud_clear_bad(pud);
}
+ err = apply_to_pmd_range(mm, pud, addr, next, fn, data, create);
+ if (err)
+ break;
} while (pud++, addr = next, addr != end);
+
return err;
}
@@ -2316,13 +2330,20 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
}
do {
next = p4d_addr_end(addr, end);
- if (create || !p4d_none_or_clear_bad(p4d)) {
- err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
- create);
- if (err)
- break;
+ if (p4d_none(*p4d) && !create)
+ continue;
+ if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+ return -EINVAL;
+ if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+ if (!create)
+ continue;
+ p4d_clear_bad(p4d);
}
+ err = apply_to_pud_range(mm, p4d, addr, next, fn, data, create);
+ if (err)
+ break;
} while (p4d++, addr = next, addr != end);
+
return err;
}
@@ -2341,8 +2362,15 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
- if (!create && pgd_none_or_clear_bad(pgd))
+ if (pgd_none(*pgd) && !create)
continue;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+ return -EINVAL;
+ if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+ if (!create)
+ continue;
+ pgd_clear_bad(pgd);
+ }
err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
if (err)
break;
--
2.23.0
^ permalink raw reply related
* [PATCH v7 03/12] mm/vmalloc: rename vmap_*_range vmap_pages_*_range
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
The vmalloc mapper operates on a struct page * array rather than a
linear physical address; rename it to make this distinction clear.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
mm/vmalloc.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4e9b21adc73d..45cd80ec7eeb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -189,7 +189,7 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
arch_sync_kernel_mappings(start, end);
}
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -217,7 +217,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -229,13 +229,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -247,13 +247,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, pgprot_t prot, struct page **pages, int *nr,
pgtbl_mod_mask *mask)
{
@@ -265,7 +265,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
+ if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
@@ -306,7 +306,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
+ err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
--
2.23.0
^ permalink raw reply related
* [PATCH v7 04/12] mm/ioremap: rename ioremap_*_range to vmap_*_range
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
This will be used as a generic kernel virtual mapping function, so
re-name it in preparation.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
mm/ioremap.c | 64 +++++++++++++++++++++++++++-------------------------
1 file changed, 33 insertions(+), 31 deletions(-)
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fa1ab41d152..3f4d36f9745a 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -61,9 +61,9 @@ static inline int ioremap_pud_enabled(void) { return 0; }
static inline int ioremap_pmd_enabled(void) { return 0; }
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
u64 pfn;
@@ -81,9 +81,8 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
{
if (!ioremap_pmd_enabled())
return 0;
@@ -103,9 +102,9 @@ static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
return pmd_set_huge(pmd, phys_addr, prot);
}
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
@@ -116,20 +115,19 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
do {
next = pmd_addr_end(addr, end);
- if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
*mask |= PGTBL_PMD_MODIFIED;
continue;
}
- if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
return -ENOMEM;
} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
}
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
{
if (!ioremap_pud_enabled())
return 0;
@@ -149,9 +147,9 @@ static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
return pud_set_huge(pud, phys_addr, prot);
}
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
@@ -162,20 +160,19 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
do {
next = pud_addr_end(addr, end);
- if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
*mask |= PGTBL_PUD_MODIFIED;
continue;
}
- if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot, mask))
return -ENOMEM;
} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
}
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr,
- pgprot_t prot)
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
{
if (!ioremap_p4d_enabled())
return 0;
@@ -195,9 +192,9 @@ static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
return p4d_set_huge(p4d, phys_addr, prot);
}
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
@@ -208,19 +205,19 @@ static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
do {
next = p4d_addr_end(addr, end);
- if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
*mask |= PGTBL_P4D_MODIFIED;
continue;
}
- if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot, mask))
return -ENOMEM;
} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
}
-int ioremap_page_range(unsigned long addr,
- unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+static int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
{
pgd_t *pgd;
unsigned long start;
@@ -235,8 +232,7 @@ int ioremap_page_range(unsigned long addr,
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
- &mask);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, &mask);
if (err)
break;
} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
@@ -249,6 +245,12 @@ int ioremap_page_range(unsigned long addr,
return err;
}
+int ioremap_page_range(unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ return vmap_range(addr, end, phys_addr, prot);
+}
+
#ifdef CONFIG_GENERIC_IOREMAP
void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
{
--
2.23.0
^ permalink raw reply related
* [PATCH v7 05/12] mm: HUGE_VMAP arch support cleanup
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, H. Peter Anvin, Will Deacon, Catalin Marinas, x86,
linux-kernel, Nicholas Piggin, Christoph Hellwig, Zefan Li,
Borislav Petkov, Jonathan Cameron, Thomas Gleixner, linuxppc-dev,
Ingo Molnar, linux-arm-kernel
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
This changes the awkward approach where architectures provide init
functions to determine which levels they can provide large mappings for,
to one where the arch is queried for each call.
This removes code and indirection, and allows constant-folding of dead
code for unsupported levels.
This also adds a prot argument to the arch query. This is unused
currently but could help with some architectures (e.g., some powerpc
processors can't map uncacheable memory with large pages).
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Ack or objection from arch maintainers if this goes via the -mm tree?
arch/arm64/include/asm/vmalloc.h | 8 +++
arch/arm64/mm/mmu.c | 10 +--
arch/powerpc/include/asm/vmalloc.h | 8 +++
arch/powerpc/mm/book3s64/radix_pgtable.c | 8 +--
arch/x86/include/asm/vmalloc.h | 7 ++
arch/x86/mm/ioremap.c | 10 +--
include/linux/io.h | 9 ---
include/linux/vmalloc.h | 6 ++
init/main.c | 1 -
mm/ioremap.c | 88 +++++++++---------------
10 files changed, 77 insertions(+), 78 deletions(-)
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 2ca708ab9b20..597b40405319 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -1,4 +1,12 @@
#ifndef _ASM_ARM64_VMALLOC_H
#define _ASM_ARM64_VMALLOC_H
+#include <asm/page.h>
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
#endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 75df62fea1b6..9df7e0058c78 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1304,12 +1304,12 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
return dt_virt;
}
-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
{
- return 0;
+ return false;
}
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot);
{
/*
* Only 4k granule supports level 1 block mappings.
@@ -1319,9 +1319,9 @@ int __init arch_ioremap_pud_supported(void)
!IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
}
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
{
- /* See arch_ioremap_pud_supported() */
+ /* See arch_vmap_pud_supported() */
return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
}
diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h
index b992dfaaa161..105abb73f075 100644
--- a/arch/powerpc/include/asm/vmalloc.h
+++ b/arch/powerpc/include/asm/vmalloc.h
@@ -1,4 +1,12 @@
#ifndef _ASM_POWERPC_VMALLOC_H
#define _ASM_POWERPC_VMALLOC_H
+#include <asm/page.h>
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
#endif /* _ASM_POWERPC_VMALLOC_H */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 28c784976bed..eca83a50bf2e 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1134,13 +1134,13 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
set_pte_at(mm, addr, ptep, pte);
}
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
{
/* HPT does not cope with large pages in the vmalloc area */
return radix_enabled();
}
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
{
return radix_enabled();
}
@@ -1234,7 +1234,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
return 1;
}
-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
{
- return 0;
+ return false;
}
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
index 29837740b520..094ea2b565f3 100644
--- a/arch/x86/include/asm/vmalloc.h
+++ b/arch/x86/include/asm/vmalloc.h
@@ -1,6 +1,13 @@
#ifndef _ASM_X86_VMALLOC_H
#define _ASM_X86_VMALLOC_H
+#include <asm/page.h>
#include <asm/pgtable_areas.h>
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+bool arch_vmap_p4d_supported(pgprot_t prot);
+bool arch_vmap_pud_supported(pgprot_t prot);
+bool arch_vmap_pmd_supported(pgprot_t prot);
+#endif
+
#endif /* _ASM_X86_VMALLOC_H */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 84d85dbd1dad..159bfca757b9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -481,21 +481,21 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
-int __init arch_ioremap_p4d_supported(void)
+bool arch_vmap_p4d_supported(pgprot_t prot)
{
- return 0;
+ return false;
}
-int __init arch_ioremap_pud_supported(void)
+bool arch_vmap_pud_supported(pgprot_t prot)
{
#ifdef CONFIG_X86_64
return boot_cpu_has(X86_FEATURE_GBPAGES);
#else
- return 0;
+ return false;
#endif
}
-int __init arch_ioremap_pmd_supported(void)
+bool arch_vmap_pmd_supported(pgprot_t prot)
{
return boot_cpu_has(X86_FEATURE_PSE);
}
diff --git a/include/linux/io.h b/include/linux/io.h
index 8394c56babc2..f1effd4d7a3c 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -31,15 +31,6 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end,
}
#endif
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-void __init ioremap_huge_init(void);
-int arch_ioremap_p4d_supported(void);
-int arch_ioremap_pud_supported(void);
-int arch_ioremap_pmd_supported(void);
-#else
-static inline void ioremap_huge_init(void) { }
-#endif
-
/*
* Managed iomap interface
*/
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0221f852a7e1..3f6bba4cc9bc 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -84,6 +84,12 @@ struct vmap_area {
};
};
+#ifndef CONFIG_HAVE_ARCH_HUGE_VMAP
+static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; }
+static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; }
+static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; }
+#endif
+
/*
* Highlevel APIs for driver use
*/
diff --git a/init/main.c b/init/main.c
index ae78fb68d231..1c89aa127b8f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -820,7 +820,6 @@ static void __init mm_init(void)
pgtable_init();
debug_objects_mem_init();
vmalloc_init();
- ioremap_huge_init();
/* Should be run before the first non-init thread is created */
init_espfix_bsp();
/* Should be run after espfix64 is set up. */
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 3f4d36f9745a..c67f91164401 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -16,49 +16,16 @@
#include "pgalloc-track.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static int __read_mostly ioremap_p4d_capable;
-static int __read_mostly ioremap_pud_capable;
-static int __read_mostly ioremap_pmd_capable;
-static int __read_mostly ioremap_huge_disabled;
+static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT;
static int __init set_nohugeiomap(char *str)
{
- ioremap_huge_disabled = 1;
+ iomap_max_page_shift = P4D_SHIFT;
return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
-
-void __init ioremap_huge_init(void)
-{
- if (!ioremap_huge_disabled) {
- if (arch_ioremap_p4d_supported())
- ioremap_p4d_capable = 1;
- if (arch_ioremap_pud_supported())
- ioremap_pud_capable = 1;
- if (arch_ioremap_pmd_supported())
- ioremap_pmd_capable = 1;
- }
-}
-
-static inline int ioremap_p4d_enabled(void)
-{
- return ioremap_p4d_capable;
-}
-
-static inline int ioremap_pud_enabled(void)
-{
- return ioremap_pud_capable;
-}
-
-static inline int ioremap_pmd_enabled(void)
-{
- return ioremap_pmd_capable;
-}
-
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static inline int ioremap_p4d_enabled(void) { return 0; }
-static inline int ioremap_pud_enabled(void) { return 0; }
-static inline int ioremap_pmd_enabled(void) { return 0; }
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static const bool iomap_max_page_shift = PAGE_SHIFT;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
@@ -82,9 +49,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot)
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
{
- if (!ioremap_pmd_enabled())
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pmd_supported(prot))
return 0;
if ((end - addr) != PMD_SIZE)
@@ -104,7 +75,7 @@ static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
@@ -115,7 +86,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
do {
next = pmd_addr_end(addr, end);
- if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) {
*mask |= PGTBL_PMD_MODIFIED;
continue;
}
@@ -127,9 +98,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
}
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot)
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
{
- if (!ioremap_pud_enabled())
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pud_supported(prot))
return 0;
if ((end - addr) != PUD_SIZE)
@@ -149,7 +124,7 @@ static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
@@ -160,21 +135,25 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
do {
next = pud_addr_end(addr, end);
- if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) {
*mask |= PGTBL_PUD_MODIFIED;
continue;
}
- if (vmap_pmd_range(pud, addr, next, phys_addr, prot, mask))
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask))
return -ENOMEM;
} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
}
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot)
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
{
- if (!ioremap_p4d_enabled())
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if (!arch_vmap_p4d_supported(prot))
return 0;
if ((end - addr) != P4D_SIZE)
@@ -194,7 +173,7 @@ static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
@@ -205,19 +184,20 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
do {
next = p4d_addr_end(addr, end);
- if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) {
*mask |= PGTBL_P4D_MODIFIED;
continue;
}
- if (vmap_pud_range(p4d, addr, next, phys_addr, prot, mask))
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask))
return -ENOMEM;
} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
}
static int vmap_range(unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot)
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
{
pgd_t *pgd;
unsigned long start;
@@ -232,7 +212,7 @@ static int vmap_range(unsigned long addr, unsigned long end,
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, &mask);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask);
if (err)
break;
} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
@@ -248,7 +228,7 @@ static int vmap_range(unsigned long addr, unsigned long end,
int ioremap_page_range(unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
- return vmap_range(addr, end, phys_addr, prot);
+ return vmap_range(addr, end, phys_addr, prot, iomap_max_page_shift);
}
#ifdef CONFIG_GENERIC_IOREMAP
--
2.23.0
^ permalink raw reply related
* [PATCH v7 06/12] powerpc: inline huge vmap supported functions
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
This allows unsupported levels to be constant folded away, and so
p4d_free_pud_page can be removed because it's no longer linked to.
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Ack or objection if this goes via the -mm tree?
arch/powerpc/include/asm/vmalloc.h | 19 ++++++++++++++++---
arch/powerpc/mm/book3s64/radix_pgtable.c | 21 ---------------------
2 files changed, 16 insertions(+), 24 deletions(-)
diff --git a/arch/powerpc/include/asm/vmalloc.h b/arch/powerpc/include/asm/vmalloc.h
index 105abb73f075..3f0c153befb0 100644
--- a/arch/powerpc/include/asm/vmalloc.h
+++ b/arch/powerpc/include/asm/vmalloc.h
@@ -1,12 +1,25 @@
#ifndef _ASM_POWERPC_VMALLOC_H
#define _ASM_POWERPC_VMALLOC_H
+#include <asm/mmu.h>
#include <asm/page.h>
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-bool arch_vmap_p4d_supported(pgprot_t prot);
-bool arch_vmap_pud_supported(pgprot_t prot);
-bool arch_vmap_pmd_supported(pgprot_t prot);
+static inline bool arch_vmap_p4d_supported(pgprot_t prot)
+{
+ return false;
+}
+
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+ /* HPT does not cope with large pages in the vmalloc area */
+ return radix_enabled();
+}
+
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+ return radix_enabled();
+}
#endif
#endif /* _ASM_POWERPC_VMALLOC_H */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index eca83a50bf2e..27f5837cf145 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1134,22 +1134,6 @@ void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
set_pte_at(mm, addr, ptep, pte);
}
-bool arch_vmap_pud_supported(pgprot_t prot)
-{
- /* HPT does not cope with large pages in the vmalloc area */
- return radix_enabled();
-}
-
-bool arch_vmap_pmd_supported(pgprot_t prot)
-{
- return radix_enabled();
-}
-
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
- return 0;
-}
-
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
pte_t *ptep = (pte_t *)pud;
@@ -1233,8 +1217,3 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
return 1;
}
-
-bool arch_vmap_p4d_supported(pgprot_t prot)
-{
- return false;
-}
--
2.23.0
^ permalink raw reply related
* [PATCH v7 07/12] arm64: inline huge vmap supported functions
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, Will Deacon, Catalin Marinas, linux-kernel,
Nicholas Piggin, Christoph Hellwig, Zefan Li, Jonathan Cameron,
linuxppc-dev, linux-arm-kernel
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
This allows unsupported levels to be constant folded away, and so
p4d_free_pud_page can be removed because it's no longer linked to.
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Ack or objection if this goes via the -mm tree?
arch/arm64/include/asm/vmalloc.h | 23 ++++++++++++++++++++---
arch/arm64/mm/mmu.c | 26 --------------------------
2 files changed, 20 insertions(+), 29 deletions(-)
diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 597b40405319..fc9a12d6cc1a 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -4,9 +4,26 @@
#include <asm/page.h>
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-bool arch_vmap_p4d_supported(pgprot_t prot);
-bool arch_vmap_pud_supported(pgprot_t prot);
-bool arch_vmap_pmd_supported(pgprot_t prot);
+static inline bool arch_vmap_p4d_supported(pgprot_t prot)
+{
+ return false;
+}
+
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+ /*
+ * Only 4k granule supports level 1 block mappings.
+ * SW table walks can't handle removal of intermediate entries.
+ */
+ return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
+ !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+}
+
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+ /* See arch_vmap_pud_supported() */
+ return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+}
#endif
#endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 9df7e0058c78..07093e148957 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1304,27 +1304,6 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
return dt_virt;
}
-bool arch_vmap_p4d_supported(pgprot_t prot)
-{
- return false;
-}
-
-bool arch_vmap_pud_supported(pgprot_t prot);
-{
- /*
- * Only 4k granule supports level 1 block mappings.
- * SW table walks can't handle removal of intermediate entries.
- */
- return IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
- !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
-}
-
-bool arch_vmap_pmd_supported(pgprot_t prot)
-{
- /* See arch_vmap_pud_supported() */
- return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
-}
-
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
{
pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
@@ -1416,11 +1395,6 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
return 1;
}
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
- return 0; /* Don't attempt a block mapping */
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
--
2.23.0
^ permalink raw reply related
* [PATCH v7 08/12] x86: inline huge vmap supported functions
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, H. Peter Anvin, x86, linux-kernel, Nicholas Piggin,
Christoph Hellwig, Zefan Li, Borislav Petkov, Jonathan Cameron,
Thomas Gleixner, linuxppc-dev, Ingo Molnar
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
This allows unsupported levels to be constant folded away, and so
p4d_free_pud_page can be removed because it's no longer linked to.
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Ack or objection if this goes via the -mm tree?
arch/x86/include/asm/vmalloc.h | 22 +++++++++++++++++++---
arch/x86/mm/ioremap.c | 19 -------------------
arch/x86/mm/pgtable.c | 13 -------------
3 files changed, 19 insertions(+), 35 deletions(-)
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
index 094ea2b565f3..e714b00fc0ca 100644
--- a/arch/x86/include/asm/vmalloc.h
+++ b/arch/x86/include/asm/vmalloc.h
@@ -1,13 +1,29 @@
#ifndef _ASM_X86_VMALLOC_H
#define _ASM_X86_VMALLOC_H
+#include <asm/cpufeature.h>
#include <asm/page.h>
#include <asm/pgtable_areas.h>
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-bool arch_vmap_p4d_supported(pgprot_t prot);
-bool arch_vmap_pud_supported(pgprot_t prot);
-bool arch_vmap_pmd_supported(pgprot_t prot);
+static inline bool arch_vmap_p4d_supported(pgprot_t prot)
+{
+ return false;
+}
+
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+#ifdef CONFIG_X86_64
+ return boot_cpu_has(X86_FEATURE_GBPAGES);
+#else
+ return false;
+#endif
+}
+
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+ return boot_cpu_has(X86_FEATURE_PSE);
+}
#endif
#endif /* _ASM_X86_VMALLOC_H */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 159bfca757b9..1465a22a9bfb 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -481,25 +481,6 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
-bool arch_vmap_p4d_supported(pgprot_t prot)
-{
- return false;
-}
-
-bool arch_vmap_pud_supported(pgprot_t prot)
-{
-#ifdef CONFIG_X86_64
- return boot_cpu_has(X86_FEATURE_GBPAGES);
-#else
- return false;
-#endif
-}
-
-bool arch_vmap_pmd_supported(pgprot_t prot)
-{
- return boot_cpu_has(X86_FEATURE_PSE);
-}
-
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfd82f51ba66..801c418ee97d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -780,14 +780,6 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
-/*
- * Until we support 512GB pages, skip them in the vmap area.
- */
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
@@ -859,11 +851,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
#else /* !CONFIG_X86_64 */
-int pud_free_pmd_page(pud_t *pud, unsigned long addr)
-{
- return pud_none(*pud);
-}
-
/*
* Disable free page handling on x86-PAE. This assures that ioremap()
* does not update sync'd pmd entries. See vmalloc_sync_one().
--
2.23.0
^ permalink raw reply related
* [PATCH v7 09/12] mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
This is a generic kernel virtual memory mapper, not specific to ioremap.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
include/linux/vmalloc.h | 3 +
mm/ioremap.c | 197 ----------------------------------------
mm/vmalloc.c | 196 +++++++++++++++++++++++++++++++++++++++
3 files changed, 199 insertions(+), 197 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3f6bba4cc9bc..15adb9a14fb6 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -177,6 +177,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);
#ifdef CONFIG_MMU
+int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift);
extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
pgprot_t prot, struct page **pages);
int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
diff --git a/mm/ioremap.c b/mm/ioremap.c
index c67f91164401..d1dcc7e744ac 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -28,203 +28,6 @@ early_param("nohugeiomap", set_nohugeiomap);
static const bool iomap_max_page_shift = PAGE_SHIFT;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
-{
- pte_t *pte;
- u64 pfn;
-
- pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel_track(pmd, addr, mask);
- if (!pte)
- return -ENOMEM;
- do {
- BUG_ON(!pte_none(*pte));
- set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
- pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- *mask |= PGTBL_PTE_MODIFIED;
- return 0;
-}
-
-static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int max_page_shift)
-{
- if (max_page_shift < PMD_SHIFT)
- return 0;
-
- if (!arch_vmap_pmd_supported(prot))
- return 0;
-
- if ((end - addr) != PMD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PMD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PMD_SIZE))
- return 0;
-
- if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
- return 0;
-
- return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
-
- if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) {
- *mask |= PGTBL_PMD_MODIFIED;
- continue;
- }
-
- if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
- return -ENOMEM;
- } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int max_page_shift)
-{
- if (max_page_shift < PUD_SHIFT)
- return 0;
-
- if (!arch_vmap_pud_supported(prot))
- return 0;
-
- if ((end - addr) != PUD_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, PUD_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, PUD_SIZE))
- return 0;
-
- if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
- return 0;
-
- return pud_set_huge(pud, phys_addr, prot);
-}
-
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_alloc_track(&init_mm, p4d, addr, mask);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
-
- if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) {
- *mask |= PGTBL_PUD_MODIFIED;
- continue;
- }
-
- if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask))
- return -ENOMEM;
- } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int max_page_shift)
-{
- if (max_page_shift < P4D_SHIFT)
- return 0;
-
- if (!arch_vmap_p4d_supported(prot))
- return 0;
-
- if ((end - addr) != P4D_SIZE)
- return 0;
-
- if (!IS_ALIGNED(addr, P4D_SIZE))
- return 0;
-
- if (!IS_ALIGNED(phys_addr, P4D_SIZE))
- return 0;
-
- if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
- return 0;
-
- return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int max_page_shift, pgtbl_mod_mask *mask)
-{
- p4d_t *p4d;
- unsigned long next;
-
- p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
- if (!p4d)
- return -ENOMEM;
- do {
- next = p4d_addr_end(addr, end);
-
- if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) {
- *mask |= PGTBL_P4D_MODIFIED;
- continue;
- }
-
- if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask))
- return -ENOMEM;
- } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
- return 0;
-}
-
-static int vmap_range(unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot,
- unsigned int max_page_shift)
-{
- pgd_t *pgd;
- unsigned long start;
- unsigned long next;
- int err;
- pgtbl_mod_mask mask = 0;
-
- might_sleep();
- BUG_ON(addr >= end);
-
- start = addr;
- pgd = pgd_offset_k(addr);
- do {
- next = pgd_addr_end(addr, end);
- err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask);
- if (err)
- break;
- } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
- flush_cache_vmap(start, end);
-
- if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
- arch_sync_kernel_mappings(start, end);
-
- return err;
-}
-
int ioremap_page_range(unsigned long addr,
unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 45cd80ec7eeb..256554d598e6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -70,6 +70,202 @@ static void free_work(struct work_struct *w)
}
/*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PMD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, max_page_shift)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < PUD_SHIFT)
+ return 0;
+
+ if (!arch_vmap_pud_supported(prot))
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, max_page_shift)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ if (max_page_shift < P4D_SHIFT)
+ return 0;
+
+ if (!arch_vmap_p4d_supported(prot))
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, max_page_shift)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, max_page_shift, &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ flush_cache_vmap(start, end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
--
2.23.0
^ permalink raw reply related
* [PATCH v7 10/12] mm/vmalloc: add vmap_range_noflush variant
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
As a side-effect, the order of the flush_cache_vmap() and
arch_sync_kernel_mappings() calls is switched, but that now matches
the other callers in this file.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
mm/vmalloc.c | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 256554d598e6..1d6cad16bda3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -237,7 +237,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
return 0;
}
-int vmap_range(unsigned long addr, unsigned long end,
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
unsigned int max_page_shift)
{
@@ -259,14 +259,24 @@ int vmap_range(unsigned long addr, unsigned long end,
break;
} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
- flush_cache_vmap(start, end);
-
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
return err;
}
+int vmap_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int max_page_shift)
+{
+ int err;
+
+ err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
+ flush_cache_vmap(addr, end);
+
+ return err;
+}
+
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
--
2.23.0
^ permalink raw reply related
* [PATCH v7 11/12] mm/vmalloc: Hugepage vmalloc mappings
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
support PMD-sized vmap mappings.
vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or
larger, and fall back to small pages if that was unsuccessful.
Allocations that do not use PAGE_KERNEL prot are not permitted to use huge
pages, because not all callers expect this (e.g., module allocations vs
strict module rwx).
This reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node
POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
This can result in more internal fragmentation and memory overhead for a
given allocation, so a boot option, nohugevmalloc, is added to disable it.
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
arch/Kconfig | 4 +
include/linux/vmalloc.h | 1 +
mm/page_alloc.c | 5 +-
mm/vmalloc.c | 180 ++++++++++++++++++++++++++++++----------
4 files changed, 145 insertions(+), 45 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index af14a567b493..b2b89d629317 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -616,6 +616,10 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
config HAVE_ARCH_HUGE_VMAP
bool
+config HAVE_ARCH_HUGE_VMALLOC
+ depends on HAVE_ARCH_HUGE_VMAP
+ bool
+
config ARCH_WANT_HUGE_PMD_SHARE
bool
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 15adb9a14fb6..a7449064fe35 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -58,6 +58,7 @@ struct vm_struct {
unsigned long size;
unsigned long flags;
struct page **pages;
+ unsigned int page_order;
unsigned int nr_pages;
phys_addr_t phys_addr;
const void *caller;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0e2bab486fea..b6427cc7b838 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
+#include <linux/vmalloc.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -8102,6 +8103,7 @@ void *__init alloc_large_system_hash(const char *tablename,
void *table = NULL;
gfp_t gfp_flags;
bool virt;
+ bool huge;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -8169,6 +8171,7 @@ void *__init alloc_large_system_hash(const char *tablename,
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
+ huge = (find_vm_area(table)->page_order > 0);
} else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -8185,7 +8188,7 @@ void *__init alloc_large_system_hash(const char *tablename,
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
- virt ? "vmalloc" : "linear");
+ virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
*_hash_shift = log2qty;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d6cad16bda3..8db53c2d7f72 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -44,6 +44,19 @@
#include "internal.h"
#include "pgalloc-track.h"
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+ vmap_allow_huge = false;
+ return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)x;
@@ -477,31 +490,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
return 0;
}
-/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
- * have been allocated using get_vm_area() and its friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is responsible for
- * calling flush_cache_vmap() on to-be-mapped areas before calling this
- * function.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
- unsigned long end = addr + size;
- unsigned long next;
pgd_t *pgd;
+ unsigned long next;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
@@ -523,6 +517,65 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
return 0;
}
+static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
+
+ WARN_ON(page_shift < PAGE_SHIFT);
+
+ if (page_shift == PAGE_SHIFT)
+ return vmap_small_pages_range_noflush(addr, end, prot, pages);
+
+ for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
+ int err;
+
+ err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+ __pa(page_address(pages[i])), prot,
+ page_shift);
+ if (err)
+ return err;
+
+ addr += 1UL << page_shift;
+ }
+
+ return 0;
+}
+
+static int vmap_pages_range(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ int err;
+
+ err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+ flush_cache_vmap(addr, end);
+ return err;
+}
+
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+ pgprot_t prot, struct page **pages)
+{
+ return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
+}
+
int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
struct page **pages)
{
@@ -2400,6 +2453,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
{
int i;
+ /* HUGE_VMALLOC passes small pages to set_direct_map */
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
@@ -2433,11 +2487,12 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << area->page_order) {
unsigned long addr = (unsigned long)page_address(area->pages[i]);
if (addr) {
+ unsigned long page_size = PAGE_SIZE << area->page_order;
start = min(addr, start);
- end = max(addr + PAGE_SIZE, end);
+ end = max(addr + page_size, end);
flush_dmap = 1;
}
}
@@ -2480,11 +2535,11 @@ static void __vunmap(const void *addr, int deallocate_pages)
if (deallocate_pages) {
int i;
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << area->page_order) {
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_pages(page, 0);
+ __free_pages(page, area->page_order);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2623,9 +2678,12 @@ void *vmap(struct page **pages, unsigned int count,
EXPORT_SYMBOL(vmap);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, unsigned int page_shift, int node)
{
struct page **pages;
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long size = get_vm_area_size(area);
+ unsigned int page_order = page_shift - PAGE_SHIFT;
unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
@@ -2633,7 +2691,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
0 :
__GFP_HIGHMEM;
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ nr_pages = size >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
/* Please note that the recursion is strictly bounded. */
@@ -2652,29 +2710,29 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->pages = pages;
area->nr_pages = nr_pages;
+ area->page_order = page_order;
- for (i = 0; i < area->nr_pages; i++) {
+ for (i = 0; i < area->nr_pages; i += 1U << page_order) {
struct page *page;
+ int p;
- if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask|highmem_mask);
- else
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
-
+ page = alloc_pages_node(node, alloc_mask|highmem_mask, page_order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
}
- area->pages[i] = page;
+
+ for (p = 0; p < (1U << page_order); p++)
+ area->pages[i + p] = page + p;
+
if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
- prot, pages) < 0)
+ if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
goto fail;
return area->addr;
@@ -2682,7 +2740,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
- (area->nr_pages*PAGE_SIZE), area->size);
+ (area->nr_pages*PAGE_SIZE), size);
__vfree(area->addr);
return NULL;
}
@@ -2713,19 +2771,42 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
struct vm_struct *area;
void *addr;
unsigned long real_size = size;
+ unsigned long real_align = align;
+ unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size);
if (!size || (size >> PAGE_SHIFT) > totalram_pages())
goto fail;
- area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
+ if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) {
+ unsigned long size_per_node;
+
+ /*
+ * Try huge pages. Only try for PAGE_KERNEL allocations,
+ * others like modules don't yet expect huge pages in
+ * their allocations due to apply_to_page_range not
+ * supporting them.
+ */
+
+ size_per_node = size;
+ if (node == NUMA_NO_NODE)
+ size_per_node /= num_online_nodes();
+ if (size_per_node >= PMD_SIZE) {
+ shift = PMD_SHIFT;
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, 1UL << shift);
+ }
+ }
+
+again:
+ size = PAGE_ALIGN(size);
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
vm_flags, start, end, node, gfp_mask, caller);
if (!area)
goto fail;
- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!addr)
- return NULL;
+ goto fail;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2739,8 +2820,19 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return addr;
fail:
- warn_alloc(gfp_mask, NULL,
+ if (shift > PAGE_SHIFT) {
+ free_vm_area(area);
+ shift = PAGE_SHIFT;
+ align = real_align;
+ size = real_size;
+ goto again;
+ }
+
+ if (!area) {
+ /* Warn for area allocation, page allocations already warn */
+ warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure: %lu bytes", real_size);
+ }
return NULL;
}
@@ -3739,7 +3831,7 @@ static int s_show(struct seq_file *m, void *p)
seq_printf(m, " %pS", v->caller);
if (v->nr_pages)
- seq_printf(m, " pages=%d", v->nr_pages);
+ seq_printf(m, " pages=%d order=%d", v->nr_pages, v->page_order);
if (v->phys_addr)
seq_printf(m, " phys=%pa", &v->phys_addr);
--
2.23.0
^ permalink raw reply related
* [PATCH v7 12/12] powerpc/64s/radix: Enable huge vmalloc mappings
From: Nicholas Piggin @ 2020-08-25 14:57 UTC (permalink / raw)
To: linux-mm, Andrew Morton
Cc: linux-arch, linux-kernel, Nicholas Piggin, Christoph Hellwig,
Zefan Li, Jonathan Cameron, linuxppc-dev
In-Reply-To: <20200825145753.529284-1-npiggin@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
Documentation/admin-guide/kernel-parameters.txt | 2 ++
arch/powerpc/Kconfig | 1 +
2 files changed, 3 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bdc1f33fd3d1..6f0b41289a90 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3190,6 +3190,8 @@
nohugeiomap [KNL,X86,PPC] Disable kernel huge I/O mappings.
+ nohugevmalloc [PPC] Disable kernel huge vmalloc mappings.
+
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
Equivalent to smt=1.
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1f48bbfb3ce9..9171d25ad7dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -175,6 +175,7 @@ config PPC
select GENERIC_TIME_VSYSCALL
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if PPC_BOOK3S_64 && PPC_RADIX_MMU
+ select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if PPC32 && PPC_PAGE_SHIFT <= 14
select HAVE_ARCH_KASAN_VMALLOC if PPC32 && PPC_PAGE_SHIFT <= 14
--
2.23.0
^ permalink raw reply related
* [PATCH net v2] ibmvnic fix NULL tx_pools and rx_tools issue at do_reset
From: Dany Madden @ 2020-08-25 16:56 UTC (permalink / raw)
To: davem; +Cc: Dany Madden, netdev, Mingming Cao, linuxppc-dev
From: Mingming Cao <mmc@linux.vnet.ibm.com>
At the time of do_reset, ibmvnic tries to re-initialize the tx_pools
and rx_pools to avoid re-allocating the long term buffer. However,
there is a window inside do_reset in which the tx_pools and
rx_pools are freed before being re-initialized, making it possible to
dereference NULL pointers.
This patch fixes the issue by always checking that the tx_pool
and rx_pool are not NULL after ibmvnic_login. If either is NULL, the
pools are re-allocated. This avoids calling reset_tx/rx_pools with
a NULL adapter tx_pools/rx_pools pointer. Also add NULL pointer checks in
reset_tx_pools and reset_rx_pools to safely handle the NULL pointer case.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
Signed-off-by: Dany Madden <drt@linux.ibm.com>
---
drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 5afb3c9c52d2..52feee97821e 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -479,6 +479,9 @@ static int reset_rx_pools(struct ibmvnic_adapter *adapter)
int i, j, rc;
u64 *size_array;
+ if (!adapter->rx_pool)
+ return -1;
+
size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size));
@@ -649,6 +652,9 @@ static int reset_tx_pools(struct ibmvnic_adapter *adapter)
int tx_scrqs;
int i, rc;
+ if (!adapter->tx_pool)
+ return -1;
+
tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
for (i = 0; i < tx_scrqs; i++) {
rc = reset_one_tx_pool(adapter, &adapter->tso_pool[i]);
@@ -2011,7 +2017,10 @@ static int do_reset(struct ibmvnic_adapter *adapter,
adapter->req_rx_add_entries_per_subcrq !=
old_num_rx_slots ||
adapter->req_tx_entries_per_subcrq !=
- old_num_tx_slots) {
+ old_num_tx_slots ||
+ !adapter->rx_pool ||
+ !adapter->tso_pool ||
+ !adapter->tx_pool) {
release_rx_pools(adapter);
release_tx_pools(adapter);
release_napi(adapter);
@@ -2024,10 +2033,14 @@ static int do_reset(struct ibmvnic_adapter *adapter,
} else {
rc = reset_tx_pools(adapter);
if (rc)
+ netdev_dbg(adapter->netdev, "reset tx pools failed (%d)\n",
+ rc);
goto out;
rc = reset_rx_pools(adapter);
if (rc)
+ netdev_dbg(adapter->netdev, "reset rx pools failed (%d)\n",
+ rc);
goto out;
}
ibmvnic_disable_irqs(adapter);
--
2.18.2
^ permalink raw reply related
* Re: [PATCH net v2] ibmvnic fix NULL tx_pools and rx_tools issue at do_reset
From: David Miller @ 2020-08-25 17:08 UTC (permalink / raw)
To: drt; +Cc: netdev, mmc, linuxppc-dev
In-Reply-To: <20200825165606.806674-1-drt@linux.ibm.com>
From: Dany Madden <drt@linux.ibm.com>
Date: Tue, 25 Aug 2020 12:56:06 -0400
> @@ -2011,7 +2017,10 @@ static int do_reset(struct ibmvnic_adapter *adapter,
> adapter->req_rx_add_entries_per_subcrq !=
> old_num_rx_slots ||
> adapter->req_tx_entries_per_subcrq !=
> - old_num_tx_slots) {
> + old_num_tx_slots ||
> + !adapter->rx_pool ||
> + !adapter->tso_pool ||
> + !adapter->tx_pool) {
Please don't over-indent these new lines; indent them identically to the
lines above where you are adding new conditions.
Thank you.
^ permalink raw reply
* [PATCH net v3] ibmvnic fix NULL tx_pools and rx_tools issue at do_reset
From: Dany Madden @ 2020-08-25 17:26 UTC (permalink / raw)
To: davem; +Cc: Dany Madden, netdev, Mingming Cao, linuxppc-dev
From: Mingming Cao <mmc@linux.vnet.ibm.com>
At the time of do_reset, ibmvnic tries to re-initialize the tx_pools
and rx_pools to avoid re-allocating the long term buffer. However,
there is a window inside do_reset in which the tx_pools and
rx_pools are freed before being re-initialized, making it possible to
dereference NULL pointers.
This patch fixes the issue by always checking that the tx_pool
and rx_pool are not NULL after ibmvnic_login. If either is NULL, the
pools are re-allocated. This avoids calling reset_tx/rx_pools with
a NULL adapter tx_pools/rx_pools pointer. Also add NULL pointer checks in
reset_tx_pools and reset_rx_pools to safely handle the NULL pointer case.
Signed-off-by: Mingming Cao <mmc@linux.vnet.ibm.com>
Signed-off-by: Dany Madden <drt@linux.ibm.com>
---
drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 5afb3c9c52d2..d3a774331afc 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -479,6 +479,9 @@ static int reset_rx_pools(struct ibmvnic_adapter *adapter)
int i, j, rc;
u64 *size_array;
+ if (!adapter->rx_pool)
+ return -1;
+
size_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
be32_to_cpu(adapter->login_rsp_buf->off_rxadd_buff_size));
@@ -649,6 +652,9 @@ static int reset_tx_pools(struct ibmvnic_adapter *adapter)
int tx_scrqs;
int i, rc;
+ if (!adapter->tx_pool)
+ return -1;
+
tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
for (i = 0; i < tx_scrqs; i++) {
rc = reset_one_tx_pool(adapter, &adapter->tso_pool[i]);
@@ -2011,7 +2017,10 @@ static int do_reset(struct ibmvnic_adapter *adapter,
adapter->req_rx_add_entries_per_subcrq !=
old_num_rx_slots ||
adapter->req_tx_entries_per_subcrq !=
- old_num_tx_slots) {
+ old_num_tx_slots ||
+ !adapter->rx_pool ||
+ !adapter->tso_pool ||
+ !adapter->tx_pool) {
release_rx_pools(adapter);
release_tx_pools(adapter);
release_napi(adapter);
@@ -2024,10 +2033,14 @@ static int do_reset(struct ibmvnic_adapter *adapter,
} else {
rc = reset_tx_pools(adapter);
if (rc)
+ netdev_dbg(adapter->netdev, "reset tx pools failed (%d)\n",
+ rc);
goto out;
rc = reset_rx_pools(adapter);
if (rc)
+ netdev_dbg(adapter->netdev, "reset rx pools failed (%d)\n",
+ rc);
goto out;
}
ibmvnic_disable_irqs(adapter);
--
2.18.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox