* [PATCH] Add per-process flag to control thp
@ 2013-08-02 19:46 Alex Thorlton
2013-08-02 19:53 ` Dave Jones
` (3 more replies)
0 siblings, 4 replies; 57+ messages in thread
From: Alex Thorlton @ 2013-08-02 19:46 UTC (permalink / raw)
To: linux-kernel
Cc: Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman,
Kirill A. Shutemov, Rik van Riel, Johannes Weiner,
Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones,
Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner,
Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt,
linux-kernel
This patch adds two prctl options, PR_SET_THP_DISABLED and PR_GET_THP_DISABLED,
which allow a process to disable the use of transparent hugepages for itself and
for any children it forks afterwards.
We've determined that some jobs perform significantly better with THP disabled,
and we need a way to control THP on a per-process basis without relying on
madvise, which is not an option for static binaries or for legacy code that we
cannot modify.
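For reference, a minimal (untested) userland sketch of how the new options would
be used; PR_SET_THP_DISABLED takes no argument and simply flags the calling task,
and the flag is inherited by any children forked afterwards:

/* thp_disable.c - sketch only; assumes a kernel carrying this patch */
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_SET_THP_DISABLED
#define PR_SET_THP_DISABLED 41	/* values proposed by this patch */
#define PR_GET_THP_DISABLED 42
#endif

int main(int argc, char **argv)
{
	int thp_disabled = -1;

	/* mark this task (and its future children) as not wanting THP */
	if (prctl(PR_SET_THP_DISABLED, 0, 0, 0, 0))
		perror("PR_SET_THP_DISABLED");

	if (prctl(PR_GET_THP_DISABLED, &thp_disabled, 0, 0, 0))
		perror("PR_GET_THP_DISABLED");
	printf("thp_disabled = %d\n", thp_disabled);

	/* exec the real workload with the flag already set */
	if (argc > 1)
		execvp(argv[1], argv + 1);
	return 0;
}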
---
include/linux/huge_mm.h | 14 +++++++++++++-
include/linux/init_task.h | 8 ++++++++
include/linux/sched.h | 3 +++
include/uapi/linux/prctl.h | 3 +++
kernel/fork.c | 4 ++++
kernel/sys.c | 31 +++++++++++++++++++++++++++++++
6 files changed, 62 insertions(+), 1 deletion(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b60de92..53af3ca 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,6 +1,8 @@
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H
+#include <linux/sched.h>
+
extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
@@ -66,7 +68,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,
extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
-#define transparent_hugepage_enabled(__vma) \
+#define _transparent_hugepage_enabled(__vma) \
((transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_FLAG) || \
(transparent_hugepage_flags & \
@@ -177,6 +179,11 @@ static inline struct page *compound_trans_head(struct page *page)
return page;
}
+static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ return !current->thp_disabled && _transparent_hugepage_enabled(vma);
+}
+
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp);
@@ -230,6 +237,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str
return 0;
}
+static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ return _transparent_hugepage_enabled(vma);
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5cd0f09..aae74fd 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -152,6 +152,13 @@ extern struct task_group root_task_group;
# define INIT_VTIME(tsk)
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+# define INIT_THP_DISABLED \
+ .thp_disabled = 0,
+#else
+# define INIT_THP_DISABLED
+#endif
+
#define INIT_TASK_COMM "swapper"
/*
@@ -222,6 +229,7 @@ extern struct task_group root_task_group;
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ \
INIT_VTIME(tsk) \
+ INIT_THP_DISABLED \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50d04b9..f084c76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1406,6 +1406,9 @@ struct task_struct {
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int thp_disabled;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..f69780d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,7 @@
#define PR_GET_TID_ADDRESS 40
+#define PR_SET_THP_DISABLED 41
+#define PR_GET_THP_DISABLED 42
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 403d2bb..0b4afb5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1311,6 +1311,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->sequential_io_avg = 0;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ p->thp_disabled = current->thp_disabled;
+#endif
+
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 771129b..416c8a6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1836,6 +1836,31 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int prctl_set_thp_disabled(struct task_struct *me)
+{
+ me->thp_disabled = 1;
+ return 0;
+}
+
+static int prctl_get_thp_disabled(struct task_struct *me,
+ int __user *thp_disabled)
+{
+ return put_user(me->thp_disabled, thp_disabled);
+}
+#else
+static int prctl_set_thp_disabled(struct task_struct *me)
+{
+ return -EINVAL;
+}
+
+static int prctl_get_thp_disabled(struct task_struct *me,
+ int __user *thp_disabled)
+{
+ return -EINVAL;
+}
+#endif
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -1999,6 +2024,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
return current->no_new_privs ? 1 : 0;
+ case PR_SET_THP_DISABLED:
+ error = prctl_set_thp_disabled(me);
+ break;
+ case PR_GET_THP_DISABLED:
+ error = prctl_get_thp_disabled(me, (int __user *) arg2);
+ break;
default:
error = -EINVAL;
break;
--
1.7.12.4
^ permalink raw reply related [flat|nested] 57+ messages in thread* Re: [PATCH] Add per-process flag to control thp 2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton @ 2013-08-02 19:53 ` Dave Jones 2013-08-02 20:00 ` Alex Thorlton 2013-08-02 20:13 ` Kirill A. Shutemov ` (2 subsequent siblings) 3 siblings, 1 reply; 57+ messages in thread From: Dave Jones @ 2013-08-02 19:53 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 02, 2013 at 02:46:59PM -0500, Alex Thorlton wrote: > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 50d04b9..f084c76 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1406,6 +1406,9 @@ struct task_struct { > unsigned int sequential_io; > unsigned int sequential_io_avg; > #endif > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + int thp_disabled; > +#endif > }; Instead of blowing a whole int on this bool, we could add it to the bitfield a few pages up where we have other prctl things like.. unsigned no_new_privs:1; Dave ^ permalink raw reply [flat|nested] 57+ messages in thread
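A toy userspace illustration (not kernel code; the structs below are purely
hypothetical) of the space point Dave makes above -- folding the flag into an
existing bitfield word costs nothing, while a separate int grows the struct:

/* bitfield_demo.c - standalone sketch, field names only mirror task_struct */
#include <stdio.h>

struct with_int {
	unsigned no_new_privs:1;
	int thp_disabled;		/* what the patch adds */
};

struct with_bit {
	unsigned no_new_privs:1;
	unsigned thp_disabled:1;	/* what Dave suggests */
};

int main(void)
{
	printf("separate int:  %zu bytes\n", sizeof(struct with_int));
	printf("bitfield flag: %zu bytes\n", sizeof(struct with_bit));
	return 0;
}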
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 19:53 ` Dave Jones @ 2013-08-02 20:00 ` Alex Thorlton 0 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-02 20:00 UTC (permalink / raw) To: Dave Jones, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > Instead of blowing a whole int on this bool, we could add it > to the bitfield a few pages up where we have other prctl things like.. > > unsigned no_new_privs:1; Definitely a better way to go. I'll tweak that and float another version of the patch. Thanks for the input, Dave! ^ permalink raw reply [flat|nested] 57+ messages in thread
* RE: [PATCH] Add per-process flag to control thp 2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton 2013-08-02 19:53 ` Dave Jones @ 2013-08-02 20:13 ` Kirill A. Shutemov 2013-08-02 20:34 ` Alex Thorlton 2013-08-04 14:44 ` Rasmus Villemoes 2013-08-28 13:56 ` Andrea Arcangeli 3 siblings, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-08-02 20:13 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt, linux-kernel Alex Thorlton wrote: > This patch implements functionality to allow processes to disable the use of > transparent hugepages through the prctl syscall. > > We've determined that some jobs perform significantly better with thp disabled, > and we need a way to control thp on a per-process basis, without relying on > madvise. What kind of workloads are you talking about? What's wrong with madvise? Could you elaborate? And I think thp_disabled should be reset to 0 on exec. -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 20:13 ` Kirill A. Shutemov @ 2013-08-02 20:34 ` Alex Thorlton 2013-08-02 23:59 ` Kirill A. Shutemov ` (2 more replies) 0 siblings, 3 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-02 20:34 UTC (permalink / raw) To: Kirill A. Shutemov Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > What kind of workloads are you talking about? Our benchmarking team has a list of several of the SPEC OMP benchmarks that perform significantly better when THP is disabled. I tried to get the list but one of our servers is acting up and I can't get to it right now :/ > What's wrong with madvise? Could you elaborate? The main issue with using madvise is that it's not an option with static binaries, but there are also some users who have legacy Fortran code that they're not willing/able to change. > And I think thp_disabled should be reset to 0 on exec. The main purpose for this getting carried down from the parent process is that we'd like to be able to have a userland program set this flag on itself, and then spawn off children who will also carry the flag. This allows us to set the flag for programs where we're unable to modify the code, thus resolving the issues detailed above. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 20:34 ` Alex Thorlton @ 2013-08-02 23:59 ` Kirill A. Shutemov 2013-08-03 19:35 ` Kees Cook 2013-08-05 2:36 ` Andi Kleen 2 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-08-02 23:59 UTC (permalink / raw) To: Alex Thorlton Cc: Kirill A. Shutemov, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Alex Thorlton wrote: > > What kind of workloads are you talking about? > > Our benchmarking team has a list of several of the SPEC OMP benchmarks > that perform significantly better when THP is disabled. I tried to get > the list but one of our servers is acting up and I can't get to it > right now :/ Have you track down the reason why it's slow tih THP? -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 20:34 ` Alex Thorlton 2013-08-02 23:59 ` Kirill A. Shutemov @ 2013-08-03 19:35 ` Kees Cook 2013-08-04 14:19 ` Oleg Nesterov 2013-08-05 2:36 ` Andi Kleen 2 siblings, 1 reply; 57+ messages in thread From: Kees Cook @ 2013-08-03 19:35 UTC (permalink / raw) To: Alex Thorlton Cc: Kirill A. Shutemov, LKML, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Robin Holt On Fri, Aug 2, 2013 at 1:34 PM, Alex Thorlton <athorlton@sgi.com> wrote: >> What kind of workloads are you talking about? > > Our benchmarking team has a list of several of the SPEC OMP benchmarks > that perform significantly better when THP is disabled. I tried to get > the list but one of our servers is acting up and I can't get to it > right now :/ > >> What's wrong with madvise? Could you elaborate? > > The main issue with using madvise is that it's not an option with static > binaries, but there are also some users who have legacy Fortran code > that they're not willing/able to change. > >> And I think thp_disabled should be reset to 0 on exec. > > The main purpose for this getting carried down from the parent process > is that we'd like to be able to have a userland program set this flag on > itself, and then spawn off children who will also carry the flag. > This allows us to set the flag for programs where we're unable to modify > the code, thus resolving the issues detailed above. This could be done with LD_PRELOAD for uncontrolled binaries. Though it does seem sensible to make it act like most personality flags do (i.e. inherited). -Kees -- Kees Cook Chrome OS Security ^ permalink raw reply [flat|nested] 57+ messages in thread
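A sketch of the LD_PRELOAD approach Kees describes, assuming a kernel with this
patch applied; note it still cannot cover static binaries, which is the case
Alex raises above:

/* thp_off_preload.c - sketch only
 * build: gcc -shared -fPIC -o thp_off.so thp_off_preload.c
 * use:   LD_PRELOAD=./thp_off.so ./legacy_binary
 */
#include <sys/prctl.h>

#ifndef PR_SET_THP_DISABLED
#define PR_SET_THP_DISABLED 41	/* value proposed in this thread, not mainline */
#endif

__attribute__((constructor))
static void disable_thp(void)
{
	prctl(PR_SET_THP_DISABLED, 0, 0, 0, 0);
}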
* Re: [PATCH] Add per-process flag to control thp 2013-08-03 19:35 ` Kees Cook @ 2013-08-04 14:19 ` Oleg Nesterov 0 siblings, 0 replies; 57+ messages in thread From: Oleg Nesterov @ 2013-08-04 14:19 UTC (permalink / raw) To: Kees Cook Cc: Alex Thorlton, Kirill A. Shutemov, LKML, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Srikar Dronamraju, Robin Holt On 08/03, Kees Cook wrote: > > On Fri, Aug 2, 2013 at 1:34 PM, Alex Thorlton <athorlton@sgi.com> wrote: > > > >> And I think thp_disabled should be reset to 0 on exec. > > > > The main purpose for this getting carried down from the parent process > > is that we'd like to be able to have a userland program set this flag on > > itself, and then spawn off children who will also carry the flag. > > This allows us to set the flag for programs where we're unable to modify > > the code, thus resolving the issues detailed above. > > This could be done with LD_PRELOAD for uncontrolled binaries. Though > it does seem sensible to make it act like most personality flags do > (i.e. inherited). It is only inheritable if the process is single-threaded. And even if it is single-threaded I can't understand how it can really help. OK, transparent_hugepage_enabled() checks ->thp_disabled, but this is the fault path. But this flag can't hide the task from khugepaged and collapse_huge_page ? I still think this should be mm flag. And we have MMF_INIT_MASK, it can be inheritable. Oleg. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 20:34 ` Alex Thorlton 2013-08-02 23:59 ` Kirill A. Shutemov 2013-08-03 19:35 ` Kees Cook @ 2013-08-05 2:36 ` Andi Kleen 2013-08-05 15:07 ` Alex Thorlton 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton 2 siblings, 2 replies; 57+ messages in thread From: Andi Kleen @ 2013-08-05 2:36 UTC (permalink / raw) To: Alex Thorlton Cc: Kirill A. Shutemov, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Alex Thorlton <athorlton@sgi.com> writes: >> What kind of workloads are you talking about? > > Our benchmarking team has a list of several of the SPEC OMP benchmarks > that perform significantly better when THP is disabled. I tried to get > the list but one of our servers is acting up and I can't get to it > right now :/ Please try it with the vclear patches. https://lkml.org/lkml/2012/7/20/165 et.al. -Andi -- ak@linux.intel.com -- Speaking for myself only ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-05 2:36 ` Andi Kleen @ 2013-08-05 15:07 ` Alex Thorlton 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton 1 sibling, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-05 15:07 UTC (permalink / raw) To: Andi Kleen Cc: Kirill A. Shutemov, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > Please try it with the vclear patches. Thanks, Andi. I'll give that a shot and see if it makes any difference. - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp 2013-08-05 2:36 ` Andi Kleen 2013-08-05 15:07 ` Alex Thorlton @ 2013-08-16 14:33 ` Alex Thorlton 2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton ` (8 more replies) 1 sibling, 9 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:33 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Here are the results from one of the benchmarks that performs particularly poorly when thp is enabled. Unfortunately the vclear patches don't seem to provide a performance boost. I've attached the patches that include the changes I had to make to get the vclear patches applied to the latest kernel. This first set of tests was run on the latest community kernel, with the vclear patches: Kernel string: Kernel 3.11.0-rc5-medusa-00021-g1a15a96-dirty harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled [always] madvise never harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh ... Done. Terminating the simulation. real 25m34.052s user 10769m7.948s sys 37m46.524s harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# echo never > /sys/kernel/mm/transparent_hugepage/enabled harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled always madvise [never] harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh ... Done. Terminating the simulation. real 5m0.377s user 2202m0.684s sys 108m31.816s Here are the same tests on the clean kernel: Kernel string: Kernel 3.11.0-rc5-medusa-00013-g584d88b Kernel string: Kernel 3.11.0-rc5-medusa-00013-g584d88b athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> cat /sys/kernel/mm/transparent_hugepage/enabled [always] madvise never athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> time ./run.sh ... Done. Terminating the simulation. real 21m44.052s user 10809m55.356s sys 39m58.300s harp31-sys:~ # echo never > /sys/kernel/mm/transparent_hugepage/enabled athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> cat /sys/kernel/mm/transparent_hugepage/enabled always madvise [never] athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> time ./run.sh ... Done. Terminating the simulation. real 4m52.502s user 2127m18.548s sys 104m50.828s Working on getting some more information about the root of the performance issues now... 
Alex Thorlton (8): THP: Use real address for NUMA policy mm: make clear_huge_page tolerate non aligned address THP: Pass real, not rounded, address to clear_huge_page x86: Add clear_page_nocache mm: make clear_huge_page cache clear only around the fault address x86: switch the 64bit uncached page clear to SSE/AVX v2 remove KM_USER0 from kmap_atomic call fix up references to kernel_fpu_begin/end arch/x86/include/asm/page.h | 2 + arch/x86/include/asm/string_32.h | 5 ++ arch/x86/include/asm/string_64.h | 5 ++ arch/x86/lib/Makefile | 1 + arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++ arch/x86/lib/clear_page_nocache_64.S | 92 ++++++++++++++++++++++++++++++++++++ arch/x86/mm/fault.c | 7 +++ mm/huge_memory.c | 17 +++---- mm/memory.c | 31 ++++++++++-- 9 files changed, 179 insertions(+), 11 deletions(-) create mode 100644 arch/x86/lib/clear_page_nocache_32.S create mode 100644 arch/x86/lib/clear_page_nocache_64.S -- 1.7.12.4 ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton @ 2013-08-16 14:33 ` Alex Thorlton 2013-08-16 17:53 ` Dave Hansen 2013-08-16 14:33 ` [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address Alex Thorlton ` (7 subsequent siblings) 8 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:33 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- mm/huge_memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a92012a..55ec681 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -746,11 +746,11 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr, int nd, + unsigned long address, int nd, gfp_t extra_gfp) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), - HPAGE_PMD_ORDER, vma, haddr, nd); + HPAGE_PMD_ORDER, vma, address, nd); } #ifndef CONFIG_NUMA @@ -815,7 +815,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + vma, address, numa_node_id(), 0); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1160,7 +1160,7 @@ alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + vma, address, numa_node_id(), 0); else new_page = NULL; -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton @ 2013-08-16 17:53 ` Dave Hansen 2013-08-16 18:17 ` Alex Thorlton 0 siblings, 1 reply; 57+ messages in thread From: Dave Hansen @ 2013-08-16 17:53 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On 08/16/2013 07:33 AM, Alex Thorlton wrote: > --- > mm/huge_memory.c | 8 ++++---- > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index a92012a..55ec681 100644 Could you add some actual descriptions to these patches that say why you are doing this, and why this particular patch is needed and implemented this way? You mention that THP is slow for you, then go on to implement some non-cached page zero'ing, but you never quite connect the dots. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 17:53 ` Dave Hansen @ 2013-08-16 18:17 ` Alex Thorlton 2013-08-16 18:52 ` Kirill A. Shutemov 2013-08-16 19:46 ` Peter Zijlstra 0 siblings, 2 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 18:17 UTC (permalink / raw) To: Dave Hansen Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > Could you add some actual descriptions to these patches that say why you > are doing this, and why this particular patch is needed and implemented > this way? > > You mention that THP is slow for you, then go on to implement some > non-cached page zero'ing, but you never quite connect the dots. I actually didn't write these patches (made a few tweaks to get them running on the latest kernel though). They were submitted last July by Peter Zijlstra. Andi Kleen suggested that I re-run some of my tests using these patches to see whether they solved my issue. I just included my updated patches so that people could confirm that I'd pulled them forward properly. The messages from the original submission can be found here: https://lkml.org/lkml/2012/7/20/165 I did write the patch that these were submitted in response to, to control THP on a per-process basis, but it looks like we're probably going to end up taking this in a different direction, pending some more test results. - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 18:17 ` Alex Thorlton @ 2013-08-16 18:52 ` Kirill A. Shutemov 2013-08-27 16:50 ` Alex Thorlton 2013-08-16 19:46 ` Peter Zijlstra 1 sibling, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-08-16 18:52 UTC (permalink / raw) To: Alex Thorlton Cc: Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote: > > Could you add some actual descriptions to these patches that say why you > > are doing this, and why this particular patch is needed and implemented > > this way? > > > > You mention that THP is slow for you, then go on to implement some > > non-cached page zero'ing, but you never quite connect the dots. > > I actually didn't write these patches (made a few tweaks to get them > running on the latest kernel though). They were submitted last July by > Peter Zijlstra. Andi Kleen suggested that I re-run some of my tests > using these patches to see whether they solved my issue. I just > included my updated patches so that people could confirm that I'd pulled > them forward properly. > > The messages from the original submission can be found here: > https://lkml.org/lkml/2012/7/20/165 Here's more up-to-date version: https://lkml.org/lkml/2012/8/20/337 -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 18:52 ` Kirill A. Shutemov @ 2013-08-27 16:50 ` Alex Thorlton 2013-08-27 17:01 ` Robin Holt [not found] ` <20130828091814.GA13681@gmail.com> 0 siblings, 2 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-27 16:50 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > Here's more up-to-date version: https://lkml.org/lkml/2012/8/20/337 These don't seem to give us a noticeable performance change either: With THP: real 22m34.279s user 10797m35.984s sys 39m18.188s Without THP: real 4m48.957s user 2118m23.208s sys 113m12.740s Looks like we got a few minutes faster on the with THP case, but it's still significantly slower, and that could just be a fluke result; we're still floating at about a 5x performance degradation. I talked with one of our performance/benchmarking experts last week and he's done a bit more research into the actual problem here, so I've got a bit more information: The real performance hit, based on our testing, seems to be coming from the increased latency that comes into play on large NUMA systems when a process has to go off-node to read from/write to memory. To give an extreme example, say we have a 16 node system with 8 cores per node. If we have a job that shares a 2MB data structure between 128 threads, with THP on, the first thread to touch the structure will allocate all 2MB of space for that structure in a 2MB page, local to its socket. This means that all the memory accessses for the other 120 threads will be remote acceses. With THP off, each thread could locally allocate a number of 4K pages sufficient to hold the chunk of the structure on which it needs to work, significantly reducing the number of remote accesses that each thread will need to perform. So, with that in mind, do we agree that a per-process tunable (or something similar) to control THP seems like a reasonable method to handle this issue? Just want to confirm that everyone likes this approach before moving forward with another revision of the patch. I'm currently in favor of moving this to a per-mm tunable, since that seems to make more sense when it comes to threaded jobs. Also, a decent chunk of the code I've already written can be reused with this approach, and prctl will still be an appropriate place from which to control the behavior. Andrew Morton suggested possibly controlling this through the ELF header, but I'm going to lean towards the per-mm route unless anyone has a major objection to it. - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-27 16:50 ` Alex Thorlton @ 2013-08-27 17:01 ` Robin Holt 2013-09-04 15:43 ` Alex Thorlton [not found] ` <20130828091814.GA13681@gmail.com> 1 sibling, 1 reply; 57+ messages in thread From: Robin Holt @ 2013-08-27 17:01 UTC (permalink / raw) To: Alex Thorlton Cc: Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook Alex, Although the explanation seems plausible, have you verified this is actually possible? You could make a simple pthread test case which allocates a getpagesize() * <number-of-threads> area, prints its address and then each thread migrate and reference their page. Have the task then sleep(<long-time>) before exit. Look at the physical address space with dlook for those virtual addresses in both the THP and non-THP cases. Thanks, Robin On Tue, Aug 27, 2013 at 11:50 AM, Alex Thorlton <athorlton@sgi.com> wrote: >> Here's more up-to-date version: https://lkml.org/lkml/2012/8/20/337 > > These don't seem to give us a noticeable performance change either: > > With THP: > > real 22m34.279s > user 10797m35.984s > sys 39m18.188s > > Without THP: > > real 4m48.957s > user 2118m23.208s > sys 113m12.740s > > Looks like we got a few minutes faster on the with THP case, but it's > still significantly slower, and that could just be a fluke result; we're > still floating at about a 5x performance degradation. > > I talked with one of our performance/benchmarking experts last week and > he's done a bit more research into the actual problem here, so I've got > a bit more information: > > The real performance hit, based on our testing, seems to be coming from > the increased latency that comes into play on large NUMA systems when a > process has to go off-node to read from/write to memory. > > To give an extreme example, say we have a 16 node system with 8 cores > per node. If we have a job that shares a 2MB data structure between 128 > threads, with THP on, the first thread to touch the structure will > allocate all 2MB of space for that structure in a 2MB page, local to its > socket. This means that all the memory accessses for the other 120 > threads will be remote acceses. With THP off, each thread could locally > allocate a number of 4K pages sufficient to hold the chunk of the > structure on which it needs to work, significantly reducing the number > of remote accesses that each thread will need to perform. > > So, with that in mind, do we agree that a per-process tunable (or > something similar) to control THP seems like a reasonable method to > handle this issue? > > Just want to confirm that everyone likes this approach before moving > forward with another revision of the patch. I'm currently in favor of > moving this to a per-mm tunable, since that seems to make more sense > when it comes to threaded jobs. Also, a decent chunk of the code I've > already written can be reused with this approach, and prctl will still > be an appropriate place from which to control the behavior. Andrew > Morton suggested possibly controlling this through the ELF header, but > I'm going to lean towards the per-mm route unless anyone has a major > objection to it. > > - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-27 17:01 ` Robin Holt @ 2013-09-04 15:43 ` Alex Thorlton 2013-09-04 17:15 ` Alex Thorlton 0 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-09-04 15:43 UTC (permalink / raw) To: Robin Holt Cc: Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook On Tue, Aug 27, 2013 at 12:01:01PM -0500, Robin Holt wrote: > Alex, > > Although the explanation seems plausible, have you verified this is > actually possible? You could make a simple pthread test case which > allocates a getpagesize() * <number-of-threads> area, prints its > address and then each thread migrate and reference their page. Have > the task then sleep(<long-time>) before exit. Look at the physical > address space with dlook for those virtual addresses in both the THP > and non-THP cases. > > Thanks, > Robin Robin, I tweaked one of our other tests to behave pretty much exactly as I described, and I can see a very significant increase in performance with THP turned off. The test behaves as follows: - malloc a large array - Spawn a specified number of threads - Have each thread touch small, evenly spaced chunks of the array (e.g. for 128 threads, the array is divided into 128 chunks, and each thread touches 1/128th of each chunk, dividing the array into 16,384 pieces) With THP off, the majority of each thread's pages are node local. With THP on, most of the pages end up as THPs on the first thread's nodes, since it is touching chunks that are close enough together to be collapsed into THPs which will, of course, remain on the first node for the duration of the test. Here are some timings for 128 threads, allocating a total of 64gb: THP on: real 1m6.394s user 16m1.160s sys 75m25.232s THP off: real 0m35.251s user 26m37.316s sys 3m28.472s The performance hit here isn't as severe as shown with the SPEC workload that we originally used, but it still appears to consistently take about twice as long with THP enabled. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-09-04 15:43 ` Alex Thorlton @ 2013-09-04 17:15 ` Alex Thorlton 2013-09-05 11:15 ` Ingo Molnar 0 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-09-04 17:15 UTC (permalink / raw) To: Robin Holt Cc: Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook > Robin, > > I tweaked one of our other tests to behave pretty much exactly as I > - malloc a large array > - Spawn a specified number of threads > - Have each thread touch small, evenly spaced chunks of the array (e.g. > for 128 threads, the array is divided into 128 chunks, and each thread > touches 1/128th of each chunk, dividing the array into 16,384 pieces) Forgot to mention that the threads don't touch their chunks of memory concurrently, i.e. thread 2 has to wait for thread 1 to finish first. This is important to note, since the pages won't all get stuck on the first node without this behavior. ^ permalink raw reply [flat|nested] 57+ messages in thread
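A rough standalone sketch of that access pattern (not the actual thp_pthread
test Alex posts later in the thread): one large malloc'd region, with threads
taking turns touching their evenly spaced slices, so that with THP enabled the
first toucher's node ends up backing most of the huge pages:

/* serial_touch.c - illustrative sketch only
 * build: gcc -O2 -pthread serial_touch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define NTHREADS 8
#define TOTAL_SZ (1UL << 30)	/* 1 GiB here; the real runs used 64-128 GiB */

static char *region;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static long turn;		/* thread i waits until thread i-1 is done */

static void *worker(void *arg)
{
	long id = (long)arg;
	size_t chunk = TOTAL_SZ / NTHREADS;	/* region split into NTHREADS chunks */
	size_t slice = chunk / NTHREADS;	/* each thread touches 1/NTHREADS of every chunk */

	pthread_mutex_lock(&lock);
	while (turn != id)
		pthread_cond_wait(&cond, &lock);

	for (long c = 0; c < NTHREADS; c++)
		memset(region + c * chunk + id * slice, 1, slice);

	turn++;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];

	region = malloc(TOTAL_SZ);
	if (!region)
		return 1;

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	for (long i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);

	sleep(600);	/* leave time to inspect physical placement, per Robin's suggestion */
	return 0;
}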
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-09-04 17:15 ` Alex Thorlton @ 2013-09-05 11:15 ` Ingo Molnar 2013-09-09 16:48 ` Alex Thorlton 0 siblings, 1 reply; 57+ messages in thread From: Ingo Molnar @ 2013-09-05 11:15 UTC (permalink / raw) To: Alex Thorlton Cc: Robin Holt, Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook * Alex Thorlton <athorlton@sgi.com> wrote: > > Robin, > > > > I tweaked one of our other tests to behave pretty much exactly as I > > - malloc a large array > > - Spawn a specified number of threads > > - Have each thread touch small, evenly spaced chunks of the array (e.g. > > for 128 threads, the array is divided into 128 chunks, and each thread > > touches 1/128th of each chunk, dividing the array into 16,384 pieces) > > Forgot to mention that the threads don't touch their chunks of memory > concurrently, i.e. thread 2 has to wait for thread 1 to finish first. > This is important to note, since the pages won't all get stuck on the > first node without this behavior. Could you post the testcase please? Thanks, Ingo ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-09-05 11:15 ` Ingo Molnar @ 2013-09-09 16:48 ` Alex Thorlton 2013-09-10 7:47 ` [benchmark] THP performance testcase Ingo Molnar 0 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-09-09 16:48 UTC (permalink / raw) To: Ingo Molnar Cc: Robin Holt, Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook On Thu, Sep 05, 2013 at 01:15:10PM +0200, Ingo Molnar wrote: > > * Alex Thorlton <athorlton@sgi.com> wrote: > > > > Robin, > > > > > > I tweaked one of our other tests to behave pretty much exactly as I > > > - malloc a large array > > > - Spawn a specified number of threads > > > - Have each thread touch small, evenly spaced chunks of the array (e.g. > > > for 128 threads, the array is divided into 128 chunks, and each thread > > > touches 1/128th of each chunk, dividing the array into 16,384 pieces) > > > > Forgot to mention that the threads don't touch their chunks of memory > > concurrently, i.e. thread 2 has to wait for thread 1 to finish first. > > This is important to note, since the pages won't all get stuck on the > > first node without this behavior. > > Could you post the testcase please? > > Thanks, > > Ingo Sorry for the delay here, had to make sure that everything in my tests was okay to push out to the public. Here's a pointer to the test I wrote: ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz Everything to compile the test should be there (just run make in the thp_pthread directory). To run the test use something like: time ./thp_pthread -C 0 -m 0 -c <max_cores> -b <memory> I ran: time ./thp_pthread -C 0 -m 0 -c 128 -b 128g On a 256 core machine, with ~500gb of memory and got these results: THP off: real 0m57.797s user 46m22.156s sys 6m14.220s THP on: real 1m36.906s user 0m2.612s sys 143m13.764s I snagged some code from another test we use, so I can't vouch for the usefulness/accuracy of all the output (actually, I know some of it is wrong). I've mainly been looking at the total run time. Don't want to bloat this e-mail up with too many test results, but I found this one really interesting. Same machine, using all the cores, with the same amount of memory. This means that each cpu is actually doing *less* work, since the chunk we reserve gets divided up evenly amongst the cpus: time ./thp_pthread -C 0 -m 0 -c 256 -b 128g THP off: real 1m1.028s user 104m58.448s sys 8m52.908s THP on: real 2m26.072s user 60m39.404s sys 337m10.072s Seems that the test scales really well in the THP off case, but, once again, with THP on, we really see the performance start to degrade. I'm planning to start investigating possible ways to split up THPs, if we detect that that majority of the references to a THP are off-node. I've heard some horror stories about migrating pages in this situation (i.e., process switches cpu and then all the pages follow it), but I think we might be able to get some better results if we can cleverly determine an appropriate time to split up pages. I've heard a bit of talk about doing something similar to this from a few people, but haven't seen any code/test results. If anybody has any input on that topic, it would be greatly appreciated. 
- Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* [benchmark] THP performance testcase 2013-09-09 16:48 ` Alex Thorlton @ 2013-09-10 7:47 ` Ingo Molnar 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov 0 siblings, 1 reply; 57+ messages in thread From: Ingo Molnar @ 2013-09-10 7:47 UTC (permalink / raw) To: Alex Thorlton Cc: Robin Holt, Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook ( Changed the subject line to make it stand out better on lkml. Mail with link & results quoted below. ) * Alex Thorlton <athorlton@sgi.com> wrote: > [...] Here's a pointer to the test I wrote: > > ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz > > Everything to compile the test should be there (just run make in the > thp_pthread directory). To run the test use something like: > > time ./thp_pthread -C 0 -m 0 -c <max_cores> -b <memory> > > I ran: > > time ./thp_pthread -C 0 -m 0 -c 128 -b 128g > > On a 256 core machine, with ~500gb of memory and got these results: > > THP off: > > real 0m57.797s > user 46m22.156s > sys 6m14.220s > > THP on: > > real 1m36.906s > user 0m2.612s > sys 143m13.764s > > I snagged some code from another test we use, so I can't vouch for the > usefulness/accuracy of all the output (actually, I know some of it is > wrong). I've mainly been looking at the total run time. > > Don't want to bloat this e-mail up with too many test results, but I > found this one really interesting. Same machine, using all the cores, > with the same amount of memory. This means that each cpu is actually > doing *less* work, since the chunk we reserve gets divided up evenly > amongst the cpus: > > time ./thp_pthread -C 0 -m 0 -c 256 -b 128g > > THP off: > > real 1m1.028s > user 104m58.448s > sys 8m52.908s > > THP on: > > real 2m26.072s > user 60m39.404s > sys 337m10.072s > > Seems that the test scales really well in the THP off case, but, once > again, with THP on, we really see the performance start to degrade. > > I'm planning to start investigating possible ways to split up THPs, if > we detect that that majority of the references to a THP are off-node. > I've heard some horror stories about migrating pages in this situation > (i.e., process switches cpu and then all the pages follow it), but I > think we might be able to get some better results if we can cleverly > determine an appropriate time to split up pages. I've heard a bit of > talk about doing something similar to this from a few people, but > haven't seen any code/test results. If anybody has any input on that > topic, it would be greatly appreciated. ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 0/9] split page table lock for PMD tables 2013-09-10 7:47 ` [benchmark] THP performance testcase Ingo Molnar @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov ` (8 more replies) 0 siblings, 9 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Alex Thorlton noticed that some massivly threaded workloads work poorly, if THP enabled. This patchset fixes this by introducing split page table lock for PMD tables. hugetlbfs is not covered yet. This patchset is based on work by Naoya Horiguchi. Benchmark (from Alex): ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz THP off: -------- Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs): 1738259.808012 task-clock # 47.571 CPUs utilized ( +- 9.49% ) 147,359 context-switches # 0.085 K/sec ( +- 9.67% ) 14 cpu-migrations # 0.000 K/sec ( +- 13.25% ) 24,410,139 page-faults # 0.014 M/sec ( +- 0.00% ) 4,149,037,526,252 cycles # 2.387 GHz ( +- 9.50% ) 3,649,839,735,027 stalled-cycles-frontend # 87.97% frontend cycles idle ( +- 6.60% ) 2,455,558,969,567 stalled-cycles-backend # 59.18% backend cycles idle ( +- 22.92% ) 1,434,961,518,604 instructions # 0.35 insns per cycle # 2.54 stalled cycles per insn ( +- 92.86% ) 241,472,020,951 branches # 138.916 M/sec ( +- 91.72% ) 84,022,172 branch-misses # 0.03% of all branches ( +- 3.16% ) 36.540185552 seconds time elapsed ( +- 18.36% ) THP on, no patchset: -------------------- Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs): 2528378.966949 task-clock # 50.715 CPUs utilized ( +- 11.86% ) 214,063 context-switches # 0.085 K/sec ( +- 11.94% ) 19 cpu-migrations # 0.000 K/sec ( +- 22.72% ) 49,226 page-faults # 0.019 K/sec ( +- 0.33% ) 6,034,640,598,498 cycles # 2.387 GHz ( +- 11.91% ) 5,685,933,794,081 stalled-cycles-frontend # 94.22% frontend cycles idle ( +- 7.67% ) 4,414,381,393,353 stalled-cycles-backend # 73.15% backend cycles idle ( +- 2.09% ) 952,086,804,776 instructions # 0.16 insns per cycle # 5.97 stalled cycles per insn ( +- 89.59% ) 166,191,211,974 branches # 65.730 M/sec ( +- 85.52% ) 33,341,022 branch-misses # 0.02% of all branches ( +- 3.90% ) 49.854741504 seconds time elapsed ( +- 14.76% ) THP on, with patchset: ---------------------- echo always > /sys/kernel/mm/transparent_hugepage/enabled Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs): 1538763.343568 task-clock # 45.386 CPUs utilized ( +- 7.21% ) 130,469 context-switches # 0.085 K/sec ( +- 7.32% ) 14 cpu-migrations # 0.000 K/sec ( +- 23.58% ) 49,299 page-faults # 0.032 K/sec ( +- 0.15% ) 3,666,748,502,650 cycles # 2.383 GHz ( +- 7.25% ) 3,330,488,035,212 stalled-cycles-frontend # 90.83% frontend cycles idle ( +- 4.70% ) 2,383,357,073,990 stalled-cycles-backend # 65.00% backend cycles idle ( +- 16.06% ) 935,504,610,528 instructions # 0.26 insns per cycle # 3.56 stalled cycles per insn ( +- 91.16% ) 161,466,689,532 branches # 104.933 M/sec ( +- 87.67% ) 22,602,225 branch-misses # 0.01% of all branches ( +- 6.43% ) 
33.903917543 seconds time elapsed ( +- 12.57% ) Kirill A. Shutemov (9): mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS mm: convert mm->nr_ptes to atomic_t mm: introduce api for split page table lock for PMD level mm, thp: change pmd_trans_huge_lock() to return taken lock mm, thp: move ptl taking inside page_check_address_pmd() mm, thp: do not access mm->pmd_huge_pte directly mm: convent the rest to new page table lock api mm: implement split page table lock for PMD level x86, mm: enable split page table lock for PMD level arch/arm/mm/fault-armv.c | 6 +- arch/s390/mm/pgtable.c | 12 +-- arch/sparc/mm/tlb.c | 12 +-- arch/um/defconfig | 2 +- arch/x86/Kconfig | 4 + arch/x86/include/asm/pgalloc.h | 8 +- arch/x86/xen/mmu.c | 6 +- arch/xtensa/configs/iss_defconfig | 2 +- arch/xtensa/configs/s6105_defconfig | 2 +- fs/proc/task_mmu.c | 15 +-- include/linux/huge_mm.h | 17 +-- include/linux/mm.h | 51 ++++++++- include/linux/mm_types.h | 15 ++- kernel/fork.c | 6 +- mm/Kconfig | 12 ++- mm/huge_memory.c | 201 +++++++++++++++++++++--------------- mm/memcontrol.c | 10 +- mm/memory.c | 21 ++-- mm/migrate.c | 7 +- mm/mmap.c | 3 +- mm/mprotect.c | 4 +- mm/oom_kill.c | 6 +- mm/pgtable-generic.c | 16 +-- mm/rmap.c | 13 +-- 24 files changed, 280 insertions(+), 171 deletions(-) -- 1.8.4.rc3 ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 15:20 ` Dave Hansen 2013-09-13 13:06 ` [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t Kirill A. Shutemov ` (7 subsequent siblings) 8 siblings, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov We're going to introduce split page table lock for PMD level. Let's rename existing split ptlock for PTE level to avoid confusion. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- arch/arm/mm/fault-armv.c | 6 +++--- arch/um/defconfig | 2 +- arch/x86/xen/mmu.c | 6 +++--- arch/xtensa/configs/iss_defconfig | 2 +- arch/xtensa/configs/s6105_defconfig | 2 +- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 8 ++++---- mm/Kconfig | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 2a5907b..ff379ac 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -65,7 +65,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, return ret; } -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS /* * If we are using split PTE locks, then we need to take the page * lock here. Otherwise we are using shared mm->page_table_lock @@ -84,10 +84,10 @@ static inline void do_pte_unlock(spinlock_t *ptl) { spin_unlock(ptl); } -#else /* !USE_SPLIT_PTLOCKS */ +#else /* !USE_SPLIT_PTE_PTLOCKS */ static inline void do_pte_lock(spinlock_t *ptl) {} static inline void do_pte_unlock(spinlock_t *ptl) {} -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, unsigned long pfn) diff --git a/arch/um/defconfig b/arch/um/defconfig index 08107a7..6b0a10f 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -82,7 +82,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_PAGEFLAGS_EXTENDED=y -CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_SPLIT_PTE_PTLOCK_CPUS=4 # CONFIG_COMPACTION is not set # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=0 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fdc3ba2..455c873 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -796,7 +796,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS ptl = __pte_lockptr(page); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -1637,7 +1637,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -1671,7 +1671,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) if (!PageHighMem(page)) { xen_mc_batch(); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 
__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); __set_pfn_prot(pfn, PAGE_KERNEL); diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index 77c52f8..54cc946 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -174,7 +174,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_PAGEFLAGS_EXTENDED=y -CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_SPLIT_PTE_PTLOCK_CPUS=4 # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig index 4799c6a..d802f11 100644 --- a/arch/xtensa/configs/s6105_defconfig +++ b/arch/xtensa/configs/s6105_defconfig @@ -138,7 +138,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_PAGEFLAGS_EXTENDED=y -CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_SPLIT_PTE_PTLOCK_CPUS=4 # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=1 CONFIG_VIRT_TO_BUS=y diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b6e55e..6cf8ddb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1232,7 +1232,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS /* * We tuck a spinlock to guard each pagetable page into its struct page, * at page->private, with BUILD_BUG_ON to make sure that this will not @@ -1245,14 +1245,14 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } while (0) #define pte_lock_deinit(page) ((page)->mapping = NULL) #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) -#else /* !USE_SPLIT_PTLOCKS */ +#else /* !USE_SPLIT_PTE_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ #define pte_lock_init(page) do {} while (0) #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ static inline void pgtable_page_ctor(struct page *page) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index faf4b7c..fe0a4bb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,7 +23,7 @@ struct address_space; -#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) +#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS) /* * Each physical page in the system has a struct page associated with @@ -141,7 +141,7 @@ struct page { * indicates order in the buddy * system if PG_buddy is set. */ -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS spinlock_t ptl; #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ @@ -309,14 +309,14 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTLOCKS && defined(CONFIG_MMU) +#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) #define SPLIT_RSS_COUNTING /* per-thread cached information, */ struct task_rss_stat { int events; /* for synchronization threshold */ int count[NR_MM_COUNTERS]; }; -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; diff --git a/mm/Kconfig b/mm/Kconfig index 026771a..1977a33 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -207,7 +207,7 @@ config PAGEFLAGS_EXTENDED # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. 
# -config SPLIT_PTLOCK_CPUS +config SPLIT_PTE_PTLOCK_CPUS int default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* Re: [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS 2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov @ 2013-09-13 15:20 ` Dave Hansen 0 siblings, 0 replies; 57+ messages in thread From: Dave Hansen @ 2013-09-13 15:20 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On 09/13/2013 06:06 AM, Kirill A. Shutemov wrote: > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -207,7 +207,7 @@ config PAGEFLAGS_EXTENDED > # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. > # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. > # > -config SPLIT_PTLOCK_CPUS > +config SPLIT_PTE_PTLOCK_CPUS > int > default "999999" if ARM && !CPU_CACHE_VIPT > default "999999" if PARISC && !PA20 If someone has a config where this is set to some non-default value, won't changing the name cause this to revert back to the defaults? I don't know how big of a deal it is to other folks, but you can always do this: config SPLIT_PTE_PTLOCK_CPUS int default SPLIT_PTLOCK_CPUS if SPLIT_PTLOCK_CPUS ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 3/9] mm: introduce api for split page table lock for PMD level Kirill A. Shutemov ` (6 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov With split page table lock for PMD level we can't hold mm->page_table_lock while updating nr_ptes. Let's convert it to atomic_t to avoid races. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- fs/proc/task_mmu.c | 2 +- include/linux/mm_types.h | 2 +- kernel/fork.c | 2 +- mm/huge_memory.c | 10 +++++----- mm/memory.c | 4 ++-- mm/mmap.c | 3 ++- mm/oom_kill.c | 6 +++--- 7 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7366e9d..d45d423 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -62,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + (PTRS_PER_PTE*sizeof(pte_t)*atomic_read(&mm->nr_ptes)) >> 10, swap << (PAGE_SHIFT-10)); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fe0a4bb..1c64730 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -338,6 +338,7 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + atomic_t nr_ptes; /* Page table pages */ int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ @@ -359,7 +360,6 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; - unsigned long nr_ptes; /* Page table pages */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; diff --git a/kernel/fork.c b/kernel/fork.c index 81ccb4f..4c8b986 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->flags = (current->mm) ? 
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; - mm->nr_ptes = 0; + atomic_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884..bbd41a2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -737,7 +737,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm->nr_ptes++; + atomic_inc(&mm->nr_ptes); spin_unlock(&mm->page_table_lock); } @@ -778,7 +778,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - mm->nr_ptes++; + atomic_inc(&mm->nr_ptes); return true; } @@ -903,7 +903,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - dst_mm->nr_ptes++; + atomic_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: @@ -1358,7 +1358,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { - tlb->mm->nr_ptes--; + atomic_dec(&tlb->mm->nr_ptes); spin_unlock(&tlb->mm->page_table_lock); put_huge_zero_page(); } else { @@ -1367,7 +1367,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; + atomic_dec(&tlb->mm->nr_ptes); spin_unlock(&tlb->mm->page_table_lock); tlb_remove_page(tlb, page); } diff --git a/mm/memory.c b/mm/memory.c index ca00039..1046396 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - tlb->mm->nr_ptes--; + atomic_dec(&tlb->mm->nr_ptes); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -575,7 +575,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - mm->nr_ptes++; + atomic_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) diff --git a/mm/mmap.c b/mm/mmap.c index 9d54851..1d0efbc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2726,7 +2726,8 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(atomic_read(&mm->nr_ptes) > + (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d2..7ab394e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. 
*/ - points = get_mm_rss(p->mm) + p->mm->nr_ptes + + points = get_mm_rss(p->mm) + atomic_read(&p->mm->nr_ptes) + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); @@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7d %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - task->mm->nr_ptes, + atomic_read(&task->mm->nr_ptes), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
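A note on what the atomic conversion buys: once PMD tables get their own locks (patch 8), two threads may allocate or free page tables under different locks, so the counter can no longer rely on mm->page_table_lock for serialization. A minimal sketch of the resulting accounting pattern; the helper names below are invented for illustration, while the operations mirror the hunks above:

#include <linux/atomic.h>
#include <linux/mm.h>

/* Illustrative only -- not part of the patch. */
static inline void example_account_pte_page(struct mm_struct *mm)
{
	atomic_inc(&mm->nr_ptes);	/* was: mm->nr_ptes++ under mm->page_table_lock */
}

static inline unsigned long example_pte_tables_kb(struct mm_struct *mm)
{
	/* readers switch from a plain load to atomic_read(), as in task_mem() */
	return (PTRS_PER_PTE * sizeof(pte_t) *
		(unsigned long)atomic_read(&mm->nr_ptes)) >> 10;
}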
* [PATCH 3/9] mm: introduce api for split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:19 ` Peter Zijlstra 2013-09-13 13:06 ` [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock Kirill A. Shutemov ` (5 subsequent siblings) 8 siblings, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Basic api, backed by mm->page_table_lock for now. Actual implementation will be added later. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- include/linux/mm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6cf8ddb..d4361e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &mm->page_table_lock; +} + + +static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); + spin_lock(ptl); + return ptl; +} + extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
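For readers following the series, the intended call-site pattern is the one below; a minimal caller-side sketch (example_update_huge_pmd() is an invented name), which the later patches apply throughout mm/. With the stub implementation above it still boils down to mm->page_table_lock; only the spelling at the call sites changes:

#include <linux/mm.h>

static void example_update_huge_pmd(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;

	ptl = huge_pmd_lock(mm, pmd);	/* for now: mm->page_table_lock */
	/* ... read or modify *pmd while the returned lock is held ... */
	spin_unlock(ptl);
}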
* Re: [PATCH 3/9] mm: introduce api for split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 3/9] mm: introduce api for split page table lock for PMD level Kirill A. Shutemov @ 2013-09-13 13:19 ` Peter Zijlstra 2013-09-13 14:22 ` Kirill A. Shutemov 0 siblings, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-09-13 13:19 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 04:06:10PM +0300, Kirill A. Shutemov wrote: > Basic api, backed by mm->page_table_lock for now. Actual implementation > will be added later. > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> > --- > include/linux/mm.h | 13 +++++++++++++ > 1 file changed, 13 insertions(+) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 6cf8ddb..d4361e7 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page) > ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ > NULL: pte_offset_kernel(pmd, address)) > > +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > +{ > + return &mm->page_table_lock; > +} > + > + > +static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) > +{ > + spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); > + spin_lock(ptl); > + return ptl; > +} Why not call the thing pmd_lock()? The pmd bit differentiates it from pte_lock() enough IMIO. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 3/9] mm: introduce api for split page table lock for PMD level 2013-09-13 13:19 ` Peter Zijlstra @ 2013-09-13 14:22 ` Kirill A. Shutemov 0 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 14:22 UTC (permalink / raw) To: Peter Zijlstra Cc: Kirill A. Shutemov, Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm Peter Zijlstra wrote: > On Fri, Sep 13, 2013 at 04:06:10PM +0300, Kirill A. Shutemov wrote: > > Basic api, backed by mm->page_table_lock for now. Actual implementation > > will be added later. > > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> > > --- > > include/linux/mm.h | 13 +++++++++++++ > > 1 file changed, 13 insertions(+) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 6cf8ddb..d4361e7 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page) > > ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ > > NULL: pte_offset_kernel(pmd, address)) > > > > +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > > +{ > > + return &mm->page_table_lock; > > +} > > + > > + > > +static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) > > +{ > > + spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); > > + spin_lock(ptl); > > + return ptl; > > +} > > Why not call the thing pmd_lock()? The pmd bit differentiates it from > pte_lock() enough IMIO. Okay, will rename it. -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (2 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 3/9] mm: introduce api for split page table lock for PMD level Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd() Kirill A. Shutemov ` (4 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov With split ptlock it's important to know which lock pmd_trans_huge_lock() took. This patch adds one more parameter to the function to return the lock. In most places migration to the new api is trivial. The exception is move_huge_pmd(): we need to take two locks if the pmd tables are different. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- fs/proc/task_mmu.c | 13 +++++++------ include/linux/huge_mm.h | 14 +++++++------- mm/huge_memory.c | 40 +++++++++++++++++++++++++++------------- mm/memcontrol.c | 10 +++++----- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d45d423..bbf7420 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -505,9 +505,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } @@ -993,13 +993,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { struct vm_area_struct *vma; struct pagemapread *pm = walk->private; + spinlock_t *ptl; pte_t *pte; int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); - if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { int pmd_flags2; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) @@ -1017,7 +1018,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (err) break; } - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return err; } @@ -1319,7 +1320,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, md = walk->private; - if (pmd_trans_huge_lock(pmd, md->vma) == 1) { + if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; @@ -1327,7 +1328,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (page) gather_stats(page, md, pte_dirty(huge_pte), HPAGE_PMD_SIZE/PAGE_SIZE); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return 0; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 3935428..4aca0d8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@
-129,15 +129,15 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern int __pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma); +extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl); /* mmap_sem must be held on entry */ -static inline int pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma) +static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); if (pmd_trans_huge(*pmd)) - return __pmd_trans_huge_lock(pmd, vma); + return __pmd_trans_huge_lock(pmd, vma, ptl); else return 0; } @@ -215,8 +215,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline int pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma) +static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bbd41a2..acf5b4d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1342,9 +1342,10 @@ out_unlock: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { struct page *page; pgtable_t pgtable; pmd_t orig_pmd; @@ -1359,7 +1360,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { atomic_dec(&tlb->mm->nr_ptes); - spin_unlock(&tlb->mm->page_table_lock); + spin_unlock(ptl); put_huge_zero_page(); } else { page = pmd_page(orig_pmd); @@ -1368,7 +1369,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); atomic_dec(&tlb->mm->nr_ptes); - spin_unlock(&tlb->mm->page_table_lock); + spin_unlock(ptl); tlb_remove_page(tlb, page); } pte_free(tlb->mm, pgtable); @@ -1381,14 +1382,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { /* * All logical pages in the range are present * if backed by a huge page. */ - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); memset(vec, 1, (end - addr) >> PAGE_SHIFT); ret = 1; } @@ -1401,6 +1403,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { + spinlock_t *old_ptl, *new_ptl; int ret = 0; pmd_t pmd; @@ -1421,12 +1424,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - ret = __pmd_trans_huge_lock(old_pmd, vma); + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. 
+ */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); if (ret == 1) { + new_ptl = huge_pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - spin_unlock(&mm->page_table_lock); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); } out: return ret; @@ -1436,9 +1448,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); if (!prot_numa) { @@ -1454,7 +1467,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } } set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); ret = 1; } @@ -1468,12 +1481,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns 1, this routine returns without unlocking page * table locks. So callers must unlock them. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { - spin_lock(&vma->vm_mm->page_table_lock); + *ptl = huge_pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); wait_split_huge_page(vma->anon_vma, pmd); return -1; } else { @@ -1482,7 +1496,7 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return 1; } } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ff3ce..5f35b2a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6376,10 +6376,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -6568,9 +6568,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@ -6587,7 +6587,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
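The move_huge_pmd() exception called out in the changelog reduces to the locking pattern below. This is a condensed restatement of the hunk above, not new patch code: in the patch the first lock is actually taken through __pmd_trans_huge_lock(), and taking the two locks without a fixed ordering is safe only because exclusive mmap_sem prevents deadlock, as the added comment notes.

static void example_move_pmd_locked(struct mm_struct *mm,
				    pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;

	old_ptl = huge_pmd_lockptr(mm, old_pmd);
	new_ptl = huge_pmd_lockptr(mm, new_pmd);
	spin_lock(old_ptl);
	if (new_ptl != old_ptl)	/* different PMD tables may mean different locks */
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	/* ... transfer the entry from *old_pmd to *new_pmd ... */
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);
}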
* [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd() 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (3 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly Kirill A. Shutemov ` (3 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov With split page table lock we can't know which lock we need to take before we find the relevant pmd. Let's move lock taking inside the function. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- include/linux/huge_mm.h | 3 ++- mm/huge_memory.c | 43 +++++++++++++++++++++++++++---------------- mm/rmap.c | 13 +++++-------- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4aca0d8..91672e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -54,7 +54,8 @@ enum page_check_address_pmd_flag { extern pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag); + enum page_check_address_pmd_flag flag, + spinlock_t **ptl); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index acf5b4d..4b58a01 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1500,23 +1500,33 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, return 0; } +/* + * This function returns whether a given @page is mapped onto the @address + * in the virtual space of @mm. + * + * When it's true, this function returns *pmd with holding the page table lock + * and passing it back to the caller via @ptl. + * If it's false, returns NULL without holding the page table lock. + */ pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag) + enum page_check_address_pmd_flag flag, + spinlock_t **ptl) { - pmd_t *pmd, *ret = NULL; + pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) - goto out; + return NULL; pmd = mm_find_pmd(mm, address); if (!pmd) - goto out; + return NULL; + *ptl = huge_pmd_lock(mm, pmd); if (pmd_none(*pmd)) - goto out; + goto unlock; if (pmd_page(*pmd) != page) - goto out; + goto unlock; /* * split_vma() may create temporary aliased mappings. 
There is * no risk as long as all huge pmd are found and have their @@ -1526,14 +1536,15 @@ pmd_t *page_check_address_pmd(struct page *page, */ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && pmd_trans_splitting(*pmd)) - goto out; + goto unlock; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); - ret = pmd; + return pmd; } -out: - return ret; +unlock: + spin_unlock(*ptl); + return NULL; } static int __split_huge_page_splitting(struct page *page, @@ -1541,6 +1552,7 @@ static int __split_huge_page_splitting(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd; int ret = 0; /* For mmu_notifiers */ @@ -1548,9 +1560,8 @@ static int __split_huge_page_splitting(struct page *page, const unsigned long mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); if (pmd) { /* * We can't temporarily set the pmd to null in order @@ -1561,8 +1572,8 @@ static int __split_huge_page_splitting(struct page *page, */ pmdp_splitting_flush(vma, address, pmd); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; @@ -1693,14 +1704,14 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -1755,8 +1766,8 @@ static int __split_huge_page_map(struct page *page, pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index fd3ee7a..b59d741 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int referenced = 0; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; - spin_lock(&mm->page_table_lock); /* * rmap might return false positives; we must filter * these out using page_check_address_pmd(). */ pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG); - if (!pmd) { - spin_unlock(&mm->page_table_lock); + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (!pmd) goto out; - } if (vma->vm_flags & VM_LOCKED) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; goto out; @@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } else { pte_t *pte; - spinlock_t *ptl; /* * rmap might return false positives; we must filter -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
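The caller-visible contract after this patch: page_check_address_pmd() returns with *ptl held on success and with no lock held on failure, so callers only ever unlock. A minimal sketch (example_check_thp_mapping() is an invented wrapper; the call itself matches the rmap.c hunk above):

#include <linux/huge_mm.h>
#include <linux/mm.h>

static void example_check_thp_mapping(struct page *page, struct mm_struct *mm,
				      unsigned long address)
{
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
	if (!pmd)
		return;			/* not mapped here; no lock is held */
	/* ... the huge pmd is stable while *ptl is held ... */
	spin_unlock(ptl);
}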
* [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (4 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd() Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 7/9] mm: convent the rest to new page table lock api Kirill A. Shutemov ` (2 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Currently mm->pmd_huge_pte protected by page table lock. It will not work with split lock. We have to have per-pmd pmd_huge_pte for proper access serialization. For now, let's just introduce wrapper to access mm->pmd_huge_pte. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- arch/s390/mm/pgtable.c | 12 ++++++------ arch/sparc/mm/tlb.c | 12 ++++++------ include/linux/mm.h | 1 + mm/pgtable-generic.c | 12 ++++++------ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index de8cbc3..c463e5c 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1225,11 +1225,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(lh); else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) @@ -1241,12 +1241,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); lh = (struct list_head *) pgtable; if (list_empty(lh)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = (pgtable_t) lh->next; + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; list_del(lh); } ptep = (pte_t *) pgtable; diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 7a91f28..656cc46 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -196,11 +196,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(lh); else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) @@ -211,12 +211,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); lh = (struct list_head *) pgtable; if (list_empty(lh)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = 
(pgtable_t) lh->next; + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; list_del(lh); } pte_val(pgtable[0]) = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index d4361e7..d2f8a50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1299,6 +1299,7 @@ static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } +#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3929a40..41fee3e 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -154,11 +154,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -173,11 +173,11 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, + pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, struct page, lru); list_del(&pgtable->lru); } -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
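The extra pmdp argument looks redundant at this point, since the wrapper still resolves to the per-mm field; it only becomes meaningful once the deposited page tables move into the PMD table's struct page. A side-by-side sketch of the two expansions the wrapper is meant to hide (the second definition is the one patch 8 below introduces):

/* today (this patch): one deposit list per mm */
#define pmd_huge_pte(mm, pmd)	((mm)->pmd_huge_pte)

/* with split PMD locks (patch 8): one deposit list per PMD table page */
/* #define pmd_huge_pte(mm, pmd)	(virt_to_page(pmd)->pmd_huge_pte) */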
* [PATCH 7/9] mm: convent the rest to new page table lock api 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (5 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 9/9] x86, mm: enable " Kirill A. Shutemov 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Only trivial cases left. Let's convert them altogether. hugetlbfs is not covered for now. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- mm/huge_memory.c | 108 ++++++++++++++++++++++++++++----------------------- mm/memory.c | 17 ++++---- mm/migrate.c | 7 ++-- mm/mprotect.c | 4 +- mm/pgtable-generic.c | 4 +- 5 files changed, 77 insertions(+), 63 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4b58a01..e728d74 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -709,6 +709,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct page *page) { pgtable_t pgtable; + spinlock_t *ptl; VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); @@ -723,9 +724,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); @@ -738,7 +739,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_inc(&mm->nr_ptes); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } return 0; @@ -766,6 +767,7 @@ static inline struct page *alloc_hugepage(int defrag) } #endif +/* Caller must hold page table lock. 
*/ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) @@ -797,6 +799,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { + spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; @@ -809,10 +812,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); @@ -845,6 +848,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) { + spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable; @@ -855,8 +859,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; - spin_lock(&dst_mm->page_table_lock); - spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = huge_pmd_lock(dst_mm, dst_pmd); + src_ptl = huge_pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; @@ -865,7 +870,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } /* - * mm->page_table_lock is enough to be sure that huge zero pmd is not + * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table. 
*/ @@ -886,8 +891,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, } if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); pte_free(dst_mm, pgtable); wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ @@ -907,8 +912,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = 0; out_unlock: - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); out: return ret; } @@ -919,10 +924,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm, pmd_t *pmd, pmd_t orig_pmd, int dirty) { + spinlock_t *ptl; pmd_t entry; unsigned long haddr; - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto unlock; @@ -932,13 +938,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm, update_mmu_cache_pmd(vma, address, pmd); unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; struct page *page; @@ -965,7 +972,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_page; @@ -992,7 +999,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); put_huge_zero_page(); inc_mm_counter(mm, MM_ANONPAGES); @@ -1002,7 +1009,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, out: return ret; out_free_page: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_page(page); put_page(page); @@ -1016,6 +1023,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1062,7 +1070,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); @@ -1088,7 +1096,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1099,7 +1107,7 @@ out: return ret; out_free_pages: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1114,17 +1122,19 @@ out_free_pages: int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { + spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; 
unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + ptl = huge_pmd_lockptr(mm, pmd); VM_BUG_ON(!vma->anon_vma); haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; @@ -1140,7 +1150,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) @@ -1187,11 +1197,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (page) put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); put_page(new_page); goto out_mn; @@ -1213,13 +1223,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out_mn: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return ret; } @@ -1231,7 +1241,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(huge_pmd_lockptr(mm, pmd)); if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; @@ -1278,13 +1288,14 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + spinlock_t *ptl; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; bool migrated; - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmdp); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; @@ -1302,17 +1313,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* Acquire the page lock to serialise THP migrations */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); lock_page(page); /* Confirm the PTE did not while locked */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); goto out_unlock; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* Migrate the THP to the requested node */ migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1324,7 +1335,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; check_same: - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; clear_pmdnuma: @@ -1333,7 +1344,7 @@ clear_pmdnuma: VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (current_nid != -1) task_numa_fault(current_nid, HPAGE_PMD_NR, false); return 0; @@ -2282,7 +2293,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte_t *pte; pgtable_t pgtable; struct page *new_page; - spinlock_t *ptl; + spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2325,12 +2336,12 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); - ptl = pte_lockptr(mm, pmd); + pte_ptl = 
pte_lockptr(mm, pmd); mmun_start = address; mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); /* probably unnecessary */ + pmd_ptl = huge_pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes * any huge TLB entry from the CPU so we won't allow @@ -2338,16 +2349,16 @@ static void collapse_huge_page(struct mm_struct *mm, * to avoid the risk of CPU bugs in that area. */ _pmd = pmdp_clear_flush(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(ptl); + spin_unlock(pte_ptl); if (unlikely(!isolated)) { pte_unmap(pte); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing @@ -2355,7 +2366,7 @@ static void collapse_huge_page(struct mm_struct *mm, * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -2366,7 +2377,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -2381,13 +2392,13 @@ static void collapse_huge_page(struct mm_struct *mm, */ smp_wmb(); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); *hpage = NULL; @@ -2712,6 +2723,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { + spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -2723,22 +2735,22 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); diff --git a/mm/memory.c b/mm/memory.c index 1046396..a0ed1d5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -552,6 +552,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address) { + spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); 
int wait_split_huge_page; if (!new) @@ -572,7 +573,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ atomic_inc(&mm->nr_ptes); @@ -580,7 +581,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) wait_split_huge_page = 1; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (new) pte_free(mm, new); if (wait_split_huge_page) @@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, split_huge_page_pmd(vma, address, pmd); goto split_fallthrough; } - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); wait_split_huge_page(vma->anon_vma, pmd); } else { page = follow_trans_huge_pmd(vma, address, pmd, flags); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; goto out; } } else - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* fall through */ } split_fallthrough: @@ -3602,13 +3603,13 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool numa = false; int local_nid = numa_node_id(); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); pmd = *pmdp; if (pmd_numa(pmd)) { set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); numa = true; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!numa) return 0; diff --git a/mm/migrate.c b/mm/migrate.c index b7ded7e..32eff0c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1653,6 +1653,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long address, struct page *page, int node) { + spinlock_t *ptl; unsigned long haddr = address & HPAGE_PMD_MASK; pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; @@ -1699,9 +1700,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -1746,7 +1747,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * before it's fully transferred to the new page. 
*/ mem_cgroup_end_migration(memcg, page, new_page, true); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(new_page); unlock_page(page); diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4..885cd78 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -116,9 +116,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, pmd_t *pmd) { - spin_lock(&mm->page_table_lock); + spinlock_t *ptl = huge_pmd_lock(mm, pmd); set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } #else static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 41fee3e..80587a5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -151,7 +151,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(huge_pmd_lockptr(mm, pmdp)); /* FIFO */ if (!pmd_huge_pte(mm, pmdp)) @@ -170,7 +170,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(huge_pmd_lockptr(mm, pmdp)); /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (6 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 7/9] mm: convent the rest to new page table lock api Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:24 ` Peter Zijlstra ` (3 more replies) 2013-09-13 13:06 ` [PATCH 9/9] x86, mm: enable " Kirill A. Shutemov 8 siblings, 4 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov The basic idea is the same as with PTE level: the lock is embedded into struct page of table's page. Split pmd page table lock only makes sense on big machines. Let's say >= 32 CPUs for now. We can't use mm->pmd_huge_pte to store pgtables for THP, since we don't take mm->page_table_lock anymore. Let's reuse page->lru of table's page for that. hugetlbfs hasn't converted to split locking: disable split locking if hugetlbfs enabled. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- include/linux/mm.h | 31 +++++++++++++++++++++++++++++++ include/linux/mm_types.h | 5 +++++ kernel/fork.c | 4 ++-- mm/Kconfig | 10 ++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d2f8a50..5b3922d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1294,13 +1294,44 @@ static inline void pgtable_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? 
\ NULL: pte_offset_kernel(pmd, address)) +#if USE_SPLIT_PMD_PTLOCKS + +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &virt_to_page(pmd)->ptl; +} + +static inline void pgtable_pmd_page_ctor(struct page *page) +{ + spin_lock_init(&page->ptl); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page->pmd_huge_pte = NULL; +#endif +} + +static inline void pgtable_pmd_page_dtor(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(page->pmd_huge_pte); +#endif +} + +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) + +#else + static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline void pgtable_pmd_page_ctor(struct page *page) {} +static inline void pgtable_pmd_page_dtor(struct page *page) {} + #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) +#endif + static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c64730..5706ddf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -24,6 +24,8 @@ struct address_space; #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS) +#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ + NR_CPUS >= CONFIG_SPLIT_PMD_PTLOCK_CPUS) /* * Each physical page in the system has a struct page associated with @@ -130,6 +132,9 @@ struct page { struct list_head list; /* slobs list of pages */ struct slab *slab_page; /* slab fields */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + pgtable_t pmd_huge_pte; /* protected by page->ptl */ +#endif }; /* Remainder is not double word aligned */ diff --git a/kernel/fork.c b/kernel/fork.c index 4c8b986..1670af7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON(mm->pmd_huge_pte); #endif } @@ -814,7 +814,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/Kconfig b/mm/Kconfig index 1977a33..ab32eda 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -214,6 +214,16 @@ config SPLIT_PTE_PTLOCK_CPUS default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + boolean + +config SPLIT_PMD_PTLOCK_CPUS + int + # hugetlb hasn't converted to split locking yet + default "999999" if HUGETLB_PAGE + default "32" if ARCH_ENABLE_SPLIT_PMD_PTLOCK + default "999999" + # # support for memory balloon compaction config BALLOON_COMPACTION -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
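The last hunk in include/linux/mm.h ends at the context lines of huge_pmd_lock(), so its body is not visible here. Given that the converted call sites earlier in the thread only ever spin_unlock() the pointer it returns, the helper presumably finishes along these lines (a guess from the visible callers, not quoted from the patch):

	static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
	{
		spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);

		spin_lock(ptl);
		return ptl;
	}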
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov @ 2013-09-13 13:24 ` Peter Zijlstra 2013-09-13 14:25 ` Kirill A. Shutemov 2013-09-13 13:36 ` Peter Zijlstra ` (2 subsequent siblings) 3 siblings, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-09-13 13:24 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > The basic idea is the same as with PTE level: the lock is embedded into > struct page of table's page. > > Split pmd page table lock only makes sense on big machines. > Let's say >= 32 CPUs for now. Why is this? Couldn't I generate the same amount of contention on PMD level as I can on PTE level in the THP case? ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:24 ` Peter Zijlstra @ 2013-09-13 14:25 ` Kirill A. Shutemov 2013-09-13 14:52 ` Peter Zijlstra 0 siblings, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 14:25 UTC (permalink / raw) To: Peter Zijlstra Cc: Kirill A. Shutemov, Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm Peter Zijlstra wrote: > On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > > The basic idea is the same as with PTE level: the lock is embedded into > > struct page of table's page. > > > > Split pmd page table lock only makes sense on big machines. > > Let's say >= 32 CPUs for now. > > Why is this? Couldn't I generate the same amount of contention on PMD > level as I can on PTE level in the THP case? Hm. You are right. You just need more memory for that. Do you want it to be "4" too? -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 14:25 ` Kirill A. Shutemov @ 2013-09-13 14:52 ` Peter Zijlstra 0 siblings, 0 replies; 57+ messages in thread From: Peter Zijlstra @ 2013-09-13 14:52 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 05:25:13PM +0300, Kirill A. Shutemov wrote: > Peter Zijlstra wrote: > > On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > > > The basic idea is the same as with PTE level: the lock is embedded into > > > struct page of table's page. > > > > > > Split pmd page table lock only makes sense on big machines. > > > Let's say >= 32 CPUs for now. > > > > Why is this? Couldn't I generate the same amount of contention on PMD > > level as I can on PTE level in the THP case? > > Hm. You are right. You just need more memory for that. > Do you want it to be "4" too? Well, I would drop your patch-1 and use the same config var. ^ permalink raw reply [flat|nested] 57+ messages in thread
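Peter's suggestion amounts to gating the PMD split lock on the existing PTE threshold rather than introducing a second CPU-count knob. One way that could look in mm_types.h, assuming patch 1/9 (the SPLIT_PTLOCKS rename) is dropped as he proposes; this is an illustration of the idea, not code from the series:

	#define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
	#define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTLOCKS && \
					 IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))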
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov 2013-09-13 13:24 ` Peter Zijlstra @ 2013-09-13 13:36 ` Peter Zijlstra 2013-09-13 14:25 ` Kirill A. Shutemov 2013-09-13 15:45 ` Naoya Horiguchi 2013-09-13 19:57 ` Dave Hansen 3 siblings, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-09-13 13:36 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > +#if USE_SPLIT_PMD_PTLOCKS > + > +static inline void pgtable_pmd_page_ctor(struct page *page) > +{ > + spin_lock_init(&page->ptl); > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + page->pmd_huge_pte = NULL; > +#endif > +} > + > +static inline void pgtable_pmd_page_dtor(struct page *page) > +{ > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + VM_BUG_ON(page->pmd_huge_pte); > +#endif > +} > + > +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) > + > +#else So on -rt we have the problem that spinlock_t is rather huge (its a rtmutex) so instead of blowing up the pageframe like that we treat page->pte as a pointer and allocate the spinlock. Since allocations could fail the above ctor path gets 'interesting'. It would be good if new code could assume the ctor could fail so we don't have to replicate that horror-show. --- From: Peter Zijlstra <peterz@infradead.org> Date: Fri, 3 Jul 2009 08:44:54 -0500 Subject: mm: shrink the page frame to !-rt size He below is a boot-tested hack to shrink the page frame size back to normal. Should be a net win since there should be many less PTE-pages than page-frames. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/mm.h | 46 +++++++++++++++++++++++++++++++++++++++------- include/linux/mm_types.h | 4 ++++ mm/memory.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 7 deletions(-) --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1241,27 +1241,59 @@ static inline pmd_t *pmd_alloc(struct mm * overflow into the next struct page (as it might with DEBUG_SPINLOCK). * When freeing, reset page->mapping so free_pages_check won't complain. */ +#ifndef CONFIG_PREEMPT_RT_FULL + #define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) + +static inline struct page *pte_lock_init(struct page *page) +{ + spin_lock_init(__pte_lockptr(page)); + return page; +} + #define pte_lock_deinit(page) ((page)->mapping = NULL) + +#else /* !PREEMPT_RT_FULL */ + +/* + * On PREEMPT_RT_FULL the spinlock_t's are too large to embed in the + * page frame, hence it only has a pointer and we need to dynamically + * allocate the lock when we allocate PTE-pages. + * + * This is an overall win, since only a small fraction of the pages + * will be PTE pages under normal circumstances. 
+ */ + +#define __pte_lockptr(page) ((page)->ptl) + +extern struct page *pte_lock_init(struct page *page); +extern void pte_lock_deinit(struct page *page); + +#endif /* PREEMPT_RT_FULL */ + #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) #else /* !USE_SPLIT_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ -#define pte_lock_init(page) do {} while (0) +static inline struct page *pte_lock_init(struct page *page) { return page; } #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) #endif /* USE_SPLIT_PTLOCKS */ -static inline void pgtable_page_ctor(struct page *page) +static inline struct page *__pgtable_page_ctor(struct page *page) { - pte_lock_init(page); - inc_zone_page_state(page, NR_PAGETABLE); + page = pte_lock_init(page); + if (page) + inc_zone_page_state(page, NR_PAGETABLE); + return page; } +#define pgtable_page_ctor(page) \ +do { \ + page = __pgtable_page_ctor(page); \ +} while (0) + static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -142,7 +142,11 @@ struct page { * system if PG_buddy is set. */ #if USE_SPLIT_PTLOCKS +# ifndef CONFIG_PREEMPT_RT_FULL spinlock_t ptl; +# else + spinlock_t *ptl; +# endif #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ --- a/mm/memory.c +++ b/mm/memory.c @@ -4328,3 +4328,35 @@ void copy_user_huge_page(struct page *ds } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if defined(CONFIG_PREEMPT_RT_FULL) && (USE_SPLIT_PTLOCKS > 0) +/* + * Heinous hack, relies on the caller doing something like: + * + * pte = alloc_pages(PGALLOC_GFP, 0); + * if (pte) + * pgtable_page_ctor(pte); + * return pte; + * + * This ensures we release the page and return NULL when the + * lock allocation fails. + */ +struct page *pte_lock_init(struct page *page) +{ + page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (page->ptl) { + spin_lock_init(__pte_lockptr(page)); + } else { + __free_page(page); + page = NULL; + } + return page; +} + +void pte_lock_deinit(struct page *page) +{ + kfree(page->ptl); + page->mapping = NULL; +} + +#endif ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:36 ` Peter Zijlstra @ 2013-09-13 14:25 ` Kirill A. Shutemov 0 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 14:25 UTC (permalink / raw) To: Peter Zijlstra Cc: Kirill A. Shutemov, Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm Peter Zijlstra wrote: > On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > > +#if USE_SPLIT_PMD_PTLOCKS > > + > > +static inline void pgtable_pmd_page_ctor(struct page *page) > > +{ > > + spin_lock_init(&page->ptl); > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > > + page->pmd_huge_pte = NULL; > > +#endif > > +} > > + > > +static inline void pgtable_pmd_page_dtor(struct page *page) > > +{ > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > > + VM_BUG_ON(page->pmd_huge_pte); > > +#endif > > +} > > + > > +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) > > + > > +#else > > So on -rt we have the problem that spinlock_t is rather huge (its a > rtmutex) so instead of blowing up the pageframe like that we treat > page->pte as a pointer and allocate the spinlock. > > Since allocations could fail the above ctor path gets 'interesting'. > > It would be good if new code could assume the ctor could fail so we > don't have to replicate that horror-show. Okay, I'll rework this. -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
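What a failure-capable constructor could look like: a sketch of the rework Kirill agrees to here, assuming a bool return and the same caller shape as the x86 pmd_alloc_one() in patch 9/9. Only the -rt variant would actually return false, when its separately allocated lock cannot be obtained; the names and signature below are an assumption, not a posted revision:

	static inline bool pgtable_pmd_page_ctor(struct page *page)
	{
		spin_lock_init(&page->ptl);
	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		page->pmd_huge_pte = NULL;
	#endif
		return true;	/* -rt: false if allocating the lock failed */
	}

	static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
	{
		struct page *page;

		page = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0);
		if (!page)
			return NULL;
		if (!pgtable_pmd_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		return (pmd_t *)page_address(page);
	}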
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov 2013-09-13 13:24 ` Peter Zijlstra 2013-09-13 13:36 ` Peter Zijlstra @ 2013-09-13 15:45 ` Naoya Horiguchi 2013-09-13 19:57 ` Dave Hansen 3 siblings, 0 replies; 57+ messages in thread From: Naoya Horiguchi @ 2013-09-13 15:45 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > The basic idea is the same as with PTE level: the lock is embedded into > struct page of table's page. > > Split pmd page table lock only makes sense on big machines. > Let's say >= 32 CPUs for now. > > We can't use mm->pmd_huge_pte to store pgtables for THP, since we don't > take mm->page_table_lock anymore. Let's reuse page->lru of table's page > for that. Looks nice. > hugetlbfs hasn't converted to split locking: disable split locking if > hugetlbfs enabled. I don't think that we have to disable when hugetlbfs is enabled, because hugetlbfs code doesn't use huge_pmd_lockptr() or huge_pmd_lock(). > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> > --- > include/linux/mm.h | 31 +++++++++++++++++++++++++++++++ > include/linux/mm_types.h | 5 +++++ > kernel/fork.c | 4 ++-- > mm/Kconfig | 10 ++++++++++ > 4 files changed, 48 insertions(+), 2 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index d2f8a50..5b3922d 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1294,13 +1294,44 @@ static inline void pgtable_page_dtor(struct page *page) > ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? 
\ > NULL: pte_offset_kernel(pmd, address)) > > +#if USE_SPLIT_PMD_PTLOCKS > + > +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > +{ > + return &virt_to_page(pmd)->ptl; > +} > + > +static inline void pgtable_pmd_page_ctor(struct page *page) > +{ > + spin_lock_init(&page->ptl); > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + page->pmd_huge_pte = NULL; > +#endif > +} > + > +static inline void pgtable_pmd_page_dtor(struct page *page) > +{ > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + VM_BUG_ON(page->pmd_huge_pte); > +#endif > +} > + > +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) > > + > +#else > + > static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > { > return &mm->page_table_lock; > } > > +static inline void pgtable_pmd_page_ctor(struct page *page) {} > +static inline void pgtable_pmd_page_dtor(struct page *page) {} > + > #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) > > +#endif > + > static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) > { > spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 1c64730..5706ddf 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -24,6 +24,8 @@ > struct address_space; > > #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS) > +#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ > + NR_CPUS >= CONFIG_SPLIT_PMD_PTLOCK_CPUS) > > /* > * Each physical page in the system has a struct page associated with > @@ -130,6 +132,9 @@ struct page { > > struct list_head list; /* slobs list of pages */ > struct slab *slab_page; /* slab fields */ > +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS > + pgtable_t pmd_huge_pte; /* protected by page->ptl */ > +#endif > }; > > /* Remainder is not double word aligned */ Can we remove pmd_huge_pte from mm_struct when USE_SPLIT_PMD_PTLOCKS is true? Thanks, Naoya Horiguchi ^ permalink raw reply [flat|nested] 57+ messages in thread
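Naoya's last point, dropping the mm-wide deposit field once the per-page one takes over, would reduce to guarding the mm_struct member the same way kernel/fork.c is already guarded in this patch. A sketch of that (an assumption, not part of the posted series):

	struct mm_struct {
		/* ... */
	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
		pgtable_t pmd_huge_pte;		/* protected by page_table_lock */
	#endif
		/* ... */
	};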
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov ` (2 preceding siblings ...) 2013-09-13 15:45 ` Naoya Horiguchi @ 2013-09-13 19:57 ` Dave Hansen 3 siblings, 0 replies; 57+ messages in thread From: Dave Hansen @ 2013-09-13 19:57 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On 09/13/2013 06:06 AM, Kirill A. Shutemov wrote: > +config ARCH_ENABLE_SPLIT_PMD_PTLOCK > + boolean > + > +config SPLIT_PMD_PTLOCK_CPUS > + int > + # hugetlb hasn't converted to split locking yet > + default "999999" if HUGETLB_PAGE > + default "32" if ARCH_ENABLE_SPLIT_PMD_PTLOCK > + default "999999" Is there a reason we should have separate config knobs for this from SPLIT_PTLOCK_CPUS? Seem a bit silly. ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 9/9] x86, mm: enable split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (7 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Enable PMD split page table lock for X86_64 and PAE. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/pgalloc.h | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 30c40f0..6a5cf6a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1880,6 +1880,10 @@ config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + def_bool y + depends on X86_64 || X86_PAE + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index b4389a4..f2daea1 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -80,12 +80,18 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #if PAGETABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + struct page *page; + page = alloc_pages(GFP_KERNEL | __GFP_REPEAT| __GFP_ZERO, 0); + if (!page) + return NULL; + pgtable_pmd_page_ctor(page); + return (pmd_t *)page_address(page); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + pgtable_pmd_page_dtor(virt_to_page(pmd)); free_page((unsigned long)pmd); } -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
[parent not found: <20130828091814.GA13681@gmail.com>]
* Re: [PATCH 1/8] THP: Use real address for NUMA policy [not found] ` <20130828091814.GA13681@gmail.com> @ 2013-08-28 9:32 ` Peter Zijlstra 0 siblings, 0 replies; 57+ messages in thread From: Peter Zijlstra @ 2013-08-28 9:32 UTC (permalink / raw) To: Ingo Molnar Cc: Alex Thorlton, Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt, Linus Torvalds, Andrea Arcangeli On Wed, Aug 28, 2013 at 11:18:14AM +0200, Ingo Molnar wrote: > But ideally THP should detect cases where a hugepage is heavily used from > multiple nodes and un-HP the page in question. Or not turn it into a > hugepage in the first place. (We actually have a memory access pattern > sampling facility in place upstream, the new CONFIG_NUMA_BALANCING=y code, > which could in theory be utilized.) Mel and I spoke about doing just that a few weeks ago. Breaking up THP pages when we get shared faults from different nodes and avoiding coalescing when the pages are from different nodes. Rik seemed a little skeptical about the entire thing saying that the huge tlb gain is about the same order as the locality thing -- while this is more or less true for the 'tiny' machines most of us have, this is clearly not the case for these big machines. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 18:17 ` Alex Thorlton 2013-08-16 18:52 ` Kirill A. Shutemov @ 2013-08-16 19:46 ` Peter Zijlstra 2013-08-16 19:49 ` Alex Thorlton 1 sibling, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-08-16 19:46 UTC (permalink / raw) To: Alex Thorlton Cc: Dave Hansen, linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote: > I actually didn't write these patches (made a few tweaks to get them > running on the latest kernel though). They were submitted last July by > Peter Zijlstra. By Kirill, I don't think I've ever touched them. > The messages from the original submission can be found here: > https://lkml.org/lkml/2012/7/20/165 As can be seen by this posting you reference ;-) ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 19:46 ` Peter Zijlstra @ 2013-08-16 19:49 ` Alex Thorlton 0 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 19:49 UTC (permalink / raw) To: Peter Zijlstra Cc: Dave Hansen, linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 16, 2013 at 09:46:35PM +0200, Peter Zijlstra wrote: > On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote: > > I actually didn't write these patches (made a few tweaks to get them > > running on the latest kernel though). They were submitted last July by > > Peter Zijlstra. > > By Kirill, I don't think I've ever touched them. > > > The messages from the original submission can be found here: > > https://lkml.org/lkml/2012/7/20/165 > > As can be seen by this posting you reference ;-) Sorry about that! Not sure where I got the idea that you wrote them, must've gotten some names mixed up somewhere. ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton 2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton @ 2013-08-16 14:33 ` Alex Thorlton 2013-08-16 14:33 ` [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page Alex Thorlton ` (6 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:33 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- mm/memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 1ce2e2a..69a5a38 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4249,16 +4249,17 @@ void clear_huge_page(struct page *page, unsigned long addr, unsigned int pages_per_huge_page) { int i; + unsigned long haddr = addr & HPAGE_PMD_MASK; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, pages_per_huge_page); + clear_gigantic_page(page, haddr, pages_per_huge_page); return; } might_sleep(); for (i = 0; i < pages_per_huge_page; i++) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + clear_user_highpage(page + i, haddr + i * PAGE_SIZE); } } -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton 2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton 2013-08-16 14:33 ` [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address Alex Thorlton @ 2013-08-16 14:33 ` Alex Thorlton 2013-08-16 14:34 ` [PATCH 4/8] x86: Add clear_page_nocache Alex Thorlton ` (5 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:33 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Conflicts: mm/huge_memory.c --- mm/huge_memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 55ec681..2e6f106 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -701,7 +701,8 @@ static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, + unsigned long haddr, + unsigned long address, pmd_t *pmd, struct page *page) { pgtable_t pgtable; @@ -711,12 +712,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, if (unlikely(!pgtable)) return VM_FAULT_OOM; - clear_huge_page(page, haddr, HPAGE_PMD_NR); /* * The memory barrier inside __SetPageUptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() * write. */ + clear_huge_page(page, address, HPAGE_PMD_NR); __SetPageUptodate(page); spin_lock(&mm->page_table_lock); @@ -825,8 +826,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); goto out; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, - page))) { + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, + address, pmd, page))) { mem_cgroup_uncharge_page(page); put_page(page); goto out; -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 4/8] x86: Add clear_page_nocache 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (2 preceding siblings ...) 2013-08-16 14:33 ` [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 14:34 ` [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address Alex Thorlton ` (4 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Conflicts: arch/x86/mm/fault.c --- arch/x86/include/asm/page.h | 2 ++ arch/x86/include/asm/string_32.h | 5 +++++ arch/x86/include/asm/string_64.h | 5 +++++ arch/x86/lib/Makefile | 1 + arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++++++++++++++++++++ arch/x86/lib/clear_page_nocache_64.S | 29 +++++++++++++++++++++++++++++ arch/x86/mm/fault.c | 7 +++++++ 7 files changed, 79 insertions(+) create mode 100644 arch/x86/lib/clear_page_nocache_32.S create mode 100644 arch/x86/lib/clear_page_nocache_64.S diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index c878924..ccec9c3 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -33,6 +33,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } +void clear_user_highpage_nocache(struct page *page, unsigned long vaddr); + #define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 3d3e835..3f2fbcf 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#include <linux/linkage.h> + /* Let gcc decide whether to inline or use the out of line functions */ #define __HAVE_ARCH_STRCPY @@ -337,6 +339,9 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, #define __HAVE_ARCH_MEMSCAN extern void *memscan(void *addr, int c, size_t size); +#define ARCH_HAS_USER_NOCACHE 1 +asmlinkage void clear_page_nocache(void *page); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_32_H */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 19e2c46..ca23d1d 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#include <linux/linkage.h> + /* Written 2002 by Andi Kleen */ /* Only used for special circumstances. 
Stolen from i386/string.h */ @@ -63,6 +65,9 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); +#define ARCH_HAS_USER_NOCACHE 1 +asmlinkage void clear_page_nocache(void *page); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 96b2c66..e48776c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -23,6 +23,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_SMP) += rwlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o +lib-y += clear_page_nocache_$(BITS).o obj-y += msr.o msr-reg.o msr-reg-export.o diff --git a/arch/x86/lib/clear_page_nocache_32.S b/arch/x86/lib/clear_page_nocache_32.S new file mode 100644 index 0000000..2394e0c --- /dev/null +++ b/arch/x86/lib/clear_page_nocache_32.S @@ -0,0 +1,30 @@ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +/* + * Zero a page avoiding the caches + * rdi page + */ +ENTRY(clear_page_nocache) + CFI_STARTPROC + mov %eax,%edi + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movnti %eax,x*8(%edi) ; movnti %eax,x*8+4(%edi) + PUT(0) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + lea 64(%edi),%edi + jnz .Lloop + nop + ret + CFI_ENDPROC +ENDPROC(clear_page_nocache) diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S new file mode 100644 index 0000000..ee16d15 --- /dev/null +++ b/arch/x86/lib/clear_page_nocache_64.S @@ -0,0 +1,29 @@ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +/* + * Zero a page avoiding the caches + * rdi page + */ +ENTRY(clear_page_nocache) + CFI_STARTPROC + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movnti %rax,x*8(%rdi) + movnti %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret + CFI_ENDPROC +ENDPROC(clear_page_nocache) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 654be4a..a088ed7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1230,3 +1230,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) __do_page_fault(regs, error_code); exception_exit(prev_state); } + +void clear_user_highpage_nocache(struct page *page, unsigned long vaddr) +{ + void *p = kmap_atomic(page, KM_USER0); + clear_page_nocache(p); + kunmap_atomic(p); +} -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (3 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 4/8] x86: Add clear_page_nocache Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 18:02 ` Dave Hansen 2013-08-16 14:34 ` [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2 Alex Thorlton ` (3 subsequent siblings) 8 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- mm/memory.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 69a5a38..17d61f0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4231,18 +4231,35 @@ EXPORT_SYMBOL(might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) + +#ifndef ARCH_HAS_USER_NOCACHE +#define ARCH_HAS_USER_NOCACHE 0 +#endif + +#if ARCH_HAS_USER_NOCACHE == 0 +#define clear_user_highpage_nocache clear_user_highpage +#endif + static void clear_gigantic_page(struct page *page, unsigned long addr, unsigned int pages_per_huge_page) { int i; struct page *p = page; + unsigned long vaddr; + unsigned long haddr = addr & HPAGE_PMD_MASK; + int target = (addr - haddr) >> PAGE_SHIFT; might_sleep(); + vaddr = haddr; for (i = 0; i < pages_per_huge_page; i++, p = mem_map_next(p, page, i)) { cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); + vaddr = haddr + i*PAGE_SIZE; + if (!ARCH_HAS_USER_NOCACHE || i == target) + clear_user_highpage(p, vaddr); + else + clear_user_highpage_nocache(p, vaddr); } } void clear_huge_page(struct page *page, @@ -4250,6 +4267,8 @@ void clear_huge_page(struct page *page, { int i; unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long vaddr; + int target = (addr - haddr) >> PAGE_SHIFT; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, haddr, pages_per_huge_page); @@ -4257,9 +4276,14 @@ void clear_huge_page(struct page *page, } might_sleep(); + vaddr = haddr; for (i = 0; i < pages_per_huge_page; i++) { cond_resched(); - clear_user_highpage(page + i, haddr + i * PAGE_SIZE); + vaddr = haddr + i*PAGE_SIZE; + if (!ARCH_HAS_USER_NOCACHE || i == target) + clear_user_highpage(page + i, vaddr); + else + clear_user_highpage_nocache(page + i, vaddr); } } -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
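A worked example of the target arithmetic above, assuming x86-64 with 2 MB transparent huge pages (the address is made up for illustration):

	addr   = 0x7f1234567890                            /* faulting user address */
	haddr  = addr & HPAGE_PMD_MASK = 0x7f1234400000    /* start of the huge page */
	target = (addr - haddr) >> PAGE_SHIFT
	       = 0x167890 >> 12 = 0x167 = 359              /* subpage under the fault */

Only subpage 359 is cleared through the cache; the other 511 subpages of the 2 MB region go through clear_user_highpage_nocache(), so the data at the fault address stays cache-hot while the bulk of the clear avoids evicting useful cache lines.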
* Re: [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address 2013-08-16 14:34 ` [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address Alex Thorlton @ 2013-08-16 18:02 ` Dave Hansen 0 siblings, 0 replies; 57+ messages in thread From: Dave Hansen @ 2013-08-16 18:02 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On 08/16/2013 07:34 AM, Alex Thorlton wrote: > +#if ARCH_HAS_USER_NOCACHE == 0 > +#define clear_user_highpage_nocache clear_user_highpage > +#endif ... > cond_resched(); > - clear_user_highpage(p, addr + i * PAGE_SIZE); > + vaddr = haddr + i*PAGE_SIZE; > + if (!ARCH_HAS_USER_NOCACHE || i == target) > + clear_user_highpage(p, vaddr); > + else > + clear_user_highpage_nocache(p, vaddr); > } > } Is the check for !ARCH_HAS_USER_NOCACHE in there really necessary? That logic seems like it would be taken care of by that #define. Plus, if you don't check 'ARCH_HAS_USER_NOCACHE' like this, you don't need to define a value for it, and you can axe the: +#ifndef ARCH_HAS_USER_NOCACHE +#define ARCH_HAS_USER_NOCACHE 0 +#endif ^ permalink raw reply [flat|nested] 57+ messages in thread
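With the fallback #define alone doing the work, the loop Dave is commenting on would reduce to something like this (a sketch of his suggestion, not a posted revision):

	for (i = 0; i < pages_per_huge_page; i++) {
		cond_resched();
		vaddr = haddr + i * PAGE_SIZE;
		if (i == target)
			clear_user_highpage(page + i, vaddr);
		else
			clear_user_highpage_nocache(page + i, vaddr);
	}

On architectures without a nocache implementation, clear_user_highpage_nocache is #defined to clear_user_highpage, so every subpage is cleared through the cache exactly as before and no ARCH_HAS_USER_NOCACHE test is needed inside the loop.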
* [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (4 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 14:34 ` [PATCH 7/8] remove KM_USER0 from kmap_atomic call Alex Thorlton ` (2 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- arch/x86/lib/clear_page_nocache_64.S | 91 ++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 14 deletions(-) diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S index ee16d15..a6d938c 100644 --- a/arch/x86/lib/clear_page_nocache_64.S +++ b/arch/x86/lib/clear_page_nocache_64.S @@ -1,29 +1,92 @@ +/* + * Clear pages with cache bypass. + * + * Copyright (C) 2011, 2012 Intel Corporation + * Author: Andi Kleen + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + */ + #include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/cpufeature.h> #include <asm/dwarf2.h> +#define SSE_UNROLL 128 + /* * Zero a page avoiding the caches * rdi page */ ENTRY(clear_page_nocache) CFI_STARTPROC - xorl %eax,%eax - movl $4096/64,%ecx + push %rdi + call kernel_fpu_begin + pop %rdi + sub $16,%rsp + CFI_ADJUST_CFA_OFFSET 16 + movdqu %xmm0,(%rsp) + xorpd %xmm0,%xmm0 + movl $4096/SSE_UNROLL,%ecx .p2align 4 .Lloop: decl %ecx -#define PUT(x) movnti %rax,x*8(%rdi) - movnti %rax,(%rdi) - PUT(1) - PUT(2) - PUT(3) - PUT(4) - PUT(5) - PUT(6) - PUT(7) - leaq 64(%rdi),%rdi + .set x,0 + .rept SSE_UNROLL/16 + movntdq %xmm0,x(%rdi) + .set x,x+16 + .endr + leaq SSE_UNROLL(%rdi),%rdi jnz .Lloop - nop - ret + movdqu (%rsp),%xmm0 + addq $16,%rsp + CFI_ADJUST_CFA_OFFSET -16 + jmp kernel_fpu_end CFI_ENDPROC ENDPROC(clear_page_nocache) + +#ifdef CONFIG_AS_AVX + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_nocache_avx - clear_page_nocache) - (2f - 1b) + /* offset */ +2: + .previous + .section .altinstructions,"a" + altinstruction_entry clear_page_nocache,1b,X86_FEATURE_AVX,\ + 16, 2b-1b + .previous + +#define AVX_UNROLL 256 /* TUNE ME */ + +ENTRY(clear_page_nocache_avx) + CFI_STARTPROC + push %rdi + call kernel_fpu_begin + pop %rdi + sub $32,%rsp + CFI_ADJUST_CFA_OFFSET 32 + vmovdqu %ymm0,(%rsp) + vxorpd %ymm0,%ymm0,%ymm0 + movl $4096/AVX_UNROLL,%ecx + .p2align 4 +.Lloop_avx: + decl %ecx + .set x,0 + .rept AVX_UNROLL/32 + vmovntdq %ymm0,x(%rdi) + .set x,x+32 + .endr + leaq AVX_UNROLL(%rdi),%rdi + jnz .Lloop_avx + vmovdqu (%rsp),%ymm0 + addq $32,%rsp + CFI_ADJUST_CFA_OFFSET -32 + jmp kernel_fpu_end + CFI_ENDPROC +ENDPROC(clear_page_nocache_avx) + +#endif -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
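For readers who do not want to trace the assembly: the movntdq/vmovntdq loops above are plain non-temporal stores that bypass the cache hierarchy. A rough userspace analogue of the SSE path, written with compiler intrinsics, purely illustrative and not how the kernel does it (in the kernel the FPU state must also be saved and restored, which is what the kernel_fpu_begin/end calls are for):

	#include <emmintrin.h>
	#include <stddef.h>

	/* buf must be 16-byte aligned and len a multiple of 64 */
	static void clear_nocache(void *buf, size_t len)
	{
		__m128i zero = _mm_setzero_si128();
		char *p = buf;
		size_t off;

		for (off = 0; off < len; off += 64) {
			_mm_stream_si128((__m128i *)(p + off +  0), zero);
			_mm_stream_si128((__m128i *)(p + off + 16), zero);
			_mm_stream_si128((__m128i *)(p + off + 32), zero);
			_mm_stream_si128((__m128i *)(p + off + 48), zero);
		}
		_mm_sfence();	/* order the non-temporal stores before later accesses */
	}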
* [PATCH 7/8] remove KM_USER0 from kmap_atomic call 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (5 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2 Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 14:34 ` [PATCH 8/8] fix up references to kernel_fpu_begin/end Alex Thorlton 2013-08-16 14:47 ` [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp Peter Zijlstra 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a088ed7..d3f6dc7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1233,7 +1233,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) void clear_user_highpage_nocache(struct page *page, unsigned long vaddr) { - void *p = kmap_atomic(page, KM_USER0); + void *p = kmap_atomic(page); clear_page_nocache(p); kunmap_atomic(p); } -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 8/8] fix up references to kernel_fpu_begin/end 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (6 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 7/8] remove KM_USER0 from kmap_atomic call Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 14:47 ` [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp Peter Zijlstra 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- arch/x86/lib/clear_page_nocache_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S index a6d938c..d03408e 100644 --- a/arch/x86/lib/clear_page_nocache_64.S +++ b/arch/x86/lib/clear_page_nocache_64.S @@ -23,7 +23,7 @@ ENTRY(clear_page_nocache) CFI_STARTPROC push %rdi - call kernel_fpu_begin + call __kernel_fpu_begin pop %rdi sub $16,%rsp CFI_ADJUST_CFA_OFFSET 16 @@ -43,7 +43,7 @@ ENTRY(clear_page_nocache) movdqu (%rsp),%xmm0 addq $16,%rsp CFI_ADJUST_CFA_OFFSET -16 - jmp kernel_fpu_end + jmp __kernel_fpu_end CFI_ENDPROC ENDPROC(clear_page_nocache) @@ -65,7 +65,7 @@ ENDPROC(clear_page_nocache) ENTRY(clear_page_nocache_avx) CFI_STARTPROC push %rdi - call kernel_fpu_begin + call __kernel_fpu_begin pop %rdi sub $32,%rsp CFI_ADJUST_CFA_OFFSET 32 @@ -85,7 +85,7 @@ ENTRY(clear_page_nocache_avx) vmovdqu (%rsp),%ymm0 addq $32,%rsp CFI_ADJUST_CFA_OFFSET -32 - jmp kernel_fpu_end + jmp __kernel_fpu_end CFI_ENDPROC ENDPROC(clear_page_nocache_avx) -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* Re: [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (7 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 8/8] fix up references to kernel_fpu_begin/end Alex Thorlton @ 2013-08-16 14:47 ` Peter Zijlstra 2013-08-16 15:04 ` Alex Thorlton 8 siblings, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-08-16 14:47 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 16, 2013 at 09:33:56AM -0500, Alex Thorlton wrote: > This first set of tests was run on the latest community kernel, with the > vclear patches: > > Kernel string: Kernel 3.11.0-rc5-medusa-00021-g1a15a96-dirty > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled > [always] madvise never > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh > ... > Done. Terminating the simulation. > > real 25m34.052s > user 10769m7.948s > sys 37m46.524s > > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# echo never > /sys/kernel/mm/transparent_hugepage/enabled > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled > always madvise [never] > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh > ... > Done. Terminating the simulation. > > real 5m0.377s > user 2202m0.684s > sys 108m31.816s > This 321.equake_l thing is not public, right? Do you have anything that is public that shows the same problem? Do you have any idea where all the extra time is spend? ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp 2013-08-16 14:47 ` [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp Peter Zijlstra @ 2013-08-16 15:04 ` Alex Thorlton 0 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 15:04 UTC (permalink / raw) To: Peter Zijlstra Cc: linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > This 321.equake_l thing is not public, right? Do you have anything > that > is public that shows the same problem? The quake test comes from the SPEC OMP benchmarks. While I believe this suite is available to anybody, I don't think it's free. I was given the test by our benchmarking team, I'll have to ask them if this is publicly available/if they know of any publicly available tests that can reproduce the issue. > Do you have any idea where all the extra time is spend? I'm working on that as we speak. I was trying to eliminate the possibility that the vclear patches would help solve the problem, before digging any deeper. I should have some more information on exactly where it's getting hung up in a few hours. - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton 2013-08-02 19:53 ` Dave Jones 2013-08-02 20:13 ` Kirill A. Shutemov @ 2013-08-04 14:44 ` Rasmus Villemoes 2013-08-28 13:56 ` Andrea Arcangeli 3 siblings, 0 replies; 57+ messages in thread From: Rasmus Villemoes @ 2013-08-04 14:44 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Alex Thorlton <athorlton@sgi.com> writes: > This patch implements functionality to allow processes to disable the use of > transparent hugepages through the prctl syscall. A few comments: Is there a reason it shouldn't be possible for a process to un-disable/reenable thp? > +static inline int transparent_hugepage_enabled(struct vm_area_struct *vma) > +{ > + return !current->thp_disabled & _transparent_hugepage_enabled(vma); > +} Should probably be &&. > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 50d04b9..f084c76 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1406,6 +1406,9 @@ struct task_struct { > unsigned int sequential_io; > unsigned int sequential_io_avg; > #endif > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + int thp_disabled; > +#endif > }; Is there a reason this needs a whole int in task_struct and not just a single bit? Rasmus ^ permalink raw reply [flat|nested] 57+ messages in thread
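Folding in Rasmus's two code comments, a logical && instead of the bitwise &, and a single bit in task_struct instead of a full int, would give roughly the following; this is a sketch of the suggested fix, not a posted v2:

	/* include/linux/sched.h */
	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		unsigned thp_disabled:1;
	#endif

	/* include/linux/huge_mm.h */
	static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
	{
		return !current->thp_disabled && _transparent_hugepage_enabled(vma);
	}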
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton ` (2 preceding siblings ...) 2013-08-04 14:44 ` Rasmus Villemoes @ 2013-08-28 13:56 ` Andrea Arcangeli 3 siblings, 0 replies; 57+ messages in thread From: Andrea Arcangeli @ 2013-08-28 13:56 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Hi everyone, On Fri, Aug 02, 2013 at 02:46:59PM -0500, Alex Thorlton wrote: > This patch implements functionality to allow processes to disable the use of > transparent hugepages through the prctl syscall. > > We've determined that some jobs perform significantly better with thp disabled, > and we need a way to control thp on a per-process basis, without relying on > madvise. Are you using automatic NUMA balancing or CPU pinning or was this a short lived computation? If you're not using automatic NUMA balancing or if you're using automatic NUMA balancing and this is a fairly short lived computation, zone_reclaim_mode was broken with THP enabled as it never called into compaction, so with THP enabled you would have a random NUMA placement during page fault to satisfy hugepage allocation (first priority was to get any hugepage from any wrong node, only then it would call compaction). Did you verify the numa placement that you were getting? I fixed the zone_reclaim_mode for THP and posted the fixes on linux-mm, so you may want to try that fix just in case it helps your workload. Chances are you work with a big system where zone_reclaim_mode is enabled by default, and for a test I would suggest you to enable it once even if it's not enabled by default (after applying the fixes) to see if it makes any difference. I just did a push of my latest tree to the master branch of aa.git on kernel.org, so you can just pull that for a quick test with zone_reclaim_mode enabled. NOTE: it's not guaranteed to help but it's worth a try. As you pointed out if the threads are working on separate 4k fragments, it won't help. But as things stands now THP enabled would breaks zone_reclaim_mode and so without hard memory pinning it wouldn't perform as good as with THP disabled. ^ permalink raw reply [flat|nested] 57+ messages in thread
Thread overview: 57+ messages
2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton
2013-08-02 19:53 ` Dave Jones
2013-08-02 20:00 ` Alex Thorlton
2013-08-02 20:13 ` Kirill A. Shutemov
2013-08-02 20:34 ` Alex Thorlton
2013-08-02 23:59 ` Kirill A. Shutemov
2013-08-03 19:35 ` Kees Cook
2013-08-04 14:19 ` Oleg Nesterov
2013-08-05 2:36 ` Andi Kleen
2013-08-05 15:07 ` Alex Thorlton
2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton
2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton
2013-08-16 17:53 ` Dave Hansen
2013-08-16 18:17 ` Alex Thorlton
2013-08-16 18:52 ` Kirill A. Shutemov
2013-08-27 16:50 ` Alex Thorlton
2013-08-27 17:01 ` Robin Holt
2013-09-04 15:43 ` Alex Thorlton
2013-09-04 17:15 ` Alex Thorlton
2013-09-05 11:15 ` Ingo Molnar
2013-09-09 16:48 ` Alex Thorlton
2013-09-10 7:47 ` [benchmark] THP performance testcase Ingo Molnar
2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov
2013-09-13 15:20 ` Dave Hansen
2013-09-13 13:06 ` [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 3/9] mm: introduce api for split page table lock for PMD level Kirill A. Shutemov
2013-09-13 13:19 ` Peter Zijlstra
2013-09-13 14:22 ` Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd() Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 7/9] mm: convent the rest to new page table lock api Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov
2013-09-13 13:24 ` Peter Zijlstra
2013-09-13 14:25 ` Kirill A. Shutemov
2013-09-13 14:52 ` Peter Zijlstra
2013-09-13 13:36 ` Peter Zijlstra
2013-09-13 14:25 ` Kirill A. Shutemov
2013-09-13 15:45 ` Naoya Horiguchi
2013-09-13 19:57 ` Dave Hansen
2013-09-13 13:06 ` [PATCH 9/9] x86, mm: enable " Kirill A. Shutemov
[not found] ` <20130828091814.GA13681@gmail.com>
2013-08-28 9:32 ` [PATCH 1/8] THP: Use real address for NUMA policy Peter Zijlstra
2013-08-16 19:46 ` Peter Zijlstra
2013-08-16 19:49 ` Alex Thorlton
2013-08-16 14:33 ` [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address Alex Thorlton
2013-08-16 14:33 ` [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page Alex Thorlton
2013-08-16 14:34 ` [PATCH 4/8] x86: Add clear_page_nocache Alex Thorlton
2013-08-16 14:34 ` [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address Alex Thorlton
2013-08-16 18:02 ` Dave Hansen
2013-08-16 14:34 ` [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2 Alex Thorlton
2013-08-16 14:34 ` [PATCH 7/8] remove KM_USER0 from kmap_atomic call Alex Thorlton
2013-08-16 14:34 ` [PATCH 8/8] fix up references to kernel_fpu_begin/end Alex Thorlton
2013-08-16 14:47 ` [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp Peter Zijlstra
2013-08-16 15:04 ` Alex Thorlton
2013-08-04 14:44 ` Rasmus Villemoes
2013-08-28 13:56 ` Andrea Arcangeli