* [PATCH] Add per-process flag to control thp
@ 2013-08-02 19:46 Alex Thorlton
2013-08-02 19:53 ` Dave Jones
` (3 more replies)
0 siblings, 4 replies; 57+ messages in thread
From: Alex Thorlton @ 2013-08-02 19:46 UTC (permalink / raw)
To: linux-kernel
Cc: Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman,
Kirill A. Shutemov, Rik van Riel, Johannes Weiner,
Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones,
Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner,
Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt,
linux-kernel
This patch adds two prctl options, PR_SET_THP_DISABLED and PR_GET_THP_DISABLED,
which allow a process to disable the use of transparent hugepages for itself and
for any children it forks afterwards.
We've determined that some jobs perform significantly better with THP disabled,
and we need a way to control THP on a per-process basis without relying on
madvise, which is not an option for static binaries or for legacy code that we
cannot modify.
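For reference, a minimal (untested) userland sketch of how the new options would
be used; PR_SET_THP_DISABLED takes no argument and simply flags the calling task,
and the flag is inherited by any children forked afterwards:

/* thp_disable.c - sketch only; assumes a kernel carrying this patch */
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>

#ifndef PR_SET_THP_DISABLED
#define PR_SET_THP_DISABLED 41	/* values proposed by this patch */
#define PR_GET_THP_DISABLED 42
#endif

int main(int argc, char **argv)
{
	int thp_disabled = -1;

	/* mark this task (and its future children) as not wanting THP */
	if (prctl(PR_SET_THP_DISABLED, 0, 0, 0, 0))
		perror("PR_SET_THP_DISABLED");

	if (prctl(PR_GET_THP_DISABLED, &thp_disabled, 0, 0, 0))
		perror("PR_GET_THP_DISABLED");
	printf("thp_disabled = %d\n", thp_disabled);

	/* exec the real workload with the flag already set */
	if (argc > 1)
		execvp(argv[1], argv + 1);
	return 0;
}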
---
include/linux/huge_mm.h | 14 +++++++++++++-
include/linux/init_task.h | 8 ++++++++
include/linux/sched.h | 3 +++
include/uapi/linux/prctl.h | 3 +++
kernel/fork.c | 4 ++++
kernel/sys.c | 31 +++++++++++++++++++++++++++++++
6 files changed, 62 insertions(+), 1 deletion(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b60de92..53af3ca 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,6 +1,8 @@
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H
+#include <linux/sched.h>
+
extern int do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
@@ -66,7 +68,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,
extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
-#define transparent_hugepage_enabled(__vma) \
+#define _transparent_hugepage_enabled(__vma) \
((transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_FLAG) || \
(transparent_hugepage_flags & \
@@ -177,6 +179,11 @@ static inline struct page *compound_trans_head(struct page *page)
return page;
}
+static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ return !current->thp_disabled && _transparent_hugepage_enabled(vma);
+}
+
extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp);
@@ -230,6 +237,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str
return 0;
}
+static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ return _transparent_hugepage_enabled(vma);
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5cd0f09..aae74fd 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -152,6 +152,13 @@ extern struct task_group root_task_group;
# define INIT_VTIME(tsk)
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+# define INIT_THP_DISABLED \
+ .thp_disabled = 0,
+#else
+# define INIT_THP_DISABLED
+#endif
+
#define INIT_TASK_COMM "swapper"
/*
@@ -222,6 +229,7 @@ extern struct task_group root_task_group;
INIT_TASK_RCU_PREEMPT(tsk) \
INIT_CPUSET_SEQ \
INIT_VTIME(tsk) \
+ INIT_THP_DISABLED \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 50d04b9..f084c76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1406,6 +1406,9 @@ struct task_struct {
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int thp_disabled;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 289760f..f69780d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -149,4 +149,7 @@
#define PR_GET_TID_ADDRESS 40
+#define PR_SET_THP_DISABLED 41
+#define PR_GET_THP_DISABLED 42
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 403d2bb..0b4afb5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1311,6 +1311,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->sequential_io_avg = 0;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ p->thp_disabled = current->thp_disabled;
+#endif
+
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 771129b..416c8a6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1836,6 +1836,31 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int prctl_set_thp_disabled(struct task_struct *me)
+{
+ me->thp_disabled = 1;
+ return 0;
+}
+
+static int prctl_get_thp_disabled(struct task_struct *me,
+ int __user *thp_disabled)
+{
+ return put_user(me->thp_disabled, thp_disabled);
+}
+#else
+static int prctl_set_thp_disabled(struct task_struct *me)
+{
+ return -EINVAL;
+}
+
+static int prctl_get_thp_disabled(struct task_struct *me,
+ int __user *thp_disabled)
+{
+ return -EINVAL;
+}
+#endif
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -1999,6 +2024,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
return current->no_new_privs ? 1 : 0;
+ case PR_SET_THP_DISABLED:
+ error = prctl_set_thp_disabled(me);
+ break;
+ case PR_GET_THP_DISABLED:
+ error = prctl_get_thp_disabled(me, (int __user *) arg2);
+ break;
default:
error = -EINVAL;
break;
--
1.7.12.4
^ permalink raw reply related [flat|nested] 57+ messages in thread* Re: [PATCH] Add per-process flag to control thp 2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton @ 2013-08-02 19:53 ` Dave Jones 2013-08-02 20:00 ` Alex Thorlton 2013-08-02 20:13 ` Kirill A. Shutemov ` (2 subsequent siblings) 3 siblings, 1 reply; 57+ messages in thread From: Dave Jones @ 2013-08-02 19:53 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 02, 2013 at 02:46:59PM -0500, Alex Thorlton wrote: > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 50d04b9..f084c76 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1406,6 +1406,9 @@ struct task_struct { > unsigned int sequential_io; > unsigned int sequential_io_avg; > #endif > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + int thp_disabled; > +#endif > }; Instead of blowing a whole int on this bool, we could add it to the bitfield a few pages up where we have other prctl things like.. unsigned no_new_privs:1; Dave ^ permalink raw reply [flat|nested] 57+ messages in thread
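A toy userspace illustration (not kernel code; the structs below are purely
hypothetical) of the space point Dave makes above -- folding the flag into an
existing bitfield word costs nothing, while a separate int grows the struct:

/* bitfield_demo.c - standalone sketch, field names only mirror task_struct */
#include <stdio.h>

struct with_int {
	unsigned no_new_privs:1;
	int thp_disabled;		/* what the patch adds */
};

struct with_bit {
	unsigned no_new_privs:1;
	unsigned thp_disabled:1;	/* what Dave suggests */
};

int main(void)
{
	printf("separate int:  %zu bytes\n", sizeof(struct with_int));
	printf("bitfield flag: %zu bytes\n", sizeof(struct with_bit));
	return 0;
}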
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 19:53 ` Dave Jones @ 2013-08-02 20:00 ` Alex Thorlton 0 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-02 20:00 UTC (permalink / raw) To: Dave Jones, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > Instead of blowing a whole int on this bool, we could add it > to the bitfield a few pages up where we have other prctl things like.. > > unsigned no_new_privs:1; Definitely a better way to go. I'll tweak that and float another version of the patch. Thanks for the input, Dave! ^ permalink raw reply [flat|nested] 57+ messages in thread
* RE: [PATCH] Add per-process flag to control thp 2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton 2013-08-02 19:53 ` Dave Jones @ 2013-08-02 20:13 ` Kirill A. Shutemov 2013-08-02 20:34 ` Alex Thorlton 2013-08-04 14:44 ` Rasmus Villemoes 2013-08-28 13:56 ` Andrea Arcangeli 3 siblings, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-08-02 20:13 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt, linux-kernel Alex Thorlton wrote: > This patch implements functionality to allow processes to disable the use of > transparent hugepages through the prctl syscall. > > We've determined that some jobs perform significantly better with thp disabled, > and we need a way to control thp on a per-process basis, without relying on > madvise. What kind of workloads are you talking about? What's wrong with madvise? Could you elaborate? And I think thp_disabled should be reset to 0 on exec. -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 20:13 ` Kirill A. Shutemov @ 2013-08-02 20:34 ` Alex Thorlton 2013-08-02 23:59 ` Kirill A. Shutemov ` (2 more replies) 0 siblings, 3 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-02 20:34 UTC (permalink / raw) To: Kirill A. Shutemov Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > What kind of workloads are you talking about? Our benchmarking team has a list of several of the SPEC OMP benchmarks that perform significantly better when THP is disabled. I tried to get the list but one of our servers is acting up and I can't get to it right now :/ > What's wrong with madvise? Could you elaborate? The main issue with using madvise is that it's not an option with static binaries, but there are also some users who have legacy Fortran code that they're not willing/able to change. > And I think thp_disabled should be reset to 0 on exec. The main purpose for this getting carried down from the parent process is that we'd like to be able to have a userland program set this flag on itself, and then spawn off children who will also carry the flag. This allows us to set the flag for programs where we're unable to modify the code, thus resolving the issues detailed above. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 20:34 ` Alex Thorlton @ 2013-08-02 23:59 ` Kirill A. Shutemov 2013-08-03 19:35 ` Kees Cook 2013-08-05 2:36 ` Andi Kleen 2 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-08-02 23:59 UTC (permalink / raw) To: Alex Thorlton Cc: Kirill A. Shutemov, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Alex Thorlton wrote: > > What kind of workloads are you talking about? > > Our benchmarking team has a list of several of the SPEC OMP benchmarks > that perform significantly better when THP is disabled. I tried to get > the list but one of our servers is acting up and I can't get to it > right now :/ Have you track down the reason why it's slow tih THP? -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 20:34 ` Alex Thorlton 2013-08-02 23:59 ` Kirill A. Shutemov @ 2013-08-03 19:35 ` Kees Cook 2013-08-04 14:19 ` Oleg Nesterov 2013-08-05 2:36 ` Andi Kleen 2 siblings, 1 reply; 57+ messages in thread From: Kees Cook @ 2013-08-03 19:35 UTC (permalink / raw) To: Alex Thorlton Cc: Kirill A. Shutemov, LKML, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Robin Holt On Fri, Aug 2, 2013 at 1:34 PM, Alex Thorlton <athorlton@sgi.com> wrote: >> What kind of workloads are you talking about? > > Our benchmarking team has a list of several of the SPEC OMP benchmarks > that perform significantly better when THP is disabled. I tried to get > the list but one of our servers is acting up and I can't get to it > right now :/ > >> What's wrong with madvise? Could you elaborate? > > The main issue with using madvise is that it's not an option with static > binaries, but there are also some users who have legacy Fortran code > that they're not willing/able to change. > >> And I think thp_disabled should be reset to 0 on exec. > > The main purpose for this getting carried down from the parent process > is that we'd like to be able to have a userland program set this flag on > itself, and then spawn off children who will also carry the flag. > This allows us to set the flag for programs where we're unable to modify > the code, thus resolving the issues detailed above. This could be done with LD_PRELOAD for uncontrolled binaries. Though it does seem sensible to make it act like most personality flags do (i.e. inherited). -Kees -- Kees Cook Chrome OS Security ^ permalink raw reply [flat|nested] 57+ messages in thread
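A sketch of the LD_PRELOAD approach Kees describes, assuming a kernel with this
patch applied; note it still cannot cover static binaries, which is the case
Alex raises above:

/* thp_off_preload.c - sketch only
 * build: gcc -shared -fPIC -o thp_off.so thp_off_preload.c
 * use:   LD_PRELOAD=./thp_off.so ./legacy_binary
 */
#include <sys/prctl.h>

#ifndef PR_SET_THP_DISABLED
#define PR_SET_THP_DISABLED 41	/* value proposed in this thread, not mainline */
#endif

__attribute__((constructor))
static void disable_thp(void)
{
	prctl(PR_SET_THP_DISABLED, 0, 0, 0, 0);
}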
* Re: [PATCH] Add per-process flag to control thp 2013-08-03 19:35 ` Kees Cook @ 2013-08-04 14:19 ` Oleg Nesterov 0 siblings, 0 replies; 57+ messages in thread From: Oleg Nesterov @ 2013-08-04 14:19 UTC (permalink / raw) To: Kees Cook Cc: Alex Thorlton, Kirill A. Shutemov, LKML, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Srikar Dronamraju, Robin Holt On 08/03, Kees Cook wrote: > > On Fri, Aug 2, 2013 at 1:34 PM, Alex Thorlton <athorlton@sgi.com> wrote: > > > >> And I think thp_disabled should be reset to 0 on exec. > > > > The main purpose for this getting carried down from the parent process > > is that we'd like to be able to have a userland program set this flag on > > itself, and then spawn off children who will also carry the flag. > > This allows us to set the flag for programs where we're unable to modify > > the code, thus resolving the issues detailed above. > > This could be done with LD_PRELOAD for uncontrolled binaries. Though > it does seem sensible to make it act like most personality flags do > (i.e. inherited). It is only inheritable if the process is single-threaded. And even if it is single-threaded I can't understand how it can really help. OK, transparent_hugepage_enabled() checks ->thp_disabled, but this is the fault path. But this flag can't hide the task from khugepaged and collapse_huge_page ? I still think this should be mm flag. And we have MMF_INIT_MASK, it can be inheritable. Oleg. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 20:34 ` Alex Thorlton 2013-08-02 23:59 ` Kirill A. Shutemov 2013-08-03 19:35 ` Kees Cook @ 2013-08-05 2:36 ` Andi Kleen 2013-08-05 15:07 ` Alex Thorlton 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton 2 siblings, 2 replies; 57+ messages in thread From: Andi Kleen @ 2013-08-05 2:36 UTC (permalink / raw) To: Alex Thorlton Cc: Kirill A. Shutemov, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Alex Thorlton <athorlton@sgi.com> writes: >> What kind of workloads are you talking about? > > Our benchmarking team has a list of several of the SPEC OMP benchmarks > that perform significantly better when THP is disabled. I tried to get > the list but one of our servers is acting up and I can't get to it > right now :/ Please try it with the vclear patches. https://lkml.org/lkml/2012/7/20/165 et.al. -Andi -- ak@linux.intel.com -- Speaking for myself only ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-05 2:36 ` Andi Kleen @ 2013-08-05 15:07 ` Alex Thorlton 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton 1 sibling, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-05 15:07 UTC (permalink / raw) To: Andi Kleen Cc: Kirill A. Shutemov, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > Please try it with the vclear patches. Thanks, Andi. I'll give that a shot and see if it makes any difference. - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp 2013-08-05 2:36 ` Andi Kleen 2013-08-05 15:07 ` Alex Thorlton @ 2013-08-16 14:33 ` Alex Thorlton 2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton ` (8 more replies) 1 sibling, 9 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:33 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Here are the results from one of the benchmarks that performs particularly poorly when thp is enabled. Unfortunately the vclear patches don't seem to provide a performance boost. I've attached the patches that include the changes I had to make to get the vclear patches applied to the latest kernel. This first set of tests was run on the latest community kernel, with the vclear patches: Kernel string: Kernel 3.11.0-rc5-medusa-00021-g1a15a96-dirty harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled [always] madvise never harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh ... Done. Terminating the simulation. real 25m34.052s user 10769m7.948s sys 37m46.524s harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# echo never > /sys/kernel/mm/transparent_hugepage/enabled harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled always madvise [never] harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh ... Done. Terminating the simulation. real 5m0.377s user 2202m0.684s sys 108m31.816s Here are the same tests on the clean kernel: Kernel string: Kernel 3.11.0-rc5-medusa-00013-g584d88b Kernel string: Kernel 3.11.0-rc5-medusa-00013-g584d88b athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> cat /sys/kernel/mm/transparent_hugepage/enabled [always] madvise never athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> time ./run.sh ... Done. Terminating the simulation. real 21m44.052s user 10809m55.356s sys 39m58.300s harp31-sys:~ # echo never > /sys/kernel/mm/transparent_hugepage/enabled athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> cat /sys/kernel/mm/transparent_hugepage/enabled always madvise [never] athorlton@harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l> time ./run.sh ... Done. Terminating the simulation. real 4m52.502s user 2127m18.548s sys 104m50.828s Working on getting some more information about the root of the performance issues now... 
Alex Thorlton (8): THP: Use real address for NUMA policy mm: make clear_huge_page tolerate non aligned address THP: Pass real, not rounded, address to clear_huge_page x86: Add clear_page_nocache mm: make clear_huge_page cache clear only around the fault address x86: switch the 64bit uncached page clear to SSE/AVX v2 remove KM_USER0 from kmap_atomic call fix up references to kernel_fpu_begin/end arch/x86/include/asm/page.h | 2 + arch/x86/include/asm/string_32.h | 5 ++ arch/x86/include/asm/string_64.h | 5 ++ arch/x86/lib/Makefile | 1 + arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++ arch/x86/lib/clear_page_nocache_64.S | 92 ++++++++++++++++++++++++++++++++++++ arch/x86/mm/fault.c | 7 +++ mm/huge_memory.c | 17 +++---- mm/memory.c | 31 ++++++++++-- 9 files changed, 179 insertions(+), 11 deletions(-) create mode 100644 arch/x86/lib/clear_page_nocache_32.S create mode 100644 arch/x86/lib/clear_page_nocache_64.S -- 1.7.12.4 ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton @ 2013-08-16 14:33 ` Alex Thorlton 2013-08-16 17:53 ` Dave Hansen 2013-08-16 14:33 ` [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address Alex Thorlton ` (7 subsequent siblings) 8 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:33 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- mm/huge_memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a92012a..55ec681 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -746,11 +746,11 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr, int nd, + unsigned long address, int nd, gfp_t extra_gfp) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), - HPAGE_PMD_ORDER, vma, haddr, nd); + HPAGE_PMD_ORDER, vma, address, nd); } #ifndef CONFIG_NUMA @@ -815,7 +815,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + vma, address, numa_node_id(), 0); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1160,7 +1160,7 @@ alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + vma, address, numa_node_id(), 0); else new_page = NULL; -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton @ 2013-08-16 17:53 ` Dave Hansen 2013-08-16 18:17 ` Alex Thorlton 0 siblings, 1 reply; 57+ messages in thread From: Dave Hansen @ 2013-08-16 17:53 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On 08/16/2013 07:33 AM, Alex Thorlton wrote: > --- > mm/huge_memory.c | 8 ++++---- > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index a92012a..55ec681 100644 Could you add some actual descriptions to these patches that say why you are doing this, and why this particular patch is needed and implemented this way? You mention that THP is slow for you, then go on to implement some non-cached page zero'ing, but you never quite connect the dots. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 17:53 ` Dave Hansen @ 2013-08-16 18:17 ` Alex Thorlton 2013-08-16 18:52 ` Kirill A. Shutemov 2013-08-16 19:46 ` Peter Zijlstra 0 siblings, 2 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 18:17 UTC (permalink / raw) To: Dave Hansen Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > Could you add some actual descriptions to these patches that say why you > are doing this, and why this particular patch is needed and implemented > this way? > > You mention that THP is slow for you, then go on to implement some > non-cached page zero'ing, but you never quite connect the dots. I actually didn't write these patches (made a few tweaks to get them running on the latest kernel though). They were submitted last July by Peter Zijlstra. Andi Kleen suggested that I re-run some of my tests using these patches to see whether they solved my issue. I just included my updated patches so that people could confirm that I'd pulled them forward properly. The messages from the original submission can be found here: https://lkml.org/lkml/2012/7/20/165 I did write the patch that these were submitted in response to, to control THP on a per-process basis, but it looks like we're probably going to end up taking this in a different direction, pending some more test results. - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 18:17 ` Alex Thorlton @ 2013-08-16 18:52 ` Kirill A. Shutemov 2013-08-27 16:50 ` Alex Thorlton 2013-08-16 19:46 ` Peter Zijlstra 1 sibling, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-08-16 18:52 UTC (permalink / raw) To: Alex Thorlton Cc: Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote: > > Could you add some actual descriptions to these patches that say why you > > are doing this, and why this particular patch is needed and implemented > > this way? > > > > You mention that THP is slow for you, then go on to implement some > > non-cached page zero'ing, but you never quite connect the dots. > > I actually didn't write these patches (made a few tweaks to get them > running on the latest kernel though). They were submitted last July by > Peter Zijlstra. Andi Kleen suggested that I re-run some of my tests > using these patches to see whether they solved my issue. I just > included my updated patches so that people could confirm that I'd pulled > them forward properly. > > The messages from the original submission can be found here: > https://lkml.org/lkml/2012/7/20/165 Here's more up-to-date version: https://lkml.org/lkml/2012/8/20/337 -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 18:52 ` Kirill A. Shutemov @ 2013-08-27 16:50 ` Alex Thorlton 2013-08-27 17:01 ` Robin Holt [not found] ` <20130828091814.GA13681@gmail.com> 0 siblings, 2 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-27 16:50 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > Here's more up-to-date version: https://lkml.org/lkml/2012/8/20/337 These don't seem to give us a noticeable performance change either: With THP: real 22m34.279s user 10797m35.984s sys 39m18.188s Without THP: real 4m48.957s user 2118m23.208s sys 113m12.740s Looks like we got a few minutes faster on the with THP case, but it's still significantly slower, and that could just be a fluke result; we're still floating at about a 5x performance degradation. I talked with one of our performance/benchmarking experts last week and he's done a bit more research into the actual problem here, so I've got a bit more information: The real performance hit, based on our testing, seems to be coming from the increased latency that comes into play on large NUMA systems when a process has to go off-node to read from/write to memory. To give an extreme example, say we have a 16 node system with 8 cores per node. If we have a job that shares a 2MB data structure between 128 threads, with THP on, the first thread to touch the structure will allocate all 2MB of space for that structure in a 2MB page, local to its socket. This means that all the memory accessses for the other 120 threads will be remote acceses. With THP off, each thread could locally allocate a number of 4K pages sufficient to hold the chunk of the structure on which it needs to work, significantly reducing the number of remote accesses that each thread will need to perform. So, with that in mind, do we agree that a per-process tunable (or something similar) to control THP seems like a reasonable method to handle this issue? Just want to confirm that everyone likes this approach before moving forward with another revision of the patch. I'm currently in favor of moving this to a per-mm tunable, since that seems to make more sense when it comes to threaded jobs. Also, a decent chunk of the code I've already written can be reused with this approach, and prctl will still be an appropriate place from which to control the behavior. Andrew Morton suggested possibly controlling this through the ELF header, but I'm going to lean towards the per-mm route unless anyone has a major objection to it. - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-27 16:50 ` Alex Thorlton @ 2013-08-27 17:01 ` Robin Holt 2013-09-04 15:43 ` Alex Thorlton [not found] ` <20130828091814.GA13681@gmail.com> 1 sibling, 1 reply; 57+ messages in thread From: Robin Holt @ 2013-08-27 17:01 UTC (permalink / raw) To: Alex Thorlton Cc: Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook Alex, Although the explanation seems plausible, have you verified this is actually possible? You could make a simple pthread test case which allocates a getpagesize() * <number-of-threads> area, prints its address and then each thread migrate and reference their page. Have the task then sleep(<long-time>) before exit. Look at the physical address space with dlook for those virtual addresses in both the THP and non-THP cases. Thanks, Robin On Tue, Aug 27, 2013 at 11:50 AM, Alex Thorlton <athorlton@sgi.com> wrote: >> Here's more up-to-date version: https://lkml.org/lkml/2012/8/20/337 > > These don't seem to give us a noticeable performance change either: > > With THP: > > real 22m34.279s > user 10797m35.984s > sys 39m18.188s > > Without THP: > > real 4m48.957s > user 2118m23.208s > sys 113m12.740s > > Looks like we got a few minutes faster on the with THP case, but it's > still significantly slower, and that could just be a fluke result; we're > still floating at about a 5x performance degradation. > > I talked with one of our performance/benchmarking experts last week and > he's done a bit more research into the actual problem here, so I've got > a bit more information: > > The real performance hit, based on our testing, seems to be coming from > the increased latency that comes into play on large NUMA systems when a > process has to go off-node to read from/write to memory. > > To give an extreme example, say we have a 16 node system with 8 cores > per node. If we have a job that shares a 2MB data structure between 128 > threads, with THP on, the first thread to touch the structure will > allocate all 2MB of space for that structure in a 2MB page, local to its > socket. This means that all the memory accessses for the other 120 > threads will be remote acceses. With THP off, each thread could locally > allocate a number of 4K pages sufficient to hold the chunk of the > structure on which it needs to work, significantly reducing the number > of remote accesses that each thread will need to perform. > > So, with that in mind, do we agree that a per-process tunable (or > something similar) to control THP seems like a reasonable method to > handle this issue? > > Just want to confirm that everyone likes this approach before moving > forward with another revision of the patch. I'm currently in favor of > moving this to a per-mm tunable, since that seems to make more sense > when it comes to threaded jobs. Also, a decent chunk of the code I've > already written can be reused with this approach, and prctl will still > be an appropriate place from which to control the behavior. Andrew > Morton suggested possibly controlling this through the ELF header, but > I'm going to lean towards the per-mm route unless anyone has a major > objection to it. > > - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-27 17:01 ` Robin Holt @ 2013-09-04 15:43 ` Alex Thorlton 2013-09-04 17:15 ` Alex Thorlton 0 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-09-04 15:43 UTC (permalink / raw) To: Robin Holt Cc: Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook On Tue, Aug 27, 2013 at 12:01:01PM -0500, Robin Holt wrote: > Alex, > > Although the explanation seems plausible, have you verified this is > actually possible? You could make a simple pthread test case which > allocates a getpagesize() * <number-of-threads> area, prints its > address and then each thread migrate and reference their page. Have > the task then sleep(<long-time>) before exit. Look at the physical > address space with dlook for those virtual addresses in both the THP > and non-THP cases. > > Thanks, > Robin Robin, I tweaked one of our other tests to behave pretty much exactly as I described, and I can see a very significant increase in performance with THP turned off. The test behaves as follows: - malloc a large array - Spawn a specified number of threads - Have each thread touch small, evenly spaced chunks of the array (e.g. for 128 threads, the array is divided into 128 chunks, and each thread touches 1/128th of each chunk, dividing the array into 16,384 pieces) With THP off, the majority of each thread's pages are node local. With THP on, most of the pages end up as THPs on the first thread's nodes, since it is touching chunks that are close enough together to be collapsed into THPs which will, of course, remain on the first node for the duration of the test. Here are some timings for 128 threads, allocating a total of 64gb: THP on: real 1m6.394s user 16m1.160s sys 75m25.232s THP off: real 0m35.251s user 26m37.316s sys 3m28.472s The performance hit here isn't as severe as shown with the SPEC workload that we originally used, but it still appears to consistently take about twice as long with THP enabled. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-09-04 15:43 ` Alex Thorlton @ 2013-09-04 17:15 ` Alex Thorlton 2013-09-05 11:15 ` Ingo Molnar 0 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-09-04 17:15 UTC (permalink / raw) To: Robin Holt Cc: Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook > Robin, > > I tweaked one of our other tests to behave pretty much exactly as I > - malloc a large array > - Spawn a specified number of threads > - Have each thread touch small, evenly spaced chunks of the array (e.g. > for 128 threads, the array is divided into 128 chunks, and each thread > touches 1/128th of each chunk, dividing the array into 16,384 pieces) Forgot to mention that the threads don't touch their chunks of memory concurrently, i.e. thread 2 has to wait for thread 1 to finish first. This is important to note, since the pages won't all get stuck on the first node without this behavior. ^ permalink raw reply [flat|nested] 57+ messages in thread
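A rough standalone sketch of that access pattern (not the actual thp_pthread
test Alex posts later in the thread): one large malloc'd region, with threads
taking turns touching their evenly spaced slices, so that with THP enabled the
first toucher's node ends up backing most of the huge pages:

/* serial_touch.c - illustrative sketch only
 * build: gcc -O2 -pthread serial_touch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define NTHREADS 8
#define TOTAL_SZ (1UL << 30)	/* 1 GiB here; the real runs used 64-128 GiB */

static char *region;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static long turn;		/* thread i waits until thread i-1 is done */

static void *worker(void *arg)
{
	long id = (long)arg;
	size_t chunk = TOTAL_SZ / NTHREADS;	/* region split into NTHREADS chunks */
	size_t slice = chunk / NTHREADS;	/* each thread touches 1/NTHREADS of every chunk */

	pthread_mutex_lock(&lock);
	while (turn != id)
		pthread_cond_wait(&cond, &lock);

	for (long c = 0; c < NTHREADS; c++)
		memset(region + c * chunk + id * slice, 1, slice);

	turn++;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];

	region = malloc(TOTAL_SZ);
	if (!region)
		return 1;

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	for (long i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);

	sleep(600);	/* leave time to inspect physical placement, per Robin's suggestion */
	return 0;
}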
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-09-04 17:15 ` Alex Thorlton @ 2013-09-05 11:15 ` Ingo Molnar 2013-09-09 16:48 ` Alex Thorlton 0 siblings, 1 reply; 57+ messages in thread From: Ingo Molnar @ 2013-09-05 11:15 UTC (permalink / raw) To: Alex Thorlton Cc: Robin Holt, Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook * Alex Thorlton <athorlton@sgi.com> wrote: > > Robin, > > > > I tweaked one of our other tests to behave pretty much exactly as I > > - malloc a large array > > - Spawn a specified number of threads > > - Have each thread touch small, evenly spaced chunks of the array (e.g. > > for 128 threads, the array is divided into 128 chunks, and each thread > > touches 1/128th of each chunk, dividing the array into 16,384 pieces) > > Forgot to mention that the threads don't touch their chunks of memory > concurrently, i.e. thread 2 has to wait for thread 1 to finish first. > This is important to note, since the pages won't all get stuck on the > first node without this behavior. Could you post the testcase please? Thanks, Ingo ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-09-05 11:15 ` Ingo Molnar @ 2013-09-09 16:48 ` Alex Thorlton 2013-09-10 7:47 ` [benchmark] THP performance testcase Ingo Molnar 0 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-09-09 16:48 UTC (permalink / raw) To: Ingo Molnar Cc: Robin Holt, Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook On Thu, Sep 05, 2013 at 01:15:10PM +0200, Ingo Molnar wrote: > > * Alex Thorlton <athorlton@sgi.com> wrote: > > > > Robin, > > > > > > I tweaked one of our other tests to behave pretty much exactly as I > > > - malloc a large array > > > - Spawn a specified number of threads > > > - Have each thread touch small, evenly spaced chunks of the array (e.g. > > > for 128 threads, the array is divided into 128 chunks, and each thread > > > touches 1/128th of each chunk, dividing the array into 16,384 pieces) > > > > Forgot to mention that the threads don't touch their chunks of memory > > concurrently, i.e. thread 2 has to wait for thread 1 to finish first. > > This is important to note, since the pages won't all get stuck on the > > first node without this behavior. > > Could you post the testcase please? > > Thanks, > > Ingo Sorry for the delay here, had to make sure that everything in my tests was okay to push out to the public. Here's a pointer to the test I wrote: ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz Everything to compile the test should be there (just run make in the thp_pthread directory). To run the test use something like: time ./thp_pthread -C 0 -m 0 -c <max_cores> -b <memory> I ran: time ./thp_pthread -C 0 -m 0 -c 128 -b 128g On a 256 core machine, with ~500gb of memory and got these results: THP off: real 0m57.797s user 46m22.156s sys 6m14.220s THP on: real 1m36.906s user 0m2.612s sys 143m13.764s I snagged some code from another test we use, so I can't vouch for the usefulness/accuracy of all the output (actually, I know some of it is wrong). I've mainly been looking at the total run time. Don't want to bloat this e-mail up with too many test results, but I found this one really interesting. Same machine, using all the cores, with the same amount of memory. This means that each cpu is actually doing *less* work, since the chunk we reserve gets divided up evenly amongst the cpus: time ./thp_pthread -C 0 -m 0 -c 256 -b 128g THP off: real 1m1.028s user 104m58.448s sys 8m52.908s THP on: real 2m26.072s user 60m39.404s sys 337m10.072s Seems that the test scales really well in the THP off case, but, once again, with THP on, we really see the performance start to degrade. I'm planning to start investigating possible ways to split up THPs, if we detect that that majority of the references to a THP are off-node. I've heard some horror stories about migrating pages in this situation (i.e., process switches cpu and then all the pages follow it), but I think we might be able to get some better results if we can cleverly determine an appropriate time to split up pages. I've heard a bit of talk about doing something similar to this from a few people, but haven't seen any code/test results. If anybody has any input on that topic, it would be greatly appreciated. 
- Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* [benchmark] THP performance testcase 2013-09-09 16:48 ` Alex Thorlton @ 2013-09-10 7:47 ` Ingo Molnar 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov 0 siblings, 1 reply; 57+ messages in thread From: Ingo Molnar @ 2013-09-10 7:47 UTC (permalink / raw) To: Alex Thorlton Cc: Robin Holt, Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook ( Changed the subject line to make it stand out better on lkml. Mail with link & results quoted below. ) * Alex Thorlton <athorlton@sgi.com> wrote: > [...] Here's a pointer to the test I wrote: > > ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz > > Everything to compile the test should be there (just run make in the > thp_pthread directory). To run the test use something like: > > time ./thp_pthread -C 0 -m 0 -c <max_cores> -b <memory> > > I ran: > > time ./thp_pthread -C 0 -m 0 -c 128 -b 128g > > On a 256 core machine, with ~500gb of memory and got these results: > > THP off: > > real 0m57.797s > user 46m22.156s > sys 6m14.220s > > THP on: > > real 1m36.906s > user 0m2.612s > sys 143m13.764s > > I snagged some code from another test we use, so I can't vouch for the > usefulness/accuracy of all the output (actually, I know some of it is > wrong). I've mainly been looking at the total run time. > > Don't want to bloat this e-mail up with too many test results, but I > found this one really interesting. Same machine, using all the cores, > with the same amount of memory. This means that each cpu is actually > doing *less* work, since the chunk we reserve gets divided up evenly > amongst the cpus: > > time ./thp_pthread -C 0 -m 0 -c 256 -b 128g > > THP off: > > real 1m1.028s > user 104m58.448s > sys 8m52.908s > > THP on: > > real 2m26.072s > user 60m39.404s > sys 337m10.072s > > Seems that the test scales really well in the THP off case, but, once > again, with THP on, we really see the performance start to degrade. > > I'm planning to start investigating possible ways to split up THPs, if > we detect that that majority of the references to a THP are off-node. > I've heard some horror stories about migrating pages in this situation > (i.e., process switches cpu and then all the pages follow it), but I > think we might be able to get some better results if we can cleverly > determine an appropriate time to split up pages. I've heard a bit of > talk about doing something similar to this from a few people, but > haven't seen any code/test results. If anybody has any input on that > topic, it would be greatly appreciated. ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 0/9] split page table lock for PMD tables 2013-09-10 7:47 ` [benchmark] THP performance testcase Ingo Molnar @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov ` (8 more replies) 0 siblings, 9 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Alex Thorlton noticed that some massivly threaded workloads work poorly, if THP enabled. This patchset fixes this by introducing split page table lock for PMD tables. hugetlbfs is not covered yet. This patchset is based on work by Naoya Horiguchi. Benchmark (from Alex): ftp://shell.sgi.com/collect/appsx_test/pthread_test.tar.gz THP off: -------- Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs): 1738259.808012 task-clock # 47.571 CPUs utilized ( +- 9.49% ) 147,359 context-switches # 0.085 K/sec ( +- 9.67% ) 14 cpu-migrations # 0.000 K/sec ( +- 13.25% ) 24,410,139 page-faults # 0.014 M/sec ( +- 0.00% ) 4,149,037,526,252 cycles # 2.387 GHz ( +- 9.50% ) 3,649,839,735,027 stalled-cycles-frontend # 87.97% frontend cycles idle ( +- 6.60% ) 2,455,558,969,567 stalled-cycles-backend # 59.18% backend cycles idle ( +- 22.92% ) 1,434,961,518,604 instructions # 0.35 insns per cycle # 2.54 stalled cycles per insn ( +- 92.86% ) 241,472,020,951 branches # 138.916 M/sec ( +- 91.72% ) 84,022,172 branch-misses # 0.03% of all branches ( +- 3.16% ) 36.540185552 seconds time elapsed ( +- 18.36% ) THP on, no patchset: -------------------- Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs): 2528378.966949 task-clock # 50.715 CPUs utilized ( +- 11.86% ) 214,063 context-switches # 0.085 K/sec ( +- 11.94% ) 19 cpu-migrations # 0.000 K/sec ( +- 22.72% ) 49,226 page-faults # 0.019 K/sec ( +- 0.33% ) 6,034,640,598,498 cycles # 2.387 GHz ( +- 11.91% ) 5,685,933,794,081 stalled-cycles-frontend # 94.22% frontend cycles idle ( +- 7.67% ) 4,414,381,393,353 stalled-cycles-backend # 73.15% backend cycles idle ( +- 2.09% ) 952,086,804,776 instructions # 0.16 insns per cycle # 5.97 stalled cycles per insn ( +- 89.59% ) 166,191,211,974 branches # 65.730 M/sec ( +- 85.52% ) 33,341,022 branch-misses # 0.02% of all branches ( +- 3.90% ) 49.854741504 seconds time elapsed ( +- 14.76% ) THP on, with patchset: ---------------------- echo always > /sys/kernel/mm/transparent_hugepage/enabled Performance counter stats for './thp_pthread -C 0 -m 0 -c 80 -b 100g' (5 runs): 1538763.343568 task-clock # 45.386 CPUs utilized ( +- 7.21% ) 130,469 context-switches # 0.085 K/sec ( +- 7.32% ) 14 cpu-migrations # 0.000 K/sec ( +- 23.58% ) 49,299 page-faults # 0.032 K/sec ( +- 0.15% ) 3,666,748,502,650 cycles # 2.383 GHz ( +- 7.25% ) 3,330,488,035,212 stalled-cycles-frontend # 90.83% frontend cycles idle ( +- 4.70% ) 2,383,357,073,990 stalled-cycles-backend # 65.00% backend cycles idle ( +- 16.06% ) 935,504,610,528 instructions # 0.26 insns per cycle # 3.56 stalled cycles per insn ( +- 91.16% ) 161,466,689,532 branches # 104.933 M/sec ( +- 87.67% ) 22,602,225 branch-misses # 0.01% of all branches ( +- 6.43% ) 
33.903917543 seconds time elapsed ( +- 12.57% ) Kirill A. Shutemov (9): mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS mm: convert mm->nr_ptes to atomic_t mm: introduce api for split page table lock for PMD level mm, thp: change pmd_trans_huge_lock() to return taken lock mm, thp: move ptl taking inside page_check_address_pmd() mm, thp: do not access mm->pmd_huge_pte directly mm: convent the rest to new page table lock api mm: implement split page table lock for PMD level x86, mm: enable split page table lock for PMD level arch/arm/mm/fault-armv.c | 6 +- arch/s390/mm/pgtable.c | 12 +-- arch/sparc/mm/tlb.c | 12 +-- arch/um/defconfig | 2 +- arch/x86/Kconfig | 4 + arch/x86/include/asm/pgalloc.h | 8 +- arch/x86/xen/mmu.c | 6 +- arch/xtensa/configs/iss_defconfig | 2 +- arch/xtensa/configs/s6105_defconfig | 2 +- fs/proc/task_mmu.c | 15 +-- include/linux/huge_mm.h | 17 +-- include/linux/mm.h | 51 ++++++++- include/linux/mm_types.h | 15 ++- kernel/fork.c | 6 +- mm/Kconfig | 12 ++- mm/huge_memory.c | 201 +++++++++++++++++++++--------------- mm/memcontrol.c | 10 +- mm/memory.c | 21 ++-- mm/migrate.c | 7 +- mm/mmap.c | 3 +- mm/mprotect.c | 4 +- mm/oom_kill.c | 6 +- mm/pgtable-generic.c | 16 +-- mm/rmap.c | 13 +-- 24 files changed, 280 insertions(+), 171 deletions(-) -- 1.8.4.rc3 ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 15:20 ` Dave Hansen 2013-09-13 13:06 ` [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t Kirill A. Shutemov ` (7 subsequent siblings) 8 siblings, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov We're going to introduce split page table lock for PMD level. Let's rename existing split ptlock for PTE level to avoid confusion. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- arch/arm/mm/fault-armv.c | 6 +++--- arch/um/defconfig | 2 +- arch/x86/xen/mmu.c | 6 +++--- arch/xtensa/configs/iss_defconfig | 2 +- arch/xtensa/configs/s6105_defconfig | 2 +- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 8 ++++---- mm/Kconfig | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 2a5907b..ff379ac 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -65,7 +65,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, return ret; } -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS /* * If we are using split PTE locks, then we need to take the page * lock here. Otherwise we are using shared mm->page_table_lock @@ -84,10 +84,10 @@ static inline void do_pte_unlock(spinlock_t *ptl) { spin_unlock(ptl); } -#else /* !USE_SPLIT_PTLOCKS */ +#else /* !USE_SPLIT_PTE_PTLOCKS */ static inline void do_pte_lock(spinlock_t *ptl) {} static inline void do_pte_unlock(spinlock_t *ptl) {} -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, unsigned long pfn) diff --git a/arch/um/defconfig b/arch/um/defconfig index 08107a7..6b0a10f 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -82,7 +82,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_PAGEFLAGS_EXTENDED=y -CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_SPLIT_PTE_PTLOCK_CPUS=4 # CONFIG_COMPACTION is not set # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=0 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fdc3ba2..455c873 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -796,7 +796,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS ptl = __pte_lockptr(page); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -1637,7 +1637,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -1671,7 +1671,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) if (!PageHighMem(page)) { xen_mc_batch(); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) + if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 
__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); __set_pfn_prot(pfn, PAGE_KERNEL); diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index 77c52f8..54cc946 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -174,7 +174,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_PAGEFLAGS_EXTENDED=y -CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_SPLIT_PTE_PTLOCK_CPUS=4 # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig index 4799c6a..d802f11 100644 --- a/arch/xtensa/configs/s6105_defconfig +++ b/arch/xtensa/configs/s6105_defconfig @@ -138,7 +138,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_PAGEFLAGS_EXTENDED=y -CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_SPLIT_PTE_PTLOCK_CPUS=4 # CONFIG_PHYS_ADDR_T_64BIT is not set CONFIG_ZONE_DMA_FLAG=1 CONFIG_VIRT_TO_BUS=y diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b6e55e..6cf8ddb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1232,7 +1232,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS /* * We tuck a spinlock to guard each pagetable page into its struct page, * at page->private, with BUILD_BUG_ON to make sure that this will not @@ -1245,14 +1245,14 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } while (0) #define pte_lock_deinit(page) ((page)->mapping = NULL) #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) -#else /* !USE_SPLIT_PTLOCKS */ +#else /* !USE_SPLIT_PTE_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ #define pte_lock_init(page) do {} while (0) #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ static inline void pgtable_page_ctor(struct page *page) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index faf4b7c..fe0a4bb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,7 +23,7 @@ struct address_space; -#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) +#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS) /* * Each physical page in the system has a struct page associated with @@ -141,7 +141,7 @@ struct page { * indicates order in the buddy * system if PG_buddy is set. */ -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS spinlock_t ptl; #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ @@ -309,14 +309,14 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTLOCKS && defined(CONFIG_MMU) +#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) #define SPLIT_RSS_COUNTING /* per-thread cached information, */ struct task_rss_stat { int events; /* for synchronization threshold */ int count[NR_MM_COUNTERS]; }; -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; diff --git a/mm/Kconfig b/mm/Kconfig index 026771a..1977a33 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -207,7 +207,7 @@ config PAGEFLAGS_EXTENDED # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. 
# -config SPLIT_PTLOCK_CPUS +config SPLIT_PTE_PTLOCK_CPUS int default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* Re: [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS 2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov @ 2013-09-13 15:20 ` Dave Hansen 0 siblings, 0 replies; 57+ messages in thread From: Dave Hansen @ 2013-09-13 15:20 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On 09/13/2013 06:06 AM, Kirill A. Shutemov wrote: > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -207,7 +207,7 @@ config PAGEFLAGS_EXTENDED > # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. > # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. > # > -config SPLIT_PTLOCK_CPUS > +config SPLIT_PTE_PTLOCK_CPUS > int > default "999999" if ARM && !CPU_CACHE_VIPT > default "999999" if PARISC && !PA20 If someone has a config where this is set to some non-default value, won't changing the name cause this to revert back to the defaults? I don't know how big of a deal it is to other folks, but you can always do this: config SPLIT_PTE_PTLOCK_CPUS int default SPLIT_PTLOCK_CPUS if SPLIT_PTLOCK_CPUS ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 3/9] mm: introduce api for split page table lock for PMD level Kirill A. Shutemov ` (6 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov With split page table lock for PMD level we can't hold mm->page_table_lock while updating nr_ptes. Let's convert it to atomic_t to avoid races. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- fs/proc/task_mmu.c | 2 +- include/linux/mm_types.h | 2 +- kernel/fork.c | 2 +- mm/huge_memory.c | 10 +++++----- mm/memory.c | 4 ++-- mm/mmap.c | 3 ++- mm/oom_kill.c | 6 +++--- 7 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7366e9d..d45d423 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -62,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + (PTRS_PER_PTE*sizeof(pte_t)*atomic_read(&mm->nr_ptes)) >> 10, swap << (PAGE_SHIFT-10)); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fe0a4bb..1c64730 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -338,6 +338,7 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + atomic_t nr_ptes; /* Page table pages */ int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ @@ -359,7 +360,6 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; - unsigned long nr_ptes; /* Page table pages */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; diff --git a/kernel/fork.c b/kernel/fork.c index 81ccb4f..4c8b986 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->flags = (current->mm) ? 
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; - mm->nr_ptes = 0; + atomic_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884..bbd41a2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -737,7 +737,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm->nr_ptes++; + atomic_inc(&mm->nr_ptes); spin_unlock(&mm->page_table_lock); } @@ -778,7 +778,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - mm->nr_ptes++; + atomic_inc(&mm->nr_ptes); return true; } @@ -903,7 +903,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - dst_mm->nr_ptes++; + atomic_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: @@ -1358,7 +1358,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { - tlb->mm->nr_ptes--; + atomic_dec(&tlb->mm->nr_ptes); spin_unlock(&tlb->mm->page_table_lock); put_huge_zero_page(); } else { @@ -1367,7 +1367,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; + atomic_dec(&tlb->mm->nr_ptes); spin_unlock(&tlb->mm->page_table_lock); tlb_remove_page(tlb, page); } diff --git a/mm/memory.c b/mm/memory.c index ca00039..1046396 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - tlb->mm->nr_ptes--; + atomic_dec(&tlb->mm->nr_ptes); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -575,7 +575,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - mm->nr_ptes++; + atomic_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) diff --git a/mm/mmap.c b/mm/mmap.c index 9d54851..1d0efbc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2726,7 +2726,8 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(atomic_read(&mm->nr_ptes) > + (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d2..7ab394e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. 
*/ - points = get_mm_rss(p->mm) + p->mm->nr_ptes + + points = get_mm_rss(p->mm) + atomic_read(&p->mm->nr_ptes) + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); @@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7d %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - task->mm->nr_ptes, + atomic_read(&task->mm->nr_ptes), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
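A note on what the atomic conversion buys: once PMD tables get their own locks (patch 8), two threads may allocate or free page tables under different locks, so the counter can no longer rely on mm->page_table_lock for serialization. A minimal sketch of the resulting accounting pattern; the helper names below are invented for illustration, while the operations mirror the hunks above:

#include <linux/atomic.h>
#include <linux/mm.h>

/* Illustrative only -- not part of the patch. */
static inline void example_account_pte_page(struct mm_struct *mm)
{
	atomic_inc(&mm->nr_ptes);	/* was: mm->nr_ptes++ under mm->page_table_lock */
}

static inline unsigned long example_pte_tables_kb(struct mm_struct *mm)
{
	/* readers switch from a plain load to atomic_read(), as in task_mem() */
	return (PTRS_PER_PTE * sizeof(pte_t) *
		(unsigned long)atomic_read(&mm->nr_ptes)) >> 10;
}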
* [PATCH 3/9] mm: introduce api for split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:19 ` Peter Zijlstra 2013-09-13 13:06 ` [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock Kirill A. Shutemov ` (5 subsequent siblings) 8 siblings, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Basic api, backed by mm->page_table_lock for now. Actual implementation will be added later. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- include/linux/mm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6cf8ddb..d4361e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &mm->page_table_lock; +} + + +static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); + spin_lock(ptl); + return ptl; +} + extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
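For readers following the series, the intended call-site pattern is the one below; a minimal caller-side sketch (example_update_huge_pmd() is an invented name), which the later patches apply throughout mm/. With the stub implementation above it still boils down to mm->page_table_lock; only the spelling at the call sites changes:

#include <linux/mm.h>

static void example_update_huge_pmd(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;

	ptl = huge_pmd_lock(mm, pmd);	/* for now: mm->page_table_lock */
	/* ... read or modify *pmd while the returned lock is held ... */
	spin_unlock(ptl);
}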
* Re: [PATCH 3/9] mm: introduce api for split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 3/9] mm: introduce api for split page table lock for PMD level Kirill A. Shutemov @ 2013-09-13 13:19 ` Peter Zijlstra 2013-09-13 14:22 ` Kirill A. Shutemov 0 siblings, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-09-13 13:19 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 04:06:10PM +0300, Kirill A. Shutemov wrote: > Basic api, backed by mm->page_table_lock for now. Actual implementation > will be added later. > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> > --- > include/linux/mm.h | 13 +++++++++++++ > 1 file changed, 13 insertions(+) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 6cf8ddb..d4361e7 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page) > ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ > NULL: pte_offset_kernel(pmd, address)) > > +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > +{ > + return &mm->page_table_lock; > +} > + > + > +static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) > +{ > + spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); > + spin_lock(ptl); > + return ptl; > +} Why not call the thing pmd_lock()? The pmd bit differentiates it from pte_lock() enough IMIO. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 3/9] mm: introduce api for split page table lock for PMD level 2013-09-13 13:19 ` Peter Zijlstra @ 2013-09-13 14:22 ` Kirill A. Shutemov 0 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 14:22 UTC (permalink / raw) To: Peter Zijlstra Cc: Kirill A. Shutemov, Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm Peter Zijlstra wrote: > On Fri, Sep 13, 2013 at 04:06:10PM +0300, Kirill A. Shutemov wrote: > > Basic api, backed by mm->page_table_lock for now. Actual implementation > > will be added later. > > > > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> > > --- > > include/linux/mm.h | 13 +++++++++++++ > > 1 file changed, 13 insertions(+) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index 6cf8ddb..d4361e7 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -1294,6 +1294,19 @@ static inline void pgtable_page_dtor(struct page *page) > > ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ > > NULL: pte_offset_kernel(pmd, address)) > > > > +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > > +{ > > + return &mm->page_table_lock; > > +} > > + > > + > > +static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) > > +{ > > + spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); > > + spin_lock(ptl); > > + return ptl; > > +} > > Why not call the thing pmd_lock()? The pmd bit differentiates it from > pte_lock() enough IMIO. Okay, will rename it. -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (2 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 3/9] mm: introduce api for split page table lock for PMD level Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd() Kirill A. Shutemov ` (4 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov With split ptlock it's important to know which lock pmd_trans_huge_lock() took. This patch adds one more parameter to the function to return the lock. In most places migration to the new api is trivial. The exception is move_huge_pmd(): we need to take two locks if the pmd tables are different. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- fs/proc/task_mmu.c | 13 +++++++------ include/linux/huge_mm.h | 14 +++++++------- mm/huge_memory.c | 40 +++++++++++++++++++++++++++------------- mm/memcontrol.c | 10 +++++----- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d45d423..bbf7420 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -505,9 +505,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } @@ -993,13 +993,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { struct vm_area_struct *vma; struct pagemapread *pm = walk->private; + spinlock_t *ptl; pte_t *pte; int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); - if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { int pmd_flags2; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) @@ -1017,7 +1018,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (err) break; } - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return err; } @@ -1319,7 +1320,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, md = walk->private; - if (pmd_trans_huge_lock(pmd, md->vma) == 1) { + if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; @@ -1327,7 +1328,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (page) gather_stats(page, md, pte_dirty(huge_pte), HPAGE_PMD_SIZE/PAGE_SIZE); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return 0; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 3935428..4aca0d8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@
-129,15 +129,15 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern int __pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma); +extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl); /* mmap_sem must be held on entry */ -static inline int pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma) +static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); if (pmd_trans_huge(*pmd)) - return __pmd_trans_huge_lock(pmd, vma); + return __pmd_trans_huge_lock(pmd, vma, ptl); else return 0; } @@ -215,8 +215,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline int pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma) +static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bbd41a2..acf5b4d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1342,9 +1342,10 @@ out_unlock: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { struct page *page; pgtable_t pgtable; pmd_t orig_pmd; @@ -1359,7 +1360,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { atomic_dec(&tlb->mm->nr_ptes); - spin_unlock(&tlb->mm->page_table_lock); + spin_unlock(ptl); put_huge_zero_page(); } else { page = pmd_page(orig_pmd); @@ -1368,7 +1369,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); atomic_dec(&tlb->mm->nr_ptes); - spin_unlock(&tlb->mm->page_table_lock); + spin_unlock(ptl); tlb_remove_page(tlb, page); } pte_free(tlb->mm, pgtable); @@ -1381,14 +1382,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { /* * All logical pages in the range are present * if backed by a huge page. */ - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); memset(vec, 1, (end - addr) >> PAGE_SHIFT); ret = 1; } @@ -1401,6 +1403,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { + spinlock_t *old_ptl, *new_ptl; int ret = 0; pmd_t pmd; @@ -1421,12 +1424,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - ret = __pmd_trans_huge_lock(old_pmd, vma); + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. 
+ */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); if (ret == 1) { + new_ptl = huge_pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - spin_unlock(&mm->page_table_lock); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); } out: return ret; @@ -1436,9 +1448,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); if (!prot_numa) { @@ -1454,7 +1467,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } } set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); ret = 1; } @@ -1468,12 +1481,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns 1, this routine returns without unlocking page * table locks. So callers must unlock them. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { - spin_lock(&vma->vm_mm->page_table_lock); + *ptl = huge_pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); wait_split_huge_page(vma->anon_vma, pmd); return -1; } else { @@ -1482,7 +1496,7 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return 1; } } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5ff3ce..5f35b2a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6376,10 +6376,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -6568,9 +6568,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@ -6587,7 +6587,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
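The move_huge_pmd() exception called out in the changelog reduces to the locking pattern below. This is a condensed restatement of the hunk above, not new patch code: in the patch the first lock is actually taken through __pmd_trans_huge_lock(), and taking the two locks without a fixed ordering is safe only because exclusive mmap_sem prevents deadlock, as the added comment notes.

static void example_move_pmd_locked(struct mm_struct *mm,
				    pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;

	old_ptl = huge_pmd_lockptr(mm, old_pmd);
	new_ptl = huge_pmd_lockptr(mm, new_pmd);
	spin_lock(old_ptl);
	if (new_ptl != old_ptl)	/* different PMD tables may mean different locks */
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	/* ... transfer the entry from *old_pmd to *new_pmd ... */
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);
}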
* [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd() 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (3 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly Kirill A. Shutemov ` (3 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov With split page table lock we can't know which lock we need to take before we find the relevant pmd. Let's move lock taking inside the function. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- include/linux/huge_mm.h | 3 ++- mm/huge_memory.c | 43 +++++++++++++++++++++++++++---------------- mm/rmap.c | 13 +++++-------- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4aca0d8..91672e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -54,7 +54,8 @@ enum page_check_address_pmd_flag { extern pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag); + enum page_check_address_pmd_flag flag, + spinlock_t **ptl); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index acf5b4d..4b58a01 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1500,23 +1500,33 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, return 0; } +/* + * This function returns whether a given @page is mapped onto the @address + * in the virtual space of @mm. + * + * When it's true, this function returns *pmd with holding the page table lock + * and passing it back to the caller via @ptl. + * If it's false, returns NULL without holding the page table lock. + */ pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag) + enum page_check_address_pmd_flag flag, + spinlock_t **ptl) { - pmd_t *pmd, *ret = NULL; + pmd_t *pmd; if (address & ~HPAGE_PMD_MASK) - goto out; + return NULL; pmd = mm_find_pmd(mm, address); if (!pmd) - goto out; + return NULL; + *ptl = huge_pmd_lock(mm, pmd); if (pmd_none(*pmd)) - goto out; + goto unlock; if (pmd_page(*pmd) != page) - goto out; + goto unlock; /* * split_vma() may create temporary aliased mappings. 
There is * no risk as long as all huge pmd are found and have their @@ -1526,14 +1536,15 @@ pmd_t *page_check_address_pmd(struct page *page, */ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && pmd_trans_splitting(*pmd)) - goto out; + goto unlock; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); - ret = pmd; + return pmd; } -out: - return ret; +unlock: + spin_unlock(*ptl); + return NULL; } static int __split_huge_page_splitting(struct page *page, @@ -1541,6 +1552,7 @@ static int __split_huge_page_splitting(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd; int ret = 0; /* For mmu_notifiers */ @@ -1548,9 +1560,8 @@ static int __split_huge_page_splitting(struct page *page, const unsigned long mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); if (pmd) { /* * We can't temporarily set the pmd to null in order @@ -1561,8 +1572,8 @@ static int __split_huge_page_splitting(struct page *page, */ pmdp_splitting_flush(vma, address, pmd); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; @@ -1693,14 +1704,14 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -1755,8 +1766,8 @@ static int __split_huge_page_map(struct page *page, pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index fd3ee7a..b59d741 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int referenced = 0; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; - spin_lock(&mm->page_table_lock); /* * rmap might return false positives; we must filter * these out using page_check_address_pmd(). */ pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG); - if (!pmd) { - spin_unlock(&mm->page_table_lock); + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (!pmd) goto out; - } if (vma->vm_flags & VM_LOCKED) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; goto out; @@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } else { pte_t *pte; - spinlock_t *ptl; /* * rmap might return false positives; we must filter -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
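The caller-visible contract after this patch: page_check_address_pmd() returns with *ptl held on success and with no lock held on failure, so callers only ever unlock. A minimal sketch (example_check_thp_mapping() is an invented wrapper; the call itself matches the rmap.c hunk above):

#include <linux/huge_mm.h>
#include <linux/mm.h>

static void example_check_thp_mapping(struct page *page, struct mm_struct *mm,
				      unsigned long address)
{
	spinlock_t *ptl;
	pmd_t *pmd;

	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
	if (!pmd)
		return;			/* not mapped here; no lock is held */
	/* ... the huge pmd is stable while *ptl is held ... */
	spin_unlock(ptl);
}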
* [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (4 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd() Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 7/9] mm: convent the rest to new page table lock api Kirill A. Shutemov ` (2 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Currently mm->pmd_huge_pte protected by page table lock. It will not work with split lock. We have to have per-pmd pmd_huge_pte for proper access serialization. For now, let's just introduce wrapper to access mm->pmd_huge_pte. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- arch/s390/mm/pgtable.c | 12 ++++++------ arch/sparc/mm/tlb.c | 12 ++++++------ include/linux/mm.h | 1 + mm/pgtable-generic.c | 12 ++++++------ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index de8cbc3..c463e5c 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1225,11 +1225,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(lh); else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) @@ -1241,12 +1241,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); lh = (struct list_head *) pgtable; if (list_empty(lh)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = (pgtable_t) lh->next; + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; list_del(lh); } ptep = (pte_t *) pgtable; diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 7a91f28..656cc46 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -196,11 +196,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(lh); else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) @@ -211,12 +211,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); lh = (struct list_head *) pgtable; if (list_empty(lh)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = 
(pgtable_t) lh->next; + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; list_del(lh); } pte_val(pgtable[0]) = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index d4361e7..d2f8a50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1299,6 +1299,7 @@ static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } +#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3929a40..41fee3e 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -154,11 +154,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -173,11 +173,11 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, + pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, struct page, lru); list_del(&pgtable->lru); } -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
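The extra pmdp argument looks redundant at this point, since the wrapper still resolves to the per-mm field; it only becomes meaningful once the deposited page tables move into the PMD table's struct page. A side-by-side sketch of the two expansions the wrapper is meant to hide (the second definition is the one patch 8 below introduces):

/* today (this patch): one deposit list per mm */
#define pmd_huge_pte(mm, pmd)	((mm)->pmd_huge_pte)

/* with split PMD locks (patch 8): one deposit list per PMD table page */
/* #define pmd_huge_pte(mm, pmd)	(virt_to_page(pmd)->pmd_huge_pte) */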
* [PATCH 7/9] mm: convent the rest to new page table lock api 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (5 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov 2013-09-13 13:06 ` [PATCH 9/9] x86, mm: enable " Kirill A. Shutemov 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Only trivial cases left. Let's convert them altogether. hugetlbfs is not covered for now. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- mm/huge_memory.c | 108 ++++++++++++++++++++++++++++----------------------- mm/memory.c | 17 ++++---- mm/migrate.c | 7 ++-- mm/mprotect.c | 4 +- mm/pgtable-generic.c | 4 +- 5 files changed, 77 insertions(+), 63 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4b58a01..e728d74 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -709,6 +709,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct page *page) { pgtable_t pgtable; + spinlock_t *ptl; VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); @@ -723,9 +724,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); @@ -738,7 +739,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_inc(&mm->nr_ptes); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } return 0; @@ -766,6 +767,7 @@ static inline struct page *alloc_hugepage(int defrag) } #endif +/* Caller must hold page table lock. 
*/ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) @@ -797,6 +799,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { + spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; @@ -809,10 +812,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); @@ -845,6 +848,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) { + spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable; @@ -855,8 +859,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; - spin_lock(&dst_mm->page_table_lock); - spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = huge_pmd_lock(dst_mm, dst_pmd); + src_ptl = huge_pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; @@ -865,7 +870,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } /* - * mm->page_table_lock is enough to be sure that huge zero pmd is not + * When page table lock is held, the huge zero pmd should not be * under splitting since we don't split the page itself, only pmd to * a page table. 
*/ @@ -886,8 +891,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, } if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); pte_free(dst_mm, pgtable); wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ @@ -907,8 +912,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = 0; out_unlock: - spin_unlock(&src_mm->page_table_lock); - spin_unlock(&dst_mm->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); out: return ret; } @@ -919,10 +924,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm, pmd_t *pmd, pmd_t orig_pmd, int dirty) { + spinlock_t *ptl; pmd_t entry; unsigned long haddr; - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto unlock; @@ -932,13 +938,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm, update_mmu_cache_pmd(vma, address, pmd); unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; struct page *page; @@ -965,7 +972,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_page; @@ -992,7 +999,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); put_huge_zero_page(); inc_mm_counter(mm, MM_ANONPAGES); @@ -1002,7 +1009,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, out: return ret; out_free_page: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_page(page); put_page(page); @@ -1016,6 +1023,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1062,7 +1070,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); @@ -1088,7 +1096,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1099,7 +1107,7 @@ out: return ret; out_free_pages: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1114,17 +1122,19 @@ out_free_pages: int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { + spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; 
unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + ptl = huge_pmd_lockptr(mm, pmd); VM_BUG_ON(!vma->anon_vma); haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; @@ -1140,7 +1150,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) @@ -1187,11 +1197,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (page) put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mem_cgroup_uncharge_page(new_page); put_page(new_page); goto out_mn; @@ -1213,13 +1223,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out_mn: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return ret; } @@ -1231,7 +1241,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(huge_pmd_lockptr(mm, pmd)); if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; @@ -1278,13 +1288,14 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { + spinlock_t *ptl; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; bool migrated; - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmdp); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; @@ -1302,17 +1313,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* Acquire the page lock to serialise THP migrations */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); lock_page(page); /* Confirm the PTE did not while locked */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); goto out_unlock; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* Migrate the THP to the requested node */ migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1324,7 +1335,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; check_same: - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; clear_pmdnuma: @@ -1333,7 +1344,7 @@ clear_pmdnuma: VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); out_unlock: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (current_nid != -1) task_numa_fault(current_nid, HPAGE_PMD_NR, false); return 0; @@ -2282,7 +2293,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte_t *pte; pgtable_t pgtable; struct page *new_page; - spinlock_t *ptl; + spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2325,12 +2336,12 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); - ptl = pte_lockptr(mm, pmd); + pte_ptl = 
pte_lockptr(mm, pmd); mmun_start = address; mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); /* probably unnecessary */ + pmd_ptl = huge_pmd_lock(mm, pmd); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes * any huge TLB entry from the CPU so we won't allow @@ -2338,16 +2349,16 @@ static void collapse_huge_page(struct mm_struct *mm, * to avoid the risk of CPU bugs in that area. */ _pmd = pmdp_clear_flush(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(pte_ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(ptl); + spin_unlock(pte_ptl); if (unlikely(!isolated)) { pte_unmap(pte); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing @@ -2355,7 +2366,7 @@ static void collapse_huge_page(struct mm_struct *mm, * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -2366,7 +2377,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -2381,13 +2392,13 @@ static void collapse_huge_page(struct mm_struct *mm, */ smp_wmb(); - spin_lock(&mm->page_table_lock); + spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(pmd_ptl); *hpage = NULL; @@ -2712,6 +2723,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { + spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -2723,22 +2735,22 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); diff --git a/mm/memory.c b/mm/memory.c index 1046396..a0ed1d5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -552,6 +552,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address) { + spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); 
int wait_split_huge_page; if (!new) @@ -572,7 +573,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ atomic_inc(&mm->nr_ptes); @@ -580,7 +581,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) wait_split_huge_page = 1; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (new) pte_free(mm, new); if (wait_split_huge_page) @@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, split_huge_page_pmd(vma, address, pmd); goto split_fallthrough; } - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); wait_split_huge_page(vma->anon_vma, pmd); } else { page = follow_trans_huge_pmd(vma, address, pmd, flags); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; goto out; } } else - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* fall through */ } split_fallthrough: @@ -3602,13 +3603,13 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool numa = false; int local_nid = numa_node_id(); - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); pmd = *pmdp; if (pmd_numa(pmd)) { set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); numa = true; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); if (!numa) return 0; diff --git a/mm/migrate.c b/mm/migrate.c index b7ded7e..32eff0c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1653,6 +1653,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long address, struct page *page, int node) { + spinlock_t *ptl; unsigned long haddr = address & HPAGE_PMD_MASK; pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; @@ -1699,9 +1700,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - spin_lock(&mm->page_table_lock); + ptl = huge_pmd_lock(mm, pmd); if (unlikely(!pmd_same(*pmd, entry))) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -1746,7 +1747,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * before it's fully transferred to the new page. 
*/ mem_cgroup_end_migration(memcg, page, new_page, true); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(new_page); unlock_page(page); diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4..885cd78 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -116,9 +116,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, pmd_t *pmd) { - spin_lock(&mm->page_table_lock); + spinlock_t *ptl = huge_pmd_lock(mm, pmd); set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } #else static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 41fee3e..80587a5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -151,7 +151,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(huge_pmd_lockptr(mm, pmdp)); /* FIFO */ if (!pmd_huge_pte(mm, pmdp)) @@ -170,7 +170,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; - assert_spin_locked(&mm->page_table_lock); + assert_spin_locked(huge_pmd_lockptr(mm, pmdp)); /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (6 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 7/9] mm: convent the rest to new page table lock api Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 2013-09-13 13:24 ` Peter Zijlstra ` (3 more replies) 2013-09-13 13:06 ` [PATCH 9/9] x86, mm: enable " Kirill A. Shutemov 8 siblings, 4 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov The basic idea is the same as with PTE level: the lock is embedded into struct page of table's page. Split pmd page table lock only makes sense on big machines. Let's say >= 32 CPUs for now. We can't use mm->pmd_huge_pte to store pgtables for THP, since we don't take mm->page_table_lock anymore. Let's reuse page->lru of table's page for that. hugetlbfs hasn't converted to split locking: disable split locking if hugetlbfs enabled. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- include/linux/mm.h | 31 +++++++++++++++++++++++++++++++ include/linux/mm_types.h | 5 +++++ kernel/fork.c | 4 ++-- mm/Kconfig | 10 ++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d2f8a50..5b3922d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1294,13 +1294,44 @@ static inline void pgtable_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? 
\ NULL: pte_offset_kernel(pmd, address)) +#if USE_SPLIT_PMD_PTLOCKS + +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &virt_to_page(pmd)->ptl; +} + +static inline void pgtable_pmd_page_ctor(struct page *page) +{ + spin_lock_init(&page->ptl); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page->pmd_huge_pte = NULL; +#endif +} + +static inline void pgtable_pmd_page_dtor(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(page->pmd_huge_pte); +#endif +} + +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) + +#else + static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline void pgtable_pmd_page_ctor(struct page *page) {} +static inline void pgtable_pmd_page_dtor(struct page *page) {} + #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) +#endif + static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c64730..5706ddf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -24,6 +24,8 @@ struct address_space; #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS) +#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ + NR_CPUS >= CONFIG_SPLIT_PMD_PTLOCK_CPUS) /* * Each physical page in the system has a struct page associated with @@ -130,6 +132,9 @@ struct page { struct list_head list; /* slobs list of pages */ struct slab *slab_page; /* slab fields */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + pgtable_t pmd_huge_pte; /* protected by page->ptl */ +#endif }; /* Remainder is not double word aligned */ diff --git a/kernel/fork.c b/kernel/fork.c index 4c8b986..1670af7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON(mm->pmd_huge_pte); #endif } @@ -814,7 +814,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/Kconfig b/mm/Kconfig index 1977a33..ab32eda 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -214,6 +214,16 @@ config SPLIT_PTE_PTLOCK_CPUS default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + boolean + +config SPLIT_PMD_PTLOCK_CPUS + int + # hugetlb hasn't converted to split locking yet + default "999999" if HUGETLB_PAGE + default "32" if ARCH_ENABLE_SPLIT_PMD_PTLOCK + default "999999" + # # support for memory balloon compaction config BALLOON_COMPACTION -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
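The last hunk in include/linux/mm.h ends at the context lines of huge_pmd_lock(), so its body is not visible here. Given that the converted call sites earlier in the thread only ever spin_unlock() the pointer it returns, the helper presumably finishes along these lines (a guess from the visible callers, not quoted from the patch):

	static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd)
	{
		spinlock_t *ptl = huge_pmd_lockptr(mm, pmd);

		spin_lock(ptl);
		return ptl;
	}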
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov @ 2013-09-13 13:24 ` Peter Zijlstra 2013-09-13 14:25 ` Kirill A. Shutemov 2013-09-13 13:36 ` Peter Zijlstra ` (2 subsequent siblings) 3 siblings, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-09-13 13:24 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > The basic idea is the same as with PTE level: the lock is embedded into > struct page of table's page. > > Split pmd page table lock only makes sense on big machines. > Let's say >= 32 CPUs for now. Why is this? Couldn't I generate the same amount of contention on PMD level as I can on PTE level in the THP case? ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:24 ` Peter Zijlstra @ 2013-09-13 14:25 ` Kirill A. Shutemov 2013-09-13 14:52 ` Peter Zijlstra 0 siblings, 1 reply; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 14:25 UTC (permalink / raw) To: Peter Zijlstra Cc: Kirill A. Shutemov, Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm Peter Zijlstra wrote: > On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > > The basic idea is the same as with PTE level: the lock is embedded into > > struct page of table's page. > > > > Split pmd page table lock only makes sense on big machines. > > Let's say >= 32 CPUs for now. > > Why is this? Couldn't I generate the same amount of contention on PMD > level as I can on PTE level in the THP case? Hm. You are right. You just need more memory for that. Do you want it to be "4" too? -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 14:25 ` Kirill A. Shutemov @ 2013-09-13 14:52 ` Peter Zijlstra 0 siblings, 0 replies; 57+ messages in thread From: Peter Zijlstra @ 2013-09-13 14:52 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 05:25:13PM +0300, Kirill A. Shutemov wrote: > Peter Zijlstra wrote: > > On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > > > The basic idea is the same as with PTE level: the lock is embedded into > > > struct page of table's page. > > > > > > Split pmd page table lock only makes sense on big machines. > > > Let's say >= 32 CPUs for now. > > > > Why is this? Couldn't I generate the same amount of contention on PMD > > level as I can on PTE level in the THP case? > > Hm. You are right. You just need more memory for that. > Do you want it to be "4" too? Well, I would drop your patch-1 and use the same config var. ^ permalink raw reply [flat|nested] 57+ messages in thread
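Peter's suggestion amounts to gating the PMD split lock on the existing PTE threshold rather than introducing a second CPU-count knob. One way that could look in mm_types.h, assuming patch 1/9 (the SPLIT_PTLOCKS rename) is dropped as he proposes; this is an illustration of the idea, not code from the series:

	#define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
	#define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTLOCKS && \
					 IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))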
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov 2013-09-13 13:24 ` Peter Zijlstra @ 2013-09-13 13:36 ` Peter Zijlstra 2013-09-13 14:25 ` Kirill A. Shutemov 2013-09-13 15:45 ` Naoya Horiguchi 2013-09-13 19:57 ` Dave Hansen 3 siblings, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-09-13 13:36 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > +#if USE_SPLIT_PMD_PTLOCKS > + > +static inline void pgtable_pmd_page_ctor(struct page *page) > +{ > + spin_lock_init(&page->ptl); > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + page->pmd_huge_pte = NULL; > +#endif > +} > + > +static inline void pgtable_pmd_page_dtor(struct page *page) > +{ > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + VM_BUG_ON(page->pmd_huge_pte); > +#endif > +} > + > +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) > + > +#else So on -rt we have the problem that spinlock_t is rather huge (its a rtmutex) so instead of blowing up the pageframe like that we treat page->pte as a pointer and allocate the spinlock. Since allocations could fail the above ctor path gets 'interesting'. It would be good if new code could assume the ctor could fail so we don't have to replicate that horror-show. --- From: Peter Zijlstra <peterz@infradead.org> Date: Fri, 3 Jul 2009 08:44:54 -0500 Subject: mm: shrink the page frame to !-rt size He below is a boot-tested hack to shrink the page frame size back to normal. Should be a net win since there should be many less PTE-pages than page-frames. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/mm.h | 46 +++++++++++++++++++++++++++++++++++++++------- include/linux/mm_types.h | 4 ++++ mm/memory.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 7 deletions(-) --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1241,27 +1241,59 @@ static inline pmd_t *pmd_alloc(struct mm * overflow into the next struct page (as it might with DEBUG_SPINLOCK). * When freeing, reset page->mapping so free_pages_check won't complain. */ +#ifndef CONFIG_PREEMPT_RT_FULL + #define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) + +static inline struct page *pte_lock_init(struct page *page) +{ + spin_lock_init(__pte_lockptr(page)); + return page; +} + #define pte_lock_deinit(page) ((page)->mapping = NULL) + +#else /* !PREEMPT_RT_FULL */ + +/* + * On PREEMPT_RT_FULL the spinlock_t's are too large to embed in the + * page frame, hence it only has a pointer and we need to dynamically + * allocate the lock when we allocate PTE-pages. + * + * This is an overall win, since only a small fraction of the pages + * will be PTE pages under normal circumstances. 
+ */ + +#define __pte_lockptr(page) ((page)->ptl) + +extern struct page *pte_lock_init(struct page *page); +extern void pte_lock_deinit(struct page *page); + +#endif /* PREEMPT_RT_FULL */ + #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) #else /* !USE_SPLIT_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ -#define pte_lock_init(page) do {} while (0) +static inline struct page *pte_lock_init(struct page *page) { return page; } #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) #endif /* USE_SPLIT_PTLOCKS */ -static inline void pgtable_page_ctor(struct page *page) +static inline struct page *__pgtable_page_ctor(struct page *page) { - pte_lock_init(page); - inc_zone_page_state(page, NR_PAGETABLE); + page = pte_lock_init(page); + if (page) + inc_zone_page_state(page, NR_PAGETABLE); + return page; } +#define pgtable_page_ctor(page) \ +do { \ + page = __pgtable_page_ctor(page); \ +} while (0) + static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -142,7 +142,11 @@ struct page { * system if PG_buddy is set. */ #if USE_SPLIT_PTLOCKS +# ifndef CONFIG_PREEMPT_RT_FULL spinlock_t ptl; +# else + spinlock_t *ptl; +# endif #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ --- a/mm/memory.c +++ b/mm/memory.c @@ -4328,3 +4328,35 @@ void copy_user_huge_page(struct page *ds } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if defined(CONFIG_PREEMPT_RT_FULL) && (USE_SPLIT_PTLOCKS > 0) +/* + * Heinous hack, relies on the caller doing something like: + * + * pte = alloc_pages(PGALLOC_GFP, 0); + * if (pte) + * pgtable_page_ctor(pte); + * return pte; + * + * This ensures we release the page and return NULL when the + * lock allocation fails. + */ +struct page *pte_lock_init(struct page *page) +{ + page->ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (page->ptl) { + spin_lock_init(__pte_lockptr(page)); + } else { + __free_page(page); + page = NULL; + } + return page; +} + +void pte_lock_deinit(struct page *page) +{ + kfree(page->ptl); + page->mapping = NULL; +} + +#endif ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:36 ` Peter Zijlstra @ 2013-09-13 14:25 ` Kirill A. Shutemov 0 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 14:25 UTC (permalink / raw) To: Peter Zijlstra Cc: Kirill A. Shutemov, Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm Peter Zijlstra wrote: > On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > > +#if USE_SPLIT_PMD_PTLOCKS > > + > > +static inline void pgtable_pmd_page_ctor(struct page *page) > > +{ > > + spin_lock_init(&page->ptl); > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > > + page->pmd_huge_pte = NULL; > > +#endif > > +} > > + > > +static inline void pgtable_pmd_page_dtor(struct page *page) > > +{ > > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > > + VM_BUG_ON(page->pmd_huge_pte); > > +#endif > > +} > > + > > +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) > > + > > +#else > > So on -rt we have the problem that spinlock_t is rather huge (its a > rtmutex) so instead of blowing up the pageframe like that we treat > page->pte as a pointer and allocate the spinlock. > > Since allocations could fail the above ctor path gets 'interesting'. > > It would be good if new code could assume the ctor could fail so we > don't have to replicate that horror-show. Okay, I'll rework this. -- Kirill A. Shutemov ^ permalink raw reply [flat|nested] 57+ messages in thread
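What a failure-capable constructor could look like: a sketch of the rework Kirill agrees to here, assuming a bool return and the same caller shape as the x86 pmd_alloc_one() in patch 9/9. Only the -rt variant would actually return false, when its separately allocated lock cannot be obtained; the names and signature below are an assumption, not a posted revision:

	static inline bool pgtable_pmd_page_ctor(struct page *page)
	{
		spin_lock_init(&page->ptl);
	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		page->pmd_huge_pte = NULL;
	#endif
		return true;	/* -rt: false if allocating the lock failed */
	}

	static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
	{
		struct page *page;

		page = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0);
		if (!page)
			return NULL;
		if (!pgtable_pmd_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		return (pmd_t *)page_address(page);
	}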
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov 2013-09-13 13:24 ` Peter Zijlstra 2013-09-13 13:36 ` Peter Zijlstra @ 2013-09-13 15:45 ` Naoya Horiguchi 2013-09-13 19:57 ` Dave Hansen 3 siblings, 0 replies; 57+ messages in thread From: Naoya Horiguchi @ 2013-09-13 15:45 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On Fri, Sep 13, 2013 at 04:06:15PM +0300, Kirill A. Shutemov wrote: > The basic idea is the same as with PTE level: the lock is embedded into > struct page of table's page. > > Split pmd page table lock only makes sense on big machines. > Let's say >= 32 CPUs for now. > > We can't use mm->pmd_huge_pte to store pgtables for THP, since we don't > take mm->page_table_lock anymore. Let's reuse page->lru of table's page > for that. Looks nice. > hugetlbfs hasn't converted to split locking: disable split locking if > hugetlbfs enabled. I don't think that we have to disable when hugetlbfs is enabled, because hugetlbfs code doesn't use huge_pmd_lockptr() or huge_pmd_lock(). > Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> > --- > include/linux/mm.h | 31 +++++++++++++++++++++++++++++++ > include/linux/mm_types.h | 5 +++++ > kernel/fork.c | 4 ++-- > mm/Kconfig | 10 ++++++++++ > 4 files changed, 48 insertions(+), 2 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index d2f8a50..5b3922d 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -1294,13 +1294,44 @@ static inline void pgtable_page_dtor(struct page *page) > ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? 
\ > NULL: pte_offset_kernel(pmd, address)) > > +#if USE_SPLIT_PMD_PTLOCKS > + > +static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > +{ > + return &virt_to_page(pmd)->ptl; > +} > + > +static inline void pgtable_pmd_page_ctor(struct page *page) > +{ > + spin_lock_init(&page->ptl); > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + page->pmd_huge_pte = NULL; > +#endif > +} > + > +static inline void pgtable_pmd_page_dtor(struct page *page) > +{ > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + VM_BUG_ON(page->pmd_huge_pte); > +#endif > +} > + > +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) > > + > +#else > + > static inline spinlock_t *huge_pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > { > return &mm->page_table_lock; > } > > +static inline void pgtable_pmd_page_ctor(struct page *page) {} > +static inline void pgtable_pmd_page_dtor(struct page *page) {} > + > #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) > > +#endif > + > static inline spinlock_t *huge_pmd_lock(struct mm_struct *mm, pmd_t *pmd) > { > spinlock_t *ptl = huge_pmd_lockptr(mm, pmd); > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 1c64730..5706ddf 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -24,6 +24,8 @@ > struct address_space; > > #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTE_PTLOCK_CPUS) > +#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ > + NR_CPUS >= CONFIG_SPLIT_PMD_PTLOCK_CPUS) > > /* > * Each physical page in the system has a struct page associated with > @@ -130,6 +132,9 @@ struct page { > > struct list_head list; /* slobs list of pages */ > struct slab *slab_page; /* slab fields */ > +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS > + pgtable_t pmd_huge_pte; /* protected by page->ptl */ > +#endif > }; > > /* Remainder is not double word aligned */ Can we remove pmd_huge_pte from mm_struct when USE_SPLIT_PMD_PTLOCKS is true? Thanks, Naoya Horiguchi ^ permalink raw reply [flat|nested] 57+ messages in thread
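Naoya's last point, dropping the mm-wide deposit field once the per-page one takes over, would reduce to guarding the mm_struct member the same way kernel/fork.c is already guarded in this patch. A sketch of that (an assumption, not part of the posted series):

	struct mm_struct {
		/* ... */
	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
		pgtable_t pmd_huge_pte;		/* protected by page_table_lock */
	#endif
		/* ... */
	};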
* Re: [PATCH 8/9] mm: implement split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov ` (2 preceding siblings ...) 2013-09-13 15:45 ` Naoya Horiguchi @ 2013-09-13 19:57 ` Dave Hansen 3 siblings, 0 replies; 57+ messages in thread From: Dave Hansen @ 2013-09-13 19:57 UTC (permalink / raw) To: Kirill A. Shutemov Cc: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi, Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm On 09/13/2013 06:06 AM, Kirill A. Shutemov wrote: > +config ARCH_ENABLE_SPLIT_PMD_PTLOCK > + boolean > + > +config SPLIT_PMD_PTLOCK_CPUS > + int > + # hugetlb hasn't converted to split locking yet > + default "999999" if HUGETLB_PAGE > + default "32" if ARCH_ENABLE_SPLIT_PMD_PTLOCK > + default "999999" Is there a reason we should have separate config knobs for this from SPLIT_PTLOCK_CPUS? Seem a bit silly. ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 9/9] x86, mm: enable split page table lock for PMD level 2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov ` (7 preceding siblings ...) 2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov @ 2013-09-13 13:06 ` Kirill A. Shutemov 8 siblings, 0 replies; 57+ messages in thread From: Kirill A. Shutemov @ 2013-09-13 13:06 UTC (permalink / raw) To: Alex Thorlton, Ingo Molnar, Andrew Morton, Naoya Horiguchi Cc: Eric W . Biederman, Paul E . McKenney, Al Viro, Andi Kleen, Andrea Arcangeli, Dave Hansen, Dave Jones, David Howells, Frederic Weisbecker, Johannes Weiner, Kees Cook, Mel Gorman, Michael Kerrisk, Oleg Nesterov, Peter Zijlstra, Rik van Riel, Robin Holt, Sedat Dilek, Srikar Dronamraju, Thomas Gleixner, linux-kernel, linux-mm, Kirill A. Shutemov Enable PMD split page table lock for X86_64 and PAE. Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/pgalloc.h | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 30c40f0..6a5cf6a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1880,6 +1880,10 @@ config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + def_bool y + depends on X86_64 || X86_PAE + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index b4389a4..f2daea1 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -80,12 +80,18 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #if PAGETABLE_LEVELS > 2 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + struct page *page; + page = alloc_pages(GFP_KERNEL | __GFP_REPEAT| __GFP_ZERO, 0); + if (!page) + return NULL; + pgtable_pmd_page_ctor(page); + return (pmd_t *)page_address(page); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + pgtable_pmd_page_dtor(virt_to_page(pmd)); free_page((unsigned long)pmd); } -- 1.8.4.rc3 ^ permalink raw reply related [flat|nested] 57+ messages in thread
[parent not found: <20130828091814.GA13681@gmail.com>]
* Re: [PATCH 1/8] THP: Use real address for NUMA policy [not found] ` <20130828091814.GA13681@gmail.com> @ 2013-08-28 9:32 ` Peter Zijlstra 0 siblings, 0 replies; 57+ messages in thread From: Peter Zijlstra @ 2013-08-28 9:32 UTC (permalink / raw) To: Ingo Molnar Cc: Alex Thorlton, Kirill A. Shutemov, Dave Hansen, linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt, Linus Torvalds, Andrea Arcangeli On Wed, Aug 28, 2013 at 11:18:14AM +0200, Ingo Molnar wrote: > But ideally THP should detect cases where a hugepage is heavily used from > multiple nodes and un-HP the page in question. Or not turn it into a > hugepage in the first place. (We actually have a memory access pattern > sampling facility in place upstream, the new CONFIG_NUMA_BALANCING=y code, > which could in theory be utilized.) Mel and I spoke about doing just that a few weeks ago. Breaking up THP pages when we get shared faults from different nodes and avoiding coalescing when the pages are from different nodes. Rik seemed a little skeptical about the entire thing saying that the huge tlb gain is about the same order as the locality thing -- while this is more or less true for the 'tiny' machines most of us have, this is clearly not the case for these big machines. ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 18:17 ` Alex Thorlton 2013-08-16 18:52 ` Kirill A. Shutemov @ 2013-08-16 19:46 ` Peter Zijlstra 2013-08-16 19:49 ` Alex Thorlton 1 sibling, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-08-16 19:46 UTC (permalink / raw) To: Alex Thorlton Cc: Dave Hansen, linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote: > I actually didn't write these patches (made a few tweaks to get them > running on the latest kernel though). They were submitted last July by > Peter Zijlstra. By Kirill, I don't think I've ever touched them. > The messages from the original submission can be found here: > https://lkml.org/lkml/2012/7/20/165 As can be seen by this posting you reference ;-) ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 1/8] THP: Use real address for NUMA policy 2013-08-16 19:46 ` Peter Zijlstra @ 2013-08-16 19:49 ` Alex Thorlton 0 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 19:49 UTC (permalink / raw) To: Peter Zijlstra Cc: Dave Hansen, linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 16, 2013 at 09:46:35PM +0200, Peter Zijlstra wrote: > On Fri, Aug 16, 2013 at 01:17:28PM -0500, Alex Thorlton wrote: > > I actually didn't write these patches (made a few tweaks to get them > > running on the latest kernel though). They were submitted last July by > > Peter Zijlstra. > > By Kirill, I don't think I've ever touched them. > > > The messages from the original submission can be found here: > > https://lkml.org/lkml/2012/7/20/165 > > As can be seen by this posting you reference ;-) Sorry about that! Not sure where I got the idea that you wrote them, must've gotten some names mixed up somewhere. ^ permalink raw reply [flat|nested] 57+ messages in thread
* [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton 2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton @ 2013-08-16 14:33 ` Alex Thorlton 2013-08-16 14:33 ` [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page Alex Thorlton ` (6 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:33 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- mm/memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 1ce2e2a..69a5a38 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4249,16 +4249,17 @@ void clear_huge_page(struct page *page, unsigned long addr, unsigned int pages_per_huge_page) { int i; + unsigned long haddr = addr & HPAGE_PMD_MASK; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, pages_per_huge_page); + clear_gigantic_page(page, haddr, pages_per_huge_page); return; } might_sleep(); for (i = 0; i < pages_per_huge_page; i++) { cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + clear_user_highpage(page + i, haddr + i * PAGE_SIZE); } } -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton 2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton 2013-08-16 14:33 ` [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address Alex Thorlton @ 2013-08-16 14:33 ` Alex Thorlton 2013-08-16 14:34 ` [PATCH 4/8] x86: Add clear_page_nocache Alex Thorlton ` (5 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:33 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Conflicts: mm/huge_memory.c --- mm/huge_memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 55ec681..2e6f106 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -701,7 +701,8 @@ static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, + unsigned long haddr, + unsigned long address, pmd_t *pmd, struct page *page) { pgtable_t pgtable; @@ -711,12 +712,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, if (unlikely(!pgtable)) return VM_FAULT_OOM; - clear_huge_page(page, haddr, HPAGE_PMD_NR); /* * The memory barrier inside __SetPageUptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() * write. */ + clear_huge_page(page, address, HPAGE_PMD_NR); __SetPageUptodate(page); spin_lock(&mm->page_table_lock); @@ -825,8 +826,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, put_page(page); goto out; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, - page))) { + if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, + address, pmd, page))) { mem_cgroup_uncharge_page(page); put_page(page); goto out; -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 4/8] x86: Add clear_page_nocache 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (2 preceding siblings ...) 2013-08-16 14:33 ` [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 14:34 ` [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address Alex Thorlton ` (4 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Conflicts: arch/x86/mm/fault.c --- arch/x86/include/asm/page.h | 2 ++ arch/x86/include/asm/string_32.h | 5 +++++ arch/x86/include/asm/string_64.h | 5 +++++ arch/x86/lib/Makefile | 1 + arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++++++++++++++++++++ arch/x86/lib/clear_page_nocache_64.S | 29 +++++++++++++++++++++++++++++ arch/x86/mm/fault.c | 7 +++++++ 7 files changed, 79 insertions(+) create mode 100644 arch/x86/lib/clear_page_nocache_32.S create mode 100644 arch/x86/lib/clear_page_nocache_64.S diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index c878924..ccec9c3 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -33,6 +33,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, copy_page(to, from); } +void clear_user_highpage_nocache(struct page *page, unsigned long vaddr); + #define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \ alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr) #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 3d3e835..3f2fbcf 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#include <linux/linkage.h> + /* Let gcc decide whether to inline or use the out of line functions */ #define __HAVE_ARCH_STRCPY @@ -337,6 +339,9 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern, #define __HAVE_ARCH_MEMSCAN extern void *memscan(void *addr, int c, size_t size); +#define ARCH_HAS_USER_NOCACHE 1 +asmlinkage void clear_page_nocache(void *page); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_32_H */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 19e2c46..ca23d1d 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -3,6 +3,8 @@ #ifdef __KERNEL__ +#include <linux/linkage.h> + /* Written 2002 by Andi Kleen */ /* Only used for special circumstances. 
Stolen from i386/string.h */ @@ -63,6 +65,9 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); +#define ARCH_HAS_USER_NOCACHE 1 +asmlinkage void clear_page_nocache(void *page); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 96b2c66..e48776c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -23,6 +23,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_SMP) += rwlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o +lib-y += clear_page_nocache_$(BITS).o obj-y += msr.o msr-reg.o msr-reg-export.o diff --git a/arch/x86/lib/clear_page_nocache_32.S b/arch/x86/lib/clear_page_nocache_32.S new file mode 100644 index 0000000..2394e0c --- /dev/null +++ b/arch/x86/lib/clear_page_nocache_32.S @@ -0,0 +1,30 @@ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +/* + * Zero a page avoiding the caches + * rdi page + */ +ENTRY(clear_page_nocache) + CFI_STARTPROC + mov %eax,%edi + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movnti %eax,x*8(%edi) ; movnti %eax,x*8+4(%edi) + PUT(0) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + lea 64(%edi),%edi + jnz .Lloop + nop + ret + CFI_ENDPROC +ENDPROC(clear_page_nocache) diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S new file mode 100644 index 0000000..ee16d15 --- /dev/null +++ b/arch/x86/lib/clear_page_nocache_64.S @@ -0,0 +1,29 @@ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +/* + * Zero a page avoiding the caches + * rdi page + */ +ENTRY(clear_page_nocache) + CFI_STARTPROC + xorl %eax,%eax + movl $4096/64,%ecx + .p2align 4 +.Lloop: + decl %ecx +#define PUT(x) movnti %rax,x*8(%rdi) + movnti %rax,(%rdi) + PUT(1) + PUT(2) + PUT(3) + PUT(4) + PUT(5) + PUT(6) + PUT(7) + leaq 64(%rdi),%rdi + jnz .Lloop + nop + ret + CFI_ENDPROC +ENDPROC(clear_page_nocache) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 654be4a..a088ed7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1230,3 +1230,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) __do_page_fault(regs, error_code); exception_exit(prev_state); } + +void clear_user_highpage_nocache(struct page *page, unsigned long vaddr) +{ + void *p = kmap_atomic(page, KM_USER0); + clear_page_nocache(p); + kunmap_atomic(p); +} -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (3 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 4/8] x86: Add clear_page_nocache Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 18:02 ` Dave Hansen 2013-08-16 14:34 ` [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2 Alex Thorlton ` (3 subsequent siblings) 8 siblings, 1 reply; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- mm/memory.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 69a5a38..17d61f0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4231,18 +4231,35 @@ EXPORT_SYMBOL(might_fault); #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) + +#ifndef ARCH_HAS_USER_NOCACHE +#define ARCH_HAS_USER_NOCACHE 0 +#endif + +#if ARCH_HAS_USER_NOCACHE == 0 +#define clear_user_highpage_nocache clear_user_highpage +#endif + static void clear_gigantic_page(struct page *page, unsigned long addr, unsigned int pages_per_huge_page) { int i; struct page *p = page; + unsigned long vaddr; + unsigned long haddr = addr & HPAGE_PMD_MASK; + int target = (addr - haddr) >> PAGE_SHIFT; might_sleep(); + vaddr = haddr; for (i = 0; i < pages_per_huge_page; i++, p = mem_map_next(p, page, i)) { cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); + vaddr = haddr + i*PAGE_SIZE; + if (!ARCH_HAS_USER_NOCACHE || i == target) + clear_user_highpage(p, vaddr); + else + clear_user_highpage_nocache(p, vaddr); } } void clear_huge_page(struct page *page, @@ -4250,6 +4267,8 @@ void clear_huge_page(struct page *page, { int i; unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long vaddr; + int target = (addr - haddr) >> PAGE_SHIFT; if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, haddr, pages_per_huge_page); @@ -4257,9 +4276,14 @@ void clear_huge_page(struct page *page, } might_sleep(); + vaddr = haddr; for (i = 0; i < pages_per_huge_page; i++) { cond_resched(); - clear_user_highpage(page + i, haddr + i * PAGE_SIZE); + vaddr = haddr + i*PAGE_SIZE; + if (!ARCH_HAS_USER_NOCACHE || i == target) + clear_user_highpage(page + i, vaddr); + else + clear_user_highpage_nocache(page + i, vaddr); } } -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
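A worked example of the target arithmetic above, assuming x86-64 with 2 MB transparent huge pages (the address is made up for illustration):

	addr   = 0x7f1234567890                            /* faulting user address */
	haddr  = addr & HPAGE_PMD_MASK = 0x7f1234400000    /* start of the huge page */
	target = (addr - haddr) >> PAGE_SHIFT
	       = 0x167890 >> 12 = 0x167 = 359              /* subpage under the fault */

Only subpage 359 is cleared through the cache; the other 511 subpages of the 2 MB region go through clear_user_highpage_nocache(), so the data at the fault address stays cache-hot while the bulk of the clear avoids evicting useful cache lines.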
* Re: [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address 2013-08-16 14:34 ` [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address Alex Thorlton @ 2013-08-16 18:02 ` Dave Hansen 0 siblings, 0 replies; 57+ messages in thread From: Dave Hansen @ 2013-08-16 18:02 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On 08/16/2013 07:34 AM, Alex Thorlton wrote: > +#if ARCH_HAS_USER_NOCACHE == 0 > +#define clear_user_highpage_nocache clear_user_highpage > +#endif ... > cond_resched(); > - clear_user_highpage(p, addr + i * PAGE_SIZE); > + vaddr = haddr + i*PAGE_SIZE; > + if (!ARCH_HAS_USER_NOCACHE || i == target) > + clear_user_highpage(p, vaddr); > + else > + clear_user_highpage_nocache(p, vaddr); > } > } Is the check for !ARCH_HAS_USER_NOCACHE in there really necessary? That logic seems like it would be taken care of by that #define. Plus, if you don't check 'ARCH_HAS_USER_NOCACHE' like this, you don't need to define a value for it, and you can axe the: +#ifndef ARCH_HAS_USER_NOCACHE +#define ARCH_HAS_USER_NOCACHE 0 +#endif ^ permalink raw reply [flat|nested] 57+ messages in thread
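With the fallback #define alone doing the work, the loop Dave is commenting on would reduce to something like this (a sketch of his suggestion, not a posted revision):

	for (i = 0; i < pages_per_huge_page; i++) {
		cond_resched();
		vaddr = haddr + i * PAGE_SIZE;
		if (i == target)
			clear_user_highpage(page + i, vaddr);
		else
			clear_user_highpage_nocache(page + i, vaddr);
	}

On architectures without a nocache implementation, clear_user_highpage_nocache is #defined to clear_user_highpage, so every subpage is cleared through the cache exactly as before and no ARCH_HAS_USER_NOCACHE test is needed inside the loop.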
* [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (4 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 14:34 ` [PATCH 7/8] remove KM_USER0 from kmap_atomic call Alex Thorlton ` (2 subsequent siblings) 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- arch/x86/lib/clear_page_nocache_64.S | 91 ++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 14 deletions(-) diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S index ee16d15..a6d938c 100644 --- a/arch/x86/lib/clear_page_nocache_64.S +++ b/arch/x86/lib/clear_page_nocache_64.S @@ -1,29 +1,92 @@ +/* + * Clear pages with cache bypass. + * + * Copyright (C) 2011, 2012 Intel Corporation + * Author: Andi Kleen + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + */ + #include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/cpufeature.h> #include <asm/dwarf2.h> +#define SSE_UNROLL 128 + /* * Zero a page avoiding the caches * rdi page */ ENTRY(clear_page_nocache) CFI_STARTPROC - xorl %eax,%eax - movl $4096/64,%ecx + push %rdi + call kernel_fpu_begin + pop %rdi + sub $16,%rsp + CFI_ADJUST_CFA_OFFSET 16 + movdqu %xmm0,(%rsp) + xorpd %xmm0,%xmm0 + movl $4096/SSE_UNROLL,%ecx .p2align 4 .Lloop: decl %ecx -#define PUT(x) movnti %rax,x*8(%rdi) - movnti %rax,(%rdi) - PUT(1) - PUT(2) - PUT(3) - PUT(4) - PUT(5) - PUT(6) - PUT(7) - leaq 64(%rdi),%rdi + .set x,0 + .rept SSE_UNROLL/16 + movntdq %xmm0,x(%rdi) + .set x,x+16 + .endr + leaq SSE_UNROLL(%rdi),%rdi jnz .Lloop - nop - ret + movdqu (%rsp),%xmm0 + addq $16,%rsp + CFI_ADJUST_CFA_OFFSET -16 + jmp kernel_fpu_end CFI_ENDPROC ENDPROC(clear_page_nocache) + +#ifdef CONFIG_AS_AVX + + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_nocache_avx - clear_page_nocache) - (2f - 1b) + /* offset */ +2: + .previous + .section .altinstructions,"a" + altinstruction_entry clear_page_nocache,1b,X86_FEATURE_AVX,\ + 16, 2b-1b + .previous + +#define AVX_UNROLL 256 /* TUNE ME */ + +ENTRY(clear_page_nocache_avx) + CFI_STARTPROC + push %rdi + call kernel_fpu_begin + pop %rdi + sub $32,%rsp + CFI_ADJUST_CFA_OFFSET 32 + vmovdqu %ymm0,(%rsp) + vxorpd %ymm0,%ymm0,%ymm0 + movl $4096/AVX_UNROLL,%ecx + .p2align 4 +.Lloop_avx: + decl %ecx + .set x,0 + .rept AVX_UNROLL/32 + vmovntdq %ymm0,x(%rdi) + .set x,x+32 + .endr + leaq AVX_UNROLL(%rdi),%rdi + jnz .Lloop_avx + vmovdqu (%rsp),%ymm0 + addq $32,%rsp + CFI_ADJUST_CFA_OFFSET -32 + jmp kernel_fpu_end + CFI_ENDPROC +ENDPROC(clear_page_nocache_avx) + +#endif -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
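For readers who do not want to trace the assembly: the movntdq/vmovntdq loops above are plain non-temporal stores that bypass the cache hierarchy. A rough userspace analogue of the SSE path, written with compiler intrinsics, purely illustrative and not how the kernel does it (in the kernel the FPU state must also be saved and restored, which is what the kernel_fpu_begin/end calls are for):

	#include <emmintrin.h>
	#include <stddef.h>

	/* buf must be 16-byte aligned and len a multiple of 64 */
	static void clear_nocache(void *buf, size_t len)
	{
		__m128i zero = _mm_setzero_si128();
		char *p = buf;
		size_t off;

		for (off = 0; off < len; off += 64) {
			_mm_stream_si128((__m128i *)(p + off +  0), zero);
			_mm_stream_si128((__m128i *)(p + off + 16), zero);
			_mm_stream_si128((__m128i *)(p + off + 32), zero);
			_mm_stream_si128((__m128i *)(p + off + 48), zero);
		}
		_mm_sfence();	/* order the non-temporal stores before later accesses */
	}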
* [PATCH 7/8] remove KM_USER0 from kmap_atomic call 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (5 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2 Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 14:34 ` [PATCH 8/8] fix up references to kernel_fpu_begin/end Alex Thorlton 2013-08-16 14:47 ` [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp Peter Zijlstra 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a088ed7..d3f6dc7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1233,7 +1233,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) void clear_user_highpage_nocache(struct page *page, unsigned long vaddr) { - void *p = kmap_atomic(page, KM_USER0); + void *p = kmap_atomic(page); clear_page_nocache(p); kunmap_atomic(p); } -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* [PATCH 8/8] fix up references to kernel_fpu_begin/end 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (6 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 7/8] remove KM_USER0 from kmap_atomic call Alex Thorlton @ 2013-08-16 14:34 ` Alex Thorlton 2013-08-16 14:47 ` [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp Peter Zijlstra 8 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 14:34 UTC (permalink / raw) To: linux-kernel Cc: Alex Thorlton, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt --- arch/x86/lib/clear_page_nocache_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/lib/clear_page_nocache_64.S b/arch/x86/lib/clear_page_nocache_64.S index a6d938c..d03408e 100644 --- a/arch/x86/lib/clear_page_nocache_64.S +++ b/arch/x86/lib/clear_page_nocache_64.S @@ -23,7 +23,7 @@ ENTRY(clear_page_nocache) CFI_STARTPROC push %rdi - call kernel_fpu_begin + call __kernel_fpu_begin pop %rdi sub $16,%rsp CFI_ADJUST_CFA_OFFSET 16 @@ -43,7 +43,7 @@ ENTRY(clear_page_nocache) movdqu (%rsp),%xmm0 addq $16,%rsp CFI_ADJUST_CFA_OFFSET -16 - jmp kernel_fpu_end + jmp __kernel_fpu_end CFI_ENDPROC ENDPROC(clear_page_nocache) @@ -65,7 +65,7 @@ ENDPROC(clear_page_nocache) ENTRY(clear_page_nocache_avx) CFI_STARTPROC push %rdi - call kernel_fpu_begin + call __kernel_fpu_begin pop %rdi sub $32,%rsp CFI_ADJUST_CFA_OFFSET 32 @@ -85,7 +85,7 @@ ENTRY(clear_page_nocache_avx) vmovdqu (%rsp),%ymm0 addq $32,%rsp CFI_ADJUST_CFA_OFFSET -32 - jmp kernel_fpu_end + jmp __kernel_fpu_end CFI_ENDPROC ENDPROC(clear_page_nocache_avx) -- 1.7.12.4 ^ permalink raw reply related [flat|nested] 57+ messages in thread
* Re: [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp 2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton ` (7 preceding siblings ...) 2013-08-16 14:34 ` [PATCH 8/8] fix up references to kernel_fpu_begin/end Alex Thorlton @ 2013-08-16 14:47 ` Peter Zijlstra 2013-08-16 15:04 ` Alex Thorlton 8 siblings, 1 reply; 57+ messages in thread From: Peter Zijlstra @ 2013-08-16 14:47 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt On Fri, Aug 16, 2013 at 09:33:56AM -0500, Alex Thorlton wrote: > This first set of tests was run on the latest community kernel, with the > vclear patches: > > Kernel string: Kernel 3.11.0-rc5-medusa-00021-g1a15a96-dirty > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled > [always] madvise never > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh > ... > Done. Terminating the simulation. > > real 25m34.052s > user 10769m7.948s > sys 37m46.524s > > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# echo never > /sys/kernel/mm/transparent_hugepage/enabled > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# cat /sys/kernel/mm/transparent_hugepage/enabled > always madvise [never] > harp31-sys:/home/estes04/athorlton/Testing/progs/thp_benchmarks/321.equake_l# time ./run.sh > ... > Done. Terminating the simulation. > > real 5m0.377s > user 2202m0.684s > sys 108m31.816s > This 321.equake_l thing is not public, right? Do you have anything that is public that shows the same problem? Do you have any idea where all the extra time is spend? ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp 2013-08-16 14:47 ` [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp Peter Zijlstra @ 2013-08-16 15:04 ` Alex Thorlton 0 siblings, 0 replies; 57+ messages in thread From: Alex Thorlton @ 2013-08-16 15:04 UTC (permalink / raw) To: Peter Zijlstra Cc: linux-kernel, Ingo Molnar, Andrew Morton, Mel Gorman, Kirill A . Shutemov, Rik van Riel, Johannes Weiner, Eric W . Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E . McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt > This 321.equake_l thing is not public, right? Do you have anything > that > is public that shows the same problem? The quake test comes from the SPEC OMP benchmarks. While I believe this suite is available to anybody, I don't think it's free. I was given the test by our benchmarking team, I'll have to ask them if this is publicly available/if they know of any publicly available tests that can reproduce the issue. > Do you have any idea where all the extra time is spend? I'm working on that as we speak. I was trying to eliminate the possibility that the vclear patches would help solve the problem, before digging any deeper. I should have some more information on exactly where it's getting hung up in a few hours. - Alex ^ permalink raw reply [flat|nested] 57+ messages in thread
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton 2013-08-02 19:53 ` Dave Jones 2013-08-02 20:13 ` Kirill A. Shutemov @ 2013-08-04 14:44 ` Rasmus Villemoes 2013-08-28 13:56 ` Andrea Arcangeli 3 siblings, 0 replies; 57+ messages in thread From: Rasmus Villemoes @ 2013-08-04 14:44 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Alex Thorlton <athorlton@sgi.com> writes: > This patch implements functionality to allow processes to disable the use of > transparent hugepages through the prctl syscall. A few comments: Is there a reason it shouldn't be possible for a process to un-disable/reenable thp? > +static inline int transparent_hugepage_enabled(struct vm_area_struct *vma) > +{ > + return !current->thp_disabled & _transparent_hugepage_enabled(vma); > +} Should probably be &&. > diff --git a/include/linux/sched.h b/include/linux/sched.h > index 50d04b9..f084c76 100644 > --- a/include/linux/sched.h > +++ b/include/linux/sched.h > @@ -1406,6 +1406,9 @@ struct task_struct { > unsigned int sequential_io; > unsigned int sequential_io_avg; > #endif > +#ifdef CONFIG_TRANSPARENT_HUGEPAGE > + int thp_disabled; > +#endif > }; Is there a reason this needs a whole int in task_struct and not just a single bit? Rasmus ^ permalink raw reply [flat|nested] 57+ messages in thread
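Folding in Rasmus's two code comments, a logical && instead of the bitwise &, and a single bit in task_struct instead of a full int, would give roughly the following; this is a sketch of the suggested fix, not a posted v2:

	/* include/linux/sched.h */
	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		unsigned thp_disabled:1;
	#endif

	/* include/linux/huge_mm.h */
	static inline int transparent_hugepage_enabled(struct vm_area_struct *vma)
	{
		return !current->thp_disabled && _transparent_hugepage_enabled(vma);
	}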
* Re: [PATCH] Add per-process flag to control thp 2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton ` (2 preceding siblings ...) 2013-08-04 14:44 ` Rasmus Villemoes @ 2013-08-28 13:56 ` Andrea Arcangeli 3 siblings, 0 replies; 57+ messages in thread From: Andrea Arcangeli @ 2013-08-28 13:56 UTC (permalink / raw) To: Alex Thorlton Cc: linux-kernel, Ingo Molnar, Peter Zijlstra, Andrew Morton, Mel Gorman, Kirill A. Shutemov, Rik van Riel, Johannes Weiner, Eric W. Biederman, Sedat Dilek, Frederic Weisbecker, Dave Jones, Michael Kerrisk, Paul E. McKenney, David Howells, Thomas Gleixner, Al Viro, Oleg Nesterov, Srikar Dronamraju, Kees Cook, Robin Holt Hi everyone, On Fri, Aug 02, 2013 at 02:46:59PM -0500, Alex Thorlton wrote: > This patch implements functionality to allow processes to disable the use of > transparent hugepages through the prctl syscall. > > We've determined that some jobs perform significantly better with thp disabled, > and we need a way to control thp on a per-process basis, without relying on > madvise. Are you using automatic NUMA balancing or CPU pinning or was this a short lived computation? If you're not using automatic NUMA balancing or if you're using automatic NUMA balancing and this is a fairly short lived computation, zone_reclaim_mode was broken with THP enabled as it never called into compaction, so with THP enabled you would have a random NUMA placement during page fault to satisfy hugepage allocation (first priority was to get any hugepage from any wrong node, only then it would call compaction). Did you verify the numa placement that you were getting? I fixed the zone_reclaim_mode for THP and posted the fixes on linux-mm, so you may want to try that fix just in case it helps your workload. Chances are you work with a big system where zone_reclaim_mode is enabled by default, and for a test I would suggest you to enable it once even if it's not enabled by default (after applying the fixes) to see if it makes any difference. I just did a push of my latest tree to the master branch of aa.git on kernel.org, so you can just pull that for a quick test with zone_reclaim_mode enabled. NOTE: it's not guaranteed to help but it's worth a try. As you pointed out if the threads are working on separate 4k fragments, it won't help. But as things stands now THP enabled would breaks zone_reclaim_mode and so without hard memory pinning it wouldn't perform as good as with THP disabled. ^ permalink raw reply [flat|nested] 57+ messages in thread
Thread overview: 57+ messages
2013-08-02 19:46 [PATCH] Add per-process flag to control thp Alex Thorlton
2013-08-02 19:53 ` Dave Jones
2013-08-02 20:00 ` Alex Thorlton
2013-08-02 20:13 ` Kirill A. Shutemov
2013-08-02 20:34 ` Alex Thorlton
2013-08-02 23:59 ` Kirill A. Shutemov
2013-08-03 19:35 ` Kees Cook
2013-08-04 14:19 ` Oleg Nesterov
2013-08-05 2:36 ` Andi Kleen
2013-08-05 15:07 ` Alex Thorlton
2013-08-16 14:33 ` [PATCH 0/8] " Alex Thorlton
2013-08-16 14:33 ` [PATCH 1/8] THP: Use real address for NUMA policy Alex Thorlton
2013-08-16 17:53 ` Dave Hansen
2013-08-16 18:17 ` Alex Thorlton
2013-08-16 18:52 ` Kirill A. Shutemov
2013-08-27 16:50 ` Alex Thorlton
2013-08-27 17:01 ` Robin Holt
2013-09-04 15:43 ` Alex Thorlton
2013-09-04 17:15 ` Alex Thorlton
2013-09-05 11:15 ` Ingo Molnar
2013-09-09 16:48 ` Alex Thorlton
2013-09-10 7:47 ` [benchmark] THP performance testcase Ingo Molnar
2013-09-13 13:06 ` [PATCH 0/9] split page table lock for PMD tables Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 1/9] mm: rename SPLIT_PTLOCKS to SPLIT_PTE_PTLOCKS Kirill A. Shutemov
2013-09-13 15:20 ` Dave Hansen
2013-09-13 13:06 ` [PATCH 2/9] mm: convert mm->nr_ptes to atomic_t Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 3/9] mm: introduce api for split page table lock for PMD level Kirill A. Shutemov
2013-09-13 13:19 ` Peter Zijlstra
2013-09-13 14:22 ` Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 4/9] mm, thp: change pmd_trans_huge_lock() to return taken lock Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 5/9] mm, thp: move ptl taking inside page_check_address_pmd() Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 6/9] mm, thp: do not access mm->pmd_huge_pte directly Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 7/9] mm: convent the rest to new page table lock api Kirill A. Shutemov
2013-09-13 13:06 ` [PATCH 8/9] mm: implement split page table lock for PMD level Kirill A. Shutemov
2013-09-13 13:24 ` Peter Zijlstra
2013-09-13 14:25 ` Kirill A. Shutemov
2013-09-13 14:52 ` Peter Zijlstra
2013-09-13 13:36 ` Peter Zijlstra
2013-09-13 14:25 ` Kirill A. Shutemov
2013-09-13 15:45 ` Naoya Horiguchi
2013-09-13 19:57 ` Dave Hansen
2013-09-13 13:06 ` [PATCH 9/9] x86, mm: enable " Kirill A. Shutemov
[not found] ` <20130828091814.GA13681@gmail.com>
2013-08-28 9:32 ` [PATCH 1/8] THP: Use real address for NUMA policy Peter Zijlstra
2013-08-16 19:46 ` Peter Zijlstra
2013-08-16 19:49 ` Alex Thorlton
2013-08-16 14:33 ` [PATCH 2/8] mm: make clear_huge_page tolerate non aligned address Alex Thorlton
2013-08-16 14:33 ` [PATCH 3/8] THP: Pass real, not rounded, address to clear_huge_page Alex Thorlton
2013-08-16 14:34 ` [PATCH 4/8] x86: Add clear_page_nocache Alex Thorlton
2013-08-16 14:34 ` [PATCH 5/8] mm: make clear_huge_page cache clear only around the fault address Alex Thorlton
2013-08-16 18:02 ` Dave Hansen
2013-08-16 14:34 ` [PATCH 6/8] x86: switch the 64bit uncached page clear to SSE/AVX v2 Alex Thorlton
2013-08-16 14:34 ` [PATCH 7/8] remove KM_USER0 from kmap_atomic call Alex Thorlton
2013-08-16 14:34 ` [PATCH 8/8] fix up references to kernel_fpu_begin/end Alex Thorlton
2013-08-16 14:47 ` [PATCH 0/8] Re: [PATCH] Add per-process flag to control thp Peter Zijlstra
2013-08-16 15:04 ` Alex Thorlton
2013-08-04 14:44 ` Rasmus Villemoes
2013-08-28 13:56 ` Andrea Arcangeli