linux-mm.kvack.org archive mirror
* [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy
  2011-02-21 19:07 Fix NUMA problems in transparent hugepages and KSM Andi Kleen
@ 2011-02-21 19:07 ` Andi Kleen
  2011-02-22 15:42   ` Christoph Lameter
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2011-02-21 19:07 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm, linux-kernel, aarcange, lwoodman, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Currently alloc_pages_vma always uses the local node as policy node
for the LOCAL policy. Pass this node down as an argument instead.

No behaviour change from this patch, but it will be needed for follow-on patches.
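
For illustration, a sketch of how callers are expected to use the new
argument (an assumption based on later patches in this series, not
something this patch itself does):

	/* Fault path: the faulting task's node is still the right choice. */
	page = alloc_pages_vma(gfp, order, vma, addr, numa_node_id());

	/*
	 * A background daemon acting on another task's memory can now
	 * pass the node where that task's data already lives instead
	 * (old_page here is a hypothetical stand-in for that page):
	 */
	page = alloc_pages_vma(gfp, order, vma, addr, page_to_nid(old_page));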

Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/gfp.h |    9 +++++----
 mm/huge_memory.c    |    2 +-
 mm/mempolicy.c      |   11 +++++------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0b84c61..782e74a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 	return alloc_pages_current(gfp_mask, order);
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
-			struct vm_area_struct *vma, unsigned long addr);
+		    	struct vm_area_struct *vma, unsigned long addr,
+			int node);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr)	\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)	\
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
-#define alloc_page_vma(gfp_mask, vma, addr)	\
-	alloc_pages_vma(gfp_mask, 0, vma, addr)
+#define alloc_page_vma(gfp_mask, vma, addr)			\
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e62ddb8..73ecca5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -653,7 +653,7 @@ static inline struct page *alloc_hugepage_vma(int defrag,
 					      unsigned long haddr)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr);
+			       HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
 }
 
 #ifndef CONFIG_NUMA
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 76c51b7..d3d1e747 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+	int nd)
 {
-	int nd = numa_node_id();
-
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
 		if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
 				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
-		zl = policy_zonelist(gfp_flags, *mpol);
+		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
 		if ((*mpol)->mode == MPOL_BIND)
 			*nodemask = &(*mpol)->v.nodes;
 	}
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
@@ -1836,7 +1835,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		put_mems_allowed();
 		return page;
 	}
-	zl = policy_zonelist(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
 		 * slow path: ref counted shared policy
-- 
1.7.4


* Re: [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy
  2011-02-21 19:07 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
@ 2011-02-22 15:42   ` Christoph Lameter
  0 siblings, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2011-02-22 15:42 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-mm, linux-kernel, aarcange, lwoodman, Andi Kleen

On Mon, 21 Feb 2011, Andi Kleen wrote:

> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
>  }
>
>  /* Return a zonelist indicated by gfp for node representing a mempolicy */
> -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
> +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
> +	int nd)
>  {
> -	int nd = numa_node_id();
> -
>  	switch (policy->mode) {
>  	case MPOL_PREFERRED:
>  		if (!(policy->flags & MPOL_F_LOCAL))
> @@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
>  		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
>  				huge_page_shift(hstate_vma(vma))), gfp_flags);
>  	} else {
> -		zl = policy_zonelist(gfp_flags, *mpol);
> +		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
>  		if ((*mpol)->mode == MPOL_BIND)
>  			*nodemask = &(*mpol)->v.nodes;
>  	}

If we do that, then why not also consolidate the MPOL_INTERLEAVE
treatment in policy_zonelist()? It looks awfully similar now, and doing
so would simplify the code and likely get rid of some functions.
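
Something like this rough sketch, perhaps (hypothetical, and it would
mean passing the extra context MPOL_INTERLEAVE needs into
policy_zonelist()):

	static struct zonelist *policy_zonelist(gfp_t gfp,
			struct mempolicy *policy, int nd,
			struct vm_area_struct *vma, unsigned long addr,
			int shift)
	{
		switch (policy->mode) {
		case MPOL_INTERLEAVE:
			/* Fold the interleave node in here so callers
			 * like alloc_pages_vma() and huge_zonelist()
			 * no longer special-case it. */
			nd = interleave_nid(policy, vma, addr, shift);
			break;
		case MPOL_PREFERRED:
			if (!(policy->flags & MPOL_F_LOCAL))
				nd = policy->v.preferred_node;
			break;
		default:
			break;
		}
		return node_zonelist(nd, gfp);
	}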


* Fix NUMA problems in transparent hugepages v2
@ 2011-02-23  1:51 Andi Kleen
  2011-02-23  1:51 ` [PATCH 1/8] Fix interleaving for " Andi Kleen
                   ` (7 more replies)
  0 siblings, 8 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:51 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm

v2: I dropped the controversial KSM changes and fixed
the interleaving bug. Now the series is purely about transparent huge
pages.

The current transparent hugepages daemon can mess up local
memory affinity on NUMA systems. When it copies memory into a
huge page, it does not necessarily keep the data on the same
node as the original local allocations.

While fixing this I also found some more related issues:
- The NUMA policy interleaving for THP was using the small
page size, not the large page size.
- THP copies also did not preserve the local node.
- The accounting for local/remote allocations in the daemon
was misleading.
- There were no VM statistics counters for THP, which made it
impossible to analyze.
 
At least some of the bug fixes are 2.6.38 candidates IMHO,
because some of the NUMA problems are pretty bad and can cause
noticeable performance problems in some workloads.

What can be delayed are the __GFP_OTHER_NODE and statistics changes.

Git tree:

  git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc-2.6.git thp-numa

Andi Kleen (8):
      Fix interleaving for transparent hugepages v2
      Change alloc_pages_vma to pass down the policy node for local policy
      Add alloc_page_vma_node
      Preserve original node for transparent huge page copies
      Use correct numa policy node for transparent hugepages
      Add __GFP_OTHER_NODE flag
      Use GFP_OTHER_NODE for transparent huge pages
      Add VM counters for transparent hugepages

 include/linux/gfp.h    |   13 ++++++++---
 include/linux/vmstat.h |   11 ++++++++-
 mm/huge_memory.c       |   49 +++++++++++++++++++++++++++++++++--------------
 mm/mempolicy.c         |   16 +++++++-------
 mm/page_alloc.c        |    2 +-
 mm/vmstat.c            |   17 ++++++++++++++-
 6 files changed, 76 insertions(+), 32 deletions(-)


* [PATCH 1/8] Fix interleaving for transparent hugepages v2
  2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
@ 2011-02-23  1:51 ` Andi Kleen
  2011-02-23 19:26   ` Christoph Lameter
  2011-02-24 23:23   ` Andrea Arcangeli
  2011-02-23  1:51 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
                   ` (6 subsequent siblings)
  7 siblings, 2 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:51 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, Andi Kleen, aarcange

From: Andi Kleen <ak@linux.intel.com>

Bugfix, independent of the rest of the series.

The THP code didn't pass the correct interleaving shift to the memory
policy code. Fix this here by adjusting for the order.
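
To see why, here is a simplified standalone model of the interleave
calculation (an illustration, not the kernel code): the virtual offset
is shifted by the mapping granularity before being mapped onto a node.
With a plain PAGE_SHIFT, each 4K subpage of a 2M huge page could hash
to a different node; with PAGE_SHIFT + order, whole huge pages
interleave instead, which is what a huge page allocation needs -- all
of its subpages must come from one node.

	static int interleave_node(unsigned long addr, unsigned long vm_start,
				   int shift, int nr_nodes)
	{
		unsigned long off = (addr - vm_start) >> shift;

		return off % nr_nodes;	/* round-robin over the allowed nodes */
	}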

v2: Use + (thanks Christoph)
Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 mm/mempolicy.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 368fc9d..49355a9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1830,7 +1830,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
 		put_mems_allowed();
-- 
1.7.4


* [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy
  2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
  2011-02-23  1:51 ` [PATCH 1/8] Fix interleaving for " Andi Kleen
@ 2011-02-23  1:51 ` Andi Kleen
  2011-02-23  1:51 ` [PATCH 3/8] Add alloc_page_vma_node Andi Kleen
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:51 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, Andi Kleen, aarcange

From: Andi Kleen <ak@linux.intel.com>

Currently alloc_pages_vma always uses the local node as policy node
for the LOCAL policy. Pass this node down as an argument instead.

No behaviour change from this patch, but it will be needed for follow-on patches.

Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/gfp.h |    9 +++++----
 mm/huge_memory.c    |    2 +-
 mm/mempolicy.c      |   11 +++++------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0b84c61..782e74a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 	return alloc_pages_current(gfp_mask, order);
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
-			struct vm_area_struct *vma, unsigned long addr);
+		    	struct vm_area_struct *vma, unsigned long addr,
+			int node);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr)	\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)	\
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
-#define alloc_page_vma(gfp_mask, vma, addr)	\
-	alloc_pages_vma(gfp_mask, 0, vma, addr)
+#define alloc_page_vma(gfp_mask, vma, addr)			\
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e62ddb8..73ecca5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -653,7 +653,7 @@ static inline struct page *alloc_hugepage_vma(int defrag,
 					      unsigned long haddr)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr);
+			       HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
 }
 
 #ifndef CONFIG_NUMA
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 49355a9..25a5a91 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+	int nd)
 {
-	int nd = numa_node_id();
-
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
 		if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
 				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
-		zl = policy_zonelist(gfp_flags, *mpol);
+		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
 		if ((*mpol)->mode == MPOL_BIND)
 			*nodemask = &(*mpol)->v.nodes;
 	}
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
@@ -1836,7 +1835,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		put_mems_allowed();
 		return page;
 	}
-	zl = policy_zonelist(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
 		 * slow path: ref counted shared policy
-- 
1.7.4


* [PATCH 3/8] Add alloc_page_vma_node
  2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
  2011-02-23  1:51 ` [PATCH 1/8] Fix interleaving for " Andi Kleen
  2011-02-23  1:51 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
@ 2011-02-23  1:51 ` Andi Kleen
  2011-02-23  1:51 ` [PATCH 4/8] Preserve original node for transparent huge page copies Andi Kleen
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:51 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, Andi Kleen, aarcange

From: Andi Kleen <ak@linux.intel.com>

Add an alloc_page_vma_node() that allows passing the "local" node in.
It is used in a follow-on patch.
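
A minimal usage sketch (this is essentially what a later patch in the
series does):

	/* Allocate the replacement page on the node of the original page,
	 * not on whatever node the calling daemon happens to run on. */
	new = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, address,
				  page_to_nid(page));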

Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/gfp.h |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 782e74a..814d50e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -343,6 +343,8 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)			\
 	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
+#define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
+	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
-- 
1.7.4


* [PATCH 4/8] Preserve original node for transparent huge page copies
  2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
                   ` (2 preceding siblings ...)
  2011-02-23  1:51 ` [PATCH 3/8] Add alloc_page_vma_node Andi Kleen
@ 2011-02-23  1:51 ` Andi Kleen
  2011-02-23  1:51 ` [PATCH 5/8] Use correct numa policy node for transparent hugepages Andi Kleen
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:51 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, Andi Kleen, aarcange

From: Andi Kleen <ak@linux.intel.com>

This makes a difference for LOCAL policy, where the node cannot
be determined from the policy itself, but has to be gotten
from the original page.

Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 mm/huge_memory.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 73ecca5..00a5c39 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	}
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-					  vma, address);
+		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE,
+					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
 			     mem_cgroup_newpage_charge(pages[i], mm,
 						       GFP_KERNEL))) {
-- 
1.7.4


* [PATCH 5/8] Use correct numa policy node for transparent hugepages
  2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
                   ` (3 preceding siblings ...)
  2011-02-23  1:51 ` [PATCH 4/8] Preserve original node for transparent huge page copies Andi Kleen
@ 2011-02-23  1:51 ` Andi Kleen
  2011-02-23  1:52 ` [PATCH 6/8] Add __GFP_OTHER_NODE flag Andi Kleen
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:51 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, Andi Kleen, aarcange

From: Andi Kleen <ak@linux.intel.com>

Pass down the correct node for a transparent hugepage allocation.
Most callers continue to use the current node; however, the khugepaged
daemon now uses the node of the first page to be collapsed instead.
This ensures that khugepaged does not mess up local memory for an
existing process which uses local policy.

The choice of node is somewhat primitive currently: it just uses the
node of the first page in the pmd range. An alternative would be to
look at multiple pages and use the most popular node. I used the
simplest variant for now, which should work well enough for the common
case of all pages being on the same node.
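
For reference, a sketch of that "most popular node" alternative (not
implemented here; purely illustrative):

	/* Hypothetical: tally the node of each present page in the
	 * range and collapse onto the node holding the most of them.
	 * Note votes[] would be too big for the stack on large
	 * MAX_NUMNODES configurations. */
	static int most_popular_node(struct page **pages, int nr)
	{
		int votes[MAX_NUMNODES] = { 0 };
		int i, nid, best = 0;

		for (i = 0; i < nr; i++) {
			nid = page_to_nid(pages[i]);
			if (++votes[nid] > votes[best])
				best = nid;
		}
		return best;
	}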

Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 mm/huge_memory.c |   24 +++++++++++++++++-------
 mm/mempolicy.c   |    3 ++-
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 00a5c39..5a05b35 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag)
 
 static inline struct page *alloc_hugepage_vma(int defrag,
 					      struct vm_area_struct *vma,
-					      unsigned long haddr)
+					      unsigned long haddr, int nd)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
+			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
 #ifndef CONFIG_NUMA
@@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (unlikely(khugepaged_enter(vma)))
 			return VM_FAULT_OOM;
 		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					  vma, haddr);
+					  vma, haddr, numa_node_id());
 		if (unlikely(!page))
 			goto out;
 		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					      vma, haddr);
+					      vma, haddr, numa_node_id());
 	else
 		new_page = NULL;
 
@@ -1745,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 static void collapse_huge_page(struct mm_struct *mm,
 			       unsigned long address,
 			       struct page **hpage,
-			       struct vm_area_struct *vma)
+			       struct vm_area_struct *vma,
+			       int node)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -1773,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
+	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+				      node);
 	if (unlikely(!new_page)) {
 		up_read(&mm->mmap_sem);
 		*hpage = ERR_PTR(-ENOMEM);
@@ -1917,6 +1919,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	struct page *page;
 	unsigned long _address;
 	spinlock_t *ptl;
+	int node = -1;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1947,6 +1950,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		page = vm_normal_page(vma, _address, pteval);
 		if (unlikely(!page))
 			goto out_unmap;
+		/*
+		 * Choose the node of the first page. This could
+		 * be more sophisticated and look at more pages,
+		 * but isn't for now.
+		 */
+		if (node == -1)
+			node = page_to_nid(page);
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -1963,7 +1973,7 @@ out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret)
 		/* collapse_huge_page will return with the mmap_sem released */
-		collapse_huge_page(mm, address, hpage, vma);
+		collapse_huge_page(mm, address, hpage, vma, node);
 out:
 	return ret;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 25a5a91..151c20c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1891,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
 	else
 		page = __alloc_pages_nodemask(gfp, order,
-			policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+	      			policy_zonelist(gfp, pol, numa_node_id()), 
+				policy_nodemask(gfp, pol));
 	put_mems_allowed();
 	return page;
 }
-- 
1.7.4


* [PATCH 6/8] Add __GFP_OTHER_NODE flag
  2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
                   ` (4 preceding siblings ...)
  2011-02-23  1:51 ` [PATCH 5/8] Use correct numa policy node for transparent hugepages Andi Kleen
@ 2011-02-23  1:52 ` Andi Kleen
  2011-02-23  1:52 ` [PATCH 7/8] Use GFP_OTHER_NODE for transparent huge pages Andi Kleen
  2011-02-23  1:52 ` [PATCH 8/8] Add VM counters for transparent hugepages Andi Kleen
  7 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:52 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, Andi Kleen, aarcange

From: Andi Kleen <ak@linux.intel.com>

Add a new __GFP_OTHER_NODE flag to tell the low-level NUMA statistics
in zone_statistics() that an allocation is on behalf of another thread.
This way the local and remote counters can still be correct, even
when background daemons like khugepaged are changing memory
mappings.
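
A worked example of the accounting (hypothetical numbers): khugepaged
runs on node 0 and collapses memory of a task whose pages live on
node 1, passing node 1 as the policy node, e.g.:

	page = alloc_pages_vma(GFP_TRANSHUGE | __GFP_OTHER_NODE,
			       HPAGE_PMD_ORDER, vma, haddr, 1);

The allocation comes from node 1. Without the flag, zone_statistics()
compares node 1 against numa_node_id() == 0 and counts NUMA_OTHER;
with __GFP_OTHER_NODE set, it compares against the preferred zone's
node (1) and counts NUMA_LOCAL, matching the task's point of view.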

This only affects the accounting, but I think it's worth doing that
right to avoid confusing users.

I first tried to just pass down the right node, but this required
a lot of changes to pass the parameter down, including at least one
addition of a 10th argument to a 9-argument function. Using
the flag is a lot less intrusive.

Open question: should this also be used for migration?

Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/gfp.h    |    2 ++
 include/linux/vmstat.h |    4 ++--
 mm/page_alloc.c        |    2 +-
 mm/vmstat.c            |    9 +++++++--
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 814d50e..a064724 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -35,6 +35,7 @@ struct vm_area_struct;
 #define ___GFP_NOTRACK		0
 #endif
 #define ___GFP_NO_KSWAPD	0x400000u
+#define ___GFP_OTHER_NODE	0x800000u
 
 /*
  * GFP bitmasks..
@@ -83,6 +84,7 @@ struct vm_area_struct;
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
 #define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
+#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
 
 /*
  * This may seem redundant, but it's a way of annotating false positives vs.
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 833e676..9b5c63d 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -220,12 +220,12 @@ static inline unsigned long node_page_state(int node,
 		zone_page_state(&zones[ZONE_MOVABLE], item);
 }
 
-extern void zone_statistics(struct zone *, struct zone *);
+extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
 
 #else
 
 #define node_page_state(node, item) global_page_state(item)
-#define zone_statistics(_zl,_z) do { } while (0)
+#define zone_statistics(_zl,_z, gfp) do { } while (0)
 
 #endif /* CONFIG_NUMA */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a873e61..4ce06a6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1333,7 +1333,7 @@ again:
 	}
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
-	zone_statistics(preferred_zone, zone);
+	zone_statistics(preferred_zone, zone, gfp_flags);
 	local_irq_restore(flags);
 
 	VM_BUG_ON(bad_range(zone, page));
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0c3b504..2b461ed 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -500,8 +500,12 @@ void refresh_cpu_vm_stats(int cpu)
  * z 	    = the zone from which the allocation occurred.
  *
  * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set, assume the node of the preferred
+ * zone is the local node. This is useful for daemons that allocate
+ * memory on behalf of other processes.
  */
-void zone_statistics(struct zone *preferred_zone, struct zone *z)
+void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
 {
 	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
 		__inc_zone_state(z, NUMA_HIT);
@@ -509,7 +513,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
 		__inc_zone_state(z, NUMA_MISS);
 		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
 	}
-	if (z->node == numa_node_id())
+	if (z->node == ((flags & __GFP_OTHER_NODE) ? 
+			preferred_zone->node : numa_node_id()))
 		__inc_zone_state(z, NUMA_LOCAL);
 	else
 		__inc_zone_state(z, NUMA_OTHER);
-- 
1.7.4


* [PATCH 7/8] Use GFP_OTHER_NODE for transparent huge pages
  2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
                   ` (5 preceding siblings ...)
  2011-02-23  1:52 ` [PATCH 6/8] Add __GFP_OTHER_NODE flag Andi Kleen
@ 2011-02-23  1:52 ` Andi Kleen
  2011-02-24  4:56   ` Andrea Arcangeli
  2011-02-23  1:52 ` [PATCH 8/8] Add VM counters for transparent hugepages Andi Kleen
  7 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:52 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, Andi Kleen, aarcange

From: Andi Kleen <ak@linux.intel.com>

Pass __GFP_OTHER_NODE for transparent hugepage NUMA allocations
done by the khugepaged daemon. This way the low-level accounting
for local versus remote pages works correctly.

Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 mm/huge_memory.c |   18 ++++++++++--------
 1 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5a05b35..877756e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -643,16 +643,17 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	return ret;
 }
 
-static inline gfp_t alloc_hugepage_gfpmask(int defrag)
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
 {
-	return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
+	return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
 }
 
 static inline struct page *alloc_hugepage_vma(int defrag,
 					      struct vm_area_struct *vma,
-					      unsigned long haddr, int nd)
+					      unsigned long haddr, int nd,
+					      gfp_t extra_gfp)
 {
-	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
+	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
 			       HPAGE_PMD_ORDER, vma, haddr, nd);
 }
 
@@ -678,7 +679,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (unlikely(khugepaged_enter(vma)))
 			return VM_FAULT_OOM;
 		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					  vma, haddr, numa_node_id());
+					  vma, haddr, numa_node_id(), 0);
 		if (unlikely(!page))
 			goto out;
 		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
@@ -799,7 +800,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	}
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE,
+		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 
+					       __GFP_OTHER_NODE,
 					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
 			     mem_cgroup_newpage_charge(pages[i], mm,
@@ -902,7 +904,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow())
 		new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					      vma, haddr, numa_node_id());
+					      vma, haddr, numa_node_id(), 0);
 	else
 		new_page = NULL;
 
@@ -1775,7 +1777,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * scalability.
 	 */
 	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
-				      node);
+				      node, __GFP_OTHER_NODE);
 	if (unlikely(!new_page)) {
 		up_read(&mm->mmap_sem);
 		*hpage = ERR_PTR(-ENOMEM);
-- 
1.7.4


* [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
                   ` (6 preceding siblings ...)
  2011-02-23  1:52 ` [PATCH 7/8] Use GFP_OTHER_NODE for transparent huge pages Andi Kleen
@ 2011-02-23  1:52 ` Andi Kleen
  2011-02-24  4:18   ` Andrea Arcangeli
                     ` (2 more replies)
  7 siblings, 3 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-23  1:52 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, Andi Kleen, aarcange

From: Andi Kleen <ak@linux.intel.com>

I found it difficult to make sense of transparent huge pages without
having any counters for their actions. Add some counters to vmstat
for allocations of transparent hugepages and fallbacks to smaller
pages.

Optional patch, but useful for development and understanding the system.
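
As a usage illustration, a small hypothetical userspace reader for the
new counters, e.g. to compute the fault-time fallback ratio:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char name[64];
		unsigned long long val, alloc = 0, fallback = 0;
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f)
			return 1;
		while (fscanf(f, "%63s %llu", name, &val) == 2) {
			if (!strcmp(name, "thp_direct_alloc"))
				alloc = val;
			else if (!strcmp(name, "thp_direct_fallback"))
				fallback = val;
		}
		fclose(f);
		printf("thp allocs %llu, fallbacks %llu (%.1f%% fallback)\n",
		       alloc, fallback,
		       alloc + fallback ?
		       100.0 * fallback / (alloc + fallback) : 0.0);
		return 0;
	}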

Cc: aarcange@redhat.com
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/vmstat.h |    7 +++++++
 mm/huge_memory.c       |   13 ++++++++++---
 mm/vmstat.c            |    8 ++++++++
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 9b5c63d..7794d1a7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -58,6 +58,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		UNEVICTABLE_PGCLEARED,	/* on COW, page truncate */
 		UNEVICTABLE_PGSTRANDED,	/* unable to isolate on unlock */
 		UNEVICTABLE_MLOCKFREED,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	        THP_DIRECT_ALLOC,
+		THP_DAEMON_ALLOC,	
+		THP_DIRECT_FALLBACK,	
+		THP_DAEMON_ALLOC_FAILED,
+		THP_SPLIT,
+#endif
 		NR_VM_EVENT_ITEMS
 };
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 877756e..4ef8c32 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -680,13 +680,15 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			return VM_FAULT_OOM;
 		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 					  vma, haddr, numa_node_id(), 0);
-		if (unlikely(!page))
+		if (unlikely(!page)) {
+			count_vm_event(THP_DIRECT_FALLBACK);
 			goto out;
+		}
 		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
 			put_page(page);
 			goto out;
 		}
-
+		count_vm_event(THP_DIRECT_ALLOC);
 		return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
 	}
 out:
@@ -909,6 +911,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		new_page = NULL;
 
 	if (unlikely(!new_page)) {
+		count_vm_event(THP_DIRECT_FALLBACK);
 		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 						   pmd, orig_pmd, page, haddr);
 		put_page(page);
@@ -921,7 +924,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}
-
+	count_vm_event(THP_DIRECT_ALLOC);
 	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
@@ -1780,6 +1783,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 				      node, __GFP_OTHER_NODE);
 	if (unlikely(!new_page)) {
 		up_read(&mm->mmap_sem);
+		count_vm_event(THP_DAEMON_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
 		return;
 	}
@@ -2286,6 +2290,9 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
 		spin_unlock(&mm->page_table_lock);
 		return;
 	}
+
+	count_vm_event(THP_SPLIT);
+
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!page_count(page));
 	get_page(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 2b461ed..f3ab7e9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -946,6 +946,14 @@ static const char * const vmstat_text[] = {
 	"unevictable_pgs_stranded",
 	"unevictable_pgs_mlockfreed",
 #endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	"thp_direct_alloc",
+	"thp_daemon_alloc",
+	"thp_direct_fallback",
+	"thp_daemon_alloc_failed",
+	"thp_split",
+#endif
 };
 
 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
-- 
1.7.4


* Re: [PATCH 1/8] Fix interleaving for transparent hugepages v2
  2011-02-23  1:51 ` [PATCH 1/8] Fix interleaving for " Andi Kleen
@ 2011-02-23 19:26   ` Christoph Lameter
  2011-02-24 23:23   ` Andrea Arcangeli
  1 sibling, 0 replies; 27+ messages in thread
From: Christoph Lameter @ 2011-02-23 19:26 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel, linux-mm, Andi Kleen, aarcange


Reviewed-by: Christoph Lameter <cl@linux.com>


* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-23  1:52 ` [PATCH 8/8] Add VM counters for transparent hugepages Andi Kleen
@ 2011-02-24  4:18   ` Andrea Arcangeli
  2011-02-24 22:43     ` Dave Hansen
  2011-02-24 22:43   ` Dave Hansen
  2011-02-25  0:51   ` Andrea Arcangeli
  2 siblings, 1 reply; 27+ messages in thread
From: Andrea Arcangeli @ 2011-02-24  4:18 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel, linux-mm, Andi Kleen

Incremental fix for your patch 8 (I doubt it was intentional).

===
Subject: thp: move THP_SPLIT from __split_huge_page_pmd to inner split_huge_page

From: Andrea Arcangeli <aarcange@redhat.com>

Provide more accurate stats by accounting every split_huge_page call,
not only the ones coming from pmd manipulations.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 mm/huge_memory.c |    4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1393,6 +1393,7 @@ int split_huge_page(struct page *page)
 
 	BUG_ON(!PageSwapBacked(page));
 	__split_huge_page(page, anon_vma);
+	count_vm_event(THP_SPLIT);
 
 	BUG_ON(PageCompound(page));
 out_unlock:
@@ -2287,9 +2288,6 @@ void __split_huge_page_pmd(struct mm_str
 		spin_unlock(&mm->page_table_lock);
 		return;
 	}
-
-	count_vm_event(THP_SPLIT);
-
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!page_count(page));
 	get_page(page);


* Re: [PATCH 7/8] Use GFP_OTHER_NODE for transparent huge pages
  2011-02-23  1:52 ` [PATCH 7/8] Use GFP_OTHER_NODE for transparent huge pages Andi Kleen
@ 2011-02-24  4:56   ` Andrea Arcangeli
  0 siblings, 0 replies; 27+ messages in thread
From: Andrea Arcangeli @ 2011-02-24  4:56 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel, linux-mm, Andi Kleen

This fixes the build with CONFIG_NUMA=n for patch 7 (noticed on my
laptop, which isn't NUMA yet ;).

===
Subject: thp: add extra_gfp in alloc_hugepage non NUMA

From: Andrea Arcangeli <aarcange@redhat.com>

Add extra_gfp to avoid build failure with CONFIG_NUMA=n.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 mm/huge_memory.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -660,7 +660,7 @@ static inline struct page *alloc_hugepag
 #ifndef CONFIG_NUMA
 static inline struct page *alloc_hugepage(int defrag)
 {
-	return alloc_pages(alloc_hugepage_gfpmask(defrag),
+	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
 			   HPAGE_PMD_ORDER);
 }
 #endif


* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-23  1:52 ` [PATCH 8/8] Add VM counters for transparent hugepages Andi Kleen
  2011-02-24  4:18   ` Andrea Arcangeli
@ 2011-02-24 22:43   ` Dave Hansen
  2011-02-24 23:14     ` Andrea Arcangeli
  2011-02-25  0:51   ` Andrea Arcangeli
  2 siblings, 1 reply; 27+ messages in thread
From: Dave Hansen @ 2011-02-24 22:43 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel, linux-mm, Andi Kleen, aarcange

On Tue, 2011-02-22 at 17:52 -0800, Andi Kleen wrote:
> @@ -2286,6 +2290,9 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
>  		spin_unlock(&mm->page_table_lock);
>  		return;
>  	}
> +
> +	count_vm_event(THP_SPLIT);
> +
>  	page = pmd_page(*pmd);
>  	VM_BUG_ON(!page_count(page));
>  	get_page(page);

Hey Andi,

Your split counter tracks the split_huge_page_pmd() calls, but misses
plain split_huge_page() calls.  Did you do this on purpose?  Could we
move the counter into the low-level split function like below?

---

 linux-2.6.git-dave/mm/huge_memory.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff -puN mm/huge_memory.c~move-THP_SPLIT mm/huge_memory.c
--- linux-2.6.git/mm/huge_memory.c~move-THP_SPLIT	2011-02-24 14:37:32.825288409 -0800
+++ linux-2.6.git-dave/mm/huge_memory.c	2011-02-24 14:39:01.767939971 -0800
@@ -1342,6 +1342,8 @@ static void __split_huge_page(struct pag
 	BUG_ON(!PageHead(page));
 	BUG_ON(PageTail(page));
 
+	count_vm_event(THP_SPLIT);
+
 	mapcount = 0;
 	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
 		struct vm_area_struct *vma = avc->vma;
@@ -2293,8 +2295,6 @@ void __split_huge_page_pmd(struct mm_str
 		return;
 	}
 
-	count_vm_event(THP_SPLIT);
-
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!page_count(page));
 	get_page(page);

-- Dave


* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-24  4:18   ` Andrea Arcangeli
@ 2011-02-24 22:43     ` Dave Hansen
  2011-02-24 23:15       ` Andrea Arcangeli
  0 siblings, 1 reply; 27+ messages in thread
From: Dave Hansen @ 2011-02-24 22:43 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andi Kleen, akpm, linux-kernel, linux-mm, Andi Kleen

On Thu, 2011-02-24 at 05:18 +0100, Andrea Arcangeli wrote:
> Incremental fix for your patch 8 (I doubt it was intentional).

Bah, sorry.  Should have read one more message down the thread. :)

-- Dave


* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-24 22:43   ` Dave Hansen
@ 2011-02-24 23:14     ` Andrea Arcangeli
  2011-02-25  1:36       ` Andi Kleen
  0 siblings, 1 reply; 27+ messages in thread
From: Andrea Arcangeli @ 2011-02-24 23:14 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Andi Kleen, akpm, linux-kernel, linux-mm, Andi Kleen

On Thu, Feb 24, 2011 at 02:43:04PM -0800, Dave Hansen wrote:
> On Tue, 2011-02-22 at 17:52 -0800, Andi Kleen wrote:
> > @@ -2286,6 +2290,9 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
> >  		spin_unlock(&mm->page_table_lock);
> >  		return;
> >  	}
> > +
> > +	count_vm_event(THP_SPLIT);
> > +
> >  	page = pmd_page(*pmd);
> >  	VM_BUG_ON(!page_count(page));
> >  	get_page(page);
> 
> Hey Andi,
> 
> Your split counter tracks the split_huge_page_pmd() calls, but misses
> plain split_huge_page() calls.  Did you do this on purpose?  Could we
> move the counter in to the low-level split function like below?

Agreed, I already noticed and posted this same change in Message-ID:
20110224041851.GF31195

> diff -puN mm/huge_memory.c~move-THP_SPLIT mm/huge_memory.c
> --- linux-2.6.git/mm/huge_memory.c~move-THP_SPLIT	2011-02-24 14:37:32.825288409 -0800
> +++ linux-2.6.git-dave/mm/huge_memory.c	2011-02-24 14:39:01.767939971 -0800
> @@ -1342,6 +1342,8 @@ static void __split_huge_page(struct pag
>  	BUG_ON(!PageHead(page));
>  	BUG_ON(PageTail(page));
>  
> +	count_vm_event(THP_SPLIT);
> +
>  	mapcount = 0;
>  	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
>  		struct vm_area_struct *vma = avc->vma;

I've a micropreference for having it in the split_huge_page() success
path after __split_huge_page() returns, as the __ function is where the
brainier code is, and stat code is annoying to read mixed into the
more complex code. Not that it makes any practical difference though.


* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-24 22:43     ` Dave Hansen
@ 2011-02-24 23:15       ` Andrea Arcangeli
  0 siblings, 0 replies; 27+ messages in thread
From: Andrea Arcangeli @ 2011-02-24 23:15 UTC (permalink / raw)
  To: Dave Hansen; +Cc: Andi Kleen, akpm, linux-kernel, linux-mm, Andi Kleen

On Thu, Feb 24, 2011 at 02:43:50PM -0800, Dave Hansen wrote:
> On Thu, 2011-02-24 at 05:18 +0100, Andrea Arcangeli wrote:
> > Incremental fix for your patch 8 (I doubt it was intentional).
> 
> Bah, sorry.  Should have read one more message down the thread. :)

no problem ;).


* Re: [PATCH 1/8] Fix interleaving for transparent hugepages v2
  2011-02-23  1:51 ` [PATCH 1/8] Fix interleaving for " Andi Kleen
  2011-02-23 19:26   ` Christoph Lameter
@ 2011-02-24 23:23   ` Andrea Arcangeli
  1 sibling, 0 replies; 27+ messages in thread
From: Andrea Arcangeli @ 2011-02-24 23:23 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel, linux-mm, Andi Kleen, David Rientjes

For patches 1-5 and 8:

Acked-by: Andrea Arcangeli <aarcange@redhat.com>

For patches 6-7 I have to trust that this branch is really worth it. I
agree khugepaged can hardly do better, but this comes at the cost of
one more branch for something that looks like a minor issue. I'm
neutral: if others like it, it's fine with me (I think David didn't
like it though, but he didn't answer Andi's last email, so I'm CCing
him in case he wants to elaborate further).

My incremental patch for patch 8 is also needed, and my incremental
patch for patch 7 is needed if patches 6-7 get applied.

They're good candidates for 2.6.38, but I don't rate them extremely
urgent, with the exception of patch 1, which is in fact already in -mm.

In some ways this also shows how the default NUMA policy is
inefficient, if the best it can do is look at where the page was
initially allocated without any knowledge of where the task last ran.
But I don't want to risk making things worse, so for the short term
it's an OK fix (it's not a band-aid, it's really a fix for a heuristic
that is not good enough, and it can't make things worse, unlike the KSM
change in the previous series that definitely did). I hope that in the
long term getting info from the page in khugepaged won't be needed
anymore and it can be rolled back.

Thanks a lot Andi,
Andrea


* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-23  1:52 ` [PATCH 8/8] Add VM counters for transparent hugepages Andi Kleen
  2011-02-24  4:18   ` Andrea Arcangeli
  2011-02-24 22:43   ` Dave Hansen
@ 2011-02-25  0:51   ` Andrea Arcangeli
  2011-02-25  1:12     ` Andi Kleen
  2 siblings, 1 reply; 27+ messages in thread
From: Andrea Arcangeli @ 2011-02-25  0:51 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel, linux-mm, Andi Kleen

On Tue, Feb 22, 2011 at 05:52:02PM -0800, Andi Kleen wrote:
> +	"thp_direct_alloc",
> +	"thp_daemon_alloc",
> +	"thp_direct_fallback",
> +	"thp_daemon_alloc_failed",

I've been wondering if we should do s/daemon/khugepaged/ or
s/daemon/collapse/.

And s/direct/fault/.

Comments welcome.


* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-25  0:51   ` Andrea Arcangeli
@ 2011-02-25  1:12     ` Andi Kleen
  2011-02-25  1:34       ` Andrea Arcangeli
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2011-02-25  1:12 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Andi Kleen, akpm, linux-kernel, linux-mm, Andi Kleen

On Fri, Feb 25, 2011 at 01:51:55AM +0100, Andrea Arcangeli wrote:
> On Tue, Feb 22, 2011 at 05:52:02PM -0800, Andi Kleen wrote:
> > +	"thp_direct_alloc",
> > +	"thp_daemon_alloc",
> > +	"thp_direct_fallback",
> > +	"thp_daemon_alloc_failed",
> 
> I've been wondering if we should do s/daemon/khugepaged/ or

Fine by me.

> s/daemon/collapse/.
> 
> And s/direct/fault/.

Fine for me too.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.


* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-25  1:12     ` Andi Kleen
@ 2011-02-25  1:34       ` Andrea Arcangeli
  0 siblings, 0 replies; 27+ messages in thread
From: Andrea Arcangeli @ 2011-02-25  1:34 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, linux-kernel, linux-mm, Andi Kleen

On Fri, Feb 25, 2011 at 02:12:05AM +0100, Andi Kleen wrote:
> On Fri, Feb 25, 2011 at 01:51:55AM +0100, Andrea Arcangeli wrote:
> > On Tue, Feb 22, 2011 at 05:52:02PM -0800, Andi Kleen wrote:
> > > +	"thp_direct_alloc",
> > > +	"thp_daemon_alloc",
> > > +	"thp_direct_fallback",
> > > +	"thp_daemon_alloc_failed",
> > 
> > I've been wondering if we should do s/daemon/khugepaged/ or
> 
> Fine by me.
> 
> > s/daemon/collapse/.
> > 
> > And s/direct/fault/.
> 
> Fine for me too.

So this would be it. (incremental with previous patch I sent that
adjusts the location of THP_SPLIT)

===
Subject: thp: make vmstat more accurate

From: Andrea Arcangeli <aarcange@redhat.com>

s/direct/fault/g s/daemon/collapse/g

It's better to account the allocation even if memcg fails afterwards,
as long as the allocation itself succeeded: that gives a bit more
accurate ratios on the effectiveness of the VM in creating hugepages.
This also adds coverage for the non-NUMA case, which now actually uses
THP_COLLAPSE_ALLOC. thp_collapse_alloc is closely related to
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed, but
just as with memcg, it also accounts the case where the strict
_allocation_ succeeds but the collapse can't go through after the
mmap_sem has been released for a little while (pages_collapsed only
accounts collapses that really went through, in addition to requiring
the strict THP allocation).

Output under heavy swap load with khugepaged scan_sleep_millisecs=0
(and new kswapd compaction logic) follows.

$ vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa
 5 10 3390416 147772   2108   8476    0 42196     0 42196 6908  610  0  1 88 11
 1 12 3493968 153624   2104   8968    0 103552     0 103552 2664  901  0  6 41 52
 1 13 3598636 158336   2104   8404    0 104668     0 104668  778  431  0  5 60 34
 1 12 3377120 130148   2104   7576  184 42120   184 42120  998  399  0  5 38 57
 0 11 2419352 149360   2104   8844  232 19936   232 19936 9028  718  3  4 83 11
 0 13 2488964 139476   2104   8036    0 76184     0 76184 3340 1133  0  1 89 11
$ grep thp /proc/vmstat 
thp_fault_alloc 44725
thp_fault_fallback 364
thp_collapse_alloc 59
thp_collapse_alloc_failed 3
thp_split 14223

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
---
 include/linux/vmstat.h |    8 ++++----
 mm/huge_memory.c       |   27 +++++++++++++++++++--------
 mm/vmstat.c            |    8 ++++----
 3 files changed, 27 insertions(+), 16 deletions(-)

--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -59,10 +59,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS
 		UNEVICTABLE_PGSTRANDED,	/* unable to isolate on unlock */
 		UNEVICTABLE_MLOCKFREED,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	        THP_DIRECT_ALLOC,
-		THP_DAEMON_ALLOC,
-		THP_DIRECT_FALLBACK,
-		THP_DAEMON_ALLOC_FAILED,
+	        THP_FAULT_ALLOC,
+		THP_FAULT_FALLBACK,
+		THP_COLLAPSE_ALLOC,
+		THP_COLLAPSE_ALLOC_FAILED,
 		THP_SPLIT,
 #endif
 		NR_VM_EVENT_ITEMS
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -681,14 +681,14 @@ int do_huge_pmd_anonymous_page(struct mm
 		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
 					  vma, haddr, numa_node_id(), 0);
 		if (unlikely(!page)) {
-			count_vm_event(THP_DIRECT_FALLBACK);
+			count_vm_event(THP_FAULT_FALLBACK);
 			goto out;
 		}
+		count_vm_event(THP_FAULT_ALLOC);
 		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
 			put_page(page);
 			goto out;
 		}
-		count_vm_event(THP_DIRECT_ALLOC);
 		return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
 	}
 out:
@@ -911,12 +911,13 @@ int do_huge_pmd_wp_page(struct mm_struct
 		new_page = NULL;
 
 	if (unlikely(!new_page)) {
-		count_vm_event(THP_DIRECT_FALLBACK);
+		count_vm_event(THP_FAULT_FALLBACK);
 		ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
 						   pmd, orig_pmd, page, haddr);
 		put_page(page);
 		goto out;
 	}
+	count_vm_event(THP_FAULT_ALLOC);
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
@@ -924,7 +925,7 @@ int do_huge_pmd_wp_page(struct mm_struct
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}
-	count_vm_event(THP_DIRECT_ALLOC);
+
 	copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
 	__SetPageUptodate(new_page);
 
@@ -1784,10 +1785,11 @@ static void collapse_huge_page(struct mm
 				      node, __GFP_OTHER_NODE);
 	if (unlikely(!new_page)) {
 		up_read(&mm->mmap_sem);
-		count_vm_event(THP_DAEMON_ALLOC_FAILED);
+		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 		*hpage = ERR_PTR(-ENOMEM);
 		return;
 	}
+	count_vm_event(THP_COLLAPSE_ALLOC);
 #endif
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		up_read(&mm->mmap_sem);
@@ -2152,8 +2154,11 @@ static void khugepaged_do_scan(struct pa
 #ifndef CONFIG_NUMA
 		if (!*hpage) {
 			*hpage = alloc_hugepage(khugepaged_defrag());
-			if (unlikely(!*hpage))
+			if (unlikely(!*hpage)) {
+				count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 				break;
+			}
+			count_vm_event(THP_COLLAPSE_ALLOC);
 		}
 #else
 		if (IS_ERR(*hpage))
@@ -2193,8 +2198,11 @@ static struct page *khugepaged_alloc_hug
 
 	do {
 		hpage = alloc_hugepage(khugepaged_defrag());
-		if (!hpage)
+		if (!hpage) {
+			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 			khugepaged_alloc_sleep();
+		} else
+			count_vm_event(THP_COLLAPSE_ALLOC);
 	} while (unlikely(!hpage) &&
 		 likely(khugepaged_enabled()));
 	return hpage;
@@ -2211,8 +2219,11 @@ static void khugepaged_loop(void)
 	while (likely(khugepaged_enabled())) {
 #ifndef CONFIG_NUMA
 		hpage = khugepaged_alloc_hugepage();
-		if (unlikely(!hpage))
+		if (unlikely(!hpage)) {
+			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
 			break;
+		}
+		count_vm_event(THP_COLLAPSE_ALLOC);
 #else
 		if (IS_ERR(hpage)) {
 			khugepaged_alloc_sleep();
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -948,10 +948,10 @@ static const char * const vmstat_text[] 
 #endif
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	"thp_direct_alloc",
-	"thp_daemon_alloc",
-	"thp_direct_fallback",
-	"thp_daemon_alloc_failed",
+	"thp_fault_alloc",
+	"thp_fault_fallback",
+	"thp_collapse_alloc",
+	"thp_collapse_alloc_failure",
 	"thp_split",
 #endif
 };
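
For reference, a minimal userspace sketch (not part of this patch; it
assumes only the /proc/vmstat text format shown above) that dumps the
thp_* counters these hunks export:

	/* dump the thp_* event counters from /proc/vmstat */
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[128];
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f) {
			perror("/proc/vmstat");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "thp_", 4))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}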


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/8] Add VM counters for transparent hugepages
  2011-02-24 23:14     ` Andrea Arcangeli
@ 2011-02-25  1:36       ` Andi Kleen
  0 siblings, 0 replies; 27+ messages in thread
From: Andi Kleen @ 2011-02-25  1:36 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Dave Hansen, Andi Kleen, akpm, linux-kernel, linux-mm, Andi Kleen

> I have a slight preference for having it in the split_huge_page success
> path after __split_huge_page returns, as the __ function is where the
> brainier code is, and stat code is annoying to read mixed into the more
> complex code. Not that it makes any practical difference, though.

Thanks for the improvements.

-Andi


-- 
ak@linux.intel.com -- Speaking for myself only.


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy
  2011-03-03  0:45 Fix NUMA problems in transparent hugepages and KSM Andi Kleen
@ 2011-03-03  0:45 ` Andi Kleen
  2011-03-03  2:14   ` KAMEZAWA Hiroyuki
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2011-03-03  0:45 UTC (permalink / raw)
  To: akpm; +Cc: aarcange, linux-mm, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Currently alloc_pages_vma always uses the local node as the policy node
for the LOCAL policy. Pass this node down as an argument instead.

No behaviour change from this patch, but it will be needed for follow-ons.
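
A minimal sketch of the kind of follow-on this enables (hypothetical
here; patch 3/8, "Add alloc_page_vma_node", carries the real
definition) is a wrapper that lets callers name the policy node:

	#define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
		alloc_pages_vma(gfp_mask, 0, vma, addr, node)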

Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/gfp.h |    9 +++++----
 mm/huge_memory.c    |    2 +-
 mm/mempolicy.c      |   11 +++++------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0b84c61..782e74a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 	return alloc_pages_current(gfp_mask, order);
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
-			struct vm_area_struct *vma, unsigned long addr);
+		    	struct vm_area_struct *vma, unsigned long addr,
+			int node);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr)	\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)	\
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
-#define alloc_page_vma(gfp_mask, vma, addr)	\
-	alloc_pages_vma(gfp_mask, 0, vma, addr)
+#define alloc_page_vma(gfp_mask, vma, addr)			\
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3e29781..c7c2cd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -653,7 +653,7 @@ static inline struct page *alloc_hugepage_vma(int defrag,
 					      unsigned long haddr)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr);
+			       HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
 }
 
 #ifndef CONFIG_NUMA
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 49355a9..25a5a91 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+	int nd)
 {
-	int nd = numa_node_id();
-
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
 		if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
 				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
-		zl = policy_zonelist(gfp_flags, *mpol);
+		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
 		if ((*mpol)->mode == MPOL_BIND)
 			*nodemask = &(*mpol)->v.nodes;
 	}
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
@@ -1836,7 +1835,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		put_mems_allowed();
 		return page;
 	}
-	zl = policy_zonelist(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
 		 * slow path: ref counted shared policy
-- 
1.7.4


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy
  2011-03-03  0:45 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
@ 2011-03-03  2:14   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 27+ messages in thread
From: KAMEZAWA Hiroyuki @ 2011-03-03  2:14 UTC (permalink / raw)
  To: Andi Kleen; +Cc: akpm, aarcange, linux-mm, linux-kernel, Andi Kleen

On Wed,  2 Mar 2011 16:45:22 -0800
Andi Kleen <andi@firstfloor.org> wrote:

> From: Andi Kleen <ak@linux.intel.com>
> 
> Currently alloc_pages_vma always uses the local node as the policy node
> for the LOCAL policy. Pass this node down as an argument instead.
> 
> No behaviour change from this patch, but it will be needed for follow-ons.
> 
> Acked-by: Andrea Arcangeli <aarcange@redhat.com>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>

Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy
  2011-03-03 19:59 Fix NUMA problems in transparent hugepages and KSM Andi Kleen
@ 2011-03-03 19:59 ` Andi Kleen
  2011-03-07  8:36   ` KOSAKI Motohiro
  0 siblings, 1 reply; 27+ messages in thread
From: Andi Kleen @ 2011-03-03 19:59 UTC (permalink / raw)
  To: akpm; +Cc: linux-mm, linux-kernel, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Currently alloc_pages_vma always uses the local node as the policy node
for the LOCAL policy. Pass this node down as an argument instead.

No behaviour change from this patch, but it will be needed for follow-ons.

Acked-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/gfp.h |    9 +++++----
 mm/huge_memory.c    |    2 +-
 mm/mempolicy.c      |   11 +++++------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0b84c61..782e74a 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 	return alloc_pages_current(gfp_mask, order);
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
-			struct vm_area_struct *vma, unsigned long addr);
+		    	struct vm_area_struct *vma, unsigned long addr,
+			int node);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr)	\
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)	\
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
-#define alloc_page_vma(gfp_mask, vma, addr)	\
-	alloc_pages_vma(gfp_mask, 0, vma, addr)
+#define alloc_page_vma(gfp_mask, vma, addr)			\
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3e29781..c7c2cd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -653,7 +653,7 @@ static inline struct page *alloc_hugepage_vma(int defrag,
 					      unsigned long haddr)
 {
 	return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
-			       HPAGE_PMD_ORDER, vma, haddr);
+			       HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
 }
 
 #ifndef CONFIG_NUMA
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 49355a9..25a5a91 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
 }
 
 /* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+	int nd)
 {
-	int nd = numa_node_id();
-
 	switch (policy->mode) {
 	case MPOL_PREFERRED:
 		if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
 				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
-		zl = policy_zonelist(gfp_flags, *mpol);
+		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
 		if ((*mpol)->mode == MPOL_BIND)
 			*nodemask = &(*mpol)->v.nodes;
 	}
@@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
@@ -1836,7 +1835,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		put_mems_allowed();
 		return page;
 	}
-	zl = policy_zonelist(gfp, pol);
+	zl = policy_zonelist(gfp, pol, node);
 	if (unlikely(mpol_needs_cond_ref(pol))) {
 		/*
 		 * slow path: ref counted shared policy
-- 
1.7.4


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy
  2011-03-03 19:59 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
@ 2011-03-07  8:36   ` KOSAKI Motohiro
  0 siblings, 0 replies; 27+ messages in thread
From: KOSAKI Motohiro @ 2011-03-07  8:36 UTC (permalink / raw)
  To: Andi Kleen; +Cc: kosaki.motohiro, akpm, linux-mm, linux-kernel, Andi Kleen

> From: Andi Kleen <ak@linux.intel.com>
> 
> Currently alloc_pages_vma always uses the local node as the policy node
> for the LOCAL policy. Pass this node down as an argument instead.
> 
> No behaviour change from this patch, but it will be needed for follow-ons.
> 
> Acked-by: Andrea Arcangeli <aarcange@redhat.com>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>




^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2011-03-07  8:36 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-23  1:51 Fix NUMA problems in transparent hugepages v2 Andi Kleen
2011-02-23  1:51 ` [PATCH 1/8] Fix interleaving for " Andi Kleen
2011-02-23 19:26   ` Christoph Lameter
2011-02-24 23:23   ` Andrea Arcangeli
2011-02-23  1:51 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
2011-02-23  1:51 ` [PATCH 3/8] Add alloc_page_vma_node Andi Kleen
2011-02-23  1:51 ` [PATCH 4/8] Preserve original node for transparent huge page copies Andi Kleen
2011-02-23  1:51 ` [PATCH 5/8] Use correct numa policy node for transparent hugepages Andi Kleen
2011-02-23  1:52 ` [PATCH 6/8] Add __GFP_OTHER_NODE flag Andi Kleen
2011-02-23  1:52 ` [PATCH 7/8] Use GFP_OTHER_NODE for transparent huge pages Andi Kleen
2011-02-24  4:56   ` Andrea Arcangeli
2011-02-23  1:52 ` [PATCH 8/8] Add VM counters for transparent hugepages Andi Kleen
2011-02-24  4:18   ` Andrea Arcangeli
2011-02-24 22:43     ` Dave Hansen
2011-02-24 23:15       ` Andrea Arcangeli
2011-02-24 22:43   ` Dave Hansen
2011-02-24 23:14     ` Andrea Arcangeli
2011-02-25  1:36       ` Andi Kleen
2011-02-25  0:51   ` Andrea Arcangeli
2011-02-25  1:12     ` Andi Kleen
2011-02-25  1:34       ` Andrea Arcangeli
  -- strict thread matches above, loose matches on Subject: below --
2011-03-03 19:59 Fix NUMA problems in transparent hugepages and KSM Andi Kleen
2011-03-03 19:59 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
2011-03-07  8:36   ` KOSAKI Motohiro
2011-03-03  0:45 Fix NUMA problems in transparent hugepages and KSM Andi Kleen
2011-03-03  0:45 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
2011-03-03  2:14   ` KAMEZAWA Hiroyuki
2011-02-21 19:07 Fix NUMA problems in transparent hugepages and KSM Andi Kleen
2011-02-21 19:07 ` [PATCH 2/8] Change alloc_pages_vma to pass down the policy node for local policy Andi Kleen
2011-02-22 15:42   ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).