* [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs
@ 2016-11-14 7:07 Huang Shijie
2016-11-14 7:07 ` [PATCH v2 1/6] mm: hugetlb: rename some allocation functions Huang Shijie
` (9 more replies)
0 siblings, 10 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-14 7:07 UTC (permalink / raw)
To: linux-arm-kernel
(1) Background
On arm64, the hugetlb page size can be 32M (PMD + Contiguous bit).
With 4K base pages, the maximum page order is 10 (MAX_ORDER - 1),
so a 32M page is a gigantic page.
The arm64 MMU supports a Contiguous bit, which is a hint that the TTE
is one of a set of contiguous entries that can be cached in a single
TLB entry. Please refer to the ARMv8 Architecture Reference Manual:
DDI0487A_f_armv8_arm.pdf (page D4-1811).
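As a quick illustration of the order arithmetic (a sketch only, not code from
this series; it just restates why hstate_is_gigantic() is true for this size):

/* Illustration: a 32M huge page with 4K base pages. */
#define PAGE_SHIFT		12	/* 4K base pages */
#define MAX_ORDER		11	/* largest buddy order is MAX_ORDER - 1 = 10 */
#define HPAGE_32M_SHIFT		25	/* 32M == 1UL << 25 */
#define HPAGE_32M_ORDER		(HPAGE_32M_SHIFT - PAGE_SHIFT)	/* == 13 */

/* Mirrors the hstate_is_gigantic() check: an order >= MAX_ORDER is too large
 * for the buddy allocator, so it must use the gigantic-page paths. */
_Static_assert(HPAGE_32M_ORDER >= MAX_ORDER,
	       "a 32M page is gigantic when the base page size is 4K");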
(2) The bug
When I ran the libhugetlbfs test suite, the test case "counter.sh"
failed with gigantic pages (32M pages on an arm64 board).
This patch set adds support for gigantic surplus hugetlb pages,
allowing the counter.sh unit test to pass.
v1 --> v2:
1.) Fix the compile error on x86.
2.) Add new patches for NUMA.
Patches #2 ~ #5 are new.
Huang Shijie (6):
mm: hugetlb: rename some allocation functions
mm: hugetlb: add a new parameter for some functions
mm: hugetlb: change the return type for alloc_fresh_gigantic_page
mm: mempolicy: introduce a helper huge_nodemask()
mm: hugetlb: add a new function to allocate a new gigantic page
mm: hugetlb: support gigantic surplus pages
include/linux/mempolicy.h | 8 +++
mm/hugetlb.c | 128 ++++++++++++++++++++++++++++++++++++----------
mm/mempolicy.c | 20 ++++++++
3 files changed, 130 insertions(+), 26 deletions(-)
--
2.5.5
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 1/6] mm: hugetlb: rename some allocation functions
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
@ 2016-11-14 7:07 ` Huang Shijie
2016-11-28 13:29 ` Vlastimil Babka
2016-11-14 7:07 ` [PATCH v2 2/6] mm: hugetlb: add a new parameter for some functions Huang Shijie
` (8 subsequent siblings)
9 siblings, 1 reply; 34+ messages in thread
From: Huang Shijie @ 2016-11-14 7:07 UTC (permalink / raw)
To: linux-arm-kernel
After a later patch in this series, __alloc_buddy_huge_page() will not
necessarily use the buddy allocator.
So this patch removes "buddy" from the names of these functions:
__alloc_buddy_huge_page -> __alloc_huge_page
__alloc_buddy_huge_page_no_mpol -> __alloc_huge_page_no_mpol
__alloc_buddy_huge_page_with_mpol -> __alloc_huge_page_with_mpol
This patch is a preparation for the later patches.
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
mm/hugetlb.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3edb759..496b703 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1157,6 +1157,10 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
static inline bool gigantic_page_supported(void) { return true; }
#else
+static inline struct page *alloc_gigantic_page(int nid, unsigned int order)
+{
+ return NULL;
+}
static inline bool gigantic_page_supported(void) { return false; }
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
@@ -1568,7 +1572,7 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
* For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
* implies that memory policies will not be taken in to account.
*/
-static struct page *__alloc_buddy_huge_page(struct hstate *h,
+static struct page *__alloc_huge_page(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr, int nid)
{
struct page *page;
@@ -1649,21 +1653,21 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
* anywhere.
*/
static
-struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
+struct page *__alloc_huge_page_no_mpol(struct hstate *h, int nid)
{
unsigned long addr = -1;
- return __alloc_buddy_huge_page(h, NULL, addr, nid);
+ return __alloc_huge_page(h, NULL, addr, nid);
}
/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
-struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct page *__alloc_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+ return __alloc_huge_page(h, vma, addr, NUMA_NO_NODE);
}
/*
@@ -1681,7 +1685,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
spin_unlock(&hugetlb_lock);
if (!page)
- page = __alloc_buddy_huge_page_no_mpol(h, nid);
+ page = __alloc_huge_page_no_mpol(h, nid);
return page;
}
@@ -1711,7 +1715,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
+ page = __alloc_huge_page_no_mpol(h, NUMA_NO_NODE);
if (!page) {
alloc_ok = false;
break;
@@ -2027,7 +2031,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
- page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
+ page = __alloc_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
@@ -2285,7 +2289,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with __alloc_buddy_huge_page() here and be unable
+ * We might race with __alloc_huge_page() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -2331,7 +2335,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * __alloc_buddy_huge_page() is checking the global counter,
+ * __alloc_huge_page() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH v2 2/6] mm: hugetlb: add a new parameter for some functions
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
2016-11-14 7:07 ` [PATCH v2 1/6] mm: hugetlb: rename some allocation functions Huang Shijie
@ 2016-11-14 7:07 ` Huang Shijie
2016-12-02 13:52 ` Michal Hocko
2016-11-14 7:07 ` [PATCH v2 3/6] mm: hugetlb: change the return type for alloc_fresh_gigantic_page Huang Shijie
` (7 subsequent siblings)
9 siblings, 1 reply; 34+ messages in thread
From: Huang Shijie @ 2016-11-14 7:07 UTC (permalink / raw)
To: linux-arm-kernel
This patch adds a new parameter, "no_init", to these functions:
alloc_fresh_gigantic_page_node()
alloc_fresh_gigantic_page()
prep_new_huge_page() does some initialization for the new page.
But sometimes we do not need it to do so, such as in the surplus case
handled by a later patch.
With this parameter, prep_new_huge_page() is only called when needed:
if "no_init" is false, alloc_fresh_gigantic_page_node() calls
prep_new_huge_page().
This patch is a preparation for the later patches.
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
mm/hugetlb.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 496b703..db0177b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1127,27 +1127,29 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
+static struct page *alloc_fresh_gigantic_page_node(struct hstate *h,
+ int nid, bool no_init)
{
struct page *page;
page = alloc_gigantic_page(nid, huge_page_order(h));
if (page) {
prep_compound_gigantic_page(page, huge_page_order(h));
- prep_new_huge_page(h, page, nid);
+ if (!no_init)
+ prep_new_huge_page(h, page, nid);
}
return page;
}
static int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed)
+ nodemask_t *nodes_allowed, bool no_init)
{
struct page *page = NULL;
int nr_nodes, node;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_gigantic_page_node(h, node);
+ page = alloc_fresh_gigantic_page_node(h, node, no_init);
if (page)
return 1;
}
@@ -1166,7 +1168,7 @@ static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
static inline int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed) { return 0; }
+ nodemask_t *nodes_allowed, bool no_init) { return 0; }
#endif
static void update_and_free_page(struct hstate *h, struct page *page)
@@ -2313,7 +2315,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
cond_resched();
if (hstate_is_gigantic(h))
- ret = alloc_fresh_gigantic_page(h, nodes_allowed);
+ ret = alloc_fresh_gigantic_page(h, nodes_allowed,
+ false);
else
ret = alloc_fresh_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH v2 3/6] mm: hugetlb: change the return type for alloc_fresh_gigantic_page
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
2016-11-14 7:07 ` [PATCH v2 1/6] mm: hugetlb: rename some allocation functions Huang Shijie
2016-11-14 7:07 ` [PATCH v2 2/6] mm: hugetlb: add a new parameter for some functions Huang Shijie
@ 2016-11-14 7:07 ` Huang Shijie
2016-12-02 13:56 ` Michal Hocko
2016-11-14 7:07 ` [PATCH v2 4/6] mm: mempolicy: introduce a helper huge_nodemask() Huang Shijie
` (6 subsequent siblings)
9 siblings, 1 reply; 34+ messages in thread
From: Huang Shijie @ 2016-11-14 7:07 UTC (permalink / raw)
To: linux-arm-kernel
This patch changes the return type of alloc_fresh_gigantic_page()
to "struct page *".
This patch is a preparation for the later patches.
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
mm/hugetlb.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db0177b..6995087 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1142,7 +1142,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h,
return page;
}
-static int alloc_fresh_gigantic_page(struct hstate *h,
+static struct page *alloc_fresh_gigantic_page(struct hstate *h,
nodemask_t *nodes_allowed, bool no_init)
{
struct page *page = NULL;
@@ -1151,10 +1151,10 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
page = alloc_fresh_gigantic_page_node(h, node, no_init);
if (page)
- return 1;
+ return page;
}
- return 0;
+ return NULL;
}
static inline bool gigantic_page_supported(void) { return true; }
@@ -1167,8 +1167,8 @@ static inline bool gigantic_page_supported(void) { return false; }
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
- nodemask_t *nodes_allowed, bool no_init) { return 0; }
+static inline struct page *alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed, bool no_init) { return NULL; }
#endif
static void update_and_free_page(struct hstate *h, struct page *page)
@@ -2315,7 +2315,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
cond_resched();
if (hstate_is_gigantic(h))
- ret = alloc_fresh_gigantic_page(h, nodes_allowed,
+ ret = !!alloc_fresh_gigantic_page(h, nodes_allowed,
false);
else
ret = alloc_fresh_huge_page(h, nodes_allowed);
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH v2 4/6] mm: mempolicy: introduce a helper huge_nodemask()
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
` (2 preceding siblings ...)
2016-11-14 7:07 ` [PATCH v2 3/6] mm: hugetlb: change the return type for alloc_fresh_gigantic_page Huang Shijie
@ 2016-11-14 7:07 ` Huang Shijie
2016-11-15 6:01 ` Aneesh Kumar K.V
2016-11-16 6:53 ` [PATCH V2 fix " Huang Shijie
2016-11-14 7:07 ` [PATCH v2 5/6] mm: hugetlb: add a new function to allocate a new gigantic page Huang Shijie
` (5 subsequent siblings)
9 siblings, 2 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-14 7:07 UTC (permalink / raw)
To: linux-arm-kernel
This patch introduces a new helper, huge_nodemask(),
which we can use to get the node mask for a VMA.
The idea for this function comes from huge_zonelist().
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
include/linux/mempolicy.h | 8 ++++++++
mm/mempolicy.c | 20 ++++++++++++++++++++
2 files changed, 28 insertions(+)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5e5b296..01173c6 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -145,6 +145,8 @@ extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
enum mpol_rebind_step step);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern nodemask_t *huge_nodemask(struct vm_area_struct *vma,
+ unsigned long addr);
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask);
@@ -261,6 +263,12 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}
+static inline nodemask_t *huge_nodemask(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ return NULL;
+}
+
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6d3639e..4830dd6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1800,6 +1800,26 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
#ifdef CONFIG_HUGETLBFS
/*
+ * huge_nodemask(@vma, @addr)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup and interleave policy
+ *
+ * If the effective policy is BIND, returns a pointer to the mempolicy's
+ * @nodemask.
+ */
+nodemask_t *huge_nodemask(struct vm_area_struct *vma, unsigned long addr)
+{
+ nodemask_t *nodes_mask = NULL;
+ struct mempolicy *mpol = get_vma_policy(vma, addr);
+
+ if (mpol->mode == MPOL_BIND)
+ nodes_mask = &mpol->v.nodes;
+ mpol_cond_put(mpol);
+
+ return nodes_mask;
+}
+
+/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup and interleave policy
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH v2 5/6] mm: hugetlb: add a new function to allocate a new gigantic page
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
` (3 preceding siblings ...)
2016-11-14 7:07 ` [PATCH v2 4/6] mm: mempolicy: introduce a helper huge_nodemask() Huang Shijie
@ 2016-11-14 7:07 ` Huang Shijie
2016-11-16 6:55 ` [PATCH V2 fix " Huang Shijie
2016-11-14 7:07 ` [PATCH v2 6/6] mm: hugetlb: support gigantic surplus pages Huang Shijie
` (4 subsequent siblings)
9 siblings, 1 reply; 34+ messages in thread
From: Huang Shijie @ 2016-11-14 7:07 UTC (permalink / raw)
To: linux-arm-kernel
There are three ways we can allocate a new gigantic page:
1. When NUMA is not enabled, use alloc_gigantic_page() to get
the gigantic page.
2. NUMA is enabled, but the vma is NULL.
There is no memory policy we can refer to,
so create a @nodes_allowed, initialize it with init_nodemask_of_mempolicy()
or init_nodemask_of_node(), and then use alloc_fresh_gigantic_page() to get
the gigantic page.
3. NUMA is enabled, and the vma is valid.
We can follow the memory policy of the @vma.
Get @nodes_mask with huge_nodemask(), and use alloc_fresh_gigantic_page()
to get the gigantic page.
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
mm/hugetlb.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 67 insertions(+)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6995087..58a59f0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1502,6 +1502,73 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
/*
* There are 3 ways this can get called:
+ *
+ * 1. When the NUMA is not enabled, use alloc_gigantic_page() to get
+ * the gigantic page.
+ *
+ * 2. The NUMA is enabled, but the vma is NULL.
+ * Create a @nodes_allowed, use alloc_fresh_gigantic_page() to get
+ * the gigantic page.
+ *
+ * 3. The NUMA is enabled, and the vma is valid.
+ * Use the @vma's memory policy.
+ * Get @nodes_mask by huge_nodemask(), and use alloc_fresh_gigantic_page()
+ * to get the gigantic page.
+ */
+static struct page *__hugetlb_alloc_gigantic_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+ struct page *page;
+ nodemask_t *nodes_mask;
+
+ /* Not NUMA */
+ if (!IS_ENABLED(CONFIG_NUMA)) {
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+
+ page = alloc_gigantic_page(nid, huge_page_order(h));
+ if (page)
+ prep_compound_gigantic_page(page, huge_page_order(h));
+
+ return page;
+ }
+
+ /* NUMA && !vma */
+ if (!vma) {
+ NODEMASK_ALLOC(nodemask_t, nodes_allowed,
+ GFP_KERNEL | __GFP_NORETRY);
+
+ if (nid == NUMA_NO_NODE) {
+ if (!init_nodemask_of_mempolicy(nodes_allowed)) {
+ NODEMASK_FREE(nodes_allowed);
+ nodes_allowed = &node_states[N_MEMORY];
+ }
+ } else if (nodes_allowed) {
+ init_nodemask_of_node(nodes_allowed, nid);
+ } else {
+ nodes_allowed = &node_states[N_MEMORY];
+ }
+
+ page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
+
+ if (nodes_allowed != &node_states[N_MEMORY])
+ NODEMASK_FREE(nodes_allowed);
+
+ return page;
+ }
+
+ /* NUMA && vma */
+ nodes_mask = huge_nodemask(vma, addr);
+ if (nodes_mask) {
+ page = alloc_fresh_gigantic_page(h, nodes_mask, true);
+ if (page)
+ return page;
+ }
+ return NULL;
+}
+
+/*
+ * There are 3 ways this can get called:
* 1. With vma+addr: we use the VMA's memory policy
* 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
* page from any node, and let the buddy allocator itself figure
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH v2 6/6] mm: hugetlb: support gigantic surplus pages
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
` (4 preceding siblings ...)
2016-11-14 7:07 ` [PATCH v2 5/6] mm: hugetlb: add a new function to allocate a new gigantic page Huang Shijie
@ 2016-11-14 7:07 ` Huang Shijie
2016-11-14 22:44 ` [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Andrew Morton
` (3 subsequent siblings)
9 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-14 7:07 UTC (permalink / raw)
To: linux-arm-kernel
When testing a gigantic page whose order is too large for the buddy
allocator, the libhugetlbfs test case "counter.sh" fails.
The failure is caused by:
1) The kernel fails to allocate a gigantic page for the surplus case,
so gather_surplus_pages() returns NULL in the end.
2) The condition checks for "over-commit" are wrong.
This patch uses __hugetlb_alloc_gigantic_page() to allocate the
gigantic page in __alloc_huge_page(). After this patch,
gather_surplus_pages() can return a gigantic page for the surplus case.
This patch also changes the condition checks in:
return_unused_surplus_pages()
nr_overcommit_hugepages_store()
hugetlb_overcommit_handler()
After this patch, counter.sh passes for gigantic pages.
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
mm/hugetlb.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 58a59f0..08e66ca 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1647,7 +1647,7 @@ static struct page *__alloc_huge_page(struct hstate *h,
struct page *page;
unsigned int r_nid;
- if (hstate_is_gigantic(h))
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
return NULL;
/*
@@ -1692,7 +1692,10 @@ static struct page *__alloc_huge_page(struct hstate *h,
}
spin_unlock(&hugetlb_lock);
- page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
+ if (hstate_is_gigantic(h))
+ page = __hugetlb_alloc_gigantic_page(h, vma, addr, nid);
+ else
+ page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
spin_lock(&hugetlb_lock);
if (page) {
@@ -1859,8 +1862,7 @@ static void return_unused_surplus_pages(struct hstate *h,
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
- /* Cannot return gigantic pages currently */
- if (hstate_is_gigantic(h))
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
return;
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -2577,7 +2579,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
- if (hstate_is_gigantic(h))
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
return -EINVAL;
err = kstrtoul(buf, 10, &input);
@@ -3018,7 +3020,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
tmp = h->nr_overcommit_huge_pages;
- if (write && hstate_is_gigantic(h))
+ if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
return -EINVAL;
table->data = &tmp;
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
` (5 preceding siblings ...)
2016-11-14 7:07 ` [PATCH v2 6/6] mm: hugetlb: support gigantic surplus pages Huang Shijie
@ 2016-11-14 22:44 ` Andrew Morton
2016-11-15 2:36 ` Huang Shijie
2016-11-28 14:20 ` Vlastimil Babka
` (2 subsequent siblings)
9 siblings, 1 reply; 34+ messages in thread
From: Andrew Morton @ 2016-11-14 22:44 UTC (permalink / raw)
To: linux-arm-kernel
On Mon, 14 Nov 2016 15:07:33 +0800 Huang Shijie <shijie.huang@arm.com> wrote:
> (1) Background
> On arm64, the hugetlb page size can be 32M (PMD + Contiguous bit).
> With 4K base pages, the maximum page order is 10 (MAX_ORDER - 1),
> so a 32M page is a gigantic page.
>
> The arm64 MMU supports a Contiguous bit, which is a hint that the TTE
> is one of a set of contiguous entries that can be cached in a single
> TLB entry. Please refer to the ARMv8 Architecture Reference Manual:
> DDI0487A_f_armv8_arm.pdf (page D4-1811).
>
> (2) The bug
> When I ran the libhugetlbfs test suite, the test case "counter.sh"
> failed with gigantic pages (32M pages on an arm64 board).
>
> This patch set adds support for gigantic surplus hugetlb pages,
> allowing the counter.sh unit test to pass.
I'm not really seeing a description of the actual bug. I don't know
what counter.sh is, there is no copy of counter.sh included in the
changelogs and there is no description of the kernel error which
counter.sh demonstrates.
So can you please send me a copy of counter.sh as well as a
suitable description of the kernel error which counter.sh triggers?
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs
2016-11-14 22:44 ` [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Andrew Morton
@ 2016-11-15 2:36 ` Huang Shijie
0 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-15 2:36 UTC (permalink / raw)
To: linux-arm-kernel
On Mon, Nov 14, 2016 at 02:44:29PM -0800, Andrew Morton wrote:
> On Mon, 14 Nov 2016 15:07:33 +0800 Huang Shijie <shijie.huang@arm.com> wrote:
> I'm not really seeing a description of the actual bug. I don't know
> what counter.sh is, there is no copy of counter.sh included in the
> changelogs and there is no description of the kernel error which
> counter.sh demonstrates.
>
> So can you please send me a copy of counter.sh as well as a
> suitable description of the kernel error which counter.sh triggers?
>
Sorry.
The counter.sh is just a wrapper around counter.c.
I have attached both; you can also find them at:
https://github.com/libhugetlbfs/libhugetlbfs/blob/master/tests/counters.c
https://github.com/libhugetlbfs/libhugetlbfs/blob/master/tests/counters.sh
The description:
The "counter.sh" test case fails when we test the ARM64 32M gigantic page.
The error is shown below:
----------------------------------------------------------
...........................................
LD_PRELOAD=libhugetlbfs.so shmoverride_unlinked (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so HUGETLB_SHM=yes shmoverride_unlinked (32M: 64): PASS
quota.sh (32M: 64): PASS
counters.sh (32M: 64): FAIL mmap failed: Invalid argument
********** TEST SUMMARY
* 32M
* 32-bit 64-bit
* Total testcases: 0 87
* Skipped: 0 0
* PASS: 0 86
* FAIL: 0 1
* Killed by signal: 0 0
* Bad configuration: 0 0
* Expected FAIL: 0 0
* Unexpected PASS: 0 0
* Strange test result: 0 0
**********
----------------------------------------------------------
The failure is caused by:
1) The kernel fails to allocate a gigantic page for the surplus case,
so gather_surplus_pages() returns NULL in the end.
2) The condition checks in some functions are wrong:
return_unused_surplus_pages()
nr_overcommit_hugepages_store()
hugetlb_overcommit_handler()
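For reference, the essence of the failing operation can be sketched as a tiny
program (an illustration only, not the attached counters.c; the 32M hugetlbfs
mount point and sysfs paths below are assumptions):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(32UL << 20)	/* the 32M gigantic hstate */

int main(void)
{
	/*
	 * Assumed setup:
	 *   mount -t hugetlbfs -o pagesize=32M none /mnt/huge32m
	 *   echo 0 > /sys/kernel/mm/hugepages/hugepages-32768kB/nr_hugepages
	 *   echo 1 > /sys/kernel/mm/hugepages/hugepages-32768kB/nr_overcommit_hugepages
	 *
	 * On an unpatched kernel the nr_overcommit_hugepages write is already
	 * rejected for a gigantic hstate (problem 2 above), and even if
	 * overcommit were accepted, gather_surplus_pages() cannot allocate a
	 * surplus gigantic page (problem 1 above).
	 */
	int fd = open("/mnt/huge32m/repro", O_CREAT | O_RDWR, 0600);
	void *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* MAP_SHARED needs a reservation; with an empty pool it has to be
	 * backed by a surplus page, which fails for gigantic pages. */
	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		perror("mmap");		/* fails on an unpatched kernel */
	else
		munmap(p, HPAGE_SIZE);

	close(fd);
	unlink("/mnt/huge32m/repro");
	return 0;
}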
Thanks
Huang Shijie
-------------- next part --------------
A non-text attachment was scrubbed...
Name: counters.sh
Type: application/x-sh
Size: 223 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-arm-kernel/attachments/20161115/bdcf18b6/attachment-0001.sh>
-------------- next part --------------
/*
* libhugetlbfs - Easy use of Linux hugepages
* Copyright (C) 2005-2007 David Gibson & Adam Litke, IBM Corporation.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <sys/types.h>
#include <sys/shm.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <hugetlbfs.h>
#include "hugetests.h"
/*
* Test Rationale:
*
* The hugetlb pool maintains 4 global counters to track pages as they
* transition between various states. Due to the complex relationships between
* the counters, regressions are likely to occur in the future. This test
* performs operations that change the counters in known ways. It emulates the
* expected kernel behavior and compares the expected result to the actual
* values after each operation.
*/
extern int errno;
/* Global test configuration */
#define DYNAMIC_SYSCTL "/proc/sys/vm/nr_overcommit_hugepages"
static long saved_nr_hugepages = -1;
static long saved_oc_hugepages = -1;
static long hpage_size;
static int private_resv;
/* State arrays for our mmaps */
#define NR_SLOTS 2
#define SL_SETUP 0
#define SL_TEST 1
static int map_fd[NR_SLOTS];
static char *map_addr[NR_SLOTS];
static unsigned long map_size[NR_SLOTS];
static unsigned int touched[NR_SLOTS];
/* Keep track of expected counter values */
static long prev_total;
static long prev_free;
static long prev_resv;
static long prev_surp;
#define min(a,b) (((a) < (b)) ? (a) : (b))
#define max(a,b) (((a) > (b)) ? (a) : (b))
/* Restore original nr_hugepages */
void cleanup(void) {
if (hpage_size <= 0)
return;
if (saved_nr_hugepages >= 0)
set_nr_hugepages(hpage_size, saved_nr_hugepages);
if (saved_oc_hugepages >= 0)
set_nr_overcommit_hugepages(hpage_size, saved_oc_hugepages);
}
void verify_dynamic_pool_support(void)
{
saved_oc_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_OC);
if (saved_oc_hugepages < 0)
FAIL("Kernel appears to lack dynamic hugetlb pool support");
set_nr_overcommit_hugepages(hpage_size, 10);
}
void bad_value(int line, const char *name, long expect, long actual)
{
if (actual == -1)
ERROR("%s not found in /proc/meminfo", name);
else
FAIL("Line %i: Bad %s: expected %li, actual %li",
line, name, expect, actual);
}
void verify_counters(int line, long et, long ef, long er, long es)
{
long t, f, r, s;
t = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL);
f = get_huge_page_counter(hpage_size, HUGEPAGES_FREE);
r = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD);
s = get_huge_page_counter(hpage_size, HUGEPAGES_SURP);
/* Invariant checks */
if (t < 0 || f < 0 || r < 0 || s < 0)
ERROR("Negative counter value");
if (f < r)
ERROR("HugePages_Free < HugePages_Rsvd");
/* Check actual values against expected values */
if (t != et)
bad_value(line, "HugePages_Total", et, t);
if (f != ef)
bad_value(line, "HugePages_Free", ef, f);
if (r != er)
bad_value(line, "HugePages_Rsvd", er, r);
if (s != es)
bad_value(line, "HugePages_Surp", es, s);
/* Everything's good. Update counters */
prev_total = t;
prev_free = f;
prev_resv = r;
prev_surp = s;
}
/* Memory operations:
* Each of these has a predefined effect on the counters
*/
#define persistent_huge_pages (et - es)
void _set_nr_hugepages(unsigned long count, int line)
{
long min_size;
long et, ef, er, es;
if (set_nr_hugepages(hpage_size, count))
FAIL("Cannot set nr_hugepages");
/* The code below is based on set_max_huge_pages in mm/hugetlb.c */
es = prev_surp;
et = prev_total;
ef = prev_free;
er = prev_resv;
/*
* Increase the pool size
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*/
while (es && count > persistent_huge_pages)
es--;
while (count > persistent_huge_pages) {
et++;
ef++;
}
if (count >= persistent_huge_pages)
goto out;
/*
* Decrease the pool size
* First return free pages to the buddy allocator (being careful
* to keep enough around to satisfy reservations). Then place
* pages into surplus state as needed so the pool will shrink
* to the desired size as pages become free.
*/
min_size = max(count, er + et - ef);
while (min_size < persistent_huge_pages) {
ef--;
et--;
}
while (count < persistent_huge_pages) {
es++;
}
out:
verify_counters(line, et, ef, er, es);
}
#undef set_nr_hugepages
#define set_nr_hugepages(c) _set_nr_hugepages(c, __LINE__)
void _map(int s, int hpages, int flags, int line)
{
long et, ef, er, es;
map_fd[s] = hugetlbfs_unlinked_fd();
if (map_fd[s] < 0)
CONFIG("Unable to open hugetlbfs file: %s", strerror(errno));
map_size[s] = hpages * hpage_size;
map_addr[s] = mmap(NULL, map_size[s], PROT_READ|PROT_WRITE, flags,
map_fd[s], 0);
if (map_addr[s] == MAP_FAILED)
FAIL("mmap failed: %s", strerror(errno));
touched[s] = 0;
et = prev_total;
ef = prev_free;
er = prev_resv;
es = prev_surp;
/*
* When using MAP_SHARED, a reservation will be created to guarantee
* pages to the process. If not enough pages are available to
* satisfy the reservation, surplus pages are added to the pool.
* NOTE: This code assumes that the whole mapping needs to be
* reserved and hence, will not work with partial reservations.
*
* If the kernel supports private reservations, then MAP_PRIVATE
 * mappings behave like MAP_SHARED at mmap time. Otherwise,
* no counter updates will occur.
*/
if ((flags & MAP_SHARED) || private_resv) {
unsigned long shortfall = 0;
if (hpages + prev_resv > prev_free)
shortfall = hpages - prev_free + prev_resv;
et += shortfall;
ef = prev_free + shortfall;
er = prev_resv + hpages;
es = prev_surp + shortfall;
}
verify_counters(line, et, ef, er, es);
}
#define map(s, h, f) _map(s, h, f, __LINE__)
void _unmap(int s, int hpages, int flags, int line)
{
long et, ef, er, es;
unsigned long i;
munmap(map_addr[s], map_size[s]);
close(map_fd[s]);
map_fd[s] = -1;
map_addr[s] = NULL;
map_size[s] = 0;
et = prev_total;
ef = prev_free;
er = prev_resv;
es = prev_surp;
/*
* When a VMA is unmapped, the instantiated (touched) pages are
* freed. If the pool is in a surplus state, pages are freed to the
* buddy allocator, otherwise they go back into the hugetlb pool.
* NOTE: This code assumes touched pages have only one user.
*/
for (i = 0; i < touched[s]; i++) {
if (es) {
et--;
es--;
} else
ef++;
}
/*
 * mmap may have created some surplus pages to accommodate a
* reservation. If those pages were not touched, then they will
* not have been freed by the code above. Free them here.
*/
if ((flags & MAP_SHARED) || private_resv) {
int unused_surplus = min(hpages - touched[s], es);
et -= unused_surplus;
ef -= unused_surplus;
er -= hpages - touched[s];
es -= unused_surplus;
}
verify_counters(line, et, ef, er, es);
}
#define unmap(s, h, f) _unmap(s, h, f, __LINE__)
void _touch(int s, int hpages, int flags, int line)
{
long et, ef, er, es;
int nr;
char *c;
for (c = map_addr[s], nr = hpages;
hpages && c < map_addr[s] + map_size[s];
c += hpage_size, nr--)
*c = (char) (nr % 2);
/*
* Keep track of how many pages were touched since we can't easily
* detect that from user space.
* NOTE: Calling this function more than once for a mmap may yield
* results you don't expect. Be careful :)
*/
touched[s] = max(touched[s], hpages);
/*
* Shared (and private when supported) mappings and consume resv pages
* that were previously allocated. Also deduct them from the free count.
*
* Unreserved private mappings may need to allocate surplus pages to
* satisfy the fault. The surplus pages become part of the pool
* which could elevate total, free, and surplus counts. resv is
* unchanged but free must be decreased.
*/
if (flags & MAP_SHARED || private_resv) {
et = prev_total;
ef = prev_free - hpages;
er = prev_resv - hpages;
es = prev_surp;
} else {
if (hpages + prev_resv > prev_free)
et = prev_total + (hpages - prev_free + prev_resv);
else
et = prev_total;
er = prev_resv;
es = prev_surp + et - prev_total;
ef = prev_free - hpages + et - prev_total;
}
verify_counters(line, et, ef, er, es);
}
#define touch(s, h, f) _touch(s, h, f, __LINE__)
void run_test(char *desc, int base_nr)
{
verbose_printf("%s...\n", desc);
set_nr_hugepages(base_nr);
/* untouched, shared mmap */
map(SL_TEST, 1, MAP_SHARED);
unmap(SL_TEST, 1, MAP_SHARED);
/* untouched, private mmap */
map(SL_TEST, 1, MAP_PRIVATE);
unmap(SL_TEST, 1, MAP_PRIVATE);
/* touched, shared mmap */
map(SL_TEST, 1, MAP_SHARED);
touch(SL_TEST, 1, MAP_SHARED);
unmap(SL_TEST, 1, MAP_SHARED);
/* touched, private mmap */
map(SL_TEST, 1, MAP_PRIVATE);
touch(SL_TEST, 1, MAP_PRIVATE);
unmap(SL_TEST, 1, MAP_PRIVATE);
/* Explicit resizing during outstanding surplus */
/* Consume surplus when growing pool */
map(SL_TEST, 2, MAP_SHARED);
set_nr_hugepages(max(base_nr, 1));
/* Add pages once surplus is consumed */
set_nr_hugepages(max(base_nr, 3));
/* Release free huge pages first */
set_nr_hugepages(max(base_nr, 2));
/* When shrinking beyond committed level, increase surplus */
set_nr_hugepages(base_nr);
/* Upon releasing the reservation, reduce surplus counts */
unmap(SL_TEST, 2, MAP_SHARED);
verbose_printf("OK.\n");
}
int main(int argc, char ** argv)
{
int base_nr;
test_init(argc, argv);
hpage_size = check_hugepagesize();
saved_nr_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL);
verify_dynamic_pool_support();
check_must_be_root();
if ((private_resv = kernel_has_private_reservations()) == -1)
FAIL("kernel_has_private_reservations() failed\n");
/*
* This test case should require a maximum of 3 huge pages.
* Run through the battery of tests multiple times, with an increasing
* base pool size. This alters the circumstances under which surplus
* pages need to be allocated and increases the corner cases tested.
*/
for (base_nr = 0; base_nr <= 3; base_nr++) {
verbose_printf("Base pool size: %i\n", base_nr);
/* Run the tests with a clean slate */
run_test("Clean", base_nr);
/* Now with a pre-existing untouched, shared mmap */
map(SL_SETUP, 1, MAP_SHARED);
run_test("Untouched, shared", base_nr);
unmap(SL_SETUP, 1, MAP_SHARED);
/* Now with a pre-existing untouched, private mmap */
map(SL_SETUP, 1, MAP_PRIVATE);
run_test("Untouched, private", base_nr);
unmap(SL_SETUP, 1, MAP_PRIVATE);
/* Now with a pre-existing touched, shared mmap */
map(SL_SETUP, 1, MAP_SHARED);
touch(SL_SETUP, 1, MAP_SHARED);
run_test("Touched, shared", base_nr);
unmap(SL_SETUP, 1, MAP_SHARED);
/* Now with a pre-existing touched, private mmap */
map(SL_SETUP, 1, MAP_PRIVATE);
touch(SL_SETUP, 1, MAP_PRIVATE);
run_test("Touched, private", base_nr);
unmap(SL_SETUP, 1, MAP_PRIVATE);
}
PASS();
}
-------------- next part --------------
zero_filesize_segment (32M: 64): PASS
test_root (32M: 64): PASS
meminfo_nohuge (32M: 64): PASS
gethugepagesize (32M: 64): PASS
gethugepagesizes (32M: 64): PASS
HUGETLB_VERBOSE=1 empty_mounts (32M: 64): PASS
HUGETLB_VERBOSE=1 large_mounts (32M: 64): PASS
find_path (32M: 64): PASS
unlinked_fd (32M: 64): PASS
readback (32M: 64): PASS
truncate (32M: 64): PASS
shared (32M: 64): PASS
mprotect (32M: 64): PASS
mlock (32M: 64): PASS
misalign (32M: 64): PASS
fallocate_basic.sh (32M: 64): PASS
fallocate_align.sh (32M: 64): PASS
ptrace-write-hugepage (32M: 64): PASS
icache-hygiene (32M: 64): PASS
slbpacaflush (32M: 64): PASS (inconclusive)
straddle_4GB_static (32M: 64): PASS
huge_at_4GB_normal_below_static (32M: 64): PASS
huge_below_4GB_normal_above_static (32M: 64): PASS
map_high_truncate_2 (32M: 64): PASS
misaligned_offset (32M: 64): PASS (inconclusive)
truncate_above_4GB (32M: 64): PASS
brk_near_huge (32M: 64): PASS
task-size-overrun (32M: 64): PASS
stack_grow_into_huge (32M: 64): PASS
corrupt-by-cow-opt (32M: 64): PASS
noresv-preserve-resv-page (32M: 64): PASS
noresv-regarded-as-resv (32M: 64): PASS
readahead_reserve.sh (32M: 64): PASS
madvise_reserve.sh (32M: 64): PASS
fadvise_reserve.sh (32M: 64): PASS
mremap-expand-slice-collision.sh (32M: 64): PASS
mremap-fixed-normal-near-huge.sh (32M: 64): PASS
mremap-fixed-huge-near-normal.sh (32M: 64): PASS
set shmmax limit to 67108864
shm-perms (32M: 64): PASS
private (32M: 64): PASS
fork-cow (32M: 64): PASS
direct (32M: 64): PASS
malloc (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so HUGETLB_MORECORE=yes malloc (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so HUGETLB_RESTRICT_EXE=unknown:none HUGETLB_MORECORE=yes malloc (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so HUGETLB_RESTRICT_EXE=unknown:malloc HUGETLB_MORECORE=yes malloc (32M: 64): PASS
malloc_manysmall (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so HUGETLB_MORECORE=yes malloc_manysmall (32M: 64): PASS
heapshrink (32M: 64): PASS
LD_PRELOAD=libheapshrink.so heapshrink (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so HUGETLB_MORECORE=yes heapshrink (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so libheapshrink.so HUGETLB_MORECORE=yes heapshrink (32M: 64): PASS
LD_PRELOAD=libheapshrink.so HUGETLB_MORECORE_SHRINK=yes HUGETLB_MORECORE=yes heapshrink (32M: 64): PASS (inconclusive)
LD_PRELOAD=libhugetlbfs.so libheapshrink.so HUGETLB_MORECORE_SHRINK=yes HUGETLB_MORECORE=yes heapshrink (32M: 64): PASS
HUGETLB_VERBOSE=1 HUGETLB_MORECORE=yes heap-overflow (32M: 64): PASS
HUGETLB_VERBOSE=0 linkhuge_nofd (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so HUGETLB_VERBOSE=0 linkhuge_nofd (32M: 64): PASS
linkhuge (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so linkhuge (32M: 64): PASS
linkhuge_rw (32M: 64): PASS
HUGETLB_ELFMAP=R linkhuge_rw (32M: 64): PASS
HUGETLB_ELFMAP=W linkhuge_rw (32M: 64): PASS
HUGETLB_ELFMAP=RW linkhuge_rw (32M: 64): PASS
HUGETLB_ELFMAP=no linkhuge_rw (32M: 64): PASS
HUGETLB_ELFMAP= HUGETLB_MINIMAL_COPY=no linkhuge_rw (32M: 64): PASS
HUGETLB_ELFMAP=W HUGETLB_MINIMAL_COPY=no linkhuge_rw (32M: 64): PASS
HUGETLB_ELFMAP=RW HUGETLB_MINIMAL_COPY=no linkhuge_rw (32M: 64): PASS
HUGETLB_SHARE=0 HUGETLB_ELFMAP=R linkhuge_rw (32M: 64): PASS
HUGETLB_SHARE=1 HUGETLB_ELFMAP=R linkhuge_rw (32M: 64): PASS
HUGETLB_SHARE=0 HUGETLB_ELFMAP=W linkhuge_rw (32M: 64): PASS
HUGETLB_SHARE=1 HUGETLB_ELFMAP=W linkhuge_rw (32M: 64): PASS
HUGETLB_SHARE=0 HUGETLB_ELFMAP=RW linkhuge_rw (32M: 64): PASS
HUGETLB_SHARE=1 HUGETLB_ELFMAP=RW linkhuge_rw (32M: 64): PASS
chunk-overcommit (32M: 64): PASS
alloc-instantiate-race shared (32M: 64): PASS
alloc-instantiate-race private (32M: 64): PASS
truncate_reserve_wraparound (32M: 64): PASS
truncate_sigbus_versus_oom (32M: 64): PASS
get_huge_pages (32M: 64): PASS
shmoverride_linked (32M: 64): PASS
HUGETLB_SHM=yes shmoverride_linked (32M: 64): PASS
shmoverride_linked_static (32M: 64): PASS
HUGETLB_SHM=yes shmoverride_linked_static (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so shmoverride_unlinked (32M: 64): PASS
LD_PRELOAD=libhugetlbfs.so HUGETLB_SHM=yes shmoverride_unlinked (32M: 64): PASS
quota.sh (32M: 64): PASS
counters.sh (32M: 64): FAIL mmap failed: Invalid argument
********** TEST SUMMARY
* 32M
* 32-bit 64-bit
* Total testcases: 0 87
* Skipped: 0 0
* PASS: 0 86
* FAIL: 0 1
* Killed by signal: 0 0
* Bad configuration: 0 0
* Expected FAIL: 0 0
* Unexpected PASS: 0 0
* Strange test result: 0 0
**********
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 4/6] mm: mempolicy: introduce a helper huge_nodemask()
2016-11-14 7:07 ` [PATCH v2 4/6] mm: mempolicy: introduce a helper huge_nodemask() Huang Shijie
@ 2016-11-15 6:01 ` Aneesh Kumar K.V
2016-11-15 8:20 ` Huang Shijie
2016-11-15 8:52 ` Huang Shijie
2016-11-16 6:53 ` [PATCH V2 fix " Huang Shijie
1 sibling, 2 replies; 34+ messages in thread
From: Aneesh Kumar K.V @ 2016-11-15 6:01 UTC (permalink / raw)
To: linux-arm-kernel
Huang Shijie <shijie.huang@arm.com> writes:
> This patch introduces a new helper, huge_nodemask(),
> which we can use to get the node mask for a VMA.
>
> The idea for this function comes from huge_zonelist().
>
> Signed-off-by: Huang Shijie <shijie.huang@arm.com>
> ---
> include/linux/mempolicy.h | 8 ++++++++
> mm/mempolicy.c | 20 ++++++++++++++++++++
> 2 files changed, 28 insertions(+)
>
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index 5e5b296..01173c6 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -145,6 +145,8 @@ extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
> enum mpol_rebind_step step);
> extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
>
> +extern nodemask_t *huge_nodemask(struct vm_area_struct *vma,
> + unsigned long addr);
> extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
> unsigned long addr, gfp_t gfp_flags,
> struct mempolicy **mpol, nodemask_t **nodemask);
> @@ -261,6 +263,12 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
> {
> }
>
> +static inline nodemask_t *huge_nodemask(struct vm_area_struct *vma,
> + unsigned long addr)
> +{
> + return NULL;
> +}
> +
> static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
> unsigned long addr, gfp_t gfp_flags,
> struct mempolicy **mpol, nodemask_t **nodemask)
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 6d3639e..4830dd6 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1800,6 +1800,26 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
>
> #ifdef CONFIG_HUGETLBFS
> /*
> + * huge_nodemask(@vma, @addr)
> + * @vma: virtual memory area whose policy is sought
> + * @addr: address in @vma for shared policy lookup and interleave policy
> + *
> + * If the effective policy is BIND, returns a pointer to the mempolicy's
> + * @nodemask.
> + */
> +nodemask_t *huge_nodemask(struct vm_area_struct *vma, unsigned long addr)
> +{
> + nodemask_t *nodes_mask = NULL;
> + struct mempolicy *mpol = get_vma_policy(vma, addr);
> +
> + if (mpol->mode == MPOL_BIND)
> + nodes_mask = &mpol->v.nodes;
> + mpol_cond_put(mpol);
What if it is MPOL_PREFERRED or MPOL_INTERLEAVE? We don't honor the node
mask in that case?
> +
> + return nodes_mask;
> +}
> +
> +/*
> * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
> * @vma: virtual memory area whose policy is sought
> * @addr: address in @vma for shared policy lookup and interleave policy
> --
> 2.5.5
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 4/6] mm: mempolicy: introduce a helper huge_nodemask()
2016-11-15 6:01 ` Aneesh Kumar K.V
@ 2016-11-15 8:20 ` Huang Shijie
2016-11-15 8:52 ` Huang Shijie
1 sibling, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-15 8:20 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Nov 15, 2016 at 11:31:06AM +0530, Aneesh Kumar K.V wrote:
> Huang Shijie <shijie.huang@arm.com> writes:
> > #ifdef CONFIG_HUGETLBFS
> > /*
> > + * huge_nodemask(@vma, @addr)
> > + * @vma: virtual memory area whose policy is sought
> > + * @addr: address in @vma for shared policy lookup and interleave policy
> > + *
> > + * If the effective policy is BIND, returns a pointer to the mempolicy's
> > + * @nodemask.
> > + */
> > +nodemask_t *huge_nodemask(struct vm_area_struct *vma, unsigned long addr)
> > +{
> > + nodemask_t *nodes_mask = NULL;
> > + struct mempolicy *mpol = get_vma_policy(vma, addr);
> > +
> > + if (mpol->mode == MPOL_BIND)
> > + nodes_mask = &mpol->v.nodes;
> > + mpol_cond_put(mpol);
>
> What if it is MPOL_PREFERRED or MPOL_INTERLEAVE? We don't honor the node
> mask in that case?
I wrote this code by following the logic in huge_zonelist(),
so I did not add support for MPOL_PREFERRED/MPOL_INTERLEAVE.
IMHO, it is okay to allocate a gigantic page with MPOL_PREFERRED/MPOL_BIND.
But I am not sure whether we can allocate a gigantic page with MPOL_INTERLEAVE,
since the gigantic page's order is bigger than MAX_ORDER.
Could you give me some advice about this?
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 4/6] mm: mempolicy: introduce a helper huge_nodemask()
2016-11-15 6:01 ` Aneesh Kumar K.V
2016-11-15 8:20 ` Huang Shijie
@ 2016-11-15 8:52 ` Huang Shijie
1 sibling, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-15 8:52 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Nov 15, 2016 at 11:31:06AM +0530, Aneesh Kumar K.V wrote:
> Huang Shijie <shijie.huang@arm.com> writes:
> > #ifdef CONFIG_HUGETLBFS
> > /*
> > + * huge_nodemask(@vma, @addr)
> > + * @vma: virtual memory area whose policy is sought
> > + * @addr: address in @vma for shared policy lookup and interleave policy
> > + *
> > + * If the effective policy is BIND, returns a pointer to the mempolicy's
> > + * @nodemask.
> > + */
> > +nodemask_t *huge_nodemask(struct vm_area_struct *vma, unsigned long addr)
> > +{
> > + nodemask_t *nodes_mask = NULL;
> > + struct mempolicy *mpol = get_vma_policy(vma, addr);
> > +
> > + if (mpol->mode == MPOL_BIND)
> > + nodes_mask = &mpol->v.nodes;
> > + mpol_cond_put(mpol);
>
> What if it is MPOL_PREFERRED or MPOL_INTERLEAVE? We don't honor the node
> mask in that case?
>
I just realized that maybe I should follow init_nodemask_of_mempolicy(), not
huge_zonelist(). Is that okay?
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 4/6] mm: mempolicy: introduce a helper huge_nodemask()
2016-11-14 7:07 ` [PATCH v2 4/6] mm: mempolicy: introduce a helper huge_nodemask() Huang Shijie
2016-11-15 6:01 ` Aneesh Kumar K.V
@ 2016-11-16 6:53 ` Huang Shijie
2016-12-02 13:58 ` Michal Hocko
1 sibling, 1 reply; 34+ messages in thread
From: Huang Shijie @ 2016-11-16 6:53 UTC (permalink / raw)
To: linux-arm-kernel
This patch introduces a new helper, huge_nodemask(),
which we can use to get the node mask for a VMA.
The idea for this function comes from init_nodemask_of_mempolicy():
return true if we succeed in extracting the node_mask
for a 'bind' or 'interleave' policy, or in initializing the node_mask
to contain the single node for a 'preferred' or 'local' policy.
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
The previous version did not handle the MPOL_PREFERRED/MPOL_INTERLEAVE cases.
This version adds the code to set a proper node mask for
MPOL_PREFERRED/MPOL_INTERLEAVE.
---
include/linux/mempolicy.h | 8 ++++++++
mm/mempolicy.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 55 insertions(+)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5e5b296..7796a40 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -145,6 +145,8 @@ extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
enum mpol_rebind_step step);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern bool huge_nodemask(struct vm_area_struct *vma,
+ unsigned long addr, nodemask_t *mask);
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask);
@@ -261,6 +263,12 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}
+static inline bool huge_nodemask(struct vm_area_struct *vma,
+ unsigned long addr, nodemask_t *mask)
+{
+ return false;
+}
+
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6d3639e..5063a69 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1800,6 +1800,53 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
#ifdef CONFIG_HUGETLBFS
/*
+ * huge_nodemask(@vma, @addr, @mask)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma
+ * @mask: a nodemask pointer
+ *
+ * Return true if we can succeed in extracting the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument @mask, or
+ * initializing the argument @mask to contain the single node for
+ * 'preferred' or 'local' policy.
+ */
+bool huge_nodemask(struct vm_area_struct *vma, unsigned long addr,
+ nodemask_t *mask)
+{
+ struct mempolicy *mpol;
+ bool ret = true;
+ int nid;
+
+ if (!mask)
+ return false;
+
+ mpol = get_vma_policy(vma, addr);
+
+ switch (mpol->mode) {
+ case MPOL_PREFERRED:
+ if (mpol->flags & MPOL_F_LOCAL)
+ nid = numa_node_id();
+ else
+ nid = mpol->v.preferred_node;
+ init_nodemask_of_node(mask, nid);
+ break;
+
+ case MPOL_BIND:
+ /* Fall through */
+ case MPOL_INTERLEAVE:
+ *mask = mpol->v.nodes;
+ break;
+
+ default:
+ ret = false;
+ break;
+ }
+ mpol_cond_put(mpol);
+
+ return ret;
+}
+
+/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
* @vma: virtual memory area whose policy is sought
* @addr: address in @vma for shared policy lookup and interleave policy
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH V2 fix 5/6] mm: hugetlb: add a new function to allocate a new gigantic page
2016-11-14 7:07 ` [PATCH v2 5/6] mm: hugetlb: add a new function to allocate a new gigantic page Huang Shijie
@ 2016-11-16 6:55 ` Huang Shijie
2016-11-28 14:17 ` Vlastimil Babka
2016-12-02 14:03 ` Michal Hocko
0 siblings, 2 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-16 6:55 UTC (permalink / raw)
To: linux-arm-kernel
There are three ways we can allocate a new gigantic page:
1. When NUMA is not enabled, use alloc_gigantic_page() to get
the gigantic page.
2. NUMA is enabled, but the vma is NULL.
There is no memory policy we can refer to,
so create a @nodes_allowed, initialize it with init_nodemask_of_mempolicy()
or init_nodemask_of_node(), and then use alloc_fresh_gigantic_page() to get
the gigantic page.
3. NUMA is enabled, and the vma is valid.
We can follow the memory policy of the @vma.
Get @nodes_allowed with huge_nodemask(), and use alloc_fresh_gigantic_page()
to get the gigantic page.
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
Since huge_nodemask() has changed, we have to change this function a little.
---
mm/hugetlb.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 63 insertions(+)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6995087..c33bddc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1502,6 +1502,69 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
/*
* There are 3 ways this can get called:
+ *
+ * 1. When the NUMA is not enabled, use alloc_gigantic_page() to get
+ * the gigantic page.
+ *
+ * 2. The NUMA is enabled, but the vma is NULL.
+ * Create a @nodes_allowed, and use alloc_fresh_gigantic_page() to get
+ * the gigantic page.
+ *
+ * 3. The NUMA is enabled, and the vma is valid.
+ * Use the @vma's memory policy.
+ * Get @nodes_allowed by huge_nodemask(), and use alloc_fresh_gigantic_page()
+ * to get the gigantic page.
+ */
+static struct page *__hugetlb_alloc_gigantic_page(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+ NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+ struct page *page = NULL;
+
+ /* Not NUMA */
+ if (!IS_ENABLED(CONFIG_NUMA)) {
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+
+ page = alloc_gigantic_page(nid, huge_page_order(h));
+ if (page)
+ prep_compound_gigantic_page(page, huge_page_order(h));
+
+ NODEMASK_FREE(nodes_allowed);
+ return page;
+ }
+
+ /* NUMA && !vma */
+ if (!vma) {
+ if (nid == NUMA_NO_NODE) {
+ if (!init_nodemask_of_mempolicy(nodes_allowed)) {
+ NODEMASK_FREE(nodes_allowed);
+ nodes_allowed = &node_states[N_MEMORY];
+ }
+ } else if (nodes_allowed) {
+ init_nodemask_of_node(nodes_allowed, nid);
+ } else {
+ nodes_allowed = &node_states[N_MEMORY];
+ }
+
+ page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
+
+ if (nodes_allowed != &node_states[N_MEMORY])
+ NODEMASK_FREE(nodes_allowed);
+
+ return page;
+ }
+
+ /* NUMA && vma */
+ if (huge_nodemask(vma, addr, nodes_allowed))
+ page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
+
+ NODEMASK_FREE(nodes_allowed);
+ return page;
+}
+
+/*
+ * There are 3 ways this can get called:
* 1. With vma+addr: we use the VMA's memory policy
* 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
* page from any node, and let the buddy allocator itself figure
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH v2 1/6] mm: hugetlb: rename some allocation functions
2016-11-14 7:07 ` [PATCH v2 1/6] mm: hugetlb: rename some allocation functions Huang Shijie
@ 2016-11-28 13:29 ` Vlastimil Babka
2016-11-29 8:53 ` Huang Shijie
0 siblings, 1 reply; 34+ messages in thread
From: Vlastimil Babka @ 2016-11-28 13:29 UTC (permalink / raw)
To: linux-arm-kernel
On 11/14/2016 08:07 AM, Huang Shijie wrote:
> After a later patch in this series, __alloc_buddy_huge_page() will not
> necessarily use the buddy allocator.
>
> So this patch removes "buddy" from the names of these functions:
> __alloc_buddy_huge_page -> __alloc_huge_page
> __alloc_buddy_huge_page_no_mpol -> __alloc_huge_page_no_mpol
> __alloc_buddy_huge_page_with_mpol -> __alloc_huge_page_with_mpol
>
> This patch is a preparation for the later patches.
>
> Signed-off-by: Huang Shijie <shijie.huang@arm.com>
> ---
> mm/hugetlb.c | 24 ++++++++++++++----------
> 1 file changed, 14 insertions(+), 10 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 3edb759..496b703 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1157,6 +1157,10 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
>
> static inline bool gigantic_page_supported(void) { return true; }
> #else
> +static inline struct page *alloc_gigantic_page(int nid, unsigned int order)
> +{
> + return NULL;
> +}
This hunk is not explained by the description. Could it belong to a later
patch?
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 5/6] mm: hugetlb: add a new function to allocate a new gigantic page
2016-11-16 6:55 ` [PATCH V2 fix " Huang Shijie
@ 2016-11-28 14:17 ` Vlastimil Babka
2016-11-29 9:03 ` Huang Shijie
2016-12-02 14:03 ` Michal Hocko
1 sibling, 1 reply; 34+ messages in thread
From: Vlastimil Babka @ 2016-11-28 14:17 UTC (permalink / raw)
To: linux-arm-kernel
On 11/16/2016 07:55 AM, Huang Shijie wrote:
> There are three ways we can allocate a new gigantic page:
>
> 1. When NUMA is not enabled, use alloc_gigantic_page() to get
> the gigantic page.
>
> 2. NUMA is enabled, but the vma is NULL.
> There is no memory policy we can refer to,
> so create a @nodes_allowed, initialize it with init_nodemask_of_mempolicy()
> or init_nodemask_of_node(), and then use alloc_fresh_gigantic_page() to get
> the gigantic page.
>
> 3. NUMA is enabled, and the vma is valid.
> We can follow the memory policy of the @vma.
>
> Get @nodes_allowed with huge_nodemask(), and use alloc_fresh_gigantic_page()
> to get the gigantic page.
>
> Signed-off-by: Huang Shijie <shijie.huang@arm.com>
> ---
> Since the huge_nodemask() is changed, we have to change this function a little.
>
> ---
> mm/hugetlb.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 63 insertions(+)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6995087..c33bddc 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1502,6 +1502,69 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
>
> /*
> * There are 3 ways this can get called:
> + *
> + * 1. When the NUMA is not enabled, use alloc_gigantic_page() to get
> + * the gigantic page.
> + *
> + * 2. The NUMA is enabled, but the vma is NULL.
> + * Create a @nodes_allowed, and use alloc_fresh_gigantic_page() to get
> + * the gigantic page.
> + *
> + * 3. The NUMA is enabled, and the vma is valid.
> + * Use the @vma's memory policy.
> + * Get @nodes_allowed by huge_nodemask(), and use alloc_fresh_gigantic_page()
> + * to get the gigantic page.
> + */
> +static struct page *__hugetlb_alloc_gigantic_page(struct hstate *h,
> + struct vm_area_struct *vma, unsigned long addr, int nid)
> +{
> + NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
What if the allocation fails and nodes_allowed is NULL?
It might work fine now, but it's rather fragile, so I'd rather see an
explicit check.
BTW same thing applies to __nr_hugepages_store_common().
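To make that concrete, the explicit check could be as simple as the following
at the top of the function (a sketch of the suggestion only, not what the
series does):

NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);

/*
 * Sketch: handle a failed nodemask allocation explicitly up front,
 * instead of depending on the NULL checks buried in the branches below.
 */
if (!nodes_allowed)
        return NULL;

Failing the request outright is one option; falling back to
&node_states[N_MEMORY] (as the existing branches do) is another, as long as
the later NODEMASK_FREE() calls keep a guard against freeing the static mask.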
> + struct page *page = NULL;
> +
> + /* Not NUMA */
> + if (!IS_ENABLED(CONFIG_NUMA)) {
> + if (nid == NUMA_NO_NODE)
> + nid = numa_mem_id();
> +
> + page = alloc_gigantic_page(nid, huge_page_order(h));
> + if (page)
> + prep_compound_gigantic_page(page, huge_page_order(h));
> +
> + NODEMASK_FREE(nodes_allowed);
> + return page;
> + }
> +
> + /* NUMA && !vma */
> + if (!vma) {
> + if (nid == NUMA_NO_NODE) {
> + if (!init_nodemask_of_mempolicy(nodes_allowed)) {
> + NODEMASK_FREE(nodes_allowed);
> + nodes_allowed = &node_states[N_MEMORY];
> + }
> + } else if (nodes_allowed) {
> + init_nodemask_of_node(nodes_allowed, nid);
> + } else {
> + nodes_allowed = &node_states[N_MEMORY];
> + }
> +
> + page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
> +
> + if (nodes_allowed != &node_states[N_MEMORY])
> + NODEMASK_FREE(nodes_allowed);
> +
> + return page;
> + }
> +
> + /* NUMA && vma */
> + if (huge_nodemask(vma, addr, nodes_allowed))
> + page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
> +
> + NODEMASK_FREE(nodes_allowed);
> + return page;
> +}
> +
> +/*
> + * There are 3 ways this can get called:
> * 1. With vma+addr: we use the VMA's memory policy
> * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
> * page from any node, and let the buddy allocator itself figure
>
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
` (6 preceding siblings ...)
2016-11-14 22:44 ` [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Andrew Morton
@ 2016-11-28 14:20 ` Vlastimil Babka
2016-11-29 9:07 ` Huang Shijie
2016-11-30 6:30 ` [PATCH extra ] mm: hugetlb: add description for alloc_gigantic_page() Huang Shijie
2016-12-02 14:05 ` [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Michal Hocko
9 siblings, 1 reply; 34+ messages in thread
From: Vlastimil Babka @ 2016-11-28 14:20 UTC (permalink / raw)
To: linux-arm-kernel
On 11/14/2016 08:07 AM, Huang Shijie wrote:
> (1) Background
> For the arm64, the hugetlb page size can be 32M (PMD + Contiguous bit).
> In the 4K page environment, the max page order is 10 (max_order - 1),
> so 32M page is the gigantic page.
>
> The arm64 MMU supports a Contiguous bit which is a hint that the TTE
> is one of a set of contiguous entries which can be cached in a single
> TLB entry. Please refer to the ARMv8 manual:
> DDI0487A_f_armv8_arm.pdf (page D4-1811)
>
> (2) The bug
> After I tested the libhugetlbfs, I found the test case "counter.sh"
> will fail with the gigantic page (32M page in arm64 board).
>
> This patch set adds support for gigantic surplus hugetlb pages,
> allowing the counter.sh unit test to pass.
>
> v1 --> v2:
> 1.) fix the compiler error in X86.
> 2.) add new patches for NUMA.
> The patch #2 ~ #5 are new patches.
>
> Huang Shijie (6):
> mm: hugetlb: rename some allocation functions
> mm: hugetlb: add a new parameter for some functions
> mm: hugetlb: change the return type for alloc_fresh_gigantic_page
> mm: mempolicy: intruduce a helper huge_nodemask()
> mm: hugetlb: add a new function to allocate a new gigantic page
> mm: hugetlb: support gigantic surplus pages
>
> include/linux/mempolicy.h | 8 +++
> mm/hugetlb.c | 128 ++++++++++++++++++++++++++++++++++++----------
> mm/mempolicy.c | 20 ++++++++
> 3 files changed, 130 insertions(+), 26 deletions(-)
Can't say I'm entirely happy with the continued direction of the maze of
functions for huge page allocation :( It feels like the path of least
resistance is to basically copy/paste the missing parts here. Is there no
way to consolidate the code more?
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 1/6] mm: hugetlb: rename some allocation functions
2016-11-28 13:29 ` Vlastimil Babka
@ 2016-11-29 8:53 ` Huang Shijie
2016-11-29 10:44 ` Vlastimil Babka
0 siblings, 1 reply; 34+ messages in thread
From: Huang Shijie @ 2016-11-29 8:53 UTC (permalink / raw)
To: linux-arm-kernel
On Mon, Nov 28, 2016 at 02:29:03PM +0100, Vlastimil Babka wrote:
> On 11/14/2016 08:07 AM, Huang Shijie wrote:
> > static inline bool gigantic_page_supported(void) { return true; }
> > #else
> > +static inline struct page *alloc_gigantic_page(int nid, unsigned int order)
> > +{
> > + return NULL;
> > +}
>
> This hunk is not explained by the description. Could belong to a later
> patch?
>
Okay, I can create an extra patch to add the description for the
alloc_gigantic_page().
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 5/6] mm: hugetlb: add a new function to allocate a new gigantic page
2016-11-28 14:17 ` Vlastimil Babka
@ 2016-11-29 9:03 ` Huang Shijie
2016-11-29 10:50 ` Vlastimil Babka
0 siblings, 1 reply; 34+ messages in thread
From: Huang Shijie @ 2016-11-29 9:03 UTC (permalink / raw)
To: linux-arm-kernel
On Mon, Nov 28, 2016 at 03:17:28PM +0100, Vlastimil Babka wrote:
> On 11/16/2016 07:55 AM, Huang Shijie wrote:
> > +static struct page *__hugetlb_alloc_gigantic_page(struct hstate *h,
> > + struct vm_area_struct *vma, unsigned long addr, int nid)
> > +{
> > + NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
>
> What if the allocation fails and nodes_allowed is NULL?
> It might work fine now, but it's rather fragile, so I'd rather see an
Yes.
> explicit check.
See the comment below.
>
> BTW same thing applies to __nr_hugepages_store_common().
>
> > + struct page *page = NULL;
> > +
> > + /* Not NUMA */
> > + if (!IS_ENABLED(CONFIG_NUMA)) {
> > + if (nid == NUMA_NO_NODE)
> > + nid = numa_mem_id();
> > +
> > + page = alloc_gigantic_page(nid, huge_page_order(h));
> > + if (page)
> > + prep_compound_gigantic_page(page, huge_page_order(h));
> > +
> > + NODEMASK_FREE(nodes_allowed);
> > + return page;
> > + }
> > +
> > + /* NUMA && !vma */
> > + if (!vma) {
> > + if (nid == NUMA_NO_NODE) {
> > + if (!init_nodemask_of_mempolicy(nodes_allowed)) {
> > + NODEMASK_FREE(nodes_allowed);
> > + nodes_allowed = &node_states[N_MEMORY];
> > + }
> > + } else if (nodes_allowed) {
The check is here.
Do we really need to re-arrange the code here for the explicit check? :)
Thanks
Huang Shijie
> > + init_nodemask_of_node(nodes_allowed, nid);
> > + } else {
> > + nodes_allowed = &node_states[N_MEMORY];
> > + }
> > +
> > + page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
> > +
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs
2016-11-28 14:20 ` Vlastimil Babka
@ 2016-11-29 9:07 ` Huang Shijie
0 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-29 9:07 UTC (permalink / raw)
To: linux-arm-kernel
On Mon, Nov 28, 2016 at 03:20:05PM +0100, Vlastimil Babka wrote:
> > Huang Shijie (6):
> > mm: hugetlb: rename some allocation functions
> > mm: hugetlb: add a new parameter for some functions
> > mm: hugetlb: change the return type for alloc_fresh_gigantic_page
> > mm: mempolicy: intruduce a helper huge_nodemask()
> > mm: hugetlb: add a new function to allocate a new gigantic page
> > mm: hugetlb: support gigantic surplus pages
> >
> > include/linux/mempolicy.h | 8 +++
> > mm/hugetlb.c | 128 ++++++++++++++++++++++++++++++++++++----------
> > mm/mempolicy.c | 20 ++++++++
> > 3 files changed, 130 insertions(+), 26 deletions(-)
>
> Can't say I'm entirely happy with the continued direction of maze of
> functions for huge page allocation :( Feels like path of least resistance to
> basically copy/paste the missing parts here. Is there no way to consolidate
> the code more?
Ok, I will spend some time reading the code and thinking about it.
If you are interested, please look into it too. :)
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 1/6] mm: hugetlb: rename some allocation functions
2016-11-29 8:53 ` Huang Shijie
@ 2016-11-29 10:44 ` Vlastimil Babka
2016-11-30 3:03 ` Huang Shijie
0 siblings, 1 reply; 34+ messages in thread
From: Vlastimil Babka @ 2016-11-29 10:44 UTC (permalink / raw)
To: linux-arm-kernel
On 11/29/2016 09:53 AM, Huang Shijie wrote:
> On Mon, Nov 28, 2016 at 02:29:03PM +0100, Vlastimil Babka wrote:
>> On 11/14/2016 08:07 AM, Huang Shijie wrote:
>> > static inline bool gigantic_page_supported(void) { return true; }
>> > #else
>> > +static inline struct page *alloc_gigantic_page(int nid, unsigned int order)
>> > +{
>> > + return NULL;
>> > +}
>>
>> This hunk is not explained by the description. Could belong to a later
>> patch?
>>
>
> Okay, I can create an extra patch to add the description for the
> alloc_gigantic_page().
Not sure about an extra patch; just move it to an existing later patch that
relies on it?
Vlastimil
> Thanks
> Huang Shijie
>
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 5/6] mm: hugetlb: add a new function to allocate a new gigantic page
2016-11-29 9:03 ` Huang Shijie
@ 2016-11-29 10:50 ` Vlastimil Babka
2016-11-30 3:02 ` Huang Shijie
0 siblings, 1 reply; 34+ messages in thread
From: Vlastimil Babka @ 2016-11-29 10:50 UTC (permalink / raw)
To: linux-arm-kernel
On 11/29/2016 10:03 AM, Huang Shijie wrote:
> On Mon, Nov 28, 2016 at 03:17:28PM +0100, Vlastimil Babka wrote:
>> On 11/16/2016 07:55 AM, Huang Shijie wrote:
>> > +static struct page *__hugetlb_alloc_gigantic_page(struct hstate *h,
>> > + struct vm_area_struct *vma, unsigned long addr, int nid)
>> > +{
>> > + NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
>>
>> What if the allocation fails and nodes_allowed is NULL?
>> It might work fine now, but it's rather fragile, so I'd rather see an
> Yes.
>
>> explicit check.
> See the comment below.
>
>>
>> BTW same thing applies to __nr_hugepages_store_common().
>>
>> > + struct page *page = NULL;
>> > +
>> > + /* Not NUMA */
>> > + if (!IS_ENABLED(CONFIG_NUMA)) {
>> > + if (nid == NUMA_NO_NODE)
>> > + nid = numa_mem_id();
>> > +
>> > + page = alloc_gigantic_page(nid, huge_page_order(h));
>> > + if (page)
>> > + prep_compound_gigantic_page(page, huge_page_order(h));
>> > +
>> > + NODEMASK_FREE(nodes_allowed);
>> > + return page;
>> > + }
>> > +
>> > + /* NUMA && !vma */
>> > + if (!vma) {
>> > + if (nid == NUMA_NO_NODE) {
>> > + if (!init_nodemask_of_mempolicy(nodes_allowed)) {
>> > + NODEMASK_FREE(nodes_allowed);
>> > + nodes_allowed = &node_states[N_MEMORY];
>> > + }
>> > + } else if (nodes_allowed) {
> The check is here.
It's below a possible use of nodes_allowed as the argument of
init_nodemask_of_mempolicy(mask), which does
if (!(mask && current->mempolicy))
return false;
which itself looks like an error at first sight :)
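As an aside, that guard reads more easily with the two conditions spelled out
separately; the following is equivalent (shown only for illustration, not as a
proposed change to mempolicy.c):

/* Equivalent to: if (!(mask && current->mempolicy)) return false; */
if (!mask)                      /* the caller gave us nowhere to put the result */
        return false;
if (!current->mempolicy)        /* the task has no policy to extract */
        return false;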
> Do we really need to re-arrange the code here for the explicit check? :)
We don't need it *now* to be correct, but I still find it fragile. It also
mixes up the semantics of NULL as a conscious "default" value and NULL as
a side effect of a memory allocation failure. Nothing good can come from that
in the long term :)
> Thanks
> Huang Shijie
>> > + init_nodemask_of_node(nodes_allowed, nid);
>> > + } else {
>> > + nodes_allowed = &node_states[N_MEMORY];
>> > + }
>> > +
>> > + page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
>> > +
>
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 5/6] mm: hugetlb: add a new function to allocate a new gigantic page
2016-11-29 10:50 ` Vlastimil Babka
@ 2016-11-30 3:02 ` Huang Shijie
0 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-30 3:02 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Nov 29, 2016 at 11:50:37AM +0100, Vlastimil Babka wrote:
> > > > + if (!vma) {
> > > > + if (nid == NUMA_NO_NODE) {
> > > > + if (!init_nodemask_of_mempolicy(nodes_allowed)) {
> > > > + NODEMASK_FREE(nodes_allowed);
> > > > + nodes_allowed = &node_states[N_MEMORY];
> > > > + }
> > > > + } else if (nodes_allowed) {
> > The check is here.
>
> It's below a possible usage of nodes_allowed as an argument of
> init_nodemask_of_mempolicy(mask). Which does
Sorry, I missed that.
>
> if (!(mask && current->mempolicy))
> return false;
>
> which itself looks like an error at first sight :)
Yes. I agree.
>
> > Do we really need to re-arrange the code here for the explicit check? :)
>
> We don't need it *now* to be correct, but I still find it fragile. Also it
> mixes up the semantic of NULL as a conscious "default" value, and NULL as
> a side-effect of memory allocation failure. Nothing good can come from that
> in the long term :)
Okay, I agree that we do need the explicit NULL check for
@nodes_allowed. :)
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 1/6] mm: hugetlb: rename some allocation functions
2016-11-29 10:44 ` Vlastimil Babka
@ 2016-11-30 3:03 ` Huang Shijie
0 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-30 3:03 UTC (permalink / raw)
To: linux-arm-kernel
On Tue, Nov 29, 2016 at 11:44:23AM +0100, Vlastimil Babka wrote:
> On 11/29/2016 09:53 AM, Huang Shijie wrote:
> > On Mon, Nov 28, 2016 at 02:29:03PM +0100, Vlastimil Babka wrote:
> > > On 11/14/2016 08:07 AM, Huang Shijie wrote:
> > > > static inline bool gigantic_page_supported(void) { return true; }
> > > > #else
> > > > +static inline struct page *alloc_gigantic_page(int nid, unsigned int order)
> > > > +{
> > > > + return NULL;
> > > > +}
> > >
> > > This hunk is not explained by the description. Could belong to a later
> > > patch?
> > >
> >
> > Okay, I can create an extra patch to add the description for the
> > alloc_gigantic_page().
>
> Not sure about extra patch, just move it to an existing later patch that
> relies on it?
The whole patch set has already been merged into Andrew's tree, so an extra
patch is better. :)
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH extra ] mm: hugetlb: add description for alloc_gigantic_page()
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
` (7 preceding siblings ...)
2016-11-28 14:20 ` Vlastimil Babka
@ 2016-11-30 6:30 ` Huang Shijie
2016-12-02 14:05 ` [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Michal Hocko
9 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-11-30 6:30 UTC (permalink / raw)
To: linux-arm-kernel
This patch adds a description for the alloc_gigantic_page() function.
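For context, the callers added earlier in this series always pair it with
prep_compound_gigantic_page(); the pattern (as in patch 5/6) is:

page = alloc_gigantic_page(nid, huge_page_order(h));
if (page)
        prep_compound_gigantic_page(page, huge_page_order(h));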
Signed-off-by: Huang Shijie <shijie.huang@arm.com>
---
mm/hugetlb.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3faec05..0d4bb8a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1089,6 +1089,12 @@ static bool zone_spans_last_pfn(const struct zone *zone,
return zone_spans_pfn(zone, last_pfn);
}
+/*
+ * Allocate a gigantic page from node @nid.
+ *
+ * Scan the zones of node @nid and try to allocate a run of (1 << order)
+ * contiguous pages.
+ */
static struct page *alloc_gigantic_page(int nid, unsigned int order)
{
unsigned long nr_pages = 1 << order;
--
2.5.5
^ permalink raw reply related [flat|nested] 34+ messages in thread
* [PATCH v2 2/6] mm: hugetlb: add a new parameter for some functions
2016-11-14 7:07 ` [PATCH v2 2/6] mm: hugetlb: add a new parameter for some functions Huang Shijie
@ 2016-12-02 13:52 ` Michal Hocko
2016-12-05 3:05 ` Huang Shijie
0 siblings, 1 reply; 34+ messages in thread
From: Michal Hocko @ 2016-12-02 13:52 UTC (permalink / raw)
To: linux-arm-kernel
On Mon 14-11-16 15:07:35, Huang Shijie wrote:
> This patch adds a new parameter, the "no_init", for these functions:
> alloc_fresh_gigantic_page_node()
> alloc_fresh_gigantic_page()
>
> The prep_new_huge_page() does some initialization for the new page.
> But sometimes we do not need it to do so, such as in the surplus case
> in a later patch.
>
> With this parameter, prep_new_huge_page() can be called as needed:
> if "no_init" is false, prep_new_huge_page() is called in
> alloc_fresh_gigantic_page_node();
This double negative just makes my head spin. I haven't got to the later
patch to understand the motivation, but if anything, a bool do_prep would
be much clearer. In general, doing this "init only if a parameter is
specified" business is a bad idea. It just makes the code more convoluted and
subtle. If you need the separation, then __foo vs. foo, with the former
doing the real work and the latter adding some initialization on top,
sounds like a better idea to me.
Let's see what other changes are about.
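As an illustration of that split (hypothetical names and shape, not code from
this series):

/* __foo: only does the real allocation work. */
static struct page *__alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
        struct page *page;

        page = alloc_gigantic_page(nid, huge_page_order(h));
        if (page)
                prep_compound_gigantic_page(page, huge_page_order(h));
        return page;
}

/* foo: the common case, which additionally runs the hstate initialization. */
static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
        struct page *page = __alloc_fresh_gigantic_page_node(h, nid);

        if (page)
                prep_new_huge_page(h, page, nid);
        return page;
}

A caller that does not want prep_new_huge_page() (the surplus path) would then
call the double-underscore variant directly, and the boolean parameter goes
away entirely.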
> This patch makes preparation for the later patches.
>
> Signed-off-by: Huang Shijie <shijie.huang@arm.com>
> ---
> mm/hugetlb.c | 15 +++++++++------
> 1 file changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 496b703..db0177b 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1127,27 +1127,29 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
> static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
> static void prep_compound_gigantic_page(struct page *page, unsigned int order);
>
> -static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
> +static struct page *alloc_fresh_gigantic_page_node(struct hstate *h,
> + int nid, bool no_init)
> {
> struct page *page;
>
> page = alloc_gigantic_page(nid, huge_page_order(h));
> if (page) {
> prep_compound_gigantic_page(page, huge_page_order(h));
> - prep_new_huge_page(h, page, nid);
> + if (!no_init)
> + prep_new_huge_page(h, page, nid);
> }
>
> return page;
> }
>
> static int alloc_fresh_gigantic_page(struct hstate *h,
> - nodemask_t *nodes_allowed)
> + nodemask_t *nodes_allowed, bool no_init)
> {
> struct page *page = NULL;
> int nr_nodes, node;
>
> for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> - page = alloc_fresh_gigantic_page_node(h, node);
> + page = alloc_fresh_gigantic_page_node(h, node, no_init);
> if (page)
> return 1;
> }
> @@ -1166,7 +1168,7 @@ static inline void free_gigantic_page(struct page *page, unsigned int order) { }
> static inline void destroy_compound_gigantic_page(struct page *page,
> unsigned int order) { }
> static inline int alloc_fresh_gigantic_page(struct hstate *h,
> - nodemask_t *nodes_allowed) { return 0; }
> + nodemask_t *nodes_allowed, bool no_init) { return 0; }
> #endif
>
> static void update_and_free_page(struct hstate *h, struct page *page)
> @@ -2313,7 +2315,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
> cond_resched();
>
> if (hstate_is_gigantic(h))
> - ret = alloc_fresh_gigantic_page(h, nodes_allowed);
> + ret = alloc_fresh_gigantic_page(h, nodes_allowed,
> + false);
> else
> ret = alloc_fresh_huge_page(h, nodes_allowed);
> spin_lock(&hugetlb_lock);
> --
> 2.5.5
>
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 3/6] mm: hugetlb: change the return type for alloc_fresh_gigantic_page
2016-11-14 7:07 ` [PATCH v2 3/6] mm: hugetlb: change the return type for alloc_fresh_gigantic_page Huang Shijie
@ 2016-12-02 13:56 ` Michal Hocko
2016-12-05 3:06 ` Huang Shijie
0 siblings, 1 reply; 34+ messages in thread
From: Michal Hocko @ 2016-12-02 13:56 UTC (permalink / raw)
To: linux-arm-kernel
On Mon 14-11-16 15:07:36, Huang Shijie wrote:
> This patch changes the return type to "struct page*" for
> alloc_fresh_gigantic_page().
OK, this makes some sense. The other hugetlb allocation functions (and the
page allocator in general) return a struct page as well. Besides, an int would
make sense if we wanted to convey an error code, but 0 vs. 1 just doesn't
make any sense.
But if you are changing that, then alloc_fresh_huge_page() should be
changed as well.
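The matching conversion would look roughly like this (a simplified sketch; the
real alloc_fresh_huge_page() also updates the HTLB_BUDDY_PGALLOC vm-event
counters, which is omitted here):

static struct page *alloc_fresh_huge_page(struct hstate *h,
                                        nodemask_t *nodes_allowed)
{
        struct page *page = NULL;
        int nr_nodes, node;

        for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
                page = alloc_fresh_huge_page_node(h, node);
                if (page)
                        break;
        }

        return page;
}

The caller in set_max_huge_pages() would then become
ret = !!alloc_fresh_huge_page(h, nodes_allowed), mirroring the gigantic case
in the hunk above.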
> This patch makes preparation for later patch.
>
> Signed-off-by: Huang Shijie <shijie.huang@arm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
> ---
> mm/hugetlb.c | 12 ++++++------
> 1 file changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index db0177b..6995087 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1142,7 +1142,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h,
> return page;
> }
>
> -static int alloc_fresh_gigantic_page(struct hstate *h,
> +static struct page *alloc_fresh_gigantic_page(struct hstate *h,
> nodemask_t *nodes_allowed, bool no_init)
> {
> struct page *page = NULL;
> @@ -1151,10 +1151,10 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
> for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
> page = alloc_fresh_gigantic_page_node(h, node, no_init);
> if (page)
> - return 1;
> + return page;
> }
>
> - return 0;
> + return NULL;
> }
>
> static inline bool gigantic_page_supported(void) { return true; }
> @@ -1167,8 +1167,8 @@ static inline bool gigantic_page_supported(void) { return false; }
> static inline void free_gigantic_page(struct page *page, unsigned int order) { }
> static inline void destroy_compound_gigantic_page(struct page *page,
> unsigned int order) { }
> -static inline int alloc_fresh_gigantic_page(struct hstate *h,
> - nodemask_t *nodes_allowed, bool no_init) { return 0; }
> +static inline struct page *alloc_fresh_gigantic_page(struct hstate *h,
> + nodemask_t *nodes_allowed, bool no_init) { return NULL; }
> #endif
>
> static void update_and_free_page(struct hstate *h, struct page *page)
> @@ -2315,7 +2315,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
> cond_resched();
>
> if (hstate_is_gigantic(h))
> - ret = alloc_fresh_gigantic_page(h, nodes_allowed,
> + ret = !!alloc_fresh_gigantic_page(h, nodes_allowed,
> false);
> else
> ret = alloc_fresh_huge_page(h, nodes_allowed);
> --
> 2.5.5
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo at kvack.org. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email at kvack.org </a>
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 4/6] mm: mempolicy: intruduce a helper huge_nodemask()
2016-11-16 6:53 ` [PATCH V2 fix " Huang Shijie
@ 2016-12-02 13:58 ` Michal Hocko
2016-12-05 3:09 ` Huang Shijie
0 siblings, 1 reply; 34+ messages in thread
From: Michal Hocko @ 2016-12-02 13:58 UTC (permalink / raw)
To: linux-arm-kernel
On Wed 16-11-16 14:53:02, Huang Shijie wrote:
> This patch introduces a new helper, huge_nodemask(), which
> we can use to get the node mask.
>
> The idea for the function comes from init_nodemask_of_mempolicy():
> Return true if we can succeed in extracting the node_mask
> for 'bind' or 'interleave' policy or initializing the node_mask
> to contain the single node for 'preferred' or 'local' policy.
It is absolutely unclear from this patch alone how this is going to be used.
Please make sure a newly added function is also used in the same patch.
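For context, the consumer of huge_nodemask() in this series is the vma branch
of __hugetlb_alloc_gigantic_page() added in patch 5/6, which boils down to:

/* NUMA && vma: follow the vma's memory policy */
if (huge_nodemask(vma, addr, nodes_allowed))
        page = alloc_fresh_gigantic_page(h, nodes_allowed, true);

Folding the helper into that patch would keep the definition next to its user.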
>
> Signed-off-by: Huang Shijie <shijie.huang@arm.com>
> ---
> The previous version did not handle the MPOL_PREFERRED/MPOL_INTERLEAVE cases.
> This patch adds the code to set a proper node mask for
> MPOL_PREFERRED/MPOL_INTERLEAVE.
> ---
> include/linux/mempolicy.h | 8 ++++++++
> mm/mempolicy.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 55 insertions(+)
>
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> index 5e5b296..7796a40 100644
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -145,6 +145,8 @@ extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
> enum mpol_rebind_step step);
> extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
>
> +extern bool huge_nodemask(struct vm_area_struct *vma,
> + unsigned long addr, nodemask_t *mask);
> extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
> unsigned long addr, gfp_t gfp_flags,
> struct mempolicy **mpol, nodemask_t **nodemask);
> @@ -261,6 +263,12 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
> {
> }
>
> +static inline bool huge_nodemask(struct vm_area_struct *vma,
> + unsigned long addr, nodemask_t *mask)
> +{
> + return false;
> +}
> +
> static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
> unsigned long addr, gfp_t gfp_flags,
> struct mempolicy **mpol, nodemask_t **nodemask)
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index 6d3639e..5063a69 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1800,6 +1800,53 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
>
> #ifdef CONFIG_HUGETLBFS
> /*
> + * huge_nodemask(@vma, @addr, @mask)
> + * @vma: virtual memory area whose policy is sought
> + * @addr: address in @vma
> + * @mask: a nodemask pointer
> + *
> + * Return true if we can succeed in extracting the policy nodemask
> + * for 'bind' or 'interleave' policy into the argument @mask, or
> + * initializing the argument @mask to contain the single node for
> + * 'preferred' or 'local' policy.
> + */
> +bool huge_nodemask(struct vm_area_struct *vma, unsigned long addr,
> + nodemask_t *mask)
> +{
> + struct mempolicy *mpol;
> + bool ret = true;
> + int nid;
> +
> + if (!mask)
> + return false;
> +
> + mpol = get_vma_policy(vma, addr);
> +
> + switch (mpol->mode) {
> + case MPOL_PREFERRED:
> + if (mpol->flags & MPOL_F_LOCAL)
> + nid = numa_node_id();
> + else
> + nid = mpol->v.preferred_node;
> + init_nodemask_of_node(mask, nid);
> + break;
> +
> + case MPOL_BIND:
> + /* Fall through */
> + case MPOL_INTERLEAVE:
> + *mask = mpol->v.nodes;
> + break;
> +
> + default:
> + ret = false;
> + break;
> + }
> + mpol_cond_put(mpol);
> +
> + return ret;
> +}
> +
> +/*
> * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
> * @vma: virtual memory area whose policy is sought
> * @addr: address in @vma for shared policy lookup and interleave policy
> --
> 2.5.5
>
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 5/6] mm: hugetlb: add a new function to allocate a new gigantic page
2016-11-16 6:55 ` [PATCH V2 fix " Huang Shijie
2016-11-28 14:17 ` Vlastimil Babka
@ 2016-12-02 14:03 ` Michal Hocko
2016-12-05 3:15 ` Huang Shijie
1 sibling, 1 reply; 34+ messages in thread
From: Michal Hocko @ 2016-12-02 14:03 UTC (permalink / raw)
To: linux-arm-kernel
On Wed 16-11-16 14:55:04, Huang Shijie wrote:
> There are three ways we can allocate a new gigantic page:
>
> 1. When the NUMA is not enabled, use alloc_gigantic_page() to get
> the gigantic page.
>
> 2. The NUMA is enabled, but the vma is NULL.
> There is no memory policy we can refer to.
> So create a @nodes_allowed, initialize it with init_nodemask_of_mempolicy()
> or init_nodemask_of_node(). Then use alloc_fresh_gigantic_page() to get
> the gigantic page.
>
> 3. The NUMA is enabled, and the vma is valid.
> We can follow the memory policy of the @vma.
>
> Get @nodes_allowed by huge_nodemask(), and use alloc_fresh_gigantic_page()
> to get the gigantic page.
Again, __hugetlb_alloc_gigantic_page() is not used here and it is hard to
deduce its usage from this commit. The above shouldn't really be much
different from what we do in alloc_pages_vma(), so please make sure to check
it before coming up with something hugetlb specific.
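To sketch the direction (resolve the policy into a nodemask once, then make a
single allocation call), a hypothetical restructuring could look like the
following; this is purely for illustration, not what was merged:

static struct page *__hugetlb_alloc_gigantic_page(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr, int nid)
{
        NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
        struct page *page = NULL;

        if (!nodes_allowed)
                return NULL;

        if (IS_ENABLED(CONFIG_NUMA) && vma) {
                /* Follow the vma's memory policy. */
                if (!huge_nodemask(vma, addr, nodes_allowed))
                        goto out;
        } else if (nid != NUMA_NO_NODE) {
                /* An explicit node was requested. */
                init_nodemask_of_node(nodes_allowed, nid);
        } else if (!init_nodemask_of_mempolicy(nodes_allowed)) {
                /* No usable task policy: allow any node with memory. */
                *nodes_allowed = node_states[N_MEMORY];
        }

        page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
out:
        NODEMASK_FREE(nodes_allowed);
        return page;
}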
> Signed-off-by: Huang Shijie <shijie.huang@arm.com>
> ---
> Since the huge_nodemask() is changed, we have to change this function a little.
>
> ---
> mm/hugetlb.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 63 insertions(+)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6995087..c33bddc 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1502,6 +1502,69 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
>
> /*
> * There are 3 ways this can get called:
> + *
> + * 1. When the NUMA is not enabled, use alloc_gigantic_page() to get
> + * the gigantic page.
> + *
> + * 2. The NUMA is enabled, but the vma is NULL.
> + * Create a @nodes_allowed, and use alloc_fresh_gigantic_page() to get
> + * the gigantic page.
> + *
> + * 3. The NUMA is enabled, and the vma is valid.
> + * Use the @vma's memory policy.
> + * Get @nodes_allowed by huge_nodemask(), and use alloc_fresh_gigantic_page()
> + * to get the gigantic page.
> + */
> +static struct page *__hugetlb_alloc_gigantic_page(struct hstate *h,
> + struct vm_area_struct *vma, unsigned long addr, int nid)
> +{
> + NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
> + struct page *page = NULL;
> +
> + /* Not NUMA */
> + if (!IS_ENABLED(CONFIG_NUMA)) {
> + if (nid == NUMA_NO_NODE)
> + nid = numa_mem_id();
> +
> + page = alloc_gigantic_page(nid, huge_page_order(h));
> + if (page)
> + prep_compound_gigantic_page(page, huge_page_order(h));
> +
> + NODEMASK_FREE(nodes_allowed);
> + return page;
> + }
> +
> + /* NUMA && !vma */
> + if (!vma) {
> + if (nid == NUMA_NO_NODE) {
> + if (!init_nodemask_of_mempolicy(nodes_allowed)) {
> + NODEMASK_FREE(nodes_allowed);
> + nodes_allowed = &node_states[N_MEMORY];
> + }
> + } else if (nodes_allowed) {
> + init_nodemask_of_node(nodes_allowed, nid);
> + } else {
> + nodes_allowed = &node_states[N_MEMORY];
> + }
> +
> + page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
> +
> + if (nodes_allowed != &node_states[N_MEMORY])
> + NODEMASK_FREE(nodes_allowed);
> +
> + return page;
> + }
> +
> + /* NUMA && vma */
> + if (huge_nodemask(vma, addr, nodes_allowed))
> + page = alloc_fresh_gigantic_page(h, nodes_allowed, true);
> +
> + NODEMASK_FREE(nodes_allowed);
> + return page;
> +}
> +
> +/*
> + * There are 3 ways this can get called:
> * 1. With vma+addr: we use the VMA's memory policy
> * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
> * page from any node, and let the buddy allocator itself figure
> --
> 2.5.5
>
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
` (8 preceding siblings ...)
2016-11-30 6:30 ` [PATCH extra ] mm: hugetlb: add description for alloc_gigantic_page() Huang Shijie
@ 2016-12-02 14:05 ` Michal Hocko
9 siblings, 0 replies; 34+ messages in thread
From: Michal Hocko @ 2016-12-02 14:05 UTC (permalink / raw)
To: linux-arm-kernel
On Mon 14-11-16 15:07:33, Huang Shijie wrote:
> (1) Background
> For the arm64, the hugetlb page size can be 32M (PMD + Contiguous bit).
> In the 4K page environment, the max page order is 10 (max_order - 1),
> so 32M page is the gigantic page.
>
> The arm64 MMU supports a Contiguous bit which is a hint that the TTE
> is one of a set of contiguous entries which can be cached in a single
> TLB entry. Please refer to the ARMv8 manual:
> DDI0487A_f_armv8_arm.pdf (page D4-1811)
>
> (2) The bug
> After I tested the libhugetlbfs, I found the test case "counter.sh"
> will fail with the gigantic page (32M page in arm64 board).
>
> This patch set adds support for gigantic surplus hugetlb pages,
> allowing the counter.sh unit test to pass.
Andrew, I have noticed that this patchset is already sitting in the mmotm
tree. I have to say I am not really happy about the changes it is
introducing. It is making an already confused code base even more so. I have
already commented on the respective patches, but in general I think it needs
deeper thought before it can be merged.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 2/6] mm: hugetlb: add a new parameter for some functions
2016-12-02 13:52 ` Michal Hocko
@ 2016-12-05 3:05 ` Huang Shijie
0 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-12-05 3:05 UTC (permalink / raw)
To: linux-arm-kernel
On Fri, Dec 02, 2016 at 02:52:30PM +0100, Michal Hocko wrote:
> On Mon 14-11-16 15:07:35, Huang Shijie wrote:
> > This patch adds a new parameter, the "no_init", for these functions:
> > alloc_fresh_gigantic_page_node()
> > alloc_fresh_gigantic_page()
> >
> > The prep_new_huge_page() does some initialization for the new page.
> > But sometime, we do not need it to do so, such as in the surplus case
> > in later patch.
> >
> > With this parameter, the prep_new_huge_page() can be called by needed:
> > If the "no_init" is false, calls the prep_new_huge_page() in
> > the alloc_fresh_gigantic_page_node();
>
> This double negative just makes my head spin. I haven't got to later
> patch to understand the motivation but if anything bool do_prep would
> be much more clear. In general doing these "init if a parameter is
Okay, I will use "do_prep" for the new parameter.
Thanks for the code review.
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH v2 3/6] mm: hugetlb: change the return type for alloc_fresh_gigantic_page
2016-12-02 13:56 ` Michal Hocko
@ 2016-12-05 3:06 ` Huang Shijie
0 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-12-05 3:06 UTC (permalink / raw)
To: linux-arm-kernel
On Fri, Dec 02, 2016 at 02:56:43PM +0100, Michal Hocko wrote:
> On Mon 14-11-16 15:07:36, Huang Shijie wrote:
> > This patch changes the return type to "struct page*" for
> > alloc_fresh_gigantic_page().
>
> OK, this makes some sense. The other hugetlb allocation functions (and the
> page allocator in general) return a struct page as well. Besides, an int would
> make sense if we wanted to convey an error code, but 0 vs. 1 just doesn't
> make any sense.
>
> But if you are changing that then alloc_fresh_huge_page should be
> changed as well.
Okay.
>
> > This patch makes preparation for later patch.
> >
> > Signed-off-by: Huang Shijie <shijie.huang@arm.com>
>
> Acked-by: Michal Hocko <mhocko@suse.com>
Thanks a lot.
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 4/6] mm: mempolicy: intruduce a helper huge_nodemask()
2016-12-02 13:58 ` Michal Hocko
@ 2016-12-05 3:09 ` Huang Shijie
0 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-12-05 3:09 UTC (permalink / raw)
To: linux-arm-kernel
On Fri, Dec 02, 2016 at 02:58:46PM +0100, Michal Hocko wrote:
> On Wed 16-11-16 14:53:02, Huang Shijie wrote:
> > This patch intruduces a new helper huge_nodemask(),
> > we can use it to get the node mask.
> >
> > This idea of the function is from the init_nodemask_of_mempolicy():
> > Return true if we can succeed in extracting the node_mask
> > for 'bind' or 'interleave' policy or initializing the node_mask
> > to contain the single node for 'preferred' or 'local' policy.
>
> It is absolutely unclear how this is going to be used from this patch.
> Please make sure to also use a newly added function in the same patch.
>
Okay, I will merge this patch into the later patch.
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
* [PATCH V2 fix 5/6] mm: hugetlb: add a new function to allocate a new gigantic page
2016-12-02 14:03 ` Michal Hocko
@ 2016-12-05 3:15 ` Huang Shijie
0 siblings, 0 replies; 34+ messages in thread
From: Huang Shijie @ 2016-12-05 3:15 UTC (permalink / raw)
To: linux-arm-kernel
On Fri, Dec 02, 2016 at 03:03:30PM +0100, Michal Hocko wrote:
> On Wed 16-11-16 14:55:04, Huang Shijie wrote:
> > There are three ways we can allocate a new gigantic page:
> >
> > 1. When the NUMA is not enabled, use alloc_gigantic_page() to get
> > the gigantic page.
> >
> > 2. The NUMA is enabled, but the vma is NULL.
> > There is no memory policy we can refer to.
> > So create a @nodes_allowed, initialize it with init_nodemask_of_mempolicy()
> > or init_nodemask_of_node(). Then use alloc_fresh_gigantic_page() to get
> > the gigantic page.
> >
> > 3. The NUMA is enabled, and the vma is valid.
> > We can follow the memory policy of the @vma.
> >
> > Get @nodes_allowed by huge_nodemask(), and use alloc_fresh_gigantic_page()
> > to get the gigantic page.
>
> Again __hugetlb_alloc_gigantic_page is not used and it is hard to deduce
> its usage from this commit. The above shouldn't be really much different from
Okay, I will merge it into the later patch.
> what we do in alloc_pages_vma so please make sure to check it before
> coming up with something hugetlb specific.
No problem. Thanks for the hint.
Thanks
Huang Shijie
^ permalink raw reply [flat|nested] 34+ messages in thread
End of thread.
Thread overview: 34+ messages:
2016-11-14 7:07 [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Huang Shijie
2016-11-14 7:07 ` [PATCH v2 1/6] mm: hugetlb: rename some allocation functions Huang Shijie
2016-11-28 13:29 ` Vlastimil Babka
2016-11-29 8:53 ` Huang Shijie
2016-11-29 10:44 ` Vlastimil Babka
2016-11-30 3:03 ` Huang Shijie
2016-11-14 7:07 ` [PATCH v2 2/6] mm: hugetlb: add a new parameter for some functions Huang Shijie
2016-12-02 13:52 ` Michal Hocko
2016-12-05 3:05 ` Huang Shijie
2016-11-14 7:07 ` [PATCH v2 3/6] mm: hugetlb: change the return type for alloc_fresh_gigantic_page Huang Shijie
2016-12-02 13:56 ` Michal Hocko
2016-12-05 3:06 ` Huang Shijie
2016-11-14 7:07 ` [PATCH v2 4/6] mm: mempolicy: intruduce a helper huge_nodemask() Huang Shijie
2016-11-15 6:01 ` Aneesh Kumar K.V
2016-11-15 8:20 ` Huang Shijie
2016-11-15 8:52 ` Huang Shijie
2016-11-16 6:53 ` [PATCH V2 fix " Huang Shijie
2016-12-02 13:58 ` Michal Hocko
2016-12-05 3:09 ` Huang Shijie
2016-11-14 7:07 ` [PATCH v2 5/6] mm: hugetlb: add a new function to allocate a new gigantic page Huang Shijie
2016-11-16 6:55 ` [PATCH V2 fix " Huang Shijie
2016-11-28 14:17 ` Vlastimil Babka
2016-11-29 9:03 ` Huang Shijie
2016-11-29 10:50 ` Vlastimil Babka
2016-11-30 3:02 ` Huang Shijie
2016-12-02 14:03 ` Michal Hocko
2016-12-05 3:15 ` Huang Shijie
2016-11-14 7:07 ` [PATCH v2 6/6] mm: hugetlb: support gigantic surplus pages Huang Shijie
2016-11-14 22:44 ` [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Andrew Morton
2016-11-15 2:36 ` Huang Shijie
2016-11-28 14:20 ` Vlastimil Babka
2016-11-29 9:07 ` Huang Shijie
2016-11-30 6:30 ` [PATCH extra ] mm: hugetlb: add description for alloc_gigantic_page() Huang Shijie
2016-12-02 14:05 ` [PATCH v2 0/6] mm: fix the "counter.sh" failure for libhugetlbfs Michal Hocko