[PATCH v13 8/8] mm: Don't split THP page when syscall is called

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Minchan Kim <minchan@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Michael Kerrisk <mtk.manpages@gmail.com>,
	Linux API <linux-api@vger.kernel.org>,
	Hugh Dickins <hughd@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Rik van Riel <riel@redhat.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Mel Gorman <mgorman@suse.de>, Jason Evans <je@fb.com>,
	Zhang Yanfei <zhangyanfei@cn.fujitsu.com>,
	"Kirill A. Shutemov" <kirill@shutemov.name>,
	Minchan Kim <minchan@kernel.org>,
	Andrea Arcangeli <aarcange@redhat.com>
Subject: [PATCH v13 8/8] mm: Don't split THP page when syscall is called
Date: Fri, 18 Jul 2014 15:53:06 +0900	[thread overview]
Message-ID: <1405666386-15095-9-git-send-email-minchan@kernel.org> (raw)
In-Reply-To: <1405666386-15095-1-git-send-email-minchan@kernel.org>

We don't need to split THP page when MADV_FREE syscall is
called. It could be done when VM decide really frees it so
we could avoid unnecessary THP split.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/huge_mm.h |  4 ++++
 mm/huge_memory.c        | 35 +++++++++++++++++++++++++++++++++++
 mm/madvise.c            | 21 ++++++++++++++++++++-
 mm/rmap.c               |  8 ++++++--
 mm/vmscan.c             | 28 ++++++++++++++++++----------
 5 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 63579cb8d3dc..25a961256d9f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					  unsigned long addr,
 					  pmd_t *pmd,
 					  unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+			struct vm_area_struct *vma,
+			pmd_t *pmd, unsigned long addr);
 extern int zap_huge_pmd(struct mmu_gather *tlb,
 			struct vm_area_struct *vma,
 			pmd_t *pmd, unsigned long addr);
@@ -56,6 +59,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 				     unsigned long address,
 				     enum page_check_address_pmd_flag flag,
 				     spinlock_t **ptl);
+extern int pmd_freeable(pmd_t pmd);
 
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 02559efd9827..c238a19a648f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1384,6 +1384,36 @@ out:
 	return 0;
 }
 
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		 pmd_t *pmd, unsigned long addr)
+
+{
+	spinlock_t *ptl;
+	struct mm_struct *mm = tlb->mm;
+	int ret = 1;
+
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		struct page *page;
+		pmd_t orig_pmd;
+
+		orig_pmd = pmdp_get_and_clear(mm, addr, pmd);
+
+		/* No hugepage in swapcache */
+		page = pmd_page(orig_pmd);
+		VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+		orig_pmd = pmd_mkold(orig_pmd);
+		orig_pmd = pmd_mkclean(orig_pmd);
+
+		set_pmd_at(mm, addr, pmd, orig_pmd);
+		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		spin_unlock(ptl);
+		ret = 0;
+	}
+
+	return ret;
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 {
@@ -1620,6 +1650,11 @@ unlock:
 	return NULL;
 }
 
+int pmd_freeable(pmd_t pmd)
+{
+	return !pmd_dirty(pmd);
+}
+
 static int __split_huge_page_splitting(struct page *page,
 				       struct vm_area_struct *vma,
 				       unsigned long address)
diff --git a/mm/madvise.c b/mm/madvise.c
index a21584235bb6..84badee5f46d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -271,8 +271,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	pte_t *pte, ptent;
 	struct page *page;
+	unsigned long next;
+
+	next = pmd_addr_end(addr, end);
+	if (pmd_trans_huge(*pmd)) {
+		if (next - addr != HPAGE_PMD_SIZE) {
+#ifdef CONFIG_DEBUG_VM
+			if (!rwsem_is_locked(&mm->mmap_sem)) {
+				pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+					__func__, addr, end,
+					vma->vm_start,
+					vma->vm_end);
+				BUG();
+			}
+#endif
+			split_huge_page_pmd(vma, addr, pmd);
+		} else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr))
+			goto next;
+		/* fall through */
+	}
 
-	split_huge_page_pmd(vma, addr, pmd);
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
@@ -316,6 +334,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
+next:
 	cond_resched();
 	return 0;
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 04c181133890..9c407576ff8e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -704,9 +704,13 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 			referenced++;
 
 		/*
-		 * In this implmentation, MADV_FREE doesn't support THP free
+		 * Use pmd_freeable instead of raw pmd_dirty because in some
+		 * of architecture, pmd_dirty is not defined unless
+		 * CONFIG_TRANSPARNTE_HUGE is enabled
 		 */
-		dirty++;
+		if (!pmd_freeable(*pmd))
+			dirty++;
+
 		spin_unlock(ptl);
 	} else {
 		pte_t *pte;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 598e2ade21f7..b1428bdad88e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -973,17 +973,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page) && !freeable) {
-			if (!(sc->gfp_mask & __GFP_IO))
-				goto keep_locked;
-			if (!add_to_swap(page, page_list))
-				goto activate_locked;
-			may_enter_fs = 1;
-
-			/* Adding to swap updated mapping */
-			mapping = page_mapping(page);
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!freeable) {
+				if (!(sc->gfp_mask & __GFP_IO))
+					goto keep_locked;
+				if (!add_to_swap(page, page_list))
+					goto activate_locked;
+				may_enter_fs = 1;
+				/* Adding to swap updated mapping */
+				mapping = page_mapping(page);
+			} else {
+				if (likely(!PageTransHuge(page)))
+					goto unmap;
+				/* try_to_unmap isn't aware of THP page */
+				if (unlikely(split_huge_page_to_list(page,
+								page_list)))
+					goto keep_locked;
+			}
 		}
-
+unmap:
 		/*
 		 * The page is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
-- 
2.0.0

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

WARNING: multiple messages have this Message-ID (diff)

From: Minchan Kim <minchan@kernel.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	Michael Kerrisk <mtk.manpages@gmail.com>,
	Linux API <linux-api@vger.kernel.org>,
	Hugh Dickins <hughd@google.com>,
	Johannes Weiner <hannes@cmpxchg.org>,
	Rik van Riel <riel@redhat.com>,
	KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>,
	Mel Gorman <mgorman@suse.de>, Jason Evans <je@fb.com>,
	Zhang Yanfei <zhangyanfei@cn.fujitsu.com>,
	"Kirill A. Shutemov" <kirill@shutemov.name>,
	Minchan Kim <minchan@kernel.org>,
	Andrea Arcangeli <aarcange@redhat.com>
Subject: [PATCH v13 8/8] mm: Don't split THP page when syscall is called
Date: Fri, 18 Jul 2014 15:53:06 +0900	[thread overview]
Message-ID: <1405666386-15095-9-git-send-email-minchan@kernel.org> (raw)
In-Reply-To: <1405666386-15095-1-git-send-email-minchan@kernel.org>

We don't need to split THP page when MADV_FREE syscall is
called. It could be done when VM decide really frees it so
we could avoid unnecessary THP split.

Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 include/linux/huge_mm.h |  4 ++++
 mm/huge_memory.c        | 35 +++++++++++++++++++++++++++++++++++
 mm/madvise.c            | 21 ++++++++++++++++++++-
 mm/rmap.c               |  8 ++++++--
 mm/vmscan.c             | 28 ++++++++++++++++++----------
 5 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 63579cb8d3dc..25a961256d9f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					  unsigned long addr,
 					  pmd_t *pmd,
 					  unsigned int flags);
+extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+			struct vm_area_struct *vma,
+			pmd_t *pmd, unsigned long addr);
 extern int zap_huge_pmd(struct mmu_gather *tlb,
 			struct vm_area_struct *vma,
 			pmd_t *pmd, unsigned long addr);
@@ -56,6 +59,7 @@ extern pmd_t *page_check_address_pmd(struct page *page,
 				     unsigned long address,
 				     enum page_check_address_pmd_flag flag,
 				     spinlock_t **ptl);
+extern int pmd_freeable(pmd_t pmd);
 
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 02559efd9827..c238a19a648f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1384,6 +1384,36 @@ out:
 	return 0;
 }
 
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+		 pmd_t *pmd, unsigned long addr)
+
+{
+	spinlock_t *ptl;
+	struct mm_struct *mm = tlb->mm;
+	int ret = 1;
+
+	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+		struct page *page;
+		pmd_t orig_pmd;
+
+		orig_pmd = pmdp_get_and_clear(mm, addr, pmd);
+
+		/* No hugepage in swapcache */
+		page = pmd_page(orig_pmd);
+		VM_BUG_ON_PAGE(PageSwapCache(page), page);
+
+		orig_pmd = pmd_mkold(orig_pmd);
+		orig_pmd = pmd_mkclean(orig_pmd);
+
+		set_pmd_at(mm, addr, pmd, orig_pmd);
+		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		spin_unlock(ptl);
+		ret = 0;
+	}
+
+	return ret;
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 {
@@ -1620,6 +1650,11 @@ unlock:
 	return NULL;
 }
 
+int pmd_freeable(pmd_t pmd)
+{
+	return !pmd_dirty(pmd);
+}
+
 static int __split_huge_page_splitting(struct page *page,
 				       struct vm_area_struct *vma,
 				       unsigned long address)
diff --git a/mm/madvise.c b/mm/madvise.c
index a21584235bb6..84badee5f46d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -271,8 +271,26 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	pte_t *pte, ptent;
 	struct page *page;
+	unsigned long next;
+
+	next = pmd_addr_end(addr, end);
+	if (pmd_trans_huge(*pmd)) {
+		if (next - addr != HPAGE_PMD_SIZE) {
+#ifdef CONFIG_DEBUG_VM
+			if (!rwsem_is_locked(&mm->mmap_sem)) {
+				pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+					__func__, addr, end,
+					vma->vm_start,
+					vma->vm_end);
+				BUG();
+			}
+#endif
+			split_huge_page_pmd(vma, addr, pmd);
+		} else if (!madvise_free_huge_pmd(tlb, vma, pmd, addr))
+			goto next;
+		/* fall through */
+	}
 
-	split_huge_page_pmd(vma, addr, pmd);
 	if (pmd_trans_unstable(pmd))
 		return 0;
 
@@ -316,6 +334,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
+next:
 	cond_resched();
 	return 0;
 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 04c181133890..9c407576ff8e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -704,9 +704,13 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 			referenced++;
 
 		/*
-		 * In this implmentation, MADV_FREE doesn't support THP free
+		 * Use pmd_freeable instead of raw pmd_dirty because in some
+		 * of architecture, pmd_dirty is not defined unless
+		 * CONFIG_TRANSPARNTE_HUGE is enabled
 		 */
-		dirty++;
+		if (!pmd_freeable(*pmd))
+			dirty++;
+
 		spin_unlock(ptl);
 	} else {
 		pte_t *pte;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 598e2ade21f7..b1428bdad88e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -973,17 +973,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page) && !freeable) {
-			if (!(sc->gfp_mask & __GFP_IO))
-				goto keep_locked;
-			if (!add_to_swap(page, page_list))
-				goto activate_locked;
-			may_enter_fs = 1;
-
-			/* Adding to swap updated mapping */
-			mapping = page_mapping(page);
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!freeable) {
+				if (!(sc->gfp_mask & __GFP_IO))
+					goto keep_locked;
+				if (!add_to_swap(page, page_list))
+					goto activate_locked;
+				may_enter_fs = 1;
+				/* Adding to swap updated mapping */
+				mapping = page_mapping(page);
+			} else {
+				if (likely(!PageTransHuge(page)))
+					goto unmap;
+				/* try_to_unmap isn't aware of THP page */
+				if (unlikely(split_huge_page_to_list(page,
+								page_list)))
+					goto keep_locked;
+			}
 		}
-
+unmap:
 		/*
 		 * The page is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
-- 
2.0.0

next prev parent reply	other threads:[~2014-07-18  6:53 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-07-18  6:52 [PATCH v13 0/8] MADV_FREE support Minchan Kim
2014-07-18  6:52 ` Minchan Kim
2014-07-18  6:52 ` Minchan Kim
2014-07-18  6:52 ` [PATCH v13 1/8] mm: support madvise(MADV_FREE) Minchan Kim
2014-07-18  6:52   ` Minchan Kim
2014-07-18  6:53 ` [PATCH v13 2/8] x86: add pmd_[dirty|mkclean] for THP Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-07-18  6:53 ` [PATCH v13 3/8] sparc: " Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-07-18  6:53 ` [PATCH v13 4/8] powerpc: " Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-08-04 10:50   ` Aneesh Kumar K.V
2014-08-04 10:50     ` Aneesh Kumar K.V
2014-08-04 10:50     ` Aneesh Kumar K.V
2014-08-04 10:50     ` Aneesh Kumar K.V
2014-07-18  6:53 ` [PATCH v13 5/8] s390: " Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-07-18  6:53 ` [PATCH v13 6/8] arm: " Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-07-18  6:53   ` Minchan Kim
     [not found]   ` <1405666386-15095-7-git-send-email-minchan-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
2014-07-18  8:48     ` Will Deacon
2014-07-18  8:48       ` Will Deacon
2014-07-18  8:48       ` Will Deacon
2014-07-18  8:48       ` Will Deacon
2014-07-18 14:54     ` Steve Capper
2014-07-18 14:54       ` Steve Capper
2014-07-18 14:54       ` Steve Capper
2014-07-18 14:54       ` Steve Capper
2014-07-18  6:53 ` [PATCH v13 7/8] arm64: " Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-07-18  6:53   ` Minchan Kim
2014-07-18 14:55   ` Steve Capper
2014-07-18 14:55     ` Steve Capper
2014-07-18 14:55     ` Steve Capper
2014-07-18  6:53 ` Minchan Kim [this message]
2014-07-18  6:53   ` [PATCH v13 8/8] mm: Don't split THP page when syscall is called Minchan Kim

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:63579cb8d3d dfblob:25a961256d9 dfblob:02559efd982
dfblob:c238a19a648 dfblob:a21584235bb dfblob:84badee5f46
dfblob:04c18113389 dfblob:9c407576ff8 dfblob:598e2ade21f
dfblob:b1428bdad88 dfblob:63579cb8d3d dfblob:25a961256d9
dfblob:02559efd982 dfblob:c238a19a648 dfblob:a21584235bb
dfblob:84badee5f46 dfblob:04c18113389 dfblob:9c407576ff8
dfblob:598e2ade21f dfblob:b1428bdad88 )
 OR (
bs:"[PATCH v13 8/8] mm: Don't split THP page when syscall is called" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1405666386-15095-9-git-send-email-minchan@kernel.org \
    --to=minchan@kernel.org \
    --cc=aarcange@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=hannes@cmpxchg.org \
    --cc=hughd@google.com \
    --cc=je@fb.com \
    --cc=kirill@shutemov.name \
    --cc=kosaki.motohiro@jp.fujitsu.com \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mgorman@suse.de \
    --cc=mtk.manpages@gmail.com \
    --cc=riel@redhat.com \
    --cc=zhangyanfei@cn.fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.