diff for duplicates of <20160505143924.GC28755@redhat.com>

All of lore.kernel.org
 help / color / mirror / Atom feed

diff for duplicates of <20160505143924.GC28755@redhat.com>

diff --git a/a/1.txt b/N1/1.txt
index 6cfa07b..6fd65a8 100644
--- a/a/1.txt
+++ b/N1/1.txt
@@ -16,3 +16,349 @@ On Wed, May 04, 2016 at 07:19:27PM -0600, Alex Williamson wrote:
 > to v4.5?  Thanks,
 
 I'm currently testing this:
+
+>From c327b17f4de0c968bb3b9035fe36d80b2c28b2f8 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Fri, 29 Apr 2016 01:05:06 +0200
+Subject: [PATCH 1/3] mm: thp: calculate the mapcount correctly for THP pages
+ during WP faults
+
+This will provide fully accuracy to the mapcount calculation in the
+write protect faults, so page pinning will not get broken by false
+positive copy-on-writes.
+
+total_mapcount() isn't the right calculation needed in
+reuse_swap_page(), so this introduces a page_trans_huge_mapcount()
+that is effectively the full accurate return value for page_mapcount()
+if dealing with Transparent Hugepages, however we only use the
+page_trans_huge_mapcount() during COW faults where it strictly needed,
+due to its higher runtime cost.
+
+This also provide at practical zero cost the total_mapcount
+information which is needed to know if we can still relocate the page
+anon_vma to the local vma. If page_trans_huge_mapcount() returns 1 we
+can reuse the page no matter if it's a pte or a pmd_trans_huge
+triggering the fault, but we can only relocate the page anon_vma to
+the local vma->anon_vma if we're sure it's only this "vma" mapping the
+whole THP physical range.
+
+Kirill A. Shutemov reported the problem with moving the page anon_vma
+to the local vma->anon_vma in a previous version of this patch.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+---
+ include/linux/mm.h   |  6 +++++
+ include/linux/swap.h |  8 ++++---
+ mm/huge_memory.c     | 67 +++++++++++++++++++++++++++++++++++++++++++++-------
+ mm/memory.c          | 21 +++++++++-------
+ mm/swapfile.c        | 13 +++++-----
+ 5 files changed, 89 insertions(+), 26 deletions(-)
+
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index a55e5be..4b532d8 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -500,11 +500,17 @@ static inline int page_mapcount(struct page *page)
+ 
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int total_mapcount(struct page *page);
++int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
+ #else
+ static inline int total_mapcount(struct page *page)
+ {
+ 	return page_mapcount(page);
+ }
++static inline int page_trans_huge_mapcount(struct page *page,
++					   int *total_mapcount)
++{
++	return *total_mapcount = page_mapcount(page);
++}
+ #endif
+ 
+ static inline struct page *virt_to_head_page(const void *x)
+diff --git a/include/linux/swap.h b/include/linux/swap.h
+index 2b83359..acef20d 100644
+--- a/include/linux/swap.h
++++ b/include/linux/swap.h
+@@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t);
+ extern int page_swapcount(struct page *);
+ extern int swp_swapcount(swp_entry_t entry);
+ extern struct swap_info_struct *page_swap_info(struct page *);
+-extern int reuse_swap_page(struct page *);
++extern bool reuse_swap_page(struct page *, int *);
+ extern int try_to_free_swap(struct page *);
+ struct backing_dev_info;
+ 
+@@ -513,8 +513,10 @@ static inline int swp_swapcount(swp_entry_t entry)
+ 	return 0;
+ }
+ 
+-#define reuse_swap_page(page) \
+-	(!PageTransCompound(page) && page_mapcount(page) == 1)
++static inline bool reuse_swap_page(struct page *page, int *total_mapcount)
++{
++	return page_trans_huge_mapcount(page, total_mapcount) == 1;
++}
+ 
+ static inline int try_to_free_swap(struct page *page)
+ {
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 86f9f8b..d368620 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
+ 	/*
+ 	 * We can only reuse the page if nobody else maps the huge page or it's
+-	 * part. We can do it by checking page_mapcount() on each sub-page, but
+-	 * it's expensive.
+-	 * The cheaper way is to check page_count() to be equal 1: every
+-	 * mapcount takes page reference reference, so this way we can
+-	 * guarantee, that the PMD is the only mapping.
+-	 * This can give false negative if somebody pinned the page, but that's
+-	 * fine.
++	 * part.
+ 	 */
+-	if (page_mapcount(page) == 1 && page_count(page) == 1) {
++	if (page_trans_huge_mapcount(page, NULL) == 1) {
+ 		pmd_t entry;
+ 		entry = pmd_mkyoung(orig_pmd);
+ 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+@@ -2080,7 +2074,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ 		if (pte_write(pteval)) {
+ 			writable = true;
+ 		} else {
+-			if (PageSwapCache(page) && !reuse_swap_page(page)) {
++			if (PageSwapCache(page) &&
++			    !reuse_swap_page(page, NULL)) {
+ 				unlock_page(page);
+ 				result = SCAN_SWAP_CACHE_PAGE;
+ 				goto out;
+@@ -3225,6 +3220,60 @@ int total_mapcount(struct page *page)
+ }
+ 
+ /*
++ * This calculates accurately how many mappings a transparent hugepage
++ * has (unlike page_mapcount() which isn't fully accurate). This full
++ * accuracy is primarily needed to know if copy-on-write faults can
++ * takeover the page and change the mapping to read-write instead of
++ * copying them. At the same time this returns the total_mapcount too.
++ *
++ * The return value is telling if the page can be reused as it returns
++ * the highest mapcount any one of the subpages has. If the return
++ * value is one, even if different processes are mapping different
++ * subpages of the transparent hugepage, they can all reuse it,
++ * because each process is reusing a different subpage.
++ *
++ * The total_mapcount is instead counting all virtual mappings of the
++ * subpages. If the total_mapcount is equal to "one", it tells the
++ * caller all mappings belong to the same "mm" and in turn the
++ * anon_vma of the transparent hugepage can become the vma->anon_vma
++ * local one as no other process may be mapping any of the subpages.
++ *
++ * It would be more accurate to replace page_mapcount() with
++ * page_trans_huge_mapcount(), however we only use
++ * page_trans_huge_mapcount() in the copy-on-write faults where we
++ * need full accuracy to avoid breaking page pinning, because
++ * page_trans_huge_mapcount is slower than page_mapcount().
++ */
++int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
++{
++	int i, ret, _total_mapcount, mapcount;
++
++	/* hugetlbfs shouldn't call it */
++	VM_BUG_ON_PAGE(PageHuge(page), page);
++
++	if (likely(!PageTransCompound(page)))
++		return atomic_read(&page->_mapcount) + 1;
++
++	page = compound_head(page);
++
++	_total_mapcount = ret = 0;
++	for (i = 0; i < HPAGE_PMD_NR; i++) {
++		mapcount = atomic_read(&page[i]._mapcount) + 1;
++		ret = max(ret, mapcount);
++		_total_mapcount += mapcount;
++	}
++	if (PageDoubleMap(page)) {
++		ret -= 1;
++		_total_mapcount -= HPAGE_PMD_NR;
++	}
++	mapcount = compound_mapcount(page);
++	ret += mapcount;
++	_total_mapcount += mapcount;
++	*total_mapcount = _total_mapcount;
++	return ret;
++}
++
++/*
+  * This function splits huge page into normal pages. @page can point to any
+  * subpage of huge page to split. Split doesn't change the position of @page.
+  *
+diff --git a/mm/memory.c b/mm/memory.c
+index 93897f2..1589aa4 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -2340,6 +2340,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	 * not dirty accountable.
+ 	 */
+ 	if (PageAnon(old_page) && !PageKsm(old_page)) {
++		int total_mapcount;
+ 		if (!trylock_page(old_page)) {
+ 			get_page(old_page);
+ 			pte_unmap_unlock(page_table, ptl);
+@@ -2354,13 +2355,17 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 			}
+ 			put_page(old_page);
+ 		}
+-		if (reuse_swap_page(old_page)) {
+-			/*
+-			 * The page is all ours.  Move it to our anon_vma so
+-			 * the rmap code will not search our parent or siblings.
+-			 * Protected against the rmap code by the page lock.
+-			 */
+-			page_move_anon_rmap(old_page, vma, address);
++		if (reuse_swap_page(old_page, &total_mapcount)) {
++			if (total_mapcount == 1) {
++				/*
++				 * The page is all ours. Move it to
++				 * our anon_vma so the rmap code will
++				 * not search our parent or siblings.
++				 * Protected against the rmap code by
++				 * the page lock.
++				 */
++				page_move_anon_rmap(old_page, vma, address);
++			}
+ 			unlock_page(old_page);
+ 			return wp_page_reuse(mm, vma, address, page_table, ptl,
+ 					     orig_pte, old_page, 0, 0);
+@@ -2584,7 +2589,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ 	inc_mm_counter_fast(mm, MM_ANONPAGES);
+ 	dec_mm_counter_fast(mm, MM_SWAPENTS);
+ 	pte = mk_pte(page, vma->vm_page_prot);
+-	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
++	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ 		flags &= ~FAULT_FLAG_WRITE;
+ 		ret |= VM_FAULT_WRITE;
+diff --git a/mm/swapfile.c b/mm/swapfile.c
+index 83874ec..031713ab 100644
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -922,18 +922,19 @@ out:
+  * to it.  And as a side-effect, free up its swap: because the old content
+  * on disk will never be read, and seeking back there to write new content
+  * later would only waste time away from clustering.
++ *
++ * NOTE: total_mapcount should not be relied upon by the caller if
++ * reuse_swap_page() returns false, but it may be always overwritten
++ * (see the other implementation for CONFIG_SWAP=n).
+  */
+-int reuse_swap_page(struct page *page)
++bool reuse_swap_page(struct page *page, int *total_mapcount)
+ {
+ 	int count;
+ 
+ 	VM_BUG_ON_PAGE(!PageLocked(page), page);
+ 	if (unlikely(PageKsm(page)))
+-		return 0;
+-	/* The page is part of THP and cannot be reused */
+-	if (PageTransCompound(page))
+-		return 0;
+-	count = page_mapcount(page);
++		return false;
++	count = page_trans_huge_mapcount(page, total_mapcount);
+ 	if (count <= 1 && PageSwapCache(page)) {
+ 		count += page_swapcount(page);
+ 		if (count == 1 && !PageWriteback(page)) {
+
+
+
+>From b3cd271859f4c8243b58b4b55998fcc9ee0a0988 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Sat, 30 Apr 2016 18:35:34 +0200
+Subject: [PATCH 3/4] mm: thp: microoptimize compound_mapcount()
+
+compound_mapcount() is only called after PageCompound() has already
+been checked by the caller, so there's no point to check it again. Gcc
+may optimize it away too because it's inline but this will remove the
+runtime check for sure and add it'll add an assert instead.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+---
+ include/linux/mm.h | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 4b532d8..119325d 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -471,8 +471,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page)
+ 
+ static inline int compound_mapcount(struct page *page)
+ {
+-	if (!PageCompound(page))
+-		return 0;
++	VM_BUG_ON_PAGE(!PageCompound(page), page);
+ 	page = compound_head(page);
+ 	return atomic_read(compound_mapcount_ptr(page)) + 1;
+ }
+
+
+
+>From a2f1172344b87b1b0e18d07014ee5ab2027fac10 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Thu, 5 May 2016 00:59:27 +0200
+Subject: [PATCH 4/4] mm: thp: split_huge_pmd_address() comment improvement
+
+Comment is partly wrong, this improves it by including the case of
+split_huge_pmd_address() called by try_to_unmap_one if
+TTU_SPLIT_HUGE_PMD is set.
+
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+---
+ mm/huge_memory.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index f8f07e4..e716726 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -3032,8 +3032,10 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+ 		return;
+ 
+ 	/*
+-	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
+-	 * materialize from under us.
++	 * Caller holds the mmap_sem write mode or the anon_vma lock,
++	 * so a huge pmd cannot materialize from under us (khugepaged
++	 * holds both the mmap_sem write mode and the anon_vma lock
++	 * write mode).
+ 	 */
+ 	__split_huge_pmd(vma, pmd, address, freeze);
+ }
+
+
+I also noticed we aren't calling page_move_anon_rmap in
+do_huge_pmd_wp_page when page_trans_huge_mapcount returns 1, that's a
+longstanding inefficiency but it's not a bug. We're not locking the
+page down in the THP COW because we don't have to deal with swapcache,
+and in turn we can't overwrite the page->mapping. I think in practice
+it would be safe anyway because it's an atomic write and no matter if
+the rmap_walk reader sees the value before or after the write, it'll
+still be able to find the pmd_trans_huge during the rmap walk. However
+if page->mapping can change under the reader (i.e. rmap_walk) then the
+reader should use READ_ONCE to access page->mapping (or page->mapping
+should become volatile). Otherwise it'd be a bug with the C standard
+where gcc could get confused in theory (in practice it would work fine
+as we're mostly just dereferencing that page->mapping pointer and not
+using it for switch/case or stuff like that where gcc could use an
+hash). Regardless for robustness it'd be better if we take appropriate
+locking and so we should take the page lock by doing a check if the
+page->mapping is already pointing to the local vma->anon_vma first, if
+not then we should take the page lock on the head THP and call
+page_move_anon_rmap. Because this is a longstanding problem I didn't
+address it yet, and it's only a missing optimization but it'd be nice
+to get that covered too (considering we just worsened a bit the
+optimization in presence of a COW after a pmd split and before the
+physical split).
diff --git a/a/content_digest b/N1/content_digest
index 76ad7ee..0957dd3 100644
--- a/a/content_digest
+++ b/N1/content_digest
@@ -35,6 +35,352 @@
  "> something we feel confident for posting to v4.6 with a stable backport\n"
  "> to v4.5?  Thanks,\n"
  "\n"
- I'm currently testing this:
+ "I'm currently testing this:\n"
+ "\n"
+ ">From c327b17f4de0c968bb3b9035fe36d80b2c28b2f8 Mon Sep 17 00:00:00 2001\n"
+ "From: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "Date: Fri, 29 Apr 2016 01:05:06 +0200\n"
+ "Subject: [PATCH 1/3] mm: thp: calculate the mapcount correctly for THP pages\n"
+ " during WP faults\n"
+ "\n"
+ "This will provide fully accuracy to the mapcount calculation in the\n"
+ "write protect faults, so page pinning will not get broken by false\n"
+ "positive copy-on-writes.\n"
+ "\n"
+ "total_mapcount() isn't the right calculation needed in\n"
+ "reuse_swap_page(), so this introduces a page_trans_huge_mapcount()\n"
+ "that is effectively the full accurate return value for page_mapcount()\n"
+ "if dealing with Transparent Hugepages, however we only use the\n"
+ "page_trans_huge_mapcount() during COW faults where it strictly needed,\n"
+ "due to its higher runtime cost.\n"
+ "\n"
+ "This also provide at practical zero cost the total_mapcount\n"
+ "information which is needed to know if we can still relocate the page\n"
+ "anon_vma to the local vma. If page_trans_huge_mapcount() returns 1 we\n"
+ "can reuse the page no matter if it's a pte or a pmd_trans_huge\n"
+ "triggering the fault, but we can only relocate the page anon_vma to\n"
+ "the local vma->anon_vma if we're sure it's only this \"vma\" mapping the\n"
+ "whole THP physical range.\n"
+ "\n"
+ "Kirill A. Shutemov reported the problem with moving the page anon_vma\n"
+ "to the local vma->anon_vma in a previous version of this patch.\n"
+ "\n"
+ "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "---\n"
+ " include/linux/mm.h   |  6 +++++\n"
+ " include/linux/swap.h |  8 ++++---\n"
+ " mm/huge_memory.c     | 67 +++++++++++++++++++++++++++++++++++++++++++++-------\n"
+ " mm/memory.c          | 21 +++++++++-------\n"
+ " mm/swapfile.c        | 13 +++++-----\n"
+ " 5 files changed, 89 insertions(+), 26 deletions(-)\n"
+ "\n"
+ "diff --git a/include/linux/mm.h b/include/linux/mm.h\n"
+ "index a55e5be..4b532d8 100644\n"
+ "--- a/include/linux/mm.h\n"
+ "+++ b/include/linux/mm.h\n"
+ "@@ -500,11 +500,17 @@ static inline int page_mapcount(struct page *page)\n"
+ " \n"
+ " #ifdef CONFIG_TRANSPARENT_HUGEPAGE\n"
+ " int total_mapcount(struct page *page);\n"
+ "+int page_trans_huge_mapcount(struct page *page, int *total_mapcount);\n"
+ " #else\n"
+ " static inline int total_mapcount(struct page *page)\n"
+ " {\n"
+ " \treturn page_mapcount(page);\n"
+ " }\n"
+ "+static inline int page_trans_huge_mapcount(struct page *page,\n"
+ "+\t\t\t\t\t   int *total_mapcount)\n"
+ "+{\n"
+ "+\treturn *total_mapcount = page_mapcount(page);\n"
+ "+}\n"
+ " #endif\n"
+ " \n"
+ " static inline struct page *virt_to_head_page(const void *x)\n"
+ "diff --git a/include/linux/swap.h b/include/linux/swap.h\n"
+ "index 2b83359..acef20d 100644\n"
+ "--- a/include/linux/swap.h\n"
+ "+++ b/include/linux/swap.h\n"
+ "@@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t);\n"
+ " extern int page_swapcount(struct page *);\n"
+ " extern int swp_swapcount(swp_entry_t entry);\n"
+ " extern struct swap_info_struct *page_swap_info(struct page *);\n"
+ "-extern int reuse_swap_page(struct page *);\n"
+ "+extern bool reuse_swap_page(struct page *, int *);\n"
+ " extern int try_to_free_swap(struct page *);\n"
+ " struct backing_dev_info;\n"
+ " \n"
+ "@@ -513,8 +513,10 @@ static inline int swp_swapcount(swp_entry_t entry)\n"
+ " \treturn 0;\n"
+ " }\n"
+ " \n"
+ "-#define reuse_swap_page(page) \\\n"
+ "-\t(!PageTransCompound(page) && page_mapcount(page) == 1)\n"
+ "+static inline bool reuse_swap_page(struct page *page, int *total_mapcount)\n"
+ "+{\n"
+ "+\treturn page_trans_huge_mapcount(page, total_mapcount) == 1;\n"
+ "+}\n"
+ " \n"
+ " static inline int try_to_free_swap(struct page *page)\n"
+ " {\n"
+ "diff --git a/mm/huge_memory.c b/mm/huge_memory.c\n"
+ "index 86f9f8b..d368620 100644\n"
+ "--- a/mm/huge_memory.c\n"
+ "+++ b/mm/huge_memory.c\n"
+ "@@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,\n"
+ " \tVM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);\n"
+ " \t/*\n"
+ " \t * We can only reuse the page if nobody else maps the huge page or it's\n"
+ "-\t * part. We can do it by checking page_mapcount() on each sub-page, but\n"
+ "-\t * it's expensive.\n"
+ "-\t * The cheaper way is to check page_count() to be equal 1: every\n"
+ "-\t * mapcount takes page reference reference, so this way we can\n"
+ "-\t * guarantee, that the PMD is the only mapping.\n"
+ "-\t * This can give false negative if somebody pinned the page, but that's\n"
+ "-\t * fine.\n"
+ "+\t * part.\n"
+ " \t */\n"
+ "-\tif (page_mapcount(page) == 1 && page_count(page) == 1) {\n"
+ "+\tif (page_trans_huge_mapcount(page, NULL) == 1) {\n"
+ " \t\tpmd_t entry;\n"
+ " \t\tentry = pmd_mkyoung(orig_pmd);\n"
+ " \t\tentry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);\n"
+ "@@ -2080,7 +2074,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,\n"
+ " \t\tif (pte_write(pteval)) {\n"
+ " \t\t\twritable = true;\n"
+ " \t\t} else {\n"
+ "-\t\t\tif (PageSwapCache(page) && !reuse_swap_page(page)) {\n"
+ "+\t\t\tif (PageSwapCache(page) &&\n"
+ "+\t\t\t    !reuse_swap_page(page, NULL)) {\n"
+ " \t\t\t\tunlock_page(page);\n"
+ " \t\t\t\tresult = SCAN_SWAP_CACHE_PAGE;\n"
+ " \t\t\t\tgoto out;\n"
+ "@@ -3225,6 +3220,60 @@ int total_mapcount(struct page *page)\n"
+ " }\n"
+ " \n"
+ " /*\n"
+ "+ * This calculates accurately how many mappings a transparent hugepage\n"
+ "+ * has (unlike page_mapcount() which isn't fully accurate). This full\n"
+ "+ * accuracy is primarily needed to know if copy-on-write faults can\n"
+ "+ * takeover the page and change the mapping to read-write instead of\n"
+ "+ * copying them. At the same time this returns the total_mapcount too.\n"
+ "+ *\n"
+ "+ * The return value is telling if the page can be reused as it returns\n"
+ "+ * the highest mapcount any one of the subpages has. If the return\n"
+ "+ * value is one, even if different processes are mapping different\n"
+ "+ * subpages of the transparent hugepage, they can all reuse it,\n"
+ "+ * because each process is reusing a different subpage.\n"
+ "+ *\n"
+ "+ * The total_mapcount is instead counting all virtual mappings of the\n"
+ "+ * subpages. If the total_mapcount is equal to \"one\", it tells the\n"
+ "+ * caller all mappings belong to the same \"mm\" and in turn the\n"
+ "+ * anon_vma of the transparent hugepage can become the vma->anon_vma\n"
+ "+ * local one as no other process may be mapping any of the subpages.\n"
+ "+ *\n"
+ "+ * It would be more accurate to replace page_mapcount() with\n"
+ "+ * page_trans_huge_mapcount(), however we only use\n"
+ "+ * page_trans_huge_mapcount() in the copy-on-write faults where we\n"
+ "+ * need full accuracy to avoid breaking page pinning, because\n"
+ "+ * page_trans_huge_mapcount is slower than page_mapcount().\n"
+ "+ */\n"
+ "+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)\n"
+ "+{\n"
+ "+\tint i, ret, _total_mapcount, mapcount;\n"
+ "+\n"
+ "+\t/* hugetlbfs shouldn't call it */\n"
+ "+\tVM_BUG_ON_PAGE(PageHuge(page), page);\n"
+ "+\n"
+ "+\tif (likely(!PageTransCompound(page)))\n"
+ "+\t\treturn atomic_read(&page->_mapcount) + 1;\n"
+ "+\n"
+ "+\tpage = compound_head(page);\n"
+ "+\n"
+ "+\t_total_mapcount = ret = 0;\n"
+ "+\tfor (i = 0; i < HPAGE_PMD_NR; i++) {\n"
+ "+\t\tmapcount = atomic_read(&page[i]._mapcount) + 1;\n"
+ "+\t\tret = max(ret, mapcount);\n"
+ "+\t\t_total_mapcount += mapcount;\n"
+ "+\t}\n"
+ "+\tif (PageDoubleMap(page)) {\n"
+ "+\t\tret -= 1;\n"
+ "+\t\t_total_mapcount -= HPAGE_PMD_NR;\n"
+ "+\t}\n"
+ "+\tmapcount = compound_mapcount(page);\n"
+ "+\tret += mapcount;\n"
+ "+\t_total_mapcount += mapcount;\n"
+ "+\t*total_mapcount = _total_mapcount;\n"
+ "+\treturn ret;\n"
+ "+}\n"
+ "+\n"
+ "+/*\n"
+ "  * This function splits huge page into normal pages. @page can point to any\n"
+ "  * subpage of huge page to split. Split doesn't change the position of @page.\n"
+ "  *\n"
+ "diff --git a/mm/memory.c b/mm/memory.c\n"
+ "index 93897f2..1589aa4 100644\n"
+ "--- a/mm/memory.c\n"
+ "+++ b/mm/memory.c\n"
+ "@@ -2340,6 +2340,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,\n"
+ " \t * not dirty accountable.\n"
+ " \t */\n"
+ " \tif (PageAnon(old_page) && !PageKsm(old_page)) {\n"
+ "+\t\tint total_mapcount;\n"
+ " \t\tif (!trylock_page(old_page)) {\n"
+ " \t\t\tget_page(old_page);\n"
+ " \t\t\tpte_unmap_unlock(page_table, ptl);\n"
+ "@@ -2354,13 +2355,17 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,\n"
+ " \t\t\t}\n"
+ " \t\t\tput_page(old_page);\n"
+ " \t\t}\n"
+ "-\t\tif (reuse_swap_page(old_page)) {\n"
+ "-\t\t\t/*\n"
+ "-\t\t\t * The page is all ours.  Move it to our anon_vma so\n"
+ "-\t\t\t * the rmap code will not search our parent or siblings.\n"
+ "-\t\t\t * Protected against the rmap code by the page lock.\n"
+ "-\t\t\t */\n"
+ "-\t\t\tpage_move_anon_rmap(old_page, vma, address);\n"
+ "+\t\tif (reuse_swap_page(old_page, &total_mapcount)) {\n"
+ "+\t\t\tif (total_mapcount == 1) {\n"
+ "+\t\t\t\t/*\n"
+ "+\t\t\t\t * The page is all ours. Move it to\n"
+ "+\t\t\t\t * our anon_vma so the rmap code will\n"
+ "+\t\t\t\t * not search our parent or siblings.\n"
+ "+\t\t\t\t * Protected against the rmap code by\n"
+ "+\t\t\t\t * the page lock.\n"
+ "+\t\t\t\t */\n"
+ "+\t\t\t\tpage_move_anon_rmap(old_page, vma, address);\n"
+ "+\t\t\t}\n"
+ " \t\t\tunlock_page(old_page);\n"
+ " \t\t\treturn wp_page_reuse(mm, vma, address, page_table, ptl,\n"
+ " \t\t\t\t\t     orig_pte, old_page, 0, 0);\n"
+ "@@ -2584,7 +2589,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,\n"
+ " \tinc_mm_counter_fast(mm, MM_ANONPAGES);\n"
+ " \tdec_mm_counter_fast(mm, MM_SWAPENTS);\n"
+ " \tpte = mk_pte(page, vma->vm_page_prot);\n"
+ "-\tif ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {\n"
+ "+\tif ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {\n"
+ " \t\tpte = maybe_mkwrite(pte_mkdirty(pte), vma);\n"
+ " \t\tflags &= ~FAULT_FLAG_WRITE;\n"
+ " \t\tret |= VM_FAULT_WRITE;\n"
+ "diff --git a/mm/swapfile.c b/mm/swapfile.c\n"
+ "index 83874ec..031713ab 100644\n"
+ "--- a/mm/swapfile.c\n"
+ "+++ b/mm/swapfile.c\n"
+ "@@ -922,18 +922,19 @@ out:\n"
+ "  * to it.  And as a side-effect, free up its swap: because the old content\n"
+ "  * on disk will never be read, and seeking back there to write new content\n"
+ "  * later would only waste time away from clustering.\n"
+ "+ *\n"
+ "+ * NOTE: total_mapcount should not be relied upon by the caller if\n"
+ "+ * reuse_swap_page() returns false, but it may be always overwritten\n"
+ "+ * (see the other implementation for CONFIG_SWAP=n).\n"
+ "  */\n"
+ "-int reuse_swap_page(struct page *page)\n"
+ "+bool reuse_swap_page(struct page *page, int *total_mapcount)\n"
+ " {\n"
+ " \tint count;\n"
+ " \n"
+ " \tVM_BUG_ON_PAGE(!PageLocked(page), page);\n"
+ " \tif (unlikely(PageKsm(page)))\n"
+ "-\t\treturn 0;\n"
+ "-\t/* The page is part of THP and cannot be reused */\n"
+ "-\tif (PageTransCompound(page))\n"
+ "-\t\treturn 0;\n"
+ "-\tcount = page_mapcount(page);\n"
+ "+\t\treturn false;\n"
+ "+\tcount = page_trans_huge_mapcount(page, total_mapcount);\n"
+ " \tif (count <= 1 && PageSwapCache(page)) {\n"
+ " \t\tcount += page_swapcount(page);\n"
+ " \t\tif (count == 1 && !PageWriteback(page)) {\n"
+ "\n"
+ "\n"
+ "\n"
+ ">From b3cd271859f4c8243b58b4b55998fcc9ee0a0988 Mon Sep 17 00:00:00 2001\n"
+ "From: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "Date: Sat, 30 Apr 2016 18:35:34 +0200\n"
+ "Subject: [PATCH 3/4] mm: thp: microoptimize compound_mapcount()\n"
+ "\n"
+ "compound_mapcount() is only called after PageCompound() has already\n"
+ "been checked by the caller, so there's no point to check it again. Gcc\n"
+ "may optimize it away too because it's inline but this will remove the\n"
+ "runtime check for sure and add it'll add an assert instead.\n"
+ "\n"
+ "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "---\n"
+ " include/linux/mm.h | 3 +--\n"
+ " 1 file changed, 1 insertion(+), 2 deletions(-)\n"
+ "\n"
+ "diff --git a/include/linux/mm.h b/include/linux/mm.h\n"
+ "index 4b532d8..119325d 100644\n"
+ "--- a/include/linux/mm.h\n"
+ "+++ b/include/linux/mm.h\n"
+ "@@ -471,8 +471,7 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page)\n"
+ " \n"
+ " static inline int compound_mapcount(struct page *page)\n"
+ " {\n"
+ "-\tif (!PageCompound(page))\n"
+ "-\t\treturn 0;\n"
+ "+\tVM_BUG_ON_PAGE(!PageCompound(page), page);\n"
+ " \tpage = compound_head(page);\n"
+ " \treturn atomic_read(compound_mapcount_ptr(page)) + 1;\n"
+ " }\n"
+ "\n"
+ "\n"
+ "\n"
+ ">From a2f1172344b87b1b0e18d07014ee5ab2027fac10 Mon Sep 17 00:00:00 2001\n"
+ "From: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "Date: Thu, 5 May 2016 00:59:27 +0200\n"
+ "Subject: [PATCH 4/4] mm: thp: split_huge_pmd_address() comment improvement\n"
+ "\n"
+ "Comment is partly wrong, this improves it by including the case of\n"
+ "split_huge_pmd_address() called by try_to_unmap_one if\n"
+ "TTU_SPLIT_HUGE_PMD is set.\n"
+ "\n"
+ "Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>\n"
+ "---\n"
+ " mm/huge_memory.c | 6 ++++--\n"
+ " 1 file changed, 4 insertions(+), 2 deletions(-)\n"
+ "\n"
+ "diff --git a/mm/huge_memory.c b/mm/huge_memory.c\n"
+ "index f8f07e4..e716726 100644\n"
+ "--- a/mm/huge_memory.c\n"
+ "+++ b/mm/huge_memory.c\n"
+ "@@ -3032,8 +3032,10 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,\n"
+ " \t\treturn;\n"
+ " \n"
+ " \t/*\n"
+ "-\t * Caller holds the mmap_sem write mode, so a huge pmd cannot\n"
+ "-\t * materialize from under us.\n"
+ "+\t * Caller holds the mmap_sem write mode or the anon_vma lock,\n"
+ "+\t * so a huge pmd cannot materialize from under us (khugepaged\n"
+ "+\t * holds both the mmap_sem write mode and the anon_vma lock\n"
+ "+\t * write mode).\n"
+ " \t */\n"
+ " \t__split_huge_pmd(vma, pmd, address, freeze);\n"
+ " }\n"
+ "\n"
+ "\n"
+ "I also noticed we aren't calling page_move_anon_rmap in\n"
+ "do_huge_pmd_wp_page when page_trans_huge_mapcount returns 1, that's a\n"
+ "longstanding inefficiency but it's not a bug. We're not locking the\n"
+ "page down in the THP COW because we don't have to deal with swapcache,\n"
+ "and in turn we can't overwrite the page->mapping. I think in practice\n"
+ "it would be safe anyway because it's an atomic write and no matter if\n"
+ "the rmap_walk reader sees the value before or after the write, it'll\n"
+ "still be able to find the pmd_trans_huge during the rmap walk. However\n"
+ "if page->mapping can change under the reader (i.e. rmap_walk) then the\n"
+ "reader should use READ_ONCE to access page->mapping (or page->mapping\n"
+ "should become volatile). Otherwise it'd be a bug with the C standard\n"
+ "where gcc could get confused in theory (in practice it would work fine\n"
+ "as we're mostly just dereferencing that page->mapping pointer and not\n"
+ "using it for switch/case or stuff like that where gcc could use an\n"
+ "hash). Regardless for robustness it'd be better if we take appropriate\n"
+ "locking and so we should take the page lock by doing a check if the\n"
+ "page->mapping is already pointing to the local vma->anon_vma first, if\n"
+ "not then we should take the page lock on the head THP and call\n"
+ "page_move_anon_rmap. Because this is a longstanding problem I didn't\n"
+ "address it yet, and it's only a missing optimization but it'd be nice\n"
+ "to get that covered too (considering we just worsened a bit the\n"
+ "optimization in presence of a COW after a pmd split and before the\n"
+ physical split).
 
-cecb863a19eac5e88d0548c51225595d5524ccc8268e98b45f32f92fbfc24ccc
+89cd4473ffa21ba371ca477c643d1df4cb136fdae42e061a6f48552f173a3b1e

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.