From: David Hildenbrand <david@redhat.com>
To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org, David Hildenbrand <david@redhat.com>,
	Andrew Morton <akpm@linux-foundation.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Ryan Roberts <ryan.roberts@arm.com>,
	Matthew Wilcox <willy@infradead.org>,
	Hugh Dickins <hughd@google.com>,
	Yin Fengwei <fengwei.yin@intel.com>,
	Yang Shi <shy828301@gmail.com>, Ying Huang <ying.huang@intel.com>,
	Zi Yan <ziy@nvidia.com>, Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@redhat.com>, Will Deacon <will@kernel.org>,
	Waiman Long <longman@redhat.com>,
	"Paul E. McKenney" <paulmck@kernel.org>
Subject: [PATCH WIP v1 20/20] mm/rmap: perform all mapcount operations of large folios under the rmap seqcount
Date: Fri, 24 Nov 2023 14:26:25 +0100	[thread overview]
Message-ID: <20231124132626.235350-21-david@redhat.com> (raw)
In-Reply-To: <20231124132626.235350-1-david@redhat.com>

Let's extend the atomic seqcount to also protect modifications of:
* The subpage mapcounts
* The entire mapcount
* folio->_nr_pages_mapped

This way, we can avoid another one or two atomic RMW operations on the
fast path (and significantly more when batching): when we are the
exclusive writer, the only atomic RMW operations required are the two
that manage the atomic seqcount.
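
In essence, each rmap operation now follows this pattern (a condensed
sketch of what, e.g., __page_dup_rmap() looks like with this patch
applied; the subpage mapcount and rmap-val updates are omitted here):

  exclusive = __folio_write_large_rmap_begin(folio);
  if (likely(exclusive)) {
          /* Exclusive writer: plain read + write, no atomic RMW needed. */
          atomic_set(&folio->_total_mapcount,
                     atomic_read(&folio->_total_mapcount) + 1);
  } else {
          /* Concurrent writers: keep using atomic RMW operations. */
          atomic_inc(&folio->_total_mapcount);
  }
  __folio_write_large_rmap_end(folio, exclusive);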

Let's document how the existing atomic seqcount memory barriers keep the
old behavior unmodified: in particular, how they make sure that folio
refcount updates cannot be reordered with folio mapcount updates.
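
For example, because raw_write_atomic_seqcount_begin() implies a full
memory barrier, we get the following guarantee (simplified; it assumes
the observing CPU orders its own two reads accordingly):

  CPU A                                   CPU B
  -----                                   -----
  folio_ref_add(folio, 1);
  __folio_write_large_rmap_begin()
    /* implies full memory barrier */
  atomic_set(&page->_mapcount, ...);
                                          /* observes the new _mapcount */
                                          /* => also observes the refcount
                                             update from folio_ref_add() */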

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 include/linux/rmap.h | 95 ++++++++++++++++++++++++++------------------
 mm/rmap.c            | 84 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 137 insertions(+), 42 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 538c23d3c0c9..3cff4aa71393 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -301,6 +301,12 @@ static inline bool __folio_write_large_rmap_begin(struct folio *folio)
 	exclusive = raw_write_atomic_seqcount_begin(&folio->_rmap_atomic_seqcount,
 						    true);
 	if (likely(exclusive)) {
+		/*
+		 * Note: raw_write_atomic_seqcount_begin() implies a full
+		 * memory barrier like non-exclusive mapcount operations
+		 * will. Any refcount updates that happened before this call
+		 * are visible before any mapcount updates on other CPUs.
+		 */
 		prefetchw(&folio->_rmap_val0);
 		if (unlikely(folio_order(folio) > RMAP_SUBID_4_MAX_ORDER))
 			prefetchw(&folio->_rmap_val4);
@@ -311,6 +317,12 @@ static inline bool __folio_write_large_rmap_begin(struct folio *folio)
 static inline void __folio_write_large_rmap_end(struct folio *folio,
 		bool exclusive)
 {
+	/*
+	 * Note: raw_write_atomic_seqcount_end() implies a full memory
+	 * barrier like non-exclusive mapcount operations will. Any
+	 * refcount updates happening after this call are visible after any
+	 * mapcount updates on other CPUs.
+	 */
 	raw_write_atomic_seqcount_end(&folio->_rmap_atomic_seqcount,
 				      exclusive);
 }
@@ -367,52 +379,46 @@ static inline void folio_set_large_mapcount(struct folio *folio,
 static inline void folio_inc_large_mapcount(struct folio *folio,
 		struct vm_area_struct *vma)
 {
-	bool exclusive;
+	atomic_inc(&folio->_total_mapcount);
+	__folio_add_large_rmap_val(folio, 1, vma->vm_mm);
+}
 
-	exclusive = __folio_write_large_rmap_begin(folio);
-	if (likely(exclusive)) {
-		atomic_set(&folio->_total_mapcount,
-			   atomic_read(&folio->_total_mapcount) + 1);
-		__folio_add_large_rmap_val_exclusive(folio, 1, vma->vm_mm);
-	} else {
-		atomic_inc(&folio->_total_mapcount);
-		__folio_add_large_rmap_val(folio, 1, vma->vm_mm);
-	}
-	__folio_write_large_rmap_end(folio, exclusive);
+static inline void folio_inc_large_mapcount_exclusive(struct folio *folio,
+		struct vm_area_struct *vma)
+{
+	atomic_set(&folio->_total_mapcount,
+		   atomic_read(&folio->_total_mapcount) + 1);
+	__folio_add_large_rmap_val_exclusive(folio, 1, vma->vm_mm);
 }
 
 static inline void folio_add_large_mapcount(struct folio *folio,
 		int count, struct vm_area_struct *vma)
 {
-	bool exclusive;
+	atomic_add(count, &folio->_total_mapcount);
+	__folio_add_large_rmap_val(folio, count, vma->vm_mm);
+}
 
-	exclusive = __folio_write_large_rmap_begin(folio);
-	if (likely(exclusive)) {
-		atomic_set(&folio->_total_mapcount,
-			   atomic_read(&folio->_total_mapcount) + count);
-		__folio_add_large_rmap_val_exclusive(folio, count, vma->vm_mm);
-	} else {
-		atomic_add(count, &folio->_total_mapcount);
-		__folio_add_large_rmap_val(folio, count, vma->vm_mm);
-	}
-	__folio_write_large_rmap_end(folio, exclusive);
+static inline void folio_add_large_mapcount_exclusive(struct folio *folio,
+		int count, struct vm_area_struct *vma)
+{
+	atomic_set(&folio->_total_mapcount,
+		   atomic_read(&folio->_total_mapcount) + count);
+	__folio_add_large_rmap_val_exclusive(folio, count, vma->vm_mm);
 }
 
 static inline void folio_dec_large_mapcount(struct folio *folio,
 		struct vm_area_struct *vma)
 {
-	bool exclusive;
+	atomic_dec(&folio->_total_mapcount);
+	__folio_add_large_rmap_val(folio, -1, vma->vm_mm);
+}
 
-	exclusive = __folio_write_large_rmap_begin(folio);
-	if (likely(exclusive)) {
-		atomic_set(&folio->_total_mapcount,
-			   atomic_read(&folio->_total_mapcount) - 1);
-		__folio_add_large_rmap_val_exclusive(folio, -1, vma->vm_mm);
-	} else {
-		atomic_dec(&folio->_total_mapcount);
-		__folio_add_large_rmap_val(folio, -1, vma->vm_mm);
-	}
-	__folio_write_large_rmap_end(folio, exclusive);
+static inline void folio_dec_large_mapcount_exclusive(struct folio *folio,
+		struct vm_area_struct *vma)
+{
+	atomic_set(&folio->_total_mapcount,
+		   atomic_read(&folio->_total_mapcount) - 1);
+	__folio_add_large_rmap_val_exclusive(folio, -1, vma->vm_mm);
 }
 
 /* RMAP flags, currently only relevant for some anon rmap operations. */
@@ -462,6 +468,7 @@ static inline void __page_dup_rmap(struct page *page,
 		struct vm_area_struct *dst_vma, bool compound)
 {
 	struct folio *folio = page_folio(page);
+	bool exclusive;
 
 	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
 	if (likely(!folio_test_large(folio))) {
@@ -475,11 +482,23 @@ static inline void __page_dup_rmap(struct page *page,
 		return;
 	}
 
-	if (compound)
-		atomic_inc(&folio->_entire_mapcount);
-	else
-		atomic_inc(&page->_mapcount);
-	folio_inc_large_mapcount(folio, dst_vma);
+	exclusive = __folio_write_large_rmap_begin(folio);
+	if (likely(exclusive)) {
+		if (compound)
+			atomic_set(&folio->_entire_mapcount,
+				   atomic_read(&folio->_entire_mapcount) + 1);
+		else
+			atomic_set(&page->_mapcount,
+				   atomic_read(&page->_mapcount) + 1);
+		folio_inc_large_mapcount_exclusive(folio, dst_vma);
+	} else {
+		if (compound)
+			atomic_inc(&folio->_entire_mapcount);
+		else
+			atomic_inc(&page->_mapcount);
+		folio_inc_large_mapcount(folio, dst_vma);
+	}
+	__folio_write_large_rmap_end(folio, exclusive);
 }
 
 static inline void page_dup_file_rmap(struct page *page,
diff --git a/mm/rmap.c b/mm/rmap.c
index 80ac53633332..755a62b046e2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1109,7 +1109,8 @@ static unsigned int __folio_add_rmap_range(struct folio *folio,
 		struct vm_area_struct *vma, bool compound, int *nr_pmdmapped)
 {
 	atomic_t *mapped = &folio->_nr_pages_mapped;
-	int first, count, nr = 0;
+	int first, val, count, nr = 0;
+	bool exclusive;
 
 	VM_WARN_ON_FOLIO(compound && page != &folio->page, folio);
 	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
@@ -1119,8 +1120,23 @@ static unsigned int __folio_add_rmap_range(struct folio *folio,
 	if (likely(!folio_test_large(folio)))
 		return atomic_inc_and_test(&page->_mapcount);
 
+	exclusive = __folio_write_large_rmap_begin(folio);
+
 	/* Is page being mapped by PTE? Is this its first map to be added? */
-	if (!compound) {
+	if (likely(exclusive) && !compound) {
+		count = nr_pages;
+		do {
+			val = atomic_read(&page->_mapcount) + 1;
+			atomic_set(&page->_mapcount, val);
+			if (!val) {
+				val = atomic_read(mapped) + 1;
+				atomic_set(mapped, val);
+				if (val < COMPOUND_MAPPED)
+					nr++;
+			}
+		} while (page++, --count > 0);
+		folio_add_large_mapcount_exclusive(folio, nr_pages, vma);
+	} else if (!compound) {
 		count = nr_pages;
 		do {
 			first = atomic_inc_and_test(&page->_mapcount);
@@ -1131,6 +1147,26 @@ static unsigned int __folio_add_rmap_range(struct folio *folio,
 			}
 		} while (page++, --count > 0);
 		folio_add_large_mapcount(folio, nr_pages, vma);
+	} else if (likely(exclusive) && folio_test_pmd_mappable(folio)) {
+		/* That test is redundant: it's for safety or to optimize out */
+
+		val = atomic_read(&folio->_entire_mapcount) + 1;
+		atomic_set(&folio->_entire_mapcount, val);
+		if (!val) {
+			nr = atomic_read(mapped) + COMPOUND_MAPPED;
+			atomic_set(mapped, nr);
+			if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) {
+				*nr_pmdmapped = folio_nr_pages(folio);
+				nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+				/* Raced ahead of a remove and another add? */
+				if (unlikely(nr < 0))
+					nr = 0;
+			} else {
+				/* Raced ahead of a remove of COMPOUND_MAPPED */
+				nr = 0;
+			}
+		}
+		folio_inc_large_mapcount_exclusive(folio, vma);
 	} else if (folio_test_pmd_mappable(folio)) {
 		/* That test is redundant: it's for safety or to optimize out */
 
@@ -1152,6 +1188,8 @@ static unsigned int __folio_add_rmap_range(struct folio *folio,
 	} else {
 		VM_WARN_ON_ONCE_FOLIO(true, folio);
 	}
+
+	__folio_write_large_rmap_end(folio, exclusive);
 	return nr;
 }
 
@@ -1160,7 +1198,8 @@ static unsigned int __folio_remove_rmap_range(struct folio *folio,
 		struct vm_area_struct *vma, bool compound, int *nr_pmdmapped)
 {
 	atomic_t *mapped = &folio->_nr_pages_mapped;
-	int last, count, nr = 0;
+	int last, val, count, nr = 0;
+	bool exclusive;
 
 	VM_WARN_ON_FOLIO(compound && page != &folio->page, folio);
 	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
@@ -1170,8 +1209,23 @@ static unsigned int __folio_remove_rmap_range(struct folio *folio,
 	if (likely(!folio_test_large(folio)))
 		return atomic_add_negative(-1, &page->_mapcount);
 
+	exclusive = __folio_write_large_rmap_begin(folio);
+
 	/* Is page being unmapped by PTE? Is this its last map to be removed? */
-	if (!compound) {
+	if (likely(exclusive) && !compound) {
+		folio_add_large_mapcount_exclusive(folio, -nr_pages, vma);
+		count = nr_pages;
+		do {
+			val = atomic_read(&page->_mapcount) - 1;
+			atomic_set(&page->_mapcount, val);
+			if (val < 0) {
+				val = atomic_read(mapped) - 1;
+				atomic_set(mapped, val);
+				if (val < COMPOUND_MAPPED)
+					nr++;
+			}
+		} while (page++, --count > 0);
+	} else if (!compound) {
 		folio_add_large_mapcount(folio, -nr_pages, vma);
 		count = nr_pages;
 		do {
@@ -1182,6 +1236,26 @@ static unsigned int __folio_remove_rmap_range(struct folio *folio,
 					nr++;
 			}
 		} while (page++, --count > 0);
+	} else if (likely(exclusive) && folio_test_pmd_mappable(folio)) {
+		/* That test is redundant: it's for safety or to optimize out */
+
+		folio_dec_large_mapcount_exclusive(folio, vma);
+		val = atomic_read(&folio->_entire_mapcount) - 1;
+		atomic_set(&folio->_entire_mapcount, val);
+		if (val < 0) {
+			nr = atomic_read(mapped) - COMPOUND_MAPPED;
+			atomic_set(mapped, nr);
+			if (likely(nr < COMPOUND_MAPPED)) {
+				*nr_pmdmapped = folio_nr_pages(folio);
+				nr = *nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED);
+				/* Raced ahead of another remove and an add? */
+				if (unlikely(nr < 0))
+					nr = 0;
+			} else {
+				/* An add of COMPOUND_MAPPED raced ahead */
+				nr = 0;
+			}
+		}
 	} else if (folio_test_pmd_mappable(folio)) {
 		/* That test is redundant: it's for safety or to optimize out */
 
@@ -1203,6 +1277,8 @@ static unsigned int __folio_remove_rmap_range(struct folio *folio,
 	} else {
 		VM_WARN_ON_ONCE_FOLIO(true, folio);
 	}
+
+	__folio_write_large_rmap_end(folio, exclusive);
 	return nr;
 }
 
-- 
2.41.0



Thread overview: 26+ messages
2023-11-24 13:26 [PATCH WIP v1 00/20] mm: precise "mapped shared" vs. "mapped exclusively" detection for PTE-mapped THP / partially-mappable folios David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 01/20] mm/rmap: factor out adding folio range into __folio_add_rmap_range() David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 02/20] mm: add a total mapcount for large folios David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 03/20] mm: convert folio_estimated_sharers() to folio_mapped_shared() and improve it David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 04/20] mm/rmap: pass dst_vma to page_try_dup_anon_rmap() and page_dup_file_rmap() David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 05/20] mm/rmap: abstract total mapcount operations for partially-mappable folios David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 06/20] atomic_seqcount: new (raw) seqcount variant to support concurrent writers David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 07/20] mm/rmap_id: track if one ore multiple MMs map a partially-mappable folio David Hildenbrand
2023-12-17 19:13   ` Nadav Amit
2023-12-18 14:04     ` David Hildenbrand
2023-12-18 14:34       ` Nadav Amit
2023-11-24 13:26 ` [PATCH WIP v1 08/20] mm: pass MM to folio_mapped_shared() David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 09/20] mm: improve folio_mapped_shared() for partially-mappable folios using rmap IDs David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 10/20] mm/memory: COW reuse support for PTE-mapped THP with " David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 11/20] mm/rmap_id: support for 1, 2 and 3 values by manual calculation David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 12/20] mm/rmap: introduce folio_add_anon_rmap_range() David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 13/20] mm/huge_memory: batch rmap operations in __split_huge_pmd_locked() David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 14/20] mm/huge_memory: avoid folio_refcount() < folio_mapcount() " David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 15/20] mm/rmap_id: verify precalculated subids with CONFIG_DEBUG_VM David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 16/20] atomic_seqcount: support a single exclusive writer in the absence of other writers David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 17/20] mm/rmap_id: reduce atomic RMW operations when we are the exclusive writer David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 18/20] atomic_seqcount: use atomic add-return instead of atomic cmpxchg on 64bit David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 19/20] mm/rmap: factor out removing folio range into __folio_remove_rmap_range() David Hildenbrand
2023-11-24 13:26 ` [PATCH WIP v1 20/20] mm/rmap: perform all mapcount operations of large folios under the rmap seqcount David Hildenbrand [this message]
2023-11-24 20:55 ` [PATCH WIP v1 00/20] mm: precise "mapped shared" vs. "mapped exclusively" detection for PTE-mapped THP / partially-mappable folios Linus Torvalds
2023-11-25 17:02   ` David Hildenbrand
