From: chrisl@kernel.org
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Kairui Song <kasong@tencent.com>, Hugh Dickins <hughd@google.com>,
Ryan Roberts <ryan.roberts@arm.com>,
"Huang, Ying" <ying.huang@intel.com>,
Kalesh Singh <kaleshsingh@google.com>,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
Chris Li <chrisl@kernel.org>, Barry Song <baohua@kernel.org>
Subject: [PATCH v5 8/9] mm: swap: reclaim the cached parts that got scanned
Date: Tue, 30 Jul 2024 23:49:20 -0700
Message-ID: <20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org>
In-Reply-To: <20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org>
From: Kairui Song <kasong@tencent.com>
This commit implements reclaim during scan for the cluster allocator.
Cluster scanning was unable to reuse SWAP_HAS_CACHE slots, which
could result in a low allocation success rate or early OOM.
So, to ensure the maximum allocation success rate, integrate reclaiming
with scanning. If a suitable range of swap slots is found but it is
fragmented due to HAS_CACHE, just try to reclaim the slots.
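
To make the change easier to follow, here is a minimal userspace sketch of
the scan-then-reclaim idea (illustrative only: scan_range() and try_reclaim()
are stand-ins for cluster_scan_range()/cluster_reclaim_range() and
__try_to_reclaim_swap(), SWAP_HAS_CACHE is reused as the "cache only" marker,
and the si->lock/ci->lock drop-and-retake around reclaim is omitted):

#include <stdbool.h>
#include <stdio.h>

#define SWAP_HAS_CACHE 0x40     /* slot is only held by the swap cache */

/* Stand-in for __try_to_reclaim_swap(); here it always succeeds. */
static bool try_reclaim(unsigned char *map, unsigned long off)
{
        map[off] = 0;
        return true;
}

/* Scan [start, start + nr); reclaim cache-only slots instead of giving up. */
static bool scan_range(unsigned char *map, unsigned long start, unsigned long nr)
{
        bool need_reclaim = false;
        unsigned long off;

        for (off = start; off < start + nr; off++) {
                if (map[off] == 0)
                        continue;
                if (map[off] == SWAP_HAS_CACHE) {
                        need_reclaim = true;
                        continue;
                }
                return false;           /* genuinely in use, range unusable */
        }

        if (!need_reclaim)
                return true;

        for (off = start; off < start + nr; off++)
                if (map[off] == SWAP_HAS_CACHE && !try_reclaim(map, off))
                        return false;

        /* Recheck: in the real code the locks were dropped during reclaim. */
        for (off = start; off < start + nr; off++)
                if (map[off])
                        return false;
        return true;
}

int main(void)
{
        unsigned char map[8] = { 0, SWAP_HAS_CACHE, 0, 0, 1, 0, 0, 0 };

        printf("range 0..3 usable: %d\n", scan_range(map, 0, 4));      /* 1 */
        printf("range 4..7 usable: %d\n", scan_range(map, 4, 4));      /* 0 */
        return 0;
}

The point mirrored from the patch is the final recheck: because the real
allocator drops si->lock and ci->lock while reclaiming, the range has to be
verified again before the slots are handed out.
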
Signed-off-by: Kairui Song <kasong@tencent.com>
---
include/linux/swap.h | 1 +
mm/swapfile.c | 140 +++++++++++++++++++++++++++++++++++++++------------
2 files changed, 110 insertions(+), 31 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 5a14b6c65949..9eb740563d63 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -302,6 +302,7 @@ struct swap_info_struct {
/* list of cluster that contains at least one free slot */
struct list_head frag_clusters[SWAP_NR_ORDERS];
/* list of cluster that are fragmented or contented */
+ unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
unsigned int lowest_bit; /* index of first free in swap_map */
unsigned int highest_bit; /* index of last free in swap_map */
unsigned int pages; /* total of usable pages of swap */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index eb3e387e86b2..50e7f600a9a1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
VM_BUG_ON(ci->count != 0);
lockdep_assert_held(&si->lock);
lockdep_assert_held(&ci->lock);
+
+ if (ci->flags & CLUSTER_FLAG_FRAG)
+ si->frag_cluster_nr[ci->order]--;
+
/*
* If the swap is discardable, prepare discard the cluster
* instead of free it immediately. The cluster will be freed
@@ -572,31 +576,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
- if (ci->flags & CLUSTER_FLAG_FRAG)
+ if (ci->flags & CLUSTER_FLAG_FRAG) {
+ p->frag_cluster_nr[ci->order]--;
list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]);
- else
+ } else {
list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
+ }
ci->flags = CLUSTER_FLAG_NONFULL;
}
}
-static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
- unsigned int nr_pages)
+static bool cluster_reclaim_range(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long start, unsigned long end)
{
- unsigned char *p = si->swap_map + start;
- unsigned char *end = p + nr_pages;
+ unsigned char *map = si->swap_map;
+ unsigned long offset;
+
+ spin_unlock(&ci->lock);
+ spin_unlock(&si->lock);
+
+ for (offset = start; offset < end; offset++) {
+ switch (READ_ONCE(map[offset])) {
+ case 0:
+ continue;
+ case SWAP_HAS_CACHE:
+ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
+ continue;
+ goto out;
+ default:
+ goto out;
+ }
+ }
+out:
+ spin_lock(&si->lock);
+ spin_lock(&ci->lock);
- while (p < end)
- if (*p++)
+ /*
+ * Recheck the range no matter whether reclaim succeeded or not, the
+ * slot could have been freed while we were not holding the lock.
+ */
+ for (offset = start; offset < end; offset++)
+ if (READ_ONCE(map[offset]))
return false;
return true;
}
+static bool cluster_scan_range(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long start, unsigned int nr_pages)
+{
+ unsigned long offset, end = start + nr_pages;
+ unsigned char *map = si->swap_map;
+ bool need_reclaim = false;
-static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
- unsigned int start, unsigned char usage,
- unsigned int order)
+ for (offset = start; offset < end; offset++) {
+ switch (READ_ONCE(map[offset])) {
+ case 0:
+ continue;
+ case SWAP_HAS_CACHE:
+ if (!vm_swap_full())
+ return false;
+ need_reclaim = true;
+ continue;
+ default:
+ return false;
+ }
+ }
+
+ if (need_reclaim)
+ return cluster_reclaim_range(si, ci, start, end);
+
+ return true;
+}
+
+static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+ unsigned int start, unsigned char usage,
+ unsigned int order)
{
unsigned int nr_pages = 1 << order;
@@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_
if (ci->count == SWAPFILE_CLUSTER) {
VM_BUG_ON(!(ci->flags &
(CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
+ if (ci->flags & CLUSTER_FLAG_FRAG)
+ si->frag_cluster_nr[ci->order]--;
list_del(&ci->list);
ci->flags = 0;
}
@@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
}
while (offset <= end) {
- if (cluster_scan_range(si, offset, nr_pages)) {
+ if (cluster_scan_range(si, ci, offset, nr_pages)) {
cluster_alloc_range(si, ci, offset, usage, order);
*foundp = offset;
if (ci->count == SWAPFILE_CLUSTER) {
@@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
unsigned char usage)
{
struct percpu_cluster *cluster;
- struct swap_cluster_info *ci, *n;
+ struct swap_cluster_info *ci;
unsigned int offset, found = 0;
- LIST_HEAD(fraged);
new_cluster:
lockdep_assert_held(&si->lock);
@@ -690,25 +748,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
}
if (order < PMD_ORDER) {
- list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
- list_move_tail(&ci->list, &fraged);
+ unsigned int frags = 0;
+
+ while (!list_empty(&si->nonfull_clusters[order])) {
+ ci = list_first_entry(&si->nonfull_clusters[order],
+ struct swap_cluster_info, list);
+ list_move_tail(&ci->list, &si->frag_clusters[order]);
ci->flags = CLUSTER_FLAG_FRAG;
+ si->frag_cluster_nr[order]++;
offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
&found, order, usage);
+ frags++;
if (found)
break;
}
if (!found) {
- list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) {
+ /*
+ * Nonfull clusters are moved to frag tail if we reached
+ * here, count them too, don't over scan the frag list.
+ */
+ while (frags < si->frag_cluster_nr[order]) {
+ ci = list_first_entry(&si->frag_clusters[order],
+ struct swap_cluster_info, list);
+ /*
+ * Rotate the frag list to iterate, they all failed the high
+ * order allocation or were moved here due to per-CPU usage,
+ * this helps keep usable clusters ahead.
+ */
+ list_move_tail(&ci->list, &si->frag_clusters[order]);
offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
&found, order, usage);
+ frags++;
if (found)
break;
}
}
-
- list_splice_tail(&fraged, &si->frag_clusters[order]);
}
if (found)
@@ -729,25 +804,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
/* Order 0 stealing from higher order */
for (int o = 1; o < PMD_ORDER; o++) {
- if (!list_empty(&si->frag_clusters[o])) {
+ /*
+ * Clusters here have at least one usable slot and can't fail order 0
+ * allocation, but reclaim may drop si->lock and race with another user.
+ */
+ while (!list_empty(&si->frag_clusters[o])) {
ci = list_first_entry(&si->frag_clusters[o],
struct swap_cluster_info, list);
- offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found,
- 0, usage);
- VM_BUG_ON(!found);
- goto done;
+ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+ &found, 0, usage);
+ if (found)
+ goto done;
}
- if (!list_empty(&si->nonfull_clusters[o])) {
- ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
- list);
+ while (!list_empty(&si->nonfull_clusters[o])) {
+ ci = list_first_entry(&si->nonfull_clusters[o],
+ struct swap_cluster_info, list);
offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
&found, 0, usage);
- VM_BUG_ON(!found);
- goto done;
+ if (found)
+ goto done;
}
}
-
done:
cluster->next[order] = offset;
return found;
@@ -3053,6 +3131,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
for (i = 0; i < SWAP_NR_ORDERS; i++) {
INIT_LIST_HEAD(&p->nonfull_clusters[i]);
INIT_LIST_HEAD(&p->frag_clusters[i]);
+ p->frag_cluster_nr[i] = 0;
}
for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3096,7 +3175,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
if (!cluster_info)
return nr_extents;
-
/*
* Reduce false cache line sharing between cluster_info and
* sharing same address space.
--
2.46.0.rc1.232.g9752f9e123-goog
Thread overview: 32+ messages
2024-07-31 6:49 [PATCH v5 0/9] mm: swap: mTHP swap allocator base on swap cluster order Chris Li
2024-07-31 6:49 ` [PATCH v5 1/9] mm: swap: swap cluster switch to double link list Chris Li
2024-07-31 6:49 ` [PATCH v5 2/9] mm: swap: mTHP allocate swap entries from nonfull list Chris Li
[not found] ` <87bk23250r.fsf@yhuang6-desk2.ccr.corp.intel.com>
2024-08-16 8:01 ` Chris Li
2024-08-19 8:08 ` Huang, Ying
2024-08-26 21:26 ` Chris Li
2024-09-09 7:19 ` Huang, Ying
2024-07-31 6:49 ` [PATCH v5 3/9] mm: swap: separate SSD allocation from scan_swap_map_slots() Chris Li
2024-07-31 6:49 ` [PATCH v5 4/9] mm: swap: clean up initialization helper chrisl
2024-07-31 6:49 ` [PATCH v5 5/9] mm: swap: skip slot cache on freeing for mTHP chrisl
2024-08-03 9:11 ` Barry Song
2024-08-03 10:57 ` Barry Song
2024-07-31 6:49 ` [PATCH v5 6/9] mm: swap: allow cache reclaim to skip slot cache chrisl
2024-08-03 10:38 ` Barry Song
2024-08-03 12:18 ` Kairui Song
2024-08-04 18:06 ` Chris Li
2024-08-05 1:53 ` Barry Song
2024-07-31 6:49 ` [PATCH v5 7/9] mm: swap: add a fragment cluster list chrisl
2024-07-31 6:49 ` chrisl [this message]
2024-07-31 6:49 ` [PATCH v5 9/9] mm: swap: add a adaptive full cluster cache reclaim chrisl
2024-08-01 9:14 ` [PATCH v5 0/9] mm: swap: mTHP swap allocator base on swap cluster order David Hildenbrand
2024-08-01 9:59 ` Kairui Song
2024-08-01 10:06 ` Kairui Song
[not found] ` <87le17z9zr.fsf@yhuang6-desk2.ccr.corp.intel.com>
2024-08-16 7:36 ` Chris Li
2024-08-17 17:47 ` Kairui Song
[not found] ` <87h6bw3gxl.fsf@yhuang6-desk2.ccr.corp.intel.com>
[not found] ` <CACePvbXH8b9SOePQ-Ld_UBbcAdJ3gdYtEkReMto5Hbq9WAL7JQ@mail.gmail.com>
[not found] ` <87sevfza3w.fsf@yhuang6-desk2.ccr.corp.intel.com>
2024-08-16 7:47 ` Chris Li
2024-08-18 16:59 ` Kairui Song
2024-08-19 8:27 ` Huang, Ying
2024-08-19 8:47 ` Kairui Song
2024-08-19 21:27 ` Chris Li
2024-08-19 8:39 ` Huang, Ying
2024-09-02 1:20 ` Andrew Morton