linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Nicolas Saenz Julienne <nsaenzju@redhat.com>
To: akpm@linux-foundation.org, frederic@kernel.org
Cc: linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	tglx@linutronix.de, cl@linux.com, peterz@infradead.org,
	juri.lelli@redhat.com, mingo@redhat.com, mtosatti@redhat.com,
	nilal@redhat.com, mgorman@suse.de, ppandit@redhat.com,
	williams@redhat.com, bigeasy@linutronix.de,
	anna-maria@linutronix.de, linux-rt-users@vger.kernel.org,
	Nicolas Saenz Julienne <nsaenzju@redhat.com>
Subject: [PATCH 2/6] mm/swap: Introduce alternative per-cpu LRU cache locking
Date: Tue, 21 Sep 2021 18:13:20 +0200	[thread overview]
Message-ID: <20210921161323.607817-3-nsaenzju@redhat.com> (raw)
In-Reply-To: <20210921161323.607817-1-nsaenzju@redhat.com>

'struct lru_pvecs' and 'struct lru_rotate' per-CPU pagevecs are
protected using local locks. While good for performance, this doesn't
allow for remote access to these structures. CPUs requiring system-wide
LRU cache drains get around this by scheduling drain work on all CPUs.
That said, some select setups, like systems with NOHZ_FULL CPUs, aren't
well suited to this, as they can't handle interruptions of any sort.

To mitigate this, introduce an alternative locking scheme using
spinlocks that will permit remotely accessing these per-CPU pagevecs.
It's disabled by default, with no functional change to regular users,
and enabled through the 'remote_pcpu_cache_access' static key. Upcoming
patches will make use of this static key.

The naming of the static key is left vague on purpose as it is planned
to also enable a similar locking setup to access mm/page_alloc.c's
per-cpu page lists remotely.

This is based on previous work by Thomas Gleixner, Anna-Maria Gleixner,
and Sebastian Andrzej Siewior[1].

[1] https://patchwork.kernel.org/project/linux-mm/patch/20190424111208.24459-3-bigeasy@linutronix.de/
Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
---
 mm/internal.h |   2 +
 mm/swap.c     | 152 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 128 insertions(+), 26 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 18256e32a14c..5a2cef7cd394 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,6 +32,8 @@
 /* Do not use these with a slab allocator */
 #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
 
+extern struct static_key_false remote_pcpu_cache_access;
+
 void page_writeback_init(void);
 
 static inline void *folio_raw_mapping(struct folio *folio)
diff --git a/mm/swap.c b/mm/swap.c
index e7f9e4018ccf..bcf73bd563a6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -46,13 +46,27 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
+/*
+ * On some setups, like with nohz_full, CPUs might be too busy to handle
+ * per-cpu drain work, leading to unwarranted interruptions and hangs. This
+ * key, when enabled, allows for remote draining of these per-cpu caches/page
+ * lists at the cost of more constraining locking.
+ */
+__ro_after_init DEFINE_STATIC_KEY_FALSE(remote_pcpu_cache_access);
+
+struct lru_cache_locks {
+	local_lock_t local;
+	spinlock_t spin;
+};
+
 /* Protecting only lru_rotate.pvec which requires disabling interrupts */
 struct lru_rotate {
-	local_lock_t lock;
+	struct lru_cache_locks locks;
 	struct pagevec pvec;
 };
 static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
-	.lock = INIT_LOCAL_LOCK(lock),
+	.locks.local = INIT_LOCAL_LOCK(lru_rotate.locks.local),
+	.locks.spin = __SPIN_LOCK_UNLOCKED(lru_rotate.locks.spin),
 };
 
 /*
@@ -60,7 +74,7 @@ static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
  * by disabling preemption (and interrupts remain enabled).
  */
 struct lru_pvecs {
-	local_lock_t lock;
+	struct lru_cache_locks locks;
 	struct pagevec lru_add;
 	struct pagevec lru_deactivate_file;
 	struct pagevec lru_deactivate;
@@ -70,9 +84,94 @@ struct lru_pvecs {
 #endif
 };
 static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
-	.lock = INIT_LOCAL_LOCK(lock),
+	.locks.local = INIT_LOCAL_LOCK(lru_pvecs.locks.local),
+	.locks.spin = __SPIN_LOCK_UNLOCKED(lru_pvecs.locks.spin),
 };
 
+static inline void lru_cache_lock(struct lru_cache_locks *locks)
+{
+	if (static_branch_unlikely(&remote_pcpu_cache_access)) {
+		/* Avoid migration between this_cpu_ptr() and spin_lock() */
+		migrate_disable();
+		spin_lock(this_cpu_ptr(&locks->spin));
+	} else {
+		local_lock(&locks->local);
+	}
+}
+
+static inline void lru_cache_lock_irqsave(struct lru_cache_locks *locks,
+					  unsigned long *flagsp)
+{
+	if (static_branch_unlikely(&remote_pcpu_cache_access)) {
+		/* Avoid migration between this_cpu_ptr() and spin_lock_irqsave() */
+		migrate_disable();
+		spin_lock_irqsave(this_cpu_ptr(&locks->spin), *flagsp);
+	} else {
+		local_lock_irqsave(&locks->local, *flagsp);
+	}
+}
+
+/*
+ * The lru_cache_lock_cpu()/lru_cache_lock_irqsave_cpu() flavor of functions
+ * should only be used from remote CPUs when 'remote_pcpu_cache_access' is
+ * enabled or the target CPU is dead. Otherwise, it can still be called on the
+ * local CPU with migration disabled.
+ */
+static inline void lru_cache_lock_cpu(struct lru_cache_locks *locks, int cpu)
+{
+	if (static_branch_unlikely(&remote_pcpu_cache_access))
+		spin_lock(per_cpu_ptr(&locks->spin, cpu));
+	else
+		local_lock(&locks->local);
+}
+
+static inline void lru_cache_lock_irqsave_cpu(struct lru_cache_locks *locks,
+					      unsigned long *flagsp, int cpu)
+{
+	if (static_branch_unlikely(&remote_pcpu_cache_access))
+		spin_lock_irqsave(per_cpu_ptr(&locks->spin, cpu), *flagsp);
+	else
+		local_lock_irqsave(&locks->local, *flagsp);
+}
+
+static inline void lru_cache_unlock(struct lru_cache_locks *locks)
+{
+	if (static_branch_unlikely(&remote_pcpu_cache_access)) {
+		spin_unlock(this_cpu_ptr(&locks->spin));
+		migrate_enable();
+	} else {
+		local_unlock(&locks->local);
+	}
+}
+
+static inline void lru_cache_unlock_irqrestore(struct lru_cache_locks *locks,
+					       unsigned long flags)
+{
+	if (static_branch_unlikely(&remote_pcpu_cache_access)) {
+		spin_unlock_irqrestore(this_cpu_ptr(&locks->spin), flags);
+		migrate_enable();
+	} else {
+		local_unlock_irqrestore(&locks->local, flags);
+	}
+}
+
+static inline void lru_cache_unlock_cpu(struct lru_cache_locks *locks, int cpu)
+{
+	if (static_branch_unlikely(&remote_pcpu_cache_access))
+		spin_unlock(per_cpu_ptr(&locks->spin, cpu));
+	else
+		local_unlock(&locks->local);
+}
+
+static inline void lru_cache_unlock_irqrestore_cpu(struct lru_cache_locks *locks,
+						   unsigned long flags, int cpu)
+{
+	if (static_branch_unlikely(&remote_pcpu_cache_access))
+		spin_unlock_irqrestore(per_cpu_ptr(&locks->spin, cpu), flags);
+	else
+		local_unlock_irqrestore(&locks->local, flags);
+}
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
@@ -245,11 +344,11 @@ void folio_rotate_reclaimable(struct folio *folio)
 		unsigned long flags;
 
 		folio_get(folio);
-		local_lock_irqsave(&lru_rotate.lock, flags);
+		lru_cache_lock_irqsave(&lru_rotate.locks, &flags);
 		pvec = this_cpu_ptr(&lru_rotate.pvec);
 		if (pagevec_add_and_need_flush(pvec, &folio->page))
 			pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
-		local_unlock_irqrestore(&lru_rotate.lock, flags);
+		lru_cache_unlock_irqrestore(&lru_rotate.locks, flags);
 	}
 }
 
@@ -341,11 +440,11 @@ static void folio_activate(struct folio *folio)
 		struct pagevec *pvec;
 
 		folio_get(folio);
-		local_lock(&lru_pvecs.lock);
+		lru_cache_lock(&lru_pvecs.locks);
 		pvec = this_cpu_ptr(&lru_pvecs.activate_page);
 		if (pagevec_add_and_need_flush(pvec, &folio->page))
 			pagevec_lru_move_fn(pvec, __activate_page);
-		local_unlock(&lru_pvecs.lock);
+		lru_cache_unlock(&lru_pvecs.locks);
 	}
 }
 
@@ -372,7 +471,7 @@ static void __lru_cache_activate_folio(struct folio *folio)
 	struct pagevec *pvec;
 	int i;
 
-	local_lock(&lru_pvecs.lock);
+	lru_cache_lock(&lru_pvecs.locks);
 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
 
 	/*
@@ -394,7 +493,7 @@ static void __lru_cache_activate_folio(struct folio *folio)
 		}
 	}
 
-	local_unlock(&lru_pvecs.lock);
+	lru_cache_unlock(&lru_pvecs.locks);
 }
 
 /*
@@ -453,11 +552,11 @@ void folio_add_lru(struct folio *folio)
 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
 	folio_get(folio);
-	local_lock(&lru_pvecs.lock);
+	lru_cache_lock(&lru_pvecs.locks);
 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
 	if (pagevec_add_and_need_flush(pvec, &folio->page))
 		__pagevec_lru_add(pvec);
-	local_unlock(&lru_pvecs.lock);
+	lru_cache_unlock(&lru_pvecs.locks);
 }
 EXPORT_SYMBOL(folio_add_lru);
 
@@ -592,8 +691,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
 
 /*
  * Drain pages out of the cpu's pagevecs.
- * Either "cpu" is the current CPU, and preemption has already been
- * disabled; or "cpu" is being hot-unplugged, and is already dead.
+ * Either "cpu" is the current CPU, and preemption has already been disabled,
+ * or we're remotely flushing pvecs with the 'remote_pcpu_cache_access' key
+ * enabled, or "cpu" is being hot-unplugged and is already dead.
  */
 void lru_add_drain_cpu(int cpu)
 {
@@ -608,9 +708,9 @@ void lru_add_drain_cpu(int cpu)
 		unsigned long flags;
 
 		/* No harm done if a racing interrupt already did this */
-		local_lock_irqsave(&lru_rotate.lock, flags);
+		lru_cache_lock_irqsave_cpu(&lru_rotate.locks, &flags, cpu);
 		pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
-		local_unlock_irqrestore(&lru_rotate.lock, flags);
+		lru_cache_unlock_irqrestore_cpu(&lru_rotate.locks, flags, cpu);
 	}
 
 	pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
@@ -649,12 +749,12 @@ void deactivate_file_page(struct page *page)
 	if (likely(get_page_unless_zero(page))) {
 		struct pagevec *pvec;
 
-		local_lock(&lru_pvecs.lock);
+		lru_cache_lock(&lru_pvecs.locks);
 		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
 
 		if (pagevec_add_and_need_flush(pvec, page))
 			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
-		local_unlock(&lru_pvecs.lock);
+		lru_cache_unlock(&lru_pvecs.locks);
 	}
 }
 
@@ -671,12 +771,12 @@ void deactivate_page(struct page *page)
 	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
 		struct pagevec *pvec;
 
-		local_lock(&lru_pvecs.lock);
+		lru_cache_lock(&lru_pvecs.locks);
 		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
 		get_page(page);
 		if (pagevec_add_and_need_flush(pvec, page))
 			pagevec_lru_move_fn(pvec, lru_deactivate_fn);
-		local_unlock(&lru_pvecs.lock);
+		lru_cache_unlock(&lru_pvecs.locks);
 	}
 }
 
@@ -693,28 +793,28 @@ void mark_page_lazyfree(struct page *page)
 	    !PageSwapCache(page) && !PageUnevictable(page)) {
 		struct pagevec *pvec;
 
-		local_lock(&lru_pvecs.lock);
+		lru_cache_lock(&lru_pvecs.locks);
 		pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
 		get_page(page);
 		if (pagevec_add_and_need_flush(pvec, page))
 			pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
-		local_unlock(&lru_pvecs.lock);
+		lru_cache_unlock(&lru_pvecs.locks);
 	}
 }
 
 void lru_add_drain(void)
 {
-	local_lock(&lru_pvecs.lock);
+	lru_cache_lock(&lru_pvecs.locks);
 	lru_add_drain_cpu(smp_processor_id());
-	local_unlock(&lru_pvecs.lock);
+	lru_cache_unlock(&lru_pvecs.locks);
 }
 
 void lru_add_drain_cpu_zone(struct zone *zone)
 {
-	local_lock(&lru_pvecs.lock);
+	lru_cache_lock(&lru_pvecs.locks);
 	lru_add_drain_cpu(smp_processor_id());
 	drain_local_pages(zone);
-	local_unlock(&lru_pvecs.lock);
+	lru_cache_unlock(&lru_pvecs.locks);
 }
 
 #ifdef CONFIG_SMP
-- 
2.31.1



  parent reply	other threads:[~2021-09-21 16:13 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-09-21 16:13 [PATCH 0/6] mm: Remote LRU per-cpu pagevec cache/per-cpu page list drain support Nicolas Saenz Julienne
2021-09-21 16:13 ` [PATCH 1/6] mm/swap: Introduce lru_cpu_needs_drain() Nicolas Saenz Julienne
2021-09-21 16:13 ` Nicolas Saenz Julienne [this message]
2021-09-21 22:03   ` [PATCH 2/6] mm/swap: Introduce alternative per-cpu LRU cache locking Peter Zijlstra
2021-09-22  8:47     ` nsaenzju
2021-09-22  9:20       ` Sebastian Andrzej Siewior
2021-09-22  9:50         ` nsaenzju
2021-09-22 11:37       ` Peter Zijlstra
2021-09-22 11:43         ` nsaenzju
2021-09-21 16:13 ` [PATCH 3/6] mm/swap: Allow remote LRU cache draining Nicolas Saenz Julienne
2021-09-21 16:13 ` [PATCH 4/6] mm/page_alloc: Introduce alternative per-cpu list locking Nicolas Saenz Julienne
2021-09-21 16:13 ` [PATCH 5/6] mm/page_alloc: Allow remote per-cpu page list draining Nicolas Saenz Julienne
2021-09-21 16:13 ` [PATCH 6/6] sched/isolation: Enable 'remote_pcpu_cache_access' on NOHZ_FULL systems Nicolas Saenz Julienne
2021-09-21 17:51 ` [PATCH 0/6] mm: Remote LRU per-cpu pagevec cache/per-cpu page list drain support Andrew Morton
2021-09-21 17:59 ` Vlastimil Babka
2021-09-22 11:28   ` Peter Zijlstra
2021-09-22 22:09     ` Thomas Gleixner
2021-09-23  7:12       ` Vlastimil Babka
2021-09-23 10:36         ` Thomas Gleixner
2021-09-27  9:30       ` nsaenzju

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210921161323.607817-3-nsaenzju@redhat.com \
    --to=nsaenzju@redhat.com \
    --cc=akpm@linux-foundation.org \
    --cc=anna-maria@linutronix.de \
    --cc=bigeasy@linutronix.de \
    --cc=cl@linux.com \
    --cc=frederic@kernel.org \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-rt-users@vger.kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=mtosatti@redhat.com \
    --cc=nilal@redhat.com \
    --cc=peterz@infradead.org \
    --cc=ppandit@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=williams@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).