[PATCH v4 2/4] mm/swap: move bh draining into a separate workqueue

Linux Documentation
 help / color / mirror / Atom feed

From: Leonardo Bras <leobras.c@gmail.com>
To: "Jonathan Corbet" <corbet@lwn.net>,
	"Shuah Khan" <skhan@linuxfoundation.org>,
	"Leonardo Bras" <leobras.c@gmail.com>,
	"Peter Zijlstra" <peterz@infradead.org>,
	"Ingo Molnar" <mingo@redhat.com>, "Will Deacon" <will@kernel.org>,
	"Boqun Feng" <boqun@kernel.org>,
	"Waiman Long" <longman@redhat.com>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"David Hildenbrand" <david@kernel.org>,
	"Lorenzo Stoakes" <ljs@kernel.org>,
	"Liam R. Howlett" <liam@infradead.org>,
	"Vlastimil Babka" <vbabka@kernel.org>,
	"Mike Rapoport" <rppt@kernel.org>,
	"Suren Baghdasaryan" <surenb@google.com>,
	"Michal Hocko" <mhocko@suse.com>, "Jann Horn" <jannh@google.com>,
	"Pedro Falcato" <pfalcato@suse.de>,
	"Brendan Jackman" <jackmanb@google.com>,
	"Johannes Weiner" <hannes@cmpxchg.org>, "Zi Yan" <ziy@nvidia.com>,
	"Harry Yoo" <harry@kernel.org>, "Hao Li" <hao.li@linux.dev>,
	"Christoph Lameter" <cl@gentwo.org>,
	"David Rientjes" <rientjes@google.com>,
	"Roman Gushchin" <roman.gushchin@linux.dev>,
	"Chris Li" <chrisl@kernel.org>,
	"Kairui Song" <kasong@tencent.com>,
	"Kemeng Shi" <shikemeng@huaweicloud.com>,
	"Nhat Pham" <nphamcs@gmail.com>, "Baoquan He" <bhe@redhat.com>,
	"Barry Song" <baohua@kernel.org>,
	"Youngjun Park" <youngjun.park@lge.com>,
	"Qi Zheng" <qi.zheng@linux.dev>,
	"Shakeel Butt" <shakeel.butt@linux.dev>,
	"Axel Rasmussen" <axelrasmussen@google.com>,
	"Yuanchu Xie" <yuanchu@google.com>, "Wei Xu" <weixugc@google.com>,
	"Borislav Petkov (AMD)" <bp@alien8.de>,
	"Randy Dunlap" <rdunlap@infradead.org>,
	"Feng Tang" <feng.tang@linux.alibaba.com>,
	"Dapeng Mi" <dapeng1.mi@linux.intel.com>,
	"Kees Cook" <kees@kernel.org>, "Marco Elver" <elver@google.com>,
	"Jakub Kicinski" <kuba@kernel.org>,
	"Li RongQing" <lirongqing@baidu.com>,
	"Eric Biggers" <ebiggers@kernel.org>,
	"Paul E. McKenney" <paulmck@kernel.org>,
	"Nathan Chancellor" <nathan@kernel.org>,
	"Nicolas Schier" <nsc@kernel.org>,
	"Miguel Ojeda" <ojeda@kernel.org>,
	"Thomas Weißschuh" <thomas.weissschuh@linutronix.de>,
	"Thomas Gleixner" <tglx@kernel.org>,
	"Douglas Anderson" <dianders@chromium.org>,
	"Gary Guo" <gary@garyguo.net>,
	"Christian Brauner" <brauner@kernel.org>,
	"Pasha Tatashin" <pasha.tatashin@soleen.com>,
	"Coiby Xu" <coxu@redhat.com>,
	"Masahiro Yamada" <masahiroy@kernel.org>,
	"Frederic Weisbecker" <frederic@kernel.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-mm@kvack.org, linux-rt-devel@lists.linux.dev
Subject: [PATCH v4 2/4] mm/swap: move bh draining into a separate workqueue
Date: Mon, 18 May 2026 22:27:48 -0300	[thread overview]
Message-ID: <20260519012754.240804-3-leobras.c@gmail.com> (raw)
In-Reply-To: <20260519012754.240804-1-leobras.c@gmail.com>

From: Marcelo Tosatti <mtosatti@redhat.com>

Separate the bh draining into a separate workqueue
(from the mm lru draining), so that its possible to switch
the mm lru draining to QPW.

To switch bh draining to QPW, it would be necessary to add
a spinlock to addition of bhs to percpu cache, and that is a
very hot path.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 mm/swap.c | 52 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index 5cc44f0de987..ed9b3d371547 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -744,60 +744,70 @@ void lru_add_drain(void)
 	local_unlock(&cpu_fbatches.lock);
 	mlock_drain_local();
 }
 
 /*
  * It's called from per-cpu workqueue context in SMP case so
  * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
  * the same cpu. It shouldn't be a problem in !SMP case since
  * the core is only one and the locks will disable preemption.
  */
-static void lru_add_and_bh_lrus_drain(void)
+static void lru_add_mm_drain(void)
 {
 	local_lock(&cpu_fbatches.lock);
 	lru_add_drain_cpu(smp_processor_id());
 	local_unlock(&cpu_fbatches.lock);
-	invalidate_bh_lrus_cpu();
 	mlock_drain_local();
 }
 
 void lru_add_drain_cpu_zone(struct zone *zone)
 {
 	local_lock(&cpu_fbatches.lock);
 	lru_add_drain_cpu(smp_processor_id());
 	drain_local_pages(zone);
 	local_unlock(&cpu_fbatches.lock);
 	mlock_drain_local();
 }
 
 #ifdef CONFIG_SMP
 
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
-	lru_add_and_bh_lrus_drain();
+	lru_add_mm_drain();
 }
 
-static bool cpu_needs_drain(unsigned int cpu)
+static DEFINE_PER_CPU(struct work_struct, bh_add_drain_work);
+
+static void bh_add_drain_per_cpu(struct work_struct *dummy)
+{
+	invalidate_bh_lrus_cpu();
+}
+
+static bool cpu_needs_mm_drain(unsigned int cpu)
 {
 	struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
 
 	/* Check these in order of likelihood that they're not zero */
 	return folio_batch_count(&fbatches->lru_add) ||
 		folio_batch_count(&fbatches->lru_move_tail) ||
 		folio_batch_count(&fbatches->lru_deactivate_file) ||
 		folio_batch_count(&fbatches->lru_deactivate) ||
 		folio_batch_count(&fbatches->lru_lazyfree) ||
 		folio_batch_count(&fbatches->lru_activate) ||
-		need_mlock_drain(cpu) ||
-		has_bh_in_lru(cpu, NULL);
+		need_mlock_drain(cpu);
+}
+
+static bool cpu_needs_bh_drain(unsigned int cpu)
+{
+	return has_bh_in_lru(cpu, NULL);
 }
 
 /*
  * Doesn't need any cpu hotplug locking because we do rely on per-cpu
  * kworkers being shut down before our page_alloc_cpu_dead callback is
  * executed on the offlined cpu.
  * Calling this function with cpu hotplug locks held can actually lead
  * to obscure indirect dependencies via WQ context.
  */
 static inline void __lru_add_drain_all(bool force_all_cpus)
@@ -806,21 +816,21 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
 	 * lru_drain_gen - Global pages generation number
 	 *
 	 * (A) Definition: global lru_drain_gen = x implies that all generations
 	 *     0 < n <= x are already *scheduled* for draining.
 	 *
 	 * This is an optimization for the highly-contended use case where a
 	 * user space workload keeps constantly generating a flow of pages for
 	 * each CPU.
 	 */
 	static unsigned int lru_drain_gen;
-	static struct cpumask has_work;
+	static struct cpumask has_mm_work, has_bh_work;
 	static DEFINE_MUTEX(lock);
 	unsigned cpu, this_gen;
 
 	/*
 	 * Make sure nobody triggers this path before mm_percpu_wq is fully
 	 * initialized.
 	 */
 	if (WARN_ON(!mm_percpu_wq))
 		return;
 
@@ -869,34 +879,45 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
 	 * along, adds some pages to its per-cpu vectors, then calls
 	 * lru_add_drain_all().
 	 *
 	 * If the paired barrier is done at any later step, e.g. after the
 	 * loop, CPU #x will just exit at (C) and miss flushing out all of its
 	 * added pages.
 	 */
 	WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
 	smp_mb();
 
-	cpumask_clear(&has_work);
+	cpumask_clear(&has_mm_work);
+	cpumask_clear(&has_bh_work);
 	for_each_online_cpu(cpu) {
-		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+		struct work_struct *mm_work = &per_cpu(lru_add_drain_work, cpu);
+		struct work_struct *bh_work = &per_cpu(bh_add_drain_work, cpu);
 
-		if (cpu_needs_drain(cpu)) {
-			INIT_WORK(work, lru_add_drain_per_cpu);
-			queue_work_on(cpu, mm_percpu_wq, work);
-			__cpumask_set_cpu(cpu, &has_work);
+		if (cpu_needs_mm_drain(cpu)) {
+			INIT_WORK(mm_work, lru_add_drain_per_cpu);
+			queue_work_on(cpu, mm_percpu_wq, mm_work);
+			__cpumask_set_cpu(cpu, &has_mm_work);
+		}
+
+		if (cpu_needs_bh_drain(cpu)) {
+			INIT_WORK(bh_work, bh_add_drain_per_cpu);
+			queue_work_on(cpu, mm_percpu_wq, bh_work);
+			__cpumask_set_cpu(cpu, &has_bh_work);
 		}
 	}
 
-	for_each_cpu(cpu, &has_work)
+	for_each_cpu(cpu, &has_mm_work)
 		flush_work(&per_cpu(lru_add_drain_work, cpu));
 
+	for_each_cpu(cpu, &has_bh_work)
+		flush_work(&per_cpu(bh_add_drain_work, cpu));
+
 done:
 	mutex_unlock(&lock);
 }
 
 void lru_add_drain_all(void)
 {
 	__lru_add_drain_all(false);
 }
 #else
 void lru_add_drain_all(void)
@@ -928,21 +949,22 @@ void lru_cache_disable(void)
 	 *
 	 * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
 	 * preempt_disable() regions of code. So any CPU which sees
 	 * lru_disable_count = 0 will have exited the critical
 	 * section when synchronize_rcu() returns.
 	 */
 	synchronize_rcu_expedited();
 #ifdef CONFIG_SMP
 	__lru_add_drain_all(true);
 #else
-	lru_add_and_bh_lrus_drain();
+	lru_add_mm_drain();
+	invalidate_bh_lrus_cpu();
 #endif
 }
 
 /**
  * folios_put_refs - Reduce the reference count on a batch of folios.
  * @folios: The folios.
  * @refs: The number of refs to subtract from each folio.
  *
  * Like folio_put(), but for a batch of folios.  This is more efficient
  * than writing the loop yourself as it will optimise the locks which need
-- 
2.54.0

next prev parent reply	other threads:[~2026-05-19  1:28 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-19  1:27 [PATCH v4 0/4] Introduce Per-CPU Work helpers (was QPW) Leonardo Bras
2026-05-19  1:27 ` [PATCH v4 1/4] Introducing pw_lock() and per-cpu queue & flush work Leonardo Bras
2026-05-19  1:27 ` Leonardo Bras [this message]
2026-05-19  1:27 ` [PATCH v4 3/4] swap: apply new pw_queue_on() interface Leonardo Bras
2026-05-19  1:27 ` [PATCH v4 4/4] slub: " Leonardo Bras
2026-05-19  6:58 ` [syzbot ci] Re: Introduce Per-CPU Work helpers (was QPW) syzbot ci

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:5cc44f0de98 dfblob:ed9b3d37154 )
 OR (
bs:"[PATCH v4 2/4] mm/swap: move bh draining into a separate workqueue" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260519012754.240804-3-leobras.c@gmail.com \
    --to=leobras.c@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=axelrasmussen@google.com \
    --cc=baohua@kernel.org \
    --cc=bhe@redhat.com \
    --cc=boqun@kernel.org \
    --cc=bp@alien8.de \
    --cc=brauner@kernel.org \
    --cc=chrisl@kernel.org \
    --cc=cl@gentwo.org \
    --cc=corbet@lwn.net \
    --cc=coxu@redhat.com \
    --cc=dapeng1.mi@linux.intel.com \
    --cc=david@kernel.org \
    --cc=dianders@chromium.org \
    --cc=ebiggers@kernel.org \
    --cc=elver@google.com \
    --cc=feng.tang@linux.alibaba.com \
    --cc=frederic@kernel.org \
    --cc=gary@garyguo.net \
    --cc=hannes@cmpxchg.org \
    --cc=hao.li@linux.dev \
    --cc=harry@kernel.org \
    --cc=jackmanb@google.com \
    --cc=jannh@google.com \
    --cc=kasong@tencent.com \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=liam@infradead.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-rt-devel@lists.linux.dev \
    --cc=lirongqing@baidu.com \
    --cc=ljs@kernel.org \
    --cc=longman@redhat.com \
    --cc=masahiroy@kernel.org \
    --cc=mhocko@suse.com \
    --cc=mingo@redhat.com \
    --cc=mtosatti@redhat.com \
    --cc=nathan@kernel.org \
    --cc=nphamcs@gmail.com \
    --cc=nsc@kernel.org \
    --cc=ojeda@kernel.org \
    --cc=pasha.tatashin@soleen.com \
    --cc=paulmck@kernel.org \
    --cc=peterz@infradead.org \
    --cc=pfalcato@suse.de \
    --cc=qi.zheng@linux.dev \
    --cc=rdunlap@infradead.org \
    --cc=rientjes@google.com \
    --cc=roman.gushchin@linux.dev \
    --cc=rppt@kernel.org \
    --cc=shakeel.butt@linux.dev \
    --cc=shikemeng@huaweicloud.com \
    --cc=skhan@linuxfoundation.org \
    --cc=surenb@google.com \
    --cc=tglx@kernel.org \
    --cc=thomas.weissschuh@linutronix.de \
    --cc=vbabka@kernel.org \
    --cc=weixugc@google.com \
    --cc=will@kernel.org \
    --cc=youngjun.park@lge.com \
    --cc=yuanchu@google.com \
    --cc=ziy@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox