From: Jan Kara <jack@suse.cz>
To: Christian Brauner <brauner@kernel.org>
Cc: <linux-fsdevel@vger.kernel.org>, Tejun Heo <tj@kernel.org>,
Jan Kara <jack@suse.cz>
Subject: [PATCH 1/4] writeback: Avoid contention on wb->list_lock when switching inodes
Date: Tue, 9 Sep 2025 16:44:02 +0200 [thread overview]
Message-ID: <20250909144400.2901-5-jack@suse.cz> (raw)
In-Reply-To: <20250909143734.30801-1-jack@suse.cz>
There can be multiple inode switch works that are trying to switch
inodes to / from the same wb. This can happen in particular if some
cgroup exits which owns many (thousands) inodes and we need to switch
them all. In this case several inode_switch_wbs_work_fn() instances will
be just spinning on the same wb->list_lock while only one of them makes
forward progress. This wastes CPU cycles and quickly leads to softlockup
reports and unusable system.
Instead of running several inode_switch_wbs_work_fn() instances in
parallel switching to the same wb and contending on wb->list_lock, run
just one instance and let the other isw items switching to this wb queue
behind the one being processed.
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/fs-writeback.c | 42 +++++++++++++++++++++++++++++---
include/linux/backing-dev-defs.h | 5 +++-
mm/backing-dev.c | 2 ++
3 files changed, 44 insertions(+), 5 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a07b8cf73ae2..3f3e6efd5d78 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -369,6 +369,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
struct inode_switch_wbs_context {
struct rcu_work work;
+ /* List of queued switching contexts for new_wb */
+ struct list_head list;
/*
* Multiple inodes can be switched at once. The switching procedure
@@ -486,10 +488,8 @@ static bool inode_do_switch_wbs(struct inode *inode,
return switched;
}
-static void inode_switch_wbs_work_fn(struct work_struct *work)
+static void process_inode_switch_wbs_work(struct inode_switch_wbs_context *isw)
{
- struct inode_switch_wbs_context *isw =
- container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
@@ -539,10 +539,44 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
for (inodep = isw->inodes; *inodep; inodep++)
iput(*inodep);
wb_put(new_wb);
- kfree(isw);
atomic_dec(&isw_nr_in_flight);
}
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+ struct inode_switch_wbs_context *isw =
+ container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
+ struct bdi_writeback *new_wb = isw->new_wb;
+ bool switch_running;
+
+ spin_lock_irq(&new_wb->work_lock);
+ switch_running = !list_empty(&new_wb->switch_wbs_ctxs);
+ list_add_tail(&isw->list, &new_wb->switch_wbs_ctxs);
+ spin_unlock_irq(&new_wb->work_lock);
+
+ /*
+ * Let's leave the real work for the running worker since we'd just
+ * contend with it on wb->list_lock anyway.
+ */
+ if (switch_running)
+ return;
+
+ /* OK, we will be doing the switching work */
+ wb_get(new_wb);
+ spin_lock_irq(&new_wb->work_lock);
+ while (!list_empty(&new_wb->switch_wbs_ctxs)) {
+ isw = list_first_entry(&new_wb->switch_wbs_ctxs,
+ struct inode_switch_wbs_context, list);
+ spin_unlock_irq(&new_wb->work_lock);
+ process_inode_switch_wbs_work(isw);
+ spin_lock_irq(&new_wb->work_lock);
+ list_del(&isw->list);
+ kfree(isw);
+ }
+ spin_unlock_irq(&new_wb->work_lock);
+ wb_put(new_wb);
+}
+
static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ad261082bba..f94fd458c248 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -136,7 +136,8 @@ struct bdi_writeback {
int dirty_exceeded;
enum wb_reason start_all_reason;
- spinlock_t work_lock; /* protects work_list & dwork scheduling */
+ spinlock_t work_lock; /* protects work_list,
+ dwork scheduling, and switch_wbs_ctxs */
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */
struct delayed_work bw_dwork; /* work item used for bandwidth estimate */
@@ -152,6 +153,8 @@ struct bdi_writeback {
struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
struct list_head b_attached; /* attached inodes, protected by list_lock */
struct list_head offline_node; /* anchored at offline_cgwbs */
+ struct list_head switch_wbs_ctxs; /* queued contexts for
+ * writeback switching */
union {
struct work_struct release_work;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 783904d8c5ef..ce0aa7d03cc5 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -709,6 +709,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
wb->memcg_css = memcg_css;
wb->blkcg_css = blkcg_css;
INIT_LIST_HEAD(&wb->b_attached);
+ INIT_LIST_HEAD(&wb->switch_wbs_ctxs);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
bdi_get(bdi);
@@ -839,6 +840,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
+ INIT_LIST_HEAD(&bdi->wb.switch_wbs_ctxs);
}
return ret;
}
--
2.51.0
next prev parent reply other threads:[~2025-09-09 14:44 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-09-09 14:44 [PATCH 0/4] writeback: Avoid lockups when switching inodes Jan Kara
2025-09-09 14:44 ` Jan Kara [this message]
2025-09-09 16:52 ` [PATCH 1/4] writeback: Avoid contention on wb->list_lock " Tejun Heo
2025-09-10 8:19 ` Jan Kara
2025-09-10 17:10 ` Tejun Heo
2025-09-11 11:30 ` Jan Kara
2025-09-11 12:01 ` Jan Kara
2025-09-12 10:39 ` Jan Kara
2025-09-09 14:44 ` [PATCH 2/4] writeback: Avoid softlockup when switching many inodes Jan Kara
2025-09-09 16:55 ` Tejun Heo
2025-09-09 14:44 ` [PATCH 3/4] writeback: Avoid excessively long inode switching times Jan Kara
2025-09-09 16:56 ` Tejun Heo
2025-09-09 14:44 ` [PATCH 4/4] writeback: Add tracepoint to track pending inode switches Jan Kara
2025-09-09 16:57 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250909144400.2901-5-jack@suse.cz \
--to=jack@suse.cz \
--cc=brauner@kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=tj@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).