From: Dave Chinner <david@fromorbit.com>
To: linux-fsdevel@vger.kernel.org
Cc: linux-xfs@vger.kernel.org, linux-bcachefs@vger.kernel.org,
kent.overstreet@linux.dev, torvalds@linux-foundation.org
Subject: [PATCH 3/7] vfs: convert vfs inode iterators to super_iter_inodes_unsafe()
Date: Wed, 2 Oct 2024 11:33:20 +1000 [thread overview]
Message-ID: <20241002014017.3801899-4-david@fromorbit.com> (raw)
In-Reply-To: <20241002014017.3801899-1-david@fromorbit.com>
From: Dave Chinner <dchinner@redhat.com>
Convert VFS internal superblock inode iterators that cannot use
referenced inodes to the new super_iter_inodes_unsafe() iterator.
Dquot teardown and inode eviction require this iterator because
of their special eviction handling requirements. The
nr_blockdev_pages() statistics code needs it as well, as it is
called from si_meminfo() and so can potentially be run from
locations where arbitrary blocking is not allowed or desirable.
New cases using this iterator need careful consideration.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
block/bdev.c | 24 +++++++++++----
fs/inode.c | 79 ++++++++++++++++++++++++++----------------------
fs/quota/dquot.c | 72 ++++++++++++++++++++++++-------------------
3 files changed, 102 insertions(+), 73 deletions(-)
diff --git a/block/bdev.c b/block/bdev.c
index 33f9c4605e3a..b5a362156ca1 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -472,16 +472,28 @@ void bdev_drop(struct block_device *bdev)
iput(BD_INODE(bdev));
}
+static int bdev_pages_count(struct inode *inode, void *data)
+{
+ long *pages = data;
+
+ *pages += inode->i_mapping->nrpages;
+ return INO_ITER_DONE;
+}
+
long nr_blockdev_pages(void)
{
- struct inode *inode;
long ret = 0;
- spin_lock(&blockdev_superblock->s_inode_list_lock);
- list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
- ret += inode->i_mapping->nrpages;
- spin_unlock(&blockdev_superblock->s_inode_list_lock);
-
+ /*
+ * We can be called from contexts where blocking is not
+ * desirable. The count is advisory at best, and we only
+ * need to access the inode mapping. Hence as long as we
+ * have an inode existence guarantee, we can safely count
+ * the cached pages on each inode without needing reference
+ * counted inodes.
+ */
+ super_iter_inodes_unsafe(blockdev_superblock,
+ bdev_pages_count, &ret);
return ret;
}
diff --git a/fs/inode.c b/fs/inode.c
index 0a53d8c34203..3f335f78c5b2 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -761,8 +761,11 @@ static void evict(struct inode *inode)
* Dispose-list gets a local list with local inodes in it, so it doesn't
* need to worry about list corruption and SMP locks.
*/
-static void dispose_list(struct list_head *head)
+static bool dispose_list(struct list_head *head)
{
+ if (list_empty(head))
+ return false;
+
while (!list_empty(head)) {
struct inode *inode;
@@ -772,6 +775,7 @@ static void dispose_list(struct list_head *head)
evict(inode);
cond_resched();
}
+ return true;
}
/**
@@ -783,47 +787,50 @@ static void dispose_list(struct list_head *head)
* so any inode reaching zero refcount during or after that call will
* be immediately evicted.
*/
+static int evict_inode_fn(struct inode *inode, void *data)
+{
+ struct list_head *dispose = data;
+
+ spin_lock(&inode->i_lock);
+ if (atomic_read(&inode->i_count) ||
+ (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))) {
+ spin_unlock(&inode->i_lock);
+ return INO_ITER_DONE;
+ }
+
+ inode->i_state |= I_FREEING;
+ inode_lru_list_del(inode);
+ spin_unlock(&inode->i_lock);
+ list_add(&inode->i_lru, dispose);
+
+ /*
+ * If we've run long enough to need rescheduling, abort the
+ * iteration so we can return to evict_inodes() and dispose of the
+ * inodes before collecting more inodes to evict.
+ */
+ if (need_resched())
+ return INO_ITER_ABORT;
+ return INO_ITER_DONE;
+}
+
void evict_inodes(struct super_block *sb)
{
- struct inode *inode, *next;
LIST_HEAD(dispose);
-again:
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
- if (atomic_read(&inode->i_count))
- continue;
-
- spin_lock(&inode->i_lock);
- if (atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- inode->i_state |= I_FREEING;
- inode_lru_list_del(inode);
- spin_unlock(&inode->i_lock);
- list_add(&inode->i_lru, &dispose);
-
+ do {
/*
- * We can have a ton of inodes to evict at unmount time given
- * enough memory, check to see if we need to go to sleep for a
- * bit so we don't livelock.
+ * We do not want to take references to inodes whilst iterating
+ * because we are trying to evict unreferenced inodes from
+ * the cache. Hence we need to use the unsafe iteration
+ * mechanism and do all the required inode validity checks in
+ * evict_inode_fn() to safely queue unreferenced inodes for
+ * eviction.
+ *
+ * We repeat the iteration until it doesn't find any more
+ * inodes to dispose of.
*/
- if (need_resched()) {
- spin_unlock(&sb->s_inode_list_lock);
- cond_resched();
- dispose_list(&dispose);
- goto again;
- }
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- dispose_list(&dispose);
+ super_iter_inodes_unsafe(sb, evict_inode_fn, &dispose);
+ } while (dispose_list(&dispose));
}
EXPORT_SYMBOL_GPL(evict_inodes);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index b40410cd39af..ea0bd807fed7 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1075,41 +1075,51 @@ static int add_dquot_ref(struct super_block *sb, int type)
return err;
}
+struct dquot_ref_data {
+ int type;
+ int reserved;
+};
+
+static int remove_dquot_ref_fn(struct inode *inode, void *data)
+{
+ struct dquot_ref_data *ref = data;
+
+ spin_lock(&dq_data_lock);
+ if (!IS_NOQUOTA(inode)) {
+ struct dquot __rcu **dquots = i_dquot(inode);
+ struct dquot *dquot = srcu_dereference_check(
+ dquots[ref->type], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+
+#ifdef CONFIG_QUOTA_DEBUG
+ if (unlikely(inode_get_rsv_space(inode) > 0))
+ ref->reserved++;
+#endif
+ rcu_assign_pointer(dquots[ref->type], NULL);
+ if (dquot)
+ dqput(dquot);
+ }
+ spin_unlock(&dq_data_lock);
+ return INO_ITER_DONE;
+}
+
static void remove_dquot_ref(struct super_block *sb, int type)
{
- struct inode *inode;
-#ifdef CONFIG_QUOTA_DEBUG
- int reserved = 0;
-#endif
-
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- /*
- * We have to scan also I_NEW inodes because they can already
- * have quota pointer initialized. Luckily, we need to touch
- * only quota pointers and these have separate locking
- * (dq_data_lock).
- */
- spin_lock(&dq_data_lock);
- if (!IS_NOQUOTA(inode)) {
- struct dquot __rcu **dquots = i_dquot(inode);
- struct dquot *dquot = srcu_dereference_check(
- dquots[type], &dquot_srcu,
- lockdep_is_held(&dq_data_lock));
+ struct dquot_ref_data ref = {
+ .type = type,
+ };
+ /*
+ * We have to scan I_NEW inodes because they can already
+ * have quota pointer initialized. Luckily, we need to touch
+ * only quota pointers and these have separate locking
+ * (dq_data_lock), so the existence guarantee that
+ * super_iter_inodes_unsafe() provides for inodes passed to
+ * remove_dquot_ref_fn() is sufficient for this operation.
+ */
+ super_iter_inodes_unsafe(sb, remove_dquot_ref_fn, &ref);
#ifdef CONFIG_QUOTA_DEBUG
- if (unlikely(inode_get_rsv_space(inode) > 0))
- reserved = 1;
-#endif
- rcu_assign_pointer(dquots[type], NULL);
- if (dquot)
- dqput(dquot);
- }
- spin_unlock(&dq_data_lock);
- }
- spin_unlock(&sb->s_inode_list_lock);
-#ifdef CONFIG_QUOTA_DEBUG
- if (reserved) {
+ if (ref.reserved) {
printk(KERN_WARNING "VFS (%s): Writes happened after quota"
" was disabled thus quota information is probably "
"inconsistent. Please run quotacheck(8).\n", sb->s_id);
--
2.45.2
next prev parent reply other threads:[~2024-10-02 1:40 UTC|newest]
Thread overview: 72+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-02 1:33 [RFC PATCH 0/7] vfs: improving inode cache iteration scalability Dave Chinner
2024-10-02 1:33 ` [PATCH 1/7] vfs: replace invalidate_inodes() with evict_inodes() Dave Chinner
2024-10-03 7:07 ` Christoph Hellwig
2024-10-03 9:20 ` Jan Kara
2024-10-02 1:33 ` [PATCH 2/7] vfs: add inode iteration superblock method Dave Chinner
2024-10-03 7:12 ` Christoph Hellwig
2024-10-03 10:35 ` Dave Chinner
2024-10-04 9:53 ` kernel test robot
2024-10-02 1:33 ` Dave Chinner [this message]
2024-10-03 7:14 ` [PATCH 3/7] vfs: convert vfs inode iterators to super_iter_inodes_unsafe() Christoph Hellwig
2024-10-03 10:45 ` Dave Chinner
2024-10-04 10:55 ` kernel test robot
2024-10-02 1:33 ` [PATCH 4/7] vfs: Convert sb->s_inodes iteration to super_iter_inodes() Dave Chinner
2024-10-03 7:23 ` lsm sb_delete hook, was " Christoph Hellwig
2024-10-03 7:38 ` Christoph Hellwig
2024-10-03 11:57 ` Jan Kara
2024-10-03 12:11 ` Christoph Hellwig
2024-10-03 12:26 ` Jan Kara
2024-10-03 12:39 ` Christoph Hellwig
2024-10-03 12:56 ` Jan Kara
2024-10-03 13:04 ` Christoph Hellwig
2024-10-03 13:59 ` Dave Chinner
2024-10-03 16:17 ` Jan Kara
2024-10-04 0:46 ` Dave Chinner
2024-10-04 7:21 ` Christian Brauner
2024-10-04 12:14 ` Christoph Hellwig
2024-10-04 13:49 ` Jan Kara
2024-10-04 18:15 ` Paul Moore
2024-10-04 22:57 ` Dave Chinner
2024-10-05 15:21 ` Mickaël Salaün
2024-10-05 16:03 ` Mickaël Salaün
2024-10-05 16:03 ` Paul Moore
2024-10-07 20:37 ` Linus Torvalds
2024-10-07 23:33 ` Dave Chinner
2024-10-08 0:28 ` Linus Torvalds
2024-10-08 0:54 ` Linus Torvalds
2024-10-09 9:49 ` Jan Kara
2024-10-08 12:59 ` Mickaël Salaün
2024-10-09 0:21 ` Dave Chinner
2024-10-09 9:23 ` Mickaël Salaün
2024-10-08 8:57 ` Amir Goldstein
2024-10-08 11:23 ` Jan Kara
2024-10-08 12:16 ` Christian Brauner
2024-10-09 0:03 ` Dave Chinner
2024-10-08 23:44 ` Dave Chinner
2024-10-09 6:10 ` Amir Goldstein
2024-10-09 14:18 ` Jan Kara
2024-10-02 1:33 ` [PATCH 5/7] vfs: add inode iteration superblock method Dave Chinner
2024-10-03 7:24 ` Christoph Hellwig
2024-10-02 1:33 ` [PATCH 6/7] xfs: implement sb->iter_vfs_inodes Dave Chinner
2024-10-03 7:30 ` Christoph Hellwig
2024-10-02 1:33 ` [PATCH 7/7] bcachefs: " Dave Chinner
2024-10-02 10:00 ` [RFC PATCH 0/7] vfs: improving inode cache iteration scalability Christian Brauner
2024-10-02 12:34 ` Dave Chinner
2024-10-02 19:29 ` Kent Overstreet
2024-10-02 22:23 ` Dave Chinner
2024-10-02 23:20 ` Kent Overstreet
2024-10-03 1:41 ` Dave Chinner
2024-10-03 2:24 ` Kent Overstreet
2024-10-03 9:17 ` Jan Kara
2024-10-03 9:59 ` Dave Chinner
2024-10-02 19:49 ` Linus Torvalds
2024-10-02 20:28 ` Kent Overstreet
2024-10-02 23:17 ` Dave Chinner
2024-10-03 1:22 ` Kent Overstreet
2024-10-03 2:20 ` Dave Chinner
2024-10-03 2:42 ` Kent Overstreet
2024-10-03 11:45 ` Jan Kara
2024-10-03 12:18 ` Christoph Hellwig
2024-10-03 12:46 ` Jan Kara
2024-10-03 13:35 ` Dave Chinner
2024-10-03 13:03 ` Dave Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241002014017.3801899-4-david@fromorbit.com \
--to=david@fromorbit.com \
--cc=kent.overstreet@linux.dev \
--cc=linux-bcachefs@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-xfs@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).