From: Dave Chinner <david@fromorbit.com>
To: linux-fsdevel@vger.kernel.org
Cc: linux-xfs@vger.kernel.org, linux-bcachefs@vger.kernel.org,
kent.overstreet@linux.dev, torvalds@linux-foundation.org
Subject: [PATCH 3/7] vfs: convert vfs inode iterators to super_iter_inodes_unsafe()
Date: Wed, 2 Oct 2024 11:33:20 +1000 [thread overview]
Message-ID: <20241002014017.3801899-4-david@fromorbit.com> (raw)
In-Reply-To: <20241002014017.3801899-1-david@fromorbit.com>
From: Dave Chinner <dchinner@redhat.com>
Convert VFS internal superblock inode iterators that cannot use
referenced inodes to the new super_iter_inodes_unsafe() iterator.
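Dquot reference removal and inode eviction require this because
neither can work on referenced inodes: eviction is collecting
unreferenced inodes to dispose of, and dquot removal must also scan
I_NEW inodes while only touching the quota pointers. The special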
nr_blockdev_pages() statistics code needs it as well, as this is
called from si_meminfo() and so can potentially be run from
locations where arbitrary blocking is not allowed or desirable.
New cases using this iterator need careful consideration.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
block/bdev.c | 24 +++++++++++----
fs/inode.c | 79 ++++++++++++++++++++++++++----------------------
fs/quota/dquot.c | 72 ++++++++++++++++++++++++-------------------
3 files changed, 102 insertions(+), 73 deletions(-)
diff --git a/block/bdev.c b/block/bdev.c
index 33f9c4605e3a..b5a362156ca1 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -472,16 +472,28 @@ void bdev_drop(struct block_device *bdev)
iput(BD_INODE(bdev));
}
+static int bdev_pages_count(struct inode *inode, void *data)
+{
+ long *pages = data;
+
+ *pages += inode->i_mapping->nrpages;
+ return INO_ITER_DONE;
+}
+
long nr_blockdev_pages(void)
{
- struct inode *inode;
long ret = 0;
- spin_lock(&blockdev_superblock->s_inode_list_lock);
- list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
- ret += inode->i_mapping->nrpages;
- spin_unlock(&blockdev_superblock->s_inode_list_lock);
-
+ /*
+ * We can be called from contexts where blocking is not
+ * desirable. The count is advisory at best, and we only
+ * need to access the inode mapping. Hence as long as we
+ * have an inode existence guarantee, we can safely count
+ * the cached pages on each inode without needing reference
+ * counted inodes.
+ */
+ super_iter_inodes_unsafe(blockdev_superblock,
+ bdev_pages_count, &ret);
return ret;
}
diff --git a/fs/inode.c b/fs/inode.c
index 0a53d8c34203..3f335f78c5b2 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -761,8 +761,11 @@ static void evict(struct inode *inode)
* Dispose-list gets a local list with local inodes in it, so it doesn't
* need to worry about list corruption and SMP locks.
*/
-static void dispose_list(struct list_head *head)
+static bool dispose_list(struct list_head *head)
{
+ if (list_empty(head))
+ return false;
+
while (!list_empty(head)) {
struct inode *inode;
@@ -772,6 +775,7 @@ static void dispose_list(struct list_head *head)
evict(inode);
cond_resched();
}
+ return true;
}
/**
@@ -783,47 +787,50 @@ static void dispose_list(struct list_head *head)
* so any inode reaching zero refcount during or after that call will
* be immediately evicted.
*/
+static int evict_inode_fn(struct inode *inode, void *data)
+{
+ struct list_head *dispose = data;
+
+ spin_lock(&inode->i_lock);
+ if (atomic_read(&inode->i_count) ||
+ (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))) {
+ spin_unlock(&inode->i_lock);
+ return INO_ITER_DONE;
+ }
+
+ inode->i_state |= I_FREEING;
+ inode_lru_list_del(inode);
+ spin_unlock(&inode->i_lock);
+ list_add(&inode->i_lru, dispose);
+
+ /*
+ * If we've run long enough to need rescheduling, abort the
+ * iteration so we can return to evict_inodes() and dispose of the
+ * inodes before collecting more inodes to evict.
+ */
+ if (need_resched())
+ return INO_ITER_ABORT;
+ return INO_ITER_DONE;
+}
+
void evict_inodes(struct super_block *sb)
{
- struct inode *inode, *next;
LIST_HEAD(dispose);
-again:
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
- if (atomic_read(&inode->i_count))
- continue;
-
- spin_lock(&inode->i_lock);
- if (atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- inode->i_state |= I_FREEING;
- inode_lru_list_del(inode);
- spin_unlock(&inode->i_lock);
- list_add(&inode->i_lru, &dispose);
-
+ do {
/*
- * We can have a ton of inodes to evict at unmount time given
- * enough memory, check to see if we need to go to sleep for a
- * bit so we don't livelock.
+ * We do not want to take references to inodes whilst iterating
+ * because we are trying to evict unreferenced inodes from
+ * the cache. Hence we need to use the unsafe iteration
+ * mechanism and do all the required inode validity checks in
+ * evict_inode_fn() to safely queue unreferenced inodes for
+ * eviction.
+ *
+ * We repeat the iteration until it doesn't find any more
+ * inodes to dispose of.
*/
- if (need_resched()) {
- spin_unlock(&sb->s_inode_list_lock);
- cond_resched();
- dispose_list(&dispose);
- goto again;
- }
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- dispose_list(&dispose);
+ super_iter_inodes_unsafe(sb, evict_inode_fn, &dispose);
+ } while (dispose_list(&dispose));
}
EXPORT_SYMBOL_GPL(evict_inodes);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index b40410cd39af..ea0bd807fed7 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1075,41 +1075,51 @@ static int add_dquot_ref(struct super_block *sb, int type)
return err;
}
+struct dquot_ref_data {
+ int type;
+ int reserved;
+};
+
+static int remove_dquot_ref_fn(struct inode *inode, void *data)
+{
+ struct dquot_ref_data *ref = data;
+
+ spin_lock(&dq_data_lock);
+ if (!IS_NOQUOTA(inode)) {
+ struct dquot __rcu **dquots = i_dquot(inode);
+ struct dquot *dquot = srcu_dereference_check(
+ dquots[ref->type], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+
+#ifdef CONFIG_QUOTA_DEBUG
+ if (unlikely(inode_get_rsv_space(inode) > 0))
+ ref->reserved++;
+#endif
+ rcu_assign_pointer(dquots[ref->type], NULL);
+ if (dquot)
+ dqput(dquot);
+ }
+ spin_unlock(&dq_data_lock);
+ return INO_ITER_DONE;
+}
+
static void remove_dquot_ref(struct super_block *sb, int type)
{
- struct inode *inode;
-#ifdef CONFIG_QUOTA_DEBUG
- int reserved = 0;
-#endif
-
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- /*
- * We have to scan also I_NEW inodes because they can already
- * have quota pointer initialized. Luckily, we need to touch
- * only quota pointers and these have separate locking
- * (dq_data_lock).
- */
- spin_lock(&dq_data_lock);
- if (!IS_NOQUOTA(inode)) {
- struct dquot __rcu **dquots = i_dquot(inode);
- struct dquot *dquot = srcu_dereference_check(
- dquots[type], &dquot_srcu,
- lockdep_is_held(&dq_data_lock));
+ struct dquot_ref_data ref = {
+ .type = type,
+ };
+ /*
+ * We have to scan I_NEW inodes because they can already
+ * have quota pointer initialized. Luckily, we need to touch
+ * only quota pointers and these have separate locking
+ * (dq_data_lock) so the existence guarantee that
+ * super_iter_inodes_unsafe() provides for inodes passed to
+ * remove_dquot_ref_fn() is sufficient for this operation.
+ */
+ super_iter_inodes_unsafe(sb, remove_dquot_ref_fn, &ref);
#ifdef CONFIG_QUOTA_DEBUG
- if (unlikely(inode_get_rsv_space(inode) > 0))
- reserved = 1;
-#endif
- rcu_assign_pointer(dquots[type], NULL);
- if (dquot)
- dqput(dquot);
- }
- spin_unlock(&dq_data_lock);
- }
- spin_unlock(&sb->s_inode_list_lock);
-#ifdef CONFIG_QUOTA_DEBUG
- if (reserved) {
+ if (ref.reserved) {
printk(KERN_WARNING "VFS (%s): Writes happened after quota"
" was disabled thus quota information is probably "
"inconsistent. Please run quotacheck(8).\n", sb->s_id);
--
2.45.2
next prev parent reply other threads:[~2024-10-02 1:40 UTC|newest]
Thread overview: 72+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-02 1:33 [RFC PATCH 0/7] vfs: improving inode cache iteration scalability Dave Chinner
2024-10-02 1:33 ` [PATCH 1/7] vfs: replace invalidate_inodes() with evict_inodes() Dave Chinner
2024-10-03 7:07 ` Christoph Hellwig
2024-10-03 9:20 ` Jan Kara
2024-10-02 1:33 ` [PATCH 2/7] vfs: add inode iteration superblock method Dave Chinner
2024-10-03 7:12 ` Christoph Hellwig
2024-10-03 10:35 ` Dave Chinner
2024-10-04 9:53 ` kernel test robot
2024-10-02 1:33 ` Dave Chinner [this message]
2024-10-03 7:14 ` [PATCH 3/7] vfs: convert vfs inode iterators to super_iter_inodes_unsafe() Christoph Hellwig
2024-10-03 10:45 ` Dave Chinner
2024-10-04 10:55 ` kernel test robot
2024-10-02 1:33 ` [PATCH 4/7] vfs: Convert sb->s_inodes iteration to super_iter_inodes() Dave Chinner
2024-10-03 7:23 ` lsm sb_delete hook, was " Christoph Hellwig
2024-10-03 7:38 ` Christoph Hellwig
2024-10-03 11:57 ` Jan Kara
2024-10-03 12:11 ` Christoph Hellwig
2024-10-03 12:26 ` Jan Kara
2024-10-03 12:39 ` Christoph Hellwig
2024-10-03 12:56 ` Jan Kara
2024-10-03 13:04 ` Christoph Hellwig
2024-10-03 13:59 ` Dave Chinner
2024-10-03 16:17 ` Jan Kara
2024-10-04 0:46 ` Dave Chinner
2024-10-04 7:21 ` Christian Brauner
2024-10-04 12:14 ` Christoph Hellwig
2024-10-04 13:49 ` Jan Kara
2024-10-04 18:15 ` Paul Moore
2024-10-04 22:57 ` Dave Chinner
2024-10-05 15:21 ` Mickaël Salaün
2024-10-05 16:03 ` Mickaël Salaün
2024-10-05 16:03 ` Paul Moore
2024-10-07 20:37 ` Linus Torvalds
2024-10-07 23:33 ` Dave Chinner
2024-10-08 0:28 ` Linus Torvalds
2024-10-08 0:54 ` Linus Torvalds
2024-10-09 9:49 ` Jan Kara
2024-10-08 12:59 ` Mickaël Salaün
2024-10-09 0:21 ` Dave Chinner
2024-10-09 9:23 ` Mickaël Salaün
2024-10-08 8:57 ` Amir Goldstein
2024-10-08 11:23 ` Jan Kara
2024-10-08 12:16 ` Christian Brauner
2024-10-09 0:03 ` Dave Chinner
2024-10-08 23:44 ` Dave Chinner
2024-10-09 6:10 ` Amir Goldstein
2024-10-09 14:18 ` Jan Kara
2024-10-02 1:33 ` [PATCH 5/7] vfs: add inode iteration superblock method Dave Chinner
2024-10-03 7:24 ` Christoph Hellwig
2024-10-02 1:33 ` [PATCH 6/7] xfs: implement sb->iter_vfs_inodes Dave Chinner
2024-10-03 7:30 ` Christoph Hellwig
2024-10-02 1:33 ` [PATCH 7/7] bcachefs: " Dave Chinner
2024-10-02 10:00 ` [RFC PATCH 0/7] vfs: improving inode cache iteration scalability Christian Brauner
2024-10-02 12:34 ` Dave Chinner
2024-10-02 19:29 ` Kent Overstreet
2024-10-02 22:23 ` Dave Chinner
2024-10-02 23:20 ` Kent Overstreet
2024-10-03 1:41 ` Dave Chinner
2024-10-03 2:24 ` Kent Overstreet
2024-10-03 9:17 ` Jan Kara
2024-10-03 9:59 ` Dave Chinner
2024-10-02 19:49 ` Linus Torvalds
2024-10-02 20:28 ` Kent Overstreet
2024-10-02 23:17 ` Dave Chinner
2024-10-03 1:22 ` Kent Overstreet
2024-10-03 2:20 ` Dave Chinner
2024-10-03 2:42 ` Kent Overstreet
2024-10-03 11:45 ` Jan Kara
2024-10-03 12:18 ` Christoph Hellwig
2024-10-03 12:46 ` Jan Kara
2024-10-03 13:35 ` Dave Chinner
2024-10-03 13:03 ` Dave Chinner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241002014017.3801899-4-david@fromorbit.com \
--to=david@fromorbit.com \
--cc=kent.overstreet@linux.dev \
--cc=linux-bcachefs@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-xfs@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.