* [PATCH 01/15] rcu pathwalk: prevent bogus hard errors from may_lookup()
2023-10-02 2:28 ` Al Viro
@ 2023-10-02 2:29 ` Al Viro
2023-10-02 2:30 ` [PATCH 02/15] exfat: move freeing sbi, upcase table and dropping nls into rcu-delayed helper Al Viro
` (12 subsequent siblings)
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:29 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
If lazy call of ->permission() returns a hard error, check that
try_to_unlazy() succeeds before returning it. That both makes
life easier for ->permission() instances and closes the race
in ENOTDIR handling - it is possible that positive d_can_lookup()
seen in link_path_walk() applies to the state *after* unlink() +
mkdir(), while nd->inode matches the state prior to that.
Normally seeing e.g. EACCES from permission check in rcu pathwalk
means that with some timings non-rcu pathwalk would've run into
the same; however, running into a non-executable regular file
in the middle of a pathname would not get to permission check -
it would fail with ENOTDIR instead.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/namei.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/fs/namei.c b/fs/namei.c
index 567ee547492b..2494561a21fe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1717,7 +1717,11 @@ static inline int may_lookup(struct mnt_idmap *idmap,
{
if (nd->flags & LOOKUP_RCU) {
int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
- if (err != -ECHILD || !try_to_unlazy(nd))
+ if (!err) // success, keep going
+ return 0;
+ if (!try_to_unlazy(nd))
+ return -ECHILD; // redo it all non-lazy
+ if (err != -ECHILD) // hard error
return err;
}
return inode_permission(idmap, nd->inode, MAY_EXEC);
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 02/15] exfat: move freeing sbi, upcase table and dropping nls into rcu-delayed helper
2023-10-02 2:28 ` Al Viro
2023-10-02 2:29 ` [PATCH 01/15] rcu pathwalk: prevent bogus hard errors from may_lookup() Al Viro
@ 2023-10-02 2:30 ` Al Viro
2023-10-02 16:10 ` Linus Torvalds
2023-10-02 2:30 ` [PATCH 03/15] affs: free affs_sb_info with kfree_rcu() Al Viro
` (11 subsequent siblings)
13 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:30 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
That stuff can be accessed by ->d_hash()/->d_compare(); as it is, we have
a hard-to-hit UAF if rcu pathwalk manages to get into ->d_hash() on a filesystem
that is in process of getting shut down.
Besides, having nls and upcase table cleanup moved from ->put_super() towards
the place where sbi is freed makes for simpler failure exits.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/exfat/exfat_fs.h | 1 +
fs/exfat/nls.c | 14 ++++----------
fs/exfat/super.c | 20 +++++++++++---------
3 files changed, 16 insertions(+), 19 deletions(-)
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index f55498e5c23d..22e17b0a66e8 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -273,6 +273,7 @@ struct exfat_sb_info {
spinlock_t inode_hash_lock;
struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
+ struct rcu_head rcu;
};
#define EXFAT_CACHE_VALID 0
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 705710f93e2d..afdf13c34ff5 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb,
unsigned int sect_size = sb->s_blocksize;
unsigned int i, index = 0;
u32 chksum = 0;
- int ret;
unsigned char skip = false;
unsigned short *upcase_table;
@@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb,
if (!bh) {
exfat_err(sb, "failed to read sector(0x%llx)",
(unsigned long long)sector);
- ret = -EIO;
- goto free_table;
+ return -EIO;
}
sector++;
for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) {
@@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb,
exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)",
index, chksum, utbl_checksum);
- ret = -EINVAL;
-free_table:
- exfat_free_upcase_table(sbi);
- return ret;
+ return -EINVAL;
}
static int exfat_load_default_upcase_table(struct super_block *sb)
{
- int i, ret = -EIO;
+ int i;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
unsigned char skip = false;
unsigned short uni = 0, *upcase_table;
@@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb)
return 0;
/* FATAL error: default upcase table has error */
- exfat_free_upcase_table(sbi);
- return ret;
+ return -EIO;
}
int exfat_create_upcase_table(struct super_block *sb)
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 2778bd9b631e..593cfff8c6f4 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb)
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
-
- unload_nls(sbi->nls_io);
- exfat_free_upcase_table(sbi);
}
static int exfat_sync_fs(struct super_block *sb, int wait)
@@ -593,7 +590,7 @@ static int __exfat_fill_super(struct super_block *sb)
ret = exfat_load_bitmap(sb);
if (ret) {
exfat_err(sb, "failed to load alloc-bitmap");
- goto free_upcase_table;
+ goto free_bh;
}
ret = exfat_count_used_clusters(sb, &sbi->used_clusters);
@@ -606,8 +603,6 @@ static int __exfat_fill_super(struct super_block *sb)
free_alloc_bitmap:
exfat_free_bitmap(sbi);
-free_upcase_table:
- exfat_free_upcase_table(sbi);
free_bh:
brelse(sbi->boot_bh);
return ret;
@@ -694,12 +689,10 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_root = NULL;
free_table:
- exfat_free_upcase_table(sbi);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
check_nls_io:
- unload_nls(sbi->nls_io);
return err;
}
@@ -764,13 +757,22 @@ static int exfat_init_fs_context(struct fs_context *fc)
return 0;
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu);
+
+ unload_nls(sbi->nls_io);
+ exfat_free_upcase_table(sbi);
+ exfat_free_sbi(sbi);
+}
+
static void exfat_kill_sb(struct super_block *sb)
{
struct exfat_sb_info *sbi = sb->s_fs_info;
kill_block_super(sb);
if (sbi)
- exfat_free_sbi(sbi);
+ call_rcu(&sbi->rcu, delayed_free);
}
static struct file_system_type exfat_fs_type = {
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* Re: [PATCH 02/15] exfat: move freeing sbi, upcase table and dropping nls into rcu-delayed helper
2023-10-02 2:30 ` [PATCH 02/15] exfat: move freeing sbi, upcase table and dropping nls into rcu-delayed helper Al Viro
@ 2023-10-02 16:10 ` Linus Torvalds
2023-10-02 18:04 ` Al Viro
0 siblings, 1 reply; 38+ messages in thread
From: Linus Torvalds @ 2023-10-02 16:10 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
On Sun, 1 Oct 2023 at 19:30, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> That stuff can be accessed by ->d_hash()/->d_compare(); as it is, we have
> a hard-to-hit UAF if rcu pathwalk manages to get into ->d_hash() on a filesystem
> that is in process of getting shut down.
>
> Besides, having nls and upcase table cleanup moved from ->put_super() towards
> the place where sbi is freed makes for simpler failure exits.
I don't disagree with moving the freeing, but the RCU-delay makes me go "hmm".
Is there some reason why we can't try to do this in generic code? The
umount code already does RCU delays for other things, I get the
feeling that we should have a RCu delay between "put_super" and
"kkill_sb".
Could we move the ->kill_sb() call into destroy_super_work(), which is
already RCU-delayed, for example?
It feels wrong to have the filesystems have to deal with the vfs layer
doing RCU-lookups.
Linus
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 02/15] exfat: move freeing sbi, upcase table and dropping nls into rcu-delayed helper
2023-10-02 16:10 ` Linus Torvalds
@ 2023-10-02 18:04 ` Al Viro
0 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 18:04 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
On Mon, Oct 02, 2023 at 09:10:22AM -0700, Linus Torvalds wrote:
> On Sun, 1 Oct 2023 at 19:30, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > That stuff can be accessed by ->d_hash()/->d_compare(); as it is, we have
> > a hard-to-hit UAF if rcu pathwalk manages to get into ->d_hash() on a filesystem
> > that is in process of getting shut down.
> >
> > Besides, having nls and upcase table cleanup moved from ->put_super() towards
> > the place where sbi is freed makes for simpler failure exits.
>
> I don't disagree with moving the freeing, but the RCU-delay makes me go "hmm".
>
> Is there some reason why we can't try to do this in generic code? The
> umount code already does RCU delays for other things, I get the
> feeling that we should have a RCu delay between "put_super" and
> "kkill_sb".
>
> Could we move the ->kill_sb() call into destroy_super_work(), which is
> already RCU-delayed, for example?
>
> It feels wrong to have the filesystems have to deal with the vfs layer
> doing RCU-lookups.
For one thing, ->kill_sb() might do tons of IO. And we really want
to have that done before umount(2) returns to userland, so that part can't
be offloaded via schedule_work()...
^ permalink raw reply [flat|nested] 38+ messages in thread
* [PATCH 03/15] affs: free affs_sb_info with kfree_rcu()
2023-10-02 2:28 ` Al Viro
2023-10-02 2:29 ` [PATCH 01/15] rcu pathwalk: prevent bogus hard errors from may_lookup() Al Viro
2023-10-02 2:30 ` [PATCH 02/15] exfat: move freeing sbi, upcase table and dropping nls into rcu-delayed helper Al Viro
@ 2023-10-02 2:30 ` Al Viro
2023-10-02 2:31 ` [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info Al Viro
` (10 subsequent siblings)
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:30 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
one of the flags in it is used by ->d_hash()/->d_compare()
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/affs/affs.h | 1 +
fs/affs/super.c | 2 +-
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 60685ec76d98..2e612834329a 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -105,6 +105,7 @@ struct affs_sb_info {
int work_queued; /* non-zero delayed work is queued */
struct delayed_work sb_work; /* superblock flush delayed work */
spinlock_t work_lock; /* protects sb_work and work_queued */
+ struct rcu_head rcu;
};
#define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 58b391446ae1..b56a95cf414a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -640,7 +640,7 @@ static void affs_kill_sb(struct super_block *sb)
affs_brelse(sbi->s_root_bh);
kfree(sbi->s_prefix);
mutex_destroy(&sbi->s_bmlock);
- kfree(sbi);
+ kfree_rcu(sbi, rcu);
}
}
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info
2023-10-02 2:28 ` Al Viro
` (2 preceding siblings ...)
2023-10-02 2:30 ` [PATCH 03/15] affs: free affs_sb_info with kfree_rcu() Al Viro
@ 2023-10-02 2:31 ` Al Viro
2023-10-02 6:49 ` Christoph Hellwig
2023-10-02 2:31 ` [PATCH 05/15] cifs_get_link(): bail out in unsafe case Al Viro
` (9 subsequent siblings)
13 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:31 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
->d_hash() and ->d_compare() use those, so we need to delay freeing
them.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/hfsplus/hfsplus_fs.h | 1 +
fs/hfsplus/super.c | 12 +++++++++---
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7ededcb720c1..012a3d003fbe 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -190,6 +190,7 @@ struct hfsplus_sb_info {
int work_queued; /* non-zero delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
spinlock_t work_lock; /* protects sync_work and work_queued */
+ struct rcu_head rcu;
};
#define HFSPLUS_SB_WRITEBACKUP 0
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1986b4f18a90..97920202790f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb)
spin_unlock(&sbi->work_lock);
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu);
+
+ unload_nls(sbi->nls);
+ kfree(sbi);
+}
+
static void hfsplus_put_super(struct super_block *sb)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
@@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb)
hfs_btree_close(sbi->ext_tree);
kfree(sbi->s_vhdr_buf);
kfree(sbi->s_backup_vhdr_buf);
- unload_nls(sbi->nls);
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
+ call_rcu(&sbi->rcu, delayed_free);
}
static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* Re: [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info
2023-10-02 2:31 ` [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info Al Viro
@ 2023-10-02 6:49 ` Christoph Hellwig
2023-10-02 7:14 ` Al Viro
0 siblings, 1 reply; 38+ messages in thread
From: Christoph Hellwig @ 2023-10-02 6:49 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Amir Goldstein, Trond Myklebust, Bob Peterson,
Steve French, Luis Chamberlain
Instead of all this duplicatio in the file system, can we please just
add a
struct nls_table *s_nls;
to struct super_block and RCU free it in common code and drop all the
code in the file systems?
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info
2023-10-02 6:49 ` Christoph Hellwig
@ 2023-10-02 7:14 ` Al Viro
2023-10-02 7:21 ` Al Viro
0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 7:14 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-fsdevel, Christian Brauner, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
On Mon, Oct 02, 2023 at 08:49:12AM +0200, Christoph Hellwig wrote:
> Instead of all this duplicatio in the file system, can we please just
> add a
>
> struct nls_table *s_nls;
>
> to struct super_block and RCU free it in common code and drop all the
> code in the file systems?
It makes no sense for most of the filesystems, for one thing (note that
any use in ->lookup() does not warrant rcu delays). What's more,
how do you formulate the rules for what goes in that field when filesystem
uses more than one nls_table?
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info
2023-10-02 7:14 ` Al Viro
@ 2023-10-02 7:21 ` Al Viro
2023-10-02 18:09 ` Al Viro
0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 7:21 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-fsdevel, Christian Brauner, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
On Mon, Oct 02, 2023 at 08:14:01AM +0100, Al Viro wrote:
> On Mon, Oct 02, 2023 at 08:49:12AM +0200, Christoph Hellwig wrote:
> > Instead of all this duplicatio in the file system, can we please just
> > add a
> >
> > struct nls_table *s_nls;
> >
> > to struct super_block and RCU free it in common code and drop all the
> > code in the file systems?
>
> It makes no sense for most of the filesystems, for one thing (note that
> any use in ->lookup() does not warrant rcu delays). What's more,
> how do you formulate the rules for what goes in that field when filesystem
> uses more than one nls_table?
Consider e.g. HFS; two separate nls_table (nls_disk, nls_io), neither needs
RCU delays of any sort. On VFAT, for that matter - again, two tables,
one needs RCU delay, another doesn't (both get dropped from the same
helper, so both get it).
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info
2023-10-02 7:21 ` Al Viro
@ 2023-10-02 18:09 ` Al Viro
2023-10-04 19:04 ` Linus Torvalds
0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 18:09 UTC (permalink / raw)
To: Christoph Hellwig
Cc: linux-fsdevel, Christian Brauner, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
On Mon, Oct 02, 2023 at 08:21:43AM +0100, Al Viro wrote:
> On Mon, Oct 02, 2023 at 08:14:01AM +0100, Al Viro wrote:
> > On Mon, Oct 02, 2023 at 08:49:12AM +0200, Christoph Hellwig wrote:
> > > Instead of all this duplicatio in the file system, can we please just
> > > add a
> > >
> > > struct nls_table *s_nls;
> > >
> > > to struct super_block and RCU free it in common code and drop all the
> > > code in the file systems?
> >
> > It makes no sense for most of the filesystems, for one thing (note that
> > any use in ->lookup() does not warrant rcu delays). What's more,
> > how do you formulate the rules for what goes in that field when filesystem
> > uses more than one nls_table?
>
> Consider e.g. HFS; two separate nls_table (nls_disk, nls_io), neither needs
> RCU delays of any sort. On VFAT, for that matter - again, two tables,
> one needs RCU delay, another doesn't (both get dropped from the same
> helper, so both get it).
BTW, is there any reason not to have synchronize_rcu() in delete_module(2),
just before calling ->exit()?
It's not a hot path, unless something really weird is going on, and it
would get rid of the need to delay unload_nls() calls...
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info
2023-10-02 18:09 ` Al Viro
@ 2023-10-04 19:04 ` Linus Torvalds
2023-10-04 19:06 ` Linus Torvalds
0 siblings, 1 reply; 38+ messages in thread
From: Linus Torvalds @ 2023-10-04 19:04 UTC (permalink / raw)
To: Al Viro
Cc: Christoph Hellwig, linux-fsdevel, Christian Brauner, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
On Mon, 2 Oct 2023 at 11:09, Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> BTW, is there any reason not to have synchronize_rcu() in delete_module(2),
> just before calling ->exit()?
>
> It's not a hot path, unless something really weird is going on, and it
> would get rid of the need to delay unload_nls() calls...
We already have one - it's hidden in free_module().
It's done after the kallsyms list removal. Is that too late?
Note that module *loading* actually has a few other synchronize_rcu()
calls in the failure path. In fact, load_module() has *two*
"synchronize_rcu()" calls:
- after ftrace_release_mod(mod)
- before releasing module_mutex
for the failure path, but there's only one for module unloading.
Adding one to after ftrace_release_mod() - where we have that
async_synchronize_full() - would make module unloading match the
loading error path.
But the synchronize_rcu calls do seem to be a bit randomly sprinkled
about. Maybe the one in free_module() is already sufficient for nls?
Linus
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info
2023-10-04 19:04 ` Linus Torvalds
@ 2023-10-04 19:06 ` Linus Torvalds
0 siblings, 0 replies; 38+ messages in thread
From: Linus Torvalds @ 2023-10-04 19:06 UTC (permalink / raw)
To: Al Viro
Cc: Christoph Hellwig, linux-fsdevel, Christian Brauner, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
On Wed, 4 Oct 2023 at 12:04, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> Adding one to after ftrace_release_mod() - where we have that
> async_synchronize_full() - would make module unloading match the
> loading error path.
>
> But the synchronize_rcu calls do seem to be a bit randomly sprinkled
> about. Maybe the one in free_module() is already sufficient for nls?
Never mind. You want it before the ->exit(). That makes sense, and
there is no matching code path for the module load failure case (since
that implies no - or failed - init()).
Linus
^ permalink raw reply [flat|nested] 38+ messages in thread
* [PATCH 05/15] cifs_get_link(): bail out in unsafe case
2023-10-02 2:28 ` Al Viro
` (3 preceding siblings ...)
2023-10-02 2:31 ` [PATCH 04/15] hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info Al Viro
@ 2023-10-02 2:31 ` Al Viro
2023-10-02 2:32 ` [PATCH 06/15] procfs: move dropping pde and pid from ->evict_inode() to ->free_inode() Al Viro
` (8 subsequent siblings)
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:31 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
->d_revalidate() bails out there, anyway. It's not enough
to prevent getting into ->get_link() in RCU mode, but that
could happen only in a very contrieved setup. Not worth
trying to do anything fancy here unless ->d_revalidate()
stops kicking out of RCU mode at least in some cases.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/smb/client/cifsfs.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 22869cda1356..2b044e47a3a6 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1170,6 +1170,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
{
char *target_path;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
target_path = kmalloc(PATH_MAX, GFP_KERNEL);
if (!target_path)
return ERR_PTR(-ENOMEM);
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 06/15] procfs: move dropping pde and pid from ->evict_inode() to ->free_inode()
2023-10-02 2:28 ` Al Viro
` (4 preceding siblings ...)
2023-10-02 2:31 ` [PATCH 05/15] cifs_get_link(): bail out in unsafe case Al Viro
@ 2023-10-02 2:32 ` Al Viro
2023-10-02 2:33 ` [PATCH 07/15] procfs: make freeing proc_fs_info rcu-delayed Al Viro
` (7 subsequent siblings)
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:32 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
that keeps both around until struct inode is freed, making access
to them safe from rcu-pathwalk
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/proc/base.c | 2 --
fs/proc/inode.c | 19 ++++++++-----------
2 files changed, 8 insertions(+), 13 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..8e93b11a0fed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1881,8 +1881,6 @@ void proc_pid_evict_inode(struct proc_inode *ei)
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->lock);
}
-
- put_pid(pid);
}
struct inode *proc_pid_make_inode(struct super_block *sb,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 532dc9d240f7..c83e1a55da42 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -30,7 +30,6 @@
static void proc_evict_inode(struct inode *inode)
{
- struct proc_dir_entry *de;
struct ctl_table_header *head;
struct proc_inode *ei = PROC_I(inode);
@@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode)
clear_inode(inode);
/* Stop tracking associated processes */
- if (ei->pid) {
+ if (ei->pid)
proc_pid_evict_inode(ei);
- ei->pid = NULL;
- }
-
- /* Let go of any associated proc directory entry */
- de = ei->pde;
- if (de) {
- pde_put(de);
- ei->pde = NULL;
- }
head = ei->sysctl;
if (head) {
@@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
static void proc_free_inode(struct inode *inode)
{
+ struct proc_inode *ei = PROC_I(inode);
+
+ if (ei->pid)
+ put_pid(ei->pid);
+ /* Let go of any associated proc directory entry */
+ if (ei->pde)
+ pde_put(ei->pde);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 07/15] procfs: make freeing proc_fs_info rcu-delayed
2023-10-02 2:28 ` Al Viro
` (5 preceding siblings ...)
2023-10-02 2:32 ` [PATCH 06/15] procfs: move dropping pde and pid from ->evict_inode() to ->free_inode() Al Viro
@ 2023-10-02 2:33 ` Al Viro
2023-10-02 2:33 ` [PATCH 08/15] gfs2: fix an oops in gfs2_permission() Al Viro
` (6 subsequent siblings)
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:33 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
makes proc_pid_ns() safe from rcu pathwalk (put_pid_ns()
is still synchronous, but that's not a problem - it does
rcu-delay everything that needs to be)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/proc/root.c | 2 +-
include/linux/proc_fs.h | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 9191248f2dac..d3148e1d883b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb)
kill_anon_super(sb);
put_pid_ns(fs_info->pid_ns);
- kfree(fs_info);
+ kfree_rcu(fs_info, rcu);
}
static struct file_system_type proc_fs_type = {
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index de407e7c3b55..0b2a89854440 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -65,6 +65,7 @@ struct proc_fs_info {
kgid_t pid_gid;
enum proc_hidepid hide_pid;
enum proc_pidonly pidonly;
+ struct rcu_head rcu;
};
static inline struct proc_fs_info *proc_sb_info(struct super_block *sb)
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 08/15] gfs2: fix an oops in gfs2_permission()
2023-10-02 2:28 ` Al Viro
` (6 preceding siblings ...)
2023-10-02 2:33 ` [PATCH 07/15] procfs: make freeing proc_fs_info rcu-delayed Al Viro
@ 2023-10-02 2:33 ` Al Viro
2023-10-02 11:46 ` Bob Peterson
2023-10-02 2:34 ` [PATCH 09/15] nfs: make nfs_set_verifier() safe for use in RCU pathwalk Al Viro
` (5 subsequent siblings)
13 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:33 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
in RCU mode we might race with gfs2_evict_inode(), which zeroes
->i_gl. Freeing of the object it points to is RCU-delayed, so
if we manage to fetch the pointer before it's been replaced with
NULL, we are fine. Check if we'd fetched NULL and treat that
as "bail out and tell the caller to get out of RCU mode".
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/gfs2/inode.c | 6 ++++--
fs/gfs2/super.c | 2 +-
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 0eac04507904..e2432c327599 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1868,14 +1868,16 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
{
struct gfs2_inode *ip;
struct gfs2_holder i_gh;
+ struct gfs2_glock *gl;
int error;
gfs2_holder_mark_uninitialized(&i_gh);
ip = GFS2_I(inode);
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+ gl = rcu_dereference(ip->i_gl);
+ if (!gl || gfs2_glock_is_locked_by_me(gl) == NULL) {
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
if (error)
return error;
}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 02d93da21b2b..0dd5641990b9 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1550,7 +1550,7 @@ static void gfs2_evict_inode(struct inode *inode)
wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put_eventually(ip->i_gl);
- ip->i_gl = NULL;
+ rcu_assign_pointer(ip->i_gl, NULL);
}
}
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* Re: [PATCH 08/15] gfs2: fix an oops in gfs2_permission()
2023-10-02 2:33 ` [PATCH 08/15] gfs2: fix an oops in gfs2_permission() Al Viro
@ 2023-10-02 11:46 ` Bob Peterson
2023-10-02 12:59 ` Al Viro
0 siblings, 1 reply; 38+ messages in thread
From: Bob Peterson @ 2023-10-02 11:46 UTC (permalink / raw)
To: Al Viro, linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Steve French, Luis Chamberlain
On 10/1/23 9:33 PM, Al Viro wrote:
> in RCU mode we might race with gfs2_evict_inode(), which zeroes
> ->i_gl. Freeing of the object it points to is RCU-delayed, so
> if we manage to fetch the pointer before it's been replaced with
> NULL, we are fine. Check if we'd fetched NULL and treat that
> as "bail out and tell the caller to get out of RCU mode".
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
> fs/gfs2/inode.c | 6 ++++--
> fs/gfs2/super.c | 2 +-
> 2 files changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 0eac04507904..e2432c327599 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -1868,14 +1868,16 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
> {
> struct gfs2_inode *ip;
> struct gfs2_holder i_gh;
> + struct gfs2_glock *gl;
> int error;
>
> gfs2_holder_mark_uninitialized(&i_gh);
> ip = GFS2_I(inode);
> - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
> + gl = rcu_dereference(ip->i_gl);
> + if (!gl || gfs2_glock_is_locked_by_me(gl) == NULL) {
This looks wrong. It should be if (gl && ... otherwise the
gfs2_glock_nq_init will dereference the null pointer.
Bob Peterson
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 08/15] gfs2: fix an oops in gfs2_permission()
2023-10-02 11:46 ` Bob Peterson
@ 2023-10-02 12:59 ` Al Viro
2023-10-02 14:16 ` Al Viro
0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 12:59 UTC (permalink / raw)
To: Bob Peterson
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Amir Goldstein, Trond Myklebust, Steve French,
Luis Chamberlain
On Mon, Oct 02, 2023 at 06:46:03AM -0500, Bob Peterson wrote:
> > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > index 0eac04507904..e2432c327599 100644
> > --- a/fs/gfs2/inode.c
> > +++ b/fs/gfs2/inode.c
> > @@ -1868,14 +1868,16 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
> > {
> > struct gfs2_inode *ip;
> > struct gfs2_holder i_gh;
> > + struct gfs2_glock *gl;
> > int error;
> > gfs2_holder_mark_uninitialized(&i_gh);
> > ip = GFS2_I(inode);
> > - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
> > + gl = rcu_dereference(ip->i_gl);
> > + if (!gl || gfs2_glock_is_locked_by_me(gl) == NULL) {
>
> This looks wrong. It should be if (gl && ... otherwise the
> gfs2_glock_nq_init will dereference the null pointer.
We shouldn't observe NULL ->i_gl unless we are in RCU mode,
which means we'll bail out without reaching gfs2_glock_nq_init()...
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 08/15] gfs2: fix an oops in gfs2_permission()
2023-10-02 12:59 ` Al Viro
@ 2023-10-02 14:16 ` Al Viro
2023-10-03 14:46 ` Andreas Grünbacher
0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 14:16 UTC (permalink / raw)
To: Bob Peterson
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Amir Goldstein, Trond Myklebust, Steve French,
Luis Chamberlain
On Mon, Oct 02, 2023 at 01:59:46PM +0100, Al Viro wrote:
> On Mon, Oct 02, 2023 at 06:46:03AM -0500, Bob Peterson wrote:
> > > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > > index 0eac04507904..e2432c327599 100644
> > > --- a/fs/gfs2/inode.c
> > > +++ b/fs/gfs2/inode.c
> > > @@ -1868,14 +1868,16 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
> > > {
> > > struct gfs2_inode *ip;
> > > struct gfs2_holder i_gh;
> > > + struct gfs2_glock *gl;
> > > int error;
> > > gfs2_holder_mark_uninitialized(&i_gh);
> > > ip = GFS2_I(inode);
> > > - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
> > > + gl = rcu_dereference(ip->i_gl);
> > > + if (!gl || gfs2_glock_is_locked_by_me(gl) == NULL) {
> >
> > This looks wrong. It should be if (gl && ... otherwise the
> > gfs2_glock_nq_init will dereference the null pointer.
>
> We shouldn't observe NULL ->i_gl unless we are in RCU mode,
> which means we'll bail out without reaching gfs2_glock_nq_init()...
Something like
if (unlikely(!gl)) {
/* inode is getting torn down, must be RCU mode */
WARN_ON_ONCE(!(mask & MAY_NOT_BLOCK));
return -ECHILD;
}
might be less confusing way to express that...
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 08/15] gfs2: fix an oops in gfs2_permission()
2023-10-02 14:16 ` Al Viro
@ 2023-10-03 14:46 ` Andreas Grünbacher
0 siblings, 0 replies; 38+ messages in thread
From: Andreas Grünbacher @ 2023-10-03 14:46 UTC (permalink / raw)
To: Al Viro
Cc: Bob Peterson, linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Amir Goldstein, Trond Myklebust, Steve French,
Luis Chamberlain
Am Mo., 2. Okt. 2023 um 19:09 Uhr schrieb Al Viro <viro@zeniv.linux.org.uk>:
> On Mon, Oct 02, 2023 at 01:59:46PM +0100, Al Viro wrote:
> > On Mon, Oct 02, 2023 at 06:46:03AM -0500, Bob Peterson wrote:
> > > > diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> > > > index 0eac04507904..e2432c327599 100644
> > > > --- a/fs/gfs2/inode.c
> > > > +++ b/fs/gfs2/inode.c
> > > > @@ -1868,14 +1868,16 @@ int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode,
> > > > {
> > > > struct gfs2_inode *ip;
> > > > struct gfs2_holder i_gh;
> > > > + struct gfs2_glock *gl;
> > > > int error;
> > > > gfs2_holder_mark_uninitialized(&i_gh);
> > > > ip = GFS2_I(inode);
> > > > - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
> > > > + gl = rcu_dereference(ip->i_gl);
> > > > + if (!gl || gfs2_glock_is_locked_by_me(gl) == NULL) {
> > >
> > > This looks wrong. It should be if (gl && ... otherwise the
> > > gfs2_glock_nq_init will dereference the null pointer.
> >
> > We shouldn't observe NULL ->i_gl unless we are in RCU mode,
> > which means we'll bail out without reaching gfs2_glock_nq_init()...
>
> Something like
> if (unlikely(!gl)) {
> /* inode is getting torn down, must be RCU mode */
> WARN_ON_ONCE(!(mask & MAY_NOT_BLOCK));
> return -ECHILD;
> }
> might be less confusing way to express that...
Looking good, thanks. I'll queue it up.
Could you please send such fixes to the filesystem-specific list in
the future (scripts/get_maintainer.pl)?
Thanks,
Andreas
^ permalink raw reply [flat|nested] 38+ messages in thread
* [PATCH 09/15] nfs: make nfs_set_verifier() safe for use in RCU pathwalk
2023-10-02 2:28 ` Al Viro
` (7 preceding siblings ...)
2023-10-02 2:33 ` [PATCH 08/15] gfs2: fix an oops in gfs2_permission() Al Viro
@ 2023-10-02 2:34 ` Al Viro
2023-10-02 2:34 ` [PATCH 10/15] nfs: fix UAF on pathwalk running into umount Al Viro
` (4 subsequent siblings)
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:34 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
nfs_set_verifier() relies upon dentry being pinned; if that's
the case, grabbing ->d_lock stabilizes ->d_parent and guarantees
that ->d_parent points to a positive dentry. For something
we'd run into in RCU mode that is *not* true - dentry might've
been through dentry_kill() just as we grabbed ->d_lock, with
its parent going through the same just as we get to into
nfs_set_verifier_locked(). It might get to detaching inode
(and zeroing ->d_inode) before nfs_set_verifier_locked() gets
to fetching that; we get an oops as the result.
That can happen in nfs{,4} ->d_revalidate(); we check that
parent is positive, but that's done before we get to
nfs_set_verifier() and it's possible for memory pressure
to pick our dentry as eviction candidate by that time.
If that happens, back-to-back attempts to kill dentry and
its parent are quite normal.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/nfs/dir.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e6a51fd94fea..8ffc1f78ba51 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry)
static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
{
struct inode *inode = d_inode(dentry);
- struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *dir = d_inode_rcu(dentry->d_parent);
- if (!nfs_verify_change_attribute(dir, verf))
+ if (!dir || !nfs_verify_change_attribute(dir, verf))
return;
if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
nfs_set_verifier_delegated(&verf);
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 10/15] nfs: fix UAF on pathwalk running into umount
2023-10-02 2:28 ` Al Viro
` (8 preceding siblings ...)
2023-10-02 2:34 ` [PATCH 09/15] nfs: make nfs_set_verifier() safe for use in RCU pathwalk Al Viro
@ 2023-10-02 2:34 ` Al Viro
2023-10-02 2:35 ` [PATCH 11/15] fuse: fix UAF in rcu pathwalks Al Viro
` (3 subsequent siblings)
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:34 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
NFS ->d_revalidate(), ->permission() and ->get_link() need to access
some parts of nfs_server when called in RCU mode:
server->flags
server->caps
*(server->io_stats)
and, worst of all, call
server->nfs_client->rpc_ops->have_delegation
(the last one - as NFS_PROTO(inode)->have_delegation()). We really
don't want to RCU-delay the entire nfs_free_server() (it would have
to be done with schedule_work() from RCU callback, since it can't
be made to run from interrupt context), but actual freeing of
nfs_server and ->io_stats can be done via call_rcu() just fine.
nfs_client part is handled simply by making nfs_free_client() use
kfree_rcu().
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/nfs/client.c | 13 ++++++++++---
include/linux/nfs_fs_sb.h | 2 ++
2 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 44eca51b2808..fbdc9ca80f71 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp)
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
kfree(clp->cl_acceptor);
- kfree(clp);
+ kfree_rcu(clp, rcu);
}
EXPORT_SYMBOL_GPL(nfs_free_client);
@@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void)
}
EXPORT_SYMBOL_GPL(nfs_alloc_server);
+static void delayed_free(struct rcu_head *p)
+{
+ struct nfs_server *server = container_of(p, struct nfs_server, rcu);
+
+ nfs_free_iostats(server->io_stats);
+ kfree(server);
+}
+
/*
* Free up a server record
*/
@@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(&server->lockowner_id);
ida_destroy(&server->openowner_id);
- nfs_free_iostats(server->io_stats);
put_cred(server->cred);
- kfree(server);
nfs_release_automount_timer();
+ call_rcu(&server->rcu, delayed_free);
}
EXPORT_SYMBOL_GPL(nfs_free_server);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index cd628c4b011e..4bd16da65c05 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -124,6 +124,7 @@ struct nfs_client {
char cl_ipaddr[48];
struct net *cl_net;
struct list_head pending_cb_stateids;
+ struct rcu_head rcu;
};
/*
@@ -264,6 +265,7 @@ struct nfs_server {
const struct cred *cred;
bool has_sec_mnt_opts;
struct kobject kobj;
+ struct rcu_head rcu;
};
/* Server capabilities */
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 11/15] fuse: fix UAF in rcu pathwalks
2023-10-02 2:28 ` Al Viro
` (9 preceding siblings ...)
2023-10-02 2:34 ` [PATCH 10/15] nfs: fix UAF on pathwalk running into umount Al Viro
@ 2023-10-02 2:35 ` Al Viro
2023-10-02 2:35 ` [PATCH 12/15] afs: fix __afs_break_callback() / afs_drop_open_mmap() race Al Viro
` (2 subsequent siblings)
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
->permission(), ->get_link() and ->inode_get_acl() might dereference
->s_fs_info (and, in case of ->permission(), ->s_fs_info->fc->user_ns
as well) when called from rcu pathwalk.
Freeing ->s_fs_info->fc is rcu-delayed; we need to make freeing ->s_fs_info
and dropping ->user_ns rcu-delayed too.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/fuse/cuse.c | 3 +--
fs/fuse/fuse_i.h | 1 +
fs/fuse/inode.c | 15 +++++++++++----
3 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 91e89e68177e..b6cad106c37e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -474,8 +474,7 @@ static int cuse_send_init(struct cuse_conn *cc)
static void cuse_fc_release(struct fuse_conn *fc)
{
- struct cuse_conn *cc = fc_to_cc(fc);
- kfree_rcu(cc, fc.rcu);
+ kfree(fc_to_cc(fc));
}
/**
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bf0b85d0b95c..0c45014fbb03 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -873,6 +873,7 @@ struct fuse_mount {
/* Entry on fc->mounts */
struct list_head fc_entry;
+ struct rcu_head rcu;
};
static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e4eb7cf26fb..e13c9312cb55 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -881,6 +881,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
+static void delayed_release(struct rcu_head *p)
+{
+ struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
+
+ put_user_ns(fc->user_ns);
+ fc->release(fc);
+}
+
void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
@@ -892,13 +900,12 @@ void fuse_conn_put(struct fuse_conn *fc)
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
- put_user_ns(fc->user_ns);
bucket = rcu_dereference_protected(fc->curr_bucket, 1);
if (bucket) {
WARN_ON(atomic_read(&bucket->count) != 1);
kfree(bucket);
}
- fc->release(fc);
+ call_rcu(&fc->rcu, delayed_release);
}
}
EXPORT_SYMBOL_GPL(fuse_conn_put);
@@ -1316,7 +1323,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init);
void fuse_free_conn(struct fuse_conn *fc)
{
WARN_ON(!list_empty(&fc->devices));
- kfree_rcu(fc, rcu);
+ kfree(fc);
}
EXPORT_SYMBOL_GPL(fuse_free_conn);
@@ -1833,7 +1840,7 @@ static void fuse_sb_destroy(struct super_block *sb)
void fuse_mount_destroy(struct fuse_mount *fm)
{
fuse_conn_put(fm->fc);
- kfree(fm);
+ kfree_rcu(fm, rcu);
}
EXPORT_SYMBOL(fuse_mount_destroy);
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 12/15] afs: fix __afs_break_callback() / afs_drop_open_mmap() race
2023-10-02 2:28 ` Al Viro
` (10 preceding siblings ...)
2023-10-02 2:35 ` [PATCH 11/15] fuse: fix UAF in rcu pathwalks Al Viro
@ 2023-10-02 2:35 ` Al Viro
2023-10-02 2:36 ` [PATCH 13/15] overlayfs: move freeing ovl_entry past rcu delay Al Viro
2023-10-02 2:52 ` [RFC][PATCHES] fixes in methods exposed to rcu pathwalk Al Viro
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
In __afs_break_callback() we might check ->cb_nr_mmap and if it's non-zero
do queue_work(&vnode->cb_work). In afs_drop_open_mmap() we decrement
->cb_nr_mmap and do flush_work(&vnode->cb_work) if it reaches zero.
The trouble is, there's nothing to prevent __afs_break_callback() from
seeing ->cb_nr_mmap before the decrement and do queue_work() after both
the decrement and flush_work(). If that happens, we might be in trouble -
vnode might get freed before the queued work runs.
__afs_break_callback() is always done under ->cb_lock, so let's make
sure that ->cb_nr_mmap can change from non-zero to zero while holding
->cb_lock (the spinlock component of it - it's a seqlock and we don't
need to mess with the counter).
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/afs/file.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d37dd201752b..0012ea300eb5 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -529,13 +529,17 @@ static void afs_add_open_mmap(struct afs_vnode *vnode)
static void afs_drop_open_mmap(struct afs_vnode *vnode)
{
- if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
+ if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1))
return;
down_write(&vnode->volume->cell->fs_open_mmaps_lock);
- if (atomic_read(&vnode->cb_nr_mmap) == 0)
+ read_seqlock_excl(&vnode->cb_lock);
+ // the only place where ->cb_nr_mmap may hit 0
+ // see __afs_break_callback() for the other side...
+ if (atomic_dec_and_test(&vnode->cb_nr_mmap))
list_del_init(&vnode->cb_mmap_link);
+ read_sequnlock_excl(&vnode->cb_lock);
up_write(&vnode->volume->cell->fs_open_mmaps_lock);
flush_work(&vnode->cb_work);
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 13/15] overlayfs: move freeing ovl_entry past rcu delay
2023-10-02 2:28 ` Al Viro
` (11 preceding siblings ...)
2023-10-02 2:35 ` [PATCH 12/15] afs: fix __afs_break_callback() / afs_drop_open_mmap() race Al Viro
@ 2023-10-02 2:36 ` Al Viro
2023-10-02 2:36 ` [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once Al Viro
2023-10-02 5:51 ` [PATCH 13/15] overlayfs: move freeing ovl_entry past rcu delay Amir Goldstein
2023-10-02 2:52 ` [RFC][PATCHES] fixes in methods exposed to rcu pathwalk Al Viro
13 siblings, 2 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:36 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
... into ->free_inode(), that is.
Fixes: 0af950f57fef "ovl: move ovl_entry into ovl_inode"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/overlayfs/super.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index def266b5e2a3..f09184b865ec 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -167,6 +167,7 @@ static void ovl_free_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
kfree(oi->redirect);
+ kfree(oi->oe);
mutex_destroy(&oi->lock);
kmem_cache_free(ovl_inode_cachep, oi);
}
@@ -176,7 +177,7 @@ static void ovl_destroy_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
dput(oi->__upperdentry);
- ovl_free_entry(oi->oe);
+ ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe));
if (S_ISDIR(inode->i_mode))
ovl_dir_cache_free(inode);
else
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once
2023-10-02 2:36 ` [PATCH 13/15] overlayfs: move freeing ovl_entry past rcu delay Al Viro
@ 2023-10-02 2:36 ` Al Viro
2023-10-02 2:37 ` [PATCH 15/15] overlayfs: make use of ->layers safe in rcu pathwalk Al Viro
2023-10-02 5:47 ` [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once Amir Goldstein
2023-10-02 5:51 ` [PATCH 13/15] overlayfs: move freeing ovl_entry past rcu delay Amir Goldstein
1 sibling, 2 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:36 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
d_inode_rcu() is right - we might be in rcu pathwalk;
however, OVL_E() hides plain d_inode() on the same dentry...
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/overlayfs/super.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index f09184b865ec..905d3aaf4e55 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -104,8 +104,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
static int ovl_dentry_revalidate_common(struct dentry *dentry,
unsigned int flags, bool weak)
{
- struct ovl_entry *oe = OVL_E(dentry);
- struct ovl_path *lowerstack = ovl_lowerstack(oe);
+ struct ovl_entry *oe;
+ struct ovl_path *lowerstack;
struct inode *inode = d_inode_rcu(dentry);
struct dentry *upper;
unsigned int i;
@@ -115,6 +115,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
if (!inode)
return -ECHILD;
+ oe = OVL_I_E(inode);
+ lowerstack = ovl_lowerstack(oe);
upper = ovl_i_dentry_upper(inode);
if (upper)
ret = ovl_revalidate_real(upper, flags, weak);
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH 15/15] overlayfs: make use of ->layers safe in rcu pathwalk
2023-10-02 2:36 ` [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once Al Viro
@ 2023-10-02 2:37 ` Al Viro
2023-10-02 6:40 ` Amir Goldstein
2023-10-02 5:47 ` [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once Amir Goldstein
1 sibling, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:37 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
ovl_permission() accesses ->layers[...].mnt; we can't have ->layers
freed without an RCU delay on fs shutdown. Fortunately, kern_unmount_array()
used to drop those mounts does include an RCU delay, so freeing is
delayed; unfortunately, the array passed to kern_unmount_array() is
formed by mangling ->layers contents and that happens without any
delays.
Use a separate array instead; local if we have a few layers,
kmalloc'ed if there's a lot of them. If allocation fails,
fall back to kern_unmount() for individual mounts; it's
not a fast path by any stretch of imagination.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
fs/overlayfs/ovl_entry.h | 1 -
fs/overlayfs/params.c | 26 ++++++++++++++++++++------
2 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index e9539f98e86a..618b63bb7987 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -30,7 +30,6 @@ struct ovl_sb {
};
struct ovl_layer {
- /* ovl_free_fs() relies on @mnt being the first member! */
struct vfsmount *mnt;
/* Trap in ovl inode cache */
struct inode *trap;
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index b9355bb6d75a..ab594fd407b4 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -738,8 +738,15 @@ int ovl_init_fs_context(struct fs_context *fc)
void ovl_free_fs(struct ovl_fs *ofs)
{
struct vfsmount **mounts;
+ struct vfsmount *m[16];
+ unsigned n = ofs->numlayer;
unsigned i;
+ if (n > 16)
+ mounts = kmalloc_array(n, sizeof(struct mount *), GFP_KERNEL);
+ else
+ mounts = m;
+
iput(ofs->workbasedir_trap);
iput(ofs->indexdir_trap);
iput(ofs->workdir_trap);
@@ -752,14 +759,21 @@ void ovl_free_fs(struct ovl_fs *ofs)
if (ofs->upperdir_locked)
ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
- /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */
- mounts = (struct vfsmount **) ofs->layers;
- for (i = 0; i < ofs->numlayer; i++) {
+ for (i = 0; i < n; i++) {
iput(ofs->layers[i].trap);
- mounts[i] = ofs->layers[i].mnt;
- kfree(ofs->layers[i].name);
+ if (unlikely(!mounts))
+ kern_unmount(ofs->layers[i].mnt);
+ else
+ mounts[i] = ofs->layers[i].mnt;
}
- kern_unmount_array(mounts, ofs->numlayer);
+ if (mounts) {
+ kern_unmount_array(mounts, n);
+ if (mounts != m)
+ kfree(mounts);
+ }
+ // by this point we had an RCU delay from kern_unmount{_array,}()
+ for (i = 0; i < n; i++)
+ kfree(ofs->layers[i].name);
kfree(ofs->layers);
for (i = 0; i < ofs->numfs; i++)
free_anon_bdev(ofs->fs[i].pseudo_dev);
--
2.39.2
^ permalink raw reply related [flat|nested] 38+ messages in thread
* Re: [PATCH 15/15] overlayfs: make use of ->layers safe in rcu pathwalk
2023-10-02 2:37 ` [PATCH 15/15] overlayfs: make use of ->layers safe in rcu pathwalk Al Viro
@ 2023-10-02 6:40 ` Amir Goldstein
2023-10-02 7:23 ` Al Viro
0 siblings, 1 reply; 38+ messages in thread
From: Amir Goldstein @ 2023-10-02 6:40 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Trond Myklebust, Bob Peterson, Steve French,
Luis Chamberlain
On Mon, Oct 2, 2023 at 5:37 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> ovl_permission() accesses ->layers[...].mnt; we can't have ->layers
> freed without an RCU delay on fs shutdown. Fortunately, kern_unmount_array()
> used to drop those mounts does include an RCU delay, so freeing is
> delayed; unfortunately, the array passed to kern_unmount_array() is
> formed by mangling ->layers contents and that happens without any
> delays.
>
> Use a separate array instead; local if we have a few layers,
> kmalloc'ed if there's a lot of them. If allocation fails,
> fall back to kern_unmount() for individual mounts; it's
> not a fast path by any stretch of imagination.
If that is the case, then having 3 different code paths seems
quite an excessive over optimization...
I think there is a better way -
layout the mounts array linearly in ofs->mounts[] to begin with,
remove .mnt out of ovl_layer and use ofs->mounts[layer->idx] to
get to a layer's mount.
I can try to write this patch to see how it ends up looking.
Thanks,
Amir.
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
> fs/overlayfs/ovl_entry.h | 1 -
> fs/overlayfs/params.c | 26 ++++++++++++++++++++------
> 2 files changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
> index e9539f98e86a..618b63bb7987 100644
> --- a/fs/overlayfs/ovl_entry.h
> +++ b/fs/overlayfs/ovl_entry.h
> @@ -30,7 +30,6 @@ struct ovl_sb {
> };
>
> struct ovl_layer {
> - /* ovl_free_fs() relies on @mnt being the first member! */
> struct vfsmount *mnt;
> /* Trap in ovl inode cache */
> struct inode *trap;
> diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
> index b9355bb6d75a..ab594fd407b4 100644
> --- a/fs/overlayfs/params.c
> +++ b/fs/overlayfs/params.c
> @@ -738,8 +738,15 @@ int ovl_init_fs_context(struct fs_context *fc)
> void ovl_free_fs(struct ovl_fs *ofs)
> {
> struct vfsmount **mounts;
> + struct vfsmount *m[16];
> + unsigned n = ofs->numlayer;
> unsigned i;
>
> + if (n > 16)
> + mounts = kmalloc_array(n, sizeof(struct mount *), GFP_KERNEL);
> + else
> + mounts = m;
> +
> iput(ofs->workbasedir_trap);
> iput(ofs->indexdir_trap);
> iput(ofs->workdir_trap);
> @@ -752,14 +759,21 @@ void ovl_free_fs(struct ovl_fs *ofs)
> if (ofs->upperdir_locked)
> ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
>
> - /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */
> - mounts = (struct vfsmount **) ofs->layers;
If we are getting rid of the hack, we should remove the associated
static_assert() statements in ovl_entry.h.
> - for (i = 0; i < ofs->numlayer; i++) {
> + for (i = 0; i < n; i++) {
> iput(ofs->layers[i].trap);
> - mounts[i] = ofs->layers[i].mnt;
> - kfree(ofs->layers[i].name);
> + if (unlikely(!mounts))
> + kern_unmount(ofs->layers[i].mnt);
> + else
> + mounts[i] = ofs->layers[i].mnt;
> }
> - kern_unmount_array(mounts, ofs->numlayer);
> + if (mounts) {
> + kern_unmount_array(mounts, n);
> + if (mounts != m)
> + kfree(mounts);
> + }
> + // by this point we had an RCU delay from kern_unmount{_array,}()
> + for (i = 0; i < n; i++)
> + kfree(ofs->layers[i].name);
> kfree(ofs->layers);
> for (i = 0; i < ofs->numfs; i++)
> free_anon_bdev(ofs->fs[i].pseudo_dev);
> --
> 2.39.2
>
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 15/15] overlayfs: make use of ->layers safe in rcu pathwalk
2023-10-02 6:40 ` Amir Goldstein
@ 2023-10-02 7:23 ` Al Viro
2023-10-02 8:53 ` Amir Goldstein
0 siblings, 1 reply; 38+ messages in thread
From: Al Viro @ 2023-10-02 7:23 UTC (permalink / raw)
To: Amir Goldstein
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Trond Myklebust, Bob Peterson, Steve French,
Luis Chamberlain
On Mon, Oct 02, 2023 at 09:40:15AM +0300, Amir Goldstein wrote:
> On Mon, Oct 2, 2023 at 5:37 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > ovl_permission() accesses ->layers[...].mnt; we can't have ->layers
> > freed without an RCU delay on fs shutdown. Fortunately, kern_unmount_array()
> > used to drop those mounts does include an RCU delay, so freeing is
> > delayed; unfortunately, the array passed to kern_unmount_array() is
> > formed by mangling ->layers contents and that happens without any
> > delays.
> >
> > Use a separate array instead; local if we have a few layers,
> > kmalloc'ed if there's a lot of them. If allocation fails,
> > fall back to kern_unmount() for individual mounts; it's
> > not a fast path by any stretch of imagination.
>
> If that is the case, then having 3 different code paths seems
> quite an excessive over optimization...
>
> I think there is a better way -
> layout the mounts array linearly in ofs->mounts[] to begin with,
> remove .mnt out of ovl_layer and use ofs->mounts[layer->idx] to
> get to a layer's mount.
>
> I can try to write this patch to see how it ends up looking.
Fine by me; about the only problem I see is the cache footprint...
How many layers do you consider the normal use, actually?
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 15/15] overlayfs: make use of ->layers safe in rcu pathwalk
2023-10-02 7:23 ` Al Viro
@ 2023-10-02 8:53 ` Amir Goldstein
2023-10-03 20:47 ` Al Viro
0 siblings, 1 reply; 38+ messages in thread
From: Amir Goldstein @ 2023-10-02 8:53 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Trond Myklebust, Bob Peterson, Steve French,
Luis Chamberlain
On Mon, Oct 2, 2023 at 10:23 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> On Mon, Oct 02, 2023 at 09:40:15AM +0300, Amir Goldstein wrote:
> > On Mon, Oct 2, 2023 at 5:37 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > ovl_permission() accesses ->layers[...].mnt; we can't have ->layers
> > > freed without an RCU delay on fs shutdown. Fortunately, kern_unmount_array()
> > > used to drop those mounts does include an RCU delay, so freeing is
> > > delayed; unfortunately, the array passed to kern_unmount_array() is
> > > formed by mangling ->layers contents and that happens without any
> > > delays.
> > >
> > > Use a separate array instead; local if we have a few layers,
> > > kmalloc'ed if there's a lot of them. If allocation fails,
> > > fall back to kern_unmount() for individual mounts; it's
> > > not a fast path by any stretch of imagination.
> >
> > If that is the case, then having 3 different code paths seems
> > quite an excessive over optimization...
> >
> > I think there is a better way -
> > layout the mounts array linearly in ofs->mounts[] to begin with,
> > remove .mnt out of ovl_layer and use ofs->mounts[layer->idx] to
> > get to a layer's mount.
> >
> > I can try to write this patch to see how it ends up looking.
>
> Fine by me; about the only problem I see is the cache footprint...
I've also considered just allocating the extra space for
ofs->mounts[] at super creation time rather than on super destruction.
I just cannot get myself to be bothered with this cleanup code
because of saving memory of ovl_fs.
However, looking closer, we have a wasfull layer->name pointer,
which is only ever used for ovl_show_options() (to print the original
requested layer path from mount options).
So I am inclined to move these rarely accessed pointers to
ofs->layer_names[], which can be used for the temp array for
kern_unmount_array() because freeing name does not need
RCU delay AFAICT (?).
I will take a swing at it.
> How many layers do you consider the normal use, actually?
Define "normal".
Currently, we have #define OVL_MAX_STACK 500
and being able to create an overlay with many layers was
cited as one of the requirements to drive the move to new mount api:
https://lore.kernel.org/linux-unionfs/20230605-fs-overlayfs-mount_api-v3-0-730d9646b27d@kernel.org/
I am not really familiar with the user workloads that need that many layers
and why. It's obviously not the common case.
Thanks,
Amir.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 15/15] overlayfs: make use of ->layers safe in rcu pathwalk
2023-10-02 8:53 ` Amir Goldstein
@ 2023-10-03 20:47 ` Al Viro
0 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-03 20:47 UTC (permalink / raw)
To: Amir Goldstein
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Trond Myklebust, Bob Peterson, Steve French,
Luis Chamberlain
On Mon, Oct 02, 2023 at 11:53:23AM +0300, Amir Goldstein wrote:
> I've also considered just allocating the extra space for
> ofs->mounts[] at super creation time rather than on super destruction.
> I just cannot get myself to be bothered with this cleanup code
> because of saving memory of ovl_fs.
>
> However, looking closer, we have a wasfull layer->name pointer,
> which is only ever used for ovl_show_options() (to print the original
> requested layer path from mount options).
>
> So I am inclined to move these rarely accessed pointers to
> ofs->layer_names[], which can be used for the temp array for
> kern_unmount_array() because freeing name does not need
> RCU delay AFAICT (?).
AFAICS, it doesn't. The only user after the setup is done is
->show_options(), i.e. show_vfsmnt() and show_mountinfo().
Anyone who tries to use those without making sure that
struct mount is not going to come apart under them will
have far worse troubles...
FWIW, I'm perfectly fine with having these fixes go through your tree;
they are independent from the rest of the series.
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once
2023-10-02 2:36 ` [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once Al Viro
2023-10-02 2:37 ` [PATCH 15/15] overlayfs: make use of ->layers safe in rcu pathwalk Al Viro
@ 2023-10-02 5:47 ` Amir Goldstein
2023-10-02 5:56 ` Amir Goldstein
1 sibling, 1 reply; 38+ messages in thread
From: Amir Goldstein @ 2023-10-02 5:47 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Trond Myklebust, Bob Peterson, Steve French,
Luis Chamberlain
On Mon, Oct 2, 2023 at 5:36 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> d_inode_rcu() is right - we might be in rcu pathwalk;
> however, OVL_E() hides plain d_inode() on the same dentry...
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
However, ovl_lowerstack(oe) does not appear to be stable in RCU walk...
> ---
> fs/overlayfs/super.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> index f09184b865ec..905d3aaf4e55 100644
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -104,8 +104,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
> static int ovl_dentry_revalidate_common(struct dentry *dentry,
> unsigned int flags, bool weak)
> {
> - struct ovl_entry *oe = OVL_E(dentry);
> - struct ovl_path *lowerstack = ovl_lowerstack(oe);
> + struct ovl_entry *oe;
> + struct ovl_path *lowerstack;
> struct inode *inode = d_inode_rcu(dentry);
> struct dentry *upper;
> unsigned int i;
> @@ -115,6 +115,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
> if (!inode)
> return -ECHILD;
>
> + oe = OVL_I_E(inode);
> + lowerstack = ovl_lowerstack(oe);
> upper = ovl_i_dentry_upper(inode);
> if (upper)
> ret = ovl_revalidate_real(upper, flags, weak);
> --
> 2.39.2
>
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once
2023-10-02 5:47 ` [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once Amir Goldstein
@ 2023-10-02 5:56 ` Amir Goldstein
2023-10-02 14:47 ` Amir Goldstein
0 siblings, 1 reply; 38+ messages in thread
From: Amir Goldstein @ 2023-10-02 5:56 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Trond Myklebust, Bob Peterson, Steve French,
Luis Chamberlain
On Mon, Oct 2, 2023 at 8:47 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Oct 2, 2023 at 5:36 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > d_inode_rcu() is right - we might be in rcu pathwalk;
> > however, OVL_E() hides plain d_inode() on the same dentry...
> >
> > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
>
> Reviewed-by: Amir Goldstein <amir73il@gmail.com>
>
> However, ovl_lowerstack(oe) does not appear to be stable in RCU walk...
>
Ah, you fixed that in another patch.
If you are going to be sending this to Linus, please add
Fixes: a6ff2bc0be17 ("ovl: use OVL_E() and OVL_E_FLAGS() accessors")
I was going to send some fixes this week anyway, so I can
pick those through the overlayfs tree if you like.
Thanks,
Amir.
> > ---
> > fs/overlayfs/super.c | 6 ++++--
> > 1 file changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> > index f09184b865ec..905d3aaf4e55 100644
> > --- a/fs/overlayfs/super.c
> > +++ b/fs/overlayfs/super.c
> > @@ -104,8 +104,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
> > static int ovl_dentry_revalidate_common(struct dentry *dentry,
> > unsigned int flags, bool weak)
> > {
> > - struct ovl_entry *oe = OVL_E(dentry);
> > - struct ovl_path *lowerstack = ovl_lowerstack(oe);
> > + struct ovl_entry *oe;
> > + struct ovl_path *lowerstack;
> > struct inode *inode = d_inode_rcu(dentry);
> > struct dentry *upper;
> > unsigned int i;
> > @@ -115,6 +115,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
> > if (!inode)
> > return -ECHILD;
> >
> > + oe = OVL_I_E(inode);
> > + lowerstack = ovl_lowerstack(oe);
> > upper = ovl_i_dentry_upper(inode);
> > if (upper)
> > ret = ovl_revalidate_real(upper, flags, weak);
> > --
> > 2.39.2
> >
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once
2023-10-02 5:56 ` Amir Goldstein
@ 2023-10-02 14:47 ` Amir Goldstein
0 siblings, 0 replies; 38+ messages in thread
From: Amir Goldstein @ 2023-10-02 14:47 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Trond Myklebust, Bob Peterson, Steve French,
Luis Chamberlain
On Mon, Oct 2, 2023 at 8:56 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Mon, Oct 2, 2023 at 8:47 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Mon, Oct 2, 2023 at 5:36 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
> > >
> > > d_inode_rcu() is right - we might be in rcu pathwalk;
> > > however, OVL_E() hides plain d_inode() on the same dentry...
> > >
> > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> >
> > Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> >
> > However, ovl_lowerstack(oe) does not appear to be stable in RCU walk...
> >
>
> Ah, you fixed that in another patch.
> If you are going to be sending this to Linus, please add
> Fixes: a6ff2bc0be17 ("ovl: use OVL_E() and OVL_E_FLAGS() accessors")
>
> I was going to send some fixes this week anyway, so I can
> pick those through the overlayfs tree if you like.
>
Al,
From all your series, the two ovl fixes are for rather new regressions (v6.5)
so I queued your two regression fixes (13,14) and my own version of patch 15
to go into linux-next via overlayfs tree [1] and I will send them to Linus later
this week, so they can make their way to 6.5.y.
Thanks,
Amir.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs.git/log/?h=ovl-fixes
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH 13/15] overlayfs: move freeing ovl_entry past rcu delay
2023-10-02 2:36 ` [PATCH 13/15] overlayfs: move freeing ovl_entry past rcu delay Al Viro
2023-10-02 2:36 ` [PATCH 14/15] ovl_dentry_revalidate_common(): fetch inode once Al Viro
@ 2023-10-02 5:51 ` Amir Goldstein
1 sibling, 0 replies; 38+ messages in thread
From: Amir Goldstein @ 2023-10-02 5:51 UTC (permalink / raw)
To: Al Viro
Cc: linux-fsdevel, Christian Brauner, Christoph Hellwig,
Linus Torvalds, Namjae Jeon, David Sterba, David Howells,
Miklos Szeredi, Trond Myklebust, Bob Peterson, Steve French,
Luis Chamberlain
On Mon, Oct 2, 2023 at 5:36 AM Al Viro <viro@zeniv.linux.org.uk> wrote:
>
> ... into ->free_inode(), that is.
>
> Fixes: 0af950f57fef "ovl: move ovl_entry into ovl_inode"
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
> ---
> fs/overlayfs/super.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
> index def266b5e2a3..f09184b865ec 100644
> --- a/fs/overlayfs/super.c
> +++ b/fs/overlayfs/super.c
> @@ -167,6 +167,7 @@ static void ovl_free_inode(struct inode *inode)
> struct ovl_inode *oi = OVL_I(inode);
>
> kfree(oi->redirect);
> + kfree(oi->oe);
> mutex_destroy(&oi->lock);
> kmem_cache_free(ovl_inode_cachep, oi);
> }
> @@ -176,7 +177,7 @@ static void ovl_destroy_inode(struct inode *inode)
> struct ovl_inode *oi = OVL_I(inode);
>
> dput(oi->__upperdentry);
> - ovl_free_entry(oi->oe);
> + ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe));
> if (S_ISDIR(inode->i_mode))
> ovl_dir_cache_free(inode);
> else
> --
> 2.39.2
>
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [RFC][PATCHES] fixes in methods exposed to rcu pathwalk
2023-10-02 2:28 ` Al Viro
` (12 preceding siblings ...)
2023-10-02 2:36 ` [PATCH 13/15] overlayfs: move freeing ovl_entry past rcu delay Al Viro
@ 2023-10-02 2:52 ` Al Viro
13 siblings, 0 replies; 38+ messages in thread
From: Al Viro @ 2023-10-02 2:52 UTC (permalink / raw)
To: linux-fsdevel
Cc: Christian Brauner, Christoph Hellwig, Linus Torvalds, Namjae Jeon,
David Sterba, David Howells, Miklos Szeredi, Amir Goldstein,
Trond Myklebust, Bob Peterson, Steve French, Luis Chamberlain
PS: ceph ->d_revalidate() uses ->s_fs_info and stuff hanging off it
(mdsc) and I'm not familiar enough with the codebase to do anything
useful with it; AFAICS, there are UAF on fs shutdown, but I'd rather
leave that one to ceph folks. NTFS3 ->d_hash() and ->d_compare()
do blocking allocations, and that stuff can be called under
rcu_read_lock(); ->d_compare() pretty much always is called so...
^ permalink raw reply [flat|nested] 38+ messages in thread