* [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 8:47 [PATCH 0/2] get rid of PF_MEMALLOC_NORECLAIM Michal Hocko
@ 2024-08-26 8:47 ` Michal Hocko
2024-08-26 13:11 ` Matthew Wilcox
` (5 more replies)
0 siblings, 6 replies; 51+ messages in thread
From: Michal Hocko @ 2024-08-26 8:47 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, Yafang Shao, Kent Overstreet, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
From: Michal Hocko <mhocko@suse.com>
bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
inode to achieve GFP_NOWAIT semantic while holding locks. If this
allocation fails it will drop locks and use GFP_NOFS allocation context.
We would like to drop PF_MEMALLOC_NORECLAIM because it is really
dangerous to use if the caller doesn't control the full call chain with
this flag set. E.g. if any of the function down the chain needed
GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
cause unexpected failure.
While this is not the case in this particular case using the scoped gfp
semantic is not really needed bacause we can easily pus the allocation
context down the chain without too much clutter.
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
fs/bcachefs/fs.c | 14 ++++++--------
fs/inode.c | 6 +++---
include/linux/fs.h | 7 ++++++-
include/linux/lsm_hooks.h | 2 +-
include/linux/security.h | 4 ++--
security/security.c | 8 ++++----
6 files changed, 22 insertions(+), 19 deletions(-)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 15fc41e63b6c..7a55167b9133 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -231,9 +231,9 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
BUG();
}
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
+static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
- struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+ struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, gfp);
if (!inode)
return NULL;
@@ -245,7 +245,7 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
mutex_init(&inode->ei_quota_lock);
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
- if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
+ if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v), gfp)) {
kmem_cache_free(bch2_inode_cache, inode);
return NULL;
}
@@ -258,12 +258,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
*/
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
- struct bch_inode_info *inode =
- memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
- __bch2_new_inode(trans->c));
+ struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWARN | GFP_NOWAIT);
if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
+ int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
if (ret && inode) {
__destroy_inode(&inode->v);
kmem_cache_free(bch2_inode_cache, inode);
@@ -328,7 +326,7 @@ __bch2_create(struct mnt_idmap *idmap,
if (ret)
return ERR_PTR(ret);
#endif
- inode = __bch2_new_inode(c);
+ inode = __bch2_new_inode(c, GFP_NOFS);
if (unlikely(!inode)) {
inode = ERR_PTR(-ENOMEM);
goto err;
diff --git a/fs/inode.c b/fs/inode.c
index 86670941884b..95fd67a6cac3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -153,7 +153,7 @@ static int no_open(struct inode *inode, struct file *file)
* These are initializations that need to be done on every inode
* allocation as the fields are not initialised by slab allocation.
*/
-int inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
static const struct inode_operations empty_iops;
static const struct file_operations no_open_fops = {.open = no_open};
@@ -230,14 +230,14 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
#endif
inode->i_flctx = NULL;
- if (unlikely(security_inode_alloc(inode)))
+ if (unlikely(security_inode_alloc(inode, gfp)))
return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
}
-EXPORT_SYMBOL(inode_init_always);
+EXPORT_SYMBOL(inode_init_always_gfp);
void free_inode_nonrcu(struct inode *inode)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..d46ca71a7855 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3027,7 +3027,12 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
-extern int inode_init_always(struct super_block *, struct inode *);
+extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
+static inline int inode_init_always(struct super_block *sb, struct inode *inode)
+{
+ return inode_init_always_gfp(sb, inode, GFP_NOFS);
+}
+
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a2ade0ffe9e7..b08472d64765 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -150,6 +150,6 @@ extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
__used __section(".early_lsm_info.init") \
__aligned(sizeof(unsigned long))
-extern int lsm_inode_alloc(struct inode *inode);
+extern int lsm_inode_alloc(struct inode *inode, gfp_t gfp);
#endif /* ! __LINUX_LSM_HOOKS_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index 1390f1efb4f0..7c6b9b038a0d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -336,7 +336,7 @@ int security_dentry_create_files_as(struct dentry *dentry, int mode,
struct cred *new);
int security_path_notify(const struct path *path, u64 mask,
unsigned int obj_type);
-int security_inode_alloc(struct inode *inode);
+int security_inode_alloc(struct inode *inode, gfp_t gfp);
void security_inode_free(struct inode *inode);
int security_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr,
@@ -769,7 +769,7 @@ static inline int security_path_notify(const struct path *path, u64 mask,
return 0;
}
-static inline int security_inode_alloc(struct inode *inode)
+static inline int security_inode_alloc(struct inode *inode, gfp_t gfp)
{
return 0;
}
diff --git a/security/security.c b/security/security.c
index 8cee5b6c6e6d..3581262da5ee 100644
--- a/security/security.c
+++ b/security/security.c
@@ -660,14 +660,14 @@ static int lsm_file_alloc(struct file *file)
*
* Returns 0, or -ENOMEM if memory can't be allocated.
*/
-int lsm_inode_alloc(struct inode *inode)
+int lsm_inode_alloc(struct inode *inode, gfp_t gfp)
{
if (!lsm_inode_cache) {
inode->i_security = NULL;
return 0;
}
- inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
+ inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp);
if (inode->i_security == NULL)
return -ENOMEM;
return 0;
@@ -1582,9 +1582,9 @@ int security_path_notify(const struct path *path, u64 mask,
*
* Return: Return 0 if operation was successful.
*/
-int security_inode_alloc(struct inode *inode)
+int security_inode_alloc(struct inode *inode, gfp_t gfp)
{
- int rc = lsm_inode_alloc(inode);
+ int rc = lsm_inode_alloc(inode, gfp);
if (unlikely(rc))
return rc;
--
2.46.0
^ permalink raw reply related [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 8:47 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
@ 2024-08-26 13:11 ` Matthew Wilcox
2024-08-26 16:48 ` Michal Hocko
2024-08-26 19:39 ` Kent Overstreet
` (4 subsequent siblings)
5 siblings, 1 reply; 51+ messages in thread
From: Matthew Wilcox @ 2024-08-26 13:11 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, Kent Overstreet,
jack, Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
On Mon, Aug 26, 2024 at 10:47:12AM +0200, Michal Hocko wrote:
> @@ -258,12 +258,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
> */
> static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
> {
> - struct bch_inode_info *inode =
> - memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
> - __bch2_new_inode(trans->c));
> + struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWARN | GFP_NOWAIT);
GFP_NOWAIT include GFP_NOWARN these days (since 16f5dfbc851b)
> +++ b/fs/inode.c
> @@ -153,7 +153,7 @@ static int no_open(struct inode *inode, struct file *file)
> * These are initializations that need to be done on every inode
> * allocation as the fields are not initialised by slab allocation.
> */
> -int inode_init_always(struct super_block *sb, struct inode *inode)
> +int inode_init_always(struct super_block *sb, struct inode *inode, gfp_t gfp)
Did you send the right version of this patch? There should be a "_gfp"
appended to this function name.
> +++ b/include/linux/fs.h
> @@ -3027,7 +3027,12 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
>
> extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
>
> -extern int inode_init_always(struct super_block *, struct inode *);
> +extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
You can drop the "extern" while you're changing this line.
> +static inline int inode_init_always(struct super_block *sb, struct inode *inode)
> +{
> + return inode_init_always_gfp(sb, inode, GFP_NOFS);
> +}
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 13:11 ` Matthew Wilcox
@ 2024-08-26 16:48 ` Michal Hocko
0 siblings, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2024-08-26 16:48 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, Kent Overstreet,
jack, Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Mon 26-08-24 14:11:39, Matthew Wilcox wrote:
> On Mon, Aug 26, 2024 at 10:47:12AM +0200, Michal Hocko wrote:
> > @@ -258,12 +258,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
> > */
> > static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
> > {
> > - struct bch_inode_info *inode =
> > - memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
> > - __bch2_new_inode(trans->c));
> > + struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWARN | GFP_NOWAIT);
>
> GFP_NOWAIT include GFP_NOWARN these days (since 16f5dfbc851b)
Ohh, I was not aware of that. I will drop NOWARN then.
> > +++ b/fs/inode.c
> > @@ -153,7 +153,7 @@ static int no_open(struct inode *inode, struct file *file)
> > * These are initializations that need to be done on every inode
> > * allocation as the fields are not initialised by slab allocation.
> > */
> > -int inode_init_always(struct super_block *sb, struct inode *inode)
> > +int inode_init_always(struct super_block *sb, struct inode *inode, gfp_t gfp)
>
> Did you send the right version of this patch? There should be a "_gfp"
> appended to this function name.
yes, screw up on my end.
> > +++ b/include/linux/fs.h
> > @@ -3027,7 +3027,12 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
> >
> > extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
> >
> > -extern int inode_init_always(struct super_block *, struct inode *);
> > +extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
>
> You can drop the "extern" while you're changing this line.
OK, I can. I just kept the usual style in this file.
Thanks!
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 8:47 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
2024-08-26 13:11 ` Matthew Wilcox
@ 2024-08-26 19:39 ` Kent Overstreet
2024-08-26 19:41 ` Matthew Wilcox
2024-08-26 19:58 ` Michal Hocko
2024-08-26 19:52 ` kernel test robot
` (3 subsequent siblings)
5 siblings, 2 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-08-26 19:39 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
On Mon, Aug 26, 2024 at 10:47:12AM GMT, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
>
> bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
> inode to achieve GFP_NOWAIT semantic while holding locks. If this
> allocation fails it will drop locks and use GFP_NOFS allocation context.
>
> We would like to drop PF_MEMALLOC_NORECLAIM because it is really
> dangerous to use if the caller doesn't control the full call chain with
> this flag set. E.g. if any of the function down the chain needed
> GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
> cause unexpected failure.
>
> While this is not the case in this particular case using the scoped gfp
> semantic is not really needed bacause we can easily pus the allocation
> context down the chain without too much clutter.
yeah, eesh, nack.
Given the amount of plumbing required here, it's clear that passing gfp
flags is the less safe way of doing it, and this really does belong in
the allocation context.
Failure to pass gfp flags correctly (which we know is something that
happens today, e.g. vmalloc -> pte allocation) means you're introducing
a deadlock.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 19:39 ` Kent Overstreet
@ 2024-08-26 19:41 ` Matthew Wilcox
2024-08-26 19:42 ` Kent Overstreet
2024-08-26 19:44 ` Kent Overstreet
2024-08-26 19:58 ` Michal Hocko
1 sibling, 2 replies; 51+ messages in thread
From: Matthew Wilcox @ 2024-08-26 19:41 UTC (permalink / raw)
To: Kent Overstreet
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
On Mon, Aug 26, 2024 at 03:39:47PM -0400, Kent Overstreet wrote:
> Given the amount of plumbing required here, it's clear that passing gfp
> flags is the less safe way of doing it, and this really does belong in
> the allocation context.
>
> Failure to pass gfp flags correctly (which we know is something that
> happens today, e.g. vmalloc -> pte allocation) means you're introducing
> a deadlock.
The problem with vmalloc is that the page table allocation _doesn't_
take a GFP parameter.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 19:41 ` Matthew Wilcox
@ 2024-08-26 19:42 ` Kent Overstreet
2024-08-26 19:47 ` Matthew Wilcox
2024-08-26 19:44 ` Kent Overstreet
1 sibling, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-08-26 19:42 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
On Mon, Aug 26, 2024 at 08:41:42PM GMT, Matthew Wilcox wrote:
> On Mon, Aug 26, 2024 at 03:39:47PM -0400, Kent Overstreet wrote:
> > Given the amount of plumbing required here, it's clear that passing gfp
> > flags is the less safe way of doing it, and this really does belong in
> > the allocation context.
> >
> > Failure to pass gfp flags correctly (which we know is something that
> > happens today, e.g. vmalloc -> pte allocation) means you're introducing
> > a deadlock.
>
> The problem with vmalloc is that the page table allocation _doesn't_
> take a GFP parameter.
yeah, I know. I posted patches to plumb it through, which were nacked by
Linus.
And we're trying to get away from passing gfp flags directly, are we
not? I just don't buy the GFP_NOFAIL unsafety argument.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 19:41 ` Matthew Wilcox
2024-08-26 19:42 ` Kent Overstreet
@ 2024-08-26 19:44 ` Kent Overstreet
1 sibling, 0 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-08-26 19:44 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
On Mon, Aug 26, 2024 at 08:41:42PM GMT, Matthew Wilcox wrote:
> On Mon, Aug 26, 2024 at 03:39:47PM -0400, Kent Overstreet wrote:
> > Given the amount of plumbing required here, it's clear that passing gfp
> > flags is the less safe way of doing it, and this really does belong in
> > the allocation context.
> >
> > Failure to pass gfp flags correctly (which we know is something that
> > happens today, e.g. vmalloc -> pte allocation) means you're introducing
> > a deadlock.
>
> The problem with vmalloc is that the page table allocation _doesn't_
> take a GFP parameter.
and given the increasing usage of kvmalloc() this is something we really
should be thinking about
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 19:42 ` Kent Overstreet
@ 2024-08-26 19:47 ` Matthew Wilcox
2024-08-26 19:54 ` Kent Overstreet
0 siblings, 1 reply; 51+ messages in thread
From: Matthew Wilcox @ 2024-08-26 19:47 UTC (permalink / raw)
To: Kent Overstreet
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
On Mon, Aug 26, 2024 at 03:42:59PM -0400, Kent Overstreet wrote:
> On Mon, Aug 26, 2024 at 08:41:42PM GMT, Matthew Wilcox wrote:
> > On Mon, Aug 26, 2024 at 03:39:47PM -0400, Kent Overstreet wrote:
> > > Given the amount of plumbing required here, it's clear that passing gfp
> > > flags is the less safe way of doing it, and this really does belong in
> > > the allocation context.
> > >
> > > Failure to pass gfp flags correctly (which we know is something that
> > > happens today, e.g. vmalloc -> pte allocation) means you're introducing
> > > a deadlock.
> >
> > The problem with vmalloc is that the page table allocation _doesn't_
> > take a GFP parameter.
>
> yeah, I know. I posted patches to plumb it through, which were nacked by
> Linus.
>
> And we're trying to get away from passing gfp flags directly, are we
> not? I just don't buy the GFP_NOFAIL unsafety argument.
The problem with the giant invasive change of "getting away from passing
GFP flags directly" is that you need to build consensus for what it
looks like and convince everyone that you have a solution that solves
all the problems, or at least doesn't make any of those problems worse.
You haven't done that, you've just committed code that the MM people hate
(indeed already rejected), and set back the idea.
Look, it's not your job to fix it, but if you want to do it, do it
properly.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 8:47 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
2024-08-26 13:11 ` Matthew Wilcox
2024-08-26 19:39 ` Kent Overstreet
@ 2024-08-26 19:52 ` kernel test robot
2024-08-26 20:53 ` kernel test robot
` (2 subsequent siblings)
5 siblings, 0 replies; 51+ messages in thread
From: kernel test robot @ 2024-08-26 19:52 UTC (permalink / raw)
To: Michal Hocko, Andrew Morton
Cc: oe-kbuild-all, Linux Memory Management List, Christoph Hellwig,
Yafang Shao, Kent Overstreet, jack, Christian Brauner,
Alexander Viro, Paul Moore, James Morris, Serge E. Hallyn,
linux-fsdevel, linux-bcachefs, linux-security-module,
linux-kernel, Michal Hocko
Hi Michal,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on tip/sched/core brauner-vfs/vfs.all linus/master v6.11-rc5]
[cannot apply to next-20240826]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Michal-Hocko/bcachefs-do-not-use-PF_MEMALLOC_NORECLAIM/20240826-171013
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20240826085347.1152675-2-mhocko%40kernel.org
patch subject: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
config: arc-randconfig-001-20240827 (https://download.01.org/0day-ci/archive/20240827/202408270304.AUPHM0xo-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240827/202408270304.AUPHM0xo-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408270304.AUPHM0xo-lkp@intel.com/
All errors (new ones prefixed by >>):
fs/bcachefs/fs.c: In function '__bch2_new_inode':
>> fs/bcachefs/fs.c:248:70: error: macro "unlikely" passed 2 arguments, but takes just 1
248 | if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v), gfp)) {
| ^
In file included from include/linux/build_bug.h:5,
from include/linux/container_of.h:5,
from include/linux/list.h:5,
from include/linux/backing-dev-defs.h:5,
from fs/bcachefs/bcachefs.h:186,
from fs/bcachefs/fs.c:4:
include/linux/compiler.h:77: note: macro "unlikely" defined here
77 | # define unlikely(x) __builtin_expect(!!(x), 0)
|
>> fs/bcachefs/fs.c:248:13: error: 'unlikely' undeclared (first use in this function)
248 | if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v), gfp)) {
| ^~~~~~~~
fs/bcachefs/fs.c:248:13: note: each undeclared identifier is reported only once for each function it appears in
fs/bcachefs/fs.c: In function 'bch2_new_inode':
>> fs/bcachefs/fs.c:261:67: error: 'GFP_NOWARN' undeclared (first use in this function); did you mean 'GFP_NOWAIT'?
261 | struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWARN | GFP_NOWAIT);
| ^~~~~~~~~~
| GFP_NOWAIT
vim +/unlikely +248 fs/bcachefs/fs.c
233
234 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
235 {
236 struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, gfp);
237 if (!inode)
238 return NULL;
239
240 inode_init_once(&inode->v);
241 mutex_init(&inode->ei_update_lock);
242 two_state_lock_init(&inode->ei_pagecache_lock);
243 INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
244 inode->ei_flags = 0;
245 mutex_init(&inode->ei_quota_lock);
246 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
247
> 248 if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v), gfp)) {
249 kmem_cache_free(bch2_inode_cache, inode);
250 return NULL;
251 }
252
253 return inode;
254 }
255
256 /*
257 * Allocate a new inode, dropping/retaking btree locks if necessary:
258 */
259 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
260 {
> 261 struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWARN | GFP_NOWAIT);
262
263 if (unlikely(!inode)) {
264 int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
265 if (ret && inode) {
266 __destroy_inode(&inode->v);
267 kmem_cache_free(bch2_inode_cache, inode);
268 }
269 if (ret)
270 return ERR_PTR(ret);
271 }
272
273 return inode;
274 }
275
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 19:47 ` Matthew Wilcox
@ 2024-08-26 19:54 ` Kent Overstreet
0 siblings, 0 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-08-26 19:54 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
On Mon, Aug 26, 2024 at 08:47:03PM GMT, Matthew Wilcox wrote:
> On Mon, Aug 26, 2024 at 03:42:59PM -0400, Kent Overstreet wrote:
> > On Mon, Aug 26, 2024 at 08:41:42PM GMT, Matthew Wilcox wrote:
> > > On Mon, Aug 26, 2024 at 03:39:47PM -0400, Kent Overstreet wrote:
> > > > Given the amount of plumbing required here, it's clear that passing gfp
> > > > flags is the less safe way of doing it, and this really does belong in
> > > > the allocation context.
> > > >
> > > > Failure to pass gfp flags correctly (which we know is something that
> > > > happens today, e.g. vmalloc -> pte allocation) means you're introducing
> > > > a deadlock.
> > >
> > > The problem with vmalloc is that the page table allocation _doesn't_
> > > take a GFP parameter.
> >
> > yeah, I know. I posted patches to plumb it through, which were nacked by
> > Linus.
> >
> > And we're trying to get away from passing gfp flags directly, are we
> > not? I just don't buy the GFP_NOFAIL unsafety argument.
>
> The problem with the giant invasive change of "getting away from passing
> GFP flags directly" is that you need to build consensus for what it
> looks like and convince everyone that you have a solution that solves
> all the problems, or at least doesn't make any of those problems worse.
> You haven't done that, you've just committed code that the MM people hate
> (indeed already rejected), and set back the idea.
Err, what? I'm not the one who originated the idae of getting away from
passing gfp flags directly. That's been the consensus for _awhile_;
you've been talking about it, Linus talked about it re: vmalloc pte
allocation.
So this looks to me like the mm people just trying to avoid having to
deal with that. GFP_NOFAIL conflicting with other memalloc/gfp flags is
not a new issue (you really shouldn't be combining GFP_NOFAIL with
GFP_NOFS or GFP_NOIO!), and it's something we have warnings for.
But the issue of gfp flags not being passed correctly is not something
we can check for at runtime with any reliability - which means this
patchset look like a net negative to me.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 19:39 ` Kent Overstreet
2024-08-26 19:41 ` Matthew Wilcox
@ 2024-08-26 19:58 ` Michal Hocko
2024-08-26 20:00 ` Kent Overstreet
1 sibling, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-08-26 19:58 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Mon 26-08-24 15:39:47, Kent Overstreet wrote:
> On Mon, Aug 26, 2024 at 10:47:12AM GMT, Michal Hocko wrote:
> > From: Michal Hocko <mhocko@suse.com>
> >
> > bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
> > inode to achieve GFP_NOWAIT semantic while holding locks. If this
> > allocation fails it will drop locks and use GFP_NOFS allocation context.
> >
> > We would like to drop PF_MEMALLOC_NORECLAIM because it is really
> > dangerous to use if the caller doesn't control the full call chain with
> > this flag set. E.g. if any of the function down the chain needed
> > GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
> > cause unexpected failure.
> >
> > While this is not the case in this particular case using the scoped gfp
> > semantic is not really needed bacause we can easily pus the allocation
> > context down the chain without too much clutter.
>
> yeah, eesh, nack.
Sure, you can NAK this but then deal with the lack of the PF flag by
other means. We have made it clear that PF_MEMALLOC_NORECLAIM is not we
are going to support at the MM level.
I have done your homework and shown that it is really easy
to use gfp flags directly. The net result is passing gfp flag down to
two functions. Sure part of it is ugglier by having several different
callbacks implementing it but still manageable. Without too much churn.
So do whatever you like in the bcache code but do not rely on something
that is unsupported by the MM layer which you have sneaked in without an
agreement.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 19:58 ` Michal Hocko
@ 2024-08-26 20:00 ` Kent Overstreet
2024-08-26 20:27 ` Michal Hocko
0 siblings, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-08-26 20:00 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Mon, Aug 26, 2024 at 09:58:08PM GMT, Michal Hocko wrote:
> On Mon 26-08-24 15:39:47, Kent Overstreet wrote:
> > On Mon, Aug 26, 2024 at 10:47:12AM GMT, Michal Hocko wrote:
> > > From: Michal Hocko <mhocko@suse.com>
> > >
> > > bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
> > > inode to achieve GFP_NOWAIT semantic while holding locks. If this
> > > allocation fails it will drop locks and use GFP_NOFS allocation context.
> > >
> > > We would like to drop PF_MEMALLOC_NORECLAIM because it is really
> > > dangerous to use if the caller doesn't control the full call chain with
> > > this flag set. E.g. if any of the function down the chain needed
> > > GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
> > > cause unexpected failure.
> > >
> > > While this is not the case in this particular case using the scoped gfp
> > > semantic is not really needed bacause we can easily pus the allocation
> > > context down the chain without too much clutter.
> >
> > yeah, eesh, nack.
>
> Sure, you can NAK this but then deal with the lack of the PF flag by
> other means. We have made it clear that PF_MEMALLOC_NORECLAIM is not we
> are going to support at the MM level.
>
> I have done your homework and shown that it is really easy
> to use gfp flags directly. The net result is passing gfp flag down to
> two functions. Sure part of it is ugglier by having several different
> callbacks implementing it but still manageable. Without too much churn.
>
> So do whatever you like in the bcache code but do not rely on something
> that is unsupported by the MM layer which you have sneaked in without an
> agreement.
Michal, you're being damned hostile, while posting code you haven't even
tried to compile. Seriously, dude?
How about sticking to the technical issues at hand instead of saying
"this is mm, so my way or the highway?". We're all kernel developers
here, this is not what we do.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 20:00 ` Kent Overstreet
@ 2024-08-26 20:27 ` Michal Hocko
2024-08-26 20:43 ` Kent Overstreet
0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-08-26 20:27 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Mon 26-08-24 16:00:56, Kent Overstreet wrote:
> On Mon, Aug 26, 2024 at 09:58:08PM GMT, Michal Hocko wrote:
> > On Mon 26-08-24 15:39:47, Kent Overstreet wrote:
> > > On Mon, Aug 26, 2024 at 10:47:12AM GMT, Michal Hocko wrote:
> > > > From: Michal Hocko <mhocko@suse.com>
> > > >
> > > > bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
> > > > inode to achieve GFP_NOWAIT semantic while holding locks. If this
> > > > allocation fails it will drop locks and use GFP_NOFS allocation context.
> > > >
> > > > We would like to drop PF_MEMALLOC_NORECLAIM because it is really
> > > > dangerous to use if the caller doesn't control the full call chain with
> > > > this flag set. E.g. if any of the function down the chain needed
> > > > GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
> > > > cause unexpected failure.
> > > >
> > > > While this is not the case in this particular case using the scoped gfp
> > > > semantic is not really needed bacause we can easily pus the allocation
> > > > context down the chain without too much clutter.
> > >
> > > yeah, eesh, nack.
> >
> > Sure, you can NAK this but then deal with the lack of the PF flag by
> > other means. We have made it clear that PF_MEMALLOC_NORECLAIM is not we
> > are going to support at the MM level.
> >
> > I have done your homework and shown that it is really easy
> > to use gfp flags directly. The net result is passing gfp flag down to
> > two functions. Sure part of it is ugglier by having several different
> > callbacks implementing it but still manageable. Without too much churn.
> >
> > So do whatever you like in the bcache code but do not rely on something
> > that is unsupported by the MM layer which you have sneaked in without an
> > agreement.
>
> Michal, you're being damned hostile, while posting code you haven't even
> tried to compile. Seriously, dude?
>
> How about sticking to the technical issues at hand instead of saying
> "this is mm, so my way or the highway?". We're all kernel developers
> here, this is not what we do.
Kent, we do respect review feedback. You are clearly fine ignoring it
when you feels like it (eab0af905bfc ("mm: introduce
PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") is a clear example of it).
I have already made my arguments (repeatedly) why implicit nowait
allocation context is tricky and problematic. Your response is that you
simply "do no buy it" which is a highly technical argument.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 20:27 ` Michal Hocko
@ 2024-08-26 20:43 ` Kent Overstreet
2024-08-26 21:10 ` Kent Overstreet
2024-08-27 6:01 ` Michal Hocko
0 siblings, 2 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-08-26 20:43 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Mon, Aug 26, 2024 at 10:27:44PM GMT, Michal Hocko wrote:
> On Mon 26-08-24 16:00:56, Kent Overstreet wrote:
> > On Mon, Aug 26, 2024 at 09:58:08PM GMT, Michal Hocko wrote:
> > > On Mon 26-08-24 15:39:47, Kent Overstreet wrote:
> > > > On Mon, Aug 26, 2024 at 10:47:12AM GMT, Michal Hocko wrote:
> > > > > From: Michal Hocko <mhocko@suse.com>
> > > > >
> > > > > bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
> > > > > inode to achieve GFP_NOWAIT semantic while holding locks. If this
> > > > > allocation fails it will drop locks and use GFP_NOFS allocation context.
> > > > >
> > > > > We would like to drop PF_MEMALLOC_NORECLAIM because it is really
> > > > > dangerous to use if the caller doesn't control the full call chain with
> > > > > this flag set. E.g. if any of the function down the chain needed
> > > > > GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
> > > > > cause unexpected failure.
> > > > >
> > > > > While this is not the case in this particular case using the scoped gfp
> > > > > semantic is not really needed bacause we can easily pus the allocation
> > > > > context down the chain without too much clutter.
> > > >
> > > > yeah, eesh, nack.
> > >
> > > Sure, you can NAK this but then deal with the lack of the PF flag by
> > > other means. We have made it clear that PF_MEMALLOC_NORECLAIM is not we
> > > are going to support at the MM level.
> > >
> > > I have done your homework and shown that it is really easy
> > > to use gfp flags directly. The net result is passing gfp flag down to
> > > two functions. Sure part of it is ugglier by having several different
> > > callbacks implementing it but still manageable. Without too much churn.
> > >
> > > So do whatever you like in the bcache code but do not rely on something
> > > that is unsupported by the MM layer which you have sneaked in without an
> > > agreement.
> >
> > Michal, you're being damned hostile, while posting code you haven't even
> > tried to compile. Seriously, dude?
> >
> > How about sticking to the technical issues at hand instead of saying
> > "this is mm, so my way or the highway?". We're all kernel developers
> > here, this is not what we do.
>
> Kent, we do respect review feedback. You are clearly fine ignoring it
> when you feels like it (eab0af905bfc ("mm: introduce
> PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") is a clear example of it).
>
> I have already made my arguments (repeatedly) why implicit nowait
> allocation context is tricky and problematic. Your response is that you
> simply "do no buy it" which is a highly technical argument.
No, I explained why GFP_NORECLAIM/PF_MEMALLOC_NORECLAIM can absolutely
apply to a context, not a callsite, and why vmalloc() and kvmalloc()
ignoring gfp flags is a much more serious issue.
If you want to do something useful, figure out what we're going to do
about _that_. If you really don't want PF_MEMALLOC_NORECLAIM to exist,
then see if Linus will let you plumb gfp flags down to pte allocation -
and beware, that's arch code that you'll have to fix.
Reminder: kvmalloc() is a thing, and it's steadily seeing wider use.
Otherwise, PF_MEMALLOC_NORECLAIM needs to stay; and thank you for
bringing this to my attention, because it's made me realize all the
other places in bcachefs that use gfp flags for allocating memory with
btree locks held need to be switch to memalloc_flags_do().
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 8:47 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
` (2 preceding siblings ...)
2024-08-26 19:52 ` kernel test robot
@ 2024-08-26 20:53 ` kernel test robot
2024-08-27 2:23 ` kernel test robot
2024-08-29 9:37 ` Jan Kara
5 siblings, 0 replies; 51+ messages in thread
From: kernel test robot @ 2024-08-26 20:53 UTC (permalink / raw)
To: Michal Hocko, Andrew Morton
Cc: llvm, oe-kbuild-all, Linux Memory Management List,
Christoph Hellwig, Yafang Shao, Kent Overstreet, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
Hi Michal,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on tip/sched/core brauner-vfs/vfs.all linus/master v6.11-rc5]
[cannot apply to next-20240826]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Michal-Hocko/bcachefs-do-not-use-PF_MEMALLOC_NORECLAIM/20240826-171013
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20240826085347.1152675-2-mhocko%40kernel.org
patch subject: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
config: arm-randconfig-003-20240827 (https://download.01.org/0day-ci/archive/20240827/202408270446.6Ew7FPHA-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 08e5a1de8227512d4774a534b91cb2353cef6284)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240827/202408270446.6Ew7FPHA-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408270446.6Ew7FPHA-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from fs/bcachefs/fs.c:4:
In file included from fs/bcachefs/bcachefs.h:188:
In file included from include/linux/bio.h:10:
In file included from include/linux/blk_types.h:10:
In file included from include/linux/bvec.h:10:
In file included from include/linux/highmem.h:8:
In file included from include/linux/cacheflush.h:5:
In file included from arch/arm/include/asm/cacheflush.h:10:
In file included from include/linux/mm.h:2198:
include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
>> fs/bcachefs/fs.c:248:60: error: too many arguments provided to function-like macro invocation
248 | if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v), gfp)) {
| ^
include/linux/compiler.h:77:10: note: macro 'unlikely' defined here
77 | # define unlikely(x) __builtin_expect(!!(x), 0)
| ^
>> fs/bcachefs/fs.c:248:6: error: use of undeclared identifier 'unlikely'
248 | if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v), gfp)) {
| ^
>> fs/bcachefs/fs.c:261:60: error: use of undeclared identifier 'GFP_NOWARN'
261 | struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWARN | GFP_NOWAIT);
| ^
1 warning and 3 errors generated.
vim +248 fs/bcachefs/fs.c
233
234 static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
235 {
236 struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, gfp);
237 if (!inode)
238 return NULL;
239
240 inode_init_once(&inode->v);
241 mutex_init(&inode->ei_update_lock);
242 two_state_lock_init(&inode->ei_pagecache_lock);
243 INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
244 inode->ei_flags = 0;
245 mutex_init(&inode->ei_quota_lock);
246 memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
247
> 248 if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v), gfp)) {
249 kmem_cache_free(bch2_inode_cache, inode);
250 return NULL;
251 }
252
253 return inode;
254 }
255
256 /*
257 * Allocate a new inode, dropping/retaking btree locks if necessary:
258 */
259 static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
260 {
> 261 struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWARN | GFP_NOWAIT);
262
263 if (unlikely(!inode)) {
264 int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
265 if (ret && inode) {
266 __destroy_inode(&inode->v);
267 kmem_cache_free(bch2_inode_cache, inode);
268 }
269 if (ret)
270 return ERR_PTR(ret);
271 }
272
273 return inode;
274 }
275
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 20:43 ` Kent Overstreet
@ 2024-08-26 21:10 ` Kent Overstreet
2024-08-27 6:01 ` Michal Hocko
1 sibling, 0 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-08-26 21:10 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Mon, Aug 26, 2024 at 04:44:01PM GMT, Kent Overstreet wrote:
> No, I explained why GFP_NORECLAIM/PF_MEMALLOC_NORECLAIM can absolutely
> apply to a context, not a callsite, and why vmalloc() and kvmalloc()
> ignoring gfp flags is a much more serious issue.
>
> If you want to do something useful, figure out what we're going to do
> about _that_. If you really don't want PF_MEMALLOC_NORECLAIM to exist,
> then see if Linus will let you plumb gfp flags down to pte allocation -
> and beware, that's arch code that you'll have to fix.
>
> Reminder: kvmalloc() is a thing, and it's steadily seeing wider use.
>
> Otherwise, PF_MEMALLOC_NORECLAIM needs to stay; and thank you for
> bringing this to my attention, because it's made me realize all the
> other places in bcachefs that use gfp flags for allocating memory with
> btree locks held need to be switch to memalloc_flags_do().
Additionally: plumbing gfp flags to pte allocation is something we do
need to do. I proposed it before kvmalloc() was a thing, but now it's
become much more of a lurking landmine.
Even with that I'd still be against this series, though. GFP_NOFAIL
interacting badly with other gfp/memalloc flags is going to continue to
be an issue, and I think the only answer to that is stricter runtime
checks (which, thank you mm guys for adding recently).
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 8:47 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
` (3 preceding siblings ...)
2024-08-26 20:53 ` kernel test robot
@ 2024-08-27 2:23 ` kernel test robot
2024-08-29 9:37 ` Jan Kara
5 siblings, 0 replies; 51+ messages in thread
From: kernel test robot @ 2024-08-27 2:23 UTC (permalink / raw)
To: Michal Hocko, Andrew Morton
Cc: oe-kbuild-all, Linux Memory Management List, Christoph Hellwig,
Yafang Shao, Kent Overstreet, jack, Christian Brauner,
Alexander Viro, Paul Moore, James Morris, Serge E. Hallyn,
linux-fsdevel, linux-bcachefs, linux-security-module,
linux-kernel, Michal Hocko
Hi Michal,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on tip/sched/core brauner-vfs/vfs.all linus/master v6.11-rc5]
[cannot apply to next-20240826]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Michal-Hocko/bcachefs-do-not-use-PF_MEMALLOC_NORECLAIM/20240826-171013
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20240826085347.1152675-2-mhocko%40kernel.org
patch subject: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
config: i386-buildonly-randconfig-003-20240827 (https://download.01.org/0day-ci/archive/20240827/202408271041.5IWf4ZQC-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240827/202408271041.5IWf4ZQC-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408271041.5IWf4ZQC-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> security/security.c:664: warning: Function parameter or struct member 'gfp' not described in 'lsm_inode_alloc'
>> security/security.c:1586: warning: Function parameter or struct member 'gfp' not described in 'security_inode_alloc'
vim +664 security/security.c
33bf60cabcc768 Casey Schaufler 2018-11-12 654
afb1cbe37440c7 Casey Schaufler 2018-09-21 655 /**
afb1cbe37440c7 Casey Schaufler 2018-09-21 656 * lsm_inode_alloc - allocate a composite inode blob
afb1cbe37440c7 Casey Schaufler 2018-09-21 657 * @inode: the inode that needs a blob
afb1cbe37440c7 Casey Schaufler 2018-09-21 658 *
afb1cbe37440c7 Casey Schaufler 2018-09-21 659 * Allocate the inode blob for all the modules
afb1cbe37440c7 Casey Schaufler 2018-09-21 660 *
afb1cbe37440c7 Casey Schaufler 2018-09-21 661 * Returns 0, or -ENOMEM if memory can't be allocated.
afb1cbe37440c7 Casey Schaufler 2018-09-21 662 */
b2ce84652b3193 Michal Hocko 2024-08-26 663 int lsm_inode_alloc(struct inode *inode, gfp_t gfp)
afb1cbe37440c7 Casey Schaufler 2018-09-21 @664 {
afb1cbe37440c7 Casey Schaufler 2018-09-21 665 if (!lsm_inode_cache) {
afb1cbe37440c7 Casey Schaufler 2018-09-21 666 inode->i_security = NULL;
afb1cbe37440c7 Casey Schaufler 2018-09-21 667 return 0;
afb1cbe37440c7 Casey Schaufler 2018-09-21 668 }
afb1cbe37440c7 Casey Schaufler 2018-09-21 669
b2ce84652b3193 Michal Hocko 2024-08-26 670 inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp);
afb1cbe37440c7 Casey Schaufler 2018-09-21 671 if (inode->i_security == NULL)
afb1cbe37440c7 Casey Schaufler 2018-09-21 672 return -ENOMEM;
afb1cbe37440c7 Casey Schaufler 2018-09-21 673 return 0;
afb1cbe37440c7 Casey Schaufler 2018-09-21 674 }
afb1cbe37440c7 Casey Schaufler 2018-09-21 675
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 20:43 ` Kent Overstreet
2024-08-26 21:10 ` Kent Overstreet
@ 2024-08-27 6:01 ` Michal Hocko
2024-08-27 6:40 ` Kent Overstreet
1 sibling, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-08-27 6:01 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Mon 26-08-24 16:43:55, Kent Overstreet wrote:
> On Mon, Aug 26, 2024 at 10:27:44PM GMT, Michal Hocko wrote:
> > On Mon 26-08-24 16:00:56, Kent Overstreet wrote:
> > > On Mon, Aug 26, 2024 at 09:58:08PM GMT, Michal Hocko wrote:
> > > > On Mon 26-08-24 15:39:47, Kent Overstreet wrote:
> > > > > On Mon, Aug 26, 2024 at 10:47:12AM GMT, Michal Hocko wrote:
> > > > > > From: Michal Hocko <mhocko@suse.com>
> > > > > >
> > > > > > bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
> > > > > > inode to achieve GFP_NOWAIT semantic while holding locks. If this
> > > > > > allocation fails it will drop locks and use GFP_NOFS allocation context.
> > > > > >
> > > > > > We would like to drop PF_MEMALLOC_NORECLAIM because it is really
> > > > > > dangerous to use if the caller doesn't control the full call chain with
> > > > > > this flag set. E.g. if any of the function down the chain needed
> > > > > > GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
> > > > > > cause unexpected failure.
> > > > > >
> > > > > > While this is not the case in this particular case using the scoped gfp
> > > > > > semantic is not really needed bacause we can easily pus the allocation
> > > > > > context down the chain without too much clutter.
> > > > >
> > > > > yeah, eesh, nack.
> > > >
> > > > Sure, you can NAK this but then deal with the lack of the PF flag by
> > > > other means. We have made it clear that PF_MEMALLOC_NORECLAIM is not we
> > > > are going to support at the MM level.
> > > >
> > > > I have done your homework and shown that it is really easy
> > > > to use gfp flags directly. The net result is passing gfp flag down to
> > > > two functions. Sure part of it is ugglier by having several different
> > > > callbacks implementing it but still manageable. Without too much churn.
> > > >
> > > > So do whatever you like in the bcache code but do not rely on something
> > > > that is unsupported by the MM layer which you have sneaked in without an
> > > > agreement.
> > >
> > > Michal, you're being damned hostile, while posting code you haven't even
> > > tried to compile. Seriously, dude?
> > >
> > > How about sticking to the technical issues at hand instead of saying
> > > "this is mm, so my way or the highway?". We're all kernel developers
> > > here, this is not what we do.
> >
> > Kent, we do respect review feedback. You are clearly fine ignoring it
> > when you feels like it (eab0af905bfc ("mm: introduce
> > PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") is a clear example of it).
> >
> > I have already made my arguments (repeatedly) why implicit nowait
> > allocation context is tricky and problematic. Your response is that you
> > simply "do no buy it" which is a highly technical argument.
>
> No, I explained why GFP_NORECLAIM/PF_MEMALLOC_NORECLAIM can absolutely
> apply to a context, not a callsite, and why vmalloc() and kvmalloc()
> ignoring gfp flags is a much more serious issue.
You are not really answering the main concern I have brought up though.
I.e. GFP_NOFAIL being fundamentally incompatible with NORECLAIM semantic
because the page allocator doesn't and will not support this allocation
mode. Scoped noreclaim semantic makes such a use much less visible
because it can be deep in the scoped context there more error prone to
introduce thus making the code harder to maintain.
I do see why you would like to have NOWAIT kvmalloc support available
and I also do see challenges in achieving that. But I completely fail to
see why you are bring that up _here_ as that is not really relevant to
PF_MEMALLOC_NORECLAIM use by bcachefs because it demonstrably doesn't
need that. There is no other user of the flag at the moment so dropping
the flag before there is more misuse is a reasonable goal. If you want
to bring up vmalloc NOWAIT support then feel free to do that in another
context and we can explore potential ways to achieve that.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-27 6:01 ` Michal Hocko
@ 2024-08-27 6:40 ` Kent Overstreet
2024-08-27 6:58 ` Michal Hocko
0 siblings, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-08-27 6:40 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Tue, Aug 27, 2024 at 08:01:32AM GMT, Michal Hocko wrote:
> You are not really answering the main concern I have brought up though.
> I.e. GFP_NOFAIL being fundamentally incompatible with NORECLAIM semantic
> because the page allocator doesn't and will not support this allocation
> mode. Scoped noreclaim semantic makes such a use much less visible
> because it can be deep in the scoped context there more error prone to
> introduce thus making the code harder to maintain.
You're too attached to GFP_NOFAIL.
GFP_NOFAIL is something we very rarely use, and it's not something we
want to use. Furthermore, GFP_NOFAIL allocations can fail regardless of
this patch - e.g. if it's more than 2 pages, it's not going to be
GFP_NOFAIL. In general you can't avoid having error paths for memory
allocations, and GFP_NOFAIL can't fundamentally change that.
Stop trying to design things around GFP_NOFAIL.
> I do see why you would like to have NOWAIT kvmalloc support available
> and I also do see challenges in achieving that. But I completely fail to
> see why you are bring that up _here_ as that is not really relevant to
> PF_MEMALLOC_NORECLAIM use by bcachefs because it demonstrably doesn't
> need that. There is no other user of the flag at the moment so dropping
> the flag before there is more misuse is a reasonable goal. If you want
> to bring up vmalloc NOWAIT support then feel free to do that in another
> context and we can explore potential ways to achieve that.
The inconsistencies between what PF_MEMALLOC supports and what gfp flags
support are quite relevant. If gfp flags aren't plumbed everywhere, and
aren't going to be plumbed everywhere (and that is the current
decision), then we must have PF_MEMALLOC support.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-27 6:40 ` Kent Overstreet
@ 2024-08-27 6:58 ` Michal Hocko
2024-08-27 7:05 ` Kent Overstreet
0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-08-27 6:58 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Tue 27-08-24 02:40:16, Kent Overstreet wrote:
> On Tue, Aug 27, 2024 at 08:01:32AM GMT, Michal Hocko wrote:
> > You are not really answering the main concern I have brought up though.
> > I.e. GFP_NOFAIL being fundamentally incompatible with NORECLAIM semantic
> > because the page allocator doesn't and will not support this allocation
> > mode. Scoped noreclaim semantic makes such a use much less visible
> > because it can be deep in the scoped context there more error prone to
> > introduce thus making the code harder to maintain.
>
> You're too attached to GFP_NOFAIL.
Unfortunatelly GFP_NOFAIL is there and we need to support it. We cannot
just close eyes and pretend it doesn't exist and hope for the best.
> GFP_NOFAIL is something we very rarely use, and it's not something we
> want to use. Furthermore, GFP_NOFAIL allocations can fail regardless of
> this patch - e.g. if it's more than 2 pages, it's not going to be
> GFP_NOFAIL.
We can reasonably assume we do not have any of those users in the tree
though. We know that because we have a warning to tell us about that.
We still have legit GFP_NOFAIL users and we can safely assume we will
have some in the future though. And they have no way to handle the
failure. If they did they wouldn't have used GFP_NOFAIL in the first
place. So they do not check for NULL and they would either blow up or
worse fail in subtle and harder to detect way.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-27 6:58 ` Michal Hocko
@ 2024-08-27 7:05 ` Kent Overstreet
2024-08-27 7:35 ` Michal Hocko
0 siblings, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-08-27 7:05 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Tue, Aug 27, 2024 at 08:58:39AM GMT, Michal Hocko wrote:
> On Tue 27-08-24 02:40:16, Kent Overstreet wrote:
> > On Tue, Aug 27, 2024 at 08:01:32AM GMT, Michal Hocko wrote:
> > > You are not really answering the main concern I have brought up though.
> > > I.e. GFP_NOFAIL being fundamentally incompatible with NORECLAIM semantic
> > > because the page allocator doesn't and will not support this allocation
> > > mode. Scoped noreclaim semantic makes such a use much less visible
> > > because it can be deep in the scoped context there more error prone to
> > > introduce thus making the code harder to maintain.
> >
> > You're too attached to GFP_NOFAIL.
>
> Unfortunatelly GFP_NOFAIL is there and we need to support it. We cannot
> just close eyes and pretend it doesn't exist and hope for the best.
You need to notice when you're trying to do something immpossible.
> > GFP_NOFAIL is something we very rarely use, and it's not something we
> > want to use. Furthermore, GFP_NOFAIL allocations can fail regardless of
> > this patch - e.g. if it's more than 2 pages, it's not going to be
> > GFP_NOFAIL.
>
> We can reasonably assume we do not have any of those users in the tree
> though. We know that because we have a warning to tell us about that.
> We still have legit GFP_NOFAIL users and we can safely assume we will
> have some in the future though. And they have no way to handle the
> failure. If they did they wouldn't have used GFP_NOFAIL in the first
> place. So they do not check for NULL and they would either blow up or
> worse fail in subtle and harder to detect way.
No, because not all GFP_NOFAIL allocations are statically sized.
And the problem of the dynamic context overriding GFP_NOFAIL is more
general - if you use GFP_NOFAIL from nonblocking context (interrupt
context or preemption disabled) - the allocation has to fail, or
something even worse will happen.
Just because we don't track that with PF_MEMALLOC flags doesn't mean the
problem isn't htere.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-27 7:05 ` Kent Overstreet
@ 2024-08-27 7:35 ` Michal Hocko
0 siblings, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2024-08-27 7:35 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel
On Tue 27-08-24 03:05:29, Kent Overstreet wrote:
> On Tue, Aug 27, 2024 at 08:58:39AM GMT, Michal Hocko wrote:
> > On Tue 27-08-24 02:40:16, Kent Overstreet wrote:
> > > On Tue, Aug 27, 2024 at 08:01:32AM GMT, Michal Hocko wrote:
> > > > You are not really answering the main concern I have brought up though.
> > > > I.e. GFP_NOFAIL being fundamentally incompatible with NORECLAIM semantic
> > > > because the page allocator doesn't and will not support this allocation
> > > > mode. Scoped noreclaim semantic makes such a use much less visible
> > > > because it can be deep in the scoped context there more error prone to
> > > > introduce thus making the code harder to maintain.
> > >
> > > You're too attached to GFP_NOFAIL.
> >
> > Unfortunatelly GFP_NOFAIL is there and we need to support it. We cannot
> > just close eyes and pretend it doesn't exist and hope for the best.
>
> You need to notice when you're trying to do something immpossible.
Agreed! And GFP_NOFAIL for allocations <= order 1 in the page allocator or
kvmalloc(GFP_NOFAIL) for reasonable sizes is a supported setup. And it
should work as documented and shouldn't create any surprises. Like
returning unexpected failure because you have been called from withing a
NORECLAIM scope which you as an author of the code are not even aware of
because that has happened somewhere detached from your code and you
happen to be in a callchain.
> > > GFP_NOFAIL is something we very rarely use, and it's not something we
> > > want to use. Furthermore, GFP_NOFAIL allocations can fail regardless of
> > > this patch - e.g. if it's more than 2 pages, it's not going to be
> > > GFP_NOFAIL.
> >
> > We can reasonably assume we do not have any of those users in the tree
> > though. We know that because we have a warning to tell us about that.
> > We still have legit GFP_NOFAIL users and we can safely assume we will
> > have some in the future though. And they have no way to handle the
> > failure. If they did they wouldn't have used GFP_NOFAIL in the first
> > place. So they do not check for NULL and they would either blow up or
> > worse fail in subtle and harder to detect way.
>
> No, because not all GFP_NOFAIL allocations are statically sized.
This is a runtime check warning.
rmqueue:
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
> And the problem of the dynamic context overriding GFP_NOFAIL is more
> general - if you use GFP_NOFAIL from nonblocking context (interrupt
> context or preemption disabled) - the allocation has to fail, or
> something even worse will happen.
If you use __GFP_NOFAIL | GFP_KERNEL from an atomic context then you are
screwed the same way as if you used GFP_KERNEL alone - sleeping while
atomic or worse. The allocator doesn't even try to deal with this and
protect the caller by not sleeping and returning NULL.
More fundamentally, GFP_NOFAIL from non-blocking context is an incorrect
an unsupported use of the flag. This is the crux of the whole
discussion. GFP_NOWAIT | __GFP_NOFAIL or GFP_ATOMIC | __GFP_NOFAIL is
just a bug. We can git grep for those, and surprisingly found one instance
which already has a patch waiting to be merged.
We cannot enforce that at a compile time and that sucks but such is a
life. But we can grep for this at least. Now consider a scoped
(implicit) NOWAIT context which makes even seeemingly correct GFP_NOFAIL
use a bug.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-08-26 8:47 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
` (4 preceding siblings ...)
2024-08-27 2:23 ` kernel test robot
@ 2024-08-29 9:37 ` Jan Kara
5 siblings, 0 replies; 51+ messages in thread
From: Jan Kara @ 2024-08-29 9:37 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, Kent Overstreet,
jack, Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko
On Mon 26-08-24 10:47:12, Michal Hocko wrote:
> From: Michal Hocko <mhocko@suse.com>
>
> bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
> inode to achieve GFP_NOWAIT semantic while holding locks. If this
> allocation fails it will drop locks and use GFP_NOFS allocation context.
>
> We would like to drop PF_MEMALLOC_NORECLAIM because it is really
> dangerous to use if the caller doesn't control the full call chain with
> this flag set. E.g. if any of the function down the chain needed
> GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
> cause unexpected failure.
>
> While this is not the case in this particular case using the scoped gfp
> semantic is not really needed bacause we can easily pus the allocation
> context down the chain without too much clutter.
>
> Acked-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Michal Hocko <mhocko@suse.com>
For the VFS changes feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/bcachefs/fs.c | 14 ++++++--------
> fs/inode.c | 6 +++---
> include/linux/fs.h | 7 ++++++-
> include/linux/lsm_hooks.h | 2 +-
> include/linux/security.h | 4 ++--
> security/security.c | 8 ++++----
> 6 files changed, 22 insertions(+), 19 deletions(-)
>
> diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
> index 15fc41e63b6c..7a55167b9133 100644
> --- a/fs/bcachefs/fs.c
> +++ b/fs/bcachefs/fs.c
> @@ -231,9 +231,9 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
> BUG();
> }
>
> -static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
> +static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
> {
> - struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
> + struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, gfp);
> if (!inode)
> return NULL;
>
> @@ -245,7 +245,7 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
> mutex_init(&inode->ei_quota_lock);
> memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
>
> - if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
> + if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v), gfp)) {
> kmem_cache_free(bch2_inode_cache, inode);
> return NULL;
> }
> @@ -258,12 +258,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
> */
> static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
> {
> - struct bch_inode_info *inode =
> - memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
> - __bch2_new_inode(trans->c));
> + struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWARN | GFP_NOWAIT);
>
> if (unlikely(!inode)) {
> - int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
> + int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
> if (ret && inode) {
> __destroy_inode(&inode->v);
> kmem_cache_free(bch2_inode_cache, inode);
> @@ -328,7 +326,7 @@ __bch2_create(struct mnt_idmap *idmap,
> if (ret)
> return ERR_PTR(ret);
> #endif
> - inode = __bch2_new_inode(c);
> + inode = __bch2_new_inode(c, GFP_NOFS);
> if (unlikely(!inode)) {
> inode = ERR_PTR(-ENOMEM);
> goto err;
> diff --git a/fs/inode.c b/fs/inode.c
> index 86670941884b..95fd67a6cac3 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -153,7 +153,7 @@ static int no_open(struct inode *inode, struct file *file)
> * These are initializations that need to be done on every inode
> * allocation as the fields are not initialised by slab allocation.
> */
> -int inode_init_always(struct super_block *sb, struct inode *inode)
> +int inode_init_always(struct super_block *sb, struct inode *inode, gfp_t gfp)
> {
> static const struct inode_operations empty_iops;
> static const struct file_operations no_open_fops = {.open = no_open};
> @@ -230,14 +230,14 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
> #endif
> inode->i_flctx = NULL;
>
> - if (unlikely(security_inode_alloc(inode)))
> + if (unlikely(security_inode_alloc(inode, gfp)))
> return -ENOMEM;
>
> this_cpu_inc(nr_inodes);
>
> return 0;
> }
> -EXPORT_SYMBOL(inode_init_always);
> +EXPORT_SYMBOL(inode_init_always_gfp);
>
> void free_inode_nonrcu(struct inode *inode)
> {
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fd34b5755c0b..d46ca71a7855 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3027,7 +3027,12 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
>
> extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
>
> -extern int inode_init_always(struct super_block *, struct inode *);
> +extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
> +static inline int inode_init_always(struct super_block *sb, struct inode *inode)
> +{
> + return inode_init_always_gfp(sb, inode, GFP_NOFS);
> +}
> +
> extern void inode_init_once(struct inode *);
> extern void address_space_init_once(struct address_space *mapping);
> extern struct inode * igrab(struct inode *);
> diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
> index a2ade0ffe9e7..b08472d64765 100644
> --- a/include/linux/lsm_hooks.h
> +++ b/include/linux/lsm_hooks.h
> @@ -150,6 +150,6 @@ extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
> __used __section(".early_lsm_info.init") \
> __aligned(sizeof(unsigned long))
>
> -extern int lsm_inode_alloc(struct inode *inode);
> +extern int lsm_inode_alloc(struct inode *inode, gfp_t gfp);
>
> #endif /* ! __LINUX_LSM_HOOKS_H */
> diff --git a/include/linux/security.h b/include/linux/security.h
> index 1390f1efb4f0..7c6b9b038a0d 100644
> --- a/include/linux/security.h
> +++ b/include/linux/security.h
> @@ -336,7 +336,7 @@ int security_dentry_create_files_as(struct dentry *dentry, int mode,
> struct cred *new);
> int security_path_notify(const struct path *path, u64 mask,
> unsigned int obj_type);
> -int security_inode_alloc(struct inode *inode);
> +int security_inode_alloc(struct inode *inode, gfp_t gfp);
> void security_inode_free(struct inode *inode);
> int security_inode_init_security(struct inode *inode, struct inode *dir,
> const struct qstr *qstr,
> @@ -769,7 +769,7 @@ static inline int security_path_notify(const struct path *path, u64 mask,
> return 0;
> }
>
> -static inline int security_inode_alloc(struct inode *inode)
> +static inline int security_inode_alloc(struct inode *inode, gfp_t gfp)
> {
> return 0;
> }
> diff --git a/security/security.c b/security/security.c
> index 8cee5b6c6e6d..3581262da5ee 100644
> --- a/security/security.c
> +++ b/security/security.c
> @@ -660,14 +660,14 @@ static int lsm_file_alloc(struct file *file)
> *
> * Returns 0, or -ENOMEM if memory can't be allocated.
> */
> -int lsm_inode_alloc(struct inode *inode)
> +int lsm_inode_alloc(struct inode *inode, gfp_t gfp)
> {
> if (!lsm_inode_cache) {
> inode->i_security = NULL;
> return 0;
> }
>
> - inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
> + inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp);
> if (inode->i_security == NULL)
> return -ENOMEM;
> return 0;
> @@ -1582,9 +1582,9 @@ int security_path_notify(const struct path *path, u64 mask,
> *
> * Return: Return 0 if operation was successful.
> */
> -int security_inode_alloc(struct inode *inode)
> +int security_inode_alloc(struct inode *inode, gfp_t gfp)
> {
> - int rc = lsm_inode_alloc(inode);
> + int rc = lsm_inode_alloc(inode, gfp);
>
> if (unlikely(rc))
> return rc;
> --
> 2.46.0
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply [flat|nested] 51+ messages in thread
* [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
@ 2024-09-02 9:51 Michal Hocko
2024-09-02 9:51 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
` (3 more replies)
0 siblings, 4 replies; 51+ messages in thread
From: Michal Hocko @ 2024-09-02 9:51 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, Yafang Shao, Kent Overstreet, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
The previous version has been posted in [1]. Based on the review feedback
I have sent v2 of patches in the same threat but it seems that the
review has mostly settled on these patches. There is still an open
discussion on whether having a NORECLAIM allocator semantic (compare to
atomic) is worthwhile or how to deal with broken GFP_NOFAIL users but
those are not really relevant to this particular patchset as it 1)
doesn't aim to implement either of the two and 2) it aims at spreading
PF_MEMALLOC_NORECLAIM use while it doesn't have a properly defined
semantic now that it is not widely used and much harder to fix.
I have collected Reviewed-bys and reposting here. These patches are
touching bcachefs, VFS and core MM so I am not sure which tree to merge
this through but I guess going through Andrew makes the most sense.
Changes since v1;
- compile fixes
- rather than dropping PF_MEMALLOC_NORECLAIM alone reverted eab0af905bfc
("mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") suggested
by Matthew.
^ permalink raw reply [flat|nested] 51+ messages in thread
* [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-09-02 9:51 [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Michal Hocko
@ 2024-09-02 9:51 ` Michal Hocko
2024-09-05 9:28 ` kernel test robot
2024-09-02 9:51 ` [PATCH 2/2] Revert "mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN" Michal Hocko
` (2 subsequent siblings)
3 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-09-02 9:51 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, Yafang Shao, Kent Overstreet, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel,
Michal Hocko
From: Michal Hocko <mhocko@suse.com>
bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
inode to achieve GFP_NOWAIT semantic while holding locks. If this
allocation fails it will drop locks and use GFP_NOFS allocation context.
We would like to drop PF_MEMALLOC_NORECLAIM because it is really
dangerous to use if the caller doesn't control the full call chain with
this flag set. E.g. if any of the function down the chain needed
GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
cause unexpected failure.
While this is not the case in this particular case using the scoped gfp
semantic is not really needed bacause we can easily pus the allocation
context down the chain without too much clutter.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz> # For vfs changes
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
fs/bcachefs/fs.c | 14 ++++++--------
fs/inode.c | 6 +++---
include/linux/fs.h | 7 ++++++-
include/linux/lsm_hooks.h | 2 +-
include/linux/security.h | 4 ++--
security/security.c | 8 ++++----
6 files changed, 22 insertions(+), 19 deletions(-)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 15fc41e63b6c..d151a2f28d12 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -231,9 +231,9 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
BUG();
}
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
+static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
- struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
+ struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, gfp);
if (!inode)
return NULL;
@@ -245,7 +245,7 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
mutex_init(&inode->ei_quota_lock);
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
- if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
+ if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
kmem_cache_free(bch2_inode_cache, inode);
return NULL;
}
@@ -258,12 +258,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
*/
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
- struct bch_inode_info *inode =
- memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
- __bch2_new_inode(trans->c));
+ struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
+ int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
if (ret && inode) {
__destroy_inode(&inode->v);
kmem_cache_free(bch2_inode_cache, inode);
@@ -328,7 +326,7 @@ __bch2_create(struct mnt_idmap *idmap,
if (ret)
return ERR_PTR(ret);
#endif
- inode = __bch2_new_inode(c);
+ inode = __bch2_new_inode(c, GFP_NOFS);
if (unlikely(!inode)) {
inode = ERR_PTR(-ENOMEM);
goto err;
diff --git a/fs/inode.c b/fs/inode.c
index 86670941884b..a2aabbcffbe4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -153,7 +153,7 @@ static int no_open(struct inode *inode, struct file *file)
* These are initializations that need to be done on every inode
* allocation as the fields are not initialised by slab allocation.
*/
-int inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
static const struct inode_operations empty_iops;
static const struct file_operations no_open_fops = {.open = no_open};
@@ -230,14 +230,14 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
#endif
inode->i_flctx = NULL;
- if (unlikely(security_inode_alloc(inode)))
+ if (unlikely(security_inode_alloc(inode, gfp)))
return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
}
-EXPORT_SYMBOL(inode_init_always);
+EXPORT_SYMBOL(inode_init_always_gfp);
void free_inode_nonrcu(struct inode *inode)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fd34b5755c0b..d46ca71a7855 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3027,7 +3027,12 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
-extern int inode_init_always(struct super_block *, struct inode *);
+extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
+static inline int inode_init_always(struct super_block *sb, struct inode *inode)
+{
+ return inode_init_always_gfp(sb, inode, GFP_NOFS);
+}
+
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index a2ade0ffe9e7..b08472d64765 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -150,6 +150,6 @@ extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];
__used __section(".early_lsm_info.init") \
__aligned(sizeof(unsigned long))
-extern int lsm_inode_alloc(struct inode *inode);
+extern int lsm_inode_alloc(struct inode *inode, gfp_t gfp);
#endif /* ! __LINUX_LSM_HOOKS_H */
diff --git a/include/linux/security.h b/include/linux/security.h
index 1390f1efb4f0..7c6b9b038a0d 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -336,7 +336,7 @@ int security_dentry_create_files_as(struct dentry *dentry, int mode,
struct cred *new);
int security_path_notify(const struct path *path, u64 mask,
unsigned int obj_type);
-int security_inode_alloc(struct inode *inode);
+int security_inode_alloc(struct inode *inode, gfp_t gfp);
void security_inode_free(struct inode *inode);
int security_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr,
@@ -769,7 +769,7 @@ static inline int security_path_notify(const struct path *path, u64 mask,
return 0;
}
-static inline int security_inode_alloc(struct inode *inode)
+static inline int security_inode_alloc(struct inode *inode, gfp_t gfp)
{
return 0;
}
diff --git a/security/security.c b/security/security.c
index 8cee5b6c6e6d..3581262da5ee 100644
--- a/security/security.c
+++ b/security/security.c
@@ -660,14 +660,14 @@ static int lsm_file_alloc(struct file *file)
*
* Returns 0, or -ENOMEM if memory can't be allocated.
*/
-int lsm_inode_alloc(struct inode *inode)
+int lsm_inode_alloc(struct inode *inode, gfp_t gfp)
{
if (!lsm_inode_cache) {
inode->i_security = NULL;
return 0;
}
- inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
+ inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp);
if (inode->i_security == NULL)
return -ENOMEM;
return 0;
@@ -1582,9 +1582,9 @@ int security_path_notify(const struct path *path, u64 mask,
*
* Return: Return 0 if operation was successful.
*/
-int security_inode_alloc(struct inode *inode)
+int security_inode_alloc(struct inode *inode, gfp_t gfp)
{
- int rc = lsm_inode_alloc(inode);
+ int rc = lsm_inode_alloc(inode, gfp);
if (unlikely(rc))
return rc;
--
2.46.0
^ permalink raw reply related [flat|nested] 51+ messages in thread
* [PATCH 2/2] Revert "mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN"
2024-09-02 9:51 [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Michal Hocko
2024-09-02 9:51 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
@ 2024-09-02 9:51 ` Michal Hocko
2024-09-02 9:53 ` [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Kent Overstreet
2024-09-10 19:29 ` Andrew Morton
3 siblings, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2024-09-02 9:51 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, Yafang Shao, Kent Overstreet, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel,
Michal Hocko, Matthew Wilcox (Oracle)
From: Michal Hocko <mhocko@suse.com>
This reverts commit eab0af905bfc3e9c05da2ca163d76a1513159aa4.
There is no existing user of those flags. PF_MEMALLOC_NOWARN is
dangerous because a nested allocation context can use GFP_NOFAIL which
could cause unexpected failure. Such a code would be hard to maintain
because it could be deeper in the call chain.
PF_MEMALLOC_NORECLAIM has been added even when it was pointed out [1]
that such a allocation contex is inherently unsafe if the context
doesn't fully control all allocations called from this context.
While PF_MEMALLOC_NOWARN is not dangerous the way PF_MEMALLOC_NORECLAIM
is it doesn't have any user and as Matthew has pointed out we are
running out of those flags so better reclaim it without any real users.
[1] https://lore.kernel.org/all/ZcM0xtlKbAOFjv5n@tiehlicka/
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
include/linux/sched.h | 4 ++--
include/linux/sched/mm.h | 17 ++++-------------
2 files changed, 6 insertions(+), 15 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8d150343d42..731ff1078c9e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1657,8 +1657,8 @@ extern struct pid *cad_pid;
* I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
-#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */
-#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */
+#define PF__HOLE__00800000 0x00800000
+#define PF__HOLE__01000000 0x01000000
#define PF__HOLE__02000000 0x02000000
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 91546493c43d..07c4fde32827 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -258,25 +258,16 @@ static inline gfp_t current_gfp_context(gfp_t flags)
{
unsigned int pflags = READ_ONCE(current->flags);
- if (unlikely(pflags & (PF_MEMALLOC_NOIO |
- PF_MEMALLOC_NOFS |
- PF_MEMALLOC_NORECLAIM |
- PF_MEMALLOC_NOWARN |
- PF_MEMALLOC_PIN))) {
+ if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) {
/*
- * Stronger flags before weaker flags:
- * NORECLAIM implies NOIO, which in turn implies NOFS
+ * NOIO implies both NOIO and NOFS and it is a weaker context
+ * so always make sure it makes precedence
*/
- if (pflags & PF_MEMALLOC_NORECLAIM)
- flags &= ~__GFP_DIRECT_RECLAIM;
- else if (pflags & PF_MEMALLOC_NOIO)
+ if (pflags & PF_MEMALLOC_NOIO)
flags &= ~(__GFP_IO | __GFP_FS);
else if (pflags & PF_MEMALLOC_NOFS)
flags &= ~__GFP_FS;
- if (pflags & PF_MEMALLOC_NOWARN)
- flags |= __GFP_NOWARN;
-
if (pflags & PF_MEMALLOC_PIN)
flags &= ~__GFP_MOVABLE;
}
--
2.46.0
^ permalink raw reply related [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-02 9:51 [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Michal Hocko
2024-09-02 9:51 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
2024-09-02 9:51 ` [PATCH 2/2] Revert "mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN" Michal Hocko
@ 2024-09-02 9:53 ` Kent Overstreet
2024-09-02 21:52 ` Andrew Morton
2024-09-10 19:29 ` Andrew Morton
3 siblings, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-09-02 9:53 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Mon, Sep 02, 2024 at 11:51:48AM GMT, Michal Hocko wrote:
> The previous version has been posted in [1]. Based on the review feedback
> I have sent v2 of patches in the same threat but it seems that the
> review has mostly settled on these patches. There is still an open
> discussion on whether having a NORECLAIM allocator semantic (compare to
> atomic) is worthwhile or how to deal with broken GFP_NOFAIL users but
> those are not really relevant to this particular patchset as it 1)
> doesn't aim to implement either of the two and 2) it aims at spreading
> PF_MEMALLOC_NORECLAIM use while it doesn't have a properly defined
> semantic now that it is not widely used and much harder to fix.
>
> I have collected Reviewed-bys and reposting here. These patches are
> touching bcachefs, VFS and core MM so I am not sure which tree to merge
> this through but I guess going through Andrew makes the most sense.
>
> Changes since v1;
> - compile fixes
> - rather than dropping PF_MEMALLOC_NORECLAIM alone reverted eab0af905bfc
> ("mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") suggested
> by Matthew.
To reiterate:
This is a trainwreck of bad ideas. Nack.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-02 9:53 ` [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Kent Overstreet
@ 2024-09-02 21:52 ` Andrew Morton
2024-09-02 22:32 ` Kent Overstreet
2024-09-03 5:13 ` Christoph Hellwig
0 siblings, 2 replies; 51+ messages in thread
From: Andrew Morton @ 2024-09-02 21:52 UTC (permalink / raw)
To: Kent Overstreet
Cc: Michal Hocko, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Mon, 2 Sep 2024 05:53:59 -0400 Kent Overstreet <kent.overstreet@linux.dev> wrote:
> On Mon, Sep 02, 2024 at 11:51:48AM GMT, Michal Hocko wrote:
> > The previous version has been posted in [1]. Based on the review feedback
> > I have sent v2 of patches in the same threat but it seems that the
> > review has mostly settled on these patches. There is still an open
> > discussion on whether having a NORECLAIM allocator semantic (compare to
> > atomic) is worthwhile or how to deal with broken GFP_NOFAIL users but
> > those are not really relevant to this particular patchset as it 1)
> > doesn't aim to implement either of the two and 2) it aims at spreading
> > PF_MEMALLOC_NORECLAIM use while it doesn't have a properly defined
> > semantic now that it is not widely used and much harder to fix.
> >
> > I have collected Reviewed-bys and reposting here. These patches are
> > touching bcachefs, VFS and core MM so I am not sure which tree to merge
> > this through but I guess going through Andrew makes the most sense.
> >
> > Changes since v1;
> > - compile fixes
> > - rather than dropping PF_MEMALLOC_NORECLAIM alone reverted eab0af905bfc
> > ("mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") suggested
> > by Matthew.
>
> To reiterate:
>
It would be helpful to summarize your concerns.
What runtime impact do you expect this change will have upon bcachefs?
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-02 21:52 ` Andrew Morton
@ 2024-09-02 22:32 ` Kent Overstreet
2024-09-03 7:06 ` Michal Hocko
2024-09-03 23:53 ` Kent Overstreet
2024-09-03 5:13 ` Christoph Hellwig
1 sibling, 2 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-09-02 22:32 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Mon, Sep 02, 2024 at 02:52:52PM GMT, Andrew Morton wrote:
> On Mon, 2 Sep 2024 05:53:59 -0400 Kent Overstreet <kent.overstreet@linux.dev> wrote:
>
> > On Mon, Sep 02, 2024 at 11:51:48AM GMT, Michal Hocko wrote:
> > > The previous version has been posted in [1]. Based on the review feedback
> > > I have sent v2 of patches in the same threat but it seems that the
> > > review has mostly settled on these patches. There is still an open
> > > discussion on whether having a NORECLAIM allocator semantic (compare to
> > > atomic) is worthwhile or how to deal with broken GFP_NOFAIL users but
> > > those are not really relevant to this particular patchset as it 1)
> > > doesn't aim to implement either of the two and 2) it aims at spreading
> > > PF_MEMALLOC_NORECLAIM use while it doesn't have a properly defined
> > > semantic now that it is not widely used and much harder to fix.
> > >
> > > I have collected Reviewed-bys and reposting here. These patches are
> > > touching bcachefs, VFS and core MM so I am not sure which tree to merge
> > > this through but I guess going through Andrew makes the most sense.
> > >
> > > Changes since v1;
> > > - compile fixes
> > > - rather than dropping PF_MEMALLOC_NORECLAIM alone reverted eab0af905bfc
> > > ("mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") suggested
> > > by Matthew.
> >
> > To reiterate:
> >
>
> It would be helpful to summarize your concerns.
>
> What runtime impact do you expect this change will have upon bcachefs?
For bcachefs: I try really hard to minimize tail latency and make
performance robust in extreme scenarios - thrashing. A large part of
that is that btree locks must be held for no longer than necessary.
We definitely don't want to recurse into other parts of the kernel,
taking other locks (i.e. in memory reclaim) while holding btree locks;
that's a great way to stack up (and potentially multiply) latencies.
But gfp flags don't work with vmalloc allocations (and that's unlikely
to change), and we require vmalloc fallbacks for e.g. btree node
allocation. That's the big reason we want MEMALLOC_PF_NORECLAIM.
Besides that, it's just cleaner, memalloc flags are the direction we
want to be moving in, and it's going to be necessary if we ever want to
do a malloc() that doesn't require a gfp flags parameter. That would be
a win for safety and correctness in the kernel, and it's also likely
required for proper Rust support.
And the "GFP_NOFAIL must not fail" argument makes no sense, because a
failing a GFP_NOFAIL allocation is the only sane thing to do if the
allocation is buggy (too big, i.e. resulting from an integer overflow
bug, or wrong context). The alternatives are at best never returning
(stuck unkillable process), or a scheduling while atomic bug, or Michal
was even proposing killing the process (handling it like a BUG()!).
But we don't use BUG_ON() for things that we can't prove won't happen in
the wild if we can write an error path.
That is, PF_MEMALLOC_NORECLAIM lets us turn bugs into runtime errors.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-02 21:52 ` Andrew Morton
2024-09-02 22:32 ` Kent Overstreet
@ 2024-09-03 5:13 ` Christoph Hellwig
2024-09-04 16:27 ` Kent Overstreet
1 sibling, 1 reply; 51+ messages in thread
From: Christoph Hellwig @ 2024-09-03 5:13 UTC (permalink / raw)
To: Andrew Morton
Cc: Kent Overstreet, Michal Hocko, Christoph Hellwig, Yafang Shao,
jack, Vlastimil Babka, Dave Chinner, Christian Brauner,
Alexander Viro, Paul Moore, James Morris, Serge E. Hallyn,
linux-fsdevel, linux-mm, linux-bcachefs, linux-security-module,
linux-kernel
On Mon, Sep 02, 2024 at 02:52:52PM -0700, Andrew Morton wrote:
> It would be helpful to summarize your concerns.
And that'd better be a really good argument for a change that was
pushed directly to Linus bypassing the maintainer after multiple
reviewers pointed out it was broken. This series simply undoes the
damage done by that, while also keeping the code dependend on it
working.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-02 22:32 ` Kent Overstreet
@ 2024-09-03 7:06 ` Michal Hocko
2024-09-04 16:15 ` Kent Overstreet
2024-09-03 23:53 ` Kent Overstreet
1 sibling, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-09-03 7:06 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Mon 02-09-24 18:32:33, Kent Overstreet wrote:
> On Mon, Sep 02, 2024 at 02:52:52PM GMT, Andrew Morton wrote:
> > On Mon, 2 Sep 2024 05:53:59 -0400 Kent Overstreet <kent.overstreet@linux.dev> wrote:
> >
> > > On Mon, Sep 02, 2024 at 11:51:48AM GMT, Michal Hocko wrote:
> > > > The previous version has been posted in [1]. Based on the review feedback
> > > > I have sent v2 of patches in the same threat but it seems that the
> > > > review has mostly settled on these patches. There is still an open
> > > > discussion on whether having a NORECLAIM allocator semantic (compare to
> > > > atomic) is worthwhile or how to deal with broken GFP_NOFAIL users but
> > > > those are not really relevant to this particular patchset as it 1)
> > > > doesn't aim to implement either of the two and 2) it aims at spreading
> > > > PF_MEMALLOC_NORECLAIM use while it doesn't have a properly defined
> > > > semantic now that it is not widely used and much harder to fix.
> > > >
> > > > I have collected Reviewed-bys and reposting here. These patches are
> > > > touching bcachefs, VFS and core MM so I am not sure which tree to merge
> > > > this through but I guess going through Andrew makes the most sense.
> > > >
> > > > Changes since v1;
> > > > - compile fixes
> > > > - rather than dropping PF_MEMALLOC_NORECLAIM alone reverted eab0af905bfc
> > > > ("mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") suggested
> > > > by Matthew.
> > >
> > > To reiterate:
> > >
> >
> > It would be helpful to summarize your concerns.
> >
> > What runtime impact do you expect this change will have upon bcachefs?
>
> For bcachefs: I try really hard to minimize tail latency and make
> performance robust in extreme scenarios - thrashing. A large part of
> that is that btree locks must be held for no longer than necessary.
>
> We definitely don't want to recurse into other parts of the kernel,
> taking other locks (i.e. in memory reclaim) while holding btree locks;
> that's a great way to stack up (and potentially multiply) latencies.
OK, these two patches do not fail to do that. The only existing user is
turned into GFP_NOWAIT so the final code works the same way. Right?
> But gfp flags don't work with vmalloc allocations (and that's unlikely
> to change), and we require vmalloc fallbacks for e.g. btree node
> allocation. That's the big reason we want MEMALLOC_PF_NORECLAIM.
Have you even tried to reach out to vmalloc maintainers and asked for
GFP_NOWAIT support for vmalloc? Because I do not remember that. Sure
kernel page tables are have hardcoded GFP_KERNEL context which slightly
complicates that but that doesn't really mean the only potential
solution is to use a per task flag to override that. Just from top of my
head we can consider pre-allocating virtual address space for
non-sleeping allocations. Maybe there are other options that only people
deeply familiar with the vmalloc internals can see.
This requires discussions not pushing a very particular solution
through.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-02 22:32 ` Kent Overstreet
2024-09-03 7:06 ` Michal Hocko
@ 2024-09-03 23:53 ` Kent Overstreet
2024-09-04 7:14 ` Michal Hocko
1 sibling, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-09-03 23:53 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Mon, Sep 02, 2024 at 06:32:40PM GMT, Kent Overstreet wrote:
> On Mon, Sep 02, 2024 at 02:52:52PM GMT, Andrew Morton wrote:
> > On Mon, 2 Sep 2024 05:53:59 -0400 Kent Overstreet <kent.overstreet@linux.dev> wrote:
> >
> > > On Mon, Sep 02, 2024 at 11:51:48AM GMT, Michal Hocko wrote:
> > > > The previous version has been posted in [1]. Based on the review feedback
> > > > I have sent v2 of patches in the same threat but it seems that the
> > > > review has mostly settled on these patches. There is still an open
> > > > discussion on whether having a NORECLAIM allocator semantic (compare to
> > > > atomic) is worthwhile or how to deal with broken GFP_NOFAIL users but
> > > > those are not really relevant to this particular patchset as it 1)
> > > > doesn't aim to implement either of the two and 2) it aims at spreading
> > > > PF_MEMALLOC_NORECLAIM use while it doesn't have a properly defined
> > > > semantic now that it is not widely used and much harder to fix.
> > > >
> > > > I have collected Reviewed-bys and reposting here. These patches are
> > > > touching bcachefs, VFS and core MM so I am not sure which tree to merge
> > > > this through but I guess going through Andrew makes the most sense.
> > > >
> > > > Changes since v1;
> > > > - compile fixes
> > > > - rather than dropping PF_MEMALLOC_NORECLAIM alone reverted eab0af905bfc
> > > > ("mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") suggested
> > > > by Matthew.
> > >
> > > To reiterate:
> > >
> >
> > It would be helpful to summarize your concerns.
> >
> > What runtime impact do you expect this change will have upon bcachefs?
>
> For bcachefs: I try really hard to minimize tail latency and make
> performance robust in extreme scenarios - thrashing. A large part of
> that is that btree locks must be held for no longer than necessary.
>
> We definitely don't want to recurse into other parts of the kernel,
> taking other locks (i.e. in memory reclaim) while holding btree locks;
> that's a great way to stack up (and potentially multiply) latencies.
>
> But gfp flags don't work with vmalloc allocations (and that's unlikely
> to change), and we require vmalloc fallbacks for e.g. btree node
> allocation. That's the big reason we want MEMALLOC_PF_NORECLAIM.
>
> Besides that, it's just cleaner, memalloc flags are the direction we
> want to be moving in, and it's going to be necessary if we ever want to
> do a malloc() that doesn't require a gfp flags parameter. That would be
> a win for safety and correctness in the kernel, and it's also likely
> required for proper Rust support.
>
> And the "GFP_NOFAIL must not fail" argument makes no sense, because a
> failing a GFP_NOFAIL allocation is the only sane thing to do if the
> allocation is buggy (too big, i.e. resulting from an integer overflow
> bug, or wrong context). The alternatives are at best never returning
> (stuck unkillable process), or a scheduling while atomic bug, or Michal
> was even proposing killing the process (handling it like a BUG()!).
>
> But we don't use BUG_ON() for things that we can't prove won't happen in
> the wild if we can write an error path.
>
> That is, PF_MEMALLOC_NORECLAIM lets us turn bugs into runtime errors.
BTW, one of the reasons I've been giving this issue so much attention is
because of filesystem folks mentioning that they want GFP_NOFAIL
semantics more widely, and I actually _don't_ think that's a crazy idea,
provided we go about it the right way.
Not having error paths is right out; many allocations when you start to
look through more obscure code have sizes that are controlled by
userspace, so we'd be opening ourselves up to trivially expoitable
security bugs.
However, if we agreed that GFP_NOFAIL meant "only fail if it is not
possible to satisfy this allocation" (and I have been arguing that that
is the only sane meaning) - then that could lead to a lot of error paths
getting simpler.
Because there are a lot of places where there's essentially no good
reason to bubble up an -ENOMEM to userspace; if we're actually out of
memory the current allocation is just one out of many and not
particularly special, better to let the oom killer handle it...
So the error paths would be more along the lines of "there's a bug, or
userspace has requested something crazy, just shut down gracefully".
While we're at it, the definition of what allocation size is "too big"
is something we'd want to look at. Right now it's hardcoded to INT_MAX
for non GFP_NOFAIL and (I believe) 2 pages for GFP_NOFAL, we might want
to consider doing something based on total memory in the machine and
have the same limit apply to both...
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-03 23:53 ` Kent Overstreet
@ 2024-09-04 7:14 ` Michal Hocko
2024-09-04 16:05 ` Kent Overstreet
0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-09-04 7:14 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Tue 03-09-24 19:53:41, Kent Overstreet wrote:
[...]
> However, if we agreed that GFP_NOFAIL meant "only fail if it is not
> possible to satisfy this allocation" (and I have been arguing that that
> is the only sane meaning) - then that could lead to a lot of error paths
> getting simpler.
>
> Because there are a lot of places where there's essentially no good
> reason to bubble up an -ENOMEM to userspace; if we're actually out of
> memory the current allocation is just one out of many and not
> particularly special, better to let the oom killer handle it...
This is exactly GFP_KERNEL semantic for low order allocations or
kvmalloc for that matter. They simply never fail unless couple of corner
cases - e.g. the allocating task is an oom victim and all of the oom
memory reserves have been consumed. This is where we call "not possible
to allocate".
> So the error paths would be more along the lines of "there's a bug, or
> userspace has requested something crazy, just shut down gracefully".
How do you expect that to be done? Who is going to go over all those
GFP_NOFAIL users? And what kind of guide lines should they follow? It is
clear that they believe they cannot handle the failure gracefully
therefore they have requested GFP_NOFAIL. Many of them do not have
return value to return.
So really what do you expect proper GFP_NOFAIL users to do and what
should happen to those that are requesting unsupported size or
allocation mode?
> While we're at it, the definition of what allocation size is "too big"
> is something we'd want to look at. Right now it's hardcoded to INT_MAX
> for non GFP_NOFAIL and (I believe) 2 pages for GFP_NOFAL, we might want
> to consider doing something based on total memory in the machine and
> have the same limit apply to both...
Yes, we need to define some reasonable maximum supported sizes. For the
page allocator this has been order > 1 and we considering we have a
warning about those requests for years without a single report then we
can assume we do not have such abusers. for kvmalloc to story is
different. Current INT_MAX is just not any practical limit. Past
experience says that anything based on the amount of memory just doesn't
work (e.g. hash table sizes that used to that scaling and there are
other examples). So we should be practical here and look at existing
users and see what they really need and put a cap above that.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-04 7:14 ` Michal Hocko
@ 2024-09-04 16:05 ` Kent Overstreet
2024-09-04 16:46 ` Michal Hocko
0 siblings, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-09-04 16:05 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Wed, Sep 04, 2024 at 09:14:29AM GMT, Michal Hocko wrote:
> On Tue 03-09-24 19:53:41, Kent Overstreet wrote:
> [...]
> > However, if we agreed that GFP_NOFAIL meant "only fail if it is not
> > possible to satisfy this allocation" (and I have been arguing that that
> > is the only sane meaning) - then that could lead to a lot of error paths
> > getting simpler.
> >
> > Because there are a lot of places where there's essentially no good
> > reason to bubble up an -ENOMEM to userspace; if we're actually out of
> > memory the current allocation is just one out of many and not
> > particularly special, better to let the oom killer handle it...
>
> This is exactly GFP_KERNEL semantic for low order allocations or
> kvmalloc for that matter. They simply never fail unless couple of corner
> cases - e.g. the allocating task is an oom victim and all of the oom
> memory reserves have been consumed. This is where we call "not possible
> to allocate".
*nod*
Which does beg the question of why GFP_NOFAIL exists.
> > So the error paths would be more along the lines of "there's a bug, or
> > userspace has requested something crazy, just shut down gracefully".
>
> How do you expect that to be done? Who is going to go over all those
> GFP_NOFAIL users? And what kind of guide lines should they follow? It is
> clear that they believe they cannot handle the failure gracefully
> therefore they have requested GFP_NOFAIL. Many of them do not have
> return value to return.
They can't handle the allocatian failure and continue normal operation,
but that's entirely different from not being able to handle the
allocation failure at all - it's not hard to do an emergency shutdown,
that's a normal thing for filesystems to do.
And if you scan for GFP_NOFAIL uses in the kernel, a decent number
already do just that.
> So really what do you expect proper GFP_NOFAIL users to do and what
> should happen to those that are requesting unsupported size or
> allocation mode?
Emergency shutdwon.
And I'm not saying we have to rush to fix all the existing callers;
they're clearly in existing well tested code, there's not much point to
that.
Additionally most of them are deep in the guts of filesystem transaction
code where call paths to that site are limited - they're not library
code that gets called by anything.
But as a matter of policy going forward, yes we should be saying that
even GFP_NOFAIL allocations should be checking for -ENOMEM.
> Yes, we need to define some reasonable maximum supported sizes. For the
> page allocator this has been order > 1 and we considering we have a
> warning about those requests for years without a single report then we
> can assume we do not have such abusers. for kvmalloc to story is
> different. Current INT_MAX is just not any practical limit. Past
> experience says that anything based on the amount of memory just doesn't
> work (e.g. hash table sizes that used to that scaling and there are
> other examples). So we should be practical here and look at existing
> users and see what they really need and put a cap above that.
Not following what you're saying about hash tables? Hash tables scale
roughly with the amount of system memory/workingset.
But it seems to me that the limit should be lower if you're on e.g. a 2
GB machine (not failing with a warning, just failing immediately rather
than oom killing a bunch of stuff first) - and it's going to need to be
raised above INT_MAX as large memory machines keep growing, I keep
hitting it in bcachefs fsck code.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-03 7:06 ` Michal Hocko
@ 2024-09-04 16:15 ` Kent Overstreet
2024-09-04 16:50 ` Michal Hocko
0 siblings, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-09-04 16:15 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Tue, Sep 03, 2024 at 09:06:17AM GMT, Michal Hocko wrote:
> On Mon 02-09-24 18:32:33, Kent Overstreet wrote:
> > On Mon, Sep 02, 2024 at 02:52:52PM GMT, Andrew Morton wrote:
> > > On Mon, 2 Sep 2024 05:53:59 -0400 Kent Overstreet <kent.overstreet@linux.dev> wrote:
> > >
> > > > On Mon, Sep 02, 2024 at 11:51:48AM GMT, Michal Hocko wrote:
> > > > > The previous version has been posted in [1]. Based on the review feedback
> > > > > I have sent v2 of patches in the same threat but it seems that the
> > > > > review has mostly settled on these patches. There is still an open
> > > > > discussion on whether having a NORECLAIM allocator semantic (compare to
> > > > > atomic) is worthwhile or how to deal with broken GFP_NOFAIL users but
> > > > > those are not really relevant to this particular patchset as it 1)
> > > > > doesn't aim to implement either of the two and 2) it aims at spreading
> > > > > PF_MEMALLOC_NORECLAIM use while it doesn't have a properly defined
> > > > > semantic now that it is not widely used and much harder to fix.
> > > > >
> > > > > I have collected Reviewed-bys and reposting here. These patches are
> > > > > touching bcachefs, VFS and core MM so I am not sure which tree to merge
> > > > > this through but I guess going through Andrew makes the most sense.
> > > > >
> > > > > Changes since v1;
> > > > > - compile fixes
> > > > > - rather than dropping PF_MEMALLOC_NORECLAIM alone reverted eab0af905bfc
> > > > > ("mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN") suggested
> > > > > by Matthew.
> > > >
> > > > To reiterate:
> > > >
> > >
> > > It would be helpful to summarize your concerns.
> > >
> > > What runtime impact do you expect this change will have upon bcachefs?
> >
> > For bcachefs: I try really hard to minimize tail latency and make
> > performance robust in extreme scenarios - thrashing. A large part of
> > that is that btree locks must be held for no longer than necessary.
> >
> > We definitely don't want to recurse into other parts of the kernel,
> > taking other locks (i.e. in memory reclaim) while holding btree locks;
> > that's a great way to stack up (and potentially multiply) latencies.
>
> OK, these two patches do not fail to do that. The only existing user is
> turned into GFP_NOWAIT so the final code works the same way. Right?
https://lore.kernel.org/linux-mm/20240828140638.3204253-1-kent.overstreet@linux.dev/
> > But gfp flags don't work with vmalloc allocations (and that's unlikely
> > to change), and we require vmalloc fallbacks for e.g. btree node
> > allocation. That's the big reason we want MEMALLOC_PF_NORECLAIM.
>
> Have you even tried to reach out to vmalloc maintainers and asked for
> GFP_NOWAIT support for vmalloc? Because I do not remember that. Sure
> kernel page tables are have hardcoded GFP_KERNEL context which slightly
> complicates that but that doesn't really mean the only potential
> solution is to use a per task flag to override that. Just from top of my
> head we can consider pre-allocating virtual address space for
> non-sleeping allocations. Maybe there are other options that only people
> deeply familiar with the vmalloc internals can see.
That sounds really overly complicated.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-03 5:13 ` Christoph Hellwig
@ 2024-09-04 16:27 ` Kent Overstreet
2024-09-04 17:01 ` Michal Hocko
0 siblings, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-09-04 16:27 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Andrew Morton, Michal Hocko, Yafang Shao, jack, Vlastimil Babka,
Dave Chinner, Christian Brauner, Alexander Viro, Paul Moore,
James Morris, Serge E. Hallyn, linux-fsdevel, linux-mm,
linux-bcachefs, linux-security-module, linux-kernel
On Tue, Sep 03, 2024 at 07:13:42AM GMT, Christoph Hellwig wrote:
> On Mon, Sep 02, 2024 at 02:52:52PM -0700, Andrew Morton wrote:
> > It would be helpful to summarize your concerns.
>
> And that'd better be a really good argument for a change that was
> pushed directly to Linus bypassing the maintainer after multiple
> reviewers pointed out it was broken. This series simply undoes the
> damage done by that, while also keeping the code dependend on it
> working.
Well, to be blunt, I thought the "we don't want the allocator to even
know if we're in a non-sleepable context" argument was too crazy to have
real support, and moving towards PF_MEMALLOC flags is something we've
been talking about quite a bit going back years.
Little did I know the minefield I was walking into...
But the disccussion seems to finally be cooling off and going in a more
productive direction.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-04 16:05 ` Kent Overstreet
@ 2024-09-04 16:46 ` Michal Hocko
2024-09-04 18:03 ` Kent Overstreet
0 siblings, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-09-04 16:46 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Wed 04-09-24 12:05:56, Kent Overstreet wrote:
> On Wed, Sep 04, 2024 at 09:14:29AM GMT, Michal Hocko wrote:
> > On Tue 03-09-24 19:53:41, Kent Overstreet wrote:
> > [...]
> > > However, if we agreed that GFP_NOFAIL meant "only fail if it is not
> > > possible to satisfy this allocation" (and I have been arguing that that
> > > is the only sane meaning) - then that could lead to a lot of error paths
> > > getting simpler.
> > >
> > > Because there are a lot of places where there's essentially no good
> > > reason to bubble up an -ENOMEM to userspace; if we're actually out of
> > > memory the current allocation is just one out of many and not
> > > particularly special, better to let the oom killer handle it...
> >
> > This is exactly GFP_KERNEL semantic for low order allocations or
> > kvmalloc for that matter. They simply never fail unless couple of corner
> > cases - e.g. the allocating task is an oom victim and all of the oom
> > memory reserves have been consumed. This is where we call "not possible
> > to allocate".
>
> *nod*
>
> Which does beg the question of why GFP_NOFAIL exists.
Exactly for the reason that even rare failure is not acceptable and
there is no way to handle it other than keep retrying. Typical code was
while (!(ptr = kmalloc()))
;
Or the failure would be much more catastrophic than the retry loop
taking unbound amount of time.
> > > So the error paths would be more along the lines of "there's a bug, or
> > > userspace has requested something crazy, just shut down gracefully".
> >
> > How do you expect that to be done? Who is going to go over all those
> > GFP_NOFAIL users? And what kind of guide lines should they follow? It is
> > clear that they believe they cannot handle the failure gracefully
> > therefore they have requested GFP_NOFAIL. Many of them do not have
> > return value to return.
>
> They can't handle the allocatian failure and continue normal operation,
> but that's entirely different from not being able to handle the
> allocation failure at all - it's not hard to do an emergency shutdown,
> that's a normal thing for filesystems to do.
>
> And if you scan for GFP_NOFAIL uses in the kernel, a decent number
> already do just that.
It's been quite some time since I've looked the last time. And I am not
saying all the existing ones really require something as strong as
GFP_NOFAIL semantic. If they could be dropped then great! The fewer we
have the better.
But the point is there are some which _do_ need this. We have discussed
that in other email thread where you have heard why XFS and EXT4 does
that and why they are not going to change that model.
For those users we absolutely need a predictable and well defined
behavior because they know what they are doing.
[...]
> But as a matter of policy going forward, yes we should be saying that
> even GFP_NOFAIL allocations should be checking for -ENOMEM.
I argue that such NOFAIL semantic has no well defined semantic and legit
users are forced to do
while (!(ptr = kmalloc(GFP_NOFAIL))) ;
or
BUG_ON(!(ptr = kmalloc(GFP_NOFAIL)));
So it has no real reason to exist.
We at the allocator level have 2 choices. Either we tell users they
will not get GFP_NOFAIL and you just do the above or we provide NOFAIL
which really guarantees that there is no failure even if that means the
allocation gets unbounded amount of time. The latter have a slight
advantage because a) you can identify those callers more easily and b)
the allocator can do some heuristics to help those allocations.
We can still discuss how to handle unsupported cases (like GFP_ATOMIC |
__GFP_NOFAIL or kmalloc($UNCHECKED_USER_INPUT_THAT_IS_TOO_LARGE, __GFP_NOFAIL))
but the fact of the Linux kernel is that we have legit users and we need
to optimize for them.
> > Yes, we need to define some reasonable maximum supported sizes. For the
> > page allocator this has been order > 1 and we considering we have a
> > warning about those requests for years without a single report then we
> > can assume we do not have such abusers. for kvmalloc to story is
> > different. Current INT_MAX is just not any practical limit. Past
> > experience says that anything based on the amount of memory just doesn't
> > work (e.g. hash table sizes that used to that scaling and there are
> > other examples). So we should be practical here and look at existing
> > users and see what they really need and put a cap above that.
>
> Not following what you're saying about hash tables? Hash tables scale
> roughly with the amount of system memory/workingset.
I do not have sha handy but I do remember dcache hashtable scaling with
the amount of memory in the past and that led to GBs of memory allocated
on TB systems. This is not the case anymore I just wanted to mention
that scaling with the amount of memory can get really wrong easily.
> But it seems to me that the limit should be lower if you're on e.g. a 2
> GB machine (not failing with a warning, just failing immediately rather
> than oom killing a bunch of stuff first) - and it's going to need to be
> raised above INT_MAX as large memory machines keep growing, I keep
> hitting it in bcachefs fsck code.
Do we actual usecase that would require more than couple of MB? The
amount of memory wouldn't play any actual role then.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-04 16:15 ` Kent Overstreet
@ 2024-09-04 16:50 ` Michal Hocko
0 siblings, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2024-09-04 16:50 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Wed 04-09-24 12:15:15, Kent Overstreet wrote:
> On Tue, Sep 03, 2024 at 09:06:17AM GMT, Michal Hocko wrote:
> > On Mon 02-09-24 18:32:33, Kent Overstreet wrote:
[...]
> > > For bcachefs: I try really hard to minimize tail latency and make
> > > performance robust in extreme scenarios - thrashing. A large part of
> > > that is that btree locks must be held for no longer than necessary.
> > >
> > > We definitely don't want to recurse into other parts of the kernel,
> > > taking other locks (i.e. in memory reclaim) while holding btree locks;
> > > that's a great way to stack up (and potentially multiply) latencies.
> >
> > OK, these two patches do not fail to do that. The only existing user is
> > turned into GFP_NOWAIT so the final code works the same way. Right?
>
> https://lore.kernel.org/linux-mm/20240828140638.3204253-1-kent.overstreet@linux.dev/
https://lore.kernel.org/linux-mm/Zs9xC3OJPbkMy25C@casper.infradead.org/
> > > But gfp flags don't work with vmalloc allocations (and that's unlikely
> > > to change), and we require vmalloc fallbacks for e.g. btree node
> > > allocation. That's the big reason we want MEMALLOC_PF_NORECLAIM.
> >
> > Have you even tried to reach out to vmalloc maintainers and asked for
> > GFP_NOWAIT support for vmalloc? Because I do not remember that. Sure
> > kernel page tables are have hardcoded GFP_KERNEL context which slightly
> > complicates that but that doesn't really mean the only potential
> > solution is to use a per task flag to override that. Just from top of my
> > head we can consider pre-allocating virtual address space for
> > non-sleeping allocations. Maybe there are other options that only people
> > deeply familiar with the vmalloc internals can see.
>
> That sounds really overly complicated.
Let vmalloc people discuss viable ways to deal with that. You as vmalloc
consumer want to get NOWAIT support. Ask them and see what kind of
solution they can offer to you as a user. This is how we develop kernel
in a collaborative way. We do not enforce solutions we work with domain
experts to work out a maintainable solution.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-04 16:27 ` Kent Overstreet
@ 2024-09-04 17:01 ` Michal Hocko
0 siblings, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2024-09-04 17:01 UTC (permalink / raw)
To: Kent Overstreet
Cc: Christoph Hellwig, Andrew Morton, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Wed 04-09-24 12:27:04, Kent Overstreet wrote:
> On Tue, Sep 03, 2024 at 07:13:42AM GMT, Christoph Hellwig wrote:
> > On Mon, Sep 02, 2024 at 02:52:52PM -0700, Andrew Morton wrote:
> > > It would be helpful to summarize your concerns.
> >
> > And that'd better be a really good argument for a change that was
> > pushed directly to Linus bypassing the maintainer after multiple
> > reviewers pointed out it was broken. This series simply undoes the
> > damage done by that, while also keeping the code dependend on it
> > working.
>
> Well, to be blunt, I thought the "we don't want the allocator to even
> know if we're in a non-sleepable context" argument was too crazy to have
> real support, and moving towards PF_MEMALLOC flags is something we've
> been talking about quite a bit going back years.
>
> Little did I know the minefield I was walking into...
There is a lot of historical baggage and several people tried to explain
that things are quite complex and you cannot simply apply design choices
same way as if you were developing something from scratch.
> But the disccussion seems to finally be cooling off and going in a more
> productive direction.
Reality check: https://lore.kernel.org/all/8734mitahm.fsf@trenco.lwn.net/T/#u
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-04 16:46 ` Michal Hocko
@ 2024-09-04 18:03 ` Kent Overstreet
2024-09-04 22:34 ` Dave Chinner
2024-09-05 11:26 ` Michal Hocko
0 siblings, 2 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-09-04 18:03 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Wed, Sep 04, 2024 at 06:46:00PM GMT, Michal Hocko wrote:
> On Wed 04-09-24 12:05:56, Kent Overstreet wrote:
> > On Wed, Sep 04, 2024 at 09:14:29AM GMT, Michal Hocko wrote:
> > > On Tue 03-09-24 19:53:41, Kent Overstreet wrote:
> > > [...]
> > > > However, if we agreed that GFP_NOFAIL meant "only fail if it is not
> > > > possible to satisfy this allocation" (and I have been arguing that that
> > > > is the only sane meaning) - then that could lead to a lot of error paths
> > > > getting simpler.
> > > >
> > > > Because there are a lot of places where there's essentially no good
> > > > reason to bubble up an -ENOMEM to userspace; if we're actually out of
> > > > memory the current allocation is just one out of many and not
> > > > particularly special, better to let the oom killer handle it...
> > >
> > > This is exactly GFP_KERNEL semantic for low order allocations or
> > > kvmalloc for that matter. They simply never fail unless couple of corner
> > > cases - e.g. the allocating task is an oom victim and all of the oom
> > > memory reserves have been consumed. This is where we call "not possible
> > > to allocate".
> >
> > *nod*
> >
> > Which does beg the question of why GFP_NOFAIL exists.
>
> Exactly for the reason that even rare failure is not acceptable and
> there is no way to handle it other than keep retrying. Typical code was
> while (!(ptr = kmalloc()))
> ;
But is it _rare_ failure, or _no_ failure?
You seem to be saying (and I just reviewed the code, it looks like
you're right) that there is essentially no difference in behaviour
between GFP_KERNEL and GFP_NOFAIL.
So given that - why the wart?
I think we might be able to chalk it up to history; I'd have to go
spunking through the history (or ask Dave or Ted, maybe they'll chime
in), but I suspect GFP_KERNEL didn't provide such strong guarantees when
the allocation loops & GFP_NOFAIL were introduced.
> Or the failure would be much more catastrophic than the retry loop
> taking unbound amount of time.
But if GFP_KERNEL has the retry loop...
> > And if you scan for GFP_NOFAIL uses in the kernel, a decent number
> > already do just that.
>
> It's been quite some time since I've looked the last time. And I am not
> saying all the existing ones really require something as strong as
> GFP_NOFAIL semantic. If they could be dropped then great! The fewer we
> have the better.
>
> But the point is there are some which _do_ need this. We have discussed
> that in other email thread where you have heard why XFS and EXT4 does
> that and why they are not going to change that model.
No, I agree that they need the strong guarantees.
But if there's an actual bug, returning an error is better than killing
the task. Killing the task is really bad; these allocations are deep in
contexts where locks and refcounts are held, and the system will just
grind to a halt.
> > But as a matter of policy going forward, yes we should be saying that
> > even GFP_NOFAIL allocations should be checking for -ENOMEM.
>
> I argue that such NOFAIL semantic has no well defined semantic and legit
> users are forced to do
> while (!(ptr = kmalloc(GFP_NOFAIL))) ;
> or
> BUG_ON(!(ptr = kmalloc(GFP_NOFAIL)));
>
> So it has no real reason to exist.
I'm arguing that it does, provided when it returns NULL is defined to
be:
- invalid allocation context
- a size that is so big that it will never be possible to satisfy.
Returning an error is better than not returning at all, and letting the
system grind to a halt.
Let's also get Dave or Ted to chime in here if we can, because I
strongly suspect the situation was different when those retry loops were
introduced.
> We at the allocator level have 2 choices. Either we tell users they
> will not get GFP_NOFAIL and you just do the above or we provide NOFAIL
> which really guarantees that there is no failure even if that means the
> allocation gets unbounded amount of time. The latter have a slight
> advantage because a) you can identify those callers more easily and b)
> the allocator can do some heuristics to help those allocations.
Or option 3: recognize that this is a correctness/soundness issue, and
that if there are buggy callers they need to be fixed.
To give a non-kernel correlary, in the Rust world, soundness issues are
taken very seriously, and they are the only situation in which existing
code will be broken if necessary to fix them. Probably with equal amount
of fighting as these threads, heh.
> > > Yes, we need to define some reasonable maximum supported sizes. For the
> > > page allocator this has been order > 1 and we considering we have a
> > > warning about those requests for years without a single report then we
> > > can assume we do not have such abusers. for kvmalloc to story is
> > > different. Current INT_MAX is just not any practical limit. Past
> > > experience says that anything based on the amount of memory just doesn't
> > > work (e.g. hash table sizes that used to that scaling and there are
> > > other examples). So we should be practical here and look at existing
> > > users and see what they really need and put a cap above that.
> >
> > Not following what you're saying about hash tables? Hash tables scale
> > roughly with the amount of system memory/workingset.
>
> I do not have sha handy but I do remember dcache hashtable scaling with
> the amount of memory in the past and that led to GBs of memory allocated
> on TB systems. This is not the case anymore I just wanted to mention
> that scaling with the amount of memory can get really wrong easily.
Was the solution then to change the dcache hash table implementation,
rather than lifting the INT_MAX allocation size limit?
> > But it seems to me that the limit should be lower if you're on e.g. a 2
> > GB machine (not failing with a warning, just failing immediately rather
> > than oom killing a bunch of stuff first) - and it's going to need to be
> > raised above INT_MAX as large memory machines keep growing, I keep
> > hitting it in bcachefs fsck code.
>
> Do we actual usecase that would require more than couple of MB? The
> amount of memory wouldn't play any actual role then.
Which "amount of memory?" - not parsing that.
For large allocations in bcachefs: in journal replay we read all the
keys in the journal, and then we create a big flat array with references
to all of those keys to sort and dedup them.
We haven't hit the INT_MAX size limit there yet, but filesystem sizes
being what they are, we will soon. I've heard of users with 150 TB
filesystems, and once the fsck scalability issues are sorted we'll be
aiming for petabytes. Dirty keys in the journal scales more with system
memory, but I'm leasing machines right now with a quarter terabyte of
ram.
Another more pressing one is the extents -> backpointers and
backpointers -> extents passes of fsck; we do a linear scan through one
btree checking references to another btree. For the btree we're checking
references to the lookups are random, so we need to cache and pin the
entire btree in ram if possible, or if not whatever will fit and we run
in multiple passes.
This is the #1 scalability issue hitting a number of users right now, so
I may need to rewrite it to pull backpointers into an eytzinger array
and do our random lookups for backpointers on that - but that will be
"the biggest vmalloc array we can possible allocate", so the INT_MAX
size limit is clearly an issue there...
Q: Why isn't this in userspace?
A: Has to be in the kernel for online fsck to work, and these are fsck
passes we can already run online...
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-04 18:03 ` Kent Overstreet
@ 2024-09-04 22:34 ` Dave Chinner
2024-09-04 23:05 ` Kent Overstreet
2024-09-05 11:26 ` Michal Hocko
1 sibling, 1 reply; 51+ messages in thread
From: Dave Chinner @ 2024-09-04 22:34 UTC (permalink / raw)
To: Kent Overstreet
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Wed, Sep 04, 2024 at 02:03:13PM -0400, Kent Overstreet wrote:
> On Wed, Sep 04, 2024 at 06:46:00PM GMT, Michal Hocko wrote:
> > On Wed 04-09-24 12:05:56, Kent Overstreet wrote:
> > > But it seems to me that the limit should be lower if you're on e.g. a 2
> > > GB machine (not failing with a warning, just failing immediately rather
> > > than oom killing a bunch of stuff first) - and it's going to need to be
> > > raised above INT_MAX as large memory machines keep growing, I keep
> > > hitting it in bcachefs fsck code.
> >
> > Do we actual usecase that would require more than couple of MB? The
> > amount of memory wouldn't play any actual role then.
>
> Which "amount of memory?" - not parsing that.
>
> For large allocations in bcachefs: in journal replay we read all the
> keys in the journal, and then we create a big flat array with references
> to all of those keys to sort and dedup them.
>
> We haven't hit the INT_MAX size limit there yet, but filesystem sizes
> being what they are, we will soon. I've heard of users with 150 TB
> filesystems, and once the fsck scalability issues are sorted we'll be
> aiming for petabytes. Dirty keys in the journal scales more with system
> memory, but I'm leasing machines right now with a quarter terabyte of
> ram.
I've seen xfs_repair require a couple of TB of RAM to repair
metadata heavy filesystems of relatively small size (sub-20TB).
Once you get about a few hundred GB of metadata in the filesystem,
the fsck cross-reference data set size can easily run into the TBs.
So 256GB might *seem* like a lot of memory, but we were seeing
xfs_repair exceed that amount of RAM for metadata heavy filesystems
at least a decade ago...
Indeed, we recently heard about a 6TB filesystem with 15 *billion*
hardlinks in it. The cross reference for resolving all those
hardlinks would require somewhere in the order of 1.5TB of RAM to
hold. The only way to reliably handle random access data sets this
large is with pageable memory....
> Another more pressing one is the extents -> backpointers and
> backpointers -> extents passes of fsck; we do a linear scan through one
> btree checking references to another btree. For the btree we're checking
> references to the lookups are random, so we need to cache and pin the
> entire btree in ram if possible, or if not whatever will fit and we run
> in multiple passes.
>
> This is the #1 scalability issue hitting a number of users right now, so
> I may need to rewrite it to pull backpointers into an eytzinger array
> and do our random lookups for backpointers on that - but that will be
> "the biggest vmalloc array we can possible allocate", so the INT_MAX
> size limit is clearly an issue there...
Given my above comments, I think you are approaching this problem
the wrong way. It is known that the data set that can exceed
physical kernel memory size, hence it needs to be swappable. That
way users can extend the kernel memory capacity via swapfiles when
bcachefs.fsck needs more memory than the system has physical RAM.
This is a problem Darrick had to address for the XFS online repair
code - we've known for a long time that repair needs to hold a data
set larger than physical memory to complete successfully. Hence for
online repair we needed a mechanism that provided us with pagable
kernel memory. vmalloc() is not an option - it has hard size limits
(both API based and physical capacity based).
Hence Darrick designed and implemented pageable shmem backed memory
files (xfiles) to hold these data sets. Hence the size limit of the
online repair data set is physical RAM + swap space, same as it is
for offline repair. You can find the xfile code in
fs/xfs/scrub/xfile.[ch].
Support for large, sortable arrays of fixed size records built on
xfiles can be found in xfarray.[ch], and blob storage in
xfblob.[ch].
vmalloc() is really not a good solution for holding arbitrary sized
data sets in kernel memory....
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-04 22:34 ` Dave Chinner
@ 2024-09-04 23:05 ` Kent Overstreet
0 siblings, 0 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-09-04 23:05 UTC (permalink / raw)
To: Dave Chinner
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Thu, Sep 05, 2024 at 08:34:39AM GMT, Dave Chinner wrote:
> I've seen xfs_repair require a couple of TB of RAM to repair
> metadata heavy filesystems of relatively small size (sub-20TB).
> Once you get about a few hundred GB of metadata in the filesystem,
> the fsck cross-reference data set size can easily run into the TBs.
>
> So 256GB might *seem* like a lot of memory, but we were seeing
> xfs_repair exceed that amount of RAM for metadata heavy filesystems
> at least a decade ago...
>
> Indeed, we recently heard about a 6TB filesystem with 15 *billion*
> hardlinks in it. The cross reference for resolving all those
> hardlinks would require somewhere in the order of 1.5TB of RAM to
> hold. The only way to reliably handle random access data sets this
> large is with pageable memory....
Christ...
This is also where space efficiency of metadata starts to really matter.
Of course you store full backreferences for every hardlink, which is
nice in some ways and a pain in others.
> > Another more pressing one is the extents -> backpointers and
> > backpointers -> extents passes of fsck; we do a linear scan through one
> > btree checking references to another btree. For the btree we're checking
> > references to the lookups are random, so we need to cache and pin the
> > entire btree in ram if possible, or if not whatever will fit and we run
> > in multiple passes.
> >
> > This is the #1 scalability issue hitting a number of users right now, so
> > I may need to rewrite it to pull backpointers into an eytzinger array
> > and do our random lookups for backpointers on that - but that will be
> > "the biggest vmalloc array we can possible allocate", so the INT_MAX
> > size limit is clearly an issue there...
>
> Given my above comments, I think you are approaching this problem
> the wrong way. It is known that the data set that can exceed
> physical kernel memory size, hence it needs to be swappable. That
> way users can extend the kernel memory capacity via swapfiles when
> bcachefs.fsck needs more memory than the system has physical RAM.
Well, it depends on the locality of the cross references - I don't think
we want to go that route here, because if there isn't any locality in
the cross references we'll just be thrashing; better to run in multiple
passes, constraining each pass to what _will_ fit in ram...
It would be nice if we had a way to guesstimate locality in extents <->
backpointers references - if there is locality, then it's better to just
run in one pass - and we wouldn't bother with building up new tables,
we'd just rely on the btree node cache.
Perhaps that's what we'll do when online fsck is finished and we're
optimizing more for "don't disturb the rest of the system too much" than
"get it done as quick as possible".
I do need to start making use of Darrick's swappable memory code in at
least one other place though - the bucket tables when we're checking
basic allocation info. That one just exceeded the INT_MAX limit for a
user with a 30 TB hard drive, so I switched it to a radix tree for now,
but it really should be swappable memory.
Fortunately there's more locality in the accesses there.
> Hence Darrick designed and implemented pageable shmem backed memory
> files (xfiles) to hold these data sets. Hence the size limit of the
> online repair data set is physical RAM + swap space, same as it is
> for offline repair. You can find the xfile code in
> fs/xfs/scrub/xfile.[ch].
>
> Support for large, sortable arrays of fixed size records built on
> xfiles can be found in xfarray.[ch], and blob storage in
> xfblob.[ch].
*nod*
I do wish we had normal virtually mapped swappable memory though - the
thing I don't like about xfarray is that it requires a radix tree walk
on every access, and we have _hardware_ that's meant to do that for
us.
But if you still care about 32 bit then that does necessitate Darrick's
approach. I'm willing to consider 32 bit legacy for bcachefs, though.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-09-02 9:51 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
@ 2024-09-05 9:28 ` kernel test robot
0 siblings, 0 replies; 51+ messages in thread
From: kernel test robot @ 2024-09-05 9:28 UTC (permalink / raw)
To: Michal Hocko, Andrew Morton
Cc: llvm, oe-kbuild-all, Linux Memory Management List,
Christoph Hellwig, Yafang Shao, Kent Overstreet, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-bcachefs, linux-security-module, linux-kernel, Michal Hocko
Hi Michal,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on tip/sched/core brauner-vfs/vfs.all linus/master v6.11-rc6]
[cannot apply to next-20240904]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Michal-Hocko/bcachefs-do-not-use-PF_MEMALLOC_NORECLAIM/20240902-200126
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20240902095203.1559361-2-mhocko%40kernel.org
patch subject: [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
config: x86_64-allnoconfig (https://download.01.org/0day-ci/archive/20240905/202409051713.LFVMScXi-lkp@intel.com/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240905/202409051713.LFVMScXi-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409051713.LFVMScXi-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> fs/inode.c:157: warning: Function parameter or struct member 'gfp' not described in 'inode_init_always_gfp'
>> fs/inode.c:157: warning: expecting prototype for inode_init_always(). Prototype was for inode_init_always_gfp() instead
vim +157 fs/inode.c
bd9b51e79cb0b8 Al Viro 2014-11-18 147
2cb1599f9b2ecd David Chinner 2008-10-30 148 /**
6e7c2b4dd36d83 Masahiro Yamada 2017-05-08 149 * inode_init_always - perform inode structure initialisation
0bc02f3fa433a9 Randy Dunlap 2009-01-06 150 * @sb: superblock inode belongs to
0bc02f3fa433a9 Randy Dunlap 2009-01-06 151 * @inode: inode to initialise
2cb1599f9b2ecd David Chinner 2008-10-30 152 *
2cb1599f9b2ecd David Chinner 2008-10-30 153 * These are initializations that need to be done on every inode
2cb1599f9b2ecd David Chinner 2008-10-30 154 * allocation as the fields are not initialised by slab allocation.
2cb1599f9b2ecd David Chinner 2008-10-30 155 */
6185335c11aac8 Michal Hocko 2024-09-02 156 int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
^1da177e4c3f41 Linus Torvalds 2005-04-16 @157 {
6e1d5dcc2bbbe7 Alexey Dobriyan 2009-09-21 158 static const struct inode_operations empty_iops;
bd9b51e79cb0b8 Al Viro 2014-11-18 159 static const struct file_operations no_open_fops = {.open = no_open};
^1da177e4c3f41 Linus Torvalds 2005-04-16 160 struct address_space *const mapping = &inode->i_data;
^1da177e4c3f41 Linus Torvalds 2005-04-16 161
^1da177e4c3f41 Linus Torvalds 2005-04-16 162 inode->i_sb = sb;
^1da177e4c3f41 Linus Torvalds 2005-04-16 163 inode->i_blkbits = sb->s_blocksize_bits;
^1da177e4c3f41 Linus Torvalds 2005-04-16 164 inode->i_flags = 0;
5a9b911b8a24ed Mateusz Guzik 2024-06-11 165 inode->i_state = 0;
8019ad13ef7f64 Peter Zijlstra 2020-03-04 166 atomic64_set(&inode->i_sequence, 0);
^1da177e4c3f41 Linus Torvalds 2005-04-16 167 atomic_set(&inode->i_count, 1);
^1da177e4c3f41 Linus Torvalds 2005-04-16 168 inode->i_op = &empty_iops;
bd9b51e79cb0b8 Al Viro 2014-11-18 169 inode->i_fop = &no_open_fops;
edbb35cc6bdfc3 Eric Biggers 2020-10-30 170 inode->i_ino = 0;
a78ef704a8dd43 Miklos Szeredi 2011-10-28 171 inode->__i_nlink = 1;
3ddcd0569cd68f Linus Torvalds 2011-08-06 172 inode->i_opflags = 0;
d0a5b995a30834 Andreas Gruenbacher 2016-09-29 173 if (sb->s_xattr)
d0a5b995a30834 Andreas Gruenbacher 2016-09-29 174 inode->i_opflags |= IOP_XATTR;
92361636e0153b Eric W. Biederman 2012-02-08 175 i_uid_write(inode, 0);
92361636e0153b Eric W. Biederman 2012-02-08 176 i_gid_write(inode, 0);
^1da177e4c3f41 Linus Torvalds 2005-04-16 177 atomic_set(&inode->i_writecount, 0);
^1da177e4c3f41 Linus Torvalds 2005-04-16 178 inode->i_size = 0;
c75b1d9421f80f Jens Axboe 2017-06-27 179 inode->i_write_hint = WRITE_LIFE_NOT_SET;
^1da177e4c3f41 Linus Torvalds 2005-04-16 180 inode->i_blocks = 0;
^1da177e4c3f41 Linus Torvalds 2005-04-16 181 inode->i_bytes = 0;
^1da177e4c3f41 Linus Torvalds 2005-04-16 182 inode->i_generation = 0;
^1da177e4c3f41 Linus Torvalds 2005-04-16 183 inode->i_pipe = NULL;
^1da177e4c3f41 Linus Torvalds 2005-04-16 184 inode->i_cdev = NULL;
61ba64fc076887 Al Viro 2015-05-02 185 inode->i_link = NULL;
84e710da2a1dfa Al Viro 2016-04-15 186 inode->i_dir_seq = 0;
^1da177e4c3f41 Linus Torvalds 2005-04-16 187 inode->i_rdev = 0;
^1da177e4c3f41 Linus Torvalds 2005-04-16 188 inode->dirtied_when = 0;
6146f0d5e47ca4 Mimi Zohar 2009-02-04 189
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-04 18:03 ` Kent Overstreet
2024-09-04 22:34 ` Dave Chinner
@ 2024-09-05 11:26 ` Michal Hocko
2024-09-05 13:53 ` Theodore Ts'o
1 sibling, 1 reply; 51+ messages in thread
From: Michal Hocko @ 2024-09-05 11:26 UTC (permalink / raw)
To: Kent Overstreet
Cc: Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Wed 04-09-24 14:03:13, Kent Overstreet wrote:
> On Wed, Sep 04, 2024 at 06:46:00PM GMT, Michal Hocko wrote:
> > On Wed 04-09-24 12:05:56, Kent Overstreet wrote:
> > > On Wed, Sep 04, 2024 at 09:14:29AM GMT, Michal Hocko wrote:
> > > > On Tue 03-09-24 19:53:41, Kent Overstreet wrote:
> > > > [...]
> > > > > However, if we agreed that GFP_NOFAIL meant "only fail if it is not
> > > > > possible to satisfy this allocation" (and I have been arguing that that
> > > > > is the only sane meaning) - then that could lead to a lot of error paths
> > > > > getting simpler.
> > > > >
> > > > > Because there are a lot of places where there's essentially no good
> > > > > reason to bubble up an -ENOMEM to userspace; if we're actually out of
> > > > > memory the current allocation is just one out of many and not
> > > > > particularly special, better to let the oom killer handle it...
> > > >
> > > > This is exactly GFP_KERNEL semantic for low order allocations or
> > > > kvmalloc for that matter. They simply never fail unless couple of corner
> > > > cases - e.g. the allocating task is an oom victim and all of the oom
> > > > memory reserves have been consumed. This is where we call "not possible
> > > > to allocate".
> > >
> > > *nod*
> > >
> > > Which does beg the question of why GFP_NOFAIL exists.
> >
> > Exactly for the reason that even rare failure is not acceptable and
> > there is no way to handle it other than keep retrying. Typical code was
> > while (!(ptr = kmalloc()))
> > ;
>
> But is it _rare_ failure, or _no_ failure?
>
> You seem to be saying (and I just reviewed the code, it looks like
> you're right) that there is essentially no difference in behaviour
> between GFP_KERNEL and GFP_NOFAIL.
The fundamental difference is that (appart from unsupported allocation
mode/size) the latter never returns NULL and you can rely on that fact.
Our docummentation says:
* %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
* cannot handle allocation failures. The allocation could block
* indefinitely but will never return with failure. Testing for
* failure is pointless.
> So given that - why the wart?
>
> I think we might be able to chalk it up to history; I'd have to go
> spunking through the history (or ask Dave or Ted, maybe they'll chime
> in), but I suspect GFP_KERNEL didn't provide such strong guarantees when
> the allocation loops & GFP_NOFAIL were introduced.
Sure, go ahead. If you manage to remove all existing users of
__GFP_NOFAIL (without replacing them with retry loops in the caller)
then I would be more than happy to remove __GFP_NOFAIL in the allocator.
[...]
> > But the point is there are some which _do_ need this. We have discussed
> > that in other email thread where you have heard why XFS and EXT4 does
> > that and why they are not going to change that model.
>
> No, I agree that they need the strong guarantees.
>
> But if there's an actual bug, returning an error is better than killing
> the task. Killing the task is really bad; these allocations are deep in
> contexts where locks and refcounts are held, and the system will just
> grind to a halt.
Not sure what you mean by these allocations but I am not aware that any
of the existing user would be really buggy. Also as I've said elsewhere,
there is simply no good way to handle a buggy caller. Killing the buggy
context has some downsides, returning NULL has others. I have argued
that the former has better predictable behavior than potentially silent
failure. We can clearly disagree on this but I also do not see why that
is relevant to the original discussion because my argument against
PF_MEMALLOC_NORECLAIM was focused on correct GPF_NOFAIL nested context
that would get an unexpected failure mode. No matter what kind of
failure mode that is it would be unexpected for those users.
> > > But as a matter of policy going forward, yes we should be saying that
> > > even GFP_NOFAIL allocations should be checking for -ENOMEM.
> >
> > I argue that such NOFAIL semantic has no well defined semantic and legit
> > users are forced to do
> > while (!(ptr = kmalloc(GFP_NOFAIL))) ;
> > or
> > BUG_ON(!(ptr = kmalloc(GFP_NOFAIL)));
> >
> > So it has no real reason to exist.
>
> I'm arguing that it does, provided when it returns NULL is defined to
> be:
> - invalid allocation context
> - a size that is so big that it will never be possible to satisfy.
Those are not really important situations because you are arguing about
a buggy code that needs fixing. As said above we can argue how to deal
with those users to get a predictable behavior but as the matter of
fact, correct users can expect never seeing the failure so handling
failure might be a) impossible and b) unfeasible (i.e. you are adding a
dead code that is never tested).
[...]
> For large allocations in bcachefs: in journal replay we read all the
> keys in the journal, and then we create a big flat array with references
> to all of those keys to sort and dedup them.
>
> We haven't hit the INT_MAX size limit there yet, but filesystem sizes
> being what they are, we will soon. I've heard of users with 150 TB
> filesystems, and once the fsck scalability issues are sorted we'll be
> aiming for petabytes. Dirty keys in the journal scales more with system
> memory, but I'm leasing machines right now with a quarter terabyte of
> ram.
I thought you were arguing about bcachefs handling failure mode so
presumably you do not need to use __GFP_NOFAIL for those.
I am sorry but I am getting lost in these arguments.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-05 11:26 ` Michal Hocko
@ 2024-09-05 13:53 ` Theodore Ts'o
2024-09-05 14:05 ` Kent Overstreet
2024-09-05 14:12 ` Michal Hocko
0 siblings, 2 replies; 51+ messages in thread
From: Theodore Ts'o @ 2024-09-05 13:53 UTC (permalink / raw)
To: Michal Hocko
Cc: Kent Overstreet, Andrew Morton, Christoph Hellwig, Yafang Shao,
jack, Vlastimil Babka, Dave Chinner, Christian Brauner,
Alexander Viro, Paul Moore, James Morris, Serge E. Hallyn,
linux-fsdevel, linux-mm, linux-bcachefs, linux-security-module,
linux-kernel
On Thu, Sep 05, 2024 at 01:26:50PM +0200, Michal Hocko wrote:
> > > > > This is exactly GFP_KERNEL semantic for low order allocations or
> > > > > kvmalloc for that matter. They simply never fail unless couple of corner
> > > > > cases - e.g. the allocating task is an oom victim and all of the oom
> > > > > memory reserves have been consumed. This is where we call "not possible
> > > > > to allocate".
> > > >
> > > > Which does beg the question of why GFP_NOFAIL exists.
> > >
> > > Exactly for the reason that even rare failure is not acceptable and
> > > there is no way to handle it other than keep retrying. Typical code was
> > > while (!(ptr = kmalloc()))
> > > ;
> >
> > But is it _rare_ failure, or _no_ failure?
> >
> > You seem to be saying (and I just reviewed the code, it looks like
> > you're right) that there is essentially no difference in behaviour
> > between GFP_KERNEL and GFP_NOFAIL.
That may be the currrent state of affiars; but is it
****guaranteed**** forever and ever, amen, that GFP_KERNEL will never
fail if the amount of memory allocated was lower than a particular
multiple of the page size? If so, what is that size? I've checked,
and this is not documented in the formal interface.
> The fundamental difference is that (appart from unsupported allocation
> mode/size) the latter never returns NULL and you can rely on that fact.
> Our docummentation says:
> * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
> * cannot handle allocation failures. The allocation could block
> * indefinitely but will never return with failure. Testing for
> * failure is pointless.
So if the documentation is going to give similar guarantees, as
opposed to it being an accident of the current implementation that is
subject to change at any time, then sure, we can probably get away
with all or most of ext4's uses of __GFP_NOFAIL. But I don't want to
do that and then have a "Lucy and Charlie Brown" moment from the
Peanuts comics strip where the football suddenly gets snatched away
from us[1] (and many file sytem users will be very, very sad and/or
angry).
[1] https://www.cracked.com/article_37831_good-grief-how-lucy-pulling-the-football-away-from-charlie-brown-became-a-signature-peanuts-gag.html
It might be that other file systems have requirements which isblarger
than the not-formally-defined GFP_KMALLOC guarantee, but it's true we
can probably reduce the usage of GFP_NOFAIL if this guarantee is
formalized.
- Ted
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-05 13:53 ` Theodore Ts'o
@ 2024-09-05 14:05 ` Kent Overstreet
2024-09-05 15:24 ` Theodore Ts'o
2024-09-05 14:12 ` Michal Hocko
1 sibling, 1 reply; 51+ messages in thread
From: Kent Overstreet @ 2024-09-05 14:05 UTC (permalink / raw)
To: Theodore Ts'o
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Thu, Sep 05, 2024 at 09:53:26AM GMT, Theodore Ts'o wrote:
> On Thu, Sep 05, 2024 at 01:26:50PM +0200, Michal Hocko wrote:
> > > > > > This is exactly GFP_KERNEL semantic for low order allocations or
> > > > > > kvmalloc for that matter. They simply never fail unless couple of corner
> > > > > > cases - e.g. the allocating task is an oom victim and all of the oom
> > > > > > memory reserves have been consumed. This is where we call "not possible
> > > > > > to allocate".
> > > > >
> > > > > Which does beg the question of why GFP_NOFAIL exists.
> > > >
> > > > Exactly for the reason that even rare failure is not acceptable and
> > > > there is no way to handle it other than keep retrying. Typical code was
> > > > while (!(ptr = kmalloc()))
> > > > ;
> > >
> > > But is it _rare_ failure, or _no_ failure?
> > >
> > > You seem to be saying (and I just reviewed the code, it looks like
> > > you're right) that there is essentially no difference in behaviour
> > > between GFP_KERNEL and GFP_NOFAIL.
>
> That may be the currrent state of affiars; but is it
> ****guaranteed**** forever and ever, amen, that GFP_KERNEL will never
> fail if the amount of memory allocated was lower than a particular
> multiple of the page size? If so, what is that size? I've checked,
> and this is not documented in the formal interface.
Yeah, and I think we really need to make that happen, in order to head
off a lot more sillyness in the future.
We'd also be documenting at the same time _exactly_ when it is required
to check for errors:
- small, fixed sized allocation in a known sleepable context, safe to skip
- anything else, i.e. variable sized allocation or library code that can
be called from different contexts: you check for errors (and probably
that's just "something crazy has happened, emergency shutdown" for the
xfs/ext4 paths
> > The fundamental difference is that (appart from unsupported allocation
> > mode/size) the latter never returns NULL and you can rely on that fact.
> > Our docummentation says:
> > * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
> > * cannot handle allocation failures. The allocation could block
> > * indefinitely but will never return with failure. Testing for
> > * failure is pointless.
>
> So if the documentation is going to give similar guarantees, as
> opposed to it being an accident of the current implementation that is
> subject to change at any time, then sure, we can probably get away
> with all or most of ext4's uses of __GFP_NOFAIL. But I don't want to
> do that and then have a "Lucy and Charlie Brown" moment from the
> Peanuts comics strip where the football suddenly gets snatched away
> from us[1] (and many file sytem users will be very, very sad and/or
> angry).
yeah absolutely, and the "what is a small allocation" limit needs to be
nailed down as well
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-05 13:53 ` Theodore Ts'o
2024-09-05 14:05 ` Kent Overstreet
@ 2024-09-05 14:12 ` Michal Hocko
1 sibling, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2024-09-05 14:12 UTC (permalink / raw)
To: Theodore Ts'o
Cc: Kent Overstreet, Andrew Morton, Christoph Hellwig, Yafang Shao,
jack, Vlastimil Babka, Dave Chinner, Christian Brauner,
Alexander Viro, Paul Moore, James Morris, Serge E. Hallyn,
linux-fsdevel, linux-mm, linux-bcachefs, linux-security-module,
linux-kernel
On Thu 05-09-24 09:53:26, Theodore Ts'o wrote:
> On Thu, Sep 05, 2024 at 01:26:50PM +0200, Michal Hocko wrote:
> > > > > > This is exactly GFP_KERNEL semantic for low order allocations or
> > > > > > kvmalloc for that matter. They simply never fail unless couple of corner
> > > > > > cases - e.g. the allocating task is an oom victim and all of the oom
> > > > > > memory reserves have been consumed. This is where we call "not possible
> > > > > > to allocate".
> > > > >
> > > > > Which does beg the question of why GFP_NOFAIL exists.
> > > >
> > > > Exactly for the reason that even rare failure is not acceptable and
> > > > there is no way to handle it other than keep retrying. Typical code was
> > > > while (!(ptr = kmalloc()))
> > > > ;
> > >
> > > But is it _rare_ failure, or _no_ failure?
> > >
> > > You seem to be saying (and I just reviewed the code, it looks like
> > > you're right) that there is essentially no difference in behaviour
> > > between GFP_KERNEL and GFP_NOFAIL.
>
> That may be the currrent state of affiars; but is it
> ****guaranteed**** forever and ever, amen, that GFP_KERNEL will never
> fail if the amount of memory allocated was lower than a particular
> multiple of the page size?
No, GFP_KERNEL is not guaranteed. Allocator tries as hard as it can to
satisfy those allocations for order <= PAGE_ALLOC_COSTLY_ORDER.
GFP_NOFAIL is guaranteed for order <= 1 for page allocator and there is
no practical limit for vmalloc currently. This is what our documentation
says
* The default allocator behavior depends on the request size. We have a concept
* of so-called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
* !costly allocations are too essential to fail so they are implicitly
* non-failing by default (with some exceptions like OOM victims might fail so
* the caller still has to check for failures) while costly requests try to be
* not disruptive and back off even without invoking the OOM killer.
* The following three modifiers might be used to override some of these
* implicit rules.
There is no guarantee this will be that way for ever. This is unlikely
to change though.
--
Michal Hocko
SUSE Labs
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-05 14:05 ` Kent Overstreet
@ 2024-09-05 15:24 ` Theodore Ts'o
0 siblings, 0 replies; 51+ messages in thread
From: Theodore Ts'o @ 2024-09-05 15:24 UTC (permalink / raw)
To: Kent Overstreet
Cc: Michal Hocko, Andrew Morton, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Thu, Sep 05, 2024 at 10:05:15AM -0400, Kent Overstreet wrote:
> > That may be the currrent state of affiars; but is it
> > ****guaranteed**** forever and ever, amen, that GFP_KERNEL will never
> > fail if the amount of memory allocated was lower than a particular
> > multiple of the page size? If so, what is that size? I've checked,
> > and this is not documented in the formal interface.
>
> Yeah, and I think we really need to make that happen, in order to head
> off a lot more sillyness in the future.
I don't think there's any "sillyness"; I hear that you believe that
it's silly, but I think what we have is totally fine.
I've done a quick check of ext4, and we do check the error returns in
most if not all of the calls where we pass in __GFP_NOFAIL and/or are
small allocations less than the block size. We won't crash if someone
sends a patch which violates the documented guarantee of __GFP_NOFAIL.
So what's the sillynesss?
In any case, Michal has said ix-nay on making GFP_KERNEL == GFP_NOFAIL
for small allocations as documented guarantee, as opposed to the way
things work today, so as far as I'm concerned, the matter is closed.
- Ted
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-02 9:51 [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Michal Hocko
` (2 preceding siblings ...)
2024-09-02 9:53 ` [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Kent Overstreet
@ 2024-09-10 19:29 ` Andrew Morton
2024-09-10 19:37 ` Kent Overstreet
3 siblings, 1 reply; 51+ messages in thread
From: Andrew Morton @ 2024-09-10 19:29 UTC (permalink / raw)
To: Michal Hocko
Cc: Christoph Hellwig, Yafang Shao, Kent Overstreet, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
Thanks, I have queued this for 6.12-rc1.
Kent, if this results in observable runtime issues with bcachefs,
please report those and let's see what we can come up with to address
them.
^ permalink raw reply [flat|nested] 51+ messages in thread
* Re: [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM
2024-09-10 19:29 ` Andrew Morton
@ 2024-09-10 19:37 ` Kent Overstreet
0 siblings, 0 replies; 51+ messages in thread
From: Kent Overstreet @ 2024-09-10 19:37 UTC (permalink / raw)
To: Andrew Morton
Cc: Michal Hocko, Christoph Hellwig, Yafang Shao, jack,
Vlastimil Babka, Dave Chinner, Christian Brauner, Alexander Viro,
Paul Moore, James Morris, Serge E. Hallyn, linux-fsdevel,
linux-mm, linux-bcachefs, linux-security-module, linux-kernel
On Tue, Sep 10, 2024 at 12:29:12PM GMT, Andrew Morton wrote:
> Thanks, I have queued this for 6.12-rc1.
>
> Kent, if this results in observable runtime issues with bcachefs,
> please report those and let's see what we can come up with to address
> them.
Andrew, have you ever had to hunt down tail latency bugs?
^ permalink raw reply [flat|nested] 51+ messages in thread
* [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM
2024-09-26 17:11 [PATCH 0/2 v3] " Michal Hocko
@ 2024-09-26 17:11 ` Michal Hocko
0 siblings, 0 replies; 51+ messages in thread
From: Michal Hocko @ 2024-09-26 17:11 UTC (permalink / raw)
To: Andrew Morton
Cc: Christoph Hellwig, Yafang Shao, Kent Overstreet, jack,
Christian Brauner, Alexander Viro, Paul Moore, James Morris,
Serge E. Hallyn, linux-fsdevel, linux-mm, linux-bcachefs,
linux-security-module, linux-kernel, Michal Hocko, Dave Chinner
From: Michal Hocko <mhocko@suse.com>
bch2_new_inode relies on PF_MEMALLOC_NORECLAIM to try to allocate a new
inode to achieve GFP_NOWAIT semantic while holding locks. If this
allocation fails it will drop locks and use GFP_NOFS allocation context.
We would like to drop PF_MEMALLOC_NORECLAIM because it is really
dangerous to use if the caller doesn't control the full call chain with
this flag set. E.g. if any of the function down the chain needed
GFP_NOFAIL request the PF_MEMALLOC_NORECLAIM would override this and
cause unexpected failure.
While this is not the case in this particular case using the scoped gfp
semantic is not really needed bacause we can easily pus the allocation
context down the chain without too much clutter.
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz> # For vfs changes
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
fs/bcachefs/fs.c | 14 ++++++--------
fs/inode.c | 10 ++++++----
include/linux/fs.h | 7 ++++++-
include/linux/security.h | 4 ++--
security/security.c | 9 +++++----
5 files changed, 25 insertions(+), 19 deletions(-)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 4a1bb07a2574..14f50490825f 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -291,10 +291,10 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
BUG();
}
-static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
+static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c, gfp_t gfp)
{
struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb,
- bch2_inode_cache, GFP_NOFS);
+ bch2_inode_cache, gfp);
if (!inode)
return NULL;
@@ -306,7 +306,7 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
mutex_init(&inode->ei_quota_lock);
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
- if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
+ if (unlikely(inode_init_always_gfp(c->vfs_sb, &inode->v, gfp))) {
kmem_cache_free(bch2_inode_cache, inode);
return NULL;
}
@@ -319,12 +319,10 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
*/
static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
{
- struct bch_inode_info *inode =
- memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
- __bch2_new_inode(trans->c));
+ struct bch_inode_info *inode = __bch2_new_inode(trans->c, GFP_NOWAIT);
if (unlikely(!inode)) {
- int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c)) ? 0 : -ENOMEM);
+ int ret = drop_locks_do(trans, (inode = __bch2_new_inode(trans->c, GFP_NOFS)) ? 0 : -ENOMEM);
if (ret && inode) {
__destroy_inode(&inode->v);
kmem_cache_free(bch2_inode_cache, inode);
@@ -398,7 +396,7 @@ __bch2_create(struct mnt_idmap *idmap,
if (ret)
return ERR_PTR(ret);
#endif
- inode = __bch2_new_inode(c);
+ inode = __bch2_new_inode(c, GFP_NOFS);
if (unlikely(!inode)) {
inode = ERR_PTR(-ENOMEM);
goto err;
diff --git a/fs/inode.c b/fs/inode.c
index 471ae4a31549..8dabb224f941 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -146,14 +146,16 @@ static int no_open(struct inode *inode, struct file *file)
}
/**
- * inode_init_always - perform inode structure initialisation
+ * inode_init_always_gfp - perform inode structure initialisation
* @sb: superblock inode belongs to
* @inode: inode to initialise
+ * @gfp: allocation flags
*
* These are initializations that need to be done on every inode
* allocation as the fields are not initialised by slab allocation.
+ * If there are additional allocations required @gfp is used.
*/
-int inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
static const struct inode_operations empty_iops;
static const struct file_operations no_open_fops = {.open = no_open};
@@ -230,14 +232,14 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
#endif
inode->i_flctx = NULL;
- if (unlikely(security_inode_alloc(inode)))
+ if (unlikely(security_inode_alloc(inode, gfp)))
return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
}
-EXPORT_SYMBOL(inode_init_always);
+EXPORT_SYMBOL(inode_init_always_gfp);
void free_inode_nonrcu(struct inode *inode)
{
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eae5b67e4a15..c2d925235e6c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3082,7 +3082,12 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);
-extern int inode_init_always(struct super_block *, struct inode *);
+extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
+static inline int inode_init_always(struct super_block *sb, struct inode *inode)
+{
+ return inode_init_always_gfp(sb, inode, GFP_NOFS);
+}
+
extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
diff --git a/include/linux/security.h b/include/linux/security.h
index b86ec2afc691..2ec8f3014757 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -348,7 +348,7 @@ int security_dentry_create_files_as(struct dentry *dentry, int mode,
struct cred *new);
int security_path_notify(const struct path *path, u64 mask,
unsigned int obj_type);
-int security_inode_alloc(struct inode *inode);
+int security_inode_alloc(struct inode *inode, gfp_t gfp);
void security_inode_free(struct inode *inode);
int security_inode_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr,
@@ -789,7 +789,7 @@ static inline int security_path_notify(const struct path *path, u64 mask,
return 0;
}
-static inline int security_inode_alloc(struct inode *inode)
+static inline int security_inode_alloc(struct inode *inode, gfp_t gfp)
{
return 0;
}
diff --git a/security/security.c b/security/security.c
index 6875eb4a59fc..8947826cb756 100644
--- a/security/security.c
+++ b/security/security.c
@@ -745,14 +745,14 @@ static int lsm_file_alloc(struct file *file)
*
* Returns 0, or -ENOMEM if memory can't be allocated.
*/
-static int lsm_inode_alloc(struct inode *inode)
+static int lsm_inode_alloc(struct inode *inode, gfp_t gfp)
{
if (!lsm_inode_cache) {
inode->i_security = NULL;
return 0;
}
- inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS);
+ inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp);
if (inode->i_security == NULL)
return -ENOMEM;
return 0;
@@ -1678,6 +1678,7 @@ int security_path_notify(const struct path *path, u64 mask,
/**
* security_inode_alloc() - Allocate an inode LSM blob
* @inode: the inode
+ * #gfp: allocation flags
*
* Allocate and attach a security structure to @inode->i_security. The
* i_security field is initialized to NULL when the inode structure is
@@ -1685,9 +1686,9 @@ int security_path_notify(const struct path *path, u64 mask,
*
* Return: Return 0 if operation was successful.
*/
-int security_inode_alloc(struct inode *inode)
+int security_inode_alloc(struct inode *inode, gfp_t gfp)
{
- int rc = lsm_inode_alloc(inode);
+ int rc = lsm_inode_alloc(inode, gfp);
if (unlikely(rc))
return rc;
--
2.46.1
^ permalink raw reply related [flat|nested] 51+ messages in thread
end of thread, other threads:[~2024-09-26 17:29 UTC | newest]
Thread overview: 51+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-09-02 9:51 [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Michal Hocko
2024-09-02 9:51 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
2024-09-05 9:28 ` kernel test robot
2024-09-02 9:51 ` [PATCH 2/2] Revert "mm: introduce PF_MEMALLOC_NORECLAIM, PF_MEMALLOC_NOWARN" Michal Hocko
2024-09-02 9:53 ` [PATCH 0/2 v2] remove PF_MEMALLOC_NORECLAIM Kent Overstreet
2024-09-02 21:52 ` Andrew Morton
2024-09-02 22:32 ` Kent Overstreet
2024-09-03 7:06 ` Michal Hocko
2024-09-04 16:15 ` Kent Overstreet
2024-09-04 16:50 ` Michal Hocko
2024-09-03 23:53 ` Kent Overstreet
2024-09-04 7:14 ` Michal Hocko
2024-09-04 16:05 ` Kent Overstreet
2024-09-04 16:46 ` Michal Hocko
2024-09-04 18:03 ` Kent Overstreet
2024-09-04 22:34 ` Dave Chinner
2024-09-04 23:05 ` Kent Overstreet
2024-09-05 11:26 ` Michal Hocko
2024-09-05 13:53 ` Theodore Ts'o
2024-09-05 14:05 ` Kent Overstreet
2024-09-05 15:24 ` Theodore Ts'o
2024-09-05 14:12 ` Michal Hocko
2024-09-03 5:13 ` Christoph Hellwig
2024-09-04 16:27 ` Kent Overstreet
2024-09-04 17:01 ` Michal Hocko
2024-09-10 19:29 ` Andrew Morton
2024-09-10 19:37 ` Kent Overstreet
-- strict thread matches above, loose matches on Subject: below --
2024-09-26 17:11 [PATCH 0/2 v3] " Michal Hocko
2024-09-26 17:11 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
2024-08-26 8:47 [PATCH 0/2] get rid of PF_MEMALLOC_NORECLAIM Michal Hocko
2024-08-26 8:47 ` [PATCH 1/2] bcachefs: do not use PF_MEMALLOC_NORECLAIM Michal Hocko
2024-08-26 13:11 ` Matthew Wilcox
2024-08-26 16:48 ` Michal Hocko
2024-08-26 19:39 ` Kent Overstreet
2024-08-26 19:41 ` Matthew Wilcox
2024-08-26 19:42 ` Kent Overstreet
2024-08-26 19:47 ` Matthew Wilcox
2024-08-26 19:54 ` Kent Overstreet
2024-08-26 19:44 ` Kent Overstreet
2024-08-26 19:58 ` Michal Hocko
2024-08-26 20:00 ` Kent Overstreet
2024-08-26 20:27 ` Michal Hocko
2024-08-26 20:43 ` Kent Overstreet
2024-08-26 21:10 ` Kent Overstreet
2024-08-27 6:01 ` Michal Hocko
2024-08-27 6:40 ` Kent Overstreet
2024-08-27 6:58 ` Michal Hocko
2024-08-27 7:05 ` Kent Overstreet
2024-08-27 7:35 ` Michal Hocko
2024-08-26 19:52 ` kernel test robot
2024-08-26 20:53 ` kernel test robot
2024-08-27 2:23 ` kernel test robot
2024-08-29 9:37 ` Jan Kara
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).