* [PATCH resubmit] do_mount: reduce stack consumption
@ 2005-11-04 10:50 Heiko Carstens
2005-11-04 11:48 ` Al Viro
2005-11-04 16:48 ` Andrew Morton
0 siblings, 2 replies; 20+ messages in thread
From: Heiko Carstens @ 2005-11-04 10:50 UTC (permalink / raw)
To: Andrew Morton, linux-kernel; +Cc: Andreas Herrmann
From: Andreas Herrmann <aherrman@de.ibm.com>
This is a resubmit of Andreas' patch that reduces stackframe usage in
do_mount. Problem is that without this patch we get a kernel stack
overflow if we run with 4k stacks (s390 31 bit mode).
See original stack back trace below and Andreas' patch and analysis
here:
http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
<4>Call Trace:
<4>([<000000000014ade6>] buffered_rmqueue+0x36a/0x3b4)
<4> [<000000000014aed6>] __alloc_pages+0xa6/0x350
<4> [<000000000014b258>] __get_free_pages+0x38/0x74
<4> [<000000000014f9fe>] kmem_getpages+0x32/0x140
<4> [<0000000000150396>] cache_alloc_refill+0x3ae/0x58c
<4> [<000000000014feb0>] __kmalloc+0xc0/0xe8
<4> [<00000000002acc64>] zfcp_mempool_alloc+0x24/0x34
<4> [<00000000001494ea>] mempool_alloc+0x56/0x1e8
<4> [<00000000002bfb46>] zfcp_fsf_req_create+0x56/0x5d8
<4> [<00000000002c0cf0>] zfcp_fsf_send_fcp_command_task+0x44/0x5c4
<4> [<00000000002afbda>] zfcp_scsi_command_async+0x7a/0x1f8
<4> [<00000000002afe68>] zfcp_scsi_queuecommand+0x44/0x54
<4> [<000000000024f458>] scsi_dispatch_cmd+0x234/0x3c0
<4> [<0000000000255fb2>] scsi_request_fn+0x2c6/0x67c
<4> [<000000000023f3ea>] __generic_unplug_device+0x52/0x60
<4> [<000000000023d392>] __elv_add_request+0x8a/0x98
<4> [<0000000000240c5e>] __make_request+0x306/0x62c
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
<4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
<4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
<4> [<0000000000241074>] generic_make_request+0xf0/0x21c
<4> [<000000000024121a>] submit_bio+0x7a/0x154
<4> [<00000000001736e6>] submit_bh+0x106/0x184
<4> [<000000000017514e>] __block_write_full_page+0x23a/0x58c
<4> [<000000000017558e>] block_write_full_page+0xee/0x108
<4> [<000000000017a908>] blkdev_writepage+0x24/0x38
<4> [<00000000001a0f76>] mpage_writepages+0x1da/0x9d0
<4> [<000000000017a4fa>] generic_writepages+0x22/0x30
<4> [<000000000014cc1e>] do_writepages+0x36/0x54
<4> [<0000000000144fe2>] __filemap_fdatawrite+0x5a/0x6c
<4> [<0000000000145016>] filemap_fdatawrite+0x22/0x30
<4> [<0000000000171244>] sync_blockdev+0x30/0x5c
<4> [<00000000001df074>] journal_recover+0x88/0xbc
<4> [<00000000001e2876>] journal_load+0x52/0xcc
<4> [<00000000001d37b4>] ext3_fill_super+0x1a48/0x1d38
<4> [<0000000000179faa>] get_sb_bdev+0x13a/0x208
<4> [<00000000001d3bd6>] ext3_get_sb+0x22/0x34
<4> [<000000000017a32e>] do_kern_mount+0x86/0x1f4
<4> [<000000000019aaf4>] do_mount+0x1c8/0x8e4
<4> [<000000000019b7ba>] sys_mount+0xce/0x1dc
<4> [<000000000010f9a6>] sysc_do_restart+0xe/0x12
Signed-off-by: Andreas Herrmann <aherrman@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
fs/namespace.c | 80 ++++++++++++++++++++++++++++++++++++---------------------
1 files changed, 51 insertions(+), 29 deletions(-)
diff -urpN linux-2.6/fs/namespace.c linux-2.6-patched/fs/namespace.c
--- linux-2.6/fs/namespace.c 2005-10-28 02:02:08.000000000 +0200
+++ linux-2.6-patched/fs/namespace.c 2005-11-03 13:39:30.000000000 +0100
@@ -619,25 +619,31 @@ out_unlock:
*/
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
- struct nameidata old_nd;
+ struct nameidata *old_nd;
struct vfsmount *mnt = NULL;
int err = mount_is_safe(nd);
if (err)
return err;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
- if (err)
+
+ old_nd = kmalloc(sizeof(*old_nd), GFP_KERNEL);
+ if (!old_nd)
+ return -ENOMEM;
+ err = path_lookup(old_name, LOOKUP_FOLLOW, old_nd);
+ if (err) {
+ kfree(old_nd);
return err;
+ }
down_write(¤t->namespace->sem);
err = -EINVAL;
- if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
+ if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd->mnt))) {
err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+ mnt = copy_tree(old_nd->mnt, old_nd->dentry);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+ mnt = clone_mnt(old_nd->mnt, old_nd->dentry);
}
if (mnt) {
@@ -656,7 +662,8 @@ static int do_loopback(struct nameidata
}
up_write(¤t->namespace->sem);
- path_release(&old_nd);
+ path_release(old_nd);
+ kfree(old_nd);
return err;
}
@@ -693,22 +700,29 @@ static int do_remount(struct nameidata *
static int do_move_mount(struct nameidata *nd, char *old_name)
{
- struct nameidata old_nd, parent_nd;
+ struct {
+ struct nameidata old_nd, parent_nd;
+ } *loc;
struct vfsmount *p;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
- if (err)
+ loc = kmalloc(sizeof(*loc), GFP_KERNEL);
+ if (!loc)
+ return -ENOMEM;
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &loc->old_nd);
+ if (err) {
+ kfree (loc);
return err;
+ }
down_write(¤t->namespace->sem);
while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
- if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
+ if (!check_mnt(nd->mnt) || !check_mnt(loc->old_nd.mnt))
goto out;
err = -ENOENT;
@@ -721,28 +735,28 @@ static int do_move_mount(struct nameidat
goto out2;
err = -EINVAL;
- if (old_nd.dentry != old_nd.mnt->mnt_root)
+ if (loc->old_nd.dentry != loc->old_nd.mnt->mnt_root)
goto out2;
- if (old_nd.mnt == old_nd.mnt->mnt_parent)
+ if (loc->old_nd.mnt == loc->old_nd.mnt->mnt_parent)
goto out2;
if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
- S_ISDIR(old_nd.dentry->d_inode->i_mode))
+ S_ISDIR(loc->old_nd.dentry->d_inode->i_mode))
goto out2;
err = -ELOOP;
for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
- if (p == old_nd.mnt)
+ if (p == loc->old_nd.mnt)
goto out2;
err = 0;
- detach_mnt(old_nd.mnt, &parent_nd);
- attach_mnt(old_nd.mnt, nd);
+ detach_mnt(loc->old_nd.mnt, &loc->parent_nd);
+ attach_mnt(loc->old_nd.mnt, nd);
/* if the mount is moved, it should no longer be expire
* automatically */
- list_del_init(&old_nd.mnt->mnt_expire);
+ list_del_init(&loc->old_nd.mnt->mnt_expire);
out2:
spin_unlock(&vfsmount_lock);
out1:
@@ -750,8 +764,9 @@ out1:
out:
up_write(¤t->namespace->sem);
if (!err)
- path_release(&parent_nd);
- path_release(&old_nd);
+ path_release(&loc->parent_nd);
+ path_release(&loc->old_nd);
+ kfree(loc);
return err;
}
@@ -1014,7 +1029,7 @@ int copy_mount_options(const void __user
long do_mount(char * dev_name, char * dir_name, char *type_page,
unsigned long flags, void *data_page)
{
- struct nameidata nd;
+ struct nameidata *nd;
int retval = 0;
int mnt_flags = 0;
@@ -1041,27 +1056,34 @@ long do_mount(char * dev_name, char * di
mnt_flags |= MNT_NOEXEC;
flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);
+ nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+ if (!nd)
+ return -ENOMEM;
+
/* ... and get the mountpoint */
- retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
- if (retval)
+ retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
+ if (retval) {
+ kfree(nd);
return retval;
+ }
- retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
+ retval = security_sb_mount(dev_name, nd, type_page, flags, data_page);
if (retval)
goto dput_out;
if (flags & MS_REMOUNT)
- retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+ retval = do_remount(nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(nd, dev_name, flags & MS_REC);
else if (flags & MS_MOVE)
- retval = do_move_mount(&nd, dev_name);
+ retval = do_move_mount(nd, dev_name);
else
- retval = do_new_mount(&nd, type_page, flags, mnt_flags,
+ retval = do_new_mount(nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
- path_release(&nd);
+ path_release(nd);
+ kfree(nd);
return retval;
}
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-04 10:50 [PATCH resubmit] do_mount: reduce stack consumption Heiko Carstens
@ 2005-11-04 11:48 ` Al Viro
2005-11-04 12:57 ` Heiko Carstens
2005-11-04 16:48 ` Andrew Morton
1 sibling, 1 reply; 20+ messages in thread
From: Al Viro @ 2005-11-04 11:48 UTC (permalink / raw)
To: Heiko Carstens; +Cc: Andrew Morton, linux-kernel, Andreas Herrmann
On Fri, Nov 04, 2005 at 11:50:26AM +0100, Heiko Carstens wrote:
> From: Andreas Herrmann <aherrman@de.ibm.com>
>
> This is a resubmit of Andreas' patch that reduces stackframe usage in
> do_mount. Problem is that without this patch we get a kernel stack
> overflow if we run with 4k stacks (s390 31 bit mode).
> See original stack back trace below and Andreas' patch and analysis
> here:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
NAK. Rationale: too ugly.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-04 11:48 ` Al Viro
@ 2005-11-04 12:57 ` Heiko Carstens
2005-11-04 14:06 ` Al Viro
0 siblings, 1 reply; 20+ messages in thread
From: Heiko Carstens @ 2005-11-04 12:57 UTC (permalink / raw)
To: Al Viro; +Cc: Andrew Morton, linux-kernel, Andreas Herrmann
> > This is a resubmit of Andreas' patch that reduces stackframe usage in
> > do_mount. Problem is that without this patch we get a kernel stack
> > overflow if we run with 4k stacks (s390 31 bit mode).
> > See original stack back trace below and Andreas' patch and analysis
> > here:
> > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> NAK. Rationale: too ugly.
Ok, since I can only guess what you don't like: here is an updated patch
that probably addresses a few things.
If you don't like this one too, could you please explain what should be
changed?
Thanks,
Heiko
---
fs/namespace.c | 79 ++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 50 insertions(+), 29 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 2fa9fdf..c14fbfc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -619,25 +619,30 @@ out_unlock:
*/
static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
{
- struct nameidata old_nd;
+ struct nameidata *old_nd;
struct vfsmount *mnt = NULL;
int err = mount_is_safe(nd);
if (err)
return err;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+
+ old_nd = kmalloc(sizeof(*old_nd), GFP_KERNEL);
+ if (!old_nd)
+ return -ENOMEM;
+
+ err = path_lookup(old_name, LOOKUP_FOLLOW, old_nd);
if (err)
- return err;
+ goto out;
down_write(¤t->namespace->sem);
err = -EINVAL;
- if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd.mnt))) {
+ if (check_mnt(nd->mnt) && (!recurse || check_mnt(old_nd->mnt))) {
err = -ENOMEM;
if (recurse)
- mnt = copy_tree(old_nd.mnt, old_nd.dentry);
+ mnt = copy_tree(old_nd->mnt, old_nd->dentry);
else
- mnt = clone_mnt(old_nd.mnt, old_nd.dentry);
+ mnt = clone_mnt(old_nd->mnt, old_nd->dentry);
}
if (mnt) {
@@ -656,7 +661,9 @@ static int do_loopback(struct nameidata
}
up_write(¤t->namespace->sem);
- path_release(&old_nd);
+ path_release(old_nd);
+out:
+ kfree(old_nd);
return err;
}
@@ -693,22 +700,28 @@ static int do_remount(struct nameidata *
static int do_move_mount(struct nameidata *nd, char *old_name)
{
- struct nameidata old_nd, parent_nd;
+ struct {
+ struct nameidata old_nd, parent_nd;
+ } *loc;
struct vfsmount *p;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!old_name || !*old_name)
return -EINVAL;
- err = path_lookup(old_name, LOOKUP_FOLLOW, &old_nd);
+ loc = kmalloc(sizeof(*loc), GFP_KERNEL);
+ if (!loc)
+ return -ENOMEM;
+
+ err = path_lookup(old_name, LOOKUP_FOLLOW, &loc->old_nd);
if (err)
- return err;
+ goto path_out;
down_write(¤t->namespace->sem);
while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
- if (!check_mnt(nd->mnt) || !check_mnt(old_nd.mnt))
+ if (!check_mnt(nd->mnt) || !check_mnt(loc->old_nd.mnt))
goto out;
err = -ENOENT;
@@ -721,28 +734,28 @@ static int do_move_mount(struct nameidat
goto out2;
err = -EINVAL;
- if (old_nd.dentry != old_nd.mnt->mnt_root)
+ if (loc->old_nd.dentry != loc->old_nd.mnt->mnt_root)
goto out2;
- if (old_nd.mnt == old_nd.mnt->mnt_parent)
+ if (loc->old_nd.mnt == loc->old_nd.mnt->mnt_parent)
goto out2;
if (S_ISDIR(nd->dentry->d_inode->i_mode) !=
- S_ISDIR(old_nd.dentry->d_inode->i_mode))
+ S_ISDIR(loc->old_nd.dentry->d_inode->i_mode))
goto out2;
err = -ELOOP;
for (p = nd->mnt; p->mnt_parent!=p; p = p->mnt_parent)
- if (p == old_nd.mnt)
+ if (p == loc->old_nd.mnt)
goto out2;
err = 0;
- detach_mnt(old_nd.mnt, &parent_nd);
- attach_mnt(old_nd.mnt, nd);
+ detach_mnt(loc->old_nd.mnt, &loc->parent_nd);
+ attach_mnt(loc->old_nd.mnt, nd);
/* if the mount is moved, it should no longer be expire
* automatically */
- list_del_init(&old_nd.mnt->mnt_expire);
+ list_del_init(&loc->old_nd.mnt->mnt_expire);
out2:
spin_unlock(&vfsmount_lock);
out1:
@@ -750,8 +763,10 @@ out1:
out:
up_write(¤t->namespace->sem);
if (!err)
- path_release(&parent_nd);
- path_release(&old_nd);
+ path_release(&loc->parent_nd);
+ path_release(&loc->old_nd);
+path_out:
+ kfree(loc);
return err;
}
@@ -1014,7 +1029,7 @@ int copy_mount_options(const void __user
long do_mount(char * dev_name, char * dir_name, char *type_page,
unsigned long flags, void *data_page)
{
- struct nameidata nd;
+ struct nameidata *nd;
int retval = 0;
int mnt_flags = 0;
@@ -1041,27 +1056,33 @@ long do_mount(char * dev_name, char * di
mnt_flags |= MNT_NOEXEC;
flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);
+ nd = kmalloc(sizeof(*nd), GFP_KERNEL);
+ if (!nd)
+ return -ENOMEM;
+
/* ... and get the mountpoint */
- retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
+ retval = path_lookup(dir_name, LOOKUP_FOLLOW, nd);
if (retval)
- return retval;
+ goto path_out;
- retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page);
+ retval = security_sb_mount(dev_name, nd, type_page, flags, data_page);
if (retval)
goto dput_out;
if (flags & MS_REMOUNT)
- retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+ retval = do_remount(nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(nd, dev_name, flags & MS_REC);
else if (flags & MS_MOVE)
- retval = do_move_mount(&nd, dev_name);
+ retval = do_move_mount(nd, dev_name);
else
- retval = do_new_mount(&nd, type_page, flags, mnt_flags,
+ retval = do_new_mount(nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
- path_release(&nd);
+ path_release(nd);
+path_out:
+ kfree(nd);
return retval;
}
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-04 12:57 ` Heiko Carstens
@ 2005-11-04 14:06 ` Al Viro
2005-11-04 22:03 ` Heiko Carstens
0 siblings, 1 reply; 20+ messages in thread
From: Al Viro @ 2005-11-04 14:06 UTC (permalink / raw)
To: Heiko Carstens; +Cc: Andrew Morton, linux-kernel, Andreas Herrmann
On Fri, Nov 04, 2005 at 01:57:05PM +0100, Heiko Carstens wrote:
> > > This is a resubmit of Andreas' patch that reduces stackframe usage in
> > > do_mount. Problem is that without this patch we get a kernel stack
> > > overflow if we run with 4k stacks (s390 31 bit mode).
> > > See original stack back trace below and Andreas' patch and analysis
> > > here:
> > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
> >
> > NAK. Rationale: too ugly.
>
> Ok, since I can only guess what you don't like: here is an updated patch
> that probably addresses a few things.
> If you don't like this one too, could you please explain what should be
> changed?
Depth analysis. E.g. do_move_mount() change is simply nonsense - _this_
is not going to overflow, no matter what. And do_add_mount() change
is also very suspicious - looks like you are attacking the wrong place
in call chain.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-04 14:06 ` Al Viro
@ 2005-11-04 22:03 ` Heiko Carstens
0 siblings, 0 replies; 20+ messages in thread
From: Heiko Carstens @ 2005-11-04 22:03 UTC (permalink / raw)
To: Al Viro; +Cc: Andrew Morton, linux-kernel, Andreas Herrmann
> > If you don't like this one too, could you please explain what should be
> > changed?
>
> Depth analysis. E.g. do_move_mount() change is simply nonsense - _this_
> is not going to overflow, no matter what. And do_add_mount() change
> is also very suspicious - looks like you are attacking the wrong place
> in call chain.
You're right (see also my other reply to Andrew).
Thanks,
Heiko
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-04 10:50 [PATCH resubmit] do_mount: reduce stack consumption Heiko Carstens
2005-11-04 11:48 ` Al Viro
@ 2005-11-04 16:48 ` Andrew Morton
2005-11-04 21:27 ` Heiko Carstens
1 sibling, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2005-11-04 16:48 UTC (permalink / raw)
To: Heiko Carstens; +Cc: linux-kernel, aherrman
Heiko Carstens <heiko.carstens@de.ibm.com> wrote:
>
> From: Andreas Herrmann <aherrman@de.ibm.com>
>
> This is a resubmit of Andreas' patch that reduces stackframe usage in
> do_mount. Problem is that without this patch we get a kernel stack
> overflow if we run with 4k stacks (s390 31 bit mode).
> See original stack back trace below and Andreas' patch and analysis
> here:
> http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> <4>Call Trace:
> <4>([<000000000014ade6>] buffered_rmqueue+0x36a/0x3b4)
> <4> [<000000000014aed6>] __alloc_pages+0xa6/0x350
> <4> [<000000000014b258>] __get_free_pages+0x38/0x74
> <4> [<000000000014f9fe>] kmem_getpages+0x32/0x140
> <4> [<0000000000150396>] cache_alloc_refill+0x3ae/0x58c
> <4> [<000000000014feb0>] __kmalloc+0xc0/0xe8
> <4> [<00000000002acc64>] zfcp_mempool_alloc+0x24/0x34
> <4> [<00000000001494ea>] mempool_alloc+0x56/0x1e8
> <4> [<00000000002bfb46>] zfcp_fsf_req_create+0x56/0x5d8
> <4> [<00000000002c0cf0>] zfcp_fsf_send_fcp_command_task+0x44/0x5c4
> <4> [<00000000002afbda>] zfcp_scsi_command_async+0x7a/0x1f8
> <4> [<00000000002afe68>] zfcp_scsi_queuecommand+0x44/0x54
> <4> [<000000000024f458>] scsi_dispatch_cmd+0x234/0x3c0
> <4> [<0000000000255fb2>] scsi_request_fn+0x2c6/0x67c
> <4> [<000000000023f3ea>] __generic_unplug_device+0x52/0x60
> <4> [<000000000023d392>] __elv_add_request+0x8a/0x98
> <4> [<0000000000240c5e>] __make_request+0x306/0x62c
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> <4> [<000000000024121a>] submit_bio+0x7a/0x154
> <4> [<00000000001736e6>] submit_bh+0x106/0x184
> <4> [<000000000017514e>] __block_write_full_page+0x23a/0x58c
> <4> [<000000000017558e>] block_write_full_page+0xee/0x108
> <4> [<000000000017a908>] blkdev_writepage+0x24/0x38
> <4> [<00000000001a0f76>] mpage_writepages+0x1da/0x9d0
> <4> [<000000000017a4fa>] generic_writepages+0x22/0x30
> <4> [<000000000014cc1e>] do_writepages+0x36/0x54
> <4> [<0000000000144fe2>] __filemap_fdatawrite+0x5a/0x6c
> <4> [<0000000000145016>] filemap_fdatawrite+0x22/0x30
> <4> [<0000000000171244>] sync_blockdev+0x30/0x5c
> <4> [<00000000001df074>] journal_recover+0x88/0xbc
> <4> [<00000000001e2876>] journal_load+0x52/0xcc
> <4> [<00000000001d37b4>] ext3_fill_super+0x1a48/0x1d38
> <4> [<0000000000179faa>] get_sb_bdev+0x13a/0x208
> <4> [<00000000001d3bd6>] ext3_get_sb+0x22/0x34
> <4> [<000000000017a32e>] do_kern_mount+0x86/0x1f4
> <4> [<000000000019aaf4>] do_mount+0x1c8/0x8e4
> <4> [<000000000019b7ba>] sys_mount+0xce/0x1dc
> <4> [<000000000010f9a6>] sysc_do_restart+0xe/0x12
I'd call that a device mapper bug. If you were to increase the stacking
from 4-deep to 5-deep, it will crash the kernel, patched or not.
I'm (very) surprised that DM doesn't perform its stacking iteratively.
Eliminating the 36-byte `struct clone_info' in __split_bio() would be most
effective here.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-04 16:48 ` Andrew Morton
@ 2005-11-04 21:27 ` Heiko Carstens
2005-11-04 23:55 ` Adrian Bunk
0 siblings, 1 reply; 20+ messages in thread
From: Heiko Carstens @ 2005-11-04 21:27 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, aherrman
> > See original stack back trace below and Andreas' patch and analysis
> > here:
> > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
I probably should add that with "original" stack back trace a trace of
a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
still the same in 2.6.14.
> > <4>Call Trace:
...
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
...
This part of the call trace is actually good for >1500 bytes of stack
usage and is what kills us and should be fixed.
I'm surprised that there are no other bug reports regarding DM and
stack overflow with 4k stacks.
> I'd call that a device mapper bug. If you were to increase the stacking
> from 4-deep to 5-deep, it will crash the kernel, patched or not.
Yes, you're right. Just forget the do_mount() patch. It's the wrong approach.
Heiko
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-04 21:27 ` Heiko Carstens
@ 2005-11-04 23:55 ` Adrian Bunk
2005-11-05 0:08 ` Andrew Morton
0 siblings, 1 reply; 20+ messages in thread
From: Adrian Bunk @ 2005-11-04 23:55 UTC (permalink / raw)
To: Heiko Carstens; +Cc: Andrew Morton, linux-kernel, aherrman
On Fri, Nov 04, 2005 at 10:27:42PM +0100, Heiko Carstens wrote:
> > > See original stack back trace below and Andreas' patch and analysis
> > > here:
> > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
>
> I probably should add that with "original" stack back trace a trace of
> a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
> still the same in 2.6.14.
>
> > > <4>Call Trace:
> ...
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> ...
>
> This part of the call trace is actually good for >1500 bytes of stack
> usage and is what kills us and should be fixed.
> I'm surprised that there are no other bug reports regarding DM and
> stack overflow with 4k stacks.
>...
There were some reports of dm+xfs overflows with 4k stacks on i386.
The xfs side was sorted out, but I son't know the state of the dm part.
> Heiko
cu
Adrian
--
"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-04 23:55 ` Adrian Bunk
@ 2005-11-05 0:08 ` Andrew Morton
2005-11-05 1:15 ` cplk
0 siblings, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2005-11-05 0:08 UTC (permalink / raw)
To: Adrian Bunk; +Cc: heiko.carstens, linux-kernel, aherrman, dm-devel
Adrian Bunk <bunk@stusta.de> wrote:
>
> On Fri, Nov 04, 2005 at 10:27:42PM +0100, Heiko Carstens wrote:
> > > > See original stack back trace below and Andreas' patch and analysis
> > > > here:
> > > > http://www.ussg.iu.edu/hypermail/linux/kernel/0410.3/1844.html
> >
> > I probably should add that with "original" stack back trace a trace of
> > a 2.6.10 kernel was meant, if that wasn't clear, but the DM code is
> > still the same in 2.6.14.
> >
> > > > <4>Call Trace:
> > ...
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > > > <4> [<0000000010831380>] __map_bio+0x70/0x160 [dm_mod]
> > > > <4> [<000000001083173e>] __split_bio+0x1e6/0x538 [dm_mod]
> > > > <4> [<0000000010831ba8>] dm_request+0x118/0x25c [dm_mod]
> > > > <4> [<0000000000241074>] generic_make_request+0xf0/0x21c
> > ...
> >
> > This part of the call trace is actually good for >1500 bytes of stack
> > usage and is what kills us and should be fixed.
> > I'm surprised that there are no other bug reports regarding DM and
> > stack overflow with 4k stacks.
> >...
>
> There were some reports of dm+xfs overflows with 4k stacks on i386.
>
> The xfs side was sorted out, but I son't know the state of the dm part.
>
The state is Very Bad, IMO. At the very least, DM should struggle to the
utmost to reduce the stack utilisation around that recursion point.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-05 0:08 ` Andrew Morton
@ 2005-11-05 1:15 ` cplk
2005-11-05 1:37 ` Andrew Morton
0 siblings, 1 reply; 20+ messages in thread
From: cplk @ 2005-11-05 1:15 UTC (permalink / raw)
To: Andrew Morton
Cc: Adrian Bunk, heiko.carstens, linux-kernel, aherrman, dm-devel
> > > This part of the call trace is actually good for >1500 bytes of stack
> > > usage and is what kills us and should be fixed.
> > > I'm surprised that there are no other bug reports regarding DM and
> > > stack overflow with 4k stacks.
> > >...
> >
> > There were some reports of dm+xfs overflows with 4k stacks on i386.
> >
> > The xfs side was sorted out, but I son't know the state of the dm part.
> >
>
> The state is Very Bad, IMO. At the very least, DM should struggle to the
> utmost to reduce the stack utilisation around that recursion point.
Neil Brown suggested a change to generic_make_request to convert recursion
through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
there was no discussion at the time. Would this help with this case?
Chris
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-05 1:15 ` cplk
@ 2005-11-05 1:37 ` Andrew Morton
2005-11-05 5:37 ` Neil Brown
0 siblings, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2005-11-05 1:37 UTC (permalink / raw)
To: cplk, Neil Brown; +Cc: bunk, heiko.carstens, linux-kernel, aherrman, dm-devel
cplk@itee.uq.edu.au wrote:
>
> > > > This part of the call trace is actually good for >1500 bytes of stack
> > > > usage and is what kills us and should be fixed.
> > > > I'm surprised that there are no other bug reports regarding DM and
> > > > stack overflow with 4k stacks.
> > > >...
> > >
> > > There were some reports of dm+xfs overflows with 4k stacks on i386.
> > >
> > > The xfs side was sorted out, but I son't know the state of the dm part.
> > >
> >
> > The state is Very Bad, IMO. At the very least, DM should struggle to the
> > utmost to reduce the stack utilisation around that recursion point.
>
> Neil Brown suggested a change to generic_make_request to convert recursion
> through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
> there was no discussion at the time. Would this help with this case?
It certainly would. That looks like a good thing to do some more work on.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-05 1:37 ` Andrew Morton
@ 2005-11-05 5:37 ` Neil Brown
2005-11-07 0:16 ` [dm-devel] " Neil Brown
0 siblings, 1 reply; 20+ messages in thread
From: Neil Brown @ 2005-11-05 5:37 UTC (permalink / raw)
To: Andrew Morton
Cc: cplk, bunk, heiko.carstens, linux-kernel, aherrman, dm-devel
On Friday November 4, akpm@osdl.org wrote:
> cplk@itee.uq.edu.au wrote:
> >
> > > > > This part of the call trace is actually good for >1500 bytes of stack
> > > > > usage and is what kills us and should be fixed.
> > > > > I'm surprised that there are no other bug reports regarding DM and
> > > > > stack overflow with 4k stacks.
> > > > >...
> > > >
> > > > There were some reports of dm+xfs overflows with 4k stacks on i386.
> > > >
> > > > The xfs side was sorted out, but I son't know the state of the dm part.
> > > >
> > >
> > > The state is Very Bad, IMO. At the very least, DM should struggle to the
> > > utmost to reduce the stack utilisation around that recursion point.
> >
> > Neil Brown suggested a change to generic_make_request to convert recursion
> > through it into iteration (see http://lkml.org/lkml/2005/9/2/34 ), but
> > there was no discussion at the time. Would this help with this case?
>
> It certainly would. That looks like a good thing to do some more work on.
Ok, I'll dust it off, make sure it seems to work (at the time I first
wrote it, I think it caused 'md' to deadlock) and submit it.
NeilBrown
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [dm-devel] Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-05 5:37 ` Neil Brown
@ 2005-11-07 0:16 ` Neil Brown
2005-11-07 16:15 ` Heiko Carstens
2005-11-07 23:37 ` Andrew Morton
0 siblings, 2 replies; 20+ messages in thread
From: Neil Brown @ 2005-11-07 0:16 UTC (permalink / raw)
To: device-mapper development
Cc: Andrew Morton, heiko.carstens, linux-kernel, aherrman, bunk, cplk
On Saturday November 5, neilb@suse.de wrote:
>
> Ok, I'll dust it off, make sure it seems to work (at the time I first
> wrote it, I think it caused 'md' to deadlock) and submit it.
>
> NeilBrown
>
For your consideration and testing (it works for me, but I'd like to
see it tested a bit more heavily in a variety of configurations).
NeilBrown
--
Reduce stack usage with stacked block devices
When stacked block devices are in-use (e.g. md or dm), the recursive
calls to generic_make_request can use up a lot of space, and we would
rather they didn't.
As generic_make_request is a void function, and as it is generally not
expected that it will have any effect immediately, it is safe to delay
any call to generic_make_request until there is sufficient stack space
available.
As ->bi_next is reserved for the driver to use, it can have no valid
value when generic_make_request is called, and as __make_request
implicitly assumes it will be NULL (ELEVATOR_BACK_MERGE fork of
switch) we can be certain that all callers set it to NULL. We can
therefore safely use bi_next to link pending requests together,
providing we clear it before making the real call.
So, we choose to allow each thread to only be active in one
generic_make_request at a time. If a subsequent (recursive) call is
made, the bio is linked into a per-thread list, and is handled when
the active call completes.
As the list of pending bios is per-thread, there are no locking issues
to worry about.
I say above that it is "safe to delay any call...". There are,
however, some behaviours of a make_request_fn which would make it
unsafe. These include any behaviour that assumes anything will have
changed after a recursive call to generic_make_request.
These could include:
- waiting for that call to finish and call it's bi_end_io function.
md use to sometimes do this (marking the superblock dirty before
completing a write) but doesn't any more
- inspecting the bio for fields that generic_make_request might
change, such as bi_sector or bi_bdev. It is hard to see a good
reason for this, and I don't think anyone actually does it.
- inspecing the queue to see if, e.g. it is 'full' yet. Again, I
think this is very unlikely to be useful, or to be done.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Neil Brown <neilb@suse.de>
### Diffstat output
./drivers/block/ll_rw_blk.c | 53 +++++++++++++++++++++++++++++++++++++++++++-
./include/linux/sched.h | 3 ++
2 files changed, 55 insertions(+), 1 deletion(-)
diff ./drivers/block/ll_rw_blk.c~current~ ./drivers/block/ll_rw_blk.c
--- ./drivers/block/ll_rw_blk.c~current~ 2005-11-07 10:01:36.000000000 +1100
+++ ./drivers/block/ll_rw_blk.c 2005-11-07 10:33:47.000000000 +1100
@@ -2957,7 +2957,7 @@ static void handle_bad_sector(struct bio
* bi_sector for remaps as it sees fit. So the values of these fields
* should NOT be depended on after the call to generic_make_request.
*/
-void generic_make_request(struct bio *bio)
+static inline void __generic_make_request(struct bio *bio)
{
request_queue_t *q;
sector_t maxsector;
@@ -3038,6 +3038,57 @@ end_io:
} while (ret);
}
+/*
+ * We only want one ->make_request_fn to be active at a time,
+ * else stack usage with stacked devices could be a problem.
+ * So use current->bio_{list,tail} to keep a list of requests
+ * submited by a make_request_fn function.
+ * current->bio_tail is also used as a flag to say if
+ * generic_make_request is currently active in this task or not.
+ * If it is NULL, then no make_request is active. If it is non-NULL,
+ * then a make_request is active, and new requests should be added
+ * at the tail
+ */
+void generic_make_request(struct bio *bio)
+{
+ if (current->bio_tail) {
+ /* make_request is active */
+ *(current->bio_tail) = bio;
+ bio->bi_next = NULL;
+ current->bio_tail = &bio->bi_next;
+ return;
+ }
+ /* following loop may be a bit non-obvious, and so deserves some
+ * explantion.
+ * Before entering the loop, bio->bi_next is NULL (as all callers
+ * ensure that) so we have a list with a single bio.
+ * We pretend that we have just taken it off a longer list, so
+ * we assign bio_list to the next (which is NULL) and bio_tail
+ * to &bio_list, thus initialising the bio_list of new bios to be
+ * added. __generic_make_request may indeed add some more bios
+ * through a recursive call to generic_make_request. If it
+ * did, we find a non-NULL value in bio_list and re-enter the loop
+ * from the top. In this case we really did just take the bio
+ * of the top of the list (no pretending) and so fixup bio_list and
+ * bio_tail or bi_next, and call into __generic_make_request again.
+ *
+ * The loop was structured like this to make only one call to
+ * __generic_make_request (which is important as it is large and inlined)
+ * and to keep the structure simple.
+ */
+ BUG_ON(bio->bi_next);
+ do {
+ current->bio_list = bio->bi_next;
+ if (bio->bi_next == NULL)
+ current->bio_tail = ¤t->bio_list;
+ else
+ bio->bi_next = NULL;
+ __generic_make_request(bio);
+ bio = current->bio_list;
+ } while (bio);
+ current->bio_tail = NULL; /* deactivate */
+}
+
EXPORT_SYMBOL(generic_make_request);
/**
diff ./include/linux/sched.h~current~ ./include/linux/sched.h
--- ./include/linux/sched.h~current~ 2005-11-07 10:01:36.000000000 +1100
+++ ./include/linux/sched.h 2005-11-07 10:02:23.000000000 +1100
@@ -829,6 +829,9 @@ struct task_struct {
/* journalling filesystem info */
void *journal_info;
+/* stacked block device info */
+ struct bio *bio_list, **bio_tail;
+
/* VM state */
struct reclaim_state *reclaim_state;
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [dm-devel] Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-07 0:16 ` [dm-devel] " Neil Brown
@ 2005-11-07 16:15 ` Heiko Carstens
2005-11-07 22:12 ` Neil Brown
2005-11-07 23:37 ` Andrew Morton
1 sibling, 1 reply; 20+ messages in thread
From: Heiko Carstens @ 2005-11-07 16:15 UTC (permalink / raw)
To: Neil Brown
Cc: device-mapper development, Andrew Morton, linux-kernel, aherrman,
bunk, cplk
> > Ok, I'll dust it off, make sure it seems to work (at the time I first
> > wrote it, I think it caused 'md' to deadlock) and submit it.
> >
> > NeilBrown
> >
>
> For your consideration and testing (it works for me, but I'd like to
> see it tested a bit more heavily in a variety of configurations).
Works fine for me. Thank you!
However I noticed another recursive call while dumping all call traces:
...
[<00000000001e1952>] zfcp_scsi_command_async+0xf6/0x1c4
[<000000000013cd6a>] scsi_dispatch_cmd+0x1ae/0x37c
[<0000000000143dc4>] scsi_request_fn+0x1dc/0x390
[<000000000012f65a>] generic_unplug_device+0x2a/0x38
[<0000000000181262>] dm_table_unplug_all+0x42/0x58
[<000000000017e598>] dm_unplug_all+0x2c/0x44
[<0000000000181262>] dm_table_unplug_all+0x42/0x58
[<000000000017e598>] dm_unplug_all+0x2c/0x44
[<00000000000782aa>] block_sync_page+0x4e/0x68
[<00000000000514f8>] sync_page+0x48/0x68
[<00000000002ab36e>] __wait_on_bit+0x9a/0xe0
...
Even though there is a lot of space left on the stack (this time) I don't
think it's desirable to have recursive calls. Especially not when going
for 4k stacks.
Heiko
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [dm-devel] Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-07 16:15 ` Heiko Carstens
@ 2005-11-07 22:12 ` Neil Brown
0 siblings, 0 replies; 20+ messages in thread
From: Neil Brown @ 2005-11-07 22:12 UTC (permalink / raw)
To: Heiko Carstens
Cc: device-mapper development, Andrew Morton, linux-kernel, aherrman,
bunk, cplk
On Monday November 7, heiko.carstens@de.ibm.com wrote:
> > > Ok, I'll dust it off, make sure it seems to work (at the time I first
> > > wrote it, I think it caused 'md' to deadlock) and submit it.
> > >
> > > NeilBrown
> > >
> >
> > For your consideration and testing (it works for me, but I'd like to
> > see it tested a bit more heavily in a variety of configurations).
>
> Works fine for me. Thank you!
>
Thanks for the feed-back.
> However I noticed another recursive call while dumping all call traces:
> ...
> [<00000000001e1952>] zfcp_scsi_command_async+0xf6/0x1c4
> [<000000000013cd6a>] scsi_dispatch_cmd+0x1ae/0x37c
> [<0000000000143dc4>] scsi_request_fn+0x1dc/0x390
> [<000000000012f65a>] generic_unplug_device+0x2a/0x38
> [<0000000000181262>] dm_table_unplug_all+0x42/0x58
> [<000000000017e598>] dm_unplug_all+0x2c/0x44
> [<0000000000181262>] dm_table_unplug_all+0x42/0x58
> [<000000000017e598>] dm_unplug_all+0x2c/0x44
> [<00000000000782aa>] block_sync_page+0x4e/0x68
> [<00000000000514f8>] sync_page+0x48/0x68
> [<00000000002ab36e>] __wait_on_bit+0x9a/0xe0
> ...
Hmm.. This (and a similar one with ->issue_flush_fn) would be a lot
harder to unwind, but it is also much more benign. These sorts of
calls should always be very shallow.
I don't think there is anything useful that can be done here, and I'm
not convinced there is a real problem.
It is worth noticing it and keeping it in mind though.
Thanks,
NeilBrown
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [dm-devel] Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-07 0:16 ` [dm-devel] " Neil Brown
2005-11-07 16:15 ` Heiko Carstens
@ 2005-11-07 23:37 ` Andrew Morton
2005-11-08 0:32 ` Nick Piggin
1 sibling, 1 reply; 20+ messages in thread
From: Andrew Morton @ 2005-11-07 23:37 UTC (permalink / raw)
To: Neil Brown; +Cc: dm-devel, heiko.carstens, linux-kernel, aherrman, bunk, cplk
Neil Brown <neilb@suse.de> wrote:
>
> ...
> Reduce stack usage with stacked block devices
>
> ...
> diff ./include/linux/sched.h~current~ ./include/linux/sched.h
> --- ./include/linux/sched.h~current~ 2005-11-07 10:01:36.000000000 +1100
> +++ ./include/linux/sched.h 2005-11-07 10:02:23.000000000 +1100
> @@ -829,6 +829,9 @@ struct task_struct {
> /* journalling filesystem info */
> void *journal_info;
>
> +/* stacked block device info */
> + struct bio *bio_list, **bio_tail;
> +
> /* VM state */
> struct reclaim_state *reclaim_state;
>
More state in the task_strut is a bit sad, but not nearly as sad as deep
recursion in our deepest codepath..
Possibly one could do:
struct make_request_state {
struct bio *bio_list;
struct bio **bio_tail;
};
and stick a `struct make_request_state *' into the task_struct and actually
allocate the thing on the stack. That's not much nicer though.
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [dm-devel] Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-07 23:37 ` Andrew Morton
@ 2005-11-08 0:32 ` Nick Piggin
2005-11-08 1:03 ` Neil Brown
0 siblings, 1 reply; 20+ messages in thread
From: Nick Piggin @ 2005-11-08 0:32 UTC (permalink / raw)
To: Andrew Morton
Cc: Neil Brown, dm-devel, heiko.carstens, linux-kernel, aherrman,
bunk, cplk
Andrew Morton wrote:
> Neil Brown <neilb@suse.de> wrote:
>
>>...
>>Reduce stack usage with stacked block devices
>>
>>...
>>diff ./include/linux/sched.h~current~ ./include/linux/sched.h
>>--- ./include/linux/sched.h~current~ 2005-11-07 10:01:36.000000000 +1100
>>+++ ./include/linux/sched.h 2005-11-07 10:02:23.000000000 +1100
>>@@ -829,6 +829,9 @@ struct task_struct {
>> /* journalling filesystem info */
>> void *journal_info;
>>
>>+/* stacked block device info */
>>+ struct bio *bio_list, **bio_tail;
>>+
>> /* VM state */
>> struct reclaim_state *reclaim_state;
>>
>
>
> More state in the task_strut is a bit sad, but not nearly as sad as deep
> recursion in our deepest codepath..
>
> Possibly one could do:
>
> struct make_request_state {
> struct bio *bio_list;
> struct bio **bio_tail;
> };
>
> and stick a `struct make_request_state *' into the task_struct and actually
> allocate the thing on the stack. That's not much nicer though.
Possibly it could go into struct io_context?
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [dm-devel] Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-08 0:32 ` Nick Piggin
@ 2005-11-08 1:03 ` Neil Brown
2005-11-08 1:15 ` Andrew Morton
2005-11-08 1:37 ` Nick Piggin
0 siblings, 2 replies; 20+ messages in thread
From: Neil Brown @ 2005-11-08 1:03 UTC (permalink / raw)
To: Nick Piggin
Cc: Andrew Morton, dm-devel, heiko.carstens, linux-kernel, aherrman,
bunk, cplk
On Tuesday November 8, nickpiggin@yahoo.com.au wrote:
> Andrew Morton wrote:
> >
> > More state in the task_strut is a bit sad, but not nearly as sad as deep
> > recursion in our deepest codepath..
> >
> > Possibly one could do:
> >
> > struct make_request_state {
> > struct bio *bio_list;
> > struct bio **bio_tail;
> > };
> >
> > and stick a `struct make_request_state *' into the task_struct and actually
> > allocate the thing on the stack. That's not much nicer though.
>
> Possibly it could go into struct io_context?
>
My quick reading of the code says that we could have to
allocate the struct right there in generic_make_request, and I don't
think we can be certain that such an allocation will succeed.
Code that uses io_context can limp along if it doesn't exist.
The new generic_make_request needs this bio_list to be present
or it cannot do it's job.
Just how tight are we for space in task_struct? It seems to have a
fair amount of cruft in it.
Is it getting close to one-page or something?
Can we just split the less interesting stuff up into a separate
structure, allocate a separate page for that are fork time, and leave
just a pointer in the task_struct?
NeilBrown
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [dm-devel] Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-08 1:03 ` Neil Brown
@ 2005-11-08 1:15 ` Andrew Morton
2005-11-08 1:37 ` Nick Piggin
1 sibling, 0 replies; 20+ messages in thread
From: Andrew Morton @ 2005-11-08 1:15 UTC (permalink / raw)
To: Neil Brown
Cc: nickpiggin, dm-devel, heiko.carstens, linux-kernel, aherrman,
bunk, cplk
Neil Brown <neilb@suse.de> wrote:
>
> On Tuesday November 8, nickpiggin@yahoo.com.au wrote:
> > Andrew Morton wrote:
> > >
> > > More state in the task_strut is a bit sad, but not nearly as sad as deep
> > > recursion in our deepest codepath..
> > >
> > > Possibly one could do:
> > >
> > > struct make_request_state {
> > > struct bio *bio_list;
> > > struct bio **bio_tail;
> > > };
> > >
> > > and stick a `struct make_request_state *' into the task_struct and actually
> > > allocate the thing on the stack. That's not much nicer though.
> >
> > Possibly it could go into struct io_context?
> >
>
> My quick reading of the code says that we could have to
> allocate the struct right there in generic_make_request, and I don't
> think we can be certain that such an allocation will succeed.
With this sort of lifecycle it's more appropriat to allocate the struct on
the stack and to put a pointer to it into task_struct.
> Code that uses io_context can limp along if it doesn't exist.
> The new generic_make_request needs this bio_list to be present
> or it cannot do it's job.
>
> Just how tight are we for space in task_struct?
I don't recall anyone getting outraged about it.
> It seems to have a
> fair amount of cruft in it.
yup.
> Is it getting close to one-page or something?
1280 bytes on my x86
> Can we just split the less interesting stuff up into a separate
> structure, allocate a separate page for that are fork time, and leave
> just a pointer in the task_struct?
Something like that, if it becomes a problem.
Probably there are various deporking opportunities in there.
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [dm-devel] Re: [PATCH resubmit] do_mount: reduce stack consumption
2005-11-08 1:03 ` Neil Brown
2005-11-08 1:15 ` Andrew Morton
@ 2005-11-08 1:37 ` Nick Piggin
1 sibling, 0 replies; 20+ messages in thread
From: Nick Piggin @ 2005-11-08 1:37 UTC (permalink / raw)
To: Neil Brown
Cc: Andrew Morton, dm-devel, heiko.carstens, linux-kernel, aherrman,
bunk, cplk
Neil Brown wrote:
> On Tuesday November 8, nickpiggin@yahoo.com.au wrote:
>>Possibly it could go into struct io_context?
>>
>
>
> My quick reading of the code says that we could have to
> allocate the struct right there in generic_make_request, and I don't
> think we can be certain that such an allocation will succeed.
>
> Code that uses io_context can limp along if it doesn't exist.
> The new generic_make_request needs this bio_list to be present
> or it cannot do it's job.
>
You can ask for the io context without having a request. However,
there is nothing like a mempool for them so code really should be
able to limp along without them.
I guess it would be silly to require such an allocation to succeed
here, because the block layer is pretty free of OOM deadlocks.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2005-11-08 1:37 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-11-04 10:50 [PATCH resubmit] do_mount: reduce stack consumption Heiko Carstens
2005-11-04 11:48 ` Al Viro
2005-11-04 12:57 ` Heiko Carstens
2005-11-04 14:06 ` Al Viro
2005-11-04 22:03 ` Heiko Carstens
2005-11-04 16:48 ` Andrew Morton
2005-11-04 21:27 ` Heiko Carstens
2005-11-04 23:55 ` Adrian Bunk
2005-11-05 0:08 ` Andrew Morton
2005-11-05 1:15 ` cplk
2005-11-05 1:37 ` Andrew Morton
2005-11-05 5:37 ` Neil Brown
2005-11-07 0:16 ` [dm-devel] " Neil Brown
2005-11-07 16:15 ` Heiko Carstens
2005-11-07 22:12 ` Neil Brown
2005-11-07 23:37 ` Andrew Morton
2005-11-08 0:32 ` Nick Piggin
2005-11-08 1:03 ` Neil Brown
2005-11-08 1:15 ` Andrew Morton
2005-11-08 1:37 ` Nick Piggin
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox