From: Maneesh Soni <maneesh@in.ibm.com>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Andrew Morton <akpm@digeo.com>,
Al Viro <viro@parcelfarce.linux.theplanet.co.uk>,
Dipankar Sarma <dipankar@in.ibm.com>,
Paul McKenney <Paul.McKenney@us.ibm.com>
Subject: [patch 1/2] vfsmount_lock
Date: Wed, 21 May 2003 14:55:02 +0530 [thread overview]
Message-ID: <20030521092502.GD1198@in.ibm.com> (raw)
While path walking we do follow_mount or follow_down which uses
dcache_lock for serialisation. vfsmount related operations also use
dcache_lock for all updates. I think we can use a separate lock for
vfsmount related work and can improve path walking.
The following two patches does the same. The first one replaces
dcache_lock with new vfsmount_lock in namespace.c. The lock is
local to namespace.c and is not required outside. The second patch
uses RCU to have lock free lookup_mnt(). The patches are quite simple
and straight forward.
The lockmeter reults show reduced contention, and lock acquisitions
for dcache_lock while running dcachebench* on a 4-way SMP box
SPINLOCKS HOLD WAIT
UTIL CON MEAN( MAX ) MEAN( MAX )(% CPU) TOTAL NOWAIT SPIN RJECT NAME
baselkm-2569:
20.7% 20.9% 0.5us( 146us) 2.9us( 144us)(0.81%) 31590840 79.1% 20.9% 0% dcache_lock
mntlkm-2569:
14.3% 13.6% 0.4us( 170us) 2.9us( 187us)(0.42%) 23071746 86.4% 13.6% 0% dcache_lock
We get more than 8% improvement on 4-way SMP and 44% improvement on 16-way
NUMAQ while runing dcachebench*.
Average (usecs/iteration) Std. Deviation
(lower is better)
4-way SMP
2.5.69 15739.3 470.90
2.5.69-mnt 14459.6 298.51
16-way NUMAQ
2.5.69 120426.5 363.78
2.5.69-mnt 63225.8 427.60
*dcachebench is a microbenchmark written by Bill Hartner and is available at
http://www-124.ibm.com/developerworks/opensource/linuxperf/dcachebench/dcachebench.html
vfsmount_lock.patch
-------------------
- Patch for replacing dcache_lock with new vfsmount_lock for all mount
related operation. This removes the need to take dcache_lock while
doing follow_mount or follow_down operations in path walking.
fs/namei.c | 24 +++++++++------------
fs/namespace.c | 64 ++++++++++++++++++++++++++++++++-------------------------
2 files changed, 48 insertions(+), 40 deletions(-)
diff -puN fs/namespace.c~vfsmount_lock fs/namespace.c
--- linux-2.5.69/fs/namespace.c~vfsmount_lock 2003-05-21 10:53:00.000000000 +0530
+++ linux-2.5.69-maneesh/fs/namespace.c 2003-05-21 11:02:47.000000000 +0530
@@ -28,6 +28,8 @@ extern int do_remount_sb(struct super_bl
extern int __init init_rootfs(void);
extern int __init fs_subsys_init(void);
+/* spinlock for vfsmount related operation, inplace of dcache_lock */
+static spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
static struct list_head *mount_hashtable;
static int hash_mask, hash_bits;
static kmem_cache_t *mnt_cache;
@@ -69,30 +71,38 @@ void free_vfsmnt(struct vfsmount *mnt)
kmem_cache_free(mnt_cache, mnt);
}
+/*
+ * Now, lookup_mnt increments the ref count before returning
+ * the vfsmount struct.
+ */
struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
struct list_head * head = mount_hashtable + hash(mnt, dentry);
struct list_head * tmp = head;
- struct vfsmount *p;
+ struct vfsmount *p, *found = NULL;
+ spin_lock(&vfsmount_lock);
for (;;) {
tmp = tmp->next;
p = NULL;
if (tmp == head)
break;
p = list_entry(tmp, struct vfsmount, mnt_hash);
- if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry)
+ if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+ found = mntget(p);
break;
+ }
}
- return p;
+ spin_lock(&vfsmount_lock);
+ return found;
}
static int check_mnt(struct vfsmount *mnt)
{
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
while (mnt->mnt_parent != mnt)
mnt = mnt->mnt_parent;
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
return mnt == current->namespace->root;
}
@@ -273,15 +283,15 @@ void umount_tree(struct vfsmount *mnt)
mnt = list_entry(kill.next, struct vfsmount, mnt_list);
list_del_init(&mnt->mnt_list);
if (mnt->mnt_parent == mnt) {
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
} else {
struct nameidata old_nd;
detach_mnt(mnt, &old_nd);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
path_release(&old_nd);
}
mntput(mnt);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
}
}
@@ -334,17 +344,17 @@ static int do_umount(struct vfsmount *mn
}
down_write(¤t->namespace->sem);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
if (atomic_read(&sb->s_active) == 1) {
/* last instance - try to be smart */
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
lock_kernel();
DQUOT_OFF(sb);
acct_auto_close(sb);
unlock_kernel();
security_sb_umount_close(mnt);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
}
retval = -EBUSY;
if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
@@ -352,7 +362,7 @@ static int do_umount(struct vfsmount *mn
umount_tree(mnt);
retval = 0;
}
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
if (retval)
security_sb_umount_busy(mnt);
up_write(¤t->namespace->sem);
@@ -442,17 +452,17 @@ static struct vfsmount *copy_tree(struct
q = clone_mnt(p, p->mnt_root);
if (!q)
goto Enomem;
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, &nd);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
}
return res;
Enomem:
if (res) {
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
umount_tree(res);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
}
return NULL;
}
@@ -476,7 +486,7 @@ static int graft_tree(struct vfsmount *m
if (err)
goto out_unlock;
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
struct list_head head;
attach_mnt(mnt, nd);
@@ -485,7 +495,7 @@ static int graft_tree(struct vfsmount *m
mntget(mnt);
err = 0;
}
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
out_unlock:
up(&nd->dentry->d_inode->i_sem);
if (!err)
@@ -522,9 +532,9 @@ static int do_loopback(struct nameidata
if (mnt) {
err = graft_tree(mnt, nd);
if (err) {
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
umount_tree(mnt);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
} else
mntput(mnt);
}
@@ -589,7 +599,7 @@ static int do_move_mount(struct nameidat
if (IS_DEADDIR(nd->dentry->d_inode))
goto out1;
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
if (!IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
goto out2;
@@ -613,7 +623,7 @@ static int do_move_mount(struct nameidat
detach_mnt(old_nd.mnt, &parent_nd);
attach_mnt(old_nd.mnt, nd);
out2:
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
out1:
up(&nd->dentry->d_inode->i_sem);
out:
@@ -794,9 +804,9 @@ int copy_namespace(int flags, struct tas
down_write(&tsk->namespace->sem);
/* First pass: copy the tree topology */
new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
/* Second pass: switch the tsk->fs->* elements */
if (fs) {
@@ -1017,7 +1027,7 @@ asmlinkage long sys_pivot_root(const cha
if (new_nd.mnt->mnt_root != new_nd.dentry)
goto out2; /* not a mountpoint */
tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
- spin_lock(&dcache_lock);
+ spin_lock(&vfsmount_lock);
if (tmp != new_nd.mnt) {
for (;;) {
if (tmp->mnt_parent == tmp)
@@ -1034,7 +1044,7 @@ asmlinkage long sys_pivot_root(const cha
detach_mnt(user_nd.mnt, &root_parent);
attach_mnt(user_nd.mnt, &old_nd);
attach_mnt(new_nd.mnt, &root_parent);
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
chroot_fs_refs(&user_nd, &new_nd);
security_sb_post_pivotroot(&user_nd, &new_nd);
error = 0;
@@ -1051,7 +1061,7 @@ out0:
unlock_kernel();
return error;
out3:
- spin_unlock(&dcache_lock);
+ spin_unlock(&vfsmount_lock);
goto out2;
}
diff -puN fs/namei.c~vfsmount_lock fs/namei.c
--- linux-2.5.69/fs/namei.c~vfsmount_lock 2003-05-21 10:53:00.000000000 +0530
+++ linux-2.5.69-maneesh/fs/namei.c 2003-05-21 10:53:00.000000000 +0530
@@ -434,19 +434,17 @@ int follow_up(struct vfsmount **mnt, str
return 1;
}
+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
{
int res = 0;
while (d_mountpoint(*dentry)) {
- struct vfsmount *mounted;
- spin_lock(&dcache_lock);
- mounted = lookup_mnt(*mnt, *dentry);
- if (!mounted) {
- spin_unlock(&dcache_lock);
+ struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+ if (!mounted)
break;
- }
- *mnt = mntget(mounted);
- spin_unlock(&dcache_lock);
+ *mnt = mounted;
dput(*dentry);
mntput(mounted->mnt_parent);
*dentry = dget(mounted->mnt_root);
@@ -455,21 +453,21 @@ static int follow_mount(struct vfsmount
return res;
}
+/* no need for dcache_lock, as serialization is taken care in
+ * namespace.c
+ */
static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
{
struct vfsmount *mounted;
-
- spin_lock(&dcache_lock);
+
mounted = lookup_mnt(*mnt, *dentry);
if (mounted) {
- *mnt = mntget(mounted);
- spin_unlock(&dcache_lock);
+ *mnt = mounted;
dput(*dentry);
mntput(mounted->mnt_parent);
*dentry = dget(mounted->mnt_root);
return 1;
}
- spin_unlock(&dcache_lock);
return 0;
}
_
--
Maneesh Soni
IBM Linux Technology Center,
IBM India Software Lab, Bangalore.
Phone: +91-80-5044999 email: maneesh@in.ibm.com
http://lse.sourceforge.net/
next reply other threads:[~2003-05-21 9:10 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2003-05-21 9:25 Maneesh Soni [this message]
2003-05-21 9:26 ` [patch 2/2] lockfree lookup_mnt Maneesh Soni
2003-05-21 9:35 ` [patch 1/2] vfsmount_lock Andrew Morton
2003-05-21 9:38 ` Paul Rolland
2003-05-21 9:43 ` Maneesh Soni
2003-05-21 9:45 ` [patch 2/2] lockfree lookup_mnt Maneesh Soni
2003-05-21 9:53 ` [patch 1/2] vfsmount_lock Dipankar Sarma
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20030521092502.GD1198@in.ibm.com \
--to=maneesh@in.ibm.com \
--cc=Paul.McKenney@us.ibm.com \
--cc=akpm@digeo.com \
--cc=dipankar@in.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=viro@parcelfarce.linux.theplanet.co.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.