[PATCH 2.5.44] dcache

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH 2.5.44] dcache_rcu
@ 2002-10-30 10:49 Maneesh Soni
  2002-10-31 10:53 ` dcache_rcu [performance results] Dipankar Sarma
  0 siblings, 1 reply; 25+ messages in thread
From: Maneesh Soni @ 2002-10-30 10:49 UTC (permalink / raw)
  To: Al Viro; +Cc: LKML, Andrew Morton, Dipankar Sarma

Hello Viro,

Please consider forwarding the following patch ito Linus for dcache lookup 
using Read Copy Update. The patch has been there in -mm kernel since 
2.5.37-mm1. The patch is stable. A couple of bugs reported are solved. It 
helps a great deal on higher end SMP machines and there is no performance 
regression on UP and lower end SMP machines as seen in Dipankar's kernbench 
numbers.

http://marc.theaimsgroup.com/?l=linux-kernel&m=103462075416638&w=2

Regards,
Maneesh

diff -urN linux-2.5.44-base/drivers/usb/core/inode.c linux-2.5.44-dc12/drivers/usb/core/inode.c
--- linux-2.5.44-base/drivers/usb/core/inode.c	Sat Oct 19 09:31:09 2002
+++ linux-2.5.44-dc12/drivers/usb/core/inode.c	Wed Oct 30 15:20:09 2002
@@ -254,7 +254,7 @@
 		if (atomic_read(&dentry->d_count) != 2)
 			break;
 	case 2:
-		list_del_init(&dentry->d_hash);
+		__d_drop(dentry);
 	}
 	spin_unlock(&dcache_lock);
 }
diff -urN linux-2.5.44-base/fs/autofs4/root.c linux-2.5.44-dc12/fs/autofs4/root.c
--- linux-2.5.44-base/fs/autofs4/root.c	Sat Oct 19 09:31:08 2002
+++ linux-2.5.44-dc12/fs/autofs4/root.c	Wed Oct 30 15:20:09 2002
@@ -418,7 +418,7 @@
 		unlock_kernel();
 		return -ENOTEMPTY;
 	}
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
 	spin_unlock(&dcache_lock);
 
 	dput(ino->dentry);
diff -urN linux-2.5.44-base/fs/dcache.c linux-2.5.44-dc12/fs/dcache.c
--- linux-2.5.44-base/fs/dcache.c	Sat Oct 19 09:31:13 2002
+++ linux-2.5.44-dc12/fs/dcache.c	Wed Oct 30 15:21:17 2002
@@ -23,6 +23,7 @@
 #include <linux/smp_lock.h>
 #include <linux/cache.h>
 #include <linux/module.h>
+#include <linux/rcupdate.h>
 
 #include <asm/uaccess.h>
 
@@ -55,14 +56,21 @@
 	.age_limit = 45,
 };
 
+static void d_callback(void *arg)
+{
+	struct dentry * dentry = (struct dentry *)arg;
+
+	if (dname_external(dentry)) 
+		kfree(dentry->d_name.name);
+	kmem_cache_free(dentry_cache, dentry); 
+}
+
 /* no dcache_lock, please */
 static inline void d_free(struct dentry *dentry)
 {
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
-	if (dname_external(dentry)) 
-		kfree(dentry->d_name.name);
-	kmem_cache_free(dentry_cache, dentry); 
+	call_rcu(&dentry->d_rcu, d_callback, dentry);
 	dentry_stat.nr_dentry--;
 }
 
@@ -124,9 +132,13 @@
 	if (!atomic_dec_and_lock(&dentry->d_count, &dcache_lock))
 		return;
 
-	/* dput on a free dentry? */
-	if (!list_empty(&dentry->d_lru))
-		BUG();
+	spin_lock(&dentry->d_lock);
+	if (atomic_read(&dentry->d_count)) {
+		spin_unlock(&dentry->d_lock);
+		spin_unlock(&dcache_lock);
+		return;
+	}
+			
 	/*
 	 * AV: ->d_delete() is _NOT_ allowed to block now.
 	 */
@@ -135,20 +147,29 @@
 			goto unhash_it;
 	}
 	/* Unreachable? Get rid of it */
-	if (list_empty(&dentry->d_hash))
+ 	if (d_unhashed(dentry))
 		goto kill_it;
-	list_add(&dentry->d_lru, &dentry_unused);
-	dentry_stat.nr_unused++;
-	dentry->d_vfs_flags |= DCACHE_REFERENCED;
+ 	if (list_empty(&dentry->d_lru)) {
+ 		dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
+ 		list_add(&dentry->d_lru, &dentry_unused);
+ 		dentry_stat.nr_unused++;
+ 	}
+	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
 	return;
 
 unhash_it:
-	list_del_init(&dentry->d_hash);
+	dentry->d_vfs_flags |= DCACHE_UNHASHED;
+	list_del_rcu(&dentry->d_hash);
 
 kill_it: {
 		struct dentry *parent;
+ 		if (!list_empty(&dentry->d_lru)) {
+ 			list_del(&dentry->d_lru);
+ 			dentry_stat.nr_unused--;
+ 		}
 		list_del(&dentry->d_child);
+		spin_unlock(&dentry->d_lock);
 		/* drops the lock, at that point nobody can reach this dentry */
 		dentry_iput(dentry);
 		parent = dentry->d_parent;
@@ -178,7 +199,7 @@
 	 * If it's already been dropped, return OK.
 	 */
 	spin_lock(&dcache_lock);
-	if (list_empty(&dentry->d_hash)) {
+	if (d_unhashed(dentry)) {
 		spin_unlock(&dcache_lock);
 		return 0;
 	}
@@ -208,9 +229,9 @@
 			return -EBUSY;
 		}
 	}
-
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
 	spin_unlock(&dcache_lock);
+
 	return 0;
 }
 
@@ -219,6 +240,7 @@
 static inline struct dentry * __dget_locked(struct dentry *dentry)
 {
 	atomic_inc(&dentry->d_count);
+	dentry->d_vfs_flags |= DCACHE_REFERENCED;
 	if (atomic_read(&dentry->d_count) == 1) {
 		dentry_stat.nr_unused--;
 		list_del_init(&dentry->d_lru);
@@ -256,7 +278,7 @@
 		tmp = next;
 		next = tmp->next;
 		alias = list_entry(tmp, struct dentry, d_alias);
-		if (!list_empty(&alias->d_hash)) {
+ 		if (!d_unhashed(alias)) {
 			if (alias->d_flags & DCACHE_DISCONNECTED)
 				discon_alias = alias;
 			else {
@@ -286,8 +308,8 @@
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_alias);
 		if (!atomic_read(&dentry->d_count)) {
 			__dget_locked(dentry);
+			__d_drop(dentry);
 			spin_unlock(&dcache_lock);
-			d_drop(dentry);
 			dput(dentry);
 			goto restart;
 		}
@@ -305,8 +327,9 @@
 {
 	struct dentry * parent;
 
-	list_del_init(&dentry->d_hash);
+	__d_drop(dentry);
 	list_del(&dentry->d_child);
+	spin_unlock(&dentry->d_lock);
 	dentry_iput(dentry);
 	parent = dentry->d_parent;
 	d_free(dentry);
@@ -339,18 +362,20 @@
 		if (tmp == &dentry_unused)
 			break;
 		list_del_init(tmp);
+ 		dentry_stat.nr_unused--;
 		dentry = list_entry(tmp, struct dentry, d_lru);
 
+ 		spin_lock(&dentry->d_lock);
 		/* If the dentry was recently referenced, don't free it. */
 		if (dentry->d_vfs_flags & DCACHE_REFERENCED) {
 			dentry->d_vfs_flags &= ~DCACHE_REFERENCED;
-			list_add(&dentry->d_lru, &dentry_unused);
+ 			if (!atomic_read(&dentry->d_count)) {
+ 				list_add(&dentry->d_lru, &dentry_unused);
+ 				dentry_stat.nr_unused++;
+ 			}
+ 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-		dentry_stat.nr_unused--;
-
-		/* Unused dentry with a count? */
-		BUG_ON(atomic_read(&dentry->d_count));
 		prune_one_dentry(dentry);
 	}
 	spin_unlock(&dcache_lock);
@@ -410,10 +435,13 @@
 		dentry = list_entry(tmp, struct dentry, d_lru);
 		if (dentry->d_sb != sb)
 			continue;
-		if (atomic_read(&dentry->d_count))
-			continue;
 		dentry_stat.nr_unused--;
 		list_del_init(tmp);
+		spin_lock(&dentry->d_lock);
+		if (atomic_read(&dentry->d_count)) {
+			spin_unlock(&dentry->d_lock);
+			continue;
+		}
 		prune_one_dentry(dentry);
 		goto repeat;
 	}
@@ -493,8 +521,8 @@
 		struct list_head *tmp = next;
 		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
 		next = tmp->next;
+		list_del_init(&dentry->d_lru);
 		if (!atomic_read(&dentry->d_count)) {
-			list_del(&dentry->d_lru);
 			list_add(&dentry->d_lru, dentry_unused.prev);
 			found++;
 		}
@@ -557,8 +585,8 @@
 		spin_lock(&dcache_lock);
 		list_for_each(lp, head) {
 			struct dentry *this = list_entry(lp, struct dentry, d_hash);
+			list_del(&this->d_lru);
 			if (!atomic_read(&this->d_count)) {
-				list_del(&this->d_lru);
 				list_add_tail(&this->d_lru, &dentry_unused);
 				found++;
 			}
@@ -626,7 +654,8 @@
 	str[name->len] = 0;
 
 	atomic_set(&dentry->d_count, 1);
-	dentry->d_vfs_flags = 0;
+	dentry->d_vfs_flags = DCACHE_UNHASHED;
+	dentry->d_lock = SPIN_LOCK_UNLOCKED;
 	dentry->d_flags = 0;
 	dentry->d_inode = NULL;
 	dentry->d_parent = NULL;
@@ -759,12 +788,15 @@
 		res = tmp;
 		tmp = NULL;
 		if (res) {
+			spin_lock(&res->d_lock);
 			res->d_sb = inode->i_sb;
 			res->d_parent = res;
 			res->d_inode = inode;
 			res->d_flags |= DCACHE_DISCONNECTED;
+			res->d_vfs_flags &= ~DCACHE_UNHASHED;
 			list_add(&res->d_alias, &inode->i_dentry);
 			list_add(&res->d_hash, &inode->i_sb->s_anon);
+			spin_unlock(&res->d_lock);
 		}
 		inode = NULL; /* don't drop reference */
 	}
@@ -833,30 +865,16 @@
  
 struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 {
-	struct dentry * dentry;
-	spin_lock(&dcache_lock);
-	dentry = __d_lookup(parent,name);
-	if (dentry)
-		__dget_locked(dentry);
-	spin_unlock(&dcache_lock);
-	return dentry;
-}
-
-struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)  
-{
-
 	unsigned int len = name->len;
 	unsigned int hash = name->hash;
 	const unsigned char *str = name->name;
 	struct list_head *head = d_hash(parent,hash);
 	struct list_head *tmp;
+	struct dentry *found = NULL;
 
-	tmp = head->next;
-	for (;;) {
+	rcu_read_lock();
+	__list_for_each_rcu(tmp, head) {
 		struct dentry * dentry = list_entry(tmp, struct dentry, d_hash);
-		if (tmp == head)
-			break;
-		tmp = tmp->next;
 		if (dentry->d_name.hash != hash)
 			continue;
 		if (dentry->d_parent != parent)
@@ -870,9 +888,14 @@
 			if (memcmp(dentry->d_name.name, str, len))
 				continue;
 		}
-		return dentry;
-	}
-	return NULL;
+		spin_lock(&dentry->d_lock);
+		if (!d_unhashed(dentry))
+			found = dget(dentry);
+		spin_unlock(&dentry->d_lock);
+		break;
+ 	}
+ 	rcu_read_unlock();
+ 	return found;
 }
 
 /**
@@ -911,7 +934,7 @@
 	lhp = base = d_hash(dparent, dentry->d_name.hash);
 	while ((lhp = lhp->next) != base) {
 		if (dentry == list_entry(lhp, struct dentry, d_hash)) {
-			__dget_locked(dentry);
+			dget(dentry);
 			spin_unlock(&dcache_lock);
 			return 1;
 		}
@@ -948,17 +971,18 @@
 	 * Are we the only user?
 	 */
 	spin_lock(&dcache_lock);
+	spin_lock(&dentry->d_lock);
 	if (atomic_read(&dentry->d_count) == 1) {
+		spin_unlock(&dentry->d_lock);
 		dentry_iput(dentry);
 		return;
 	}
-	spin_unlock(&dcache_lock);
 
-	/*
-	 * If not, just drop the dentry and let dput
-	 * pick up the tab..
-	 */
-	d_drop(dentry);
+	if (!d_unhashed(dentry))
+		__d_drop(dentry);
+
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dcache_lock);
 }
 
 /**
@@ -971,9 +995,10 @@
 void d_rehash(struct dentry * entry)
 {
 	struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
-	if (!list_empty(&entry->d_hash)) BUG();
 	spin_lock(&dcache_lock);
-	list_add(&entry->d_hash, list);
+	if (!list_empty(&entry->d_hash) && !d_unhashed(entry)) BUG();
+	entry->d_vfs_flags &= ~DCACHE_UNHASHED;
+	list_add_rcu(&entry->d_hash, list);
 	spin_unlock(&dcache_lock);
 }
 
@@ -1043,7 +1068,7 @@
 	list_add(&dentry->d_hash, &target->d_hash);
 
 	/* Unhash the target: dput() will then get rid of it */
-	list_del_init(&target->d_hash);
+	__d_drop(target);
 
 	list_del(&dentry->d_child);
 	list_del(&target->d_child);
@@ -1095,7 +1120,7 @@
 
 	*--end = '\0';
 	buflen--;
-	if (!IS_ROOT(dentry) && list_empty(&dentry->d_hash)) {
+	if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
 		buflen -= 10;
 		end -= 10;
 		memcpy(end, " (deleted)", 10);
@@ -1178,7 +1203,7 @@
 	error = -ENOENT;
 	/* Has the current directory has been unlinked? */
 	spin_lock(&dcache_lock);
-	if (pwd->d_parent == pwd || !list_empty(&pwd->d_hash)) {
+	if (pwd->d_parent == pwd || !d_unhashed(pwd)) {
 		unsigned long len;
 		char * cwd;
 
diff -urN linux-2.5.44-base/fs/exec.c linux-2.5.44-dc12/fs/exec.c
--- linux-2.5.44-base/fs/exec.c	Sat Oct 19 09:31:48 2002
+++ linux-2.5.44-dc12/fs/exec.c	Wed Oct 30 15:20:09 2002
@@ -502,9 +502,9 @@
 
 	if (proc_dentry) {
 		spin_lock(&dcache_lock);
-		if (!list_empty(&proc_dentry->d_hash)) {
+		if (!d_unhashed(proc_dentry)) {
 			dget_locked(proc_dentry);
-			list_del_init(&proc_dentry->d_hash);
+			__d_drop(proc_dentry);
 		} else
 			proc_dentry = NULL;
 		spin_unlock(&dcache_lock);
diff -urN linux-2.5.44-base/fs/intermezzo/journal.c linux-2.5.44-dc12/fs/intermezzo/journal.c
--- linux-2.5.44-base/fs/intermezzo/journal.c	Sat Oct 19 09:31:08 2002
+++ linux-2.5.44-dc12/fs/intermezzo/journal.c	Wed Oct 30 15:20:09 2002
@@ -261,7 +261,7 @@
 
         *--end = '\0';
         buflen--;
-        if (dentry->d_parent != dentry && list_empty(&dentry->d_hash)) {
+        if (dentry->d_parent != dentry && d_unhashed(dentry)) {
                 buflen -= 10;
                 end -= 10;
                 memcpy(end, " (deleted)", 10);
@@ -1518,7 +1518,7 @@
         }
 
         if (!dentry->d_inode || (dentry->d_inode->i_nlink == 0) 
-            || ((dentry->d_parent != dentry) && list_empty(&dentry->d_hash))) {
+            || ((dentry->d_parent != dentry) && d_unhashed(dentry))) {
                 EXIT;
                 return 0;
         }
@@ -2129,7 +2129,7 @@
         }
 
         if (!dentry->d_inode || (dentry->d_inode->i_nlink == 0) 
-            || ((dentry->d_parent != dentry) && list_empty(&dentry->d_hash))) {
+            || ((dentry->d_parent != dentry) && d_unhashed(dentry))) {
                 EXIT;
                 return 0;
         }
@@ -2391,7 +2391,7 @@
         }
 
         if (!dentry->d_inode || (dentry->d_inode->i_nlink == 0) 
-            || ((dentry->d_parent != dentry) && list_empty(&dentry->d_hash))) {
+            || ((dentry->d_parent != dentry) && d_unhashed(dentry))) {
                 EXIT;
                 return 0;
         }
diff -urN linux-2.5.44-base/fs/libfs.c linux-2.5.44-dc12/fs/libfs.c
--- linux-2.5.44-base/fs/libfs.c	Sat Oct 19 09:31:07 2002
+++ linux-2.5.44-dc12/fs/libfs.c	Wed Oct 30 15:20:09 2002
@@ -70,7 +70,7 @@
 			while (n && p != &file->f_dentry->d_subdirs) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_child);
-				if (!list_empty(&next->d_hash) && next->d_inode)
+				if (!d_unhashed(next) && next->d_inode)
 					n--;
 				p = p->next;
 			}
@@ -127,7 +127,7 @@
 			for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
 				struct dentry *next;
 				next = list_entry(p, struct dentry, d_child);
-				if (list_empty(&next->d_hash) || !next->d_inode)
+				if (d_unhashed(next) || !next->d_inode)
 					continue;
 
 				spin_unlock(&dcache_lock);
diff -urN linux-2.5.44-base/fs/namei.c linux-2.5.44-dc12/fs/namei.c
--- linux-2.5.44-base/fs/namei.c	Sat Oct 19 09:31:22 2002
+++ linux-2.5.44-dc12/fs/namei.c	Wed Oct 30 15:20:09 2002
@@ -286,27 +286,6 @@
 	return dentry;
 }
 
-/*for fastwalking*/
-static inline void unlock_nd(struct nameidata *nd)
-{
-	struct vfsmount *mnt = nd->old_mnt;
-	struct dentry *dentry = nd->old_dentry;
-	mntget(nd->mnt);
-	dget_locked(nd->dentry);
-	nd->old_mnt = NULL;
-	nd->old_dentry = NULL;
-	spin_unlock(&dcache_lock);
-	dput(dentry);
-	mntput(mnt);
-}
-
-static inline void lock_nd(struct nameidata *nd)
-{
-	spin_lock(&dcache_lock);
-	nd->old_mnt = nd->mnt;
-	nd->old_dentry = nd->dentry;
-}
-
 /*
  * Short-cut version of permission(), for calling by
  * path_walk(), when dcache lock is held.  Combines parts
@@ -451,11 +430,18 @@
 {
 	int res = 0;
 	while (d_mountpoint(*dentry)) {
-		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
-		if (!mounted)
+		struct vfsmount *mounted;
+		spin_lock(&dcache_lock);
+		mounted = lookup_mnt(*mnt, *dentry);
+		if (!mounted) {
+			spin_unlock(&dcache_lock);
 			break;
-		*mnt = mounted;
-		*dentry = mounted->mnt_root;
+		}
+		*mnt = mntget(mounted);
+		spin_unlock(&dcache_lock);
+		dput(*dentry);
+		mntput(mounted->mnt_parent);
+		*dentry = dget(mounted->mnt_root);
 		res = 1;
 	}
 	return res;
@@ -488,17 +474,32 @@
 {
 	while(1) {
 		struct vfsmount *parent;
+		struct dentry *old = *dentry;
+
+                read_lock(&current->fs->lock);
 		if (*dentry == current->fs->root &&
-		    *mnt == current->fs->rootmnt)
+		    *mnt == current->fs->rootmnt) {
+                        read_unlock(&current->fs->lock);
 			break;
+		}
+                read_unlock(&current->fs->lock);
+		spin_lock(&dcache_lock);
 		if (*dentry != (*mnt)->mnt_root) {
-			*dentry = (*dentry)->d_parent;
+			*dentry = dget((*dentry)->d_parent);
+			spin_unlock(&dcache_lock);
+			dput(old);
 			break;
 		}
-		parent=(*mnt)->mnt_parent;
-		if (parent == *mnt)
+		parent = (*mnt)->mnt_parent;
+		if (parent == *mnt) {
+			spin_unlock(&dcache_lock);
 			break;
-		*dentry=(*mnt)->mnt_mountpoint;
+		}
+		mntget(parent);
+		*dentry = dget((*mnt)->mnt_mountpoint);
+		spin_unlock(&dcache_lock);
+		dput(old);
+		mntput(*mnt);
 		*mnt = parent;
 	}
 	follow_mount(mnt, dentry);
@@ -515,14 +516,13 @@
  *  It _is_ time-critical.
  */
 static int do_lookup(struct nameidata *nd, struct qstr *name,
-		     struct path *path, struct path *cached_path,
-		     int flags)
+		     struct path *path, int flags)
 {
 	struct vfsmount *mnt = nd->mnt;
-	struct dentry *dentry = __d_lookup(nd->dentry, name);
+	struct dentry *dentry = d_lookup(nd->dentry, name);
 
 	if (!dentry)
-		goto dcache_miss;
+		goto need_lookup;
 	if (dentry->d_op && dentry->d_op->d_revalidate)
 		goto need_revalidate;
 done:
@@ -530,36 +530,21 @@
 	path->dentry = dentry;
 	return 0;
 
-dcache_miss:
-	unlock_nd(nd);
-
 need_lookup:
 	dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE);
 	if (IS_ERR(dentry))
 		goto fail;
-	mntget(mnt);
-relock:
-	dput(cached_path->dentry);
-	mntput(cached_path->mnt);
-	cached_path->mnt = mnt;
-	cached_path->dentry = dentry;
-	lock_nd(nd);
 	goto done;
 
 need_revalidate:
-	mntget(mnt);
-	dget_locked(dentry);
-	unlock_nd(nd);
 	if (dentry->d_op->d_revalidate(dentry, flags))
-		goto relock;
+		goto done;
 	if (d_invalidate(dentry))
-		goto relock;
+		goto done;
 	dput(dentry);
-	mntput(mnt);
 	goto need_lookup;
 
 fail:
-	lock_nd(nd);
 	return PTR_ERR(dentry);
 }
 
@@ -573,7 +558,7 @@
  */
 int link_path_walk(const char * name, struct nameidata *nd)
 {
-	struct path next, pinned = {NULL, NULL};
+	struct path next;
 	struct inode *inode;
 	int err;
 	unsigned int lookup_flags = nd->flags;
@@ -594,10 +579,8 @@
 		unsigned int c;
 
 		err = exec_permission_lite(inode);
-		if (err == -EAGAIN) {
-			unlock_nd(nd);
+		if (err == -EAGAIN) { 
 			err = permission(inode, MAY_EXEC);
-			lock_nd(nd);
 		}
  		if (err)
 			break;
@@ -648,7 +631,7 @@
 				break;
 		}
 		/* This does the actual lookups.. */
-		err = do_lookup(nd, &this, &next, &pinned, LOOKUP_CONTINUE);
+		err = do_lookup(nd, &this, &next, LOOKUP_CONTINUE);
 		if (err)
 			break;
 		/* Check mountpoints.. */
@@ -657,21 +640,16 @@
 		err = -ENOENT;
 		inode = next.dentry->d_inode;
 		if (!inode)
-			break;
+			goto out_dput;
 		err = -ENOTDIR; 
 		if (!inode->i_op)
-			break;
+			goto out_dput;
 
 		if (inode->i_op->follow_link) {
-			mntget(next.mnt);
-			dget_locked(next.dentry);
-			unlock_nd(nd);
 			err = do_follow_link(next.dentry, nd);
 			dput(next.dentry);
-			mntput(next.mnt);
 			if (err)
 				goto return_err;
-			lock_nd(nd);
 			err = -ENOENT;
 			inode = nd->dentry->d_inode;
 			if (!inode)
@@ -680,6 +658,7 @@
 			if (!inode->i_op)
 				break;
 		} else {
+			dput(nd->dentry);
 			nd->mnt = next.mnt;
 			nd->dentry = next.dentry;
 		}
@@ -711,24 +690,20 @@
 			if (err < 0)
 				break;
 		}
-		err = do_lookup(nd, &this, &next, &pinned, 0);
+		err = do_lookup(nd, &this, &next, 0);
 		if (err)
 			break;
 		follow_mount(&next.mnt, &next.dentry);
 		inode = next.dentry->d_inode;
 		if ((lookup_flags & LOOKUP_FOLLOW)
 		    && inode && inode->i_op && inode->i_op->follow_link) {
-			mntget(next.mnt);
-			dget_locked(next.dentry);
-			unlock_nd(nd);
 			err = do_follow_link(next.dentry, nd);
 			dput(next.dentry);
-			mntput(next.mnt);
 			if (err)
 				goto return_err;
 			inode = nd->dentry->d_inode;
-			lock_nd(nd);
 		} else {
+			dput(nd->dentry);
 			nd->mnt = next.mnt;
 			nd->dentry = next.dentry;
 		}
@@ -751,23 +726,19 @@
 		else if (this.len == 2 && this.name[1] == '.')
 			nd->last_type = LAST_DOTDOT;
 return_base:
-		unlock_nd(nd);
-		dput(pinned.dentry);
-		mntput(pinned.mnt);
 		return 0;
+out_dput:
+		dput(next.dentry);
+		break;
 	}
-	unlock_nd(nd);
 	path_release(nd);
 return_err:
-	dput(pinned.dentry);
-	mntput(pinned.mnt);
 	return err;
 }
 
 int path_walk(const char * name, struct nameidata *nd)
 {
 	current->total_link_count = 0;
-	lock_nd(nd);
 	return link_path_walk(name, nd);
 }
 
@@ -855,8 +826,9 @@
 {
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
+
+	read_lock(&current->fs->lock);
 	if (*name=='/') {
-		read_lock(&current->fs->lock);
 		if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
 			nd->mnt = mntget(current->fs->altrootmnt);
 			nd->dentry = dget(current->fs->altroot);
@@ -865,18 +837,14 @@
 				return 0;
 			read_lock(&current->fs->lock);
 		}
-		read_unlock(&current->fs->lock);
-		spin_lock(&dcache_lock);
-		nd->mnt = current->fs->rootmnt;
-		nd->dentry = current->fs->root;
+		nd->mnt = mntget(current->fs->rootmnt);
+		nd->dentry = dget(current->fs->root);
 	}
 	else{
-		spin_lock(&dcache_lock);
-		nd->mnt = current->fs->pwdmnt;
-		nd->dentry = current->fs->pwd;
+		nd->mnt = mntget(current->fs->pwdmnt);
+		nd->dentry = dget(current->fs->pwd);
 	}
-	nd->old_mnt = NULL;
-	nd->old_dentry = NULL;
+	read_unlock(&current->fs->lock);
 	current->total_link_count = 0;
 	return link_path_walk(name, nd);
 }
@@ -1548,7 +1516,7 @@
 		if (atomic_read(&dentry->d_count) != 2)
 			break;
 	case 2:
-		list_del_init(&dentry->d_hash);
+		__d_drop(dentry);
 	}
 	spin_unlock(&dcache_lock);
 }
@@ -2115,7 +2083,6 @@
 			/* weird __emul_prefix() stuff did it */
 			goto out;
 	}
-	lock_nd(nd);
 	res = link_path_walk(link, nd);
 out:
 	if (current->link_count || res || nd->last_type!=LAST_NORM)
diff -urN linux-2.5.44-base/fs/nfs/dir.c linux-2.5.44-dc12/fs/nfs/dir.c
--- linux-2.5.44-base/fs/nfs/dir.c	Sat Oct 19 09:31:53 2002
+++ linux-2.5.44-dc12/fs/nfs/dir.c	Wed Oct 30 15:20:09 2002
@@ -1001,7 +1001,7 @@
 		return error;
 	}
 	if (!d_unhashed(dentry)) {
-		list_del_init(&dentry->d_hash);
+		__d_drop(dentry);
 		need_rehash = 1;
 	}
 	spin_unlock(&dcache_lock);
diff -urN linux-2.5.44-base/include/linux/dcache.h linux-2.5.44-dc12/include/linux/dcache.h
--- linux-2.5.44-base/include/linux/dcache.h	Sat Oct 19 09:32:34 2002
+++ linux-2.5.44-dc12/include/linux/dcache.h	Wed Oct 30 15:20:09 2002
@@ -7,6 +7,7 @@
 #include <linux/mount.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 #include <asm/page.h>			/* for BUG() */
 
 /*
@@ -70,11 +71,13 @@
  
 struct dentry {
 	atomic_t d_count;
+	unsigned long d_vfs_flags;	/* moved here to be on same cacheline */
+	spinlock_t d_lock;		/* per dentry lock */
 	unsigned int d_flags;
 	struct inode  * d_inode;	/* Where the name belongs to - NULL is negative */
 	struct dentry * d_parent;	/* parent directory */
 	struct list_head d_hash;	/* lookup hash list */
-	struct list_head d_lru;		/* d_count = 0 LRU list */
+	struct list_head d_lru;		/* LRU list */
 	struct list_head d_child;	/* child of parent list */
 	struct list_head d_subdirs;	/* our children */
 	struct list_head d_alias;	/* inode alias list */
@@ -83,8 +86,8 @@
 	unsigned long d_time;		/* used by d_revalidate */
 	struct dentry_operations  *d_op;
 	struct super_block * d_sb;	/* The root of the dentry tree */
-	unsigned long d_vfs_flags;
 	void * d_fsdata;		/* fs-specific data */
+	struct rcu_head d_rcu;
 	unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
 	struct dcookie_struct * d_cookie; /* cookie, if any */
 };
@@ -135,6 +138,7 @@
       */
 
 #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
+#define DCACHE_UNHASHED		0x0010	
 
 extern spinlock_t dcache_lock;
 extern rwlock_t dparent_lock;
@@ -156,10 +160,16 @@
  * timeouts or autofs deletes).
  */
 
+static __inline__ void __d_drop(struct dentry * dentry)
+{
+	dentry->d_vfs_flags |= DCACHE_UNHASHED;
+	list_del_rcu(&dentry->d_hash);
+}
+
 static __inline__ void d_drop(struct dentry * dentry)
 {
 	spin_lock(&dcache_lock);
-	list_del_init(&dentry->d_hash);
+ 	__d_drop(dentry);
 	spin_unlock(&dcache_lock);
 }
 
@@ -246,9 +256,8 @@
 static __inline__ struct dentry * dget(struct dentry *dentry)
 {
 	if (dentry) {
-		if (!atomic_read(&dentry->d_count))
-			BUG();
 		atomic_inc(&dentry->d_count);
+		dentry->d_vfs_flags |= DCACHE_REFERENCED;
 	}
 	return dentry;
 }
@@ -264,7 +273,7 @@
  
 static __inline__ int d_unhashed(struct dentry *dentry)
 {
-	return list_empty(&dentry->d_hash);
+	return (dentry->d_vfs_flags & DCACHE_UNHASHED);
 }
 
 extern void dput(struct dentry *);
diff -urN linux-2.5.44-base/include/linux/fs_struct.h linux-2.5.44-dc12/include/linux/fs_struct.h
--- linux-2.5.44-base/include/linux/fs_struct.h	Sat Oct 19 09:32:26 2002
+++ linux-2.5.44-dc12/include/linux/fs_struct.h	Wed Oct 30 15:20:09 2002
@@ -35,12 +35,10 @@
 	struct dentry *old_root;
 	struct vfsmount *old_rootmnt;
 	write_lock(&fs->lock);
-	spin_lock(&dcache_lock);
 	old_root = fs->root;
 	old_rootmnt = fs->rootmnt;
 	fs->rootmnt = mntget(mnt);
 	fs->root = dget(dentry);
-	spin_unlock(&dcache_lock);
 	write_unlock(&fs->lock);
 	if (old_root) {
 		dput(old_root);
@@ -60,12 +58,10 @@
 	struct dentry *old_pwd;
 	struct vfsmount *old_pwdmnt;
 	write_lock(&fs->lock);
-	spin_lock(&dcache_lock);
 	old_pwd = fs->pwd;
 	old_pwdmnt = fs->pwdmnt;
 	fs->pwdmnt = mntget(mnt);
 	fs->pwd = dget(dentry);
-	spin_unlock(&dcache_lock);
 	write_unlock(&fs->lock);
 	if (old_pwd) {
 		dput(old_pwd);
diff -urN linux-2.5.44-base/include/linux/namei.h linux-2.5.44-dc12/include/linux/namei.h
--- linux-2.5.44-base/include/linux/namei.h	Sat Oct 19 09:31:21 2002
+++ linux-2.5.44-dc12/include/linux/namei.h	Wed Oct 30 15:20:09 2002
@@ -11,8 +11,6 @@
 	struct qstr	last;
 	unsigned int	flags;
 	int		last_type;
-	struct dentry	*old_dentry;
-	struct vfsmount	*old_mnt;
 };
 
 /*
diff -urN linux-2.5.44-base/kernel/exit.c linux-2.5.44-dc12/kernel/exit.c
--- linux-2.5.44-base/kernel/exit.c	Sat Oct 19 09:32:29 2002
+++ linux-2.5.44-dc12/kernel/exit.c	Wed Oct 30 15:20:09 2002
@@ -46,9 +46,9 @@
 	proc_dentry = p->proc_dentry;
 	if (unlikely(proc_dentry != NULL)) {
 		spin_lock(&dcache_lock);
-		if (!list_empty(&proc_dentry->d_hash)) {
+		if (!d_unhashed(proc_dentry)) {
 			dget_locked(proc_dentry);
-			list_del_init(&proc_dentry->d_hash);
+			__d_drop(proc_dentry);
 		} else
 			proc_dentry = NULL;
 		spin_unlock(&dcache_lock);


-- 
Maneesh Soni
IBM Linux Technology Center, 
IBM India Software Lab, Bangalore.
Phone: +91-80-5044999 email: maneesh@in.ibm.com
http://lse.sourceforge.net/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-10-30 10:49 [PATCH 2.5.44] dcache_rcu Maneesh Soni
@ 2002-10-31 10:53 ` Dipankar Sarma
  2002-11-02  1:36   ` Andrew Morton
  0 siblings, 1 reply; 25+ messages in thread
From: Dipankar Sarma @ 2002-10-31 10:53 UTC (permalink / raw)
  To: Maneesh Soni; +Cc: Al Viro, LKML, Andrew Morton, Anton Blanchard, Paul McKenney

On Wed, Oct 30, 2002 at 04:19:12PM +0530, Maneesh Soni wrote:
> Hello Viro,
> 
> Please consider forwarding the following patch ito Linus for dcache lookup 
> using Read Copy Update. The patch has been there in -mm kernel since 
> 2.5.37-mm1. The patch is stable. A couple of bugs reported are solved. It 
> helps a great deal on higher end SMP machines and there is no performance 
> regression on UP and lower end SMP machines as seen in Dipankar's kernbench 
> numbers.
> 
> http://marc.theaimsgroup.com/?l=linux-kernel&m=103462075416638&w=2
> 

Anton (Blanchard) did some benchmarking with this
in a 24-way ppc64 box and the results showed why we need this patch.
Here are some performace comparisons based on a multi-user benchmark 
that Anton ran with vanilla 2.5.40 and 2.5.40-mm. 

http://lse.sourceforge.net/locking/dcache/summary.png

base = 2.5.40
base-nops = 2.5.40 but ps command in benchmark scripts commented out
mm = 2.5.40-mm
mm-nops = 2.5.40-mm but ps command in benchmark scripts commented out

Here is a profile output snippet of base and mm runs at 200 scripts -

base :

Hits Percentage Function
------------------------
75185 100.00 total
11215 14.92 path_lookup
8578 11.41 atomic_dec_and_lock
5763 7.67 do_lookup
5745 7.64 proc_pid_readlink
4344 5.78 page_remove_rmap
2144 2.85 page_add_rmap
1587 2.11 link_path_walk
1531 2.04 proc_check_root
1461 1.94 save_remaining_regs
1345 1.79 inode_change_ok
1236 1.64 ext2_free_blocks
1215 1.62 ext2_new_block
1067 1.42 d_lookup
1053 1.40 number
907 1.21 release_pages


mm :

Hits Percentage Function
62369 100.00 total
5802 9.30 page_remove_rmap 
4092 6.56 atomic_dec_and_lock
3887 6.23 proc_pid_readlink
3207 5.14 follow_mount
2979 4.78 page_add_rmap
2066 3.31 save_remaining_regs 
1856 2.98 d_lookup
1629 2.61 number
1235 1.98 release_pages
1168 1.87 pSeries_flush_hash_range
1154 1.85 do_page_fault
1026 1.65 copy_page
1009 1.62 path_lookup


Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-10-31 10:53 ` dcache_rcu [performance results] Dipankar Sarma
@ 2002-11-02  1:36   ` Andrew Morton
  2002-11-02  9:13     ` Dipankar Sarma
  0 siblings, 1 reply; 25+ messages in thread
From: Andrew Morton @ 2002-11-02  1:36 UTC (permalink / raw)
  To: dipankar; +Cc: Maneesh Soni, Al Viro, LKML, Anton Blanchard, Paul McKenney

Dipankar Sarma wrote:
> 
> [ dcache-rcu ]
> 
> Anton (Blanchard) did some benchmarking with this
> in a 24-way ppc64 box and the results showed why we need this patch.
> Here are some performace comparisons based on a multi-user benchmark
> that Anton ran with vanilla 2.5.40 and 2.5.40-mm.
> 
> http://lse.sourceforge.net/locking/dcache/summary.png
> 
> base = 2.5.40
> base-nops = 2.5.40 but ps command in benchmark scripts commented out
> mm = 2.5.40-mm
> mm-nops = 2.5.40-mm but ps command in benchmark scripts commented out
> 

I'm going to need some help understanding what's going on in
there.  I assume the test is SDET (there, I said it), which
simulates lots of developers doing developer things on a multiuser
machine.  Lots of compiling, groffing, etc.

Why does the removal of `ps' from the test script make such a huge
difference?  That's silly, and we should fix it.

And it appears that dcache-rcu made a ~10% difference on a 24-way PPC64,
yes?  That is nice, and perhaps we should take that, but it is not a
tremendous speedup.

Thanks.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-11-02  1:36   ` Andrew Morton
@ 2002-11-02  9:13     ` Dipankar Sarma
  2002-11-04 17:29       ` Martin J. Bligh
  0 siblings, 1 reply; 25+ messages in thread
From: Dipankar Sarma @ 2002-11-02  9:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: dipankar, Maneesh Soni, Al Viro, LKML, Anton Blanchard,
	Paul McKenney

On Fri, Nov 01, 2002 at 05:36:03PM -0800, Andrew Morton wrote:
> Dipankar Sarma wrote:
> > [ dcache-rcu ]
> > Anton (Blanchard) did some benchmarking with this
> > in a 24-way ppc64 box and the results showed why we need this patch.
> > Here are some performace comparisons based on a multi-user benchmark
> > that Anton ran with vanilla 2.5.40 and 2.5.40-mm.
> > 
> > http://lse.sourceforge.net/locking/dcache/summary.png
> > 
> simulates lots of developers doing developer things on a multiuser
> machine.  Lots of compiling, groffing, etc.
> 
> Why does the removal of `ps' from the test script make such a huge
> difference?  That's silly, and we should fix it.

I have uploaded the profiles from Anton's benchmark runs -

http://lse.sourceforge.net/locking/dcache/results/2.5.40/200-base.html
http://lse.sourceforge.net/locking/dcache/results/2.5.40/200-base-nops.html
http://lse.sourceforge.net/locking/dcache/results/2.5.40/200-mm.html
http://lse.sourceforge.net/locking/dcache/results/2.5.40/200-mm-nops.html

A quick comparison of base and base-nops profiles show this -

base :

Hits Percentage Function
75185 100.00 total
11215 14.92 path_lookup <1.html>
8578 11.41 atomic_dec_and_lock <2.html>
5763 7.67 do_lookup <3.html>
5745 7.64 proc_pid_readlink <4.html>
4344 5.78 page_remove_rmap <5.html>
2144 2.85 page_add_rmap <6.html>
1587 2.11 link_path_walk <7.html>
1531 2.04 proc_check_root <8.html>
1461 1.94 save_remaining_regs <9.html>
1345 1.79 inode_change_ok <10.html>
1236 1.64 ext2_free_blocks <11.html>
1215 1.62 ext2_new_block <12.html>
1067 1.42 d_lookup <13.html>

base-no-ps :

Hits Percentage Function
50895 100.00 total
8222 16.15 page_remove_rmap <1.html>
3837 7.54 page_add_rmap <2.html>
2222 4.37 save_remaining_regs <3.html>
1618 3.18 release_pages <4.html>
1533 3.01 pSeries_flush_hash_range <5.html>
1446 2.84 do_page_fault <6.html>
1343 2.64 find_get_page <7.html>
1273 2.50 copy_page <8.html>
1228 2.41 copy_page_range <9.html>
1186 2.33 path_lookup <10.html>
1186 2.33 pSeries_insert_hpte <11.html>
1171 2.30 atomic_dec_and_lock <12.html>
1152 2.26 zap_pte_range <13.html>
841 1.65 do_generic_file_read <14.html>

Clearly dcache_lock is the killer when 'ps' command is used in
the benchmark. My guess (without looking at 'ps' code) is that
it has to open/close a lot of files in /proc and that increases
the number of acquisitions of dcache_lock. Increased # of acquisition
add to cache line bouncing and contention.

I should add that this is a general trend we see in all workloads
that do a lot of open/closes and so much so that performance is very
sensitive to how close to / your application's working directory
is. You would get much better system time if you compile a kernel
in /linux as compared to say /home/fs01/users/akpm/kernel/linux ;-)

> And it appears that dcache-rcu made a ~10% difference on a 24-way PPC64,
> yes?  That is nice, and perhaps we should take that, but it is not a
> tremendous speedup.

Hmm.. based on Anton's graph it looked more like ~25% difference for
60 or more scripts. At 200 scripts it is ~27.6%. Without the ps
command, it seems more like ~4%.


Thanks
Dipankar

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
       [not found]     ` <20021102144306.A6736@dikhow.suse.lists.linux.kernel>
@ 2002-11-02 10:08       ` Andi Kleen
  2002-11-02 10:54         ` Dipankar Sarma
  0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2002-11-02 10:08 UTC (permalink / raw)
  To: Dipankar Sarma; +Cc: linux-kernel

Dipankar Sarma <woofwoof@hathway.com> writes:
> 
> I should add that this is a general trend we see in all workloads
> that do a lot of open/closes and so much so that performance is very
> sensitive to how close to / your application's working directory
> is. You would get much better system time if you compile a kernel
> in /linux as compared to say /home/fs01/users/akpm/kernel/linux ;-)

That's interesting. Perhaps it would make sense to have a fast path
that just does a string match of the to be looked up path to a cached copy 
of cwd and if it matches works as if cwd was the root. Would need to be 
careful with chroot where cwd could be outside the root and clear the
cached copy in this case. Then you could avoid all the locking overhead
for directories above your cwd if you stay in there.

-Andi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-11-02 10:08       ` dcache_rcu [performance results] Andi Kleen
@ 2002-11-02 10:54         ` Dipankar Sarma
  2002-11-02 11:01           ` Andi Kleen
  0 siblings, 1 reply; 25+ messages in thread
From: Dipankar Sarma @ 2002-11-02 10:54 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel

On Sat, Nov 02, 2002 at 11:08:44AM +0100, Andi Kleen wrote:
> Dipankar Sarma <woofwoof@hathway.com> writes:
> > 
> > I should add that this is a general trend we see in all workloads
> > that do a lot of open/closes and so much so that performance is very
> > sensitive to how close to / your application's working directory
> > is. You would get much better system time if you compile a kernel
> > in /linux as compared to say /home/fs01/users/akpm/kernel/linux ;-)
> 
> That's interesting. Perhaps it would make sense to have a fast path
> that just does a string match of the to be looked up path to a cached copy 
> of cwd and if it matches works as if cwd was the root. Would need to be 
> careful with chroot where cwd could be outside the root and clear the
> cached copy in this case. Then you could avoid all the locking overhead
> for directories above your cwd if you stay in there.

Well, on second thoughts I can't see why the path length for pwd
would make difference for kernel compilation - it uses relative
path and for path lookup, if the first character is not '/', then
lookup is done relative to current->fs->pwd. I will do some more
benchmarking on and verify.

I did get inputs from Troy Wilson who does specweb measurements
that the path name length of the location of the served files
make a difference. I presume his webserver setup used full path names.

Thanks
Dipankar

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-11-02 10:54         ` Dipankar Sarma
@ 2002-11-02 11:01           ` Andi Kleen
  2002-11-02 19:41             ` Linus Torvalds
  0 siblings, 1 reply; 25+ messages in thread
From: Andi Kleen @ 2002-11-02 11:01 UTC (permalink / raw)
  To: Dipankar Sarma; +Cc: Andi Kleen, linux-kernel

> Well, on second thoughts I can't see why the path length for pwd
> would make difference for kernel compilation - it uses relative
> path and for path lookup, if the first character is not '/', then
> lookup is done relative to current->fs->pwd. I will do some more
> benchmarking on and verify.

Kernel compilation actually uses absolute pathnames e.g. for dependency
checking. TOPDIR is also specified absolutely, so an include access likely
uses an absolute pathname too.

-Andi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-11-02 11:01           ` Andi Kleen
@ 2002-11-02 19:41             ` Linus Torvalds
  2002-11-02 21:16               ` Sam Ravnborg
  0 siblings, 1 reply; 25+ messages in thread
From: Linus Torvalds @ 2002-11-02 19:41 UTC (permalink / raw)
  To: linux-kernel

In article <20021102120155.A17591@wotan.suse.de>,
Andi Kleen  <ak@suse.de> wrote:
>
>Kernel compilation actually uses absolute pathnames e.g. for dependency
>checking.

This used to be true, but it shouldn't be true any more. TOPDIR should
be gone, and everything should be relative paths (and all "make"
invocations should just be done from the top kernel directory).

But yes, it certainly _used_ to be true (and hey, maybe I've missed some
reason for why it isn't still true).

		Linus

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-11-02 19:41             ` Linus Torvalds
@ 2002-11-02 21:16               ` Sam Ravnborg
  0 siblings, 0 replies; 25+ messages in thread
From: Sam Ravnborg @ 2002-11-02 21:16 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

On Sat, Nov 02, 2002 at 07:41:34PM +0000, Linus Torvalds wrote:
> >Kernel compilation actually uses absolute pathnames e.g. for dependency
> >checking.
> 
> This used to be true, but it shouldn't be true any more. TOPDIR should
> be gone, and everything should be relative paths (and all "make"
> invocations should just be done from the top kernel directory).
> 
> But yes, it certainly _used_ to be true (and hey, maybe I've missed some
> reason for why it isn't still true).
If there is any dependency left on absolute paths thats a bug.

I have tested this by doing a full make and copy the tree.
When executing make again nothing got rebuild - so it is OK for the
general case.

But please report it if you see something in contradiction with that.

	Sam

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-11-02  9:13     ` Dipankar Sarma
@ 2002-11-04 17:29       ` Martin J. Bligh
  2002-11-05  0:00         ` jw schultz
  0 siblings, 1 reply; 25+ messages in thread
From: Martin J. Bligh @ 2002-11-04 17:29 UTC (permalink / raw)
  To: woofwoof, Andrew Morton
  Cc: dipankar, Maneesh Soni, Al Viro, LKML, Anton Blanchard,
	Paul McKenney

> Clearly dcache_lock is the killer when 'ps' command is used in
> the benchmark. My guess (without looking at 'ps' code) is that
> it has to open/close a lot of files in /proc and that increases
> the number of acquisitions of dcache_lock. Increased # of acquisition
> add to cache line bouncing and contention.

Strace it - IIRC it does 5 opens per PID. Vomit.

M.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: dcache_rcu [performance results]
  2002-11-04 17:29       ` Martin J. Bligh
@ 2002-11-05  0:00         ` jw schultz
  2002-11-05  1:14           ` ps performance sucks (was Re: dcache_rcu [performance results]) Martin J. Bligh
  0 siblings, 1 reply; 25+ messages in thread
From: jw schultz @ 2002-11-05  0:00 UTC (permalink / raw)
  To: LKML

On Mon, Nov 04, 2002 at 09:29:14AM -0800, Martin J. Bligh wrote:
> > Clearly dcache_lock is the killer when 'ps' command is used in
> > the benchmark. My guess (without looking at 'ps' code) is that
> > it has to open/close a lot of files in /proc and that increases
> > the number of acquisitions of dcache_lock. Increased # of acquisition
> > add to cache line bouncing and contention.
> 
> Strace it - IIRC it does 5 opens per PID. Vomit.

I just did, had the same reaction.  This is ugly.

It opens stat, statm, status, cmdline and environ apparently
regardless of what will be in the ouput.  At least environ
will fail on most pids if you aren't root, saving on some of
the overhead.  It compunds this by doing so for every pid
even if you have explicitly requested only one pid by
number.

Clearly ps could do with a cleanup.  There is no reason to
read environ if it wasn't asked for.   Deciding which files
are needed based on the command line options would be a
start.

I'm thinking that ps, top and company are good reasons to
make an exception of one value per file in proc.  Clearly
open+read+close of 3-5 "files" each extracting data from
task_struct isn't more efficient than one "file" that
generates the needed data one field per line.

Don't get me wrong.  I believe in the one field per file
rule but ps &co are the exception that proves (tests) the
rule.  Especially on the heavily laden systems with
tens of thousands of tasks.  We could do with a something
between /dev/kmem and five files per pid.

-- 
________________________________________________________________
	J.W. Schultz            Pegasystems Technologies
	email address:		jw@pegasys.ws

		Remember Cernan and Schmitt

^ permalink raw reply	[flat|nested] 25+ messages in thread

* ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  0:00         ` jw schultz
@ 2002-11-05  1:14           ` Martin J. Bligh
  2002-11-05  3:57             ` Werner Almesberger
                               ` (3 more replies)
  0 siblings, 4 replies; 25+ messages in thread
From: Martin J. Bligh @ 2002-11-05  1:14 UTC (permalink / raw)
  To: jw schultz, LKML

> Clearly ps could do with a cleanup.  There is no reason to
> read environ if it wasn't asked for.   Deciding which files
> are needed based on the command line options would be a
> start.
> 
> I'm thinking that ps, top and company are good reasons to
> make an exception of one value per file in proc.  Clearly
> open+read+close of 3-5 "files" each extracting data from
> task_struct isn't more efficient than one "file" that
> generates the needed data one field per line.

I think it's pretty trivial to make /proc/<pid>/psinfo, which
dumps the garbage from all five files in one place. Which makes
it 5 times better, but it still sucks.

> Don't get me wrong.  I believe in the one field per file
> rule but ps &co are the exception that proves (tests) the
> rule.  Especially on the heavily laden systems with
> tens of thousands of tasks.  We could do with a something
> between /dev/kmem and five files per pid.

I had a very brief think about this at the weekend, seeing
if I could make a big melting pot /proc/psinfo file that did
seqfile and read everything out in one go, using seq_file
internally to interate over the tasklist. The most obvious
problem that sprung to mind seems to be the tasklist locking -
you obviously can't just hold a lock over the whole thing.
As I know very little about that, I'll let someone else suggest
how to do this, but I'm prepared to do the grunt work of implementing
it if need be.

M.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  1:14           ` ps performance sucks (was Re: dcache_rcu [performance results]) Martin J. Bligh
@ 2002-11-05  3:57             ` Werner Almesberger
  2002-11-05  4:42               ` Erik Andersen
  2002-11-05  4:26             ` jw schultz
                               ` (2 subsequent siblings)
  3 siblings, 1 reply; 25+ messages in thread
From: Werner Almesberger @ 2002-11-05  3:57 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: jw schultz, LKML

Martin J. Bligh wrote:
> I had a very brief think about this at the weekend, seeing
> if I could make a big melting pot /proc/psinfo file

You could take a more radical approach. Since the goal of such
a psinfo file would be to accelerate access to information
that's already available elsewhere, you can do away with many
of the niceties of procfs, e.g.

 - no need to be human-readable (e.g. binary or hex dump may
   make sense in this case)
 - may use other operations than just open and read (e.g.
   do an initial write to select what should be read)
 - you may cache previous responses and only output deltas
   (not sure if this is useful - all you'd safe is the
   actual copy to user space)

Actually, I think attempting to just make it brutally efficient,
no matter how much nastiness you amass doing that, might be
good approach for a first version. Then, if people are
digusted, you can make things nicer, and keep track of how
much performance you're losing.

Example:

First write says "pid,comm". Internally, this gets translated
to 0x8c+0x04, 0x2ee+0x10 (offset+length). Next read returns
"pid 4,comm 16" (include the name, so you can indicate fields
the kernel doesn't recognize). Then, kmalloc 20*tasks bytes,
lock, copy the fields from struct task_struct, unlock, let the
stuff be read by user space, kfree. Adjacent fields can be
optimized to single byte strings at setup time.

- Werner

-- 
  _________________________________________________________________________
 / Werner Almesberger, Buenos Aires, Argentina         wa@almesberger.net /
/_http://www.almesberger.net/____________________________________________/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  1:14           ` ps performance sucks (was Re: dcache_rcu [performance results]) Martin J. Bligh
  2002-11-05  3:57             ` Werner Almesberger
@ 2002-11-05  4:26             ` jw schultz
  2002-11-05  5:51               ` Martin J. Bligh
  2002-11-05 19:57             ` Kai Henningsen
  2002-11-05 22:09             ` Karim Yaghmour
  3 siblings, 1 reply; 25+ messages in thread
From: jw schultz @ 2002-11-05  4:26 UTC (permalink / raw)
  To: LKML

On Mon, Nov 04, 2002 at 05:14:19PM -0800, Martin J. Bligh wrote:
> I think it's pretty trivial to make /proc/<pid>/psinfo, which
> dumps the garbage from all five files in one place. Which makes
> it 5 times better, but it still sucks.

And i'd still keep environ seperate.  I'm inclined to think
ps should never have presented it in the first place.
This is the direction i (for what it's worth) favor.

> > Don't get me wrong.  I believe in the one field per file
> > rule but ps &co are the exception that proves (tests) the
> > rule.  Especially on the heavily laden systems with
> > tens of thousands of tasks.  We could do with a something
> > between /dev/kmem and five files per pid.
> 
> I had a very brief think about this at the weekend, seeing
> if I could make a big melting pot /proc/psinfo file that did
> seqfile and read everything out in one go, using seq_file
> internally to interate over the tasklist. The most obvious
> problem that sprung to mind seems to be the tasklist locking -
> you obviously can't just hold a lock over the whole thing.
> As I know very little about that, I'll let someone else suggest
> how to do this, but I'm prepared to do the grunt work of implementing
> it if need be.

Yep, can't hold the lock across syscalls.  That would be
quite a bit of data to hold in a per fd buffer.  Think of
the big iron with tons of processes.

The other way i could see this working is to present it as a
sparse file.  ps (or whatever) would first get a list of
pids then iterate over them using lseek to set the file
offset to pid * CONSTANT_SIZE and read would return
something smaller than CONSTANT_SIZE bytes.  If the pid no
longer exists return 0.

I really hate this idea.  It stinks almost as much as
/dev/kmem.


-- 
________________________________________________________________
	J.W. Schultz            Pegasystems Technologies
	email address:		jw@pegasys.ws

		Remember Cernan and Schmitt

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  3:57             ` Werner Almesberger
@ 2002-11-05  4:42               ` Erik Andersen
  2002-11-05  5:44                 ` Martin J. Bligh
  2002-11-05  6:14                 ` Werner Almesberger
  0 siblings, 2 replies; 25+ messages in thread
From: Erik Andersen @ 2002-11-05  4:42 UTC (permalink / raw)
  To: Werner Almesberger; +Cc: Martin J. Bligh, jw schultz, LKML

On Tue Nov 05, 2002 at 12:57:45AM -0300, Werner Almesberger wrote:
> Martin J. Bligh wrote:
> > I had a very brief think about this at the weekend, seeing
> > if I could make a big melting pot /proc/psinfo file
> 
> You could take a more radical approach. Since the goal of such
> a psinfo file would be to accelerate access to information
> that's already available elsewhere, you can do away with many
> of the niceties of procfs, e.g.
> 
>  - no need to be human-readable (e.g. binary or hex dump may
>    make sense in this case)
>  - may use other operations than just open and read (e.g.

Hehe.  You just reinvented my old /dev/ps driver.  :)

http://www.busybox.net/cgi-bin/cvsweb/busybox/examples/kernel-patches/devps.patch.9_25_2000?rev=1.2&content-type=text/vnd.viewcvs-markup

This is what Linus has to say on the subject:

    I do dislike /dev/ps mightily. If the problem is that /proc
    is too large, then the right solution is to just clean up
    /proc.  Which is getting done.  And yes, /proc will be larger
    than /dev/ps, but I still find that preferable to having two
    incompatible ways to do the same thing.

 -Erik

--
Erik B. Andersen             http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  4:42               ` Erik Andersen
@ 2002-11-05  5:44                 ` Martin J. Bligh
  2002-11-05  5:59                   ` Alexander Viro
  2002-11-05  6:13                   ` Erik Andersen
  2002-11-05  6:14                 ` Werner Almesberger
  1 sibling, 2 replies; 25+ messages in thread
From: Martin J. Bligh @ 2002-11-05  5:44 UTC (permalink / raw)
  To: andersen, Werner Almesberger; +Cc: jw schultz, LKML

> Hehe.  You just reinvented my old /dev/ps driver.  :)

Indeed, sounds much more like a /dev thing than a /proc thing
at this point ;-)

> http://www.busybox.net/cgi-bin/cvsweb/busybox/examples/kernel-patches/devps.patch.9_25_2000?rev=1.2&content-type=text/vnd.viewcvs-markup
> 
> This is what Linus has to say on the subject:
> 
> ... If the problem is that /proc
>     is too large, then the right solution is to just clean up
>     /proc.  Which is getting done.  And yes, /proc will be larger
>     than /dev/ps, but I still find that preferable to having two
>     incompatible ways to do the same thing.

Ummm ... how do we make /proc smaller than 1 file to open per PID?
It's pretty easy to get it down that far. But it still sucks.

>     I do dislike /dev/ps mightily.

Well it can't be any worse than the current crap. At least it'd 
stand a chance in hell of scaling a little bit. So I took a very 
quick look ... what syscalls are you reduced to per pid, one ioctl 
and one read?

M.




^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  4:26             ` jw schultz
@ 2002-11-05  5:51               ` Martin J. Bligh
  0 siblings, 0 replies; 25+ messages in thread
From: Martin J. Bligh @ 2002-11-05  5:51 UTC (permalink / raw)
  To: jw schultz, LKML

> And i'd still keep environ seperate.  I'm inclined to think
> ps should never have presented it in the first place.
> This is the direction i (for what it's worth) favor.

If it doesn't need it then sure, otherwise just dump whatever
it needs in there. The seperate files would still be there too.

> Yep, can't hold the lock across syscalls.  That would be
> quite a bit of data to hold in a per fd buffer.  Think of
> the big iron with tons of processes.

I *have* the big iron with tons of processes ;-) That's why
I care ...

> The other way i could see this working is to present it as a
> sparse file.  ps (or whatever) would first get a list of
> pids then iterate over them using lseek to set the file
> offset to pid * CONSTANT_SIZE and read would return
> something smaller than CONSTANT_SIZE bytes.  If the pid no
> longer exists return 0.
> 
> I really hate this idea.  It stinks almost as much as
> /dev/kmem.

Well if we want to be gross and efficient, we could just compile
a kmem-diving dynamic library with every kernel compile and stick
it in /boot or somewhere. Mildly less extreme is a flat index file
for the data you need a la System.map. Then just open /dev/kmem
and grab what you want. Walking the tasklist with no locking would
be an interesting challenge, but probably not insurmountable. 
That's how things like ps always used to work IIRC.

M.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  5:44                 ` Martin J. Bligh
@ 2002-11-05  5:59                   ` Alexander Viro
  2002-11-05  6:05                     ` Martin J. Bligh
  2002-11-05  6:15                     ` Robert Love
  2002-11-05  6:13                   ` Erik Andersen
  1 sibling, 2 replies; 25+ messages in thread
From: Alexander Viro @ 2002-11-05  5:59 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: andersen, Werner Almesberger, jw schultz, LKML



On Mon, 4 Nov 2002, Martin J. Bligh wrote:

> >     I do dislike /dev/ps mightily.
> 
> Well it can't be any worse than the current crap. At least it'd 
> stand a chance in hell of scaling a little bit. So I took a very 
> quick look ... what syscalls are you reduced to per pid, one ioctl 
> and one read?

Oh, yes it can.  Easily.
	* device is not network-transparent - even in principle
	* restricting data access would be harder - welcome to suid or
sgid country
	* real killer: you think Albert would fail to produce equally
crappy code and equally crappy behaviour?  Yeah, right.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  5:59                   ` Alexander Viro
@ 2002-11-05  6:05                     ` Martin J. Bligh
  2002-11-05  6:15                     ` Robert Love
  1 sibling, 0 replies; 25+ messages in thread
From: Martin J. Bligh @ 2002-11-05  6:05 UTC (permalink / raw)
  To: Alexander Viro; +Cc: andersen, Werner Almesberger, jw schultz, LKML

>> Well it can't be any worse than the current crap. At least it'd 
>> stand a chance in hell of scaling a little bit. So I took a very 
>> quick look ... what syscalls are you reduced to per pid, one ioctl 
>> and one read?
> 
> Oh, yes it can.  Easily.
> 	* device is not network-transparent - even in principle

Is that really a major issue for ps?

> 	* restricting data access would be harder - welcome to suid or
> sgid country

I can live with that level of pain if my benchmark doesn't get
driven into the wall by the tools that are meant to be montoring
it ...

I'm sure there are bigger rocks to be thrown at it as well, and
ugly critters to be found under those rocks, but I don't see anything
insurmountable here yet. Whereas opening billions of files is just
unworkable.

Better still, you seem like an excellent candidate to propose a good 
design that's efficient and workable? 

> 	* real killer: you think Albert would fail to produce equally
> crappy code and equally crappy behaviour?  Yeah, right.

Heh ;-) 
A hostile takeover might suffice here, if necessary ...

M.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  5:44                 ` Martin J. Bligh
  2002-11-05  5:59                   ` Alexander Viro
@ 2002-11-05  6:13                   ` Erik Andersen
  1 sibling, 0 replies; 25+ messages in thread
From: Erik Andersen @ 2002-11-05  6:13 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Werner Almesberger, jw schultz, LKML

On Mon Nov 04, 2002 at 09:44:07PM -0800, Martin J. Bligh wrote:
> > Hehe.  You just reinvented my old /dev/ps driver.  :)
> 
> Indeed, sounds much more like a /dev thing than a /proc thing
> at this point ;-)
> 
> > http://www.busybox.net/cgi-bin/cvsweb/busybox/examples/kernel-patches/devps.patch.9_25_2000?rev=1.2&content-type=text/vnd.viewcvs-markup
> > 
> > This is what Linus has to say on the subject:
> > 
> > ... If the problem is that /proc
> >     is too large, then the right solution is to just clean up
> >     /proc.  Which is getting done.  And yes, /proc will be larger
> >     than /dev/ps, but I still find that preferable to having two
> >     incompatible ways to do the same thing.
> 
> Ummm ... how do we make /proc smaller than 1 file to open per PID?
> It's pretty easy to get it down that far. But it still sucks.
> 
> >     I do dislike /dev/ps mightily.
> 
> Well it can't be any worse than the current crap. At least it'd 
> stand a chance in hell of scaling a little bit. So I took a very 
> quick look ... what syscalls are you reduced to per pid, one ioctl 
> and one read?

As I implemented it, it was one ioctl per pid...  Of course 
it could be easily modified to be one syscall, one read from
the /dev/ps char device, or similar...

 -Erik

--
Erik B. Andersen             http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  4:42               ` Erik Andersen
  2002-11-05  5:44                 ` Martin J. Bligh
@ 2002-11-05  6:14                 ` Werner Almesberger
  1 sibling, 0 replies; 25+ messages in thread
From: Werner Almesberger @ 2002-11-05  6:14 UTC (permalink / raw)
  To: Erik Andersen, Martin J. Bligh, jw schultz, LKML

Erik Andersen wrote:
> Hehe.  You just reinvented my old /dev/ps driver.  :)

Hmm, you still need 2+#pids operations, while my approach could
essentially do everything in a single read. Besides, many people
don't exactly love ioctls.

Furthermore, you'd need to version your big struct pid_info,
while my approach wouldn't have problems if fields are added or
removed (only if their content changes).

Two advantages of your approach are that the amount of data
cached in the kernel is limited to max(sizeof(pid_t)*#pids,
sizeof(struct pid_info)), and that you do less work under
tasklist_lock.

>     I do dislike /dev/ps mightily. If the problem is that /proc
>     is too large, then the right solution is to just clean up
>     /proc.  Which is getting done.  And yes, /proc will be larger
>     than /dev/ps, but I still find that preferable to having two
>     incompatible ways to do the same thing.

Hmm yes, so he might not like my /proc/grab-taskdata-FAST
either :)

- Werner

-- 
  _________________________________________________________________________
 / Werner Almesberger, Buenos Aires, Argentina         wa@almesberger.net /
/_http://www.almesberger.net/____________________________________________/

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  5:59                   ` Alexander Viro
  2002-11-05  6:05                     ` Martin J. Bligh
@ 2002-11-05  6:15                     ` Robert Love
  1 sibling, 0 replies; 25+ messages in thread
From: Robert Love @ 2002-11-05  6:15 UTC (permalink / raw)
  To: Alexander Viro
  Cc: Martin J. Bligh, andersen, Werner Almesberger, jw schultz, LKML

On Tue, 2002-11-05 at 00:59, Alexander Viro wrote:

> Oh, yes it can.  Easily.
> 	* device is not network-transparent - even in principle
> 	* restricting data access would be harder - welcome to suid or
> sgid country
> 	* real killer: you think Albert would fail to produce equally
> crappy code and equally crappy behaviour?  Yeah, right.

Well I think Rik and I can handle it in our tree :)

But I agree - I do not care much for this /dev idea either.

	Robert Love


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  1:14           ` ps performance sucks (was Re: dcache_rcu [performance results]) Martin J. Bligh
  2002-11-05  3:57             ` Werner Almesberger
  2002-11-05  4:26             ` jw schultz
@ 2002-11-05 19:57             ` Kai Henningsen
  2002-11-05 21:33               ` Erik Andersen
  2002-11-05 22:09             ` Karim Yaghmour
  3 siblings, 1 reply; 25+ messages in thread
From: Kai Henningsen @ 2002-11-05 19:57 UTC (permalink / raw)
  To: linux-kernel

mbligh@aracnet.com (Martin J. Bligh)  wrote on 04.11.02 in <1118170000.1036458859@flay>:

> I had a very brief think about this at the weekend, seeing
> if I could make a big melting pot /proc/psinfo file that did
> seqfile and read everything out in one go, using seq_file
> internally to interate over the tasklist. The most obvious
> problem that sprung to mind seems to be the tasklist locking -
> you obviously can't just hold a lock over the whole thing.

Well, one thing i to make certain you can actually do it with one or two  
system calls. Say, one system call to figure out how big a buffer is  
necessary (essentially, #tasks*size), then one read with a suitably-sized  
buffer. Then have a loop in the kernel that drops the lock as often as  
necessary, and otherwise puts it all in the buffer in one go. (If the  
#tasks grows too fast so it overruns the buffer even with some slack given  
in advance, tough, have a useful return code to indicate that and let ps  
retry.)

I briefly thought about mmap, but I don't think that actually buys  
anything.

MfG Kai

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05 19:57             ` Kai Henningsen
@ 2002-11-05 21:33               ` Erik Andersen
  0 siblings, 0 replies; 25+ messages in thread
From: Erik Andersen @ 2002-11-05 21:33 UTC (permalink / raw)
  To: Kai Henningsen; +Cc: linux-kernel

On Tue Nov 05, 2002 at 09:57:00PM +0200, Kai Henningsen wrote:
> mbligh@aracnet.com (Martin J. Bligh)  wrote on 04.11.02 in <1118170000.1036458859@flay>:
> 
> > I had a very brief think about this at the weekend, seeing
> > if I could make a big melting pot /proc/psinfo file that did
> > seqfile and read everything out in one go, using seq_file
> > internally to interate over the tasklist. The most obvious
> > problem that sprung to mind seems to be the tasklist locking -
> > you obviously can't just hold a lock over the whole thing.
> 
> Well, one thing i to make certain you can actually do it with one or two  
> system calls. Say, one system call to figure out how big a buffer is  
> necessary (essentially, #tasks*size), then one read with a suitably-sized  
> buffer. Then have a loop in the kernel that drops the lock as often as  
> necessary, and otherwise puts it all in the buffer in one go. (If the  
> #tasks grows too fast so it overruns the buffer even with some slack given  
> in advance, tough, have a useful return code to indicate that and let ps  
> retry.)
> 
> I briefly thought about mmap, but I don't think that actually buys  
> anything.

Once again, reminds me of my /dev/ps driver, which had the
following ioctls:

#define DEVPS_GET_NUM_PIDS     0xeba1 /* Get a list of all PIDs */ 
#define DEVPS_GET_PID_LIST     0xeba2 /* Get a list of all PIDs */ 
#define DEVPS_GET_PID_INFO     0xeba3 /* Get info about a specific PID */
#define DEVPS_GET_CURRENT_PID  0xeba4 /* Get the current PID */

So a user spave ps app would call DEVPS_GET_NUM_PIDS to find out
how many processes there are, then it would allocate some memory
(and would allocate a some extra just in case some new processes 
were to start up, the kernel would truncate things if we gave it
too little space) .  Then ps would grab the pid list by calling 
the DEVPS_GET_PID_LIST ioctl, and then for each item in the list
it would call DEVPS_GET_PID_INFO.  Assuming that call was
successful, ps would print out a line of output and move on to
the next pid in the list.

The idea need not be implemented without using ioctl and without
using binary structures (which were the things Linus objected to)

The same thing could be easily done using flat ascii...

 -Erik

--
Erik B. Andersen             http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: ps performance sucks (was Re: dcache_rcu [performance results])
  2002-11-05  1:14           ` ps performance sucks (was Re: dcache_rcu [performance results]) Martin J. Bligh
                               ` (2 preceding siblings ...)
  2002-11-05 19:57             ` Kai Henningsen
@ 2002-11-05 22:09             ` Karim Yaghmour
  3 siblings, 0 replies; 25+ messages in thread
From: Karim Yaghmour @ 2002-11-05 22:09 UTC (permalink / raw)
  To: LKML

I'm not sure why people are trying to make pigs fly, but if you
really need in-depth information regarding a process or a set
of processes, you should be looking at something that's been
designed from the ground up to actually carry this weight, which
is exactly what LTT is about. Using this approach, all the
accounting gets to be done in user-space. It's like using
"top -q" without the actual disadvantage of killing your system.

Karim

===================================================
                 Karim Yaghmour
               karim@opersys.com
      Embedded and Real-Time Linux Expert
===================================================

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2002-11-05 21:59 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-10-30 10:49 [PATCH 2.5.44] dcache_rcu Maneesh Soni
2002-10-31 10:53 ` dcache_rcu [performance results] Dipankar Sarma
2002-11-02  1:36   ` Andrew Morton
2002-11-02  9:13     ` Dipankar Sarma
2002-11-04 17:29       ` Martin J. Bligh
2002-11-05  0:00         ` jw schultz
2002-11-05  1:14           ` ps performance sucks (was Re: dcache_rcu [performance results]) Martin J. Bligh
2002-11-05  3:57             ` Werner Almesberger
2002-11-05  4:42               ` Erik Andersen
2002-11-05  5:44                 ` Martin J. Bligh
2002-11-05  5:59                   ` Alexander Viro
2002-11-05  6:05                     ` Martin J. Bligh
2002-11-05  6:15                     ` Robert Love
2002-11-05  6:13                   ` Erik Andersen
2002-11-05  6:14                 ` Werner Almesberger
2002-11-05  4:26             ` jw schultz
2002-11-05  5:51               ` Martin J. Bligh
2002-11-05 19:57             ` Kai Henningsen
2002-11-05 21:33               ` Erik Andersen
2002-11-05 22:09             ` Karim Yaghmour
     [not found] <20021030161912.E2613@in.ibm.com.suse.lists.linux.kernel>
     [not found] ` <20021031162330.B12797@in.ibm.com.suse.lists.linux.kernel>
     [not found]   ` <3DC32C03.C3910128@digeo.com.suse.lists.linux.kernel>
     [not found]     ` <20021102144306.A6736@dikhow.suse.lists.linux.kernel>
2002-11-02 10:08       ` dcache_rcu [performance results] Andi Kleen
2002-11-02 10:54         ` Dipankar Sarma
2002-11-02 11:01           ` Andi Kleen
2002-11-02 19:41             ` Linus Torvalds
2002-11-02 21:16               ` Sam Ravnborg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox