All of lore.kernel.org
 help / color / mirror / Atom feed
From: Trond Myklebust <trond.myklebust@fys.uio.no>
To: Linus Torvalds <torvalds@transmeta.com>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH] Remove BKL from NFS read/write code + SunRPC...
Date: Tue, 12 Feb 2002 12:51:56 +0100	[thread overview]
Message-ID: <15465.476.953349.720240@charged.uio.no> (raw)


 The following patch strongly reduces BKL contention within the NFS
read/write code, and within the generic RPC layer. Features:

  - BKL is added explicitly where needed for list protection in the
    NLM locking code.

  - BKL added where still needed to protect the NFS_FLAGS(inode) (this
    will of course be fixed in a later patch).

  - Replace BKL with spinlock "nfs_flushd_lock" for protecting the
    read/write asynchronous flush daemon.

  - Replace BKL with spinlock "nfs_inode_lock" for protecting the
    inode attribute updates.

This also allows us to drop the BKL from the SunRPC code. For the
latter, all the internals should already be SMP safe. The global BKL
was only being maintained in order to ensure that the user callbacks
don't race with one another (and because I was nervous of removing it
in a stable series ;-).

Cheers,
  Trond

diff -u --recursive --new-file linux-2.5.4/fs/lockd/clntproc.c linux-2.5.4-rpc_bkl/fs/lockd/clntproc.c
--- linux-2.5.4/fs/lockd/clntproc.c	Mon Feb 11 02:50:08 2002
+++ linux-2.5.4-rpc_bkl/fs/lockd/clntproc.c	Tue Feb 12 09:32:18 2002
@@ -569,11 +569,15 @@
 		printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status);
 
 die:
+	lock_kernel();
 	nlm_release_host(req->a_host);
+	unlock_kernel();
 	kfree(req);
 	return;
  retry_rebind:
+	lock_kernel();
 	nlm_rebind_host(req->a_host);
+	unlock_kernel();
  retry_unlock:
 	rpc_restart_call(task);
 }
@@ -650,12 +654,16 @@
 	}
 
 die:
+	lock_kernel();
 	nlm_release_host(req->a_host);
+	unlock_kernel();
 	kfree(req);
 	return;
 
 retry_cancel:
+	lock_kernel();
 	nlm_rebind_host(req->a_host);
+	unlock_kernel();
 	rpc_restart_call(task);
 	rpc_delay(task, 30 * HZ);
 }
diff -u --recursive --new-file linux-2.5.4/fs/lockd/svc4proc.c linux-2.5.4-rpc_bkl/fs/lockd/svc4proc.c
--- linux-2.5.4/fs/lockd/svc4proc.c	Mon Feb 11 02:50:14 2002
+++ linux-2.5.4-rpc_bkl/fs/lockd/svc4proc.c	Tue Feb 12 09:32:18 2002
@@ -17,6 +17,7 @@
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
 #include <linux/lockd/sm_inter.h>
+#include <linux/smp_lock.h>
 
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
@@ -499,7 +500,9 @@
 		dprintk("lockd: %4d callback failed (errno = %d)\n",
 					task->tk_pid, -task->tk_status);
 	}
+	lock_kernel();
 	nlm_release_host(call->a_host);
+	unlock_kernel();
 	kfree(call);
 }
 
diff -u --recursive --new-file linux-2.5.4/fs/lockd/svclock.c linux-2.5.4-rpc_bkl/fs/lockd/svclock.c
--- linux-2.5.4/fs/lockd/svclock.c	Mon Feb 11 02:50:13 2002
+++ linux-2.5.4-rpc_bkl/fs/lockd/svclock.c	Tue Feb 12 09:32:18 2002
@@ -576,9 +576,10 @@
 	dprintk("lockd: GRANT_MSG RPC callback\n");
 	dprintk("callback: looking for cookie %x \n", 
 		*(unsigned int *)(call->a_args.cookie.data));
+	lock_kernel();
 	if (!(block = nlmsvc_find_block(&call->a_args.cookie))) {
 		dprintk("lockd: no block for cookie %x\n", *(u32 *)(call->a_args.cookie.data));
-		return;
+		goto out;
 	}
 
 	/* Technically, we should down the file semaphore here. Since we
@@ -599,6 +600,8 @@
 	block->b_incall = 0;
 
 	nlm_release_host(call->a_host);
+ out:
+	unlock_kernel();
 }
 
 /*
diff -u --recursive --new-file linux-2.5.4/fs/lockd/svcproc.c linux-2.5.4-rpc_bkl/fs/lockd/svcproc.c
--- linux-2.5.4/fs/lockd/svcproc.c	Mon Feb 11 02:50:10 2002
+++ linux-2.5.4-rpc_bkl/fs/lockd/svcproc.c	Tue Feb 12 09:32:18 2002
@@ -18,6 +18,7 @@
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
 #include <linux/lockd/sm_inter.h>
+#include <linux/smp_lock.h>
 
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
@@ -527,7 +528,9 @@
 		dprintk("lockd: %4d callback failed (errno = %d)\n",
 					task->tk_pid, -task->tk_status);
 	}
+	lock_kernel();
 	nlm_release_host(call->a_host);
+	unlock_kernel();
 	kfree(call);
 }
 
diff -u --recursive --new-file linux-2.5.4/fs/nfs/file.c linux-2.5.4-rpc_bkl/fs/nfs/file.c
--- linux-2.5.4/fs/nfs/file.c	Mon Feb 11 02:50:09 2002
+++ linux-2.5.4-rpc_bkl/fs/nfs/file.c	Tue Feb 12 09:32:18 2002
@@ -99,7 +99,9 @@
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long) *ppos);
 
+	lock_kernel();
 	result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	unlock_kernel();
 	if (!result)
 		result = generic_file_read(file, buf, count, ppos);
 	return result;
@@ -115,7 +117,9 @@
 	dfprintk(VFS, "nfs: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
+	lock_kernel();
 	status = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	unlock_kernel();
 	if (!status)
 		status = generic_file_mmap(file, vma);
 	return status;
@@ -134,13 +138,11 @@
 
 	dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
 
-	lock_kernel();
 	status = nfs_wb_file(inode, file);
 	if (!status) {
 		status = file->f_error;
 		file->f_error = 0;
 	}
-	unlock_kernel();
 	return status;
 }
 
@@ -160,12 +162,7 @@
 
 static int nfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
 {
-	long status;
-
-	lock_kernel();
-	status = nfs_updatepage(file, page, offset, to-offset);
-	unlock_kernel();
-	return status;
+	return nfs_updatepage(file, page, offset, to-offset);
 }
 
 /*
@@ -219,7 +216,9 @@
 	result = -EBUSY;
 	if (IS_SWAPFILE(inode))
 		goto out_swapfile;
+	lock_kernel();
 	result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	unlock_kernel();
 	if (result)
 		goto out;
 
diff -u --recursive --new-file linux-2.5.4/fs/nfs/flushd.c linux-2.5.4-rpc_bkl/fs/nfs/flushd.c
--- linux-2.5.4/fs/nfs/flushd.c	Mon Feb 11 02:50:09 2002
+++ linux-2.5.4-rpc_bkl/fs/nfs/flushd.c	Tue Feb 12 09:32:18 2002
@@ -51,6 +51,19 @@
  * This is the wait queue all cluster daemons sleep on
  */
 static struct rpc_wait_queue    flushd_queue = RPC_INIT_WAITQ("nfs_flushd");
+static spinlock_t nfs_flushd_lock = SPIN_LOCK_UNLOCKED;
+
+static inline void
+nfs_lock_flushd(void)
+{
+	spin_lock(&nfs_flushd_lock);
+}
+
+static inline void
+nfs_unlock_flushd(void)
+{
+	spin_unlock(&nfs_flushd_lock);
+}
 
 /*
  * Local function declarations.
@@ -67,12 +80,11 @@
 
 	dprintk("NFS: writecache_init\n");
 
-	lock_kernel();
-	status = -ENOMEM;
 	/* Create the RPC task */
 	if (!(task = rpc_new_task(server->client, NULL, RPC_TASK_ASYNC)))
-		goto out_unlock;
+		return -ENOMEM;
 
+	nfs_lock_flushd();
 	cache = server->rw_requests;
 
 	status = 0;
@@ -89,22 +101,21 @@
 	cache->auth = server->client->cl_auth;
 	task->tk_action   = nfs_flushd;
 	task->tk_exit   = nfs_flushd_exit;
+	nfs_unlock_flushd();
 
 	rpc_execute(task);
-	unlock_kernel();
 	return 0;
  out_unlock:
-	if (task)
-		rpc_release_task(task);
-	unlock_kernel();
-	return status;
+	nfs_unlock_flushd();
+	rpc_release_task(task);
+	return 0;
 }
 
 void nfs_reqlist_exit(struct nfs_server *server)
 {
 	struct nfs_reqlist      *cache;
 
-	lock_kernel();
+	nfs_lock_flushd();
 	cache = server->rw_requests;
 	if (!cache)
 		goto out;
@@ -114,11 +125,13 @@
 	while (cache->task) {
 		rpc_exit(cache->task, 0);
 		rpc_wake_up_task(cache->task);
+		nfs_unlock_flushd();
 
 		interruptible_sleep_on_timeout(&cache->request_wait, 1 * HZ);
+		nfs_lock_flushd();
 	}
  out:
-	unlock_kernel();
+	nfs_unlock_flushd();
 }
 
 int nfs_reqlist_alloc(struct nfs_server *server)
@@ -183,11 +196,13 @@
 	}
 
 	dprintk("NFS: %4d flushd back to sleep\n", task->tk_pid);
+	nfs_lock_flushd();
 	if (task->tk_action) {
 		task->tk_timeout = NFS_FLUSHD_TIMEOUT;
 		cache->runat = jiffies + task->tk_timeout;
 		rpc_sleep_on(&flushd_queue, task, NULL, NULL);
 	}
+	nfs_unlock_flushd();
 }
 
 static void
@@ -196,10 +211,13 @@
 	struct nfs_server	*server;
 	struct nfs_reqlist	*cache;
 	server = (struct nfs_server *) task->tk_calldata;
+
+	nfs_lock_flushd();
 	cache = server->rw_requests;
 
 	if (cache->task == task)
 		cache->task = NULL;
 	wake_up(&cache->request_wait);
+	nfs_unlock_flushd();
 }
 
diff -u --recursive --new-file linux-2.5.4/fs/nfs/inode.c linux-2.5.4-rpc_bkl/fs/nfs/inode.c
--- linux-2.5.4/fs/nfs/inode.c	Mon Feb 11 02:50:12 2002
+++ linux-2.5.4-rpc_bkl/fs/nfs/inode.c	Tue Feb 12 09:32:18 2002
@@ -90,6 +90,9 @@
 	&nfs_rpcstat,
 };
 
+/* Spinlock to protect the NFS inode update */
+static spinlock_t nfs_inode_lock = SPIN_LOCK_UNLOCKED;
+
 static inline unsigned long
 nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 {
@@ -586,18 +589,30 @@
 }
 
 /*
+ * Reset the read time on the local caches
+ */
+void
+nfs_invalidate_caches(struct inode *inode)
+{
+	spin_lock(&nfs_inode_lock);
+	NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1;
+	spin_unlock(&nfs_inode_lock);
+}
+
+/*
  * Invalidate the local caches
  */
 void
 nfs_zap_caches(struct inode *inode)
 {
-	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
-	NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
-
 	invalidate_inode_pages(inode);
 
+	spin_lock(&nfs_inode_lock);
+	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
+	NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
 	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
-	NFS_CACHEINV(inode);
+	NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1;
+	spin_unlock(&nfs_inode_lock);
 }
 
 /*
@@ -838,7 +853,11 @@
 nfs_revalidate(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	return nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	int status;
+	lock_kernel();
+	status = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	unlock_kernel();
+	return status;
 }
 
 /*
@@ -867,13 +886,11 @@
 	struct rpc_auth *auth;
 	struct rpc_cred *cred;
 
-	lock_kernel();
 	auth = NFS_CLIENT(inode)->cl_auth;
 	cred = rpcauth_lookupcred(auth, 0);
 	filp->private_data = cred;
 	if (filp->f_mode & FMODE_WRITE)
 		nfs_set_mmcred(inode, cred);
-	unlock_kernel();
 	return 0;
 }
 
@@ -881,11 +898,9 @@
 {
 	struct rpc_cred *cred;
 
-	lock_kernel();
 	cred = nfs_file_cred(filp);
 	if (cred)
 		put_rpccred(cred);
-	unlock_kernel();
 	return 0;
 }
 
@@ -902,7 +917,6 @@
 	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
 		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
 
-	lock_kernel();
 	if (!inode || is_bad_inode(inode))
  		goto out_nowait;
 	if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode)
@@ -948,10 +962,22 @@
 	NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING;
 	wake_up(&inode->i_wait);
  out_nowait:
-	unlock_kernel();
 	return status;
 }
 
+/**
+ * nfs_grow_isize - Extend inode->i_size
+ * @inode: inode
+ * @size: new file size
+ */
+void nfs_grow_isize(struct inode *inode, loff_t size)
+{
+	spin_lock(&nfs_inode_lock);
+	if (inode->i_size < size)
+		inode->i_size = size;
+	spin_unlock(&nfs_inode_lock);
+}
+
 /*
  * nfs_fattr_obsolete - Test if attribute data is newer than cached data
  * @inode: inode
@@ -1024,6 +1050,7 @@
 
 	new_atime = nfs_time_to_secs(fattr->atime);
 	/* Avoid races */
+	spin_lock(&nfs_inode_lock);
 	if (nfs_fattr_obsolete(inode, fattr))
 		goto out_nochange;
 
@@ -1109,6 +1136,7 @@
 			NFS_ATTRTIMEO(inode) = NFS_MAXATTRTIMEO(inode);
 		NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
 	}
+	spin_unlock(&nfs_inode_lock);
 
 	if (invalid)
 		nfs_zap_caches(inode);
@@ -1116,6 +1144,7 @@
  out_nochange:
 	if (new_atime - inode->i_atime > 0)
 		inode->i_atime = new_atime;
+	spin_unlock(&nfs_inode_lock);
 	return 0;
  out_changed:
 	/*
diff -u --recursive --new-file linux-2.5.4/fs/nfs/read.c linux-2.5.4-rpc_bkl/fs/nfs/read.c
--- linux-2.5.4/fs/nfs/read.c	Mon Feb 11 02:50:06 2002
+++ linux-2.5.4-rpc_bkl/fs/nfs/read.c	Tue Feb 12 09:32:18 2002
@@ -114,11 +114,9 @@
 			(long long)NFS_FILEID(inode),
 			(long long)offset, rsize, buffer);
 
-		lock_kernel();
 		result = NFS_PROTO(inode)->read(inode, cred, &fattr, flags,
 						offset, rsize, buffer, &eof);
 		nfs_refresh_inode(inode, &fattr);
-		unlock_kernel();
 
 		/*
 		 * Even if we had a partial success we can't mark the page
@@ -277,9 +275,7 @@
 
 	rpc_clnt_sigmask(clnt, &oldset);
 	rpc_call_setup(task, &msg, 0);
-	lock_kernel();
 	rpc_execute(task);
-	unlock_kernel();
 	rpc_clnt_sigunmask(clnt, &oldset);
 	return 0;
 out_bad:
diff -u --recursive --new-file linux-2.5.4/fs/nfs/write.c linux-2.5.4-rpc_bkl/fs/nfs/write.c
--- linux-2.5.4/fs/nfs/write.c	Mon Feb 11 02:50:11 2002
+++ linux-2.5.4-rpc_bkl/fs/nfs/write.c	Tue Feb 12 09:32:18 2002
@@ -194,8 +194,7 @@
 		 * If we've extended the file, update the inode
 		 * now so we don't invalidate the cache.
 		 */
-		if (base > inode->i_size)
-			inode->i_size = base;
+		nfs_grow_isize(inode, base);
 	} while (count);
 
 	if (PageError(page))
@@ -226,9 +225,7 @@
 	nfs_unlock_request(req);
 	nfs_strategy(inode);
 	end = ((loff_t)page->index<<PAGE_CACHE_SHIFT) + (loff_t)(offset + count);
-	if (inode->i_size < end)
-		inode->i_size = end;
-
+	nfs_grow_isize(inode, end);
  out:
 	return status;
 }
@@ -260,7 +257,6 @@
 	if (page->index >= end_index+1 || !offset)
 		goto out;
 do_it:
-	lock_kernel();
 	if (NFS_SERVER(inode)->wsize >= PAGE_CACHE_SIZE && !IS_SYNC(inode)) {
 		err = nfs_writepage_async(NULL, inode, page, 0, offset);
 		if (err >= 0)
@@ -270,7 +266,6 @@
 		if (err == offset)
 			err = 0;
 	}
-	unlock_kernel();
 out:
 	UnlockPage(page);
 	return err; 
@@ -828,8 +823,7 @@
 
 	status = 0;
 	end = ((loff_t)page->index<<PAGE_CACHE_SHIFT) + (loff_t)(offset + count);
-	if (inode->i_size < end)
-		inode->i_size = end;
+	nfs_grow_isize(inode, end);
 
 	/* If we wrote past the end of the page.
 	 * Call the strategy routine so it can send out a bunch
@@ -952,9 +946,7 @@
 
 	rpc_clnt_sigmask(clnt, &oldset);
 	rpc_call_setup(task, &msg, 0);
-	lock_kernel();
 	rpc_execute(task);
-	unlock_kernel();
 	rpc_clnt_sigunmask(clnt, &oldset);
 	return 0;
  out_bad:
@@ -1175,9 +1167,7 @@
 	dprintk("NFS: %4d initiated commit call\n", task->tk_pid);
 	rpc_clnt_sigmask(clnt, &oldset);
 	rpc_call_setup(task, &msg, 0);
-	lock_kernel();
 	rpc_execute(task);
-	unlock_kernel();
 	rpc_clnt_sigunmask(clnt, &oldset);
 	return 0;
  out_bad:
diff -u --recursive --new-file linux-2.5.4/include/linux/nfs_fs.h linux-2.5.4-rpc_bkl/include/linux/nfs_fs.h
--- linux-2.5.4/include/linux/nfs_fs.h	Mon Feb 11 02:50:14 2002
+++ linux-2.5.4-rpc_bkl/include/linux/nfs_fs.h	Tue Feb 12 11:14:21 2002
@@ -188,10 +188,7 @@
 #define NFS_CACHE_MTIME(inode)		(NFS_I(inode)->read_cache_mtime)
 #define NFS_CACHE_ISIZE(inode)		(NFS_I(inode)->read_cache_isize)
 #define NFS_NEXTSCAN(inode)		(NFS_I(inode)->nextscan)
-#define NFS_CACHEINV(inode) \
-do { \
-	NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; \
-} while (0)
+#define NFS_CACHEINV(inode)		nfs_invalidate_caches(inode)
 #define NFS_ATTRTIMEO(inode)		(NFS_I(inode)->attrtimeo)
 #define NFS_MINATTRTIMEO(inode) \
 	(S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \
@@ -227,11 +224,13 @@
  * linux/fs/nfs/inode.c
  */
 extern void nfs_zap_caches(struct inode *);
+extern void nfs_invalidate_caches(struct inode *);
 extern int nfs_inode_is_stale(struct inode *, struct nfs_fh *,
 				struct nfs_fattr *);
 extern struct inode *nfs_fhget(struct dentry *, struct nfs_fh *,
 				struct nfs_fattr *);
 extern int __nfs_refresh_inode(struct inode *, struct nfs_fattr *);
+extern void nfs_grow_isize(struct inode *, loff_t);
 extern int nfs_revalidate(struct dentry *);
 extern int nfs_permission(struct inode *, int);
 extern int nfs_open(struct inode *, struct file *);
diff -u --recursive --new-file linux-2.5.4/net/sunrpc/sched.c linux-2.5.4-rpc_bkl/net/sunrpc/sched.c
--- linux-2.5.4/net/sunrpc/sched.c	Mon Feb 11 02:50:07 2002
+++ linux-2.5.4-rpc_bkl/net/sunrpc/sched.c	Tue Feb 12 09:32:18 2002
@@ -1052,7 +1052,6 @@
 	int		rounds = 0;
 
 	MOD_INC_USE_COUNT;
-	lock_kernel();
 	/*
 	 * Let our maker know we're running ...
 	 */

             reply	other threads:[~2002-02-12 11:52 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-02-12 11:51 Trond Myklebust [this message]
2002-02-12 17:42 ` [PATCH] Remove BKL from NFS read/write code + SunRPC Dave Hansen
2002-02-12 17:47   ` Trond Myklebust
2002-02-12 18:26     ` Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=15465.476.953349.720240@charged.uio.no \
    --to=trond.myklebust@fys.uio.no \
    --cc=linux-kernel@vger.kernel.org \
    --cc=torvalds@transmeta.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.