[PATCH] Smooth out NFS client writeback

Linux NFS development
 help / color / mirror / Atom feed

* [PATCH] Smooth out NFS client writeback
@ 2005-06-02  1:38 Shantanu Goel
  2005-06-02  1:56 ` Trond Myklebust
  0 siblings, 1 reply; 13+ messages in thread
From: Shantanu Goel @ 2005-06-02  1:38 UTC (permalink / raw)
  To: trond.myklebust, nfs

[-- Attachment #1: Type: text/plain, Size: 875 bytes --]

Hi Trond,

The current NFS client can cause a program to stall
for long periods of time because it flushes all dirty
pages at once.  The attached patch addresses this by
only writing back the amount requested by the VM
layer.  It also reduces the # commit requests by
waiting for some writeback to complete before issuing
a commit.  The patch also speeds up writebacks of
mmap'ed data by accumulating dirty pages but sending
commits earlier and omitting FLUSH_STABLE.

As part of the patch, I reinstated specifying the
commit range because I observed a marked speed up when
testing on a filesystem mounted from a Solaris 2.10
server.

The patch is against 2.6.12-rc5 w/NFS_ALL.

Thanks,
Shantanu

__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 2449169141-nfs-writeback.patch --]
[-- Type: text/x-patch; name="nfs-writeback.patch", Size: 18142 bytes --]

diff -ur -x '*~' 00-trond/fs/nfs/inode.c 01-nfs-writeback/fs/nfs/inode.c
--- 00-trond/fs/nfs/inode.c	2005-06-01 18:30:38.000000000 -0400
+++ 01-nfs-writeback/fs/nfs/inode.c	2005-06-01 18:31:00.000000000 -0400
@@ -60,7 +60,6 @@
 
 static struct inode *nfs_alloc_inode(struct super_block *sb);
 static void nfs_destroy_inode(struct inode *);
-static int nfs_write_inode(struct inode *,int);
 static void nfs_delete_inode(struct inode *);
 static void nfs_clear_inode(struct inode *);
 static void nfs_umount_begin(struct super_block *);
@@ -74,7 +73,6 @@
 static struct super_operations nfs_sops = { 
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
 	.delete_inode	= nfs_delete_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs_clear_inode,
@@ -187,18 +185,6 @@
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
-static int
-nfs_write_inode(struct inode *inode, int sync)
-{
-	int flags = sync ? FLUSH_WAIT : 0;
-	int ret;
-
-	ret = nfs_commit_inode(inode, flags);
-	if (ret < 0)
-		return ret;
-	return 0;
-}
-
 static void
 nfs_delete_inode(struct inode * inode)
 {
@@ -1904,7 +1890,6 @@
 static struct super_operations nfs4_sops = { 
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
 	.delete_inode	= nfs_delete_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs4_clear_inode,
diff -ur -x '*~' 00-trond/fs/nfs/pagelist.c 01-nfs-writeback/fs/nfs/pagelist.c
--- 00-trond/fs/nfs/pagelist.c	2005-06-01 18:30:38.000000000 -0400
+++ 01-nfs-writeback/fs/nfs/pagelist.c	2005-06-01 18:31:00.000000000 -0400
@@ -282,7 +282,8 @@
  */
 int
 nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
-	      unsigned long idx_start, unsigned int npages)
+		    unsigned long idx_start, unsigned int npages,
+		    unsigned int max)
 {
 	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
 	struct nfs_page *req;
@@ -317,6 +318,8 @@
 				res++;
 			}
 		}
+		if (max > 0 && res >= max)
+			break;
 	}
 out:
 	return res;
diff -ur -x '*~' 00-trond/fs/nfs/write.c 01-nfs-writeback/fs/nfs/write.c
--- 00-trond/fs/nfs/write.c	2005-06-01 18:30:38.000000000 -0400
+++ 01-nfs-writeback/fs/nfs/write.c	2005-06-01 18:31:00.000000000 -0400
@@ -70,6 +70,9 @@
 #define MIN_POOL_WRITE		(32)
 #define MIN_POOL_COMMIT		(4)
 
+#define NFS_WRITE_CLUSTER	((256*1024) >> PAGE_CACHE_SHIFT)
+#define NFS_COMMIT_CLUSTER	((4*1024*1024) >> PAGE_CACHE_SHIFT)
+
 /*
  * Local function declarations
  */
@@ -79,16 +82,60 @@
 					    unsigned int, unsigned int);
 static void nfs_writeback_done_partial(struct nfs_write_data *, int);
 static void nfs_writeback_done_full(struct nfs_write_data *, int);
-static int nfs_wait_on_write_congestion(struct address_space *, int);
+static int nfs_wait_on_write_congestion(struct inode *, int);
 static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int);
 static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
-			   unsigned int npages, int how);
+			   unsigned int npages, int how, unsigned int max);
+#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int nfs_commit_inode(struct inode *inode, int how);
+static int nfs_commit_check(struct inode *inode, int how, unsigned int thresh);
+#else
+static inline int nfs_commit_inode(struct inode *inode, int how)
+{
+	return 0;
+}
+
+static inline int nfs_commit_check(struct inode *inode, int how, unsigned int thresh)
+{
+	return 0;
+}
+#endif
 
 static kmem_cache_t *nfs_wdata_cachep;
 mempool_t *nfs_wdata_mempool;
 static mempool_t *nfs_commit_mempool;
 
 static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
+static unsigned int nfs_write_throttle;
+
+static inline unsigned int min_wb(struct inode *inode)
+{
+	return max((unsigned int)NFS_WRITE_CLUSTER, NFS_SERVER(inode)->wpages);
+}
+
+static inline int is_congested(struct nfs_inode *nfsi)
+{
+	return (nfsi->npages - nfsi->ndirty - nfsi->ncommit) >= nfs_write_throttle;
+}
+
+static inline int mod_count(struct nfs_inode *nfsi, unsigned int *count, int x)
+{
+	int congested = is_congested(nfsi);
+
+	*count += x;
+	return congested && !is_congested(nfsi);
+}
+
+static int nfs_congested(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int congested;
+
+	spin_lock(&nfsi->req_lock);
+	congested = is_congested(nfsi);
+	spin_unlock(&nfsi->req_lock);
+	return congested;
+}
 
 static inline struct nfs_write_data *nfs_commit_alloc(void)
 {
@@ -269,6 +316,7 @@
 	loff_t i_size = i_size_read(inode);
 	int inode_referenced = 0;
 	int priority = wb_priority(wbc);
+	int is_async;
 	int err;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
@@ -284,6 +332,24 @@
 	if (igrab(inode) != 0)
 		inode_referenced = 1;
 	end_index = i_size >> PAGE_CACHE_SHIFT;
+	is_async = !IS_SYNC(inode) && inode_referenced;
+
+	/*
+	 * If we were called due to memory pressure, avoid
+	 * further pressure if there are many writebacks outstanding.
+	 */
+	if (wbc->for_reclaim && is_async) {
+		if (nfs_congested(inode)) {
+			if (wbc->nonblocking) {
+				err = 0;
+				redirty_page_for_writepage(wbc, page);
+				wbc->encountered_congestion = 1;
+				goto out;
+			}
+			nfs_wait_on_write_congestion(inode, 0);
+		}
+		nfs_commit_check(inode, priority, NFS_WRITE_CLUSTER);
+	}
 
 	/* Ensure we've flushed out any previous writes */
 	nfs_wb_page_priority(inode, page, priority);
@@ -305,12 +371,12 @@
 		goto out;
 	}
 	lock_kernel();
-	if (!IS_SYNC(inode) && inode_referenced) {
+	if (is_async) {
 		err = nfs_writepage_async(ctx, inode, page, 0, offset);
 		if (err >= 0) {
 			err = 0;
-			if (wbc->for_reclaim)
-				nfs_flush_inode(inode, 0, 0, FLUSH_STABLE);
+			if (wbc->for_reclaim && NFS_I(inode)->ndirty >= NFS_WRITE_CLUSTER)
+				nfs_flush_inode(inode, 0, 0, priority, min_wb(inode));
 		}
 	} else {
 		err = nfs_writepage_sync(ctx, inode, page, 0,
@@ -331,43 +397,77 @@
 }
 
 /*
- * Note: causes nfs_update_request() to block on the assumption
- * 	 that the writeback is generated due to memory pressure.
+ * Note: We make the assumption that pdflush()'s due to dirty
+ * thresholds are asynchronous so we can build up large commit lists
+ * during synchronous writes and still do the right thing when memory
+ * needs to be reclaimed.
  */
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct inode *inode = mapping->host;
-	int err;
+	long nr = wbc->nr_to_write;
+	int priority = wb_priority(wbc);
+	int nr_commit = 0;
+	int err, is_sync;
 
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
 
-	err = generic_writepages(mapping, wbc);
-	if (err)
-		return err;
-	while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
-		if (wbc->nonblocking)
-			return 0;
-		nfs_wait_on_write_congestion(mapping, 0);
+	is_sync = !wbc->nonblocking && wbc->sync_mode == WB_SYNC_ALL;
+
+	for (;;) {
+		if (nfs_congested(inode)) {
+			if (wbc->nonblocking) {
+				wbc->encountered_congestion = 1;
+				break;
+			}
+			nfs_wait_on_write_congestion(inode, 0);
+		}
+		if (!is_sync) {
+			err = nfs_commit_check(inode, priority, 0);
+			if (err < 0)
+				return err;
+			if (err > 0) {
+				nr_commit++;
+				nr -= err;
+				if (nr <= 0)
+					break;
+				continue;
+			}
+		}
+		err = nfs_flush_inode(inode, 0, 0, priority, min_wb(inode));
+		if (err < 0)
+			return err;
+		if (err > 0) {
+			nr -= err;
+			if (nr <= 0)
+				break;
+			continue;
+		}
+		if (wbc->nr_to_write > 0) {
+			long n = wbc->nr_to_write;
+
+			err = generic_writepages(mapping, wbc);
+			if (err)
+				return err;
+			if (n - wbc->nr_to_write > 0)
+				continue;
+		}
+		break;
 	}
-	err = nfs_flush_inode(inode, 0, 0, wb_priority(wbc));
-	if (err < 0)
-		goto out;
-	wbc->nr_to_write -= err;
-	if (!wbc->nonblocking && wbc->sync_mode == WB_SYNC_ALL) {
+	if (is_sync) {
 		err = nfs_wait_on_requests(inode, 0, 0);
 		if (err < 0)
-			goto out;
+			return err;
+		priority |= FLUSH_SYNC;
 	}
-	err = nfs_commit_inode(inode, wb_priority(wbc));
-	if (err > 0) {
-		wbc->nr_to_write -= err;
-		err = 0;
+	if (is_sync || (wbc->for_kupdate && !nr_commit)) {
+		err = nfs_commit_inode(inode, priority);
+		if (err < 0)
+			return err;
+		nr -= err;
 	}
-out:
-	clear_bit(BDI_write_congested, &bdi->state);
-	wake_up_all(&nfs_write_congestion);
-	return err;
+	wbc->nr_to_write = nr;
+	return 0;
 }
 
 /*
@@ -400,12 +500,13 @@
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int need_wakeup;
 
 	BUG_ON (!NFS_WBACK_BUSY(req));
 
 	spin_lock(&nfsi->req_lock);
 	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
-	nfsi->npages--;
+	need_wakeup = mod_count(nfsi, &nfsi->npages, -1);
 	if (!nfsi->npages) {
 		spin_unlock(&nfsi->req_lock);
 		nfs_end_data_update(inode);
@@ -414,6 +515,8 @@
 		spin_unlock(&nfsi->req_lock);
 	nfs_clear_request(req);
 	nfs_release_request(req);
+	if (need_wakeup)
+		wake_up_all(&nfs_write_congestion);
 }
 
 /*
@@ -451,15 +554,18 @@
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int need_wakeup;
 
 	spin_lock(&nfsi->req_lock);
 	radix_tree_tag_set(&nfsi->nfs_page_tree,
 			req->wb_index, NFS_PAGE_TAG_DIRTY);
 	nfs_list_add_request(req, &nfsi->dirty);
-	nfsi->ndirty++;
+	need_wakeup = mod_count(nfsi, &nfsi->ndirty, 1);
 	spin_unlock(&nfsi->req_lock);
 	inc_page_state(nr_dirty);
 	mark_inode_dirty(inode);
+	if (need_wakeup)
+		wake_up_all(&nfs_write_congestion);
 }
 
 /*
@@ -481,13 +587,16 @@
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int need_wakeup;
 
 	spin_lock(&nfsi->req_lock);
 	nfs_list_add_request(req, &nfsi->commit);
-	nfsi->ncommit++;
+	need_wakeup = mod_count(nfsi, &nfsi->ncommit, 1);
 	spin_unlock(&nfsi->req_lock);
 	inc_page_state(nr_unstable);
 	mark_inode_dirty(inode);
+	if (need_wakeup)
+		wake_up_all(&nfs_write_congestion);
 }
 #endif
 
@@ -543,13 +652,15 @@
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+nfs_scan_dirty(struct inode *inode, struct list_head *dst,
+	       unsigned long idx_start, unsigned int npages,
+	       unsigned int max)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int res = 0;
 
 	if (nfsi->ndirty != 0) {
-		res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages);
+		res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages, max);
 		nfsi->ndirty -= res;
 		sub_page_state(nr_dirty,res);
 		if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
@@ -585,35 +696,38 @@
 }
 #endif
 
-static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
+static int nfs_wait_on_write_congestion(struct inode *inode, int intr)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	DEFINE_WAIT(wait);
 	int ret = 0;
+	int done = 0;
 
 	might_sleep();
 
-	if (!bdi_write_congested(bdi))
-		return 0;
-	if (intr) {
-		struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
-		sigset_t oldset;
-
-		rpc_clnt_sigmask(clnt, &oldset);
-		prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
-		if (bdi_write_congested(bdi)) {
-			if (signalled())
-				ret = -ERESTARTSYS;
-			else
+	do {
+		if (intr) {
+			struct rpc_clnt *clnt = NFS_CLIENT(inode);
+			sigset_t oldset;
+
+			rpc_clnt_sigmask(clnt, &oldset);
+			prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
+			if (nfs_congested(inode)) {
+				if (signalled())
+					ret = -ERESTARTSYS;
+				else
+					schedule();
+			} else
+				done = 1;
+			rpc_clnt_sigunmask(clnt, &oldset);
+		} else {
+			prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
+			if (nfs_congested(inode))
 				schedule();
+			else
+				done = 1;
 		}
-		rpc_clnt_sigunmask(clnt, &oldset);
-	} else {
-		prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
-		if (bdi_write_congested(bdi))
-			schedule();
-	}
-	finish_wait(&nfs_write_congestion, &wait);
+		finish_wait(&nfs_write_congestion, &wait);
+	} while (ret == 0 && !done);
 	return ret;
 }
 
@@ -629,15 +743,12 @@
 		struct inode *inode, struct page *page,
 		unsigned int offset, unsigned int bytes)
 {
-	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page		*req, *new = NULL;
 	unsigned long		rqend, end;
 
 	end = offset + bytes;
 
-	if (nfs_wait_on_write_congestion(page->mapping, server->flags & NFS_MOUNT_INTR))
-		return ERR_PTR(-ERESTARTSYS);
 	for (;;) {
 		/* Loop over all inode entries and see if we find
 		 * A request for the page we wish to update
@@ -1228,24 +1339,36 @@
 		struct nfs_write_data *data, int how)
 {
 	struct rpc_task		*task = &data->task;
-	struct nfs_page		*first;
+	struct nfs_page		*first, *last;
 	struct inode		*inode;
+	loff_t			start, end, len;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 
 	list_splice_init(head, &data->pages);
 	first = nfs_list_entry(data->pages.next);
+	last = nfs_list_entry(data->pages.prev);
 	inode = first->wb_context->dentry->d_inode;
 
+	/*
+	 * Determine the offset range of requests in the COMMIT call.
+	 * We rely on the fact that data->pages is an ordered list...
+	 */
+	start = req_offset(first);
+	end = req_offset(last) + last->wb_bytes;
+	len = end - start;
+	/* If 'len' is not a 32-bit quantity, pass '0' in the COMMIT call */
+	if (end >= i_size_read(inode) || len < 0 || len > (~((u32)0) >> 1))
+		len = 0;
+
 	data->inode	  = inode;
 	data->cred	  = first->wb_context->cred;
 
 	data->args.fh     = NFS_FH(data->inode);
-	/* Note: we always request a commit of the entire inode */
-	data->args.offset = 0;
-	data->args.count  = 0;
-	data->res.count   = 0;
+	data->args.offset = start;
+	data->args.count  = len;
+	data->res.count   = len;
 	data->res.fattr   = &data->fattr;
 	data->res.verf    = &data->verf;
 	
@@ -1338,7 +1461,7 @@
 #endif
 
 static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
-			   unsigned int npages, int how)
+			   unsigned int npages, int how, unsigned int max)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	LIST_HEAD(head);
@@ -1346,7 +1469,7 @@
 				error = 0;
 
 	spin_lock(&nfsi->req_lock);
-	res = nfs_scan_dirty(inode, &head, idx_start, npages);
+	res = nfs_scan_dirty(inode, &head, idx_start, npages, max);
 	spin_unlock(&nfsi->req_lock);
 	if (res) {
 		struct nfs_server *server = NFS_SERVER(inode);
@@ -1364,14 +1487,13 @@
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-int nfs_commit_inode(struct inode *inode, int how)
+static int nfs_commit_inode_locked(struct inode *inode, int how)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	LIST_HEAD(head);
 	int			res,
 				error = 0;
 
-	spin_lock(&nfsi->req_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&nfsi->req_lock);
 	if (res) {
@@ -1381,33 +1503,59 @@
 	}
 	return res;
 }
+
+static int nfs_commit_inode(struct inode *inode, int how)
+{
+	spin_lock(&NFS_I(inode)->req_lock);
+	return nfs_commit_inode_locked(inode, how);
+}
+
+static int nfs_commit_check(struct inode *inode, int how, unsigned int thresh)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	spin_lock(&nfsi->req_lock);
+	if (nfsi->npages > 0) {
+		if (thresh == 0)
+			thresh = NFS_COMMIT_CLUSTER;
+		if (thresh > nfsi->npages)
+			thresh = nfsi->npages;
+		if (nfsi->ncommit >= thresh)
+			return nfs_commit_inode_locked(inode, how);
+	}
+	spin_unlock(&nfsi->req_lock);
+	return 0;
+}
 #endif
 
 int nfs_sync_inode(struct inode *inode, unsigned long idx_start,
 		  unsigned int npages, int how)
 {
-	int	error,
+	int	error = 0,
 		wait;
 
 	wait = how & FLUSH_WAIT;
 	how &= ~FLUSH_WAIT;
 
 	do {
-		error = 0;
-		if (wait)
+		if (nfs_congested(inode)) {
+			if (!wait)
+				break;
+			nfs_wait_on_write_congestion(inode, 0);
+		}
+		error = nfs_flush_inode(inode, idx_start, npages, how, min_wb(inode));
+		if (error == 0 && wait)
 			error = nfs_wait_on_requests(inode, idx_start, npages);
 		if (error == 0)
-			error = nfs_flush_inode(inode, idx_start, npages, how);
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-		if (error == 0)
 			error = nfs_commit_inode(inode, how);
-#endif
 	} while (error > 0);
 	return error;
 }
 
 int nfs_init_writepagecache(void)
 {
+	struct sysinfo si;
+
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
 					     sizeof(struct nfs_write_data),
 					     0, SLAB_HWCACHE_ALIGN,
@@ -1429,6 +1577,13 @@
 	if (nfs_commit_mempool == NULL)
 		return -ENOMEM;
 
+	si_meminfo(&si);
+	nfs_write_throttle = ((si.totalram / 100) * si.mem_unit) >> PAGE_CACHE_SHIFT;
+	if (nfs_write_throttle < NFS_WRITE_CLUSTER)
+		nfs_write_throttle = NFS_WRITE_CLUSTER;
+	if (nfs_write_throttle > NFS_COMMIT_CLUSTER)
+		nfs_write_throttle = NFS_COMMIT_CLUSTER;
+
 	return 0;
 }
 
diff -ur -x '*~' 00-trond/include/linux/nfs_fs.h 01-nfs-writeback/include/linux/nfs_fs.h
--- 00-trond/include/linux/nfs_fs.h	2005-06-01 18:30:38.000000000 -0400
+++ 01-nfs-writeback/include/linux/nfs_fs.h	2005-06-01 18:31:00.000000000 -0400
@@ -408,15 +408,6 @@
  * return value!)
  */
 extern int  nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-extern int  nfs_commit_inode(struct inode *, int);
-#else
-static inline int
-nfs_commit_inode(struct inode *inode, int how)
-{
-	return 0;
-}
-#endif
 
 static inline int
 nfs_have_writebacks(struct inode *inode)
diff -ur -x '*~' 00-trond/include/linux/nfs_page.h 01-nfs-writeback/include/linux/nfs_page.h
--- 00-trond/include/linux/nfs_page.h	2005-06-01 18:30:38.000000000 -0400
+++ 01-nfs-writeback/include/linux/nfs_page.h	2005-06-01 18:31:00.000000000 -0400
@@ -61,7 +61,7 @@
 
 
 extern  int nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
-				unsigned long idx_start, unsigned int npages);
+				unsigned long idx_start, unsigned int npages, unsigned int max);
 extern	int nfs_scan_list(struct list_head *, struct list_head *,
 			  unsigned long, unsigned int);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Smooth out NFS client writeback
  2005-06-02  1:38 [PATCH] Smooth out NFS client writeback Shantanu Goel
@ 2005-06-02  1:56 ` Trond Myklebust
  2005-06-02  3:26   ` Shantanu Goel
  0 siblings, 1 reply; 13+ messages in thread
From: Trond Myklebust @ 2005-06-02  1:56 UTC (permalink / raw)
  To: Shantanu Goel; +Cc: nfs

on den 01.06.2005 Klokka 18:38 (-0700) skreiv Shantanu Goel:
> Hi Trond,
> 
> The current NFS client can cause a program to stall
> for long periods of time because it flushes all dirty
> pages at once.  The attached patch addresses this by
> only writing back the amount requested by the VM
> layer.  It also reduces the # commit requests by
> waiting for some writeback to complete before issuing
> a commit.  The patch also speeds up writebacks of
> mmap'ed data by accumulating dirty pages but sending
> commits earlier and omitting FLUSH_STABLE.

Hi Shantanu,

Do you have any figures on this subject? I'd very much like to see how
this changes the figures for both the throughput and the latency. We
should look at both slow and fast networks (say 10Mbit, 100Mbit &
1GigE).

My other question is how it affects stability in the case of low memory
situations?

Cheers,
  Trond



-------------------------------------------------------
This SF.Net email is sponsored by Yahoo.
Introducing Yahoo! Search Developer Network - Create apps using Yahoo!
Search APIs Find out how you can build Yahoo! directly into your own
Applications - visit http://developer.yahoo.net/?fr=offad-ysdn-ostg-q22005
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Smooth out NFS client writeback
  2005-06-02  1:56 ` Trond Myklebust
@ 2005-06-02  3:26   ` Shantanu Goel
  2005-06-02  4:38     ` Trond Myklebust
  0 siblings, 1 reply; 13+ messages in thread
From: Shantanu Goel @ 2005-06-02  3:26 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: nfs

[-- Attachment #1: Type: text/plain, Size: 2676 bytes --]

Here are some numbers with iozone (avg of 3 runs). 
The command was run as: iozone -i0 -r4k -s64m -c -t4
I manually booted the client machine with mem=256M.
It has 2 Xeon CPU's with hyperthreading enabled.  The
server is running 2.6.11.  The machines are on 100mb
ethernet.  The filesystem is mounted with
tcp,rsize=32768,wsize=32768.

                 Stock       w/Patch
parent write     7227        6155
parent rewrite   7068        6973
children write   7826       11659
children rewrite 7083        6974

I also ran iozone in mmap mode with same options as
above but specifying -B as well.  The machine hung
with the stock client so could not complete the test. 
Here are the numbers with the patched client.

parent write     6350
parent rewrite   4167
children write   9375
children rewrite 4792

I also ran a latency test using the script I'm
attaching to this email.  The script was invoked as:  
 write-test.pl 256m
It writes a 256MB and prints the time elapsed for each
5% increment.

Stock:

 5(0s)
10(0s)
15(0s)
20(0s)
25(0s)
30(7s)
35(3s)
40(1s)
45(0s)
50(0s)
55(0s)
60(7s)
65(3s)
70(0s)
75(0s)
80(1s)
85(0s)
90(7s)
95(3s)
100(0s)
sync(4s)

Throughput: 7084 KB/s (6 MB/s)

w/Patch:

 5(0s)
10(0s)
15(0s)
20(0s)
25(0s)
30(0s)
35(0s)
40(0s)
45(2s)
50(1s)
55(3s)
60(2s)
65(2s)
70(2s)
75(1s)
80(2s)
85(2s)
90(2s)
95(1s)
100(2s)
sync(11s)

Throughput: 7710 KB/s (7 MB/s)

--- Trond Myklebust <trond.myklebust@fys.uio.no>
wrote:

> on den 01.06.2005 Klokka 18:38 (-0700) skreiv
> Shantanu Goel:
> > Hi Trond,
> > 
> > The current NFS client can cause a program to
> stall
> > for long periods of time because it flushes all
> dirty
> > pages at once.  The attached patch addresses this
> by
> > only writing back the amount requested by the VM
> > layer.  It also reduces the # commit requests by
> > waiting for some writeback to complete before
> issuing
> > a commit.  The patch also speeds up writebacks of
> > mmap'ed data by accumulating dirty pages but
> sending
> > commits earlier and omitting FLUSH_STABLE.
> 
> Hi Shantanu,
> 
> Do you have any figures on this subject? I'd very
> much like to see how
> this changes the figures for both the throughput and
> the latency. We
> should look at both slow and fast networks (say
> 10Mbit, 100Mbit &
> 1GigE).
> 
> My other question is how it affects stability in the
> case of low memory
> situations?
> 
> Cheers,
>   Trond
> 
> 

__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 

[-- Attachment #2: 3263050112-write-test.pl --]
[-- Type: application/x-perl, Size: 1687 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Smooth out NFS client writeback
  2005-06-02  3:26   ` Shantanu Goel
@ 2005-06-02  4:38     ` Trond Myklebust
  2005-06-02 12:16       ` Shantanu Goel
  0 siblings, 1 reply; 13+ messages in thread
From: Trond Myklebust @ 2005-06-02  4:38 UTC (permalink / raw)
  To: Shantanu Goel; +Cc: nfs

on den 01.06.2005 Klokka 20:26 (-0700) skreiv Shantanu Goel:
> Here are some numbers with iozone (avg of 3 runs). 
> The command was run as: iozone -i0 -r4k -s64m -c -t4
> I manually booted the client machine with mem=256M.

What happens when you increase the file size to significantly beyond the
memory size on a slow network. That is the interesting case.
Try, for instance, booting with mem=32m and iozone -s 

> I also ran iozone in mmap mode with same options as
> above but specifying -B as well.  The machine hung
> with the stock client so could not complete the test. 
> Here are the numbers with the patched client.

Where was the machine hanging?

Cheers,
  Trond

BTW: how did you determine the values for NFS_WRITE_CLUSTER and
NFS_COMMIT_CLUSTER. they appear to be completely arbitrary AFAICS.

Also, have you compared to the latest NFS_ALL kernels? They contain a
bunch of extra latency fixes that came from Ingo's RT work.

Finally, please explain _why_ have you removed the FLUSH_STABLE from
nfs_writepage()? The reason for it in the existing code is to avoid the
extra COMMIT call in situations where we know we are already very low on
memory. I don't see anything new in your patches that avoids these low
memory situations.

-------------------------------------------------------
This SF.Net email is sponsored by Yahoo.
Introducing Yahoo! Search Developer Network - Create apps using Yahoo!
Search APIs Find out how you can build Yahoo! directly into your own
Applications - visit http://developer.yahoo.net/?fr=offad-ysdn-ostg-q22005
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Smooth out NFS client writeback
  2005-06-02  4:38     ` Trond Myklebust
@ 2005-06-02 12:16       ` Shantanu Goel
  2005-06-07  4:12         ` Trond Myklebust
  0 siblings, 1 reply; 13+ messages in thread
From: Shantanu Goel @ 2005-06-02 12:16 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: nfs


--- Trond Myklebust <trond.myklebust@fys.uio.no>
wrote:
> What happens when you increase the file size to
> significantly beyond the
> memory size on a slow network. That is the
> interesting case.
> Try, for instance, booting with mem=3D32m and iozone
> -s=20
>=20

I'll run this case and report back.

> > I also ran iozone in mmap mode with same options
> as
> > above but specifying -B as well.  The machine hung
> > with the stock client so could not complete the
> test.=20
> > Here are the numbers with the patched client.
>=20
> Where was the machine hanging?
>=20

I didn't have console access due to X.  I'll try and
get a sysrq-t when I get a chance.

> BTW: how did you determine the values for
> NFS_WRITE_CLUSTER and
> NFS_COMMIT_CLUSTER. they appear to be completely
> arbitrary AFAICS.
>=20

NFS_WRITE_CLUSTER is a variant of SWAP_CLUSTER_MAX
used by kswapd.  NFS_COMMIT_CLUSTER was chosen to
match balance_dirty_pages() which uses 4MB for
throttling a writer.

> Also, have you compared to the latest NFS_ALL
> kernels? They contain a
> bunch of extra latency fixes that came from Ingo's
> RT work.
>=20

This patch is on top of 2.6.12-rc5 with NFS_ALL for
2.6.12-rc4.  The stock numbers I reported are with
NFS_ALL.  Is there a newer version somewhere?

> Finally, please explain _why_ have you removed the
> FLUSH_STABLE from
> nfs_writepage()? The reason for it in the existing
> code is to avoid the
> extra COMMIT call in situations where we know we are
> already very low on
> memory. I don't see anything new in your patches
> that avoids these low
> memory situations.

The current FLUSH_STABLE behaviour absolutely shoots
our performance on iozone -B compared to Solaris.
The low memory situation is avoided because of the
congestion check I added in writepage().  If the inode
is congested, it redirty's the page and returns.  It
also issues a commit request if there are at least
NFS_WRITE_CLUSTER pages.



=09
	=09
__________________________________=20
Do you Yahoo!?=20
Yahoo! Mail - You care about security. So do we.=20
http://promotions.yahoo.com/new_mail


-------------------------------------------------------
SF.Net email is sponsored by: Discover Easy Linux Migration Strategies
from IBM. Find simple to follow Roadmaps, straightforward articles,
informative Webcasts and more! Get everything you need to get up to
speed, fast. http://ads.osdn.com/?ad_id=3D7477&alloc_id=3D16492&op=3Dclic=
k
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Smooth out NFS client writeback
  2005-06-02 12:16       ` Shantanu Goel
@ 2005-06-07  4:12         ` Trond Myklebust
  2005-06-28 22:43           ` Shantanu Goel
  0 siblings, 1 reply; 13+ messages in thread
From: Trond Myklebust @ 2005-06-07  4:12 UTC (permalink / raw)
  To: Shantanu Goel; +Cc: nfs

to den 02.06.2005 Klokka 05:16 (-0700) skreiv Shantanu Goel:

> The low memory situation is avoided because of the
> congestion check I added in writepage().  If the inode
> is congested, it redirty's the page and returns.  It
> also issues a commit request if there are at least
> NFS_WRITE_CLUSTER pages.

This sounds very much like what we were doing in the early 2.6 series
when we were returning WRITEPAGE_ACTIVATE in order to achieve the same
effect. We ended up reverting those patches after learning that
systematically redirtying pages in low-memory situations may lead to
some _very_ nasty deadlocks.
The above have "works" for ramdisks because they have hard limits on the
amount of physical memory they can use in the form of the ramdisk size.
NFS OTOH really shouldn't do it, since it has no such limits, and can
basically end up eating all of physical memory.

Have you BTW tested the performance of the other changes without this
part of the patch? I assume the numbers you presented last week were for
the combined changes.

Cheers,
  Trond

-------------------------------------------------------
This SF.Net email is sponsored by: NEC IT Guy Games.  How far can you shotput
a projector? How fast can you ride your desk chair down the office luge track?
If you want to score the big prize, get to know the little guy.  
Play to win an NEC 61" plasma display: http://www.necitguy.com/?r=20
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] Smooth out NFS client writeback
  2005-06-07  4:12         ` Trond Myklebust
@ 2005-06-28 22:43           ` Shantanu Goel
  2005-06-29 14:11             ` Peter Staubach
  0 siblings, 1 reply; 13+ messages in thread
From: Shantanu Goel @ 2005-06-28 22:43 UTC (permalink / raw)
  To: Trond Myklebust; +Cc: nfs

[-- Attachment #1: Type: text/plain, Size: 1174 bytes --]

Hi Trond,

Attached is the long delayed revised version of the
writeback smoothing patch this time against
2.6.12-mm2.  I have omitted the commit w/range and
mmap writeback from this one.  If this one is deemed
acceptable for inclusion I'll post the other 2 later. 
The commit w/range really should be restored as it
makes quite a difference against Solaris NFS servers
with regular disks.  I observed a difference of 2-3
MB/s under sustained writes.  It makes no difference
with the Linux NFS server since it ignores the range.

I tested this version by reducing memory to 32M but
iozone OOM'ed.  However, I observed the same behaviour
with the unpatched client.  If I reduce dirty_ratio to
20 from 40 (the default), iozone completes without a
problem on both versions.  I noticed the slab cache
gets as big as the page cache and the VM fails to take
that in account.  In summary, the behaviour is at
least as good as that of the stock client in this
case.

Thanks,
Shantanu


		
____________________________________________________ 
Yahoo! Sports 
Rekindle the Rivalries. Sign up for Fantasy Football 
http://football.fantasysports.yahoo.com

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 2449169141-nfs-writeback.patch --]
[-- Type: text/x-patch; name="nfs-writeback.patch", Size: 19774 bytes --]

--- .orig/fs/nfs/inode.c	2005-06-28 18:24:45.000000000 -0400
+++ nfs-writeback/fs/nfs/inode.c	2005-06-26 19:38:16.000000000 -0400
@@ -58,7 +58,6 @@
 
 static struct inode *nfs_alloc_inode(struct super_block *sb);
 static void nfs_destroy_inode(struct inode *);
-static int nfs_write_inode(struct inode *,int);
 static void nfs_delete_inode(struct inode *);
 static void nfs_clear_inode(struct inode *);
 static void nfs_umount_begin(struct super_block *);
@@ -71,7 +70,6 @@
 static struct super_operations nfs_sops = { 
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
 	.delete_inode	= nfs_delete_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs_clear_inode,
@@ -129,18 +127,6 @@
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
-static int
-nfs_write_inode(struct inode *inode, int sync)
-{
-	int flags = sync ? FLUSH_WAIT : 0;
-	int ret;
-
-	ret = nfs_commit_inode(inode, flags);
-	if (ret < 0)
-		return ret;
-	return 0;
-}
-
 static void
 nfs_delete_inode(struct inode * inode)
 {
@@ -1565,7 +1551,6 @@
 static struct super_operations nfs4_sops = { 
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
 	.delete_inode	= nfs_delete_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs4_clear_inode,
--- .orig/fs/nfs/pagelist.c	2005-06-28 18:24:45.000000000 -0400
+++ nfs-writeback/fs/nfs/pagelist.c	2005-06-27 06:15:59.000000000 -0400
@@ -126,19 +126,6 @@
 }
 
 /**
- * nfs_clear_page_writeback - Unlock request and wake up sleepers
- */
-void nfs_clear_page_writeback(struct nfs_page *req)
-{
-	struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode);
-
-	spin_lock(&nfsi->req_lock);
-	radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK);
-	spin_unlock(&nfsi->req_lock);
-	nfs_unlock_request(req);
-}
-
-/**
  * nfs_clear_request - Free up all resources allocated to the request
  * @req:
  *
@@ -277,7 +264,7 @@
  */
 int
 nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
-	      unsigned long idx_start, unsigned int npages)
+	      unsigned long idx_start, unsigned int npages, unsigned int max)
 {
 	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
 	struct nfs_page *req;
@@ -310,6 +297,8 @@
 				nfs_list_remove_request(req);
 				nfs_list_add_request(req, dst);
 				res++;
+				if (max && res >= max)
+					goto out;
 			}
 		}
 	}
--- .orig/fs/nfs/write.c	2005-06-28 18:24:45.000000000 -0400
+++ nfs-writeback/fs/nfs/write.c	2005-06-28 18:02:47.000000000 -0400
@@ -78,16 +78,74 @@
 					    unsigned int, unsigned int);
 static void nfs_writeback_done_partial(struct nfs_write_data *, int);
 static void nfs_writeback_done_full(struct nfs_write_data *, int);
-static int nfs_wait_on_write_congestion(struct address_space *, int);
 static int nfs_wait_on_requests(struct inode *, unsigned long, unsigned int);
 static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
-			   unsigned int npages, int how);
+			   unsigned int npages, int how, unsigned int max);
+static int __nfs_flush_inode(struct inode *inode, unsigned long idx_start,
+			     unsigned int npages, int how, unsigned int max);
+#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int nfs_commit_inode(struct inode *, int);
+static int __nfs_commit_inode(struct inode *, int);
+#else
+static inline int
+nfs_commit_inode(struct inode *inode, int how)
+{
+	return 0;
+}
+
+static inline int
+__nfs_commit_inode(struct inode *inode, int how)
+{
+	return 0;
+}
+#endif
 
 static kmem_cache_t *nfs_wdata_cachep;
 mempool_t *nfs_wdata_mempool;
 static mempool_t *nfs_commit_mempool;
 
 static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
+static unsigned int nfs_write_throttle;
+
+static inline int nfs_should_commit(struct nfs_inode *nfsi, long nr)
+{
+	return nfsi->ncommit >= nr || (nfsi->ncommit > 0 && nfsi->ncommit == nfsi->npages);
+}
+
+static inline unsigned int nfs_wbpages(struct nfs_inode *nfsi)
+{
+	return nfsi->npages - nfsi->ndirty - nfsi->ncommit;
+}
+
+static inline unsigned int nfs_write_chunk(struct inode *inode, long todo)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned int wpages = NFS_SERVER(inode)->wpages;
+	unsigned int avail = nfs_write_throttle - nfs_wbpages(nfsi);
+
+	if (todo > avail)
+		todo = avail;
+	if (todo < wpages)
+		return wpages;
+	return ((todo + wpages - 1) / wpages) * wpages;
+}
+
+static void nfs_wait_on_write_congestion(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	DEFINE_WAIT(wait);
+
+	might_sleep();
+
+	prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
+	spin_lock(&nfsi->req_lock);
+	if (nfs_wbpages(nfsi) > 0) {
+		spin_unlock(&nfsi->req_lock);
+		io_schedule();
+	} else
+		spin_unlock(&nfsi->req_lock);
+	finish_wait(&nfs_write_congestion, &wait);
+}
 
 static inline struct nfs_write_data *nfs_commit_alloc(void)
 {
@@ -305,7 +363,7 @@
 		if (err >= 0) {
 			err = 0;
 			if (wbc->for_reclaim)
-				nfs_flush_inode(inode, 0, 0, FLUSH_STABLE);
+				nfs_flush_inode(inode, 0, 0, FLUSH_STABLE, 0);
 		}
 	} else {
 		err = nfs_writepage_sync(ctx, inode, page, 0,
@@ -325,44 +383,123 @@
 	return err; 
 }
 
-/*
- * Note: causes nfs_update_request() to block on the assumption
- * 	 that the writeback is generated due to memory pressure.
- */
+static inline int nfs_congested(struct nfs_inode *nfsi, long todo)
+{
+	unsigned int wbpages = nfs_wbpages(nfsi);
+
+	if (wbpages >= nfs_write_throttle)
+		return 1;
+	if (nfsi->ncommit > 0 && nfsi->ncommit + wbpages >= todo)
+		return 1;
+	return 0;
+}
+
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	struct inode *inode = mapping->host;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	long nr = wbc->nr_to_write;
+	long commit_thresh = nr;
+	int how = wb_priority(wbc);
 	int err;
 
-	err = generic_writepages(mapping, wbc);
-	if (err)
-		return err;
-	while (test_and_set_bit(BDI_write_congested, &bdi->state) != 0) {
-		if (wbc->nonblocking)
-			return 0;
-		nfs_wait_on_write_congestion(mapping, 0);
+	for (;;) {
+		spin_lock(&nfsi->req_lock);
+		if (nfs_should_commit(nfsi, commit_thresh)) {
+			err = __nfs_commit_inode(inode, how);
+		} else if (!nfs_congested(nfsi, commit_thresh)) {
+			long n = nr;
+
+			if (nfsi->ncommit > 0)
+				n -= nfsi->ncommit + nfs_wbpages(nfsi);
+			err = __nfs_flush_inode(inode, 0, 0, how, nfs_write_chunk(inode, n));
+		} else {
+			spin_unlock(&nfsi->req_lock);
+			if (wbc->nonblocking) {
+				wbc->encountered_congestion = 1;
+				break;
+			}
+			nfs_wait_on_write_congestion(inode);
+			continue;
+		}
+		if (err > 0) {
+			if ((nr -= err) <= 0)
+				break;
+			continue;
+		}
+		if (err < 0)
+			return err;
+		if (wbc->nr_to_write > 0) {
+			long n = wbc->nr_to_write;
+
+			err = generic_writepages(mapping, wbc);
+			if (err)
+				return err;
+			if (n - wbc->nr_to_write > 0)
+				continue;
+		}
+		break;
 	}
-	err = nfs_flush_inode(inode, 0, 0, wb_priority(wbc));
-	if (err < 0)
-		goto out;
-	wbc->nr_to_write -= err;
 	if (!wbc->nonblocking && wbc->sync_mode == WB_SYNC_ALL) {
 		err = nfs_wait_on_requests(inode, 0, 0);
+		if (err >= 0)
+			err = nfs_commit_inode(inode, how);
 		if (err < 0)
-			goto out;
+			return err;
+		nr -= err;
 	}
-	err = nfs_commit_inode(inode, wb_priority(wbc));
-	if (err > 0) {
-		wbc->nr_to_write -= err;
-		err = 0;
+	wbc->nr_to_write = nr;
+	return 0;
+}
+
+static int nfs_clear_page_writeback(struct nfs_page *req)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_page *p;
+
+	if (!radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&p,
+					req->wb_index, 1, NFS_PAGE_TAG_WRITEBACK))
+		return 0;
+	if (p != req)
+		return 0;
+	radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK);
+	return 1;
+}
+
+/*
+ * Add a request to the inode's dirty list.
+ */
+static void
+__nfs_mark_request_dirty(struct nfs_page *req)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	int wb;
+
+	wb = nfs_clear_page_writeback(req);
+	radix_tree_tag_set(&nfsi->nfs_page_tree,
+			req->wb_index, NFS_PAGE_TAG_DIRTY);
+	nfs_list_add_request(req, &nfsi->dirty);
+	nfsi->ndirty++;
+	spin_unlock(&nfsi->req_lock);
+	inc_page_state(nr_dirty);
+	mark_inode_dirty(inode);
+	if (wb) {
+		nfs_unlock_request(req);
+		wake_up_all(&nfs_write_congestion);
 	}
-out:
-	clear_bit(BDI_write_congested, &bdi->state);
-	wake_up_all(&nfs_write_congestion);
-	return err;
 }
 
+static void
+nfs_mark_request_dirty(struct nfs_page *req)
+{
+	struct inode *inode = req->wb_context->dentry->d_inode;
+
+	spin_lock(&NFS_I(inode)->req_lock);
+	__nfs_mark_request_dirty(req);
+}
+	
 /*
  * Insert a write request into an inode
  */
@@ -383,6 +520,7 @@
 	}
 	nfsi->npages++;
 	atomic_inc(&req->wb_count);
+	__nfs_mark_request_dirty(req);
 	return 0;
 }
 
@@ -393,10 +531,12 @@
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int wb;
 
 	BUG_ON (!NFS_WBACK_BUSY(req));
 
 	spin_lock(&nfsi->req_lock);
+	wb = nfs_clear_page_writeback(req);
 	radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
 	nfsi->npages--;
 	if (!nfsi->npages) {
@@ -407,6 +547,10 @@
 		spin_unlock(&nfsi->req_lock);
 	nfs_clear_request(req);
 	nfs_release_request(req);
+	if (wb) {
+		nfs_unlock_request(req);
+		wake_up_all(&nfs_write_congestion);
+	}
 }
 
 /*
@@ -437,25 +581,6 @@
 }
 
 /*
- * Add a request to the inode's dirty list.
- */
-static void
-nfs_mark_request_dirty(struct nfs_page *req)
-{
-	struct inode *inode = req->wb_context->dentry->d_inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	spin_lock(&nfsi->req_lock);
-	radix_tree_tag_set(&nfsi->nfs_page_tree,
-			req->wb_index, NFS_PAGE_TAG_DIRTY);
-	nfs_list_add_request(req, &nfsi->dirty);
-	nfsi->ndirty++;
-	spin_unlock(&nfsi->req_lock);
-	inc_page_state(nr_dirty);
-	mark_inode_dirty(inode);
-}
-
-/*
  * Check if a request is dirty
  */
 static inline int
@@ -474,13 +599,21 @@
 {
 	struct inode *inode = req->wb_context->dentry->d_inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
+	int wb;
+
+	BUG_ON(!NFS_WBACK_BUSY(req));
 
 	spin_lock(&nfsi->req_lock);
+	wb = nfs_clear_page_writeback(req);
 	nfs_list_add_request(req, &nfsi->commit);
 	nfsi->ncommit++;
 	spin_unlock(&nfsi->req_lock);
 	inc_page_state(nr_unstable);
 	mark_inode_dirty(inode);
+	if (wb) {
+		nfs_unlock_request(req);
+		wake_up_all(&nfs_write_congestion);
+	}
 }
 #endif
 
@@ -536,13 +669,13 @@
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
+nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages, unsigned int max)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int res = 0;
 
 	if (nfsi->ndirty != 0) {
-		res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages);
+		res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages, max);
 		nfsi->ndirty -= res;
 		sub_page_state(nr_dirty,res);
 		if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
@@ -578,39 +711,6 @@
 }
 #endif
 
-static int nfs_wait_on_write_congestion(struct address_space *mapping, int intr)
-{
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	DEFINE_WAIT(wait);
-	int ret = 0;
-
-	might_sleep();
-
-	if (!bdi_write_congested(bdi))
-		return 0;
-	if (intr) {
-		struct rpc_clnt *clnt = NFS_CLIENT(mapping->host);
-		sigset_t oldset;
-
-		rpc_clnt_sigmask(clnt, &oldset);
-		prepare_to_wait(&nfs_write_congestion, &wait, TASK_INTERRUPTIBLE);
-		if (bdi_write_congested(bdi)) {
-			if (signalled())
-				ret = -ERESTARTSYS;
-			else
-				schedule();
-		}
-		rpc_clnt_sigunmask(clnt, &oldset);
-	} else {
-		prepare_to_wait(&nfs_write_congestion, &wait, TASK_UNINTERRUPTIBLE);
-		if (bdi_write_congested(bdi))
-			schedule();
-	}
-	finish_wait(&nfs_write_congestion, &wait);
-	return ret;
-}
-
-
 /*
  * Try to update any existing write request, or create one if there is none.
  * In order to match, the request's credentials must match those of
@@ -622,15 +722,12 @@
 		struct inode *inode, struct page *page,
 		unsigned int offset, unsigned int bytes)
 {
-	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page		*req, *new = NULL;
 	unsigned long		rqend, end;
 
 	end = offset + bytes;
 
-	if (nfs_wait_on_write_congestion(page->mapping, server->flags & NFS_MOUNT_INTR))
-		return ERR_PTR(-ERESTARTSYS);
 	for (;;) {
 		/* Loop over all inode entries and see if we find
 		 * A request for the page we wish to update
@@ -662,8 +759,6 @@
 				nfs_unlock_request(new);
 				return ERR_PTR(error);
 			}
-			spin_unlock(&nfsi->req_lock);
-			nfs_mark_request_dirty(new);
 			return new;
 		}
 		spin_unlock(&nfsi->req_lock);
@@ -812,23 +907,23 @@
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 	if (!PageError(req->wb_page)) {
-		if (NFS_NEED_RESCHED(req)) {
+		int need_resched = test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags);
+		int need_commit = test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags);
+
+		if (need_resched) {
 			nfs_mark_request_dirty(req);
-			goto out;
-		} else if (NFS_NEED_COMMIT(req)) {
+			return;
+		}
+		if (need_commit) {
 			nfs_mark_request_commit(req);
-			goto out;
+			return;
 		}
+	} else {
+		nfs_clear_commit(req);
+		nfs_clear_reschedule(req);
 	}
-	nfs_inode_remove_request(req);
-
-out:
-	nfs_clear_commit(req);
-	nfs_clear_reschedule(req);
-#else
-	nfs_inode_remove_request(req);
 #endif
-	nfs_clear_page_writeback(req);
+	nfs_inode_remove_request(req);
 }
 
 static inline int flush_task_priority(int how)
@@ -959,7 +1054,6 @@
 		nfs_writedata_free(data);
 	}
 	nfs_mark_request_dirty(req);
-	nfs_clear_page_writeback(req);
 	return -ENOMEM;
 }
 
@@ -1009,7 +1103,6 @@
 		struct nfs_page *req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_dirty(req);
-		nfs_clear_page_writeback(req);
 	}
 	return -ENOMEM;
 }
@@ -1036,7 +1129,6 @@
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_dirty(req);
-		nfs_clear_page_writeback(req);
 	}
 	return error;
 }
@@ -1111,7 +1203,7 @@
 			end_page_writeback(page);
 			nfs_inode_remove_request(req);
 			dprintk(", error = %d\n", status);
-			goto next;
+			continue;
 		}
 		end_page_writeback(page);
 
@@ -1119,7 +1211,7 @@
 		if (data->args.stable != NFS_UNSTABLE || data->verf.committed == NFS_FILE_SYNC) {
 			nfs_inode_remove_request(req);
 			dprintk(" OK\n");
-			goto next;
+			continue;
 		}
 		memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
 		nfs_mark_request_commit(req);
@@ -1127,8 +1219,6 @@
 #else
 		nfs_inode_remove_request(req);
 #endif
-	next:
-		nfs_clear_page_writeback(req);
 	}
 }
 
@@ -1273,7 +1363,6 @@
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
-		nfs_clear_page_writeback(req);
 	}
 	return -ENOMEM;
 }
@@ -1319,23 +1408,21 @@
 		dprintk(" mismatch\n");
 		nfs_mark_request_dirty(req);
 	next:
-		nfs_clear_page_writeback(req);
 		res++;
 	}
 	sub_page_state(nr_unstable,res);
 }
 #endif
 
-static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
-			   unsigned int npages, int how)
+static int __nfs_flush_inode(struct inode *inode, unsigned long idx_start,
+			     unsigned int npages, int how, unsigned int max)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	LIST_HEAD(head);
 	int			res,
 				error = 0;
 
-	spin_lock(&nfsi->req_lock);
-	res = nfs_scan_dirty(inode, &head, idx_start, npages);
+	res = nfs_scan_dirty(inode, &head, idx_start, npages, max);
 	spin_unlock(&nfsi->req_lock);
 	if (res) {
 		struct nfs_server *server = NFS_SERVER(inode);
@@ -1352,15 +1439,21 @@
 	return res;
 }
 
+static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
+			   unsigned int npages, int how, unsigned int max)
+{
+	spin_lock(&NFS_I(inode)->req_lock);
+	return __nfs_flush_inode(inode, idx_start, npages, how, max);
+}
+
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-int nfs_commit_inode(struct inode *inode, int how)
+static int __nfs_commit_inode(struct inode *inode, int how)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	LIST_HEAD(head);
 	int			res,
 				error = 0;
 
-	spin_lock(&nfsi->req_lock);
 	res = nfs_scan_commit(inode, &head, 0, 0);
 	spin_unlock(&nfsi->req_lock);
 	if (res) {
@@ -1370,11 +1463,18 @@
 	}
 	return res;
 }
+
+static int nfs_commit_inode(struct inode *inode, int how)
+{
+	spin_lock(&NFS_I(inode)->req_lock);
+	return __nfs_commit_inode(inode, how);
+}
 #endif
 
 int nfs_sync_inode(struct inode *inode, unsigned long idx_start,
 		  unsigned int npages, int how)
 {
+	struct nfs_inode *nfsi = NFS_I(inode);
 	int	error,
 		wait;
 
@@ -1382,21 +1482,36 @@
 	how &= ~FLUSH_WAIT;
 
 	do {
-		error = 0;
-		if (wait)
+		for (;;) {
+			spin_lock(&nfsi->req_lock);
+			if (nfs_wbpages(nfsi) < nfs_write_throttle)
+				break;
+			spin_unlock(&nfsi->req_lock);
+			if (!wait)
+				return 0;
+			nfs_wait_on_write_congestion(inode);
+		}
+		error = __nfs_flush_inode(inode, idx_start, npages, how, nfs_write_chunk(inode, LONG_MAX));
+		if (error == 0 && wait)
 			error = nfs_wait_on_requests(inode, idx_start, npages);
 		if (error == 0)
-			error = nfs_flush_inode(inode, idx_start, npages, how);
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-		if (error == 0)
 			error = nfs_commit_inode(inode, how);
-#endif
 	} while (error > 0);
 	return error;
 }
 
+static inline unsigned long si2pages(struct sysinfo *si, unsigned long mem)
+{
+	if (si->mem_unit >= PAGE_CACHE_SIZE)
+		return mem / (si->mem_unit >> PAGE_CACHE_SHIFT);
+	return mem * (PAGE_CACHE_SIZE / si->mem_unit);
+}
+
 int nfs_init_writepagecache(void)
 {
+	struct sysinfo si;
+	unsigned long total, low;
+
 	nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
 					     sizeof(struct nfs_write_data),
 					     0, SLAB_HWCACHE_ALIGN,
@@ -1418,6 +1533,15 @@
 	if (nfs_commit_mempool == NULL)
 		return -ENOMEM;
 
+	/*
+	 * Limit writeback to 1% of memory or
+	 * 5% of lowmem whichever is smaller.
+	 */
+	si_meminfo(&si);
+	total = si2pages(&si, si.totalram);
+	low = si2pages(&si, si.totalram - si.totalhigh);
+	nfs_write_throttle = min(total / 100, low / 20);
+
 	return 0;
 }
 
--- .orig/include/linux/nfs_fs.h	2005-06-28 18:24:46.000000000 -0400
+++ nfs-writeback/include/linux/nfs_fs.h	2005-06-26 19:38:17.000000000 -0400
@@ -394,15 +394,6 @@
  * return value!)
  */
 extern int  nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
-#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-extern int  nfs_commit_inode(struct inode *, int);
-#else
-static inline int
-nfs_commit_inode(struct inode *inode, int how)
-{
-	return 0;
-}
-#endif
 
 static inline int
 nfs_have_writebacks(struct inode *inode)
--- .orig/include/linux/nfs_page.h	2005-06-28 18:24:46.000000000 -0400
+++ nfs-writeback/include/linux/nfs_page.h	2005-06-27 06:16:06.000000000 -0400
@@ -62,7 +62,7 @@
 
 
 extern  int nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
-				unsigned long idx_start, unsigned int npages);
+				unsigned long idx_start, unsigned int npages, unsigned int max);
 extern	int nfs_scan_list(struct list_head *, struct list_head *,
 			  unsigned long, unsigned int);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
@@ -70,7 +70,6 @@
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
 extern  int nfs_set_page_writeback_locked(struct nfs_page *req);
-extern  void nfs_clear_page_writeback(struct nfs_page *req);
 
 
 /*

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Re: [PATCH] Smooth out NFS client writeback
  2005-06-28 22:43           ` Shantanu Goel
@ 2005-06-29 14:11             ` Peter Staubach
  0 siblings, 0 replies; 13+ messages in thread
From: Peter Staubach @ 2005-06-29 14:11 UTC (permalink / raw)
  To: Shantanu Goel; +Cc: Trond Myklebust, nfs

Shantanu Goel wrote:

>Hi Trond,
>
>Attached is the long delayed revised version of the
>writeback smoothing patch this time against
>2.6.12-mm2.  I have omitted the commit w/range and
>mmap writeback from this one.  If this one is deemed
>acceptable for inclusion I'll post the other 2 later. 
>The commit w/range really should be restored as it
>makes quite a difference against Solaris NFS servers
>with regular disks.  I observed a difference of 2-3
>MB/s under sustained writes.  It makes no difference
>with the Linux NFS server since it ignores the range.
>

On Solaris, at least with UFS as the underlying file system, the COMMIT
operations are processed by looking through the entire cached page list
or by doing page lookup operations on each individual page.  If the entire
file is specified, ie. len = 0, then the page list is walked.  If a range
is specified, then just the pages within the range are looked up.

Specifying the range can result in significantly less CPU overhead on the
server.  This is why the NFSv3 COMMIT operation has a range which can be
specified...  :-)

       ps


-------------------------------------------------------
SF.Net email is sponsored by: Discover Easy Linux Migration Strategies
from IBM. Find simple to follow Roadmaps, straightforward articles,
informative Webcasts and more! Get everything you need to get up to
speed, fast. http://ads.osdn.com/?ad_id=7477&alloc_id=16492&op=click
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: Re: [PATCH] Smooth out NFS client writeback
@ 2005-06-29 14:25 Lever, Charles
  2005-06-29 15:07 ` Peter Staubach
  0 siblings, 1 reply; 13+ messages in thread
From: Lever, Charles @ 2005-06-29 14:25 UTC (permalink / raw)
  To: Peter Staubach, Shantanu Goel; +Cc: Trond Myklebust, nfs

hi peter-

> On Solaris, at least with UFS as the underlying file system,=20
> the COMMIT
> operations are processed by looking through the entire cached=20
> page list
> or by doing page lookup operations on each individual page. =20
> If the entire
> file is specified, ie. len =3D 0, then the page list is walked.=20
>  If a range
> is specified, then just the pages within the range are looked up.
>=20
> Specifying the range can result in significantly less CPU=20
> overhead on the
> server.  This is why the NFSv3 COMMIT operation has a range=20
> which can be specified...  :-)

a server CPU inefficiency hardly qualifies as a client bug.  in the most
common cases where the client is creating and writing to a file, then
closing with a COMMIT(0,0) request, the server should be changed to
behave in a more efficient manner.

in other words, i think the client should optimize the number of
requests on the wire, and the server should optimize for using its CPU
and disks most efficiently.  i haven't looked closely at shantanu's
patch, but i'm a little leary of the change if it means more wire
operations are generated than before.


-------------------------------------------------------
SF.Net email is sponsored by: Discover Easy Linux Migration Strategies
from IBM. Find simple to follow Roadmaps, straightforward articles,
informative Webcasts and more! Get everything you need to get up to
speed, fast. http://ads.osdn.com/?ad_id=7477&alloc_id=16492&op=click
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Re: [PATCH] Smooth out NFS client writeback
  2005-06-29 14:25 Lever, Charles
@ 2005-06-29 15:07 ` Peter Staubach
  0 siblings, 0 replies; 13+ messages in thread
From: Peter Staubach @ 2005-06-29 15:07 UTC (permalink / raw)
  To: Lever, Charles; +Cc: Shantanu Goel, Trond Myklebust, nfs

Lever, Charles wrote:

>hi peter-
>
>  
>
>>On Solaris, at least with UFS as the underlying file system, 
>>the COMMIT
>>operations are processed by looking through the entire cached 
>>page list
>>or by doing page lookup operations on each individual page.  
>>If the entire
>>file is specified, ie. len = 0, then the page list is walked. 
>> If a range
>>is specified, then just the pages within the range are looked up.
>>
>>Specifying the range can result in significantly less CPU 
>>overhead on the
>>server.  This is why the NFSv3 COMMIT operation has a range 
>>which can be specified...  :-)
>>    
>>
>
>a server CPU inefficiency hardly qualifies as a client bug.  in the most
>common cases where the client is creating and writing to a file, then
>closing with a COMMIT(0,0) request, the server should be changed to
>behave in a more efficient manner.
>
>in other words, i think the client should optimize the number of
>requests on the wire, and the server should optimize for using its CPU
>and disks most efficiently.  i haven't looked closely at shantanu's
>patch, but i'm a little leary of the change if it means more wire
>operations are generated than before.
>  
>

I wouldn't claim that this is a client side bug either.  I would claim that
it is an opportunity, for very little cost, to help a server to perform
better and this makes the client, and NFS in general, look better.

I don't think that there will be more over the wire operations generated
when using the range versus always specifying the entire file.  Typically,
COMMITs are done for a range of the file or for the entire file at close.
The client typically needs to know what data is marked as needing to be
committed anyway, so it isn't very hard to figure out what the range that
needs to be committed is at the same time.  I would claim that a client,
which simply issues a blanket COMMIT(0,0), without already having gathered
up the buffers/pages/whatever that need committing, is broken.  It will be
unsafe because it will be subject to races like more data getting written
with UNSTABLE while the COMMIT is happening.  This new data may or may not
have been committed by the COMMIT.

Bottom line for me -- if the client can do something to help the server to
help the client and it is an overall win, then I think that it should do 
so...

    Thanx...

       ps

-------------------------------------------------------
SF.Net email is sponsored by: Discover Easy Linux Migration Strategies
from IBM. Find simple to follow Roadmaps, straightforward articles,
informative Webcasts and more! Get everything you need to get up to
speed, fast. http://ads.osdn.com/?ad_id=7477&alloc_id=16492&op=click
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: Re: [PATCH] Smooth out NFS client writeback
@ 2005-06-29 15:35 Lever, Charles
  2005-06-29 15:50 ` Peter Staubach
  0 siblings, 1 reply; 13+ messages in thread
From: Lever, Charles @ 2005-06-29 15:35 UTC (permalink / raw)
  To: Peter Staubach; +Cc: Shantanu Goel, Trond Myklebust, nfs

> I don't think that there will be more over the wire=20
> operations generated
> when using the range versus always specifying the entire=20
> file.

committing the whole file at once is simply more efficient from a
network perspective when an application is performing random writes
(unless it is using O_SYNC).  there are some cases where it won't make a
difference whether a range or a whole file commit is used, but i think
it would be really hard to figure out a client-side heuristic to decide
which is better.

> I would claim that a client,
> which simply issues a blanket COMMIT(0,0), without already=20
> having gathered
> up the buffers/pages/whatever that need committing, is=20
> broken.  It will be
> unsafe because it will be subject to races like more data=20
> getting written
> with UNSTABLE while the COMMIT is happening.  This new data=20
> may or may not
> have been committed by the COMMIT.

the linux client already keeps track of the order of writes and commits
well enough that this isn't an issue.

> Bottom line for me -- if the client can do something to help=20
> the server to
> help the client and it is an overall win, then I think that=20
> it should do so...

we're talking about potentially adding complexity to the client and
increasing the number of write and commit operations on the wire, which
could have a negative performance impact in other environments (think
WAN).  all this to optimize a particular workload against a particular
server implementation.

i'm not saying this type of work shouldn't be explored, but as we
consider the change, we should be very careful especially since we don't
have adequate performance regression test coverage yet.

-------------------------------------------------------
SF.Net email is sponsored by: Discover Easy Linux Migration Strategies
from IBM. Find simple to follow Roadmaps, straightforward articles,
informative Webcasts and more! Get everything you need to get up to
speed, fast. http://ads.osdn.com/?ad_id=7477&alloc_id=16492&op=click
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Re: [PATCH] Smooth out NFS client writeback
  2005-06-29 15:35 Lever, Charles
@ 2005-06-29 15:50 ` Peter Staubach
  2005-06-29 22:34   ` Shantanu Goel
  0 siblings, 1 reply; 13+ messages in thread
From: Peter Staubach @ 2005-06-29 15:50 UTC (permalink / raw)
  To: Lever, Charles; +Cc: Shantanu Goel, Trond Myklebust, nfs

Lever, Charles wrote:

>>I don't think that there will be more over the wire 
>>operations generated
>>when using the range versus always specifying the entire 
>>file.
>>    
>>
>
>committing the whole file at once is simply more efficient from a
>network perspective when an application is performing random writes
>(unless it is using O_SYNC).  there are some cases where it won't make a
>difference whether a range or a whole file commit is used, but i think
>it would be really hard to figure out a client-side heuristic to decide
>which is better.
>
>  
>

The network perspective is important, to be sure.  However, so is disk
i/o.

Having done an implementation, it isn't so hard to develop the heuristics,
actually.

Please keep in mind that COMMIT operations are very expensive.  They are
to be avoided whenever possible.  Clients should delay as long as possible
to commit data because they may be able to avoid committing it at all.
These benefits can include data which is repeatedly overwritten, small
writes to the same file system block on the server, temporary files, etc.
The list goes on and on.  Not going to stable storage on the server is a
huge win in performance all that way around.

>>I would claim that a client,
>>which simply issues a blanket COMMIT(0,0), without already 
>>having gathered
>>up the buffers/pages/whatever that need committing, is 
>>broken.  It will be
>>unsafe because it will be subject to races like more data 
>>getting written
>>with UNSTABLE while the COMMIT is happening.  This new data 
>>may or may not
>>have been committed by the COMMIT.
>>    
>>
>
>the linux client already keeps track of the order of writes and commits
>well enough that this isn't an issue.
>
>  
>

Good to hear!

>>Bottom line for me -- if the client can do something to help 
>>the server to
>>help the client and it is an overall win, then I think that 
>>it should do so...
>>    
>>
>
>we're talking about potentially adding complexity to the client and
>increasing the number of write and commit operations on the wire, which
>could have a negative performance impact in other environments (think
>WAN).  all this to optimize a particular workload against a particular
>server implementation.
>
>i'm not saying this type of work shouldn't be explored, but as we
>consider the change, we should be very careful especially since we don't
>have adequate performance regression test coverage yet.
>  
>

I think that there may be more server implementations out there that
could benefit from the client strategy than just the Solaris with UFS
NFS server.  I would guess that most, if not all, would benefit.  I
even suspect that we could make the Linux server go faster if it paid
attention to the byte range specified, instead of always sync'ing the
entire file.  The cost of looking at clean pages/buffers is small, but
adds up quickly...  :-)

       ps


-------------------------------------------------------
SF.Net email is sponsored by: Discover Easy Linux Migration Strategies
from IBM. Find simple to follow Roadmaps, straightforward articles,
informative Webcasts and more! Get everything you need to get up to
speed, fast. http://ads.osdn.com/?ad_id=7477&alloc_id=16492&op=click
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: Re: [PATCH] Smooth out NFS client writeback
  2005-06-29 15:50 ` Peter Staubach
@ 2005-06-29 22:34   ` Shantanu Goel
  0 siblings, 0 replies; 13+ messages in thread
From: Shantanu Goel @ 2005-06-29 22:34 UTC (permalink / raw)
  To: Peter Staubach, Lever, Charles; +Cc: Shantanu Goel, Trond Myklebust, nfs


> Lever, Charles wrote:
> >committing the whole file at once is simply more
> efficient from a
> >network perspective when an application is
> performing random writes
> >(unless it is using O_SYNC).  there are some cases
> where it won't make a
> >difference whether a range or a whole file commit
> is used, but i think
> >it would be really hard to figure out a client-side
> heuristic to decide
> >which is better.

The patch I posted is actually much better in terms of
the # commit requests it issues compared to the stock
client.  There are some #'s for iozone -r4k -s256m -c:

Proc            Stock         Patch
write           16486         16384
commit            518            80

Here are the numbers for write throughput as seen by
the children in KB/s in the iozone test above to see
the difference that commit range makes.

Stock    Patch w/o commit   Patch w/commit
7064     6038               10226    =20

When I get a chance I'll modify the Linux NFS server
to honour the range as well and see what if any
difference it makes.

Thanks,
Shantanu



	=09
____________________________________________________=20
Yahoo! Sports=20
Rekindle the Rivalries. Sign up for Fantasy Football=20
http://football.fantasysports.yahoo.com


-------------------------------------------------------
This SF.Net email is sponsored by the 'Do More With Dual!' webinar happen=
ing
July 14 at 8am PDT/11am EDT. We invite you to explore the latest in dual
core and dual graphics technology at this free one hour event hosted by H=
P,=20
AMD, and NVIDIA.  To register visit http://www.hp.com/go/dualwebinar
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2005-06-29 22:35 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-06-02  1:38 [PATCH] Smooth out NFS client writeback Shantanu Goel
2005-06-02  1:56 ` Trond Myklebust
2005-06-02  3:26   ` Shantanu Goel
2005-06-02  4:38     ` Trond Myklebust
2005-06-02 12:16       ` Shantanu Goel
2005-06-07  4:12         ` Trond Myklebust
2005-06-28 22:43           ` Shantanu Goel
2005-06-29 14:11             ` Peter Staubach
  -- strict thread matches above, loose matches on Subject: below --
2005-06-29 14:25 Lever, Charles
2005-06-29 15:07 ` Peter Staubach
2005-06-29 15:35 Lever, Charles
2005-06-29 15:50 ` Peter Staubach
2005-06-29 22:34   ` Shantanu Goel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox