* [PATCH] improve the performance of large sequential write NFS workloads
@ 2009-12-17 2:03 Steve Rago
2009-12-17 8:17 ` Peter Zijlstra
0 siblings, 1 reply; 96+ messages in thread
From: Steve Rago @ 2009-12-17 2:03 UTC (permalink / raw)
To: linux-nfs; +Cc: linux-kernel, Trond.Myklebust
Eager Writeback for NFS Clients
-------------------------------
Prevent applications that write large sequential streams of data (like backup, for example)
from entering a memory-pressure state, which degrades performance by falling back to
synchronous operations (both synchronous writes and additional commits). This is accomplished
by preventing the client application from dirtying pages faster than they can be written to
the server: clients write pages eagerly instead of lazily.
Eager writeback is controlled by a sysctl, fs.nfs.nfs_max_woutstanding: setting it to 0
disables the feature; otherwise it holds the maximum number of outstanding NFS writes that
can be in flight for a given file. When that limit is reached, the application is blocked
from dirtying more pages until the writes complete.
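For example, assuming the sysctl registration from the patch below (the default is 16),
the feature can be tuned or disabled at run time:

    sysctl -w fs.nfs.nfs_max_woutstanding=0     # disable eager writeback
    sysctl -w fs.nfs.nfs_max_woutstanding=32    # allow up to 32 writes in flight per file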
This patch is based heavily (okay, almost entirely) on a prior patch by Peter Staubach. For
the original patch, see http://article.gmane.org/gmane.linux.nfs/24323.
The patch below applies to linux-2.6.32-rc7, but it should apply cleanly to vanilla linux-2.6.32.
Performance data and tuning notes can be found on my web site (http://www.nec-labs.com/~sar).
With iozone, I see about 50% improvement for large sequential write workloads over a 1Gb Ethernet.
With an in-house micro-benchmark, I see 80% improvement for large, single-stream, sequential
workloads (where "large" is defined to be greater than the memory size on the client).
Signed-off-by: Steve Rago <sar@nec-labs.com>
---
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/fs-writeback.c linux-2.6.32-rc7/fs/fs-writeback.c
--- linux-2.6.32-rc7-orig/fs/fs-writeback.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/fs-writeback.c 2009-11-30 15:36:30.735453450 -0500
@@ -771,6 +771,8 @@ static long wb_writeback(struct bdi_writ
wbc.range_start = 0;
wbc.range_end = LLONG_MAX;
}
+ if (args->for_background || wbc.for_kupdate)
+ wbc.nonblocking = 1;
for (;;) {
/*
@@ -859,6 +861,8 @@ static long wb_check_old_data_flush(stru
unsigned long expired;
long nr_pages;
+ if (dirty_writeback_interval == 0)
+ return 0;
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
if (time_before(jiffies, expired))
@@ -954,7 +958,11 @@ int bdi_writeback_task(struct bdi_writeb
break;
}
- wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ if (dirty_writeback_interval == 0)
+ wait_jiffies = msecs_to_jiffies(5000); /* default */
+ else
+ wait_jiffies =
+ msecs_to_jiffies(dirty_writeback_interval * 10);
schedule_timeout_interruptible(wait_jiffies);
try_to_freeze();
}
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/nfs/file.c linux-2.6.32-rc7/fs/nfs/file.c
--- linux-2.6.32-rc7-orig/fs/nfs/file.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/nfs/file.c 2009-11-30 15:21:22.635101295 -0500
@@ -589,11 +589,17 @@ static int nfs_need_sync_write(struct fi
return 0;
}
+static int nfs_is_seqwrite(struct inode *inode, loff_t pos)
+{
+ return NFS_I(inode)->wrpos == pos;
+}
+
static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct dentry * dentry = iocb->ki_filp->f_path.dentry;
struct inode * inode = dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
ssize_t result;
size_t count = iov_length(iov, nr_segs);
@@ -607,6 +613,12 @@ static ssize_t nfs_file_write(struct kio
result = -EBUSY;
if (IS_SWAPFILE(inode))
goto out_swapfile;
+
+ result = count;
+ if (!count)
+ goto out;
+ nfs_wait_woutstanding(inode);
+
/*
* O_APPEND implies that we must revalidate the file length.
*/
@@ -623,10 +635,21 @@ static ssize_t nfs_file_write(struct kio
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
result = generic_file_aio_write(iocb, iov, nr_segs, pos);
/* Return error values for O_SYNC and IS_SYNC() */
- if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
- int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
- if (err < 0)
- result = err;
+ if (result >= 0) {
+ if (nfs_need_sync_write(iocb->ki_filp, inode)) {
+ int err;
+
+ err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
+ inode);
+ if (err < 0)
+ result = err;
+ } else if (nfs_max_woutstanding != 0 &&
+ nfs_is_seqwrite(inode, pos) &&
+ atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages) {
+ nfs_wb_eager(inode);
+ }
+ if (result > 0)
+ nfsi->wrpos = pos + result;
}
out:
return result;
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/nfs/inode.c linux-2.6.32-rc7/fs/nfs/inode.c
--- linux-2.6.32-rc7-orig/fs/nfs/inode.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/nfs/inode.c 2009-11-13 11:36:43.888410914 -0500
@@ -508,7 +508,9 @@ void nfs_setattr_update_inode(struct ino
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
- int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ int need_atime = nfsi->cache_validity & NFS_INO_INVALID_ATIME;
+ int woutstanding = nfs_max_woutstanding;
int err;
/*
@@ -519,9 +521,8 @@ int nfs_getattr(struct vfsmount *mnt, st
* nfs_wb_nocommit.
*/
if (S_ISREG(inode->i_mode)) {
- mutex_lock(&inode->i_mutex);
+ atomic_add(woutstanding, &nfsi->writes);
nfs_wb_nocommit(inode);
- mutex_unlock(&inode->i_mutex);
}
/*
@@ -545,6 +546,11 @@ int nfs_getattr(struct vfsmount *mnt, st
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
}
+
+ if (S_ISREG(inode->i_mode)) {
+ atomic_sub(woutstanding, &nfsi->writes);
+ wake_up(&nfsi->writes_wq);
+ }
return err;
}
@@ -1418,9 +1424,13 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
nfsi->npages = 0;
+ atomic_set(&nfsi->ndirty, 0);
atomic_set(&nfsi->silly_count, 1);
INIT_HLIST_HEAD(&nfsi->silly_list);
init_waitqueue_head(&nfsi->waitqueue);
+ atomic_set(&nfsi->writes, 0);
+ init_waitqueue_head(&nfsi->writes_wq);
+ nfsi->wrpos = 0;
nfs4_init_once(nfsi);
}
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/nfs/sysctl.c linux-2.6.32-rc7/fs/nfs/sysctl.c
--- linux-2.6.32-rc7-orig/fs/nfs/sysctl.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/nfs/sysctl.c 2009-11-13 11:36:43.895459044 -0500
@@ -58,6 +58,14 @@ static ctl_table nfs_cb_sysctls[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nfs_max_woutstanding",
+ .data = &nfs_max_woutstanding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/nfs/write.c linux-2.6.32-rc7/fs/nfs/write.c
--- linux-2.6.32-rc7-orig/fs/nfs/write.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/fs/nfs/write.c 2009-12-08 13:26:35.416629518 -0500
@@ -176,6 +176,8 @@ static void nfs_mark_uptodate(struct pag
static int wb_priority(struct writeback_control *wbc)
{
+ if (nfs_max_woutstanding != 0)
+ return 0;
if (wbc->for_reclaim)
return FLUSH_HIGHPRI | FLUSH_STABLE;
if (wbc->for_kupdate)
@@ -200,7 +202,9 @@ static int nfs_set_page_writeback(struct
if (!ret) {
struct inode *inode = page->mapping->host;
struct nfs_server *nfss = NFS_SERVER(inode);
+ struct nfs_inode *nfsi = NFS_I(inode);
+ atomic_dec(&nfsi->ndirty);
if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH) {
set_bdi_congested(&nfss->backing_dev_info,
@@ -325,6 +329,39 @@ static int nfs_writepages_callback(struc
return ret;
}
+int nfs_max_woutstanding = 16;
+
+static void nfs_inc_woutstanding(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ atomic_inc(&nfsi->writes);
+}
+
+static void nfs_dec_woutstanding(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ if (atomic_dec_return(&nfsi->writes) < nfs_max_woutstanding)
+ wake_up(&nfsi->writes_wq);
+}
+
+void nfs_wait_woutstanding(struct inode *inode)
+{
+ if (nfs_max_woutstanding != 0) {
+ unsigned long background_thresh;
+ unsigned long dirty_thresh;
+ long npages;
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+ npages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK);
+ if (npages >= background_thresh)
+ wait_event(nfsi->writes_wq,
+ atomic_read(&nfsi->writes) < nfs_max_woutstanding);
+ }
+}
+
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
@@ -420,6 +457,9 @@ static void nfs_inode_remove_request(str
static void
nfs_mark_request_dirty(struct nfs_page *req)
{
+ struct inode *inode = req->wb_context->path.dentry->d_inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ atomic_inc(&nfsi->ndirty);
__set_page_dirty_nobuffers(req->wb_page);
}
@@ -682,16 +722,18 @@ static struct nfs_page * nfs_setup_write
req = nfs_try_to_update_request(inode, page, offset, bytes);
if (req != NULL)
- goto out;
+ return req;
req = nfs_create_request(ctx, inode, page, offset, bytes);
if (IS_ERR(req))
- goto out;
+ return req;
error = nfs_inode_add_request(inode, req);
if (error != 0) {
nfs_release_request(req);
req = ERR_PTR(error);
+ } else {
+ struct nfs_inode *nfsi = NFS_I(inode);
+ atomic_inc(&nfsi->ndirty);
}
-out:
return req;
}
@@ -877,6 +919,7 @@ static int nfs_write_rpcsetup(struct nfs
count,
(unsigned long long)data->args.offset);
+ nfs_inc_woutstanding(inode);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -1172,7 +1215,7 @@ int nfs_writeback_done(struct rpc_task *
*/
status = NFS_PROTO(data->inode)->write_done(task, data);
if (status != 0)
- return status;
+ goto out;
nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1229,6 +1272,8 @@ int nfs_writeback_done(struct rpc_task *
task->tk_status = -EIO;
}
nfs4_sequence_free_slot(server->nfs_client, &data->res.seq_res);
+out:
+ nfs_dec_woutstanding(data->inode);
return 0;
}
@@ -1591,6 +1636,24 @@ int nfs_wb_page(struct inode *inode, str
return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
}
+int nfs_wb_eager(struct inode *inode)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control wbc = {
+ .bdi = mapping->backing_dev_info,
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = LONG_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+ int ret;
+
+ ret = nfs_writepages(mapping, &wbc);
+ if (ret < 0)
+ __mark_inode_dirty(inode, I_DIRTY_PAGES);
+ return ret;
+}
+
#ifdef CONFIG_MIGRATION
int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page)
@@ -1674,4 +1737,3 @@ void nfs_destroy_writepagecache(void)
mempool_destroy(nfs_wdata_mempool);
kmem_cache_destroy(nfs_wdata_cachep);
}
-
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/include/linux/nfs_fs.h linux-2.6.32-rc7/include/linux/nfs_fs.h
--- linux-2.6.32-rc7-orig/include/linux/nfs_fs.h 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/include/linux/nfs_fs.h 2009-11-13 11:36:43.982136105 -0500
@@ -166,6 +166,7 @@ struct nfs_inode {
struct radix_tree_root nfs_page_tree;
unsigned long npages;
+ atomic_t ndirty;
/* Open contexts for shared mmap writes */
struct list_head open_files;
@@ -187,6 +188,11 @@ struct nfs_inode {
#ifdef CONFIG_NFS_FSCACHE
struct fscache_cookie *fscache;
#endif
+
+ loff_t wrpos;
+ atomic_t writes;
+ wait_queue_head_t writes_wq;
+
struct inode vfs_inode;
};
@@ -467,11 +473,13 @@ extern void nfs_unblock_sillyrename(stru
* linux/fs/nfs/write.c
*/
extern int nfs_congestion_kb;
+extern int nfs_max_woutstanding;
extern int nfs_writepage(struct page *page, struct writeback_control *wbc);
extern int nfs_writepages(struct address_space *, struct writeback_control *);
extern int nfs_flush_incompatible(struct file *file, struct page *page);
extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int);
extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *);
+extern void nfs_wait_woutstanding(struct inode *);
/*
* Try to write back everything synchronously (but check the
@@ -482,6 +490,7 @@ extern int nfs_wb_all(struct inode *inod
extern int nfs_wb_nocommit(struct inode *inode);
extern int nfs_wb_page(struct inode *inode, struct page* page);
extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
+extern int nfs_wb_eager(struct inode *inode);
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
extern int nfs_commit_inode(struct inode *, int);
extern struct nfs_write_data *nfs_commitdata_alloc(void);
diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/mm/page-writeback.c linux-2.6.32-rc7/mm/page-writeback.c
--- linux-2.6.32-rc7-orig/mm/page-writeback.c 2009-11-12 19:46:07.000000000 -0500
+++ linux-2.6.32-rc7/mm/page-writeback.c 2009-11-18 10:05:22.314373138 -0500
@@ -536,7 +536,7 @@ static void balance_dirty_pages(struct a
* threshold otherwise wait until the disk writes catch
* up.
*/
- if (bdi_nr_reclaimable > bdi_thresh) {
+ if (bdi_nr_reclaimable != 0) {
writeback_inodes_wbc(&wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-17  2:03 [PATCH] improve the performance of large sequential write NFS workloads Steve Rago
@ 2009-12-17  8:17 ` Peter Zijlstra
  2009-12-18 19:33   ` Steve Rago
  2009-12-19 12:20   ` Wu Fengguang
  0 siblings, 2 replies; 96+ messages in thread
From: Peter Zijlstra @ 2009-12-17 8:17 UTC (permalink / raw)
To: Steve Rago
Cc: linux-nfs, linux-kernel, Trond.Myklebust, Wu Fengguang, jens.axboe

On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote:
> Eager Writeback for NFS Clients
> -------------------------------
> [...]
> diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/fs/fs-writeback.c linux-2.6.32-rc7/fs/fs-writeback.c
> [...]
> @@ -954,7 +958,11 @@ int bdi_writeback_task(struct bdi_writeb
> break;
> }
>
> - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
> + if (dirty_writeback_interval == 0)
> + wait_jiffies = msecs_to_jiffies(5000); /* default */
> + else
> + wait_jiffies =
> + msecs_to_jiffies(dirty_writeback_interval * 10);

I'm not up-to-date on the bdi-writeout stuff, but this just looks wrong.

> [...]
> +void nfs_wait_woutstanding(struct inode *inode)
> +{
> + if (nfs_max_woutstanding != 0) {
> + unsigned long background_thresh;
> + unsigned long dirty_thresh;
> + long npages;
> + struct nfs_inode *nfsi = NFS_I(inode);
> +
> + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> + npages = global_page_state(NR_FILE_DIRTY) +
> + global_page_state(NR_UNSTABLE_NFS) +
> + global_page_state(NR_WRITEBACK);
> + if (npages >= background_thresh)
> + wait_event(nfsi->writes_wq,
> + atomic_read(&nfsi->writes) < nfs_max_woutstanding);
> + }
> +}

This looks utterly busted too, why the global state and not the nfs
client's bdi state?

Also, why create this extra workqueue and not simply use the congestion
interface that is present? If the congestion stuff doesn't work for you,
fix that, don't add extra muck like this.
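Something along these lines, perhaps (a rough sketch only, not tested;
the per-server writeback count and NFS_CONGESTION_ON_THRESH already
exist in fs/nfs/write.c):

	/* throttle the writer on the existing bdi congestion state
	 * instead of a private wait queue */
	while (atomic_long_read(&NFS_SERVER(inode)->writeback) >=
			NFS_CONGESTION_ON_THRESH)
		congestion_wait(BLK_RW_ASYNC, HZ/10);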
> [...]
> diff -rupN -X linux-2.6.32-rc7/Documentation/dontdiff linux-2.6.32-rc7-orig/mm/page-writeback.c linux-2.6.32-rc7/mm/page-writeback.c
> --- linux-2.6.32-rc7-orig/mm/page-writeback.c 2009-11-12 19:46:07.000000000 -0500
> +++ linux-2.6.32-rc7/mm/page-writeback.c 2009-11-18 10:05:22.314373138 -0500
> @@ -536,7 +536,7 @@ static void balance_dirty_pages(struct a
> * threshold otherwise wait until the disk writes catch
> * up.
> */
> - if (bdi_nr_reclaimable > bdi_thresh) {
> + if (bdi_nr_reclaimable != 0) {
> writeback_inodes_wbc(&wbc);
> pages_written += write_chunk - wbc.nr_to_write;
> get_dirty_limits(&background_thresh, &dirty_thresh,

And I think you just broke regular writeback here, allowing for tons of
unneeded writeout of very small chunks.

This really needs to be multiple patches, and a proper changelog
describing why you do things. The above, because my benchmark goes
faster, just isn't sufficient.

Also, I don't think this needs to have a sysctl, it should just work.
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-17  8:17 ` Peter Zijlstra
@ 2009-12-18 19:33   ` Steve Rago
  2009-12-18 19:41     ` Ingo Molnar
  2009-12-18 19:44     ` Peter Zijlstra
  1 sibling, 2 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-18 19:33 UTC (permalink / raw)
To: Peter Zijlstra
Cc: linux-nfs, linux-kernel, Trond.Myklebust, Wu Fengguang, jens.axboe

On Thu, 2009-12-17 at 09:17 +0100, Peter Zijlstra wrote:
> On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote:
> > [...]
> > - wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
> > + if (dirty_writeback_interval == 0)
> > + wait_jiffies = msecs_to_jiffies(5000); /* default */
> > + else
> > + wait_jiffies =
> > + msecs_to_jiffies(dirty_writeback_interval * 10);
>
> I'm not up-to-date on the bdi-writeout stuff, but this just looks wrong.

The old behavior (before 2.6.32) allowed one to disable kupdate-style
periodic writeback by setting /proc/sys/vm/dirty_writeback_centisecs
to 0. In 2.6.32, the bdi flushing and kupdate-style flushing are done
by the same thread, so these changes restore that behavior.
kupdate-style flushing can interfere with eager writeback by generating
smaller writes and more commits.
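In other words, with these changes applied, the old knob works as it
did before:

    # disable kupdate-style periodic writeback (pre-2.6.32 behavior)
    echo 0 > /proc/sys/vm/dirty_writeback_centisecs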
> > [...]
> > +void nfs_wait_woutstanding(struct inode *inode)
> > +{
> > + if (nfs_max_woutstanding != 0) {
> > + [...]
> > + if (npages >= background_thresh)
> > + wait_event(nfsi->writes_wq,
> > + atomic_read(&nfsi->writes) < nfs_max_woutstanding);
> > + }
> > +}
>
> This looks utterly busted too, why the global state and not the nfs
> client's bdi state?
>
> Also, why create this extra workqueue and not simply use the congestion
> interface that is present? If the congestion stuff doesn't work for you,
> fix that, don't add extra muck like this.

Pages are a global resource. Once we hit the dirty_threshold, the
system is going to work harder to flush the pages out. This code
prevents the caller from creating more dirty pages in this state,
thereby making matters worse, when eager writeback is enabled.

This wait queue is used for different purposes than the congestion_wait
interface. Here we are preventing the caller from proceeding if there
are too many NFS writes outstanding for this thread and we are in a
memory pressure state. It has nothing to do with the state of the bdi
congestion.

> > [...]
> > - if (bdi_nr_reclaimable > bdi_thresh) {
> > + if (bdi_nr_reclaimable != 0) {
> > writeback_inodes_wbc(&wbc);
>
> And I think you just broke regular writeback here, allowing for tons of
> unneeded writeout of very small chunks.

Maybe so. I had originally worked from a 2.6.18 base, where the check
was "if (nr_reclaimable)", so I retained that check, because with eager
writeback, there should never be that many writeback pages and there is
a check above this to break out of the loop if (reclaimable+writeback
<= bdi_thresh). But I probably ignored the effect on local disk
subsystems. Do you know of any tests that will illustrate what effect
this will have?

> This really needs to be multiple patches, and a proper changelog
> describing why you do things. The above, because my benchmark goes
> faster, just isn't sufficient.

I can see splitting this into two separate patches, but then that would
be confusing, because the nfs-only patch depends on the mm/fs patch,
and the mm/fs patch looks odd all by itself (only about 10 lines of
diffs), so I thought it more prudent to keep them together.

> Also, I don't think this needs to have a sysctl, it should just work.

The sysctl is a *good thing* in that it allows the eager writeback
behavior to be tuned and shut off if need be. I can only test the
changes on a finite set of systems, so better safe than sorry.
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 19:33 ` Steve Rago
@ 2009-12-18 19:41   ` Ingo Molnar
  2009-12-18 21:20     ` Steve Rago
  1 sibling, 1 reply; 96+ messages in thread
From: Ingo Molnar @ 2009-12-18 19:41 UTC (permalink / raw)
To: Steve Rago
Cc: Peter Zijlstra, linux-nfs, linux-kernel, Trond.Myklebust,
    Wu Fengguang, jens.axboe

* Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org> wrote:

> > Also, I don't think this needs to have a sysctl, it should just work.
>
> The sysctl is a *good thing* in that it allows the eager writeback
> behavior to be tuned and shut off if need be. I can only test the
> changes on a finite set of systems, so better safe than sorry.

This issue has been settled many years ago and that's not what we do in
the Linux kernel. We prefer patches to core code where we are reasonably
sure they result in good behavior - and then we fix bugs in the new
behavior, if any.

(Otherwise odd sysctls would mushroom quickly and the system would
become untestable in practice.)

	Ingo
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 19:41 ` Ingo Molnar
@ 2009-12-18 21:20   ` Steve Rago
  2009-12-18 22:07     ` Ingo Molnar
  2009-12-19  8:08     ` Arjan van de Ven
  0 siblings, 2 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-18 21:20 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, linux-nfs, linux-kernel, Trond.Myklebust,
    Wu Fengguang, jens.axboe

On Fri, 2009-12-18 at 20:41 +0100, Ingo Molnar wrote:
> [...]
> This issue has been settled many years ago and that's not what we do in
> the Linux kernel. We prefer patches to core code where we are reasonably
> sure they result in good behavior - and then we fix bugs in the new
> behavior, if any.
>
> (Otherwise odd sysctls would mushroom quickly and the system would
> become untestable in practice.)

I don't disagree, but "that's not what we do" hardly provides insight
into making the judgment call. In this case, the variety of
combinations of NFS server speed, NFS client speed, transmission link
speed, client memory size, and server memory size argues for a tunable
parameter, because one value probably won't work well in all
combinations. Making it change dynamically based on these parameters
is more complicated than these circumstances call for, IMHO.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 21:20 ` Steve Rago
@ 2009-12-18 22:07   ` Ingo Molnar
  2009-12-18 22:46     ` Steve Rago
  0 siblings, 1 reply; 96+ messages in thread
From: Ingo Molnar @ 2009-12-18 22:07 UTC (permalink / raw)
To: Steve Rago
Cc: Peter Zijlstra, linux-nfs, linux-kernel, Trond.Myklebust,
    Wu Fengguang, jens.axboe

* Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org> wrote:

> I don't disagree, but "that's not what we do" hardly provides insight
> into making the judgment call. [...]

I gave you an example of the problems that arise, see the last sentence
above.

> [...] In this case, the variety of combinations of NFS server speed,
> NFS client speed, transmission link speed, client memory size, and
> server memory size argues for a tunable parameter, because one value
> probably won't work well in all combinations. Making it change
> dynamically based on these parameters is more complicated than these
> circumstances call for, IMHO.

So having crappy tunables is the reason to introduce even more tunables?
I think you just gave a good second example of why we don't want sysctls
for features like this.

	Ingo
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 22:07 ` Ingo Molnar
@ 2009-12-18 22:46   ` Steve Rago
  0 siblings, 0 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-18 22:46 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, linux-nfs, linux-kernel, Trond.Myklebust,
    Wu Fengguang, jens.axboe

On Fri, 2009-12-18 at 23:07 +0100, Ingo Molnar wrote:
> [...]
> So having crappy tunables is the reason to introduce even more tunables?
> I think you just gave a good second example of why we don't want sysctls
> for features like this.

The examples I cited are not tunables. They are characteristics of the
systems we use. I can't squeeze more than 1Gb/s out of my gigabit
Ethernet connection; I can't make my 2GHz CPU compute any faster; I am
limited by these components to the performance I can attain. Writing
software that performs well in all combinations, especially to take
advantage of the myriad of combinations, is difficult at best. The
tunable introduced in the patch is a compromise to writing a much more
complicated adaptive algorithm that most likely wouldn't have access to
all of the information it needed anyway.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 21:20 ` Steve Rago
@ 2009-12-19  8:08   ` Arjan van de Ven
  2009-12-19 13:37     ` Steve Rago
  1 sibling, 1 reply; 96+ messages in thread
From: Arjan van de Ven @ 2009-12-19 8:08 UTC (permalink / raw)
To: Steve Rago
Cc: Ingo Molnar, Peter Zijlstra, linux-nfs, linux-kernel,
    Trond.Myklebust, Wu Fengguang, jens.axboe

On Fri, 18 Dec 2009 16:20:11 -0500
Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org> wrote:

> I don't disagree, but "that's not what we do" hardly provides insight
> into making the judgment call. In this case, the variety of
> combinations of NFS server speed, NFS client speed, transmission link
> speed, client memory size, and server memory size argues for a tunable
> parameter, because one value probably won't work well in all
> combinations. Making it change dynamically based on these parameters
> is more complicated than these circumstances call for, IMHO.

if you as the expert do not know how to tune this... how is a sysadmin
supposed to know better?

--
Arjan van de Ven        Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-19  8:08 ` Arjan van de Ven
@ 2009-12-19 13:37   ` Steve Rago
  0 siblings, 0 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-19 13:37 UTC (permalink / raw)
To: Arjan van de Ven
Cc: Ingo Molnar, Peter Zijlstra, linux-nfs, linux-kernel,
    Trond.Myklebust, Wu Fengguang, jens.axboe

On Sat, 2009-12-19 at 09:08 +0100, Arjan van de Ven wrote:
> if you as the expert do not know how to tune this... how is a sysadmin
> supposed to know better?

I did not say I didn't know how to tune it. I said you put the tunable
parameter in as a compromise to doing something far more complex. You
then adjust the value according to various workloads (in this case,
iozone or something that more closely resembles your application). The
same way I figure out how many NFSD processes to configure; the same
way I figure out acceptable values for dirty_ratio and
dirty_background_ratio. The code has a reasonably conservative default,
and people can adjust it if their circumstances differ such that the
default doesn't provide acceptable results.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-18 19:33     ` Steve Rago
  2009-12-18 19:41       ` Ingo Molnar
@ 2009-12-18 19:44       ` Peter Zijlstra
  1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2009-12-18 19:44 UTC (permalink / raw)
  To: Steve Rago
  Cc: linux-nfs, linux-kernel, Trond.Myklebust, Wu Fengguang, jens.axboe

On Fri, 2009-12-18 at 14:33 -0500, Steve Rago wrote:

> > > +void nfs_wait_woutstanding(struct inode *inode)
> > > +{
> > > +	if (nfs_max_woutstanding != 0) {
> > > +		unsigned long background_thresh;
> > > +		unsigned long dirty_thresh;
> > > +		long npages;
> > > +		struct nfs_inode *nfsi = NFS_I(inode);
> > > +
> > > +		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
> > > +		npages = global_page_state(NR_FILE_DIRTY) +
> > > +			global_page_state(NR_UNSTABLE_NFS) +
> > > +			global_page_state(NR_WRITEBACK);
> > > +		if (npages >= background_thresh)
> > > +			wait_event(nfsi->writes_wq,
> > > +				atomic_read(&nfsi->writes) < nfs_max_woutstanding);
> > > +	}
> > > +}
> >
> > This looks utterly busted too, why the global state and not the nfs
> > client's bdi state?
> >
> > Also, why create this extra workqueue and not simply use the congestion
> > interface that is present? If the congestion stuff doesn't work for you,
> > fix that, don't add extra muck like this.
>
> Pages are a global resource.  Once we hit the dirty_threshold, the
> system is going to work harder to flush the pages out.  This code
> prevents the caller from creating more dirty pages in this state,
> thereby making matters worse, when eager writeback is enabled.

You misunderstand: dirty limits are per-BDI, so all those npages might be
for !NFS traffic, in which case forcing the NFS client into sync mode might
be the wrong thing to do.  The dirty pages are no longer a global resource
in the current Linux tree.

> This wait queue is used for different purposes than the congestion_wait
> interface.  Here we are preventing the caller from proceeding if there
> are too many NFS writes outstanding for this thread and we are in a
> memory pressure state.  It has nothing to do with the state of the bdi
> congestion.

I'm thinking it ought to; congestion is exactly that: the device has
backed up and needs to get moving.

^ permalink raw reply	[flat|nested] 96+ messages in thread
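A minimal sketch of the per-BDI alternative being suggested here (not the posted
patch's code): throttle against the NFS client's own backing device through the
existing congestion interface, instead of global page counters plus a private
wait queue. Both bdi_write_congested() and congestion_wait() exist in 2.6.32:

	/* assumes <linux/backing-dev.h>; function name reused for comparison */
	static void nfs_wait_woutstanding(struct inode *inode)
	{
		struct backing_dev_info *bdi = &NFS_SERVER(inode)->backing_dev_info;

		/* back off while this mount's own device is write-congested */
		while (bdi_write_congested(bdi))
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

Note that nfs_congestion_kb already drives set_bdi_congested()/
clear_bdi_congested() in fs/nfs/write.c, which is the interface being referred
to, so the congestion state comes for free.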
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-17 8:17 ` Peter Zijlstra 2009-12-18 19:33 ` Steve Rago @ 2009-12-19 12:20 ` Wu Fengguang 2009-12-19 14:25 ` Steve Rago 1 sibling, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2009-12-19 12:20 UTC (permalink / raw) To: Peter Zijlstra Cc: Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach Hi Steve, // I should really read the NFS code, but maybe you can help us better // understand the problem :) On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote: > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote: > > Eager Writeback for NFS Clients > > ------------------------------- > > Prevent applications that write large sequential streams of data (like backup, for example) > > from entering into a memory pressure state, which degrades performance by falling back to > > synchronous operations (both synchronous writes and additional commits). What exactly is the "memory pressure state" condition? What's the code to do the "synchronous writes and additional commits" and maybe how they are triggered? > > This is accomplished by preventing the client application from > > dirtying pages faster than they can be written to the server: > > clients write pages eagerly instead of lazily. We already have the balance_dirty_pages() based global throttling. So what makes the performance difference in your proposed "per-inode" throttling? balance_dirty_pages() does have much larger threshold than yours. > > The eager writeback is controlled by a sysctl: fs.nfs.nfs_max_woutstanding set to 0 disables > > the feature. Otherwise it contains the maximum number of outstanding NFS writes that can be > > in flight for a given file. This is used to block the application from dirtying more pages > > until the writes are complete. What if we do heuristic write-behind for sequential NFS writes? Another related proposal from Peter Staubach is to start async writeback (without the throttle in your proposal) when one inode have enough pages dirtied: Another approach that I suggested was to keep track of the number of pages which are dirty on a per-inode basis. When enough pages are dirty to fill an over the wire transfer, then schedule an asynchronous write to transmit that data to the server. This ties in with support to ensure that the server/network is not completely overwhelmed by the client by flow controlling the writing application to better match the bandwidth and latencies of the network and server. With this support, the NFS client tends not to fill memory with dirty pages and thus, does not depend upon the other parts of the system to flush these pages. Can the above alternatives fix the same problem? (or perhaps, is the per-inode throttling really necessary?) > > This patch is based heavily (okay, almost entirely) on a prior patch by Peter Staubach. For > > the original patch, see http://article.gmane.org/gmane.linux.nfs/24323. > > > > The patch below applies to linux-2.6.32-rc7, but it should apply cleanly to vanilla linux-2.6.32. > > > > Performance data and tuning notes can be found on my web site (http://www.nec-labs.com/~sar). > > With iozone, I see about 50% improvement for large sequential write workloads over a 1Gb Ethernet. > > With an in-house micro-benchmark, I see 80% improvement for large, single-stream, sequential > > workloads (where "large" is defined to be greater than the memory size on the client). 
These are impressive numbers.  I wonder what the minimal patch would be
(just hacking it to be fast, without all the aux bits)?  Is it this chunk
that calls nfs_wb_eager()?

> > @@ -623,10 +635,21 @@ static ssize_t nfs_file_write(struct kio
> >  	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
> >  	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
> >  	/* Return error values for O_SYNC and IS_SYNC() */
> > -	if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
> > -		int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode);
> > -		if (err < 0)
> > -			result = err;
> > +	if (result >= 0) {
> > +		if (nfs_need_sync_write(iocb->ki_filp, inode)) {
> > +			int err;
> > +
> > +			err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp),
> > +					inode);
> > +			if (err < 0)
> > +				result = err;
> > +		} else if (nfs_max_woutstanding != 0 &&
> > +			 nfs_is_seqwrite(inode, pos) &&
> > +			 atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages) {
> > +			nfs_wb_eager(inode);
> > +		}
> > +		if (result > 0)
> > +			nfsi->wrpos = pos + result;
> >  	}

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
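Stripped of the sync-write handling, the chunk quoted above reduces to this
heuristic (a sketch using the names from the posted patch):

	/* after a successful buffered write: if we are writing sequentially
	 * and at least one full over-the-wire transfer (wpages) worth of
	 * pages is dirty, start asynchronous writeback now instead of
	 * waiting for pdflush to do it (and to send a COMMIT) */
	if (nfs_max_woutstanding != 0 &&
	    nfs_is_seqwrite(inode, pos) &&
	    atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages)
		nfs_wb_eager(inode);

	nfsi->wrpos = pos + result;	/* remember EOF for the sequential test */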
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-19 12:20 ` Wu Fengguang @ 2009-12-19 14:25 ` Steve Rago 2009-12-22 1:59 ` Wu Fengguang 2009-12-22 12:25 ` Jan Kara 0 siblings, 2 replies; 96+ messages in thread From: Steve Rago @ 2009-12-19 14:25 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote: > Hi Steve, > > // I should really read the NFS code, but maybe you can help us better > // understand the problem :) > > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote: > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote: > > > Eager Writeback for NFS Clients > > > ------------------------------- > > > Prevent applications that write large sequential streams of data (like backup, for example) > > > from entering into a memory pressure state, which degrades performance by falling back to > > > synchronous operations (both synchronous writes and additional commits). > > What exactly is the "memory pressure state" condition? What's the > code to do the "synchronous writes and additional commits" and maybe > how they are triggered? Memory pressure occurs when most of the client pages have been dirtied by an application (think backup server writing multi-gigabyte files that exceed the size of main memory). The system works harder to be able to free dirty pages so that they can be reused. For a local file system, this means writing the pages to disk. For NFS, however, the writes leave the pages in an "unstable" state until the server responds to a commit request. Generally speaking, commit processing is far more expensive than write processing on the server; both are done with the inode locked, but since the commit takes so long, all writes are blocked, which stalls the pipeline. Flushing is generated from the flush kernel threads (nee pdflush) and from the applications themselves (balance_dirty_pages), as well as periodic sync (kupdate style). This is roughly controlled by adjusting dirty_ratio and dirty_background_ratio (along with dirty_expire_centisecs and dirty_writeback_centisecs). In addition, when the client system *really* needs a page deep down in the page allocator, it can generate a synchronous write request of individual pages. This is just about as expensive as a commit, roughly speaking, again stalling the pipeline. > > > > This is accomplished by preventing the client application from > > > dirtying pages faster than they can be written to the server: > > > clients write pages eagerly instead of lazily. > > We already have the balance_dirty_pages() based global throttling. > So what makes the performance difference in your proposed "per-inode" throttling? > balance_dirty_pages() does have much larger threshold than yours. I originally spent several months playing with the balance_dirty_pages algorithm. The main drawback is that it affects more than the inodes that the caller is writing and that the control of what to do is too coarse. My final changes (which worked well for 1Gb connections) were more heuristic than the changes in the patch -- I basically had to come up with alternate ways to write pages without generating commits on inodes. Doing this was distasteful, as I was adjusting generic system behavior for an NFS-only problem. 
Then a colleague found Peter Staubach's patch, which worked just as well in less code, and isolated the change to the NFS component, which is where it belongs. > > > > The eager writeback is controlled by a sysctl: fs.nfs.nfs_max_woutstanding set to 0 disables > > > the feature. Otherwise it contains the maximum number of outstanding NFS writes that can be > > > in flight for a given file. This is used to block the application from dirtying more pages > > > until the writes are complete. > > What if we do heuristic write-behind for sequential NFS writes? Part of the patch does implement a heuristic write-behind. See where nfs_wb_eager() is called. > > Another related proposal from Peter Staubach is to start async writeback > (without the throttle in your proposal) when one inode have enough pages > dirtied: > > Another approach that I suggested was to keep track of the > number of pages which are dirty on a per-inode basis. When > enough pages are dirty to fill an over the wire transfer, > then schedule an asynchronous write to transmit that data to > the server. This ties in with support to ensure that the > server/network is not completely overwhelmed by the client > by flow controlling the writing application to better match > the bandwidth and latencies of the network and server. > With this support, the NFS client tends not to fill memory > with dirty pages and thus, does not depend upon the other > parts of the system to flush these pages. > > Can the above alternatives fix the same problem? (or perhaps, is the > per-inode throttling really necessary?) This alternative *is contained in* the patch (as this is mostly Peter's code anyway; all I've done is the analysis and porting). The throttling is necessary to prevent the client from continuing to fill all of its memory with dirty pages, which it can always do faster than it can write pages to the server. > > > > This patch is based heavily (okay, almost entirely) on a prior patch by Peter Staubach. For > > > the original patch, see http://article.gmane.org/gmane.linux.nfs/24323. > > > > > > The patch below applies to linux-2.6.32-rc7, but it should apply cleanly to vanilla linux-2.6.32. > > > > > > Performance data and tuning notes can be found on my web site (http://www.nec-labs.com/~sar). > > > With iozone, I see about 50% improvement for large sequential write workloads over a 1Gb Ethernet. > > > With an in-house micro-benchmark, I see 80% improvement for large, single-stream, sequential > > > workloads (where "large" is defined to be greater than the memory size on the client). > > These are impressive numbers. I wonder what would be the minimal patch > (just hacking it to fast, without all the aux bits)? Is it this chunk > to call nfs_wb_eager()? The first half is the same as before, with different indentation. The last half is indeed the heuristic to call nfs_wb_eager() to invoke asynchronous write-behind. With these changes, the number of NFS commit messages drops from a few thousands to tens when writing a 32GB file over NFS. This is mainly because the server is writing dirty pages from its cache in the background, so when a commit comes along, it has less work to do (as opposed to writing all of the pages on demand and then waiting for the commit). I have a second set of changes, which I have not yet submitted, that removes these commits, but it extends the protocol (in a backward-compatible way), which will probably be a harder sell. 
Steve > > > > @@ -623,10 +635,21 @@ static ssize_t nfs_file_write(struct kio > > > nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); > > > result = generic_file_aio_write(iocb, iov, nr_segs, pos); > > > /* Return error values for O_SYNC and IS_SYNC() */ > > > - if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { > > > - int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); > > > - if (err < 0) > > > - result = err; > > > + if (result >= 0) { > > > + if (nfs_need_sync_write(iocb->ki_filp, inode)) { > > > + int err; > > > + > > > + err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), > > > + inode); > > > + if (err < 0) > > > + result = err; > > > + } else if (nfs_max_woutstanding != 0 && > > > + nfs_is_seqwrite(inode, pos) && > > > + atomic_read(&nfsi->ndirty) >= NFS_SERVER(inode)->wpages) { > > > + nfs_wb_eager(inode); > > > + } > > > + if (result > 0) > > > + nfsi->wrpos = pos + result; > > > } > > Thanks, > Fengguang > ^ permalink raw reply [flat|nested] 96+ messages in thread
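A sanity check on those commit counts (using the one-COMMIT-per-4MB pdflush
behavior described elsewhere in this thread): flushing a 32 GB file in 4 MB
writeback chunks yields 32 GB / 4 MB = 8192 COMMITs, i.e. "a few thousands",
whereas write-behind that lets the server clean pages in the background can get
away with tens.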
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-19 14:25 ` Steve Rago @ 2009-12-22 1:59 ` Wu Fengguang 2009-12-22 12:35 ` Jan Kara ` (3 more replies) 2009-12-22 12:25 ` Jan Kara 1 sibling, 4 replies; 96+ messages in thread From: Wu Fengguang @ 2009-12-22 1:59 UTC (permalink / raw) To: Steve Rago Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven, Ingo Molnar, linux-fsdevel Steve, On Sat, Dec 19, 2009 at 10:25:47PM +0800, Steve Rago wrote: > > On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote: > > > > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote: > > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote: > > > > Eager Writeback for NFS Clients > > > > ------------------------------- > > > > Prevent applications that write large sequential streams of data (like backup, for example) > > > > from entering into a memory pressure state, which degrades performance by falling back to > > > > synchronous operations (both synchronous writes and additional commits). > > > > What exactly is the "memory pressure state" condition? What's the > > code to do the "synchronous writes and additional commits" and maybe > > how they are triggered? > > Memory pressure occurs when most of the client pages have been dirtied > by an application (think backup server writing multi-gigabyte files that > exceed the size of main memory). The system works harder to be able to > free dirty pages so that they can be reused. For a local file system, > this means writing the pages to disk. For NFS, however, the writes > leave the pages in an "unstable" state until the server responds to a > commit request. Generally speaking, commit processing is far more > expensive than write processing on the server; both are done with the > inode locked, but since the commit takes so long, all writes are > blocked, which stalls the pipeline. Let me try reiterate the problem with code, please correct me if I'm wrong. 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS will set the flag for any pages written -- why this trick? To guarantee the call of nfs_commit_inode()? Which unfortunately turns almost every server side NFS write into sync writes.. writeback_single_inode: do_writepages nfs_writepages nfs_writepage ----[short time later]---> nfs_writeback_release* nfs_mark_request_commit __mark_inode_dirty(I_DIRTY_DATASYNC); if (I_DIRTY_SYNC || I_DIRTY_DATASYNC) <---- so this will be true for most time write_inode nfs_write_inode nfs_commit_inode 2) NFS commit stops pipeline because it sleep&wait inside i_mutex, which blocks all other NFSDs trying to write/writeback the inode. nfsd_sync: take i_mutex filemap_fdatawrite filemap_fdatawait drop i_mutex If filemap_fdatawait() can be moved out of i_mutex (or just remove the lock), we solve the root problem: nfsd_sync: [take i_mutex] filemap_fdatawrite => can also be blocked, but less a problem [drop i_mutex] filemap_fdatawait Maybe it's a dumb question, but what's the purpose of i_mutex here? For correctness or to prevent livelock? I can imagine some livelock problem here (current implementation can easily wait for extra pages), however not too hard to fix. 
The proposed patch essentially takes two actions in nfs_file_write() - to start writeback when the per-file nr_dirty goes high without committing - to throttle dirtying when the per-file nr_writeback goes high I guess this effectively prevents pdflush from kicking in with its bad committing behavior In general it's reasonable to keep NFS per-file nr_dirty low, however questionable to do per-file nr_writeback throttling. This does not work well with the global limits - eg. when there are many dirty files, the summed-up nr_writeback will still grow out of control. And it's more likely to impact user visible responsiveness than a global limit. But my opinion can be biased -- me have a patch to do global NFS nr_writeback limit ;) Thanks, Fengguang ^ permalink raw reply [flat|nested] 96+ messages in thread
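The locking change floated in this message, in code form (a sketch only -- it
ignores the ->fsync method call, and whether dropping the wait out from under
i_mutex is safe for nfsd is exactly the open question):

	static int nfsd_sync(struct file *filp)
	{
		struct inode *inode = filp->f_path.dentry->d_inode;
		int err;

		mutex_lock(&inode->i_mutex);
		err = filemap_fdatawrite(inode->i_mapping);	/* start the I/O */
		mutex_unlock(&inode->i_mutex);

		if (err == 0)
			err = filemap_fdatawait(inode->i_mapping); /* wait unlocked */
		return err;
	}

With this shape, other nfsd threads can take i_mutex and keep feeding writes to
the inode while the committing thread merely waits for in-flight pages.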
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-22  1:59             ` Wu Fengguang
@ 2009-12-22 12:35               ` Jan Kara
  [not found]                 ` <20091222123538.GB604-jyMamyUUXNJG4ohzP4jBZS1Fcj925eT/@public.gmane.org>
  2009-12-24  1:26                 ` Wu Fengguang
  2009-12-22 13:01               ` Martin Knoblauch
                                 ` (2 subsequent siblings)
  3 siblings, 2 replies; 96+ messages in thread
From: Jan Kara @ 2009-12-22 12:35 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
	jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven,
	Ingo Molnar, linux-fsdevel

> 2) NFS commit stops pipeline because it sleep&wait inside i_mutex,
> which blocks all other NFSDs trying to write/writeback the inode.
>
>         nfsd_sync:
>           take i_mutex
>             filemap_fdatawrite
>             filemap_fdatawait
>           drop i_mutex
  I believe this is unrelated to the problem Steve is trying to solve.
Once we get to doing sync writes, performance is busted, so we'd better
not get there in the first place (unless the user asked for it, of
course).

> If filemap_fdatawait() can be moved out of i_mutex (or just remove
> the lock), we solve the root problem:
>
>         nfsd_sync:
>           [take i_mutex]
>             filemap_fdatawrite  => can also be blocked, but less a problem
>           [drop i_mutex]
>             filemap_fdatawait
>
> Maybe it's a dumb question, but what's the purpose of i_mutex here?
> For correctness or to prevent livelock? I can imagine some livelock
> problem here (current implementation can easily wait for extra
> pages), however not too hard to fix.
  Generally, most filesystems take i_mutex during fsync to
    a) avoid all sorts of livelocking problems
    b) serialize fsyncs for one inode (mostly for simplicity)
  I don't see what advantage would it bring that we get rid of i_mutex
for fdatawait - only that maybe writers could proceed while we are
waiting but is that really the problem?

								Honza
--
Jan Kara <jack@suse.cz>
SuSE CR Labs

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads
       [not found]                 ` <20091222123538.GB604-jyMamyUUXNJG4ohzP4jBZS1Fcj925eT/@public.gmane.org>
@ 2009-12-23  8:43                   ` Christoph Hellwig
  2009-12-23 13:32                     ` Jan Kara
  0 siblings, 1 reply; 96+ messages in thread
From: Christoph Hellwig @ 2009-12-23 8:43 UTC (permalink / raw)
  To: Jan Kara
  Cc: Wu Fengguang, Steve Rago, Peter Zijlstra,
	linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
	Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel

On Tue, Dec 22, 2009 at 01:35:39PM +0100, Jan Kara wrote:
> > nfsd_sync:
> >   [take i_mutex]
> >     filemap_fdatawrite => can also be blocked, but less a problem
> >   [drop i_mutex]
> >     filemap_fdatawait
> >
> > Maybe it's a dumb question, but what's the purpose of i_mutex here?
> > For correctness or to prevent livelock? I can imagine some livelock
> > problem here (current implementation can easily wait for extra
> > pages), however not too hard to fix.
>   Generally, most filesystems take i_mutex during fsync to
>     a) avoid all sorts of livelocking problems
>     b) serialize fsyncs for one inode (mostly for simplicity)
>   I don't see what advantage would it bring that we get rid of i_mutex
> for fdatawait - only that maybe writers could proceed while we are
> waiting but is that really the problem?

It would match what we do in vfs_fsync for the non-nfsd path, so it's
a no-brainer to do it.  In fact I did switch it over to vfs_fsync a
while ago, but that got reverted because it caused deadlocks for
nfsd_sync_dir, which for some reason can't take the i_mutex (I'd have
to check the archives why).

Here's an RFC patch to make some more sense of the fsync callers in nfsd,
including fixing up the data write/wait calling conventions to match the
regular fsync path (which might make this a -stable candidate):

Index: linux-2.6/fs/nfsd/vfs.c
===================================================================
--- linux-2.6.orig/fs/nfsd/vfs.c	2009-12-23 09:32:45.693170043 +0100
+++ linux-2.6/fs/nfsd/vfs.c	2009-12-23 09:39:47.627170082 +0100
@@ -769,45 +769,27 @@ nfsd_close(struct file *filp)
 }
 
 /*
- * Sync a file
- * As this calls fsync (not fdatasync) there is no need for a write_inode
- * after it.
+ * Sync a directory to disk.
+ * + * This is odd compared to all other fsync callers because we + * + * a) do not have a file struct available + * b) expect to have i_mutex already held by the caller */ -static inline int nfsd_dosync(struct file *filp, struct dentry *dp, - const struct file_operations *fop) +int +nfsd_sync_dir(struct dentry *dentry) { - struct inode *inode = dp->d_inode; - int (*fsync) (struct file *, struct dentry *, int); + struct inode *inode = dentry->d_inode; int err; - err = filemap_fdatawrite(inode->i_mapping); - if (err == 0 && fop && (fsync = fop->fsync)) - err = fsync(filp, dp, 0); - if (err == 0) - err = filemap_fdatawait(inode->i_mapping); + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + err = filemap_write_and_wait(inode->i_mapping); + if (err == 0 && inode->i_fop->fsync) + err = inode->i_fop->fsync(NULL, dentry, 0); return err; } -static int -nfsd_sync(struct file *filp) -{ - int err; - struct inode *inode = filp->f_path.dentry->d_inode; - dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); - mutex_lock(&inode->i_mutex); - err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); - mutex_unlock(&inode->i_mutex); - - return err; -} - -int -nfsd_sync_dir(struct dentry *dp) -{ - return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); -} - /* * Obtain the readahead parameters for the file * specified by (dev, ino). @@ -1011,7 +993,7 @@ static int wait_for_concurrent_writes(st if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", task_pid_nr(current)); - err = nfsd_sync(file); + err = vfs_fsync(file, file->f_path.dentry, 0); } last_ino = inode->i_ino; last_dev = inode->i_sb->s_dev; @@ -1180,7 +1162,7 @@ nfsd_commit(struct svc_rqst *rqstp, stru return err; if (EX_ISSYNC(fhp->fh_export)) { if (file->f_op && file->f_op->fsync) { - err = nfserrno(nfsd_sync(file)); + err = nfserrno(vfs_fsync(file, file->f_path.dentry, 0)); } else { err = nfserr_notsupp; } ^ permalink raw reply [flat|nested] 96+ messages in thread
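For comparison, the regular fsync path this patch converges on looks roughly
like the following (a simplified sketch from memory of 2.6.32; the real code
goes through vfs_fsync_range(), checks that a ->fsync method exists, and
returns -EINVAL otherwise). The point is that the data write and wait happen
outside i_mutex, and only the ->fsync method runs under it:

	int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
	{
		struct inode *inode = dentry->d_inode;
		int err, ret;

		/* data writeback and wait, without i_mutex */
		ret = filemap_write_and_wait(inode->i_mapping);

		/* only the metadata/->fsync part is serialized */
		mutex_lock(&inode->i_mutex);
		err = file->f_op->fsync(file, dentry, datasync);
		if (!ret)
			ret = err;
		mutex_unlock(&inode->i_mutex);

		return ret;
	}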
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-23 8:43 ` Christoph Hellwig @ 2009-12-23 13:32 ` Jan Kara [not found] ` <20091223133244.GB3159-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org> 0 siblings, 1 reply; 96+ messages in thread From: Jan Kara @ 2009-12-23 13:32 UTC (permalink / raw) To: Christoph Hellwig Cc: Jan Kara, Wu Fengguang, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel On Wed 23-12-09 03:43:02, Christoph Hellwig wrote: > On Tue, Dec 22, 2009 at 01:35:39PM +0100, Jan Kara wrote: > > > nfsd_sync: > > > [take i_mutex] > > > filemap_fdatawrite => can also be blocked, but less a problem > > > [drop i_mutex] > > > filemap_fdatawait > > > > > > Maybe it's a dumb question, but what's the purpose of i_mutex here? > > > For correctness or to prevent livelock? I can imagine some livelock > > > problem here (current implementation can easily wait for extra > > > pages), however not too hard to fix. > > Generally, most filesystems take i_mutex during fsync to > > a) avoid all sorts of livelocking problems > > b) serialize fsyncs for one inode (mostly for simplicity) > > I don't see what advantage would it bring that we get rid of i_mutex > > for fdatawait - only that maybe writers could proceed while we are > > waiting but is that really the problem? > > It would match what we do in vfs_fsync for the non-nfsd path, so it's > a no-brainer to do it. In fact I did switch it over to vfs_fsync a > while ago but that go reverted because it caused deadlocks for > nfsd_sync_dir which for some reason can't take the i_mutex (I'd have to > check the archives why). > > Here's a RFC patch to make some more sense of the fsync callers in nfsd, > including fixing up the data write/wait calling conventions to match the > regular fsync path (which might make this a -stable candidate): The patch looks good to me from general soundness point of view :). Someone with more NFS knowledge should tell whether dropping i_mutex for fdatawrite_and_wait is fine for NFS. Honza > Index: linux-2.6/fs/nfsd/vfs.c > =================================================================== > --- linux-2.6.orig/fs/nfsd/vfs.c 2009-12-23 09:32:45.693170043 +0100 > +++ linux-2.6/fs/nfsd/vfs.c 2009-12-23 09:39:47.627170082 +0100 > @@ -769,45 +769,27 @@ nfsd_close(struct file *filp) > } > > /* > - * Sync a file > - * As this calls fsync (not fdatasync) there is no need for a write_inode > - * after it. > + * Sync a directory to disk. 
> + * > + * This is odd compared to all other fsync callers because we > + * > + * a) do not have a file struct available > + * b) expect to have i_mutex already held by the caller > */ > -static inline int nfsd_dosync(struct file *filp, struct dentry *dp, > - const struct file_operations *fop) > +int > +nfsd_sync_dir(struct dentry *dentry) > { > - struct inode *inode = dp->d_inode; > - int (*fsync) (struct file *, struct dentry *, int); > + struct inode *inode = dentry->d_inode; > int err; > > - err = filemap_fdatawrite(inode->i_mapping); > - if (err == 0 && fop && (fsync = fop->fsync)) > - err = fsync(filp, dp, 0); > - if (err == 0) > - err = filemap_fdatawait(inode->i_mapping); > + WARN_ON(!mutex_is_locked(&inode->i_mutex)); > > + err = filemap_write_and_wait(inode->i_mapping); > + if (err == 0 && inode->i_fop->fsync) > + err = inode->i_fop->fsync(NULL, dentry, 0); > return err; > } > > -static int > -nfsd_sync(struct file *filp) > -{ > - int err; > - struct inode *inode = filp->f_path.dentry->d_inode; > - dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); > - mutex_lock(&inode->i_mutex); > - err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); > - mutex_unlock(&inode->i_mutex); > - > - return err; > -} > - > -int > -nfsd_sync_dir(struct dentry *dp) > -{ > - return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); > -} > - > /* > * Obtain the readahead parameters for the file > * specified by (dev, ino). > @@ -1011,7 +993,7 @@ static int wait_for_concurrent_writes(st > > if (inode->i_state & I_DIRTY) { > dprintk("nfsd: write sync %d\n", task_pid_nr(current)); > - err = nfsd_sync(file); > + err = vfs_fsync(file, file->f_path.dentry, 0); > } > last_ino = inode->i_ino; > last_dev = inode->i_sb->s_dev; > @@ -1180,7 +1162,7 @@ nfsd_commit(struct svc_rqst *rqstp, stru > return err; > if (EX_ISSYNC(fhp->fh_export)) { > if (file->f_op && file->f_op->fsync) { > - err = nfserrno(nfsd_sync(file)); > + err = nfserrno(vfs_fsync(file, file->f_path.dentry, 0)); > } else { > err = nfserr_notsupp; > } -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads [not found] ` <20091223133244.GB3159-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org> @ 2009-12-24 5:25 ` Wu Fengguang 0 siblings, 0 replies; 96+ messages in thread From: Wu Fengguang @ 2009-12-24 5:25 UTC (permalink / raw) To: Jan Kara Cc: Christoph Hellwig, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, Dec 23, 2009 at 09:32:44PM +0800, Jan Kara wrote: > On Wed 23-12-09 03:43:02, Christoph Hellwig wrote: > > On Tue, Dec 22, 2009 at 01:35:39PM +0100, Jan Kara wrote: > > > > nfsd_sync: > > > > [take i_mutex] > > > > filemap_fdatawrite => can also be blocked, but less a problem > > > > [drop i_mutex] > > > > filemap_fdatawait > > > > > > > > Maybe it's a dumb question, but what's the purpose of i_mutex here? > > > > For correctness or to prevent livelock? I can imagine some livelock > > > > problem here (current implementation can easily wait for extra > > > > pages), however not too hard to fix. > > > Generally, most filesystems take i_mutex during fsync to > > > a) avoid all sorts of livelocking problems > > > b) serialize fsyncs for one inode (mostly for simplicity) > > > I don't see what advantage would it bring that we get rid of i_mutex > > > for fdatawait - only that maybe writers could proceed while we are > > > waiting but is that really the problem? > > > > It would match what we do in vfs_fsync for the non-nfsd path, so it's > > a no-brainer to do it. In fact I did switch it over to vfs_fsync a > > while ago but that go reverted because it caused deadlocks for > > nfsd_sync_dir which for some reason can't take the i_mutex (I'd have to > > check the archives why). > > > > Here's a RFC patch to make some more sense of the fsync callers in nfsd, > > including fixing up the data write/wait calling conventions to match the > > regular fsync path (which might make this a -stable candidate): > The patch looks good to me from general soundness point of view :). > Someone with more NFS knowledge should tell whether dropping i_mutex for > fdatawrite_and_wait is fine for NFS. I believe it's safe to drop i_mutex for fdatawrite_and_wait(). Because NFS 1) client: collect all unstable pages (which server ACKed that have reach its page cache) 2) client: send COMMIT 3) server: fdatawrite_and_wait(), which makes sure pages in 1) get cleaned 4) client: put all pages collected in 1) to clean state So there's no need to take i_mutex to prevent concurrent write/commits. If someone else concurrently truncate and then extend i_size, the NFS verf will be changed and thus client will resend the pages? (whether it should overwrite the pages is another problem..) Thanks, Fengguang > > > Index: linux-2.6/fs/nfsd/vfs.c > > =================================================================== > > --- linux-2.6.orig/fs/nfsd/vfs.c 2009-12-23 09:32:45.693170043 +0100 > > +++ linux-2.6/fs/nfsd/vfs.c 2009-12-23 09:39:47.627170082 +0100 > > @@ -769,45 +769,27 @@ nfsd_close(struct file *filp) > > } > > > > /* > > - * Sync a file > > - * As this calls fsync (not fdatasync) there is no need for a write_inode > > - * after it. > > + * Sync a directory to disk. 
> > + * > > + * This is odd compared to all other fsync callers because we > > + * > > + * a) do not have a file struct available > > + * b) expect to have i_mutex already held by the caller > > */ > > -static inline int nfsd_dosync(struct file *filp, struct dentry *dp, > > - const struct file_operations *fop) > > +int > > +nfsd_sync_dir(struct dentry *dentry) > > { > > - struct inode *inode = dp->d_inode; > > - int (*fsync) (struct file *, struct dentry *, int); > > + struct inode *inode = dentry->d_inode; > > int err; > > > > - err = filemap_fdatawrite(inode->i_mapping); > > - if (err == 0 && fop && (fsync = fop->fsync)) > > - err = fsync(filp, dp, 0); > > - if (err == 0) > > - err = filemap_fdatawait(inode->i_mapping); > > + WARN_ON(!mutex_is_locked(&inode->i_mutex)); > > > > + err = filemap_write_and_wait(inode->i_mapping); > > + if (err == 0 && inode->i_fop->fsync) > > + err = inode->i_fop->fsync(NULL, dentry, 0); > > return err; > > } > > > > -static int > > -nfsd_sync(struct file *filp) > > -{ > > - int err; > > - struct inode *inode = filp->f_path.dentry->d_inode; > > - dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); > > - mutex_lock(&inode->i_mutex); > > - err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); > > - mutex_unlock(&inode->i_mutex); > > - > > - return err; > > -} > > - > > -int > > -nfsd_sync_dir(struct dentry *dp) > > -{ > > - return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); > > -} > > - > > /* > > * Obtain the readahead parameters for the file > > * specified by (dev, ino). > > @@ -1011,7 +993,7 @@ static int wait_for_concurrent_writes(st > > > > if (inode->i_state & I_DIRTY) { > > dprintk("nfsd: write sync %d\n", task_pid_nr(current)); > > - err = nfsd_sync(file); > > + err = vfs_fsync(file, file->f_path.dentry, 0); > > } > > last_ino = inode->i_ino; > > last_dev = inode->i_sb->s_dev; > > @@ -1180,7 +1162,7 @@ nfsd_commit(struct svc_rqst *rqstp, stru > > return err; > > if (EX_ISSYNC(fhp->fh_export)) { > > if (file->f_op && file->f_op->fsync) { > > - err = nfserrno(nfsd_sync(file)); > > + err = nfserrno(vfs_fsync(file, file->f_path.dentry, 0)); > > } else { > > err = nfserr_notsupp; > > } > -- > Jan Kara <jack@suse.cz> > SUSE Labs, CR ^ permalink raw reply [flat|nested] 96+ messages in thread
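The resend-on-verifier-change behavior alluded to above is, roughly, this check
on the client when the COMMIT reply arrives (a sketch modeled from memory on
the commit completion path around nfs_commit_done()/nfs_commit_release(); the
exact field names are approximate):

	/* on COMMIT completion, for each request covered by the commit: */
	if (memcmp(&req->wb_verf.verifier, &data->verf.verifier,
		   sizeof(data->verf.verifier)) == 0) {
		/* verifier matches the one from the WRITE: data is stable */
		nfs_inode_remove_request(req);
	} else {
		/* server lost the write (e.g. it rebooted): redirty and resend */
		nfs_mark_request_dirty(req);
	}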
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-22 12:35 ` Jan Kara [not found] ` <20091222123538.GB604-jyMamyUUXNJG4ohzP4jBZS1Fcj925eT/@public.gmane.org> @ 2009-12-24 1:26 ` Wu Fengguang 1 sibling, 0 replies; 96+ messages in thread From: Wu Fengguang @ 2009-12-24 1:26 UTC (permalink / raw) To: Jan Kara Cc: Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Tue, Dec 22, 2009 at 08:35:39PM +0800, Jan Kara wrote: > > 2) NFS commit stops pipeline because it sleep&wait inside i_mutex, > > which blocks all other NFSDs trying to write/writeback the inode. > > > > nfsd_sync: > > take i_mutex > > filemap_fdatawrite > > filemap_fdatawait > > drop i_mutex > I believe this is unrelated to the problem Steve is trying to solve. > When we get to doing sync writes the performance is busted so we better > shouldn't get to that (unless user asked for that of course). Yes, first priority is always to reduce the COMMITs and the number of writeback pages they submitted under WB_SYNC_ALL. And I guess the "increase write chunk beyond 128MB" patches can serve it well. The i_mutex should impact NFS write performance for single big copy in this way: pdflush submits many (4MB write, 1 commit) pairs, because the write and commit each will take i_mutex, it effectively limits the server side io queue depth to <=4MB: the next 4MB dirty data won't reach page cache until the previous 4MB is completely synced to disk. There are two kinds of inefficiency here: - the small queue depth - the interleaved use of CPU/DISK: loop { write 4MB => normally only CPU writeback 4MB => mostly disk } When writing many small dirty files _plus_ one big file, there will still be interleaved write/writeback: the 4MB write will be broken into 8 NFS writes with the default wsize=524288. So there may be one nfsd doing COMMIT, another 7 nfsd waiting for the big file's i_mutex. All 8 nfsd are "busy" and pipeline is destroyed. Just a possibility. > > If filemap_fdatawait() can be moved out of i_mutex (or just remove > > the lock), we solve the root problem: > > > > nfsd_sync: > > [take i_mutex] > > filemap_fdatawrite => can also be blocked, but less a problem > > [drop i_mutex] > > filemap_fdatawait > > > > Maybe it's a dumb question, but what's the purpose of i_mutex here? > > For correctness or to prevent livelock? I can imagine some livelock > > problem here (current implementation can easily wait for extra > > pages), however not too hard to fix. > Generally, most filesystems take i_mutex during fsync to > a) avoid all sorts of livelocking problems > b) serialize fsyncs for one inode (mostly for simplicity) > I don't see what advantage would it bring that we get rid of i_mutex > for fdatawait - only that maybe writers could proceed while we are > waiting but is that really the problem? The i_mutex at least has some performance impact. Another one would be the WB_SYNC_ALL. All are related to the COMMIT/sync write behavior. Are there some other _direct_ causes? Thanks, Fengguang ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-22 1:59 ` Wu Fengguang 2009-12-22 12:35 ` Jan Kara @ 2009-12-22 13:01 ` Martin Knoblauch [not found] ` <787373.9318.qm-rpBZDh8Qtqs5A34FEqDeB/u2YVrzzGjVVpNB7YpNyf8@public.gmane.org> 2009-12-22 16:41 ` Steve Rago 2009-12-23 14:21 ` Trond Myklebust 3 siblings, 1 reply; 96+ messages in thread From: Martin Knoblauch @ 2009-12-22 13:01 UTC (permalink / raw) To: Wu Fengguang, Steve Rago Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe ----- Original Message ---- > From: Wu Fengguang <fengguang.wu@intel.com> > To: Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org> > Cc: Peter Zijlstra <peterz@infradead.org>; "linux-nfs@vger.kernel.org" <linux-nfs@vger.kernel.org>; "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>; "Trond.Myklebust@netapp.com" <Trond.Myklebust@netapp.com>; jens.axboe <jens.axboe@oracle.com> > Sent: Tue, December 22, 2009 2:59:07 AM > Subject: Re: [PATCH] improve the performance of large sequential write NFS workloads > [big snip] > > In general it's reasonable to keep NFS per-file nr_dirty low, however > questionable to do per-file nr_writeback throttling. This does not > work well with the global limits - eg. when there are many dirty > files, the summed-up nr_writeback will still grow out of control. > And it's more likely to impact user visible responsiveness than > a global limit. But my opinion can be biased -- me have a patch to > do global NFS nr_writeback limit ;) > is that "NFS: introduce writeback wait queue", which you sent me a while ago and I did not test until now :-( ? Cheers Martin ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads
       [not found]             ` <787373.9318.qm-rpBZDh8Qtqs5A34FEqDeB/u2YVrzzGjVVpNB7YpNyf8@public.gmane.org>
@ 2009-12-24  1:46               ` Wu Fengguang
  0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2009-12-24 1:46 UTC (permalink / raw)
  To: Martin Knoblauch
  Cc: Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe

Hi Martin,

On Tue, Dec 22, 2009 at 09:01:46PM +0800, Martin Knoblauch wrote:
> ----- Original Message ----
>
> > From: Wu Fengguang <fengguang.wu@intel.com>
> > To: Steve Rago <sar-a+KepyhlMvJWk0Htik3J/w@public.gmane.org>
> > Cc: Peter Zijlstra <peterz@infradead.org>; "linux-nfs@vger.kernel.org" <linux-nfs@vger.kernel.org>; "linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>; "Trond.Myklebust@netapp.com" <Trond.Myklebust@netapp.com>; jens.axboe <jens.axboe@oracle.com>
> > Sent: Tue, December 22, 2009 2:59:07 AM
> > Subject: Re: [PATCH] improve the performance of large sequential write NFS workloads
>
> [big snip]
>
> > In general it's reasonable to keep NFS per-file nr_dirty low, however
> > questionable to do per-file nr_writeback throttling. This does not
> > work well with the global limits - eg. when there are many dirty
> > files, the summed-up nr_writeback will still grow out of control.
> > And it's more likely to impact user visible responsiveness than
> > a global limit. But my opinion can be biased -- me have a patch to
> > do global NFS nr_writeback limit ;)
>
> is that "NFS: introduce writeback wait queue", which you sent me a while
> ago and I did not test until now :-( ?

Yes it is - I've been puzzled by some bumpy NFS write problem, and the
information in this thread seems to be helpful for understanding it :)

Thanks,
Fengguang
---
NFS: introduce writeback wait queue

The generic writeback routines are departing from congestion_wait()
in preference of get_request_wait(), i.e. waiting on the block queues.

Introduce the missing writeback wait queue for NFS, otherwise its
writeback pages may grow out of control.

In particular, balance_dirty_pages() will exit after it pushes
write_chunk pages into the PG_writeback page pool, _OR_ when the
background writeback work quits. The latter is new behavior: it can
quit not only (normally) after dropping below the background threshold,
but also when it finds _zero_ dirty pages to write. The latter case
lets the number of PG_writeback pages grow out of control if it is not
explicitly limited.

CC: Jens Axboe <jens.axboe@oracle.com>
CC: Chris Mason <chris.mason@oracle.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Peter Staubach <staubach@redhat.com>
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---

The wait time and network throughput vary a lot! This is a major
problem: it means nfs_end_page_writeback() is not called smoothly over
time, even when there are plenty of PG_writeback pages on the client
side.
[ 397.828509] write_bandwidth: comm=nfsiod pages=192 time=16ms [ 397.850976] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.065244] write_bandwidth: comm=nfsiod pages=192 time=5212ms [ 403.549134] write_bandwidth: comm=nfsiod pages=1536 time=144ms [ 403.570717] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.595749] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.622171] write_bandwidth: comm=nfsiod pages=192 time=24ms [ 403.651779] write_bandwidth: comm=nfsiod pages=192 time=28ms [ 403.680543] write_bandwidth: comm=nfsiod pages=192 time=24ms [ 403.712572] write_bandwidth: comm=nfsiod pages=192 time=28ms [ 403.751552] write_bandwidth: comm=nfsiod pages=192 time=36ms [ 403.785979] write_bandwidth: comm=nfsiod pages=192 time=28ms [ 403.823995] write_bandwidth: comm=nfsiod pages=192 time=36ms [ 403.858970] write_bandwidth: comm=nfsiod pages=192 time=32ms [ 403.880786] write_bandwidth: comm=nfsiod pages=192 time=16ms [ 403.902732] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.925925] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 403.952044] write_bandwidth: comm=nfsiod pages=258 time=24ms [ 403.974006] write_bandwidth: comm=nfsiod pages=192 time=16ms [ 403.995989] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 405.031049] write_bandwidth: comm=nfsiod pages=192 time=1032ms [ 405.257635] write_bandwidth: comm=nfsiod pages=1536 time=192ms [ 405.279069] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 405.300843] write_bandwidth: comm=nfsiod pages=192 time=16ms [ 405.326031] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 405.350843] write_bandwidth: comm=nfsiod pages=192 time=24ms [ 405.375160] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 409.331015] write_bandwidth: comm=nfsiod pages=192 time=3952ms [ 409.587928] write_bandwidth: comm=nfsiod pages=1536 time=152ms [ 409.610068] write_bandwidth: comm=nfsiod pages=192 time=20ms [ 409.635736] write_bandwidth: comm=nfsiod pages=192 time=24ms # vmmon -d 1 nr_writeback nr_dirty nr_unstable nr_writeback nr_dirty nr_unstable 11227 41463 38044 11227 41463 38044 11227 41463 38044 11227 41463 38044 11045 53987 6490 11033 53120 8145 11195 52143 10886 11211 52144 10913 11211 52144 10913 11211 52144 10913 11056 56887 3876 11062 55298 8155 11214 54485 9838 11225 54461 9852 11225 54461 9852 11225 54461 4582 22342 35535 7823 ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- usr sys idl wai hiq siq| read writ| recv send| in out | int csw 0 0 9 92 0 0| 0 0 | 66B 306B| 0 0 |1003 377 0 1 39 60 0 1| 0 0 | 90k 1361k| 0 0 |1765 1599 0 15 12 43 0 31| 0 0 |2292k 34M| 0 0 | 12k 16k 0 0 16 84 0 0| 0 0 | 132B 306B| 0 0 |1003 376 0 0 43 57 0 0| 0 0 | 66B 306B| 0 0 |1004 376 0 7 25 55 0 13| 0 0 |1202k 18M| 0 0 |7331 8921 0 8 21 55 0 15| 0 0 |1195k 18M| 0 0 |5382 6579 0 0 38 62 0 0| 0 0 | 66B 306B| 0 0 |1002 371 0 0 33 67 0 0| 0 0 | 66B 306B| 0 0 |1003 376 0 14 20 41 0 24| 0 0 |1621k 24M| 0 0 |8549 10k 0 5 31 55 0 9| 0 0 | 769k 11M| 0 0 |4444 5180 0 0 18 82 0 0| 0 0 | 66B 568B| 0 0 |1004 377 0 1 41 54 0 3| 0 0 | 184k 2777k| 0 0 |2609 2619 1 13 22 43 0 22| 0 0 |1572k 23M| 0 0 |8138 10k 0 11 9 59 0 20| 0 0 |1861k 27M| 0 0 |9576 13k 0 5 23 66 0 5| 0 0 | 540k 8122k| 0 0 |2816 2885 fs/nfs/client.c | 2 fs/nfs/write.c | 92 +++++++++++++++++++++++++++++++----- include/linux/nfs_fs_sb.h | 1 3 files changed, 84 insertions(+), 11 deletions(-) --- linux.orig/fs/nfs/write.c 2009-11-06 09:52:23.000000000 +0800 +++ linux/fs/nfs/write.c 2009-11-06 09:52:27.000000000 +0800 @@ -187,11 +187,65 @@ static int 
wb_priority(struct writeback_ * NFS congestion control */ +#define NFS_WAIT_PAGES (1024L >> (PAGE_CACHE_SHIFT - 10)) int nfs_congestion_kb; -#define NFS_CONGESTION_ON_THRESH (nfs_congestion_kb >> (PAGE_SHIFT-10)) -#define NFS_CONGESTION_OFF_THRESH \ - (NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2)) +/* + * SYNC requests will block on (2*limit) and wakeup on (2*limit-NFS_WAIT_PAGES) + * ASYNC requests will block on (limit) and wakeup on (limit - NFS_WAIT_PAGES) + * In this way SYNC writes will never be blocked by ASYNC ones. + */ + +static void nfs_set_congested(long nr, long limit, + struct backing_dev_info *bdi) +{ + if (nr > limit && !test_bit(BDI_async_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_ASYNC); + else if (nr > 2 * limit && !test_bit(BDI_sync_congested, &bdi->state)) + set_bdi_congested(bdi, BLK_RW_SYNC); +} + +static void nfs_wait_contested(int is_sync, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + int waitbit = is_sync ? BDI_sync_congested : BDI_async_congested; + DEFINE_WAIT(wait); + + if (!test_bit(waitbit, &bdi->state)) + return; + + for (;;) { + prepare_to_wait(&wqh[is_sync], &wait, TASK_UNINTERRUPTIBLE); + if (!test_bit(waitbit, &bdi->state)) + break; + + io_schedule(); + } + finish_wait(&wqh[is_sync], &wait); +} + +static void nfs_wakeup_congested(long nr, long limit, + struct backing_dev_info *bdi, + wait_queue_head_t *wqh) +{ + if (nr < 2*limit - min(limit/8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_sync_congested, &bdi->state)) { + clear_bdi_congested(bdi, BLK_RW_SYNC); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(&wqh[BLK_RW_SYNC])) + wake_up(&wqh[BLK_RW_SYNC]); + } + if (nr < limit - min(limit/8, NFS_WAIT_PAGES)) { + if (test_bit(BDI_async_congested, &bdi->state)) { + clear_bdi_congested(bdi, BLK_RW_ASYNC); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(&wqh[BLK_RW_ASYNC])) + wake_up(&wqh[BLK_RW_ASYNC]); + } +} static int nfs_set_page_writeback(struct page *page) { @@ -201,11 +255,9 @@ static int nfs_set_page_writeback(struct struct inode *inode = page->mapping->host; struct nfs_server *nfss = NFS_SERVER(inode); - if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + nfs_set_congested(atomic_long_inc_return(&nfss->writeback), + nfs_congestion_kb >> (PAGE_SHIFT-10), + &nfss->backing_dev_info); } return ret; } @@ -216,8 +268,11 @@ static void nfs_end_page_writeback(struc struct nfs_server *nfss = NFS_SERVER(inode); end_page_writeback(page); - if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + + nfs_wakeup_congested(atomic_long_dec_return(&nfss->writeback), + nfs_congestion_kb >> (PAGE_SHIFT-10), + &nfss->backing_dev_info, + nfss->writeback_wait); } static struct nfs_page *nfs_find_and_lock_request(struct page *page) @@ -309,19 +364,34 @@ static int nfs_writepage_locked(struct p int nfs_writepage(struct page *page, struct writeback_control *wbc) { + struct inode *inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_writepage_locked(page, wbc); unlock_page(page); + + nfs_wait_contested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } -static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) +static int nfs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) { + struct inode 
*inode = page->mapping->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret; ret = nfs_do_writepage(page, wbc, data); unlock_page(page); + + nfs_wait_contested(wbc->sync_mode == WB_SYNC_ALL, + &nfss->backing_dev_info, + nfss->writeback_wait); + return ret; } --- linux.orig/include/linux/nfs_fs_sb.h 2009-11-06 09:22:30.000000000 +0800 +++ linux/include/linux/nfs_fs_sb.h 2009-11-06 09:52:27.000000000 +0800 @@ -108,6 +108,7 @@ struct nfs_server { struct nfs_iostats * io_stats; /* I/O statistics */ struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ + wait_queue_head_t writeback_wait[2]; int flags; /* various flags */ unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ --- linux.orig/fs/nfs/client.c 2009-11-06 09:22:30.000000000 +0800 +++ linux/fs/nfs/client.c 2009-11-06 09:52:27.000000000 +0800 @@ -991,6 +991,8 @@ static struct nfs_server *nfs_alloc_serv INIT_LIST_HEAD(&server->master_link); atomic_set(&server->active, 0); + init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]); + init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]); server->io_stats = nfs_alloc_iostats(); if (!server->io_stats) { ^ permalink raw reply [flat|nested] 96+ messages in thread
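To make the two-level thresholds in this patch concrete: with 4 KB pages,
NFS_WAIT_PAGES is 1024 >> 2 = 256 pages (1 MB). Taking nfs_congestion_kb =
65536 purely as an example value, the limit works out to 65536 >> 2 = 16384
pages, so asynchronous writers block once 16384 pages are in flight and are
woken below 16384 - min(16384/8, 256) = 16128 pages, while synchronous writers
block only at 2 x 16384 = 32768 pages. The 2x gap means a sync flush is never
starved behind background writeback, and the 256-page hysteresis batches the
wakeups.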
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-22 1:59 ` Wu Fengguang 2009-12-22 12:35 ` Jan Kara 2009-12-22 13:01 ` Martin Knoblauch @ 2009-12-22 16:41 ` Steve Rago 2009-12-24 1:21 ` Wu Fengguang 2009-12-23 14:21 ` Trond Myklebust 3 siblings, 1 reply; 96+ messages in thread From: Steve Rago @ 2009-12-22 16:41 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven, Ingo Molnar, linux-fsdevel On Tue, 2009-12-22 at 09:59 +0800, Wu Fengguang wrote: > Steve, > > On Sat, Dec 19, 2009 at 10:25:47PM +0800, Steve Rago wrote: > > > > On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote: > > > > > > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote: > > > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote: > > > > > Eager Writeback for NFS Clients > > > > > ------------------------------- > > > > > Prevent applications that write large sequential streams of data (like backup, for example) > > > > > from entering into a memory pressure state, which degrades performance by falling back to > > > > > synchronous operations (both synchronous writes and additional commits). > > > > > > What exactly is the "memory pressure state" condition? What's the > > > code to do the "synchronous writes and additional commits" and maybe > > > how they are triggered? > > > > Memory pressure occurs when most of the client pages have been dirtied > > by an application (think backup server writing multi-gigabyte files that > > exceed the size of main memory). The system works harder to be able to > > free dirty pages so that they can be reused. For a local file system, > > this means writing the pages to disk. For NFS, however, the writes > > leave the pages in an "unstable" state until the server responds to a > > commit request. Generally speaking, commit processing is far more > > expensive than write processing on the server; both are done with the > > inode locked, but since the commit takes so long, all writes are > > blocked, which stalls the pipeline. > > Let me try reiterate the problem with code, please correct me if I'm > wrong. > > 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS > will set the flag for any pages written -- why this trick? To > guarantee the call of nfs_commit_inode()? Which unfortunately turns > almost every server side NFS write into sync writes.. Not really. The commit needs to be sent, but the writes are still asynchronous. It's just that the pages can't be recycled until they are on stable storage. > > writeback_single_inode: > do_writepages > nfs_writepages > nfs_writepage ----[short time later]---> nfs_writeback_release* > nfs_mark_request_commit > __mark_inode_dirty(I_DIRTY_DATASYNC); > > if (I_DIRTY_SYNC || I_DIRTY_DATASYNC) <---- so this will be true for most time > write_inode > nfs_write_inode > nfs_commit_inode > > > 2) NFS commit stops pipeline because it sleep&wait inside i_mutex, > which blocks all other NFSDs trying to write/writeback the inode. > > nfsd_sync: > take i_mutex > filemap_fdatawrite > filemap_fdatawait > drop i_mutex > > If filemap_fdatawait() can be moved out of i_mutex (or just remove > the lock), we solve the root problem: > > nfsd_sync: > [take i_mutex] > filemap_fdatawrite => can also be blocked, but less a problem > [drop i_mutex] > filemap_fdatawait > > Maybe it's a dumb question, but what's the purpose of i_mutex here? 
> For correctness or to prevent livelock? I can imagine some livelock > problem here (current implementation can easily wait for extra > pages), however not too hard to fix. Commits and writes on the same inode need to be serialized for consistency (write can change the data and metadata; commit [fsync] needs to provide guarantees that the written data are stable). The performance problem arises because NFS writes are fast (they generally just deposit data into the server's page cache), but commits can take a long time, especially if there is a lot of cached data to flush to stable storage. > > > The proposed patch essentially takes two actions in nfs_file_write() > - to start writeback when the per-file nr_dirty goes high > without committing > - to throttle dirtying when the per-file nr_writeback goes high > I guess this effectively prevents pdflush from kicking in with > its bad committing behavior > > In general it's reasonable to keep NFS per-file nr_dirty low, however > questionable to do per-file nr_writeback throttling. This does not > work well with the global limits - eg. when there are many dirty > files, the summed-up nr_writeback will still grow out of control. Not with the eager writeback patch. The nr_writeback for NFS is limited by the woutstanding tunable parameter multiplied by the number of active NFS files being written. > And it's more likely to impact user visible responsiveness than > a global limit. But my opinion can be biased -- me have a patch to > do global NFS nr_writeback limit ;) What affects user-visible responsiveness is avoiding long delays and avoiding delays that vary widely. Whether the limit is global or per-file is less important (but I'd be happy to be convinced otherwise). Steve > > Thanks, > Fengguang > -- > To unsubscribe from this list: send the line "unsubscribe linux-nfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 96+ messages in thread
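Illustrative numbers for that per-file bound (both values assumed): with
nfs_max_woutstanding = 16 and wsize = 512 KB, each file can have at most
16 x 512 KB = 8 MB of writes in flight, so ten concurrently written files are
bounded by roughly 80 MB of writeback. The total grows with the number of
active files rather than with client memory size.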
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 16:41 ` Steve Rago
@ 2009-12-24 1:21 ` Wu Fengguang
2009-12-24 14:49 ` Steve Rago
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2009-12-24 1:21 UTC (permalink / raw)
To: Steve Rago
Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
	Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Wed, Dec 23, 2009 at 12:41:53AM +0800, Steve Rago wrote:
> 
> On Tue, 2009-12-22 at 09:59 +0800, Wu Fengguang wrote:
> > Steve,
> > 
> > On Sat, Dec 19, 2009 at 10:25:47PM +0800, Steve Rago wrote:
> > > 
> > > On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote:
> > > > 
> > > > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote:
> > > > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote:
> > > > > > Eager Writeback for NFS Clients
> > > > > > -------------------------------
> > > > > > Prevent applications that write large sequential streams of data (like backup, for example)
> > > > > > from entering into a memory pressure state, which degrades performance by falling back to
> > > > > > synchronous operations (both synchronous writes and additional commits).
> > > > 
> > > > What exactly is the "memory pressure state" condition? What's the
> > > > code to do the "synchronous writes and additional commits" and maybe
> > > > how they are triggered?
> > > 
> > > Memory pressure occurs when most of the client pages have been dirtied
> > > by an application (think backup server writing multi-gigabyte files that
> > > exceed the size of main memory). The system works harder to be able to
> > > free dirty pages so that they can be reused. For a local file system,
> > > this means writing the pages to disk. For NFS, however, the writes
> > > leave the pages in an "unstable" state until the server responds to a
> > > commit request. Generally speaking, commit processing is far more
> > > expensive than write processing on the server; both are done with the
> > > inode locked, but since the commit takes so long, all writes are
> > > blocked, which stalls the pipeline.
> > 
> > Let me try to reiterate the problem with code, please correct me if I'm
> > wrong.
> > 
> > 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS
> > will set the flag for any pages written -- why this trick? To
> > guarantee the call of nfs_commit_inode()? Which unfortunately turns
> > almost every server side NFS write into sync writes..

Ah, sorry for the typo. Here I mean: the commits by pdflush turn most
server side NFS _writeback_ into sync ones (ie. datawrite+datawait,
with WB_SYNC_ALL).

Just to clarify it:

	write     = from user buffer to page cache
	writeback = from page cache to disk

> Not really. The commit needs to be sent, but the writes are still
> asynchronous. It's just that the pages can't be recycled until they
> are on stable storage.

Right.

> > writeback_single_inode:
> > 	do_writepages
> > 		nfs_writepages
> > 			nfs_writepage ----[short time later]---> nfs_writeback_release*
> > 							nfs_mark_request_commit
> > 								__mark_inode_dirty(I_DIRTY_DATASYNC);
> > 
> > if (I_DIRTY_SYNC || I_DIRTY_DATASYNC)  <---- so this will be true most of the time
> > 	write_inode
> > 		nfs_write_inode
> > 			nfs_commit_inode
> > 
> > 
> > 2) NFS commit stops the pipeline because it sleeps and waits inside i_mutex,
> > which blocks all other NFSDs trying to write/writeback the inode.
> > 
> > nfsd_sync:
> > 	take i_mutex
> > 	filemap_fdatawrite
> > 	filemap_fdatawait
> > 	drop i_mutex
> > 
> > If filemap_fdatawait() can be moved out of i_mutex (or just remove
> > the lock), we solve the root problem:
> > 
> > nfsd_sync:
> > 	[take i_mutex]
> > 	filemap_fdatawrite	=> can also be blocked, but less of a problem
> > 	[drop i_mutex]
> > 	filemap_fdatawait
> > 
> > Maybe it's a dumb question, but what's the purpose of i_mutex here?
> > For correctness or to prevent livelock? I can imagine some livelock
> > problem here (the current implementation can easily wait for extra
> > pages), however not too hard to fix.
> 
> Commits and writes on the same inode need to be serialized for
> consistency (write can change the data and metadata; commit [fsync]
> needs to provide guarantees that the written data are stable). The
> performance problem arises because NFS writes are fast (they generally
> just deposit data into the server's page cache), but commits can take a

Right.

> long time, especially if there is a lot of cached data to flush to
> stable storage.

"a lot of cached data to flush" is not likely with pdflush, since it
roughly sends one COMMIT per 4MB of WRITEs. So on average each COMMIT
syncs 4MB at the server side.

Your patch adds another pre-pdflush async write logic, which greatly
reduces the number of COMMITs by pdflush. Can this be the major factor
of the performance gain?

Jan has been proposing to change the pdflush logic from

	loop over dirty files {
		writeback 4MB
		write_inode
	}
to
	loop over dirty files {
		writeback all its dirty pages
		write_inode
	}

This should also be able to reduce the number of COMMITs. I wonder if
this (more general) approach can achieve the same performance gain.

> > The proposed patch essentially takes two actions in nfs_file_write()
> > - to start writeback when the per-file nr_dirty goes high
> >   without committing
> > - to throttle dirtying when the per-file nr_writeback goes high
> >   I guess this effectively prevents pdflush from kicking in with
> >   its bad committing behavior
> > 
> > In general it's reasonable to keep NFS per-file nr_dirty low, however
> > questionable to do per-file nr_writeback throttling. This does not
> > work well with the global limits - eg. when there are many dirty
> > files, the summed-up nr_writeback will still grow out of control.
> 
> Not with the eager writeback patch. The nr_writeback for NFS is limited
> by the woutstanding tunable parameter multiplied by the number of active
> NFS files being written.

Ah yes - _active_ files. That makes it less likely, but still possible.
Imagine the summed-up nr_dirty exceeds the global limit, and pdflush
wakes up. It will cycle through all the dirty files and put them all
under active NFS write.. It's only a possibility though - NFS writes
are fast in the normal case.

> > And it's more likely to impact user visible responsiveness than
> > a global limit. But my opinion can be biased -- me have a patch to
> > do global NFS nr_writeback limit ;)
> 
> What affects user-visible responsiveness is avoiding long delays and
> avoiding delays that vary widely. Whether the limit is global or
> per-file is less important (but I'd be happy to be convinced otherwise).

For example, one solution is to have max_global_writeback and another
is to have max_file_writeback. Then their default values may be

	max_file_writeback = max_global_writeback / 10

Obviously the smaller max_file_writeback is more likely to block users
when fewer than 10 files are being actively written, which is the
common case.
Or, in this fake workload (spike writes from time to time):

	for i in `seq 1 100`
	do
		cp 10MB-$i /nfs/
		sleep 1s
	done

When you have a 5MB max_file_writeback, the copies will be bumpy, while
the max_global_writeback will never kick in..

Note that there is another difference: your per-file nr_writeback
throttles the _dirtying_ process, while my per-NFS-mount nr_writeback
throttles pdflush (and then indirectly throttles the application).

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
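The per-file throttle being debated here can be made concrete with a
short sketch. This is illustrative pseudo-C only, not code from the
posted patch: nfs_max_woutstanding is the sysctl described at the top
of the thread, while the ndirty/nwriteback counters, the writeback_wq
wait queue and nfs_start_async_writes() are invented names for this
example.

	/*
	 * Illustrative sketch of per-file eager-writeback throttling.
	 * All per-inode fields and helpers here are hypothetical.
	 */
	static void nfs_throttle_dirtying(struct nfs_inode *nfsi)
	{
		if (nfs_max_woutstanding == 0)
			return;				/* feature disabled */

		/* Start async WRITEs early, without a COMMIT, once
		 * enough dirty pages have accumulated... */
		if (nfsi->ndirty >= nfs_max_woutstanding)
			nfs_start_async_writes(nfsi);

		/* ...and block the dirtier while too many WRITEs are in
		 * flight, so pages cannot be dirtied faster than the
		 * server acknowledges them. */
		wait_event(nfsi->writeback_wq,
			   nfsi->nwriteback < nfs_max_woutstanding);
	}

Wu's objection above is precisely that the wait_event() fires per file,
so with ten active files the sum of in-flight pages is ten times larger
than any single writer ever observes.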
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-24 1:21 ` Wu Fengguang @ 2009-12-24 14:49 ` Steve Rago 2009-12-25 7:37 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Steve Rago @ 2009-12-24 14:49 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, 2009-12-24 at 09:21 +0800, Wu Fengguang wrote: > > Commits and writes on the same inode need to be serialized for > > consistency (write can change the data and metadata; commit [fsync] > > needs to provide guarantees that the written data are stable). The > > performance problem arises because NFS writes are fast (they generally > > just deposit data into the server's page cache), but commits can take a > > Right. > > > long time, especially if there is a lot of cached data to flush to > > stable storage. > > "a lot of cached data to flush" is not likely with pdflush, since it > roughly send one COMMIT per 4MB WRITEs. So in average each COMMIT > syncs 4MB at the server side. Maybe on paper, but empirically I see anywhere from one commit per 8MB to one commit per 64 MB. > > Your patch adds another pre-pdlush async write logic, which greatly > reduced the number of COMMITs by pdflush. Can this be the major factor > of the performance gain? My patch removes pdflush from the picture almost entirely. See my comments below. > > Jan has been proposing to change the pdflush logic from > > loop over dirty files { > writeback 4MB > write_inode > } > to > loop over dirty files { > writeback all its dirty pages > write_inode > } > > This should also be able to reduce the COMMIT numbers. I wonder if > this (more general) approach can achieve the same performance gain. The pdflush mechanism is fine for random writes and small sequential writes, because it promotes concurrency -- instead of the application blocking while it tries to write and commit its data, the application can go on doing other more useful things, and the data gets flushed in the background. There is also a benefit if the application makes another modification to a page that is already dirty, because then multiple modifications are coalesced into a single write. However, the pdflush mechanism is wrong for large sequential writes (like a backup stream, for example). First, there is no concurrency to exploit -- the application is only going to dirty more pages, so removing the need for it to block writing the pages out only adds to the problem of memory pressure. Second, the application is not going to go back and modify a page it has already written, so leaving it in the cache for someone else to write provides no additional benefit. Note that this assumes the application actually cares about the consistency of its data and will call fsync() when it is done. If the application doesn't call fsync(), then it doesn't matter when the pages are written to backing store, because the interface makes no guarantees in this case. Thanks, Steve ^ permalink raw reply [flat|nested] 96+ messages in thread
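Steve's closing point -- that the interface only promises durability at
fsync() -- is worth making concrete. A minimal backup-style writer
(ordinary userspace C, unrelated to the kernel patch itself) looks like
this:

	/*
	 * Userspace illustration of the fsync() contract. Data may sit
	 * dirty in the client page cache after every write(); only
	 * fsync() forces the WRITEs plus the final COMMIT to the server.
	 */
	#include <unistd.h>

	int copy_stream(int in_fd, int out_fd)
	{
		char buf[1 << 20];
		ssize_t n;

		while ((n = read(in_fd, buf, sizeof(buf))) > 0)
			if (write(out_fd, buf, n) != n)
				return -1;	/* treat short writes as errors */
		if (n < 0)
			return -1;
		return fsync(out_fd);	/* durability is only promised here */
	}

Until the fsync() at the end, the kernel is free to schedule the WRITEs
and COMMITs however it likes, which is exactly the freedom the eager
writeback patch exploits.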
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-24 14:49 ` Steve Rago
@ 2009-12-25 7:37 ` Wu Fengguang
0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2009-12-25 7:37 UTC (permalink / raw)
To: Steve Rago
Cc: Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
	Trond.Myklebust@netapp.com, jens.axboe, Peter Staubach, Jan Kara,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Thu, Dec 24, 2009 at 10:49:40PM +0800, Steve Rago wrote:
> 
> On Thu, 2009-12-24 at 09:21 +0800, Wu Fengguang wrote:
> 
> > > Commits and writes on the same inode need to be serialized for
> > > consistency (write can change the data and metadata; commit [fsync]
> > > needs to provide guarantees that the written data are stable). The
> > > performance problem arises because NFS writes are fast (they generally
> > > just deposit data into the server's page cache), but commits can take a
> > 
> > Right.
> > 
> > > long time, especially if there is a lot of cached data to flush to
> > > stable storage.
> > 
> > "a lot of cached data to flush" is not likely with pdflush, since it
> > roughly sends one COMMIT per 4MB of WRITEs. So on average each COMMIT
> > syncs 4MB at the server side.
> 
> Maybe on paper, but empirically I see anywhere from one commit per 8MB
> to one commit per 64MB.

Thanks for the data. It seems that your CPU works faster than the
network, so that none of the NFS writes (submitted at L543) have
returned by the time we try to COMMIT at L547.

	543 		ret = do_writepages(mapping, wbc);
	544 
	545 		/* Don't write the inode if only I_DIRTY_PAGES was set */
	546 		if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
	547 			int err = write_inode(inode, wait);

Thus pdflush is able to do several rounds of do_writepages() before
write_inode() can actually collect some pages to be COMMITed.

> > Your patch adds another pre-pdflush async write logic, which greatly
> > reduces the number of COMMITs by pdflush. Can this be the major factor
> > of the performance gain?
> 
> My patch removes pdflush from the picture almost entirely. See my
> comments below.

Yes, for sequential async writes -- hence I said "pre-pdflush" :)

> > Jan has been proposing to change the pdflush logic from
> > 
> > 	loop over dirty files {
> > 		writeback 4MB
> > 		write_inode
> > 	}
> > to
> > 	loop over dirty files {
> > 		writeback all its dirty pages
> > 		write_inode
> > 	}
> > 
> > This should also be able to reduce the number of COMMITs. I wonder if
> > this (more general) approach can achieve the same performance gain.
> 
> The pdflush mechanism is fine for random writes and small sequential
> writes, because it promotes concurrency -- instead of the application
> blocking while it tries to write and commit its data, the application
> can go on doing other more useful things, and the data gets flushed in
> the background. There is also a benefit if the application makes
> another modification to a page that is already dirty, because then
> multiple modifications are coalesced into a single write.

Right.

> However, the pdflush mechanism is wrong for large sequential writes
> (like a backup stream, for example). First, there is no concurrency to
> exploit -- the application is only going to dirty more pages, so
> removing the need for it to block writing the pages out only adds to the
> problem of memory pressure. Second, the application is not going to go
> back and modify a page it has already written, so leaving it in the
> cache for someone else to write provides no additional benefit.
Well, in general pdflush does more good than bad, that's why we need it. The above two reasons are about "pdflush is not as helpful", but not that it is wrong. That said, I do agree to limit the per-file dirty pages for NFS -- because it tends to flush before simple stat/read operations, which could be costly. > Note that this assumes the application actually cares about the > consistency of its data and will call fsync() when it is done. If the > application doesn't call fsync(), then it doesn't matter when the pages > are written to backing store, because the interface makes no guarantees > in this case. Thanks, Fengguang ^ permalink raw reply [flat|nested] 96+ messages in thread
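The stat/read flush Wu mentions comes from the NFS client having to
push dirty data out before it can report server-maintained attributes.
A rough sketch of the idea (simplified from memory of the 2.6.32-era
client, not verbatim source; the function name is invented):

	/*
	 * Rough sketch: why a plain stat() on an NFS file with many
	 * dirty pages is expensive for the process that dirtied them.
	 */
	static int nfs_getattr_sketch(struct inode *inode, struct kstat *stat)
	{
		/* mtime/ctime are maintained by the server, so local
		 * dirty data must be written back before the attributes
		 * can be reported accurately. The larger the per-file
		 * dirty set, the longer this flush takes. */
		if (S_ISREG(inode->i_mode))
			filemap_write_and_wait(inode->i_mapping);

		generic_fillattr(inode, stat);
		return 0;
	}

This is why keeping the per-file dirty set small is attractive even
apart from the COMMIT behavior.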
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 1:59 ` Wu Fengguang
` (2 preceding siblings ...)
2009-12-22 16:41 ` Steve Rago
@ 2009-12-23 14:21 ` Trond Myklebust
2009-12-23 18:05 ` Jan Kara
3 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2009-12-23 14:21 UTC (permalink / raw)
To: Wu Fengguang
Cc: Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Jan Kara,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel

On Tue, 2009-12-22 at 09:59 +0800, Wu Fengguang wrote:
> 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS
> will set the flag for any pages written -- why this trick? To
> guarantee the call of nfs_commit_inode()? Which unfortunately turns
> almost every server side NFS write into sync writes..
> 
> writeback_single_inode:
> 	do_writepages
> 		nfs_writepages
> 			nfs_writepage ----[short time later]---> nfs_writeback_release*
> 							nfs_mark_request_commit
> 								__mark_inode_dirty(I_DIRTY_DATASYNC);
> 
> if (I_DIRTY_SYNC || I_DIRTY_DATASYNC)  <---- so this will be true most of the time
> 	write_inode
> 		nfs_write_inode
> 			nfs_commit_inode

I have been working on a fix for this. We basically do want to ensure
that NFS calls commit (otherwise we're not finished cleaning the dirty
pages), but we want to do it _after_ we've waited for all the writes to
complete. See below...

Trond

------------------------------------------------------------------------------------------------------
VFS: Add a new inode state: I_UNSTABLE_PAGES

From: Trond Myklebust <Trond.Myklebust@netapp.com>

Add a new inode state to enable the vfs to commit the nfs unstable pages to
stable storage once the write back of dirty pages is done.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---

 fs/fs-writeback.c  |   24 ++++++++++++++++++++++--
 fs/nfs/inode.c     |   13 +++++--------
 fs/nfs/write.c     |    2 +-
 include/linux/fs.h |    7 +++++++
 4 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 49bc1b8..c035efe 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -388,6 +388,14 @@ static int write_inode(struct inode *inode, int sync)
 }
 
 /*
+ * Commit the NFS unstable pages.
+ */
+static int commit_unstable_pages(struct inode *inode, int wait)
+{
+	return write_inode(inode, wait);
+}
+
+/*
  * Wait for writeback on an inode to complete.
  */
 static void inode_wait_for_writeback(struct inode *inode)
@@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	/*
+	 * Special state for cleaning NFS unstable pages
+	 */
+	if (inode->i_state & I_UNSTABLE_PAGES) {
+		int err;
+		inode->i_state &= ~I_UNSTABLE_PAGES;
+		spin_unlock(&inode_lock);
+		err = commit_unstable_pages(inode, wait);
+		if (ret == 0)
+			ret = err;
+		spin_lock(&inode_lock);
+	}
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
 		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
@@ -481,7 +501,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * More pages get dirtied by a fast dirtier.
 			 */
 			goto select_queue;
-		} else if (inode->i_state & I_DIRTY) {
+		} else if (inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES)) {
 			/*
 			 * At least XFS will redirty the inode during the
 			 * writeback (delalloc) and on io completion (isize).
@@ -1050,7 +1070,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
-		const int was_dirty = inode->i_state & I_DIRTY;
+		const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);
 
 		inode->i_state |= flags;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..4f129b3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -99,17 +99,14 @@ u64 nfs_compat_user_ino64(u64 fileid)
 
 int nfs_write_inode(struct inode *inode, int sync)
 {
+	int flags = 0;
 	int ret;
 
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret == 0)
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-	} else
-		ret = nfs_commit_inode(inode, 0);
-	if (ret >= 0)
+	if (sync)
+		flags = FLUSH_SYNC;
+	ret = nfs_commit_inode(inode, flags);
+	if (ret > 0)
 		return 0;
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return ret;
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696..2f74e44 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	mark_inode_unstable_pages(inode);
 }
 
 static int
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cca1919..ab01af0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1637,6 +1637,8 @@ struct super_operations {
 #define I_CLEAR			64
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define __I_UNSTABLE_PAGES	9
+#define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
@@ -1651,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
+static inline void mark_inode_unstable_pages(struct inode *inode)
+{
+	__mark_inode_dirty(inode, I_UNSTABLE_PAGES);
+}
+
 /**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode

^ permalink raw reply related	[flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-23 14:21 ` Trond Myklebust @ 2009-12-23 18:05 ` Jan Kara [not found] ` <20091223180551.GD3159-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org> 0 siblings, 1 reply; 96+ messages in thread From: Jan Kara @ 2009-12-23 18:05 UTC (permalink / raw) To: Trond Myklebust Cc: Wu Fengguang, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Jan Kara, Arjan van de Ven, Ingo Molnar, linux-fsdevel On Wed 23-12-09 15:21:47, Trond Myklebust wrote: > On Tue, 2009-12-22 at 09:59 +0800, Wu Fengguang wrote: > > 1) normal fs sets I_DIRTY_DATASYNC when extending i_size, however NFS > > will set the flag for any pages written -- why this trick? To > > guarantee the call of nfs_commit_inode()? Which unfortunately turns > > almost every server side NFS write into sync writes.. > > > > writeback_single_inode: > > do_writepages > > nfs_writepages > > nfs_writepage ----[short time later]---> nfs_writeback_release* > > nfs_mark_request_commit > > __mark_inode_dirty(I_DIRTY_DATASYNC); > > > > if (I_DIRTY_SYNC || I_DIRTY_DATASYNC) <---- so this will be true for most time > > write_inode > > nfs_write_inode > > nfs_commit_inode > > > I have been working on a fix for this. We basically do want to ensure > that NFS calls commit (otherwise we're not finished cleaning the dirty > pages), but we want to do it _after_ we've waited for all the writes to > complete. See below... > > Trond > > ------------------------------------------------------------------------------------------------------ > VFS: Add a new inode state: I_UNSTABLE_PAGES > > From: Trond Myklebust <Trond.Myklebust@netapp.com> > > Add a new inode state to enable the vfs to commit the nfs unstable pages to > stable storage once the write back of dirty pages is done. Hmm, does your patch really help? > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > } > > spin_lock(&inode_lock); > + /* > + * Special state for cleaning NFS unstable pages > + */ > + if (inode->i_state & I_UNSTABLE_PAGES) { > + int err; > + inode->i_state &= ~I_UNSTABLE_PAGES; > + spin_unlock(&inode_lock); > + err = commit_unstable_pages(inode, wait); > + if (ret == 0) > + ret = err; > + spin_lock(&inode_lock); > + } I don't quite understand this chunk: We've called writeback_single_inode because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few lines above your chunk, we've called nfs_write_inode which sent commit to the server. Now here you sometimes send the commit again? What's the purpose? > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c > index faa0918..4f129b3 100644 > --- a/fs/nfs/inode.c > +++ b/fs/nfs/inode.c > @@ -99,17 +99,14 @@ u64 nfs_compat_user_ino64(u64 fileid) > > int nfs_write_inode(struct inode *inode, int sync) > { > + int flags = 0; > int ret; > > - if (sync) { > - ret = filemap_fdatawait(inode->i_mapping); > - if (ret == 0) > - ret = nfs_commit_inode(inode, FLUSH_SYNC); > - } else > - ret = nfs_commit_inode(inode, 0); > - if (ret >= 0) > + if (sync) > + flags = FLUSH_SYNC; > + ret = nfs_commit_inode(inode, flags); > + if (ret > 0) > return 0; > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > return ret; > } > Honza -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads [not found] ` <20091223180551.GD3159-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org> @ 2009-12-23 19:12 ` Trond Myklebust 2009-12-24 2:52 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2009-12-23 19:12 UTC (permalink / raw) To: Jan Kara Cc: Wu Fengguang, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel On Wed, 2009-12-23 at 19:05 +0100, Jan Kara wrote: > On Wed 23-12-09 15:21:47, Trond Myklebust wrote: > > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > > } > > > > spin_lock(&inode_lock); > > + /* > > + * Special state for cleaning NFS unstable pages > > + */ > > + if (inode->i_state & I_UNSTABLE_PAGES) { > > + int err; > > + inode->i_state &= ~I_UNSTABLE_PAGES; > > + spin_unlock(&inode_lock); > > + err = commit_unstable_pages(inode, wait); > > + if (ret == 0) > > + ret = err; > > + spin_lock(&inode_lock); > > + } > I don't quite understand this chunk: We've called writeback_single_inode > because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few > lines above your chunk, we've called nfs_write_inode which sent commit to > the server. Now here you sometimes send the commit again? What's the > purpose? We no longer set I_DIRTY_DATASYNC. We only set I_DIRTY_PAGES (and later I_UNSTABLE_PAGES). The point is that we now do the commit only _after_ we've sent all the dirty pages, and waited for writeback to complete, whereas previously we did it in the wrong order. Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
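The ordering Trond describes can be summarized in a short sketch. The
functions named are the real ones discussed in this thread, but the
helper itself is illustrative and does not appear in the patch:

	/*
	 * The intended WB_SYNC_ALL sequence for an NFS inode: a COMMIT
	 * is only meaningful once the WRITEs it should cover have
	 * actually completed on the server.
	 */
	static int nfs_flush_and_commit(struct inode *inode)
	{
		struct address_space *mapping = inode->i_mapping;
		int ret;

		ret = filemap_fdatawrite(mapping);	/* 1. send all the WRITEs */
		if (ret)
			return ret;
		ret = filemap_fdatawait(mapping);	/* 2. wait for the WRITE replies */
		if (ret)
			return ret;
		/* 3. only now do we know which pages are unstable and
		 * whether a COMMIT is needed at all */
		ret = nfs_commit_inode(inode, FLUSH_SYNC);
		return ret > 0 ? 0 : ret;
	}

The old code effectively performed step 3 between steps 1 and 2, which
is the ordering bug the I_UNSTABLE_PAGES state is meant to fix.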
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 19:12 ` Trond Myklebust
@ 2009-12-24 2:52 ` Wu Fengguang
2009-12-24 12:04 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2009-12-24 2:52 UTC (permalink / raw)
To: Trond Myklebust
Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

Trond,

On Thu, Dec 24, 2009 at 03:12:54AM +0800, Trond Myklebust wrote:
> On Wed, 2009-12-23 at 19:05 +0100, Jan Kara wrote:
> > On Wed 23-12-09 15:21:47, Trond Myklebust wrote:
> > > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
> > >  	}
> > >  
> > >  	spin_lock(&inode_lock);
> > > +	/*
> > > +	 * Special state for cleaning NFS unstable pages
> > > +	 */
> > > +	if (inode->i_state & I_UNSTABLE_PAGES) {
> > > +		int err;
> > > +		inode->i_state &= ~I_UNSTABLE_PAGES;
> > > +		spin_unlock(&inode_lock);
> > > +		err = commit_unstable_pages(inode, wait);
> > > +		if (ret == 0)
> > > +			ret = err;
> > > +		spin_lock(&inode_lock);
> > > +	}
> > I don't quite understand this chunk: We've called writeback_single_inode
> > because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few
> > lines above your chunk, we've called nfs_write_inode which sent commit to
> > the server. Now here you sometimes send the commit again? What's the
> > purpose?
> 
> We no longer set I_DIRTY_DATASYNC. We only set I_DIRTY_PAGES (and later
> I_UNSTABLE_PAGES).
> 
> The point is that we now do the commit only _after_ we've sent all the
> dirty pages, and waited for writeback to complete, whereas previously we
> did it in the wrong order.

Sorry, I still don't get it. The timing used to be:

write 4MB       ==> WRITE block 0 (ie. first 512KB)
                    WRITE block 1
                    WRITE block 2
                    WRITE block 3      ack from server for WRITE block 0 => mark 0 as unstable (inode marked need-commit)
                    WRITE block 4      ack from server for WRITE block 1 => mark 1 as unstable
                    WRITE block 5      ack from server for WRITE block 2 => mark 2 as unstable
                    WRITE block 6      ack from server for WRITE block 3 => mark 3 as unstable
                    WRITE block 7      ack from server for WRITE block 4 => mark 4 as unstable
                                       ack from server for WRITE block 5 => mark 5 as unstable
write_inode     ==> COMMIT blocks 0-5
                                       ack from server for WRITE block 6 => mark 6 as unstable (inode marked need-commit)
                                       ack from server for WRITE block 7 => mark 7 as unstable
                                       ack from server for COMMIT blocks 0-5 => mark 0-5 as clean
write_inode     ==> COMMIT blocks 6-7
                                       ack from server for COMMIT blocks 6-7 => mark 6-7 as clean

Note that the first COMMIT is submitted before receiving all ACKs for
the previous writes, hence the second COMMIT is necessary. It seems
that your patch does not improve the timing at all.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-24 2:52 ` Wu Fengguang @ 2009-12-24 12:04 ` Trond Myklebust 2009-12-25 5:56 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2009-12-24 12:04 UTC (permalink / raw) To: Wu Fengguang Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, 2009-12-24 at 10:52 +0800, Wu Fengguang wrote: > Trond, > > On Thu, Dec 24, 2009 at 03:12:54AM +0800, Trond Myklebust wrote: > > On Wed, 2009-12-23 at 19:05 +0100, Jan Kara wrote: > > > On Wed 23-12-09 15:21:47, Trond Myklebust wrote: > > > > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > > > > } > > > > > > > > spin_lock(&inode_lock); > > > > + /* > > > > + * Special state for cleaning NFS unstable pages > > > > + */ > > > > + if (inode->i_state & I_UNSTABLE_PAGES) { > > > > + int err; > > > > + inode->i_state &= ~I_UNSTABLE_PAGES; > > > > + spin_unlock(&inode_lock); > > > > + err = commit_unstable_pages(inode, wait); > > > > + if (ret == 0) > > > > + ret = err; > > > > + spin_lock(&inode_lock); > > > > + } > > > I don't quite understand this chunk: We've called writeback_single_inode > > > because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few > > > lines above your chunk, we've called nfs_write_inode which sent commit to > > > the server. Now here you sometimes send the commit again? What's the > > > purpose? > > > > We no longer set I_DIRTY_DATASYNC. We only set I_DIRTY_PAGES (and later > > I_UNSTABLE_PAGES). > > > > The point is that we now do the commit only _after_ we've sent all the > > dirty pages, and waited for writeback to complete, whereas previously we > > did it in the wrong order. > > Sorry I still don't get it. The timing used to be: > > write 4MB ==> WRITE block 0 (ie. first 512KB) > WRITE block 1 > WRITE block 2 > WRITE block 3 ack from server for WRITE block 0 => mark 0 as unstable (inode marked need-commit) > WRITE block 4 ack from server for WRITE block 1 => mark 1 as unstable > WRITE block 5 ack from server for WRITE block 2 => mark 2 as unstable > WRITE block 6 ack from server for WRITE block 3 => mark 3 as unstable > WRITE block 7 ack from server for WRITE block 4 => mark 4 as unstable > ack from server for WRITE block 5 => mark 5 as unstable > write_inode ==> COMMIT blocks 0-5 > ack from server for WRITE block 6 => mark 6 as unstable (inode marked need-commit) > ack from server for WRITE block 7 => mark 7 as unstable > > ack from server for COMMIT blocks 0-5 => mark 0-5 as clean > > write_inode ==> COMMIT blocks 6-7 > > ack from server for COMMIT blocks 6-7 => mark 6-7 as clean > > Note that the first COMMIT is submitted before receiving all ACKs for > the previous writes, hence the second COMMIT is necessary. It seems > that your patch does not improve the timing at all. That would indicate that we're cycling through writeback_single_inode() more than once. Why? Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-24 12:04 ` Trond Myklebust @ 2009-12-25 5:56 ` Wu Fengguang 2009-12-30 16:22 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2009-12-25 5:56 UTC (permalink / raw) To: Trond Myklebust Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, Dec 24, 2009 at 08:04:41PM +0800, Trond Myklebust wrote: > On Thu, 2009-12-24 at 10:52 +0800, Wu Fengguang wrote: > > Trond, > > > > On Thu, Dec 24, 2009 at 03:12:54AM +0800, Trond Myklebust wrote: > > > On Wed, 2009-12-23 at 19:05 +0100, Jan Kara wrote: > > > > On Wed 23-12-09 15:21:47, Trond Myklebust wrote: > > > > > @@ -474,6 +482,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > > > > > } > > > > > > > > > > spin_lock(&inode_lock); > > > > > + /* > > > > > + * Special state for cleaning NFS unstable pages > > > > > + */ > > > > > + if (inode->i_state & I_UNSTABLE_PAGES) { > > > > > + int err; > > > > > + inode->i_state &= ~I_UNSTABLE_PAGES; > > > > > + spin_unlock(&inode_lock); > > > > > + err = commit_unstable_pages(inode, wait); > > > > > + if (ret == 0) > > > > > + ret = err; > > > > > + spin_lock(&inode_lock); > > > > > + } > > > > I don't quite understand this chunk: We've called writeback_single_inode > > > > because it had some dirty pages. Thus it has I_DIRTY_DATASYNC set and a few > > > > lines above your chunk, we've called nfs_write_inode which sent commit to > > > > the server. Now here you sometimes send the commit again? What's the > > > > purpose? > > > > > > We no longer set I_DIRTY_DATASYNC. We only set I_DIRTY_PAGES (and later > > > I_UNSTABLE_PAGES). > > > > > > The point is that we now do the commit only _after_ we've sent all the > > > dirty pages, and waited for writeback to complete, whereas previously we > > > did it in the wrong order. > > > > Sorry I still don't get it. The timing used to be: > > > > write 4MB ==> WRITE block 0 (ie. first 512KB) > > WRITE block 1 > > WRITE block 2 > > WRITE block 3 ack from server for WRITE block 0 => mark 0 as unstable (inode marked need-commit) > > WRITE block 4 ack from server for WRITE block 1 => mark 1 as unstable > > WRITE block 5 ack from server for WRITE block 2 => mark 2 as unstable > > WRITE block 6 ack from server for WRITE block 3 => mark 3 as unstable > > WRITE block 7 ack from server for WRITE block 4 => mark 4 as unstable > > ack from server for WRITE block 5 => mark 5 as unstable > > write_inode ==> COMMIT blocks 0-5 > > ack from server for WRITE block 6 => mark 6 as unstable (inode marked need-commit) > > ack from server for WRITE block 7 => mark 7 as unstable > > > > ack from server for COMMIT blocks 0-5 => mark 0-5 as clean > > > > write_inode ==> COMMIT blocks 6-7 > > > > ack from server for COMMIT blocks 6-7 => mark 6-7 as clean > > > > Note that the first COMMIT is submitted before receiving all ACKs for > > the previous writes, hence the second COMMIT is necessary. It seems > > that your patch does not improve the timing at all. > > That would indicate that we're cycling through writeback_single_inode() > more than once. Why? Yes. The above sequence can happen for a 4MB sized dirty file. 
The first COMMIT is done by L547, while the second COMMIT will be
scheduled either by __mark_inode_dirty() or by L583, depending on when
the ACKs for the WRITEs submitted at L543 arrive (too late for the
COMMIT at L547): if an ACK arrives only after the check at L578, the
inode will be queued onto the b_dirty list, but if any ACK arrives
between L547 and L578, the inode will enter b_more_io_wait, which is a
to-be-introduced new dirty list.

	537 		dirty = inode->i_state & I_DIRTY;
	538 		inode->i_state |= I_SYNC;
	539 		inode->i_state &= ~I_DIRTY;
	540 
	541 		spin_unlock(&inode_lock);
	542 
==>	543 		ret = do_writepages(mapping, wbc);
	544 
	545 		/* Don't write the inode if only I_DIRTY_PAGES was set */
	546 		if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
==>	547 			int err = write_inode(inode, wait);
	548 			if (ret == 0)
	549 				ret = err;
	550 		}
	551 
	552 		if (wait) {
	553 			int err = filemap_fdatawait(mapping);
	554 			if (ret == 0)
	555 				ret = err;
	556 		}
	557 
	558 		spin_lock(&inode_lock);
	559 		inode->i_state &= ~I_SYNC;
	560 		if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
	561 			if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
	562 				/*
	563 				 * We didn't write back all the pages.  nfs_writepages()
	564 				 * sometimes bales out without doing anything.
	565 				 */
	566 				inode->i_state |= I_DIRTY_PAGES;
	567 				if (wbc->nr_to_write <= 0) {
	568 					/*
	569 					 * slice used up: queue for next turn
	570 					 */
	571 					requeue_io(inode);
	572 				} else {
	573 					/*
	574 					 * somehow blocked: retry later
	575 					 */
	576 					requeue_io_wait(inode);
	577 				}
==>	578 			} else if (inode->i_state & I_DIRTY) {
	579 				/*
	580 				 * At least XFS will redirty the inode during the
	581 				 * writeback (delalloc) and on io completion (isize).
	582 				 */
==>	583 				requeue_io_wait(inode);

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
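requeue_io_wait() and the b_more_io_wait list come from Wu's
then-unmerged patch, so their definitions do not appear in this thread.
A plausible minimal form, written by analogy with requeue_io() in
fs/fs-writeback.c of this era (this is an assumption about the shape of
the helper, not Wu's actual code):

	/*
	 * Assumed sketch: park a temporarily-blocked inode on a
	 * separate per-bdi list so the flusher can retry it later
	 * without busy-looping over it in the current pass.
	 */
	static void requeue_io_wait(struct inode *inode)
	{
		struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

		list_move(&inode->i_list, &wb->b_more_io_wait);
	}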
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2009-12-25 5:56 ` Wu Fengguang @ 2009-12-30 16:22 ` Trond Myklebust 2009-12-31 5:04 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2009-12-30 16:22 UTC (permalink / raw) To: Wu Fengguang Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, 2009-12-25 at 13:56 +0800, Wu Fengguang wrote: > On Thu, Dec 24, 2009 at 08:04:41PM +0800, Trond Myklebust wrote: > > That would indicate that we're cycling through writeback_single_inode() > > more than once. Why? > > Yes. The above sequence can happen for a 4MB sized dirty file. > The first COMMIT is done by L547, while the second COMMIT will be > scheduled either by __mark_inode_dirty(), or scheduled by L583 > (depending on the time ACKs for L543 but missed L547 arrives: > if an ACK missed L578, the inode will be queued into b_dirty list, > but if any ACK arrives between L547 and L578, the inode will enter > b_more_io_wait, which is a to-be-introduced new dirty list). > > 537 dirty = inode->i_state & I_DIRTY; > 538 inode->i_state |= I_SYNC; > 539 inode->i_state &= ~I_DIRTY; > 540 > 541 spin_unlock(&inode_lock); > 542 > ==> 543 ret = do_writepages(mapping, wbc); > 544 > 545 /* Don't write the inode if only I_DIRTY_PAGES was set */ > 546 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { > ==> 547 int err = write_inode(inode, wait); > 548 if (ret == 0) > 549 ret = err; > 550 } > 551 > 552 if (wait) { > 553 int err = filemap_fdatawait(mapping); > 554 if (ret == 0) > 555 ret = err; > 556 } > 557 > 558 spin_lock(&inode_lock); > 559 inode->i_state &= ~I_SYNC; > 560 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { > 561 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { > 562 /* > 563 * We didn't write back all the pages. nfs_writepages() > 564 * sometimes bales out without doing anything. > 565 */ > 566 inode->i_state |= I_DIRTY_PAGES; > 567 if (wbc->nr_to_write <= 0) { > 568 /* > 569 * slice used up: queue for next turn > 570 */ > 571 requeue_io(inode); > 572 } else { > 573 /* > 574 * somehow blocked: retry later > 575 */ > 576 requeue_io_wait(inode); > 577 } > ==> 578 } else if (inode->i_state & I_DIRTY) { > 579 /* > 580 * At least XFS will redirty the inode during the > 581 * writeback (delalloc) and on io completion (isize). > 582 */ > ==> 583 requeue_io_wait(inode); Hi Fengguang, Apologies for having taken time over this. Do you see any improvement with the appended variant instead? It adds a new address_space_operation in order to do the commit. Furthermore, it ignores the commit request if the caller is just doing a WB_SYNC_NONE background flush, waiting instead for the ensuing WB_SYNC_ALL request... Cheers Trond -------------------------------------------------------------------------------------------------------- VFS: Add a new inode state: I_UNSTABLE_PAGES From: Trond Myklebust <Trond.Myklebust@netapp.com> Add a new inode state to enable the vfs to commit the nfs unstable pages to stable storage once the write back of dirty pages is done. 
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/fs-writeback.c | 27 +++++++++++++++++++++++++-- fs/nfs/file.c | 1 + fs/nfs/inode.c | 16 ---------------- fs/nfs/internal.h | 3 ++- fs/nfs/super.c | 2 -- fs/nfs/write.c | 29 ++++++++++++++++++++++++++++- include/linux/fs.h | 9 +++++++++ 7 files changed, 65 insertions(+), 22 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 49bc1b8..24bc817 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -388,6 +388,17 @@ static int write_inode(struct inode *inode, int sync) } /* + * Commit the NFS unstable pages. + */ +static int commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + if (mapping->a_ops && mapping->a_ops->commit_unstable_pages) + return mapping->a_ops->commit_unstable_pages(mapping, wbc); + return 0; +} + +/* * Wait for writeback on an inode to complete. */ static void inode_wait_for_writeback(struct inode *inode) @@ -474,6 +485,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + /* + * Special state for cleaning NFS unstable pages + */ + if (inode->i_state & I_UNSTABLE_PAGES) { + int err; + inode->i_state &= ~I_UNSTABLE_PAGES; + spin_unlock(&inode_lock); + err = commit_unstable_pages(mapping, wbc); + if (ret == 0) + ret = err; + spin_lock(&inode_lock); + } inode->i_state &= ~I_SYNC; if (!(inode->i_state & (I_FREEING | I_CLEAR))) { if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { @@ -481,7 +504,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * More pages get dirtied by a fast dirtier. */ goto select_queue; - } else if (inode->i_state & I_DIRTY) { + } else if (inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES)) { /* * At least XFS will redirty the inode during the * writeback (delalloc) and on io completion (isize). 
@@ -1050,7 +1073,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) spin_lock(&inode_lock); if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES); inode->i_state |= flags; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6b89132..67e50ac 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = { .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, .error_remove_page = generic_error_remove_page, + .commit_unstable_pages = nfs_commit_unstable_pages, }; /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index faa0918..8341709 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) -{ - int ret; - - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret == 0) - ret = nfs_commit_inode(inode, FLUSH_SYNC); - } else - ret = nfs_commit_inode(inode, 0); - if (ret >= 0) - return 0; - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; -} - void nfs_clear_inode(struct inode *inode) { /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 29e464d..7bb326f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); @@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ce907ef..805c1a0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = { static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = { static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, .statfs = nfs_statfs, .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d171696..187f3a9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + mark_inode_unstable_pages(inode); } static int @@ -1406,11 +1406,38 @@ int nfs_commit_inode(struct inode *inode, int how) } return res; } + +int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + 
int flags = FLUSH_SYNC; + int ret; + + /* Don't commit if this is just a non-blocking flush */ + if (wbc->sync_mode != WB_SYNC_ALL) { + mark_inode_unstable_pages(inode); + return 0; + } + if (wbc->nonblocking) + flags = 0; + ret = nfs_commit_inode(inode, flags); + if (ret > 0) + return 0; + return ret; +} + #else static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) { return 0; } + +int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return 0; +} #endif long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9147ca8..ea0b7a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -602,6 +602,8 @@ struct address_space_operations { int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); + int (*commit_unstable_pages)(struct address_space *, + struct writeback_control *); }; /* @@ -1635,6 +1637,8 @@ struct super_operations { #define I_CLEAR 64 #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) +#define __I_UNSTABLE_PAGES 9 +#define I_UNSTABLE_PAGES (1 << __I_UNSTABLE_PAGES) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline void mark_inode_unstable_pages(struct inode *inode) +{ + __mark_inode_dirty(inode, I_UNSTABLE_PAGES); +} + /** * inc_nlink - directly increment an inode's link count * @inode: inode ^ permalink raw reply related [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-30 16:22 ` Trond Myklebust
@ 2009-12-31 5:04 ` Wu Fengguang
2009-12-31 19:13 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2009-12-31 5:04 UTC (permalink / raw)
To: Trond Myklebust
Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

Trond,

On Thu, Dec 31, 2009 at 12:22:48AM +0800, Trond Myklebust wrote:
> it ignores the commit request if the caller is just doing a
> WB_SYNC_NONE background flush, waiting instead for the ensuing
> WB_SYNC_ALL request...

I'm afraid this will block balance_dirty_pages() until explicit
sync/fsync calls: COMMITs are bad, however if we don't send them
regularly, NR_UNSTABLE_NFS will grow large and block
balance_dirty_pages() as well as throttle_vm_writeout()..

> +int nfs_commit_unstable_pages(struct address_space *mapping,
> +			      struct writeback_control *wbc)
> +{
> +	struct inode *inode = mapping->host;
> +	int flags = FLUSH_SYNC;
> +	int ret;
> +
==> > +	/* Don't commit if this is just a non-blocking flush */
==> > +	if (wbc->sync_mode != WB_SYNC_ALL) {
==> > +		mark_inode_unstable_pages(inode);
==> > +		return 0;
==> > +	}
> +	if (wbc->nonblocking)
> +		flags = 0;
> +	ret = nfs_commit_inode(inode, flags);
> +	if (ret > 0)
> +		return 0;
> +	return ret;
> +}

The NFS protocol provides no painless way to reclaim unstable pages
other than the COMMIT (or sync write).. This leaves us in a dilemma.

We may reasonably reduce the number of COMMITs, and possibly even
delay them for a while (and hope the server has written back the
pages before the COMMIT arrives, which is somewhat fragile).

What we can obviously do is to avoid sending a COMMIT
- if there is already an ongoing COMMIT for the same inode
- or when there are ongoing WRITEs for the inode
  (is there an easy way to detect this?)

What do you think?

Thanks,
Fengguang
---
 fs/nfs/inode.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

--- linux.orig/fs/nfs/inode.c	2009-12-25 09:25:38.000000000 +0800
+++ linux/fs/nfs/inode.c	2009-12-25 10:13:06.000000000 +0800
@@ -105,8 +105,11 @@ int nfs_write_inode(struct inode *inode,
 		ret = filemap_fdatawait(inode->i_mapping);
 		if (ret == 0)
 			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-	} else
+	} else if (!radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
+				      NFS_PAGE_TAG_LOCKED))
 		ret = nfs_commit_inode(inode, 0);
+	else
+		ret = -EAGAIN;
 	if (ret >= 0)
 		return 0;
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

^ permalink raw reply	[flat|nested] 96+ messages in thread
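For readers unfamiliar with the radix-tree test in Wu's patch: the NFS
client tracks outstanding nfs_page requests in
NFS_I(inode)->nfs_page_tree, and a request stays tagged
NFS_PAGE_TAG_LOCKED while it is being processed, for example while its
WRITE RPC is in flight. A wrapper spelling out the heuristic might look
like this (illustrative only, not part of the posted patch):

	/*
	 * Illustrative wrapper for the heuristic above: true if WRITE
	 * RPCs are likely still outstanding, in which case an
	 * immediate COMMIT would cover too few pages and a second
	 * COMMIT would be needed soon afterwards.
	 */
	static int nfs_have_writes_in_flight(struct inode *inode)
	{
		return radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
					 NFS_PAGE_TAG_LOCKED);
	}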
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-31 5:04 ` Wu Fengguang
@ 2009-12-31 19:13 ` Trond Myklebust
2010-01-06 3:03 ` Wu Fengguang
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2009-12-31 19:13 UTC (permalink / raw)
To: Wu Fengguang
Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Thu, 2009-12-31 at 13:04 +0800, Wu Fengguang wrote:

> ---
>  fs/nfs/inode.c |    5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> --- linux.orig/fs/nfs/inode.c	2009-12-25 09:25:38.000000000 +0800
> +++ linux/fs/nfs/inode.c	2009-12-25 10:13:06.000000000 +0800
> @@ -105,8 +105,11 @@ int nfs_write_inode(struct inode *inode,
>  		ret = filemap_fdatawait(inode->i_mapping);
>  		if (ret == 0)
>  			ret = nfs_commit_inode(inode, FLUSH_SYNC);
> -	} else
> +	} else if (!radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
> +				      NFS_PAGE_TAG_LOCKED))
>  		ret = nfs_commit_inode(inode, 0);
> +	else
> +		ret = -EAGAIN;
>  	if (ret >= 0)
>  		return 0;
>  	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

The above change improves on the existing code, but doesn't solve the
problem that write_inode() isn't a good match for COMMIT. We need to
wait for all the unstable WRITE rpc calls to return before we can know
whether or not a COMMIT is needed (some commercial servers never require
commit, even if the client requested an unstable write). That was the
other reason for the change.

I do, however, agree that the above can provide a nice heuristic for
the WB_SYNC_NONE case (minus the -EAGAIN error). Mind if I integrate
it?

Cheers (and Happy New Year!)
Trond

------------------------------------------------------------------------------------------------------------
VFS: Ensure that writeback_single_inode() commits unstable writes

From: Trond Myklebust <Trond.Myklebust@netapp.com>

If the call to do_writepages() succeeded in starting writeback, we do not
know whether or not we will need to COMMIT any unstable writes until after
the write RPC calls are finished. Currently, we assume that at least one
write RPC call will have finished, and set I_DIRTY_DATASYNC by the time
do_writepages is done, so that write_inode() is triggered.

In order to ensure reliable operation (i.e. ensure that a single call to
writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages
are on disk) we need to first wait for filemap_fdatawait() to complete,
then test for unstable pages.

Since NFS is currently the only filesystem that has unstable pages, we can
add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set,
this will trigger a callback to a new address_space_operation to call the
COMMIT.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---

 fs/fs-writeback.c  |   31 ++++++++++++++++++++++++++++++-
 fs/nfs/file.c      |    1 +
 fs/nfs/inode.c     |   16 ----------------
 fs/nfs/internal.h  |    3 ++-
 fs/nfs/super.c     |    2 --
 fs/nfs/write.c     |   33 ++++++++++++++++++++++++++++++++-
 include/linux/fs.h |    9 +++++++++
 7 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f6c2155..b25efbb 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -388,6 +388,17 @@ static int write_inode(struct inode *inode, int sync)
 }
 
 /*
+ * Commit the NFS unstable pages.
+ */
+static int commit_unstable_pages(struct address_space *mapping,
+				 struct writeback_control *wbc)
+{
+	if (mapping->a_ops && mapping->a_ops->commit_unstable_pages)
+		return mapping->a_ops->commit_unstable_pages(mapping, wbc);
+	return 0;
+}
+
+/*
  * Wait for writeback on an inode to complete.
  */
 static void inode_wait_for_writeback(struct inode *inode)
@@ -474,6 +485,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	/*
+	 * Special state for cleaning NFS unstable pages
+	 */
+	if (inode->i_state & I_UNSTABLE_PAGES) {
+		int err;
+		inode->i_state &= ~I_UNSTABLE_PAGES;
+		spin_unlock(&inode_lock);
+		err = commit_unstable_pages(mapping, wbc);
+		if (ret == 0)
+			ret = err;
+		spin_lock(&inode_lock);
+	}
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
 		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
@@ -532,6 +555,12 @@ select_queue:
 				inode->i_state |= I_DIRTY_PAGES;
 				redirty_tail(inode);
 			}
+		} else if (inode->i_state & I_UNSTABLE_PAGES) {
+			/*
+			 * The inode has got yet more unstable pages to
+			 * commit. Requeue on b_more_io
+			 */
+			requeue_io(inode);
 		} else if (atomic_read(&inode->i_count)) {
 			/*
 			 * The inode is clean, inuse
@@ -1050,7 +1079,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
-		const int was_dirty = inode->i_state & I_DIRTY;
+		const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);
 
 		inode->i_state |= flags;
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b89132..67e50ac 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = {
 	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
 	.error_remove_page = generic_error_remove_page,
+	.commit_unstable_pages = nfs_commit_unstable_pages,
 };
 
 /*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..8341709 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
 	return ino;
 }
 
-int nfs_write_inode(struct inode *inode, int sync)
-{
-	int ret;
-
-	if (sync) {
-		ret = filemap_fdatawait(inode->i_mapping);
-		if (ret == 0)
-			ret = nfs_commit_inode(inode, FLUSH_SYNC);
-	} else
-		ret = nfs_commit_inode(inode, 0);
-	if (ret >= 0)
-		return 0;
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-	return ret;
-}
-
 void nfs_clear_inode(struct inode *inode)
 {
 	/*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d..7bb326f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
@@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server,
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 
 /* write.c */
+extern int nfs_commit_unstable_pages(struct address_space *mapping,
+		struct writeback_control *wbc);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce907ef..805c1a0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = {
 static const struct super_operations nfs_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs_clear_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = {
 static const struct super_operations nfs4_sops = {
 	.alloc_inode	= nfs_alloc_inode,
 	.destroy_inode	= nfs_destroy_inode,
-	.write_inode	= nfs_write_inode,
 	.statfs		= nfs_statfs,
 	.clear_inode	= nfs4_clear_inode,
 	.umount_begin	= nfs_umount_begin,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696..910be28 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	mark_inode_unstable_pages(inode);
 }
 
 static int
@@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how)
 	}
 	return res;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	int flags = FLUSH_SYNC;
+	int ret;
+
+	/* Don't commit yet if this is a non-blocking flush and there are
+	 * outstanding writes for this mapping.
+	 */
+	if (wbc->sync_mode != WB_SYNC_ALL &&
+	    radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
+		    NFS_PAGE_TAG_LOCKED)) {
+		mark_inode_unstable_pages(inode);
+		return 0;
+	}
+	if (wbc->nonblocking)
+		flags = 0;
+	ret = nfs_commit_inode(inode, flags);
+	if (ret > 0)
+		ret = 0;
+	return ret;
+}
+
 #else
 static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
 	return 0;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	return 0;
+}
 #endif
 
 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9147ca8..ea0b7a3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -602,6 +602,8 @@ struct address_space_operations {
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
 	int (*error_remove_page)(struct address_space *, struct page *);
+	int (*commit_unstable_pages)(struct address_space *,
+			struct writeback_control *);
 };
 
 /*
@@ -1635,6 +1637,8 @@ struct super_operations {
 #define I_CLEAR			64
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define __I_UNSTABLE_PAGES	9
+#define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
@@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
+static inline void mark_inode_unstable_pages(struct inode *inode)
+{
+	__mark_inode_dirty(inode, I_UNSTABLE_PAGES);
+}
+
 /**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode

^ permalink raw reply related	[flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads
  2009-12-31 19:13 ` Trond Myklebust
@ 2010-01-06  3:03 ` Wu Fengguang
  2010-01-06 16:56 ` Trond Myklebust
  0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-06 3:03 UTC (permalink / raw)
  To: Trond Myklebust
  Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org,
	linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach,
	Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

Trond,

On Fri, Jan 01, 2010 at 03:13:48AM +0800, Trond Myklebust wrote:
> On Thu, 2009-12-31 at 13:04 +0800, Wu Fengguang wrote:
> 
> > ---
> >  fs/nfs/inode.c |    5 ++++-
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> > 
> > --- linux.orig/fs/nfs/inode.c	2009-12-25 09:25:38.000000000 +0800
> > +++ linux/fs/nfs/inode.c	2009-12-25 10:13:06.000000000 +0800
> > @@ -105,8 +105,11 @@ int nfs_write_inode(struct inode *inode,
> >  		ret = filemap_fdatawait(inode->i_mapping);
> >  		if (ret == 0)
> >  			ret = nfs_commit_inode(inode, FLUSH_SYNC);
> > -	} else
> > +	} else if (!radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
> > +				      NFS_PAGE_TAG_LOCKED))
> >  		ret = nfs_commit_inode(inode, 0);
> > +	else
> > +		ret = -EAGAIN;
> >  	if (ret >= 0)
> >  		return 0;
> >  	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> 
> The above change improves on the existing code, but doesn't solve the
> problem that write_inode() isn't a good match for COMMIT. We need to
> wait for all the unstable WRITE rpc calls to return before we can know
> whether or not a COMMIT is needed (some commercial servers never require
> commit, even if the client requested an unstable write). That was the
> other reason for the change.

Ah, good to know that reason. However, we cannot wait on the ongoing
WRITEs for an unlimited time or number of pages; otherwise nr_unstable
goes up, squeezes nr_dirty and nr_writeback to zero, and stalls the cp
process for a long time, as demonstrated by the trace below (more
reasoning in the previous email).

> 
> I do, however, agree that the above can provide a nice heuristic for the
> WB_SYNC_NONE case (minus the -EAGAIN error). Mind if I integrate it?

Sure, thank you.

Here is the trace I collected with this patch. The pipeline is often
stalled and throughput is poor.
Thanks,
Fengguang

% vmmon -d 1 nr_writeback nr_dirty nr_unstable
nr_writeback nr_dirty nr_unstable
0 0 0
0 0 0
0 0 0
31609 71540 146
45293 60500 2832
44418 58964 5246
44927 55903 7806
44672 55901 8064
44159 52840 11646
43120 51317 14224
43556 48256 16857
42532 46728 19417
43044 43672 21977
42093 42144 24464
40999 40621 27097
41508 37560 29657
40612 36032 32089
41600 34509 32640
41600 34509 32640
41600 34509 32640
41454 32976 34319
40466 31448 36843
nr_writeback nr_dirty nr_unstable
39699 29920 39146
40210 26864 41707
39168 25336 44285
38126 25341 45330
38144 25341 45312
37779 23808 47210
38254 20752 49807
37358 19224 52239
36334 19229 53266
36352 17696 54781
35438 16168 57231
35496 13621 59736
47463 0 61420
47421 0 61440
44389 0 64472
41829 0 67032
39342 0 69519
39357 0 69504
36656 0 72205
34131 0 74730
31717 0 77144
31165 0 77696
28975 0 79886
26451 0 82410
nr_writeback nr_dirty nr_unstable
23873 0 84988
22992 0 85869
21586 0 87275
19027 0 89834
16467 0 92394
14765 0 94096
14781 0 94080
12080 0 96781
9391 0 99470
6831 0 102030
6589 0 102272
6589 0 102272
3669 0 105192
1089 0 107772
44 0 108817
0 0 108861
0 0 108861
35186 71874 1679
32626 71913 4238
30121 71913 6743
28802 71913 8062
26610 71913 10254
36953 59138 12686
34473 59114 15191
nr_writeback nr_dirty nr_unstable
33446 59114 16218
33408 59114 16256
30707 59114 18957
28183 59114 21481
25988 59114 23676
25253 59114 24411
25216 59114 24448
22953 59114 26711
35351 44274 29161
32645 44274 31867
32384 44274 32128
32384 44274 32128
32384 44274 32128
28928 44274 35584
26350 44274 38162
26112 44274 38400
26112 44274 38400
26112 44274 38400
22565 44274 41947
36989 27364 44434
35440 27379 45968
32805 27379 48603
30245 27379 51163
28672 27379 52736
nr_writeback nr_dirty nr_unstable
56047 4 52736
56051 0 52736
56051 0 52736
56051 0 52736
56051 0 52736
54279 0 54508
51846 0 56941
49158 0 59629
47987 0 60800
47987 0 60800
47987 0 60800
47987 0 60800
47987 0 60800
47987 0 60800
44612 0 62976
42228 0 62976
39650 0 62976
37236 0 62976
34658 0 62976
32226 0 62976
29722 0 62976
27161 0 62976
24674 0 62976
22242 0 62976
nr_writeback nr_dirty nr_unstable
19737 0 62976
17306 0 62976
14745 0 62976
12313 0 62976
9753 0 62976
7321 0 62976
4743 0 62976
2329 0 62976
43 0 14139
0 0 0
0 0 0
0 0 0

wfg ~% dstat
----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
2 9 89 0 0 0| 0 0 | 729B 720B| 0 0 | 875 2136
6 9 76 8 0 1| 0 352k|9532B 4660B| 0 0 |1046 2091
3 8 89 0 0 0| 0 0 |1153B 426B| 0 0 | 870 1870
1 9 89 0 0 0| 0 72k|1218B 246B| 0 0 | 853 1757
3 8 89 0 0 0| 0 0 | 844B 66B| 0 0 | 865 1695
2 7 91 0 0 0| 0 0 | 523B 66B| 0 0 | 818 1576
3 7 90 0 0 0| 0 0 | 901B 66B| 0 0 | 820 1590
6 11 68 11 0 4| 0 456k|2028k 51k| 0 0 |1560 2756
7 21 52 0 0 20| 0 0 | 11M 238k| 0 0 |4627 7423
2 22 51 0 0 24| 0 80k| 10M 230k| 0 0 |4200 6469
4 19 54 0 0 23| 0 0 | 10M 236k| 0 0 |4277 6629
3 15 37 31 0 14| 0 64M|5377k 115k| 0 0 |2229 2972
3 27 45 0 0 26| 0 0 | 10M 237k| 0 0 |4416 6743
3 20 51 0 0 27| 0 1024k| 10M 233k| 0 0 |4284 6694
^C
----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
5 9 84 2 0 1| 225k 443k| 0 0 | 0 0 | 950 1985
4 28 25 22 0 21| 0 62M| 10M 235k| 0 0 |4529 6686
5 23 30 11 0 31| 0 23M| 10M 239k| 0 0 |4570 6948
2 24 48 0 0 26| 0 0 | 10M 234k| 0 0 |4334 6796
2 25 34 17 0 22| 0 50M| 10M 236k| 0 0 |4546 6944
2 29 46 7 0 18| 0 14M| 10M 236k| 0 0 |4411 6998
2 23 53 0 0 22| 0 0 | 10M 232k| 0 0 |4100 6595
3 19 20 32 0 26| 0 39M|9466k 207k| 0 0 |3455 4617
2 13 40 43 0 1| 0 41M| 930B 264B| 0 0 | 906 1545
3 7 45 43 0 1| 0 57M| 713B 132B| 0 0 | 859 1669
3 9 47 40 0 1| 0 54M| 376B 66B| 0 0 | 944 1741
5 25 47 0 0 21| 0 16k|9951k 222k| 0 0 |4227 6697
5 20 38 14 0 23| 0 36M|9388k 204k| 0 0 |3650 5135
3 28 46 0 0 24| 0 8192B| 11M 241k| 0 0 |4612 7115
2 24 49 0 0 25| 0 0 | 10M 234k| 0 0 |4120 6477
2 25 37 12 0 23| 0 56M| 11M 239k| 0 0 |4406 6237
3 7 38 44 0 7| 0 48M|1529k 32k| 0 0 |1071 1635
3 8 41 45 0 2| 0 58M| 602B 198B| 0 0 | 886 1613
2 25 45 2 0 27| 0 2056k| 10M 228k| 0 0 |4233 6623
2 24 49 0 0 24| 0 0 | 10M 235k| 0 0 |4292 6815
2 27 41 8 0 22| 0 50M| 10M 234k| 0 0 |4381 6394
1 9 41 41 0 7| 0 59M|1790k 38k| 0 0 |1226 1823
2 26 40 10 0 22| 0 17M|8185k 183k| 0 0 |3584 5410
1 23 54 0 0 22| 0 0 | 10M 228k| 0 0 |4153 6672
1 22 49 0 0 28| 0 37M| 11M 239k| 0 0 |4499 6938
2 15 37 32 0 13| 0 57M|5078k 110k| 0 0 |2154 2903
3 20 45 21 0 10| 0 31M|4268k 96k| 0 0 |2338 3712
2 21 55 0 0 21| 0 0 | 10M 231k| 0 0 |4292 6940
2 22 49 0 0 27| 0 25M| 11M 238k| 0 0 |4338 6677
2 17 42 19 0 19| 0 53M|8269k 180k| 0 0 |3341 4501
3 17 45 33 0 2| 0 50M|2083k 49k| 0 0 |1778 2733
2 23 53 0 0 22| 0 0 | 11M 240k| 0 0 |4482 7108
2 23 51 0 0 25| 0 9792k| 10M 230k| 0 0 |4220 6563
3 21 38 15 0 24| 0 53M| 11M 240k| 0 0 |4038 5697
3 10 41 43 0 3| 0 65M| 80k 660B| 0 0 | 984 1725
1 23 51 0 0 25| 0 8192B| 10M 230k| 0 0 |4301 6652
2 21 48 0 0 29| 0 0 | 10M 237k| 0 0 |4267 6956
2 26 43 5 0 23| 0 52M| 10M 236k| 0 0 |4553 6764
7 7 34 41 0 10| 0 57M|2596k 56k| 0 0 |1210 1680
6 21 44 12 0 17| 0 19M|7053k 158k| 0 0 |3194 4902
4 24 51 0 0 21| 0 0 | 10M 237k| 0 0 |4406 6724
4 22 53 0 0 21| 0 31M| 10M 237k| 0 0 |4752 7286
4 15 32 32 0 17| 0 49M|5777k 125k| 0 0 |2379 3015
5 14 43 34 0 3| 0 48M|1781k 42k| 0 0 |1578 2492
4 22 42 0 0 32| 0 0 | 10M 236k| 0 0 |4318 6763
3 22 50 4 0 21| 0 7072k| 10M 236k| 0 0 |4509 6859
6 21 28 16 0 28| 0 41M| 11M 241k| 0 0 |4289 5928
7 8 39 44 0 2| 0 40M| 217k 3762B| 0 0 |1024 1763
4 15 46 28 0 6| 0 39M|2377k 55k| 0 0 |1683 2678
4 24 45 0 0 26| 0 0 | 10M 232k| 0 0 |4207 6596
3 24 50 5 0 19| 0 10M|9472k 210k| 0 0 |3976 6122
5 7 40 46 0 1| 0 32M|1230B 66B| 0 0 | 967 1676
----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
5 7 47 40 0 1| 0 39M| 651B 66B| 0 0 | 916 1583
4 12 54 22 0 7| 0 35M|1815k 41k| 0 0 |1448 2383
4 22 52 0 0 21| 0 0 | 10M 233k| 0 0 |4258 6705
4 22 52 0 0 22| 0 24M| 10M 236k| 0 0 |4480 7097
3 23 48 0 0 26| 0 28M| 10M 234k| 0 0 |4402 6798
5 12 36 29 0 19| 0 59M|5464k 118k| 0 0 |2358 2963
4 26 47 4 0 19| 0 5184k|8684k 194k| 0 0 |3786 5852
4 22 43 0 0 32| 0 0 | 10M 233k| 0 0 |4350 6779
3 26 44 0 0 27| 0 36M| 10M 233k| 0 0 |4360 6619
4 11 39 33 0 13| 0 46M|4545k 98k| 0 0 |2159 2600
3 14 40 40 0 2| 0 46M| 160k 4198B| 0 0 |1070 1610
4 25 45 0 0 27| 0 0 | 10M 236k| 0 0 |4435 6760
4 25 48 0 0 24| 0 3648k| 10M 235k| 0 0 |4595 6950
3 24 29 22 0 21| 0 37M| 10M 236k| 0 0 |4335 6461
5 11 42 36 0 6| 0 45M|2257k 48k| 0 0 |1440 1755
5 6 41 47 0 1| 0 43M| 768B 198B| 0 0 | 989 1592
5 30 47 3 0 15| 0 24k|8598k 192k| 0 0 |3694 5580
2 23 49 0 0 26| 0 0 | 10M 229k| 0 0 |4319 6805
4 22 32 20 0 22| 0 26M| 10M 234k| 0 0 |4487 6751
4 11 24 53 0 8| 0 32M|2503k 55k| 0 0 |1287 1654
8 10 42 39 0 0| 0 43M|1783B 132B| 0 0 |1054 1900
6 16 43 27 0 8| 0 24M|2790k 64k| 0 0 |2150 3370
4 24 51 0 0 21| 0 0 | 10M 231k| 0 0 |4308 6589
3 24 36 13 0 24| 0 9848k| 10M 231k| 0 0 |4394 6742
6 10 11 62 0 9| 0 27M|2519k 55k| 0 0 |1482 1723
3 12 23 61 0 2| 0 34M| 608B 132B| 0 0 | 927 1623
3 15 38 38 0 6| 0 36M|2077k 48k| 0 0 |1801 2651
7 25 45 6 0 17| 0 3000k| 11M 241k| 0 0 |5071 7687
3 26 45 3 0 23| 0 13M| 11M 238k| 0 0 |4473 6650
4 17 40 21 0 17| 0 37M|6253k 139k| 0 0 |2891 3746
3 24 48 0 0 25| 0 0 | 10M 238k| 0 0 |4736 7189
1 28 38 7 0 25| 0 9160k| 10M 232k| 0 0 |4689 7026
4 17 26 35 0 18| 0 21M|8707k 190k| 0 0 |3346 4488
4 11 12 72 0 1| 0 29M|1459B 264B| 0 0 | 947 1643
4 10 20 64 0 1| 0 28M| 728B 132B| 0 0 |1010 1531
6 8 7 78 0 1| 0 25M| 869B 66B| 0 0 | 945 1620
5 10 15 69 0 1| 0 27M| 647B 132B| 0 0 |1052 1553
5 11 0 82 0 1| 0 16M| 724B 66B| 0 0 |1063 1679
3 22 18 49 0 9| 0 14M|4560k 103k| 0 0 |2931 4039
3 24 44 0 0 29| 0 0 | 10M 236k| 0 0 |4863 7497
3 30 42 0 0 24| 0 4144k| 11M 250k| 0 0 |5505 7945
3 18 13 45 0 20| 0 15M|7234k 157k| 0 0 |3197 4021
7 9 0 82 0 1| 0 23M| 356B 198B| 0 0 | 979 1738
3 11 9 77 0 0| 0 22M| 802B 132B| 0 0 | 994 1635
5 9 1 84 0 2| 0 31M| 834B 66B| 0 0 | 996 1534
4 10 14 71 0 1| 0 20M| 288B 132B| 0 0 | 976 1627
4 14 22 59 0 1| 0 8032k| 865k 20k| 0 0 |1222 1589
4 23 46 0 0 26| 0 0 | 10M 239k| 0 0 |3791 5035
5 17 43 6 0 29| 0 17M| 10M 233k| 0 0 |3198 4372
4 19 50 0 0 27| 0 0 | 10M 231k| 0 0 |2952 4447
5 19 37 14 0 26| 0 8568k| 10M 227k| 0 0 |3562 5251
3 21 23 25 0 28| 0 9560k| 10M 230k| 0 0 |3390 5038
----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
usr sys idl wai hiq siq| read writ| recv send| in out | int csw
5 19 24 26 0 26| 0 11M| 10M 229k| 0 0 |3282 4749
4 20 8 39 0 28| 0 7992k| 10M 230k| 0 0 |3302 4488
4 17 3 47 0 30| 0 8616k| 10M 231k| 0 0 |3440 4909
5 16 22 25 0 31| 0 6556k| 10M 227k| 0 0 |3291 4671
3 18 22 24 0 32| 0 5588k| 10M 230k| 0 0 |3345 4822
4 16 26 25 0 29| 0 4744k| 10M 230k| 0 0 |3331 4854
3 18 16 37 0 26| 0 4296k| 10M 228k| 0 0 |3056 4139
3 17 18 25 0 36| 0 3016k| 10M 230k| 0 0 |3239 4623
4 19 23 26 0 27| 0 2216k| 10M 229k| 0 0 |3331 4777
4 20 41 8 0 26| 0 8584k| 10M 228k| 0 0 |3434 5114
4 17 50 0 0 29| 0 1000k| 10M 229k| 0 0 |3151 4878
2 18 50 1 0 29| 0 32k| 10M 232k| 0 0 |3176 4951
3 19 51 0 0 28| 0 0 | 10M 232k| 0 0 |3014 4567
4 17 53 1 0 24| 0 32k|8787k 195k| 0 0 |2768 4382
3 8 89 0 0 0| 0 0 |4013B 2016B| 0 0 | 866 1653
3 8 88 0 0 0| 0 16k|1017B 0 | 0 0 | 828 1660
6 8 86 0 0 0| 0 0 |1320B 66B| 0 0 | 821 1713
4 8 88 0 0 0| 0 0 | 692B 66B| 0 0 | 806 1665

> ------------------------------------------------------------------------------------------------------------
> VFS: Ensure that writeback_single_inode() commits unstable writes
> 
> From: Trond Myklebust <Trond.Myklebust@netapp.com>
> 
> If the call to do_writepages() succeeded in starting writeback, we do not
> know whether or not we will need to COMMIT any unstable writes until after
> the write RPC calls are finished. Currently, we assume that at least one
> write RPC call will have finished, and set I_DIRTY_DATASYNC by the time
> do_writepages is done, so that write_inode() is triggered.
> 
> In order to ensure reliable operation (i.e. ensure that a single call to
> writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages
> are on disk) we need to first wait for filemap_fdatawait() to complete,
> then test for unstable pages.
> 
> Since NFS is currently the only filesystem that has unstable pages, we can
> add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set,
> this will trigger a callback to a new address_space_operation to call the
> COMMIT.
> > Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> > --- > > fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- > fs/nfs/file.c | 1 + > fs/nfs/inode.c | 16 ---------------- > fs/nfs/internal.h | 3 ++- > fs/nfs/super.c | 2 -- > fs/nfs/write.c | 33 ++++++++++++++++++++++++++++++++- > include/linux/fs.h | 9 +++++++++ > 7 files changed, 74 insertions(+), 21 deletions(-) > > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > index f6c2155..b25efbb 100644 > --- a/fs/fs-writeback.c > +++ b/fs/fs-writeback.c > @@ -388,6 +388,17 @@ static int write_inode(struct inode *inode, int sync) > } > > /* > + * Commit the NFS unstable pages. > + */ > +static int commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + if (mapping->a_ops && mapping->a_ops->commit_unstable_pages) > + return mapping->a_ops->commit_unstable_pages(mapping, wbc); > + return 0; > +} > + > +/* > * Wait for writeback on an inode to complete. > */ > static void inode_wait_for_writeback(struct inode *inode) > @@ -474,6 +485,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > } > > spin_lock(&inode_lock); > + /* > + * Special state for cleaning NFS unstable pages > + */ > + if (inode->i_state & I_UNSTABLE_PAGES) { > + int err; > + inode->i_state &= ~I_UNSTABLE_PAGES; > + spin_unlock(&inode_lock); > + err = commit_unstable_pages(mapping, wbc); > + if (ret == 0) > + ret = err; > + spin_lock(&inode_lock); > + } > inode->i_state &= ~I_SYNC; > if (!(inode->i_state & (I_FREEING | I_CLEAR))) { > if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { > @@ -532,6 +555,12 @@ select_queue: > inode->i_state |= I_DIRTY_PAGES; > redirty_tail(inode); > } > + } else if (inode->i_state & I_UNSTABLE_PAGES) { > + /* > + * The inode has got yet more unstable pages to > + * commit. 
Requeue on b_more_io > + */ > + requeue_io(inode); > } else if (atomic_read(&inode->i_count)) { > /* > * The inode is clean, inuse > @@ -1050,7 +1079,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) > > spin_lock(&inode_lock); > if ((inode->i_state & flags) != flags) { > - const int was_dirty = inode->i_state & I_DIRTY; > + const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES); > > inode->i_state |= flags; > > diff --git a/fs/nfs/file.c b/fs/nfs/file.c > index 6b89132..67e50ac 100644 > --- a/fs/nfs/file.c > +++ b/fs/nfs/file.c > @@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = { > .migratepage = nfs_migrate_page, > .launder_page = nfs_launder_page, > .error_remove_page = generic_error_remove_page, > + .commit_unstable_pages = nfs_commit_unstable_pages, > }; > > /* > diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c > index faa0918..8341709 100644 > --- a/fs/nfs/inode.c > +++ b/fs/nfs/inode.c > @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid) > return ino; > } > > -int nfs_write_inode(struct inode *inode, int sync) > -{ > - int ret; > - > - if (sync) { > - ret = filemap_fdatawait(inode->i_mapping); > - if (ret == 0) > - ret = nfs_commit_inode(inode, FLUSH_SYNC); > - } else > - ret = nfs_commit_inode(inode, 0); > - if (ret >= 0) > - return 0; > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > - return ret; > -} > - > void nfs_clear_inode(struct inode *inode) > { > /* > diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h > index 29e464d..7bb326f 100644 > --- a/fs/nfs/internal.h > +++ b/fs/nfs/internal.h > @@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); > extern struct workqueue_struct *nfsiod_workqueue; > extern struct inode *nfs_alloc_inode(struct super_block *sb); > extern void nfs_destroy_inode(struct inode *); > -extern int nfs_write_inode(struct inode *,int); > extern void nfs_clear_inode(struct inode *); > #ifdef CONFIG_NFS_V4 > extern void nfs4_clear_inode(struct inode *); > @@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server, > extern void nfs_read_prepare(struct rpc_task *task, void *calldata); > > /* write.c */ > +extern int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc); > extern void nfs_write_prepare(struct rpc_task *task, void *calldata); > #ifdef CONFIG_MIGRATION > extern int nfs_migrate_page(struct address_space *, > diff --git a/fs/nfs/super.c b/fs/nfs/super.c > index ce907ef..805c1a0 100644 > --- a/fs/nfs/super.c > +++ b/fs/nfs/super.c > @@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = { > static const struct super_operations nfs_sops = { > .alloc_inode = nfs_alloc_inode, > .destroy_inode = nfs_destroy_inode, > - .write_inode = nfs_write_inode, > .statfs = nfs_statfs, > .clear_inode = nfs_clear_inode, > .umount_begin = nfs_umount_begin, > @@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = { > static const struct super_operations nfs4_sops = { > .alloc_inode = nfs_alloc_inode, > .destroy_inode = nfs_destroy_inode, > - .write_inode = nfs_write_inode, > .statfs = nfs_statfs, > .clear_inode = nfs4_clear_inode, > .umount_begin = nfs_umount_begin, > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > index d171696..910be28 100644 > --- a/fs/nfs/write.c > +++ b/fs/nfs/write.c > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > spin_unlock(&inode->i_lock); > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > 
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > + mark_inode_unstable_pages(inode); > } > > static int > @@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how) > } > return res; > } > + > +int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + struct inode *inode = mapping->host; > + int flags = FLUSH_SYNC; > + int ret; > + > + /* Don't commit yet if this is a non-blocking flush and there are > + * outstanding writes for this mapping. > + */ > + if (wbc->sync_mode != WB_SYNC_ALL && > + radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, > + NFS_PAGE_TAG_LOCKED)) { > + mark_inode_unstable_pages(inode); > + return 0; > + } > + if (wbc->nonblocking) > + flags = 0; > + ret = nfs_commit_inode(inode, flags); > + if (ret > 0) > + ret = 0; > + return ret; > +} > + > #else > static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) > { > return 0; > } > + > +int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + return 0; > +} > #endif > > long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 9147ca8..ea0b7a3 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -602,6 +602,8 @@ struct address_space_operations { > int (*is_partially_uptodate) (struct page *, read_descriptor_t *, > unsigned long); > int (*error_remove_page)(struct address_space *, struct page *); > + int (*commit_unstable_pages)(struct address_space *, > + struct writeback_control *); > }; > > /* > @@ -1635,6 +1637,8 @@ struct super_operations { > #define I_CLEAR 64 > #define __I_SYNC 7 > #define I_SYNC (1 << __I_SYNC) > +#define __I_UNSTABLE_PAGES 9 > +#define I_UNSTABLE_PAGES (1 << __I_UNSTABLE_PAGES) > > #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) > > @@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) > __mark_inode_dirty(inode, I_DIRTY_SYNC); > } > > +static inline void mark_inode_unstable_pages(struct inode *inode) > +{ > + __mark_inode_dirty(inode, I_UNSTABLE_PAGES); > +} > + > /** > * inc_nlink - directly increment an inode's link count > * @inode: inode > ^ permalink raw reply [flat|nested] 96+ messages in thread
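The stall pattern in the trace -- nr_unstable climbing while nr_dirty and
nr_writeback get squeezed toward zero, then a long pause before the cycle
restarts -- can be reproduced with a toy model of the write pipeline. The
sketch below is userspace C with invented per-tick rates and an invented page
limit, not the kernel writeback code, and it commits instantaneously, whereas
in the real trace the server-side flush of ~100k unstable pages is what
stretches each stall out. Deferring COMMIT until no WRITEs are in flight
lets unstable pages eat the whole dirty budget; committing eagerly keeps the
pipeline moving:

    /*
     * Toy model of the dirty -> writeback -> unstable pipeline.
     * LIMIT stands in for the dirty threshold, RATE for the per-tick
     * throughput of each stage; both numbers are made up.
     */
    #include <stdio.h>

    #define LIMIT 100000
    #define RATE    3000

    static void run(int commit_while_writes_in_flight)
    {
            long dirty = 0, writeback = 0, unstable = 0;
            long stalls = 0, max_unstable = 0;
            long done, sent;
            int t;

            for (t = 0; t < 200; t++) {
                    /* the application dirties pages, throttled at LIMIT */
                    if (dirty + writeback + unstable + RATE <= LIMIT)
                            dirty += RATE;
                    else
                            stalls++;

                    /* WRITE rpcs complete: writeback -> unstable */
                    done = writeback < RATE ? writeback : RATE;
                    writeback -= done;
                    unstable += done;
                    if (unstable > max_unstable)
                            max_unstable = unstable;

                    /* the flusher issues more WRITEs: dirty -> writeback */
                    sent = dirty < RATE ? dirty : RATE;
                    dirty -= sent;
                    writeback += sent;

                    /* COMMIT policy under test */
                    if (commit_while_writes_in_flight || writeback == 0)
                            unstable = 0;
            }
            printf("commit while WRITEs in flight: %d -> "
                   "%ld stalled ticks, peak nr_unstable %ld\n",
                   commit_while_writes_in_flight, stalls, max_unstable);
    }

    int main(void)
    {
            run(0);    /* commit only once no WRITEs are in flight */
            run(1);    /* commit regardless of in-flight WRITEs */
            return 0;
    }

With these rates the deferred policy stalls periodically with a peak
nr_unstable near the full limit, while the eager policy never stalls and
keeps nr_unstable near zero -- the same qualitative shape as the vmmon trace.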
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 3:03 ` Wu Fengguang @ 2010-01-06 16:56 ` Trond Myklebust 2010-01-06 18:26 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 16:56 UTC (permalink / raw) To: Wu Fengguang Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 11:03 +0800, Wu Fengguang wrote: > Trond, > > On Fri, Jan 01, 2010 at 03:13:48AM +0800, Trond Myklebust wrote: > > The above change improves on the existing code, but doesn't solve the > > problem that write_inode() isn't a good match for COMMIT. We need to > > wait for all the unstable WRITE rpc calls to return before we can know > > whether or not a COMMIT is needed (some commercial servers never require > > commit, even if the client requested an unstable write). That was the > > other reason for the change. > > Ah good to know that reason. However we cannot wait for ongoing WRITEs > for unlimited time or pages, otherwise nr_unstable goes up and squeeze > nr_dirty and nr_writeback to zero, and stall the cp process for a long > time, as demonstrated by the trace (more reasoning in previous email). OK. I think we need a mechanism to allow balance_dirty_pages() to communicate to the filesystem that it really is holding too many unstable pages. Currently, all we do is say that 'your total is too big', and then let the filesystem figure out what it needs to do. So how about if we modify your heuristic to do something like this? It applies on top of the previous patch. Cheers Trond --------------------------------------------------------------------------------------------------------- VM/NFS: The VM must tell the filesystem when to free reclaimable pages From: Trond Myklebust <Trond.Myklebust@netapp.com> balance_dirty_pages() should really tell the filesystem whether or not it has an excess of actual dirty pages, or whether it would be more useful to start freeing up the reclaimable pages. Assume that if the number of dirty pages associated with this backing-dev is less than 1/2 the number of reclaimable pages, then we should concentrate on freeing up the latter. 
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 9 +++++++-- include/linux/backing-dev.h | 6 ++++++ mm/page-writeback.c | 7 +++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 910be28..36113e6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1420,8 +1420,10 @@ int nfs_commit_unstable_pages(struct address_space *mapping, if (wbc->sync_mode != WB_SYNC_ALL && radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, NFS_PAGE_TAG_LOCKED)) { - mark_inode_unstable_pages(inode); - return 0; + if (wbc->bdi == NULL) + goto out_nocommit; + if (wbc->bdi->dirty_exceeded != BDI_RECLAIMABLE_EXCEEDED) + goto out_nocommit; } if (wbc->nonblocking) flags = 0; @@ -1429,6 +1431,9 @@ int nfs_commit_unstable_pages(struct address_space *mapping, if (ret > 0) ret = 0; return ret; +out_nocommit: + mark_inode_unstable_pages(inode); + return 0; } #else diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fcbc26a..cd1645e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -94,6 +94,12 @@ struct backing_dev_info { #endif }; +enum bdi_dirty_exceeded_state { + BDI_NO_DIRTY_EXCESS = 0, + BDI_DIRTY_EXCEEDED, + BDI_RECLAIMABLE_EXCEEDED, +}; + int bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..0133c8f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -524,8 +524,11 @@ static void balance_dirty_pages(struct address_space *mapping, (background_thresh + dirty_thresh) / 2) break; - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; + if (bdi_nr_writeback > bdi_nr_reclaimable / 2) { + if (bdi->dirty_exceeded != BDI_DIRTY_EXCEEDED) + bdi->dirty_exceeded = BDI_DIRTY_EXCEEDED; + } else if (bdi->dirty_exceeded != BDI_RECLAIMABLE_EXCEEDED) + bdi->dirty_exceeded = BDI_RECLAIMABLE_EXCEEDED; /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. * Unstable writes are a feature of certain networked ^ permalink raw reply related [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 16:56 ` Trond Myklebust @ 2010-01-06 18:26 ` Trond Myklebust 2010-01-06 18:37 ` Peter Zijlstra 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 18:26 UTC (permalink / raw) To: Wu Fengguang Cc: Jan Kara, Steve Rago, Peter Zijlstra, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 11:56 -0500, Trond Myklebust wrote: > On Wed, 2010-01-06 at 11:03 +0800, Wu Fengguang wrote: > > Trond, > > > > On Fri, Jan 01, 2010 at 03:13:48AM +0800, Trond Myklebust wrote: > > > The above change improves on the existing code, but doesn't solve the > > > problem that write_inode() isn't a good match for COMMIT. We need to > > > wait for all the unstable WRITE rpc calls to return before we can know > > > whether or not a COMMIT is needed (some commercial servers never require > > > commit, even if the client requested an unstable write). That was the > > > other reason for the change. > > > > Ah good to know that reason. However we cannot wait for ongoing WRITEs > > for unlimited time or pages, otherwise nr_unstable goes up and squeeze > > nr_dirty and nr_writeback to zero, and stall the cp process for a long > > time, as demonstrated by the trace (more reasoning in previous email). > > OK. I think we need a mechanism to allow balance_dirty_pages() to > communicate to the filesystem that it really is holding too many > unstable pages. Currently, all we do is say that 'your total is too > big', and then let the filesystem figure out what it needs to do. > > So how about if we modify your heuristic to do something like this? It > applies on top of the previous patch. Gah! I misread the definitions of bdi_nr_reclaimable and bdi_nr_writeback. Please ignore the previous patch. OK. It looks as if the only key to finding out how many unstable writes we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't specifically target our own backing-dev. Also, on reflection, I think it might be more helpful to use the writeback control to signal when we want to force a commit. That makes it a more general mechanism. There is one thing that we might still want to do here. Currently we do not update wbc->nr_to_write inside nfs_commit_unstable_pages(), which again means that we don't update 'pages_written' if the only effect of the writeback_inodes_wbc() was to commit pages. Perhaps it might not be a bad idea to do this (but that should be in a separate patch)... Cheers Trond ------------------------------------------------------------------------------------- VM/NFS: The VM must tell the filesystem when to free reclaimable pages From: Trond Myklebust <Trond.Myklebust@netapp.com> balance_dirty_pages() should really tell the filesystem whether or not it has an excess of actual dirty pages, or whether it would be more useful to start freeing up the unstable writes. Assume that if the number of unstable writes is more than 1/2 the number of reclaimable pages, then we should force NFS to free up the former. 
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 2 +- include/linux/writeback.h | 5 +++++ mm/page-writeback.c | 9 ++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 910be28..ee3daf4 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1417,7 +1417,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping, /* Don't commit yet if this is a non-blocking flush and there are * outstanding writes for this mapping. */ - if (wbc->sync_mode != WB_SYNC_ALL && + if (!wbc->force_commit && wbc->sync_mode != WB_SYNC_ALL && radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, NFS_PAGE_TAG_LOCKED)) { mark_inode_unstable_pages(inode); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 76e8903..3fd5c3e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -62,6 +62,11 @@ struct writeback_control { * so we use a single control to update them */ unsigned no_nrwrite_index_update:1; + /* + * The following is used by balance_dirty_pages() to + * force NFS to commit unstable pages. + */ + unsigned force_commit:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..ede5356 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -485,6 +485,7 @@ static void balance_dirty_pages(struct address_space *mapping, { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; + long nr_unstable_nfs; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; @@ -505,8 +506,9 @@ static void balance_dirty_pages(struct address_space *mapping, get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); + nr_unstable_nfs; nr_writeback = global_page_state(NR_WRITEBACK); bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); @@ -537,6 +539,11 @@ static void balance_dirty_pages(struct address_space *mapping, * up. */ if (bdi_nr_reclaimable > bdi_thresh) { + wbc.force_commit = 0; + /* Force NFS to also free up unstable writes. */ + if (nr_unstable_nfs > nr_reclaimable / 2) + wbc.force_commit = 1; + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, ^ permalink raw reply related [flat|nested] 96+ messages in thread
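Stripped of the surrounding writeback plumbing, the heuristic this patch adds
to balance_dirty_pages() is a simple ratio test: force a COMMIT once unstable
pages make up more than half of everything reclaimable. A standalone sketch
(illustrative names; the sample numbers are taken from the vmmon trace
earlier in the thread):

    #include <stdio.h>

    /* mirrors: nr_unstable_nfs > nr_reclaimable / 2 */
    static int want_force_commit(long nr_file_dirty, long nr_unstable_nfs)
    {
            long nr_reclaimable = nr_file_dirty + nr_unstable_nfs;

            return nr_unstable_nfs > nr_reclaimable / 2;
    }

    int main(void)
    {
            printf("%d\n", want_force_commit(71540, 146));   /* mostly dirty */
            printf("%d\n", want_force_commit(34509, 32640)); /* just under half */
            printf("%d\n", want_force_commit(0, 62976));     /* all unstable */
            return 0;
    }

Running it prints 0, 0, 1: only the third sample, where the reclaimable total
is entirely unstable pages, trips the force_commit flag.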
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 18:26 ` Trond Myklebust @ 2010-01-06 18:37 ` Peter Zijlstra 2010-01-06 18:52 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Peter Zijlstra @ 2010-01-06 18:37 UTC (permalink / raw) To: Trond Myklebust Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > OK. It looks as if the only key to finding out how many unstable writes > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > specifically target our own backing-dev. Would be a simple matter of splitting BDI_UNSTABLE out from BDI_RECLAIMABLE, no? Something like --- fs/nfs/write.c | 6 +++--- include/linux/backing-dev.h | 3 ++- mm/backing-dev.c | 6 ++++-- mm/filemap.c | 2 +- mm/page-writeback.c | 16 ++++++++++------ mm/truncate.c | 2 +- 6 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d171696..7ba56f8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -440,7 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req) NFS_PAGE_TAG_COMMIT); spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); + inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } @@ -451,7 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE); return 1; } return 0; @@ -1322,7 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_mark_request_commit(req); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_UNSTABLE); nfs_clear_page_tag_locked(req); } return -ENOMEM; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fcbc26a..1ef1e5c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,7 +36,8 @@ enum bdi_state { typedef int (congested_fn)(void *, int); enum bdi_stat_item { - BDI_RECLAIMABLE, + BDI_DIRTY, + DBI_UNSTABLE, BDI_WRITEBACK, NR_BDI_STAT_ITEMS }; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0e8ca03..88f3655 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -88,7 +88,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" + "BdiDirty: %8lu kB\n" + "BdiUnstable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" @@ -102,7 +103,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "wb_list: %8u\n" "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), - (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTY)), + (unsigned long) K(bdi_stat(bdi, BDI_UNSTABLE)), K(bdi_thresh), K(dirty_thresh), K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, diff --git a/mm/filemap.c b/mm/filemap.c index 96ac6b0..458387d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -136,7 +136,7 
@@ void __remove_from_page_cache(struct page *page) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..b1d31be 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -272,7 +272,8 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, else avail_dirty = 0; - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + + avail_dirty += bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE) + bdi_stat(bdi, BDI_WRITEBACK); *pbdi_dirty = min(*pbdi_dirty, avail_dirty); @@ -509,7 +510,8 @@ static void balance_dirty_pages(struct address_space *mapping, global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -554,10 +556,12 @@ static void balance_dirty_pages(struct address_space *mapping, * deltas. */ if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) + + bdi_stat_sum(bdi, DBI_UNSTABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } @@ -1079,7 +1083,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } @@ -1255,7 +1259,7 @@ int clear_page_dirty_for_io(struct page *page) if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_DIRTY); return 1; } return 0; diff --git a/mm/truncate.c b/mm/truncate.c index 342deee..b0ce8fb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -75,7 +75,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_DIRTY); if (account_size) task_io_account_cancelled_write(account_size); } ^ permalink raw reply related [flat|nested] 96+ messages in thread
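The effect of the split is that a backing device now carries separate dirty
and unstable counters, with the old "reclaimable" figure derived as their
sum. A compact userspace model of the accounting (illustrative only -- the
kernel's bdi_stat counters are percpu-backed, which this ignores):

    #include <stdio.h>

    enum bdi_stat_item { BDI_DIRTY, BDI_UNSTABLE, BDI_WRITEBACK,
                         NR_BDI_STAT_ITEMS };

    struct bdi { long stat[NR_BDI_STAT_ITEMS]; };

    static void inc_stat(struct bdi *b, enum bdi_stat_item i) { b->stat[i]++; }
    static void dec_stat(struct bdi *b, enum bdi_stat_item i) { b->stat[i]--; }

    /* what used to be the single BDI_RECLAIMABLE counter */
    static long bdi_reclaimable(const struct bdi *b)
    {
            return b->stat[BDI_DIRTY] + b->stat[BDI_UNSTABLE];
    }

    int main(void)
    {
            struct bdi nfs = { { 0 } };

            inc_stat(&nfs, BDI_DIRTY);     /* page dirtied */
            dec_stat(&nfs, BDI_DIRTY);     /* picked up for writeback */
            inc_stat(&nfs, BDI_WRITEBACK); /* WRITE rpc in flight */
            dec_stat(&nfs, BDI_WRITEBACK); /* reply was unstable */
            inc_stat(&nfs, BDI_UNSTABLE);  /* now awaiting COMMIT */

            printf("dirty=%ld unstable=%ld reclaimable=%ld\n",
                   nfs.stat[BDI_DIRTY], nfs.stat[BDI_UNSTABLE],
                   bdi_reclaimable(&nfs));
            return 0;
    }

For a local-disk bdi the BDI_UNSTABLE counter simply never moves, which is
what makes the capability flag proposed in the next message attractive.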
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 18:37 ` Peter Zijlstra @ 2010-01-06 18:52 ` Trond Myklebust 2010-01-06 19:07 ` Peter Zijlstra 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 18:52 UTC (permalink / raw) To: Peter Zijlstra Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 19:37 +0100, Peter Zijlstra wrote: > On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > > OK. It looks as if the only key to finding out how many unstable writes > > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > > specifically target our own backing-dev. > > Would be a simple matter of splitting BDI_UNSTABLE out from > BDI_RECLAIMABLE, no? > > Something like OK. How about if we also add in a bdi->capabilities flag to tell that we might have BDI_UNSTABLE? That would allow us to avoid the potentially expensive extra calls to bdi_stat() and bdi_stat_sum() for the non-nfs case? Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 18:52 ` Trond Myklebust @ 2010-01-06 19:07 ` Peter Zijlstra 2010-01-06 19:21 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Peter Zijlstra @ 2010-01-06 19:07 UTC (permalink / raw) To: Trond Myklebust Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 13:52 -0500, Trond Myklebust wrote: > On Wed, 2010-01-06 at 19:37 +0100, Peter Zijlstra wrote: > > On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > > > OK. It looks as if the only key to finding out how many unstable writes > > > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > > > specifically target our own backing-dev. > > > > Would be a simple matter of splitting BDI_UNSTABLE out from > > BDI_RECLAIMABLE, no? > > > > Something like > > OK. How about if we also add in a bdi->capabilities flag to tell that we > might have BDI_UNSTABLE? That would allow us to avoid the potentially > expensive extra calls to bdi_stat() and bdi_stat_sum() for the non-nfs > case? The bdi_stat_sum() in the error limit is basically the only such expensive op, but I suspect we might hit that more than enough. So sure that sounds like a plan. ^ permalink raw reply [flat|nested] 96+ messages in thread
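For readers wondering why bdi_stat_sum() is the expensive call being singled
out: the per-bdi stats are percpu counters, so an update touches only a
cpu-local delta, a plain read returns a slightly stale folded total, and only
the exact sum has to walk every cpu. A simplified single-threaded model of
those semantics (the real percpu_counter is more involved; NR_CPUS and BATCH
here are made up):

    #include <stdio.h>

    #define NR_CPUS 8
    #define BATCH   32

    struct pc {
            long count;           /* folded global value */
            long delta[NR_CPUS];  /* per-cpu contributions */
    };

    static void pc_add(struct pc *c, int cpu, long n)
    {
            c->delta[cpu] += n;
            if (c->delta[cpu] >= BATCH || c->delta[cpu] <= -BATCH) {
                    c->count += c->delta[cpu];  /* fold only occasionally */
                    c->delta[cpu] = 0;
            }
    }

    static long pc_read(const struct pc *c)     /* cheap, approximate */
    {
            return c->count;
    }

    static long pc_sum(const struct pc *c)      /* expensive, exact */
    {
            long v = c->count;
            int cpu;

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    v += c->delta[cpu];
            return v;
    }

    int main(void)
    {
            struct pc c = { 0, { 0 } };
            int i;

            for (i = 0; i < 1000; i++)
                    pc_add(&c, i % NR_CPUS, 1);

            printf("approx=%ld exact=%ld\n", pc_read(&c), pc_sum(&c));
            return 0;
    }

This prints approx=768 exact=1000: the cheap read can be off by up to
NR_CPUS * BATCH, which is exactly why balance_dirty_pages() falls back to the
exact sum only near the error limit -- and why skipping that sum for devices
that never have unstable pages is worthwhile.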
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 19:07 ` Peter Zijlstra @ 2010-01-06 19:21 ` Trond Myklebust 2010-01-06 19:53 ` Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 19:21 UTC (permalink / raw) To: Peter Zijlstra Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 20:07 +0100, Peter Zijlstra wrote: > On Wed, 2010-01-06 at 13:52 -0500, Trond Myklebust wrote: > > On Wed, 2010-01-06 at 19:37 +0100, Peter Zijlstra wrote: > > > On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > > > > OK. It looks as if the only key to finding out how many unstable writes > > > > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > > > > specifically target our own backing-dev. > > > > > > Would be a simple matter of splitting BDI_UNSTABLE out from > > > BDI_RECLAIMABLE, no? > > > > > > Something like > > > > OK. How about if we also add in a bdi->capabilities flag to tell that we > > might have BDI_UNSTABLE? That would allow us to avoid the potentially > > expensive extra calls to bdi_stat() and bdi_stat_sum() for the non-nfs > > case? > > The bdi_stat_sum() in the error limit is basically the only such > expensive op, but I suspect we might hit that more than enough. So sure > that sounds like a plan. > This should apply on top of your patch.... Cheers Trond ------------------------------------------------------------------------------------------------ VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices From: Trond Myklebust <Trond.Myklebust@netapp.com> Speeds up the accounting in balance_dirty_pages() for non-nfs devices. 
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/client.c | 1 + include/linux/backing-dev.h | 6 ++++++ mm/page-writeback.c | 16 +++++++++++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ee77713..d0b060a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -890,6 +890,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 42c3e2a..8b45166 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -232,6 +232,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_EXEC_MAP 0x00000040 #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 +#define BDI_CAP_ACCT_UNSTABLE 0x00000200 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) @@ -311,6 +312,11 @@ static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) return bdi == &default_backing_dev_info; } +static inline bool bdi_cap_account_unstable(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE; +} + static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index aa26b0f..d90a0db 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -273,8 +273,9 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, avail_dirty = 0; avail_dirty += bdi_stat(bdi, BDI_DIRTY) + - bdi_stat(bdi, BDI_UNSTABLE) + bdi_stat(bdi, BDI_WRITEBACK); + if (bdi_cap_account_unstable(bdi)) + avail_dirty += bdi_stat(bdi, BDI_UNSTABLE); *pbdi_dirty = min(*pbdi_dirty, avail_dirty); } @@ -512,8 +513,9 @@ static void balance_dirty_pages(struct address_space *mapping, nr_unstable_nfs; nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + - bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -563,11 +565,15 @@ static void balance_dirty_pages(struct address_space *mapping, * deltas. */ if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) + + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat_sum(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } ^ permalink raw reply related [flat|nested] 96+ messages in thread
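The capability bit turns the extra accounting into an opt-in: only a bdi that
declared BDI_CAP_ACCT_UNSTABLE (here, NFS) pays for the unstable-pages reads
and sums. A trivial userspace sketch of the predicate and how it gates the
expensive path (illustrative, not the kernel structures):

    #include <stdio.h>

    #define BDI_CAP_ACCT_UNSTABLE 0x00000200

    struct bdi { unsigned int capabilities; };

    static int bdi_cap_account_unstable(const struct bdi *bdi)
    {
            return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE;
    }

    int main(void)
    {
            struct bdi disk = { 0 };
            struct bdi nfs  = { BDI_CAP_ACCT_UNSTABLE };

            /* the per-cpu sums would be skipped for the local disk */
            printf("disk accounts unstable: %d\n",
                   !!bdi_cap_account_unstable(&disk));
            printf("nfs accounts unstable:  %d\n",
                   !!bdi_cap_account_unstable(&nfs));
            return 0;
    }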
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 19:21 ` Trond Myklebust @ 2010-01-06 19:53 ` Trond Myklebust 2010-01-06 20:09 ` Jan Kara 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 19:53 UTC (permalink / raw) To: Peter Zijlstra Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed, 2010-01-06 at 14:21 -0500, Trond Myklebust wrote: > On Wed, 2010-01-06 at 20:07 +0100, Peter Zijlstra wrote: > > On Wed, 2010-01-06 at 13:52 -0500, Trond Myklebust wrote: > > > On Wed, 2010-01-06 at 19:37 +0100, Peter Zijlstra wrote: > > > > On Wed, 2010-01-06 at 13:26 -0500, Trond Myklebust wrote: > > > > > OK. It looks as if the only key to finding out how many unstable writes > > > > > we have is to use global_page_state(NR_UNSTABLE_NFS), so we can't > > > > > specifically target our own backing-dev. > > > > > > > > Would be a simple matter of splitting BDI_UNSTABLE out from > > > > BDI_RECLAIMABLE, no? > > > > > > > > Something like > > > > > > OK. How about if we also add in a bdi->capabilities flag to tell that we > > > might have BDI_UNSTABLE? That would allow us to avoid the potentially > > > expensive extra calls to bdi_stat() and bdi_stat_sum() for the non-nfs > > > case? > > > > The bdi_stat_sum() in the error limit is basically the only such > > expensive op, but I suspect we might hit that more than enough. So sure > > that sounds like a plan. > > > > This should apply on top of your patch.... ...and finally, this should convert the previous NFS patch to use the per-bdi accounting. Cheers Trond -------------------------------------------------------------------------------------- VM: Use per-bdi unstable accounting to improve use of wbc->force_commit From: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- mm/page-writeback.c | 13 +++++++------ 1 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d90a0db..c537543 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -487,7 +487,6 @@ static void balance_dirty_pages(struct address_space *mapping, { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long nr_unstable_nfs; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; @@ -504,18 +503,20 @@ static void balance_dirty_pages(struct address_space *mapping, .nr_to_write = write_chunk, .range_cyclic = 1, }; + long bdi_nr_unstable = 0; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); - nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - nr_unstable_nfs; + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); - if (bdi_cap_account_unstable(bdi)) - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); + if (bdi_cap_account_unstable(bdi)) { + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable += bdi_nr_unstable; + } bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -545,7 +546,7 @@ static void balance_dirty_pages(struct address_space *mapping, if (bdi_nr_reclaimable > bdi_thresh) { wbc.force_commit = 0; /* Force NFS to also free up unstable writes. 
*/ - if (nr_unstable_nfs > nr_reclaimable / 2) + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) wbc.force_commit = 1; writeback_inodes_wbc(&wbc); ^ permalink raw reply related [flat|nested] 96+ messages in thread
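To see the per-bdi test in action, take two samples from the vmmon trace
earlier in the thread and treat the client's single NFS mount as the only
active bdi, so the per-bdi and global figures coincide. Early in the run,
with nr_dirty = 71540 and nr_unstable = 146, bdi_nr_reclaimable is 71686, and
146 < 71686 / 2 = 35843, so force_commit stays clear and the COMMIT is
deferred. In a stalled sample, with nr_dirty = 0 and nr_unstable = 62976,
bdi_nr_reclaimable is 62976, and 62976 > 31488, so force_commit is set and
the COMMIT goes out even though WRITE rpcs are still in flight.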
* Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 19:53 ` Trond Myklebust @ 2010-01-06 20:09 ` Jan Kara 2010-01-06 20:51 ` [PATCH 0/6] " Trond Myklebust 0 siblings, 1 reply; 96+ messages in thread From: Jan Kara @ 2010-01-06 20:09 UTC (permalink / raw) To: Trond Myklebust Cc: Peter Zijlstra, Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Wed 06-01-10 14:53:14, Trond Myklebust wrote: > ...and finally, this should convert the previous NFS patch to use the > per-bdi accounting. > > Cheers > Trond > > -------------------------------------------------------------------------------------- > VM: Use per-bdi unstable accounting to improve use of wbc->force_commit > > From: Trond Myklebust <Trond.Myklebust@netapp.com> > > Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> I like this. You can add Acked-by: Jan Kara <jack@suse.cz> to this patch as well as to the previous patches adding unstable pages accounting. Honza > --- > > mm/page-writeback.c | 13 +++++++------ > 1 files changed, 7 insertions(+), 6 deletions(-) > > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index d90a0db..c537543 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -487,7 +487,6 @@ static void balance_dirty_pages(struct address_space *mapping, > { > long nr_reclaimable, bdi_nr_reclaimable; > long nr_writeback, bdi_nr_writeback; > - long nr_unstable_nfs; > unsigned long background_thresh; > unsigned long dirty_thresh; > unsigned long bdi_thresh; > @@ -504,18 +503,20 @@ static void balance_dirty_pages(struct address_space *mapping, > .nr_to_write = write_chunk, > .range_cyclic = 1, > }; > + long bdi_nr_unstable = 0; > > get_dirty_limits(&background_thresh, &dirty_thresh, > &bdi_thresh, bdi); > > - nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); > nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > - nr_unstable_nfs; > + global_page_state(NR_UNSTABLE_NFS); > nr_writeback = global_page_state(NR_WRITEBACK); > > bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); > - if (bdi_cap_account_unstable(bdi)) > - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); > + if (bdi_cap_account_unstable(bdi)) { > + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); > + bdi_nr_reclaimable += bdi_nr_unstable; > + } > bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); > > if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) > @@ -545,7 +546,7 @@ static void balance_dirty_pages(struct address_space *mapping, > if (bdi_nr_reclaimable > bdi_thresh) { > wbc.force_commit = 0; > /* Force NFS to also free up unstable writes. */ > - if (nr_unstable_nfs > nr_reclaimable / 2) > + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) > wbc.force_commit = 1; > > writeback_inodes_wbc(&wbc); > -- Jan Kara <jack@suse.cz> SUSE Labs, CR ^ permalink raw reply [flat|nested] 96+ messages in thread
* [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-06 20:09 ` Jan Kara @ 2010-01-06 20:51 ` Trond Myklebust [not found] ` <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2010-01-07 8:16 ` Peter Zijlstra 0 siblings, 2 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org OK, here is the full series so far. I'm resending because I had to fix up a couple of BDI_UNSTABLE typos in Peter's patch... Cheers Trond --- Peter Zijlstra (1): VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust (5): NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set VM: Use per-bdi unstable accounting to improve use of wbc->force_commit VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices VM/NFS: The VM must tell the filesystem when to free reclaimable pages VFS: Ensure that writeback_single_inode() commits unstable writes fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- fs/nfs/client.c | 1 + fs/nfs/file.c | 1 + fs/nfs/inode.c | 16 ---------------- fs/nfs/internal.h | 3 ++- fs/nfs/super.c | 2 -- fs/nfs/write.c | 39 +++++++++++++++++++++++++++++++++++---- include/linux/backing-dev.h | 9 ++++++++- include/linux/fs.h | 9 +++++++++ include/linux/writeback.h | 5 +++++ mm/backing-dev.c | 6 ++++-- mm/filemap.c | 2 +- mm/page-writeback.c | 30 ++++++++++++++++++++++++------ mm/truncate.c | 2 +- 14 files changed, 121 insertions(+), 35 deletions(-) ^ permalink raw reply [flat|nested] 96+ messages in thread
[parent not found: <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* [PATCH 5/6] VM: Use per-bdi unstable accounting to improve use of wbc->force_commit [not found] ` <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2010-01-06 20:51 ` Trond Myklebust [not found] ` <20100106205110.22547.32584.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2010-01-06 20:51 ` [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust ` (5 subsequent siblings) 6 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- mm/page-writeback.c | 13 +++++++------ 1 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d90a0db..c537543 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -487,7 +487,6 @@ static void balance_dirty_pages(struct address_space *mapping, { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; - long nr_unstable_nfs; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; @@ -504,18 +503,20 @@ static void balance_dirty_pages(struct address_space *mapping, .nr_to_write = write_chunk, .range_cyclic = 1, }; + long bdi_nr_unstable = 0; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); - nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); nr_reclaimable = global_page_state(NR_FILE_DIRTY) + - nr_unstable_nfs; + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); - if (bdi_cap_account_unstable(bdi)) - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); + if (bdi_cap_account_unstable(bdi)) { + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable += bdi_nr_unstable; + } bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -545,7 +546,7 @@ static void balance_dirty_pages(struct address_space *mapping, if (bdi_nr_reclaimable > bdi_thresh) { wbc.force_commit = 0; /* Force NFS to also free up unstable writes. */ - if (nr_unstable_nfs > nr_reclaimable / 2) + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) wbc.force_commit = 1; writeback_inodes_wbc(&wbc); ^ permalink raw reply related [flat|nested] 96+ messages in thread
[parent not found: <20100106205110.22547.32584.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>]
* Re: [PATCH 5/6] VM: Use per-bdi unstable accounting to improve use of wbc->force_commit [not found] ` <20100106205110.22547.32584.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2010-01-07 2:34 ` Wu Fengguang 0 siblings, 0 replies; 96+ messages in thread From: Wu Fengguang @ 2010-01-07 2:34 UTC (permalink / raw) To: Trond Myklebust Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Trond, I'm with Jan that this patch can be folded :) Thanks, Fengguang On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote: > Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> > Acked-by: Jan Kara <jack@suse.cz> > --- > > mm/page-writeback.c | 13 +++++++------ > 1 files changed, 7 insertions(+), 6 deletions(-) > > diff --git a/mm/page-writeback.c b/mm/page-writeback.c > index d90a0db..c537543 100644 > --- a/mm/page-writeback.c > +++ b/mm/page-writeback.c > @@ -487,7 +487,6 @@ static void balance_dirty_pages(struct address_space *mapping, > { > long nr_reclaimable, bdi_nr_reclaimable; > long nr_writeback, bdi_nr_writeback; > - long nr_unstable_nfs; > unsigned long background_thresh; > unsigned long dirty_thresh; > unsigned long bdi_thresh; > @@ -504,18 +503,20 @@ static void balance_dirty_pages(struct address_space *mapping, > .nr_to_write = write_chunk, > .range_cyclic = 1, > }; > + long bdi_nr_unstable = 0; > > get_dirty_limits(&background_thresh, &dirty_thresh, > &bdi_thresh, bdi); > > - nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS); > nr_reclaimable = global_page_state(NR_FILE_DIRTY) + > - nr_unstable_nfs; > + global_page_state(NR_UNSTABLE_NFS); > nr_writeback = global_page_state(NR_WRITEBACK); > > bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); > - if (bdi_cap_account_unstable(bdi)) > - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); > + if (bdi_cap_account_unstable(bdi)) { > + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); > + bdi_nr_reclaimable += bdi_nr_unstable; > + } > bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); > > if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) > @@ -545,7 +546,7 @@ static void balance_dirty_pages(struct address_space *mapping, > if (bdi_nr_reclaimable > bdi_thresh) { > wbc.force_commit = 0; > /* Force NFS to also free up unstable writes. */ > - if (nr_unstable_nfs > nr_reclaimable / 2) > + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) > wbc.force_commit = 1; > > writeback_inodes_wbc(&wbc); > ^ permalink raw reply [flat|nested] 96+ messages in thread
* [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
[not found] ` <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-06 20:51 ` [PATCH 5/6] VM: Use per-bdi unstable accounting to improve use of wbc->force_commit Trond Myklebust
@ 2010-01-06 20:51 ` Trond Myklebust
2010-01-07 2:29 ` Wu Fengguang
2010-01-06 20:51 ` [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust
` (4 subsequent siblings)
6 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

balance_dirty_pages() should really tell the filesystem whether or not it
has an excess of actual dirty pages, or whether it would be more useful to
start freeing up the unstable writes.

Assume that if the number of unstable writes is more than 1/2 the number of
reclaimable pages, then we should force NFS to free up the former.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
---

 fs/nfs/write.c            |    2 +-
 include/linux/writeback.h |    5 +++++
 mm/page-writeback.c       |    9 ++++++++-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 910be28..ee3daf4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1417,7 +1417,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
     /* Don't commit yet if this is a non-blocking flush and there are
      * outstanding writes for this mapping.
      */
-    if (wbc->sync_mode != WB_SYNC_ALL &&
+    if (!wbc->force_commit && wbc->sync_mode != WB_SYNC_ALL &&
         radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
                 NFS_PAGE_TAG_LOCKED)) {
         mark_inode_unstable_pages(inode);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 76e8903..3fd5c3e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -62,6 +62,11 @@ struct writeback_control {
      * so we use a single control to update them
      */
     unsigned no_nrwrite_index_update:1;
+    /*
+     * The following is used by balance_dirty_pages() to
+     * force NFS to commit unstable pages.
+     */
+    unsigned force_commit:1;
 };

 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..ede5356 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -485,6 +485,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 {
     long nr_reclaimable, bdi_nr_reclaimable;
     long nr_writeback, bdi_nr_writeback;
+    long nr_unstable_nfs;
     unsigned long background_thresh;
     unsigned long dirty_thresh;
     unsigned long bdi_thresh;
@@ -505,8 +506,9 @@ static void balance_dirty_pages(struct address_space *mapping,
         get_dirty_limits(&background_thresh, &dirty_thresh,
                 &bdi_thresh, bdi);

+        nr_unstable_nfs = global_page_state(NR_UNSTABLE_NFS);
         nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-                global_page_state(NR_UNSTABLE_NFS);
+                nr_unstable_nfs;
         nr_writeback = global_page_state(NR_WRITEBACK);

         bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
@@ -537,6 +539,11 @@ static void balance_dirty_pages(struct address_space *mapping,
          * up.
          */
         if (bdi_nr_reclaimable > bdi_thresh) {
+            wbc.force_commit = 0;
+            /* Force NFS to also free up unstable writes. */
+            if (nr_unstable_nfs > nr_reclaimable / 2)
+                wbc.force_commit = 1;
+
             writeback_inodes_wbc(&wbc);
             pages_written += write_chunk - wbc.nr_to_write;
             get_dirty_limits(&background_thresh, &dirty_thresh,
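To make the 1/2 threshold concrete, a worked example with made-up
numbers: if global_page_state() reports 400 dirty pages and 700 unstable
pages, then nr_reclaimable = 400 + 700 = 1100 and nr_unstable_nfs = 700 >
1100 / 2 = 550, so force_commit is set and the flusher issues COMMITs as
well as WRITEs. (Patch 5/6 above redoes this same comparison with the
per-bdi counters instead of the global ones.)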
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-06 20:51 ` [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust
@ 2010-01-07 2:29 ` Wu Fengguang
2010-01-07 4:49 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 2:29 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote:
> balance_dirty_pages() should really tell the filesystem whether or not it
> has an excess of actual dirty pages, or whether it would be more useful to
> start freeing up the unstable writes.
>
> Assume that if the number of unstable writes is more than 1/2 the number of
> reclaimable pages, then we should force NFS to free up the former.
>
> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
> Acked-by: Jan Kara <jack@suse.cz>
...
> diff --git a/include/linux/writeback.h b/include/linux/writeback.h
> index 76e8903..3fd5c3e 100644
> --- a/include/linux/writeback.h
> +++ b/include/linux/writeback.h
> @@ -62,6 +62,11 @@ struct writeback_control {
>       * so we use a single control to update them
>       */
>      unsigned no_nrwrite_index_update:1;
> +    /*
> +     * The following is used by balance_dirty_pages() to
> +     * force NFS to commit unstable pages.
> +     */

In fact it may be too late to force the commit at balance_dirty_pages()
time: a commit takes time, and the application has already been blocked.

If it is not convenient for now, I can make the change -- I'll remove the
writeback_inodes_wbc() call altogether from balance_dirty_pages().

> +    unsigned force_commit:1;
>  };

nfs_commit may be a more newbie-friendly name?

Thanks,
Fengguang
...
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-07 2:29 ` Wu Fengguang
@ 2010-01-07 4:49 ` Trond Myklebust
2010-01-07 5:03 ` Wu Fengguang
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 4:49 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, 2010-01-07 at 10:29 +0800, Wu Fengguang wrote:
> On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote:
...
> In fact it may be too late to force the commit at balance_dirty_pages()
> time: a commit takes time, and the application has already been blocked.
>
> If it is not convenient for now, I can make the change -- I'll remove the
> writeback_inodes_wbc() call altogether from balance_dirty_pages().

You could always set the 'for_background' flag instead.

> > +    unsigned force_commit:1;
> >  };
>
> nfs_commit may be a more newbie-friendly name?

We could possibly rename it to something like 'force_nfs_commit', but
the comment above the declaration should really be sufficient.

Trond
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-07 4:49 ` Trond Myklebust
@ 2010-01-07 5:03 ` Wu Fengguang
2010-01-07 5:30 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 5:03 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 12:49:23PM +0800, Trond Myklebust wrote:
> On Thu, 2010-01-07 at 10:29 +0800, Wu Fengguang wrote:
...
> > If it is not convenient for now, I can make the change -- I'll remove the
> > writeback_inodes_wbc() call altogether from balance_dirty_pages().
>
> You could always set the 'for_background' flag instead.

Please don't -- that would be misusing ->for_background. Anyway, it's not
a big problem. I'll set the force_nfs_commit flag in background writeback.

> > nfs_commit may be a more newbie-friendly name?
>
> We could possibly rename it to something like 'force_nfs_commit', but
> the comment above the declaration should really be sufficient.

"commit" could also be misread as "commit a transaction"? Anyway, I think
adding an "nfs" limits the scope to NFS and thus makes the code somewhat
easier to read. Just a personal feeling.

Thanks,
Fengguang
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-07 5:03 ` Wu Fengguang
@ 2010-01-07 5:30 ` Trond Myklebust
2010-01-07 14:37 ` Wu Fengguang
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 5:30 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, 2010-01-07 at 13:03 +0800, Wu Fengguang wrote:

> "commit" could also be misread as "commit a transaction"? Anyway, I think
> adding an "nfs" limits the scope to NFS and thus makes the code somewhat
> easier to read. Just a personal feeling.

How about 'force_commit_unstable' instead? That ties it up to the
unstable writes rather than NFS.

Cheers
Trond
* Re: [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
2010-01-07 5:30 ` Trond Myklebust
@ 2010-01-07 14:37 ` Wu Fengguang
2010-01-07 14:41 ` [PATCH 0/5] Re: [PATCH] improve the performance of large sequential write NFS workloads Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 14:37 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 01:30:43PM +0800, Trond Myklebust wrote:
> On Thu, 2010-01-07 at 13:03 +0800, Wu Fengguang wrote:
...
> How about 'force_commit_unstable' instead? That ties it up to the
> unstable writes rather than NFS.

That would be good, thanks!

Fengguang
* [PATCH 0/5] Re: [PATCH] improve the performance of large sequential write NFS workloads
2010-01-07 14:37 ` Wu Fengguang
@ 2010-01-07 14:41 ` Trond Myklebust
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Take 3. Incorporate Fengguang's suggestion that we change the name of
the force_commit flag.

Cheers
Trond

---

Peter Zijlstra (1):
      VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE

Trond Myklebust (4):
      NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
      VM/NFS: The VM must tell the filesystem when to free reclaimable pages
      VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
      VFS: Ensure that writeback_single_inode() commits unstable writes

 fs/fs-writeback.c           |   31 ++++++++++++++++++++++++++++++-
 fs/nfs/client.c             |    1 +
 fs/nfs/file.c               |    1 +
 fs/nfs/inode.c              |   16 ----------------
 fs/nfs/internal.h           |    3 ++-
 fs/nfs/super.c              |    2 --
 fs/nfs/write.c              |   39 +++++++++++++++++++++++++++++++++++----
 include/linux/backing-dev.h |    9 ++++++++-
 include/linux/fs.h          |    9 +++++++++
 include/linux/writeback.h   |    5 +++++
 mm/backing-dev.c            |    6 ++++--
 mm/filemap.c                |    2 +-
 mm/page-writeback.c         |   30 ++++++++++++++++++++++++------
 mm/truncate.c               |    2 +-

 14 files changed, 121 insertions(+), 35 deletions(-)
* [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-07 14:41 ` Trond Myklebust
2010-01-07 14:41 ` [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust
` (3 subsequent siblings)
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Wu Fengguang <fengguang.wu@intel.com>
---

 fs/nfs/write.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 43e47b4..7f1f2aa 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1423,7 +1423,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
         mark_inode_unstable_pages(inode);
         return 0;
     }
-    if (wbc->nonblocking)
+    if (wbc->nonblocking || wbc->for_background)
         flags = 0;
     ret = nfs_commit_inode(inode, flags);
     if (ret > 0)
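For readers new to the COMMIT path, the resulting decision is simple. A
sketch of the selection logic after this patch (assuming, as in the
hunk above, that FLUSH_SYNC is the only flag the caller cares about):

    /*
     * flags selection in nfs_commit_unstable_pages() after 5/5:
     *
     *   WB_SYNC_ALL / ordinary flush  -> FLUSH_SYNC (wait for COMMIT)
     *   wbc->nonblocking              -> 0 (asynchronous COMMIT RPC)
     *   wbc->for_background           -> 0 (asynchronous COMMIT RPC)
     */
    int flags = FLUSH_SYNC;

    if (wbc->nonblocking || wbc->for_background)
        flags = 0;    /* fire the COMMIT without waiting for it */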
* [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-07 14:41 ` [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust
@ 2010-01-07 14:41 ` Trond Myklebust
2010-01-07 14:41 ` [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust
` (2 subsequent siblings)
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Speeds up the accounting in balance_dirty_pages() for non-nfs devices.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
---

 fs/nfs/client.c             |    1 +
 include/linux/backing-dev.h |    6 ++++++
 mm/page-writeback.c         |   16 +++++++++++-----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ee77713..d0b060a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -890,6 +890,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
     server->backing_dev_info.name = "nfs";
     server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+    server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE;

     if (server->wsize > max_rpc_payload)
         server->wsize = max_rpc_payload;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 42c3e2a..8b45166 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -232,6 +232,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_EXEC_MAP	0x00000040
 #define BDI_CAP_NO_ACCT_WB	0x00000080
 #define BDI_CAP_SWAP_BACKED	0x00000100
+#define BDI_CAP_ACCT_UNSTABLE	0x00000200

 #define BDI_CAP_VMFLAGS \
     (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
@@ -311,6 +312,11 @@ static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
     return bdi == &default_backing_dev_info;
 }

+static inline bool bdi_cap_account_unstable(struct backing_dev_info *bdi)
+{
+    return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE;
+}
+
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
     return bdi_cap_writeback_dirty(mapping->backing_dev_info);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 23d3fc6..c06739b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -273,8 +273,9 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
         avail_dirty = 0;

     avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
-            bdi_stat(bdi, BDI_UNSTABLE) +
             bdi_stat(bdi, BDI_WRITEBACK);
+    if (bdi_cap_account_unstable(bdi))
+        avail_dirty += bdi_stat(bdi, BDI_UNSTABLE);

     *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
 }
@@ -510,8 +511,9 @@ static void balance_dirty_pages(struct address_space *mapping,
                 global_page_state(NR_UNSTABLE_NFS);
         nr_writeback = global_page_state(NR_WRITEBACK);

-        bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-                bdi_stat(bdi, BDI_UNSTABLE);
+        bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
+        if (bdi_cap_account_unstable(bdi))
+            bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
         bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);

         if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -556,11 +558,15 @@ static void balance_dirty_pages(struct address_space *mapping,
          * deltas.
          */
         if (bdi_thresh < 2*bdi_stat_error(bdi)) {
-            bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) +
-                    bdi_stat_sum(bdi, BDI_UNSTABLE);
+            bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY);
+            if (bdi_cap_account_unstable(bdi))
+                bdi_nr_reclaimable += bdi_stat_sum(bdi, BDI_UNSTABLE);
             bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
         } else if (bdi_nr_reclaimable) {
-            bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-                    bdi_stat(bdi, BDI_UNSTABLE);
+            bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
+            if (bdi_cap_account_unstable(bdi))
+                bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
             bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
         }
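The capability bit makes the opt-in explicit. A hedged sketch of the
pattern a filesystem with unstable pages would follow (mirroring the NFS
hunk above; "foo_server" is a made-up name for illustration):

    /* At mount/server-setup time: declare that this bdi maintains
     * BDI_UNSTABLE statistics, so the VM will include them. */
    foo_server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE;

    /* In VM accounting paths: only read the per-bdi unstable counter
     * when the capability is set, so ordinary block devices skip the
     * extra percpu-counter reads entirely. */
    if (bdi_cap_account_unstable(bdi))
        bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);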
* [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-07 14:41 ` [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust
2010-01-07 14:41 ` [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust
@ 2010-01-07 14:41 ` Trond Myklebust
2010-01-07 14:41 ` [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust
2010-01-07 14:41 ` [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

If the call to do_writepages() succeeded in starting writeback, we do not
know whether or not we will need to COMMIT any unstable writes until after
the write RPC calls are finished. Currently, we assume that at least one
write RPC call will have finished, and set I_DIRTY_DATASYNC by the time
do_writepages is done, so that write_inode() is triggered.

In order to ensure reliable operation (i.e. ensure that a single call to
writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages
are on disk) we need to first wait for filemap_fdatawait() to complete,
then test for unstable pages.

Since NFS is currently the only filesystem that has unstable pages, we can
add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set,
this will trigger a callback to a new address_space_operation to call the
COMMIT.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <peterz@infradead.org>
---

 fs/fs-writeback.c  |   31 ++++++++++++++++++++++++++++++-
 fs/nfs/file.c      |    1 +
 fs/nfs/inode.c     |   16 ----------------
 fs/nfs/internal.h  |    3 ++-
 fs/nfs/super.c     |    2 --
 fs/nfs/write.c     |   33 ++++++++++++++++++++++++++++++++-
 include/linux/fs.h |    9 +++++++++
 7 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c..3640769 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -389,6 +389,17 @@ static int write_inode(struct inode *inode, int sync)
 }

 /*
+ * Commit the NFS unstable pages.
+ */
+static int commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    if (mapping->a_ops && mapping->a_ops->commit_unstable_pages)
+        return mapping->a_ops->commit_unstable_pages(mapping, wbc);
+    return 0;
+}
+
+/*
  * Wait for writeback on an inode to complete.
  */
 static void inode_wait_for_writeback(struct inode *inode)
@@ -475,6 +486,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
     }

     spin_lock(&inode_lock);
+    /*
+     * Special state for cleaning NFS unstable pages
+     */
+    if (inode->i_state & I_UNSTABLE_PAGES) {
+        int err;
+        inode->i_state &= ~I_UNSTABLE_PAGES;
+        spin_unlock(&inode_lock);
+        err = commit_unstable_pages(mapping, wbc);
+        if (ret == 0)
+            ret = err;
+        spin_lock(&inode_lock);
+    }
     inode->i_state &= ~I_SYNC;
     if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
         if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
@@ -533,6 +556,12 @@ select_queue:
                 inode->i_state |= I_DIRTY_PAGES;
                 redirty_tail(inode);
             }
+        } else if (inode->i_state & I_UNSTABLE_PAGES) {
+            /*
+             * The inode has got yet more unstable pages to
+             * commit. Requeue...
+             */
+            redirty_tail(inode);
         } else if (atomic_read(&inode->i_count)) {
             /*
              * The inode is clean, inuse
@@ -1051,7 +1080,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)

     spin_lock(&inode_lock);
     if ((inode->i_state & flags) != flags) {
-        const int was_dirty = inode->i_state & I_DIRTY;
+        const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);

         inode->i_state |= flags;

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b89132..67e50ac 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = {
     .migratepage = nfs_migrate_page,
     .launder_page = nfs_launder_page,
     .error_remove_page = generic_error_remove_page,
+    .commit_unstable_pages = nfs_commit_unstable_pages,
 };

 /*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..8341709 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
     return ino;
 }

-int nfs_write_inode(struct inode *inode, int sync)
-{
-    int ret;
-
-    if (sync) {
-        ret = filemap_fdatawait(inode->i_mapping);
-        if (ret == 0)
-            ret = nfs_commit_inode(inode, FLUSH_SYNC);
-    } else
-        ret = nfs_commit_inode(inode, 0);
-    if (ret >= 0)
-        return 0;
-    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-    return ret;
-}
-
 void nfs_clear_inode(struct inode *inode)
 {
     /*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d..7bb326f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
@@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server,
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);

 /* write.c */
+extern int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce907ef..805c1a0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = {
 static const struct super_operations nfs_sops = {
     .alloc_inode    = nfs_alloc_inode,
     .destroy_inode  = nfs_destroy_inode,
-    .write_inode    = nfs_write_inode,
     .statfs         = nfs_statfs,
     .clear_inode    = nfs_clear_inode,
     .umount_begin   = nfs_umount_begin,
@@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = {
 static const struct super_operations nfs4_sops = {
     .alloc_inode    = nfs_alloc_inode,
     .destroy_inode  = nfs_destroy_inode,
-    .write_inode    = nfs_write_inode,
     .statfs         = nfs_statfs,
     .clear_inode    = nfs4_clear_inode,
     .umount_begin   = nfs_umount_begin,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696..910be28 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
     spin_unlock(&inode->i_lock);
     inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
     inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+    mark_inode_unstable_pages(inode);
 }

 static int
@@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how)
     }
     return res;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    struct inode *inode = mapping->host;
+    int flags = FLUSH_SYNC;
+    int ret;
+
+    /* Don't commit yet if this is a non-blocking flush and there are
+     * outstanding writes for this mapping.
+     */
+    if (wbc->sync_mode != WB_SYNC_ALL &&
+        radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
+                NFS_PAGE_TAG_LOCKED)) {
+        mark_inode_unstable_pages(inode);
+        return 0;
+    }
+    if (wbc->nonblocking)
+        flags = 0;
+    ret = nfs_commit_inode(inode, flags);
+    if (ret > 0)
+        ret = 0;
+    return ret;
+}
+
 #else
 static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
     return 0;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    return 0;
+}
 #endif

 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9147ca8..de594b3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -602,6 +602,8 @@ struct address_space_operations {
     int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
                     unsigned long);
     int (*error_remove_page)(struct address_space *, struct page *);
+    int (*commit_unstable_pages)(struct address_space *,
+                    struct writeback_control *);
 };

 /*
@@ -1635,6 +1637,8 @@ struct super_operations {
 #define I_CLEAR			64
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define __I_UNSTABLE_PAGES	8
+#define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)

 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)

@@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
     __mark_inode_dirty(inode, I_DIRTY_SYNC);
 }

+static inline void mark_inode_unstable_pages(struct inode *inode)
+{
+    __mark_inode_dirty(inode, I_UNSTABLE_PAGES);
+}
+
 /**
  * inc_nlink - directly increment an inode's link count
  * @inode: inode
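The contract of the new address_space_operation is worth spelling out.
A hedged sketch of a minimal implementation for a hypothetical
filesystem (the name "foofs" and its stub body are invented for
illustration; NFS's real implementation is nfs_commit_unstable_pages()
above):

    /* Called from writeback_single_inode() after the WRITEs have been
     * issued (and, for WB_SYNC_ALL, waited for). Should make the
     * previously written pages durable on the server. */
    static int foofs_commit_unstable_pages(struct address_space *mapping,
                                           struct writeback_control *wbc)
    {
        /* If the commit cannot run now, re-arm the state so the
         * flusher comes back: mark_inode_unstable_pages(mapping->host);
         * otherwise issue the commit and return its status. */
        return 0;
    }

    static const struct address_space_operations foofs_aops = {
        /* ... the usual readpage/writepage/etc. operations ... */
        .commit_unstable_pages = foofs_commit_unstable_pages,
    };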
* [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
` (2 preceding siblings ...)
2010-01-07 14:41 ` [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust
@ 2010-01-07 14:41 ` Trond Myklebust
2010-01-07 14:41 ` [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

From: Peter Zijlstra <peterz@infradead.org>

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
---

 fs/nfs/write.c              |    6 +++---
 include/linux/backing-dev.h |    3 ++-
 mm/backing-dev.c            |    6 ++++--
 mm/filemap.c                |    2 +-
 mm/page-writeback.c         |   16 ++++++++++------
 mm/truncate.c               |    2 +-
 6 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 910be28..36549b1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -440,7 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req)
             NFS_PAGE_TAG_COMMIT);
     spin_unlock(&inode->i_lock);
     inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-    inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+    inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
     mark_inode_unstable_pages(inode);
 }

@@ -451,7 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)

     if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
         dec_zone_page_state(page, NR_UNSTABLE_NFS);
-        dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+        dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE);
         return 1;
     }
     return 0;
@@ -1322,7 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
         nfs_mark_request_commit(req);
         dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
         dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-                BDI_RECLAIMABLE);
+                BDI_UNSTABLE);
         nfs_clear_page_tag_locked(req);
     }
     return -ENOMEM;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fcbc26a..42c3e2a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -36,7 +36,8 @@ enum bdi_state {
 typedef int (congested_fn)(void *, int);

 enum bdi_stat_item {
-    BDI_RECLAIMABLE,
+    BDI_DIRTY,
+    BDI_UNSTABLE,
     BDI_WRITEBACK,
     NR_BDI_STAT_ITEMS
 };
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca03..88f3655 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -88,7 +88,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 #define K(x) ((x) << (PAGE_SHIFT - 10))
     seq_printf(m,
            "BdiWriteback:     %8lu kB\n"
-           "BdiReclaimable:   %8lu kB\n"
+           "BdiDirty:         %8lu kB\n"
+           "BdiUnstable:      %8lu kB\n"
            "BdiDirtyThresh:   %8lu kB\n"
            "DirtyThresh:      %8lu kB\n"
            "BackgroundThresh: %8lu kB\n"
@@ -102,7 +103,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
            "wb_list:          %8u\n"
            "wb_cnt:           %8u\n",
            (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-           (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+           (unsigned long) K(bdi_stat(bdi, BDI_DIRTY)),
+           (unsigned long) K(bdi_stat(bdi, BDI_UNSTABLE)),
            K(bdi_thresh), K(dirty_thresh),
            K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
            !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0..458387d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -136,7 +136,7 @@ void __remove_from_page_cache(struct page *page)
      */
     if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
         dec_zone_page_state(page, NR_FILE_DIRTY);
-        dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+        dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
     }
 }

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..23d3fc6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -272,7 +272,8 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
     else
         avail_dirty = 0;

-    avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+    avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
+            bdi_stat(bdi, BDI_UNSTABLE) +
             bdi_stat(bdi, BDI_WRITEBACK);

     *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
@@ -509,7 +510,8 @@ static void balance_dirty_pages(struct address_space *mapping,
                 global_page_state(NR_UNSTABLE_NFS);
         nr_writeback = global_page_state(NR_WRITEBACK);

-        bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+        bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+                bdi_stat(bdi, BDI_UNSTABLE);
         bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);

         if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -554,10 +556,12 @@ static void balance_dirty_pages(struct address_space *mapping,
          * deltas.
          */
         if (bdi_thresh < 2*bdi_stat_error(bdi)) {
-            bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+            bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) +
+                    bdi_stat_sum(bdi, BDI_UNSTABLE);
             bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
         } else if (bdi_nr_reclaimable) {
-            bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+            bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+                    bdi_stat(bdi, BDI_UNSTABLE);
             bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
         }

@@ -1079,7 +1083,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 {
     if (mapping_cap_account_dirty(mapping)) {
         __inc_zone_page_state(page, NR_FILE_DIRTY);
-        __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+        __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
         task_dirty_inc(current);
         task_io_account_write(PAGE_CACHE_SIZE);
     }
@@ -1255,7 +1259,7 @@ int clear_page_dirty_for_io(struct page *page)
         if (TestClearPageDirty(page)) {
             dec_zone_page_state(page, NR_FILE_DIRTY);
             dec_bdi_stat(mapping->backing_dev_info,
-                    BDI_RECLAIMABLE);
+                    BDI_DIRTY);
             return 1;
         }
         return 0;
diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee..b0ce8fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -75,7 +75,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
     if (mapping && mapping_cap_account_dirty(mapping)) {
         dec_zone_page_state(page, NR_FILE_DIRTY);
         dec_bdi_stat(mapping->backing_dev_info,
-                BDI_RECLAIMABLE);
+                BDI_DIRTY);
         if (account_size)
             task_io_account_cancelled_write(account_size);
     }
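The split is purely an accounting refactor; no pages change state. The
invariant, written out explicitly (illustrative, assuming no counter
updates are missed in the conversion):

    /*
     * For every bdi, after this patch:
     *
     *   old bdi_stat(bdi, BDI_RECLAIMABLE)
     *     == bdi_stat(bdi, BDI_DIRTY) + bdi_stat(bdi, BDI_UNSTABLE)
     *
     * so each former BDI_RECLAIMABLE site must either read the sum or
     * pick the single component it actually accounts: dirty page-cache
     * pages (BDI_DIRTY) vs. uncommitted NFS writes (BDI_UNSTABLE).
     */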
* [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages
[not found] ` <20100107144137.17158.53673.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
` (3 preceding siblings ...)
2010-01-07 14:41 ` [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust
@ 2010-01-07 14:41 ` Trond Myklebust
4 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-07 14:41 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

balance_dirty_pages() should really tell the filesystem whether or not it
has an excess of actual dirty pages, or whether it would be more useful to
start freeing up the unstable writes.

Assume that if the number of unstable writes is more than 1/2 the number of
reclaimable pages, then we should force NFS to free up the former.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Peter Zijlstra <peterz@infradead.org>
---

 fs/nfs/write.c            |    2 +-
 include/linux/writeback.h |    5 +++++
 mm/page-writeback.c       |   12 ++++++++++--
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 36549b1..43e47b4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1417,7 +1417,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
     /* Don't commit yet if this is a non-blocking flush and there are
      * outstanding writes for this mapping.
      */
-    if (wbc->sync_mode != WB_SYNC_ALL &&
+    if (!wbc->force_commit_unstable && wbc->sync_mode != WB_SYNC_ALL &&
         radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
                 NFS_PAGE_TAG_LOCKED)) {
         mark_inode_unstable_pages(inode);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 76e8903..8229139 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -62,6 +62,11 @@ struct writeback_control {
      * so we use a single control to update them
      */
     unsigned no_nrwrite_index_update:1;
+    /*
+     * The following is used by balance_dirty_pages() to
+     * force NFS to commit unstable pages.
+     */
+    unsigned force_commit_unstable:1;
 };

 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c06739b..6a0aec7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -503,6 +503,7 @@ static void balance_dirty_pages(struct address_space *mapping,
             .nr_to_write    = write_chunk,
             .range_cyclic   = 1,
         };
+        long bdi_nr_unstable = 0;

         get_dirty_limits(&background_thresh, &dirty_thresh,
                 &bdi_thresh, bdi);
@@ -512,8 +513,10 @@ static void balance_dirty_pages(struct address_space *mapping,
         nr_writeback = global_page_state(NR_WRITEBACK);

         bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
-        if (bdi_cap_account_unstable(bdi))
-            bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
+        if (bdi_cap_account_unstable(bdi)) {
+            bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE);
+            bdi_nr_reclaimable += bdi_nr_unstable;
+        }
         bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);

         if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -541,6 +544,11 @@ static void balance_dirty_pages(struct address_space *mapping,
          * up.
          */
         if (bdi_nr_reclaimable > bdi_thresh) {
+            wbc.force_commit_unstable = 0;
+            /* Force NFS to also free up unstable writes. */
+            if (bdi_nr_unstable > bdi_nr_reclaimable / 2)
+                wbc.force_commit_unstable = 1;
+
             writeback_inodes_wbc(&wbc);
             pages_written += write_chunk - wbc.nr_to_write;
             get_dirty_limits(&background_thresh, &dirty_thresh,
* [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
[not found] ` <20100106205110.22547.85345.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-06 20:51 ` [PATCH 5/6] VM: Use per-bdi unstable accounting to improve use of wbc->force_commit Trond Myklebust
2010-01-06 20:51 ` [PATCH 2/6] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust
@ 2010-01-06 20:51 ` Trond Myklebust
[not found] ` <20100106205110.22547.17971.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2010-01-07 2:18 ` Wu Fengguang
2010-01-06 20:51 ` [PATCH 6/6] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust
` (3 subsequent siblings)
6 siblings, 2 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

If the call to do_writepages() succeeded in starting writeback, we do not
know whether or not we will need to COMMIT any unstable writes until after
the write RPC calls are finished. Currently, we assume that at least one
write RPC call will have finished, and set I_DIRTY_DATASYNC by the time
do_writepages is done, so that write_inode() is triggered.

In order to ensure reliable operation (i.e. ensure that a single call to
writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages
are on disk) we need to first wait for filemap_fdatawait() to complete,
then test for unstable pages.

Since NFS is currently the only filesystem that has unstable pages, we can
add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set,
this will trigger a callback to a new address_space_operation to call the
COMMIT.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
---

 fs/fs-writeback.c  |   31 ++++++++++++++++++++++++++++++-
 fs/nfs/file.c      |    1 +
 fs/nfs/inode.c     |   16 ----------------
 fs/nfs/internal.h  |    3 ++-
 fs/nfs/super.c     |    2 --
 fs/nfs/write.c     |   33 ++++++++++++++++++++++++++++++++-
 include/linux/fs.h |    9 +++++++++
 7 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a7c42c..3bc0a96 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -389,6 +389,17 @@ static int write_inode(struct inode *inode, int sync)
 }

 /*
+ * Commit the NFS unstable pages.
+ */
+static int commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    if (mapping->a_ops && mapping->a_ops->commit_unstable_pages)
+        return mapping->a_ops->commit_unstable_pages(mapping, wbc);
+    return 0;
+}
+
+/*
  * Wait for writeback on an inode to complete.
  */
 static void inode_wait_for_writeback(struct inode *inode)
@@ -475,6 +486,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
     }

     spin_lock(&inode_lock);
+    /*
+     * Special state for cleaning NFS unstable pages
+     */
+    if (inode->i_state & I_UNSTABLE_PAGES) {
+        int err;
+        inode->i_state &= ~I_UNSTABLE_PAGES;
+        spin_unlock(&inode_lock);
+        err = commit_unstable_pages(mapping, wbc);
+        if (ret == 0)
+            ret = err;
+        spin_lock(&inode_lock);
+    }
     inode->i_state &= ~I_SYNC;
     if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
         if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
@@ -533,6 +556,12 @@ select_queue:
                 inode->i_state |= I_DIRTY_PAGES;
                 redirty_tail(inode);
             }
+        } else if (inode->i_state & I_UNSTABLE_PAGES) {
+            /*
+             * The inode has got yet more unstable pages to
+             * commit. Requeue on b_more_io
+             */
+            requeue_io(inode);
         } else if (atomic_read(&inode->i_count)) {
             /*
              * The inode is clean, inuse
@@ -1051,7 +1080,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)

     spin_lock(&inode_lock);
     if ((inode->i_state & flags) != flags) {
-        const int was_dirty = inode->i_state & I_DIRTY;
+        const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);

         inode->i_state |= flags;

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 6b89132..67e50ac 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = {
     .migratepage = nfs_migrate_page,
     .launder_page = nfs_launder_page,
     .error_remove_page = generic_error_remove_page,
+    .commit_unstable_pages = nfs_commit_unstable_pages,
 };

 /*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index faa0918..8341709 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
     return ino;
 }

-int nfs_write_inode(struct inode *inode, int sync)
-{
-    int ret;
-
-    if (sync) {
-        ret = filemap_fdatawait(inode->i_mapping);
-        if (ret == 0)
-            ret = nfs_commit_inode(inode, FLUSH_SYNC);
-    } else
-        ret = nfs_commit_inode(inode, 0);
-    if (ret >= 0)
-        return 0;
-    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-    return ret;
-}
-
 void nfs_clear_inode(struct inode *inode)
 {
     /*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 29e464d..7bb326f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
 extern struct workqueue_struct *nfsiod_workqueue;
 extern struct inode *nfs_alloc_inode(struct super_block *sb);
 extern void nfs_destroy_inode(struct inode *);
-extern int nfs_write_inode(struct inode *,int);
 extern void nfs_clear_inode(struct inode *);
 #ifdef CONFIG_NFS_V4
 extern void nfs4_clear_inode(struct inode *);
@@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server,
 extern void nfs_read_prepare(struct rpc_task *task, void *calldata);

 /* write.c */
+extern int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc);
 extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ce907ef..805c1a0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = {
 static const struct super_operations nfs_sops = {
     .alloc_inode    = nfs_alloc_inode,
     .destroy_inode  = nfs_destroy_inode,
-    .write_inode    = nfs_write_inode,
     .statfs         = nfs_statfs,
     .clear_inode    = nfs_clear_inode,
     .umount_begin   = nfs_umount_begin,
@@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = {
 static const struct super_operations nfs4_sops = {
     .alloc_inode    = nfs_alloc_inode,
     .destroy_inode  = nfs_destroy_inode,
-    .write_inode    = nfs_write_inode,
     .statfs         = nfs_statfs,
     .clear_inode    = nfs4_clear_inode,
     .umount_begin   = nfs_umount_begin,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d171696..910be28 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
     spin_unlock(&inode->i_lock);
     inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
     inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
-    __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+    mark_inode_unstable_pages(inode);
 }

 static int
@@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how)
     }
     return res;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    struct inode *inode = mapping->host;
+    int flags = FLUSH_SYNC;
+    int ret;
+
+    /* Don't commit yet if this is a non-blocking flush and there are
+     * outstanding writes for this mapping.
+     */
+    if (wbc->sync_mode != WB_SYNC_ALL &&
+        radix_tree_tagged(&NFS_I(inode)->nfs_page_tree,
+                NFS_PAGE_TAG_LOCKED)) {
+        mark_inode_unstable_pages(inode);
+        return 0;
+    }
+    if (wbc->nonblocking)
+        flags = 0;
+    ret = nfs_commit_inode(inode, flags);
+    if (ret > 0)
+        ret = 0;
+    return ret;
+}
+
 #else
 static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 {
     return 0;
 }
+
+int nfs_commit_unstable_pages(struct address_space *mapping,
+                 struct writeback_control *wbc)
+{
+    return 0;
+}
 #endif

 long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9147ca8..ea0b7a3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -602,6 +602,8 @@ struct address_space_operations {
     int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
                     unsigned long);
     int (*error_remove_page)(struct address_space *, struct page *);
+    int (*commit_unstable_pages)(struct address_space *,
+                    struct writeback_control *);
 };

 /*
@@ -1635,6 +1637,8 @@ struct super_operations {
 #define I_CLEAR			64
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
+#define __I_UNSTABLE_PAGES	9
+#define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)

 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)

@@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
     __mark_inode_dirty(inode, I_DIRTY_SYNC);
 }

+static inline void mark_inode_unstable_pages(struct inode *inode)
+{
+    __mark_inode_dirty(inode, I_UNSTABLE_PAGES);
+}
+
 /**
  * inc_nlink - directly increment an inode's link count
  * @inode: inode
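The new state threads through writeback in a fixed order. A condensed
sketch of the flow in writeback_single_inode() after this patch
(paraphrasing the commit message and the hunks above; error handling and
locking elided):

    /*
     * 1. do_writepages() issues the WRITE RPCs for the dirty pages.
     * 2. Under WB_SYNC_ALL, filemap_fdatawait() waits for those WRITEs
     *    to finish; only then is it known what needs a COMMIT.
     * 3. If I_UNSTABLE_PAGES is now set, clear it and call the new
     *    a_op ->commit_unstable_pages() to issue the COMMIT; the a_op
     *    re-marks the inode if the commit has to be deferred.
     * 4. If the inode still carries I_UNSTABLE_PAGES afterwards, it is
     *    requeued so the flusher thread revisits it.
     */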
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
[not found] ` <20100106205110.22547.17971.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2010-01-06 21:38 ` Jan Kara
[not found] ` <20100106213843.GD22781-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org>
0 siblings, 1 reply; 96+ messages in thread
From: Jan Kara @ 2010-01-06 21:38 UTC (permalink / raw)
To: Trond Myklebust
Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Wed 06-01-10 15:51:10, Trond Myklebust wrote:
...
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 9147ca8..ea0b7a3 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1635,6 +1637,8 @@ struct super_operations {
>  #define I_CLEAR		64
>  #define __I_SYNC		7
>  #define I_SYNC			(1 << __I_SYNC)
> +#define __I_UNSTABLE_PAGES	9

Hum, why isn't this 8?

Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
[not found] ` <20100106213843.GD22781-+0h/O2h83AeN3ZZ/Hiejyg@public.gmane.org>
@ 2010-01-06 21:48 ` Trond Myklebust
0 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 21:48 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Peter Zijlstra, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Wed, 2010-01-06 at 22:38 +0100, Jan Kara wrote:
> On Wed 06-01-10 15:51:10, Trond Myklebust wrote:
> ...
> > +#define __I_UNSTABLE_PAGES	9
>
> Hum, why isn't this 8?

I missed Christoph's patch that got rid of I_LOCK. I think that was
merged after I started work on these patches. I'd be quite OK with
changing the above value to 8 if that is preferable.

Cheers
Trond
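For context, the bit numbering at stake in this exchange, as a sketch
(values taken from the patches in this thread; whether bit 8 is free
depends on Christoph's I_LOCK removal being merged, as Trond notes, and
the reposted 1/5 does switch to 8):

    #define __I_SYNC		7
    #define I_SYNC		(1 << __I_SYNC)
    /* bit 8 was occupied by I_LOCK before its removal */
    #define __I_UNSTABLE_PAGES	8	/* 9 in the patch as first posted */
    #define I_UNSTABLE_PAGES	(1 << __I_UNSTABLE_PAGES)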
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-06 20:51 ` [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust [not found] ` <20100106205110.22547.17971.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2010-01-07 2:18 ` Wu Fengguang [not found] ` <1262839082.2185.15.camel@localhost> 1 sibling, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2010-01-07 2:18 UTC (permalink / raw) To: Trond Myklebust Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote: > If the call to do_writepages() succeeded in starting writeback, we do not > know whether or not we will need to COMMIT any unstable writes until after > the write RPC calls are finished. Currently, we assume that at least one > write RPC call will have finished, and set I_DIRTY_DATASYNC by the time > do_writepages is done, so that write_inode() is triggered. > > In order to ensure reliable operation (i.e. ensure that a single call to > writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages > are on disk) we need to first wait for filemap_fdatawait() to complete, > then test for unstable pages. > > Since NFS is currently the only filesystem that has unstable pages, we can > add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set, > this will trigger a callback to a new address_space_operation to call the > COMMIT. > > Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> > Acked-by: Jan Kara <jack@suse.cz> > --- > > fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- > fs/nfs/file.c | 1 + > fs/nfs/inode.c | 16 ---------------- > fs/nfs/internal.h | 3 ++- > fs/nfs/super.c | 2 -- > fs/nfs/write.c | 33 ++++++++++++++++++++++++++++++++- > include/linux/fs.h | 9 +++++++++ > 7 files changed, 74 insertions(+), 21 deletions(-) > > diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c > index 1a7c42c..3bc0a96 100644 > --- a/fs/fs-writeback.c > +++ b/fs/fs-writeback.c > @@ -389,6 +389,17 @@ static int write_inode(struct inode *inode, int sync) > } > > /* > + * Commit the NFS unstable pages. > + */ > +static int commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + if (mapping->a_ops && mapping->a_ops->commit_unstable_pages) > + return mapping->a_ops->commit_unstable_pages(mapping, wbc); > + return 0; > +} > + > +/* > * Wait for writeback on an inode to complete. > */ > static void inode_wait_for_writeback(struct inode *inode) > @@ -475,6 +486,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) > } > > spin_lock(&inode_lock); > + /* > + * Special state for cleaning NFS unstable pages > + */ > + if (inode->i_state & I_UNSTABLE_PAGES) { > + int err; > + inode->i_state &= ~I_UNSTABLE_PAGES; > + spin_unlock(&inode_lock); > + err = commit_unstable_pages(mapping, wbc); > + if (ret == 0) > + ret = err; > + spin_lock(&inode_lock); > + } > inode->i_state &= ~I_SYNC; > if (!(inode->i_state & (I_FREEING | I_CLEAR))) { > if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { > @@ -533,6 +556,12 @@ select_queue: > inode->i_state |= I_DIRTY_PAGES; > redirty_tail(inode); > } > + } else if (inode->i_state & I_UNSTABLE_PAGES) { > + /* > + * The inode has got yet more unstable pages to > + * commit. 
Requeue on b_more_io
> +	 */
> +	requeue_io(inode);

This risks "busy retrying" inodes with unstable pages, when

- nfs_commit_unstable_pages() doesn't think it's time to commit
- the NFS server somehow responds slowly

The workaround is to use redirty_tail() for now. But that risks delaying
the COMMIT for up to 30s, which obviously might leave applications stuck
in balance_dirty_pages() for too long.

I have a patch to shorten the retry time to 1s (or some other constant)
by introducing b_more_io_wait. It currently sits in my writeback queue
series, whose main blocking issue is the constantly broken NFS pipeline..

> 	} else if (atomic_read(&inode->i_count)) {
> 		/*
> 		 * The inode is clean, inuse
> @@ -1051,7 +1080,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
>
>  	spin_lock(&inode_lock);
>  	if ((inode->i_state & flags) != flags) {
> -		const int was_dirty = inode->i_state & I_DIRTY;
> +		const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES);
>
>  		inode->i_state |= flags;
>
> diff --git a/fs/nfs/file.c b/fs/nfs/file.c
> index 6b89132..67e50ac 100644
> --- a/fs/nfs/file.c
> +++ b/fs/nfs/file.c
> @@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = {
>  	.migratepage = nfs_migrate_page,
>  	.launder_page = nfs_launder_page,
>  	.error_remove_page = generic_error_remove_page,
> +	.commit_unstable_pages = nfs_commit_unstable_pages,
>  };
>
>  /*
> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
> index faa0918..8341709 100644
> --- a/fs/nfs/inode.c
> +++ b/fs/nfs/inode.c
> @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid)
>  	return ino;
>  }
>
> -int nfs_write_inode(struct inode *inode, int sync)
> -{
> -	int ret;
> -
> -	if (sync) {
> -		ret = filemap_fdatawait(inode->i_mapping);
> -		if (ret == 0)
> -			ret = nfs_commit_inode(inode, FLUSH_SYNC);
> -	} else
> -		ret = nfs_commit_inode(inode, 0);
> -	if (ret >= 0)
> -		return 0;
> -	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> -	return ret;
> -}
> -
>  void nfs_clear_inode(struct inode *inode)
>  {
>  	/*
> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
> index 29e464d..7bb326f 100644
> --- a/fs/nfs/internal.h
> +++ b/fs/nfs/internal.h
> @@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
>  extern struct workqueue_struct *nfsiod_workqueue;
>  extern struct inode *nfs_alloc_inode(struct super_block *sb);
>  extern void nfs_destroy_inode(struct inode *);
> -extern int nfs_write_inode(struct inode *,int);
>  extern void nfs_clear_inode(struct inode *);
>  #ifdef CONFIG_NFS_V4
>  extern void nfs4_clear_inode(struct inode *);
> @@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server,
>  extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
>
>  /* write.c */
> +extern int nfs_commit_unstable_pages(struct address_space *mapping,
> +		struct writeback_control *wbc);
>  extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
>  #ifdef CONFIG_MIGRATION
>  extern int nfs_migrate_page(struct address_space *,
> diff --git a/fs/nfs/super.c b/fs/nfs/super.c
> index ce907ef..805c1a0 100644
> --- a/fs/nfs/super.c
> +++ b/fs/nfs/super.c
> @@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = {
>  static const struct super_operations nfs_sops = {
>  	.alloc_inode	= nfs_alloc_inode,
>  	.destroy_inode	= nfs_destroy_inode,
> -	.write_inode	= nfs_write_inode,
>  	.statfs		= nfs_statfs,
>  	.clear_inode	= nfs_clear_inode,
>  	.umount_begin	= nfs_umount_begin,
> @@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = {
>  static const struct super_operations nfs4_sops = {
.alloc_inode = nfs_alloc_inode, > .destroy_inode = nfs_destroy_inode, > - .write_inode = nfs_write_inode, > .statfs = nfs_statfs, > .clear_inode = nfs4_clear_inode, > .umount_begin = nfs_umount_begin, > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > index d171696..910be28 100644 > --- a/fs/nfs/write.c > +++ b/fs/nfs/write.c > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > spin_unlock(&inode->i_lock); > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > + mark_inode_unstable_pages(inode); Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > } > > static int > @@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how) > } > return res; > } > + > +int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + struct inode *inode = mapping->host; > + int flags = FLUSH_SYNC; > + int ret; > + > + /* Don't commit yet if this is a non-blocking flush and there are > + * outstanding writes for this mapping. > + */ > + if (wbc->sync_mode != WB_SYNC_ALL && > + radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, > + NFS_PAGE_TAG_LOCKED)) { > + mark_inode_unstable_pages(inode); > + return 0; > + } A dumb question: does NFS_PAGE_TAG_LOCKED means either flying COMMITs or WRITEs? As an NFS newbie, I'm only confident on the COMMIT part :) > + if (wbc->nonblocking) > + flags = 0; > + ret = nfs_commit_inode(inode, flags); > + if (ret > 0) > + ret = 0; > + return ret; > +} > + > #else > static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) > { > return 0; > } > + > +int nfs_commit_unstable_pages(struct address_space *mapping, > + struct writeback_control *wbc) > +{ > + return 0; > +} > #endif > > long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 9147ca8..ea0b7a3 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -602,6 +602,8 @@ struct address_space_operations { > int (*is_partially_uptodate) (struct page *, read_descriptor_t *, > unsigned long); > int (*error_remove_page)(struct address_space *, struct page *); > + int (*commit_unstable_pages)(struct address_space *, > + struct writeback_control *); > }; > > /* > @@ -1635,6 +1637,8 @@ struct super_operations { > #define I_CLEAR 64 > #define __I_SYNC 7 > #define I_SYNC (1 << __I_SYNC) > +#define __I_UNSTABLE_PAGES 9 > +#define I_UNSTABLE_PAGES (1 << __I_UNSTABLE_PAGES) > > #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) > > @@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) > __mark_inode_dirty(inode, I_DIRTY_SYNC); > } > > +static inline void mark_inode_unstable_pages(struct inode *inode) > +{ > + __mark_inode_dirty(inode, I_UNSTABLE_PAGES); > +} > + > /** > * inc_nlink - directly increment an inode's link count > * @inode: inode > ^ permalink raw reply [flat|nested] 96+ messages in thread
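The requeue trade-off Fengguang describes reduces to three retry delays.
A toy userspace model of the choices follows; the millisecond figures are
illustrative constants from the discussion (immediate retry, the ~30s
kupdate interval, the proposed ~1s wait), not values read out of a kernel
tree:

    #include <stdio.h>

    /* Toy model of the three requeue policies for an inode that still
     * has unstable pages after one writeback pass. */
    enum policy { REQUEUE_IO, REDIRTY_TAIL, MORE_IO_WAIT };

    static int next_retry_ms(enum policy p)
    {
        switch (p) {
        case REQUEUE_IO:   return 0;      /* immediate: risks busy retry  */
        case REDIRTY_TAIL: return 30000;  /* next kupdate pass: up to 30s */
        case MORE_IO_WAIT: return 1000;   /* proposed b_more_io_wait: ~1s */
        }
        return -1;
    }

    int main(void)
    {
        const char *name[] = { "requeue_io", "redirty_tail", "b_more_io_wait" };
        for (int p = REQUEUE_IO; p <= MORE_IO_WAIT; p++)
            printf("%-15s -> retry COMMIT after %5d ms\n",
                   name[p], next_retry_ms(p));
        return 0;
    }

The middle ground is exactly what b_more_io_wait is meant to provide:
late enough to avoid busy-looping on a slow server, early enough not to
stall balance_dirty_pages().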
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes [not found] ` <1262839082.2185.15.camel@localhost> @ 2010-01-07 4:48 ` Wu Fengguang 2010-01-07 4:53 ` [PATCH 0/5] Re: [PATCH] improve the performance of large sequential write NFS workloads Trond Myklebust 2010-01-07 14:56 ` [PATCH 1/6] " Wu Fengguang 1 sibling, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2010-01-07 4:48 UTC (permalink / raw) To: Myklebust, Trond Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > On Thu, 2010-01-07 at 10:18 +0800, Wu Fengguang wrote: > > On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote: > > > @@ -533,6 +556,12 @@ select_queue: > > > inode->i_state |= I_DIRTY_PAGES; > > > redirty_tail(inode); > > > } > > > + } else if (inode->i_state & I_UNSTABLE_PAGES) { > > > + /* > > > + * The inode has got yet more unstable pages to > > > + * commit. Requeue on b_more_io > > > + */ > > > + requeue_io(inode); > > > > This risks "busy retrying" inodes with unstable pages, when > > > > - nfs_commit_unstable_pages() don't think it's time to commit > > - NFS server somehow response slowly > > > > The workaround is to use redirty_tail() for now. But that risks delay > > the COMMIT for up to 30s, which obviously might stuck applications in > > balance_dirty_pages() for too long. > > > > I have a patch to shorten the retry time to 1s (or other constant) > > by introducing b_more_io_wait. It currently sits in my writeback queue > > series whose main blocking issue is the constantly broken NFS pipeline.. > > > OK. Should I use redirty_tail() for the moment then, and assume you will > fix when you introduce the new state? OK, I'll change your redirty_tail() then :) > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > index d171696..910be28 100644 > > > --- a/fs/nfs/write.c > > > +++ b/fs/nfs/write.c > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > spin_unlock(&inode->i_lock); > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > + mark_inode_unstable_pages(inode); > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > this patch is applied. We won't ever need it. > > If the VM or VFS is doing it, then they ought to be fixed: there is no > reason to assume that all filesystems need to sync their inodes on > i_size changes. Ah OK, I took it for certain.. > > > + /* Don't commit yet if this is a non-blocking flush and there are > > > + * outstanding writes for this mapping. > > > + */ > > > + if (wbc->sync_mode != WB_SYNC_ALL && > > > + radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, > > > + NFS_PAGE_TAG_LOCKED)) { > > > + mark_inode_unstable_pages(inode); > > > + return 0; > > > + } > > > > A dumb question: does NFS_PAGE_TAG_LOCKED means either flying COMMITs > > or WRITEs? As an NFS newbie, I'm only confident on the COMMIT part :) > > Both writebacks and commits will cause NFS_PAGE_TAG_LOCKED to be set, as > will attempts to change the page contents. See the calls to > nfs_set_page_tag_locked()... Thanks for the tip! 
> IOW: the above code will fail to trigger if there are outstanding WRITE
> RPC calls, or if there is a second process that happens to be writing to
> this inode's page cache...

IOW: for a busy "cp", the commit of the inode may be delayed until
"nr_unstable > nr_dirty / 2"? OK, that's what we want.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
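Stripped of the kernel plumbing, the bail-out being probed here is a
small predicate. A hedged sketch, where struct wbc and rpcs_in_flight
stand in for the real writeback_control and the NFS_PAGE_TAG_LOCKED
radix-tree test, and force_commit anticipates the flag added later in
this series:

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for struct writeback_control; only the fields the
     * predicate needs. */
    struct wbc { bool sync_all; bool force_commit; };

    /* Mirror of the bail-out in nfs_commit_unstable_pages(): defer the
     * COMMIT on a non-blocking flush while RPCs are still in flight. */
    static bool should_commit(const struct wbc *wbc, bool rpcs_in_flight)
    {
        if (!wbc->sync_all && !wbc->force_commit && rpcs_in_flight)
            return false;   /* re-mark the inode and retry later */
        return true;
    }

    int main(void)
    {
        struct wbc bg     = { .sync_all = false, .force_commit = false };
        struct wbc forced = { .sync_all = false, .force_commit = true  };

        printf("background flush, RPCs in flight: commit=%d\n",
               should_commit(&bg, true));
        printf("forced flush,     RPCs in flight: commit=%d\n",
               should_commit(&forced, true));
        return 0;
    }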
* [PATCH 0/5] Re: [PATCH] improve the performance of large sequential write NFS workloads 2010-01-07 4:48 ` Wu Fengguang @ 2010-01-07 4:53 ` Trond Myklebust [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Take 2 of this series, incorporating the suggested changes from Jan and Fengguang... Cheers Trond --- Peter Zijlstra (1): VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust (4): NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set VM/NFS: The VM must tell the filesystem when to free reclaimable pages VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices VFS: Ensure that writeback_single_inode() commits unstable writes fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- fs/nfs/client.c | 1 + fs/nfs/file.c | 1 + fs/nfs/inode.c | 16 ---------------- fs/nfs/internal.h | 3 ++- fs/nfs/super.c | 2 -- fs/nfs/write.c | 39 +++++++++++++++++++++++++++++++++++---- include/linux/backing-dev.h | 9 ++++++++- include/linux/fs.h | 9 +++++++++ include/linux/writeback.h | 5 +++++ mm/backing-dev.c | 6 ++++-- mm/filemap.c | 2 +- mm/page-writeback.c | 30 ++++++++++++++++++++++++------ mm/truncate.c | 2 +- 14 files changed, 121 insertions(+), 35 deletions(-) -- Signature ^ permalink raw reply [flat|nested] 96+ messages in thread
* [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> @ 2010-01-07 4:53 ` Trond Myklebust 2010-01-07 4:53 ` [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust ` (3 subsequent siblings) 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Speeds up the accounting in balance_dirty_pages() for non-nfs devices. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- fs/nfs/client.c | 1 + include/linux/backing-dev.h | 6 ++++++ mm/page-writeback.c | 16 +++++++++++----- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ee77713..d0b060a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -890,6 +890,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; + server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE; if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 42c3e2a..8b45166 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -232,6 +232,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_EXEC_MAP 0x00000040 #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 +#define BDI_CAP_ACCT_UNSTABLE 0x00000200 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) @@ -311,6 +312,11 @@ static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) return bdi == &default_backing_dev_info; } +static inline bool bdi_cap_account_unstable(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE; +} + static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 23d3fc6..c06739b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -273,8 +273,9 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, avail_dirty = 0; avail_dirty += bdi_stat(bdi, BDI_DIRTY) + - bdi_stat(bdi, BDI_UNSTABLE) + bdi_stat(bdi, BDI_WRITEBACK); + if (bdi_cap_account_unstable(bdi)) + avail_dirty += bdi_stat(bdi, BDI_UNSTABLE); *pbdi_dirty = min(*pbdi_dirty, avail_dirty); } @@ -510,8 +511,9 @@ static void balance_dirty_pages(struct address_space *mapping, global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + - bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -556,11 +558,15 @@ static void balance_dirty_pages(struct address_space *mapping, * deltas. 
*/ if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) + + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat_sum(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); + if (bdi_cap_account_unstable(bdi)) + bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } ^ permalink raw reply related [flat|nested] 96+ messages in thread
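The shape of this optimization is easy to see outside the kernel: a
per-bdi capability bit gates the extra counter read. A minimal sketch
with stand-in types; only the BDI_CAP_ACCT_UNSTABLE constant mirrors the
patch hunk:

    #include <stdbool.h>
    #include <stdio.h>

    #define BDI_CAP_ACCT_UNSTABLE 0x00000200

    struct backing_dev_info { unsigned int capabilities; };

    /* Only bdi's that set the bit (i.e. NFS) pay for the extra
     * per-bdi unstable counter read. */
    static bool bdi_cap_account_unstable(const struct backing_dev_info *bdi)
    {
        return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE;
    }

    int main(void)
    {
        struct backing_dev_info disk = { 0 };
        struct backing_dev_info nfs  = { BDI_CAP_ACCT_UNSTABLE };

        printf("disk accounts unstable: %d\n", bdi_cap_account_unstable(&disk));
        printf("nfs  accounts unstable: %d\n", bdi_cap_account_unstable(&nfs));
        return 0;
    }

Non-NFS devices never set the bit, so balance_dirty_pages() skips one
per-bdi counter sum per iteration for them, which is the speed-up the
changelog claims.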
* [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2010-01-07 4:53 ` [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust @ 2010-01-07 4:53 ` Trond Myklebust 2010-01-07 4:53 ` [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust ` (2 subsequent siblings) 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org balance_dirty_pages() should really tell the filesystem whether or not it has an excess of actual dirty pages, or whether it would be more useful to start freeing up the unstable writes. Assume that if the number of unstable writes is more than 1/2 the number of reclaimable pages, then we should force NFS to free up the former. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- fs/nfs/write.c | 2 +- include/linux/writeback.h | 5 +++++ mm/page-writeback.c | 12 ++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 36549b1..978de7f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1417,7 +1417,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping, /* Don't commit yet if this is a non-blocking flush and there are * outstanding writes for this mapping. */ - if (wbc->sync_mode != WB_SYNC_ALL && + if (!wbc->force_commit && wbc->sync_mode != WB_SYNC_ALL && radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, NFS_PAGE_TAG_LOCKED)) { mark_inode_unstable_pages(inode); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 76e8903..3fd5c3e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -62,6 +62,11 @@ struct writeback_control { * so we use a single control to update them */ unsigned no_nrwrite_index_update:1; + /* + * The following is used by balance_dirty_pages() to + * force NFS to commit unstable pages. + */ + unsigned force_commit:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c06739b..c537543 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -503,6 +503,7 @@ static void balance_dirty_pages(struct address_space *mapping, .nr_to_write = write_chunk, .range_cyclic = 1, }; + long bdi_nr_unstable = 0; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -512,8 +513,10 @@ static void balance_dirty_pages(struct address_space *mapping, nr_writeback = global_page_state(NR_WRITEBACK); bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY); - if (bdi_cap_account_unstable(bdi)) - bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE); + if (bdi_cap_account_unstable(bdi)) { + bdi_nr_unstable = bdi_stat(bdi, BDI_UNSTABLE); + bdi_nr_reclaimable += bdi_nr_unstable; + } bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -541,6 +544,11 @@ static void balance_dirty_pages(struct address_space *mapping, * up. */ if (bdi_nr_reclaimable > bdi_thresh) { + wbc.force_commit = 0; + /* Force NFS to also free up unstable writes. 
*/ + if (bdi_nr_unstable > bdi_nr_reclaimable / 2) + wbc.force_commit = 1; + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, ^ permalink raw reply related [flat|nested] 96+ messages in thread
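Pulled out as a pure function, the heuristic this patch wires into
balance_dirty_pages() is just a ratio test. A sketch using the patch's
threshold; the page counts are made up for the illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* Once unstable pages make up more than half of the bdi's
     * reclaimable total, force NFS to COMMIT them. */
    static bool want_force_commit(long bdi_nr_unstable, long bdi_nr_reclaimable)
    {
        return bdi_nr_unstable > bdi_nr_reclaimable / 2;
    }

    int main(void)
    {
        /* reclaimable = dirty + unstable, as computed in the patch */
        long dirty = 300, unstable = 500;
        long reclaimable = dirty + unstable;

        printf("force_commit = %d\n", want_force_commit(unstable, reclaimable));
        return 0;
    }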
* [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> 2010-01-07 4:53 ` [PATCH 3/5] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust 2010-01-07 4:53 ` [PATCH 4/5] VM/NFS: The VM must tell the filesystem when to free reclaimable pages Trond Myklebust @ 2010-01-07 4:53 ` Trond Myklebust 2010-01-07 4:53 ` [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust 2010-01-07 4:53 ` [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org From: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- fs/nfs/write.c | 6 +++--- include/linux/backing-dev.h | 3 ++- mm/backing-dev.c | 6 ++++-- mm/filemap.c | 2 +- mm/page-writeback.c | 16 ++++++++++------ mm/truncate.c | 2 +- 6 files changed, 21 insertions(+), 14 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 910be28..36549b1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -440,7 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req) NFS_PAGE_TAG_COMMIT); spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); + inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE); mark_inode_unstable_pages(inode); } @@ -451,7 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req) if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE); return 1; } return 0; @@ -1322,7 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) nfs_mark_request_commit(req); dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_UNSTABLE); nfs_clear_page_tag_locked(req); } return -ENOMEM; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fcbc26a..42c3e2a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,7 +36,8 @@ enum bdi_state { typedef int (congested_fn)(void *, int); enum bdi_stat_item { - BDI_RECLAIMABLE, + BDI_DIRTY, + BDI_UNSTABLE, BDI_WRITEBACK, NR_BDI_STAT_ITEMS }; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0e8ca03..88f3655 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -88,7 +88,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, "BdiWriteback: %8lu kB\n" - "BdiReclaimable: %8lu kB\n" + "BdiDirty: %8lu kB\n" + "BdiUnstable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" @@ -102,7 +103,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "wb_list: %8u\n" "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), - (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + (unsigned long) K(bdi_stat(bdi, BDI_DIRTY)), + (unsigned 
long) K(bdi_stat(bdi, BDI_UNSTABLE)), K(bdi_thresh), K(dirty_thresh), K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, diff --git a/mm/filemap.c b/mm/filemap.c index 96ac6b0..458387d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -136,7 +136,7 @@ void __remove_from_page_cache(struct page *page) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b19943..23d3fc6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -272,7 +272,8 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, else avail_dirty = 0; - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + + avail_dirty += bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE) + bdi_stat(bdi, BDI_WRITEBACK); *pbdi_dirty = min(*pbdi_dirty, avail_dirty); @@ -509,7 +510,8 @@ static void balance_dirty_pages(struct address_space *mapping, global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) @@ -554,10 +556,12 @@ static void balance_dirty_pages(struct address_space *mapping, * deltas. */ if (bdi_thresh < 2*bdi_stat_error(bdi)) { - bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) + + bdi_stat_sum(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); } else if (bdi_nr_reclaimable) { - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) + + bdi_stat(bdi, BDI_UNSTABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } @@ -1079,7 +1083,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } @@ -1255,7 +1259,7 @@ int clear_page_dirty_for_io(struct page *page) if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_DIRTY); return 1; } return 0; diff --git a/mm/truncate.c b/mm/truncate.c index 342deee..b0ce8fb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -75,7 +75,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); dec_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); + BDI_DIRTY); if (account_size) task_io_account_cancelled_write(account_size); } ^ permalink raw reply related [flat|nested] 96+ messages in thread
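The accounting split can be modelled in a few lines: where
BDI_RECLAIMABLE used to be a single bucket, the patch keeps dirty
page-cache pages and unstable NFS writes in separate counters and lets
callers sum them when they want the old value. Stand-in types; the enum
mirrors the patch:

    #include <stdio.h>

    enum bdi_stat_item { BDI_DIRTY, BDI_UNSTABLE, BDI_WRITEBACK,
                         NR_BDI_STAT_ITEMS };

    struct backing_dev_info { long stat[NR_BDI_STAT_ITEMS]; };

    /* old BDI_RECLAIMABLE == new BDI_DIRTY + BDI_UNSTABLE */
    static long bdi_reclaimable(const struct backing_dev_info *bdi)
    {
        return bdi->stat[BDI_DIRTY] + bdi->stat[BDI_UNSTABLE];
    }

    int main(void)
    {
        struct backing_dev_info bdi = { .stat = { 120, 80, 40 } };
        printf("reclaimable = %ld\n", bdi_reclaimable(&bdi));
        return 0;
    }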
* [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> ` (2 preceding siblings ...) 2010-01-07 4:53 ` [PATCH 2/5] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE Trond Myklebust @ 2010-01-07 4:53 ` Trond Myklebust 2010-01-07 4:53 ` [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes Trond Myklebust 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> --- fs/nfs/write.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 978de7f..d6d8048 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1423,7 +1423,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping, mark_inode_unstable_pages(inode); return 0; } - if (wbc->nonblocking) + if (wbc->nonblocking || wbc->for_background) flags = 0; ret = nfs_commit_inode(inode, flags); if (ret > 0) ^ permalink raw reply related [flat|nested] 96+ messages in thread
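The one-liner's effect on the COMMIT flags, modelled standalone;
FLUSH_SYNC's numeric value here is a stand-in, not the NFS constant:

    #include <stdio.h>

    #define FLUSH_SYNC 1   /* stand-in value */

    struct wbc { int nonblocking; int for_background; };

    /* Background and non-blocking writeback send COMMIT as an async
     * RPC (flags == 0) instead of waiting on FLUSH_SYNC. */
    static int commit_flags(const struct wbc *wbc)
    {
        if (wbc->nonblocking || wbc->for_background)
            return 0;            /* fire-and-forget COMMIT */
        return FLUSH_SYNC;       /* wait for the COMMIT reply */
    }

    int main(void)
    {
        struct wbc bg = { .nonblocking = 0, .for_background = 1 };
        printf("background commit flags = %d\n", commit_flags(&bg));
        return 0;
    }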
* [PATCH 1/5] VFS: Ensure that writeback_single_inode() commits unstable writes [not found] ` <20100107045330.5986.55090.stgit-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org> ` (3 preceding siblings ...) 2010-01-07 4:53 ` [PATCH 5/5] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set Trond Myklebust @ 2010-01-07 4:53 ` Trond Myklebust 4 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 4:53 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org If the call to do_writepages() succeeded in starting writeback, we do not know whether or not we will need to COMMIT any unstable writes until after the write RPC calls are finished. Currently, we assume that at least one write RPC call will have finished, and set I_DIRTY_DATASYNC by the time do_writepages is done, so that write_inode() is triggered. In order to ensure reliable operation (i.e. ensure that a single call to writeback_single_inode() with WB_SYNC_ALL set suffices to ensure that pages are on disk) we need to first wait for filemap_fdatawait() to complete, then test for unstable pages. Since NFS is currently the only filesystem that has unstable pages, we can add a new inode state I_UNSTABLE_PAGES that NFS alone will set. When set, this will trigger a callback to a new address_space_operation to call the COMMIT. Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com> Acked-by: Jan Kara <jack@suse.cz> --- fs/fs-writeback.c | 31 ++++++++++++++++++++++++++++++- fs/nfs/file.c | 1 + fs/nfs/inode.c | 16 ---------------- fs/nfs/internal.h | 3 ++- fs/nfs/super.c | 2 -- fs/nfs/write.c | 33 ++++++++++++++++++++++++++++++++- include/linux/fs.h | 9 +++++++++ 7 files changed, 74 insertions(+), 21 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1a7c42c..3640769 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -389,6 +389,17 @@ static int write_inode(struct inode *inode, int sync) } /* + * Commit the NFS unstable pages. + */ +static int commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + if (mapping->a_ops && mapping->a_ops->commit_unstable_pages) + return mapping->a_ops->commit_unstable_pages(mapping, wbc); + return 0; +} + +/* * Wait for writeback on an inode to complete. */ static void inode_wait_for_writeback(struct inode *inode) @@ -475,6 +486,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + /* + * Special state for cleaning NFS unstable pages + */ + if (inode->i_state & I_UNSTABLE_PAGES) { + int err; + inode->i_state &= ~I_UNSTABLE_PAGES; + spin_unlock(&inode_lock); + err = commit_unstable_pages(mapping, wbc); + if (ret == 0) + ret = err; + spin_lock(&inode_lock); + } inode->i_state &= ~I_SYNC; if (!(inode->i_state & (I_FREEING | I_CLEAR))) { if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) { @@ -533,6 +556,12 @@ select_queue: inode->i_state |= I_DIRTY_PAGES; redirty_tail(inode); } + } else if (inode->i_state & I_UNSTABLE_PAGES) { + /* + * The inode has got yet more unstable pages to + * commit. Requeue... 
+ */ + redirty_tail(inode); } else if (atomic_read(&inode->i_count)) { /* * The inode is clean, inuse @@ -1051,7 +1080,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) spin_lock(&inode_lock); if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + const int was_dirty = inode->i_state & (I_DIRTY|I_UNSTABLE_PAGES); inode->i_state |= flags; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6b89132..67e50ac 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -526,6 +526,7 @@ const struct address_space_operations nfs_file_aops = { .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, .error_remove_page = generic_error_remove_page, + .commit_unstable_pages = nfs_commit_unstable_pages, }; /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index faa0918..8341709 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -97,22 +97,6 @@ u64 nfs_compat_user_ino64(u64 fileid) return ino; } -int nfs_write_inode(struct inode *inode, int sync) -{ - int ret; - - if (sync) { - ret = filemap_fdatawait(inode->i_mapping); - if (ret == 0) - ret = nfs_commit_inode(inode, FLUSH_SYNC); - } else - ret = nfs_commit_inode(inode, 0); - if (ret >= 0) - return 0; - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return ret; -} - void nfs_clear_inode(struct inode *inode) { /* diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 29e464d..7bb326f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -211,7 +211,6 @@ extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); -extern int nfs_write_inode(struct inode *,int); extern void nfs_clear_inode(struct inode *); #ifdef CONFIG_NFS_V4 extern void nfs4_clear_inode(struct inode *); @@ -253,6 +252,8 @@ extern int nfs4_path_walk(struct nfs_server *server, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ce907ef..805c1a0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -265,7 +265,6 @@ struct file_system_type nfs_xdev_fs_type = { static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -334,7 +333,6 @@ struct file_system_type nfs4_referral_fs_type = { static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, - .write_inode = nfs_write_inode, .statfs = nfs_statfs, .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d171696..910be28 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) spin_unlock(&inode->i_lock); inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + mark_inode_unstable_pages(inode); } static int @@ -1406,11 +1406,42 @@ int nfs_commit_inode(struct inode *inode, int how) } return res; } + +int 
nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int flags = FLUSH_SYNC; + int ret; + + /* Don't commit yet if this is a non-blocking flush and there are + * outstanding writes for this mapping. + */ + if (wbc->sync_mode != WB_SYNC_ALL && + radix_tree_tagged(&NFS_I(inode)->nfs_page_tree, + NFS_PAGE_TAG_LOCKED)) { + mark_inode_unstable_pages(inode); + return 0; + } + if (wbc->nonblocking) + flags = 0; + ret = nfs_commit_inode(inode, flags); + if (ret > 0) + ret = 0; + return ret; +} + #else static inline int nfs_commit_list(struct inode *inode, struct list_head *head, int how) { return 0; } + +int nfs_commit_unstable_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return 0; +} #endif long nfs_sync_mapping_wait(struct address_space *mapping, struct writeback_control *wbc, int how) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9147ca8..de594b3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -602,6 +602,8 @@ struct address_space_operations { int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); + int (*commit_unstable_pages)(struct address_space *, + struct writeback_control *); }; /* @@ -1635,6 +1637,8 @@ struct super_operations { #define I_CLEAR 64 #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) +#define __I_UNSTABLE_PAGES 8 +#define I_UNSTABLE_PAGES (1 << __I_UNSTABLE_PAGES) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) @@ -1649,6 +1653,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline void mark_inode_unstable_pages(struct inode *inode) +{ + __mark_inode_dirty(inode, I_UNSTABLE_PAGES); +} + /** * inc_nlink - directly increment an inode's link count * @inode: inode ^ permalink raw reply related [flat|nested] 96+ messages in thread
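The control flow this patch gives writeback_single_inode() is worth
seeing in isolation. A userspace sketch with stub functions standing in
for the kernel ones; only the ordering (write, optionally wait, then
commit) is taken from the patch:

    #include <stdbool.h>
    #include <stdio.h>

    struct inode { unsigned state; };
    #define I_UNSTABLE_PAGES 0x100   /* stand-in value */

    /* starting WRITE RPCs leaves pages unstable on an NFS server */
    static void do_writepages(struct inode *i)     { i->state |= I_UNSTABLE_PAGES; }
    static void filemap_fdatawait(struct inode *i) { (void)i; /* wait for WRITEs */ }
    static int  commit_unstable_pages(struct inode *i)
    {
        i->state &= ~I_UNSTABLE_PAGES;
        return 0;
    }

    static int writeback_single_inode(struct inode *inode, bool wait)
    {
        do_writepages(inode);
        if (wait)
            filemap_fdatawait(inode);          /* WB_SYNC_ALL only */
        if (inode->state & I_UNSTABLE_PAGES)   /* tested after the wait */
            return commit_unstable_pages(inode);
        return 0;
    }

    int main(void)
    {
        struct inode ino = { 0 };
        writeback_single_inode(&ino, true);
        printf("state after sync = 0x%x\n", ino.state);
        return 0;
    }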
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes [not found] ` <1262839082.2185.15.camel@localhost> 2010-01-07 4:48 ` Wu Fengguang @ 2010-01-07 14:56 ` Wu Fengguang 2010-01-07 15:10 ` Trond Myklebust 1 sibling, 1 reply; 96+ messages in thread From: Wu Fengguang @ 2010-01-07 14:56 UTC (permalink / raw) To: Myklebust, Trond Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > index d171696..910be28 100644 > > > --- a/fs/nfs/write.c > > > +++ b/fs/nfs/write.c > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > spin_unlock(&inode->i_lock); > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > + mark_inode_unstable_pages(inode); > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > this patch is applied. We won't ever need it. > > If the VM or VFS is doing it, then they ought to be fixed: there is no > reason to assume that all filesystems need to sync their inodes on > i_size changes. Sorry, one more question. It seems to me that you are replacing I_DIRTY_DATASYNC => write_inode() with I_UNSTABLE_PAGES => commit_unstable_pages() Is that change for the sake of clarity? Or to fix some problem? (This patch does fix some problems, but do they inherently require the above change?) Thanks, Fengguang ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-07 14:56 ` [PATCH 1/6] " Wu Fengguang @ 2010-01-07 15:10 ` Trond Myklebust 2010-01-08 1:17 ` Wu Fengguang 2010-01-08 9:25 ` Christoph Hellwig 0 siblings, 2 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-07 15:10 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Thu, 2010-01-07 at 22:56 +0800, Wu Fengguang wrote: > On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > > index d171696..910be28 100644 > > > > --- a/fs/nfs/write.c > > > > +++ b/fs/nfs/write.c > > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > > spin_unlock(&inode->i_lock); > > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > > + mark_inode_unstable_pages(inode); > > > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > > this patch is applied. We won't ever need it. > > > > If the VM or VFS is doing it, then they ought to be fixed: there is no > > reason to assume that all filesystems need to sync their inodes on > > i_size changes. > > Sorry, one more question. > > It seems to me that you are replacing > > I_DIRTY_DATASYNC => write_inode() > with > I_UNSTABLE_PAGES => commit_unstable_pages() > > Is that change for the sake of clarity? Or to fix some problem? > (This patch does fix some problems, but do they inherently require > the above change?) As I said previously, the write_inode() call is done _before_ you sync the dirty pages to the server, whereas commit_unstable_pages() wants to be done _after_ syncing. So the two are not the same, and we cannot replace commit_unstable_pages() with write_inode(). Replacing I_DIRTY_DATASYNC with I_UNSTABLE_PAGES is more for the sake of clarity. The difference between the two is that in the I_UNSTABLE_PAGES case, the inode itself isn't actually dirty; it just contains pages that are not guaranteed to be on permanent storage until we commit. Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
  2010-01-07 15:10 ` Trond Myklebust
@ 2010-01-08  1:17 ` Wu Fengguang
  2010-01-08  1:37 ` Trond Myklebust
  0 siblings, 1 reply; 96+ messages in thread
From: Wu Fengguang @ 2010-01-08 1:17 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 11:10:22PM +0800, Trond Myklebust wrote:
> On Thu, 2010-01-07 at 22:56 +0800, Wu Fengguang wrote:
> > On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote:
> > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> > > > > index d171696..910be28 100644
> > > > > --- a/fs/nfs/write.c
> > > > > +++ b/fs/nfs/write.c
> > > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req)
> > > > >  	spin_unlock(&inode->i_lock);
> > > > >  	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
> > > > >  	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
> > > > > -	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
> > > > > +	mark_inode_unstable_pages(inode);
> > > >
> > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size.
> > >
> > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after
> > > this patch is applied. We won't ever need it.
> > >
> > > If the VM or VFS is doing it, then they ought to be fixed: there is no
> > > reason to assume that all filesystems need to sync their inodes on
> > > i_size changes.
> >
> > Sorry, one more question.
> >
> > It seems to me that you are replacing
> >
> > 	I_DIRTY_DATASYNC => write_inode()
> > with
> > 	I_UNSTABLE_PAGES => commit_unstable_pages()
> >
> > Is that change for the sake of clarity? Or to fix some problem?
> > (This patch does fix some problems, but do they inherently require
> > the above change?)
>
> As I said previously, the write_inode() call is done _before_ you sync
> the dirty pages to the server, whereas commit_unstable_pages() wants to
> be done _after_ syncing. So the two are not the same, and we cannot
> replace commit_unstable_pages() with write_inode().

This is the ordering:

	0 do_writepages()
	1 if (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
	2 	write_inode()
	3 if (wait)
	4 	filemap_fdatawait()
	5 if (I_UNSTABLE_PAGES)
	6 	commit_unstable_pages()

The page is synced to the NFS server in line 0. The only difference is
that write_inode() is called before filemap_fdatawait(), while
commit_unstable_pages() is called after it.

Note that filemap_fdatawait() will only be called on WB_SYNC_ALL, so I
still cannot understand the difference..

> Replacing I_DIRTY_DATASYNC with I_UNSTABLE_PAGES is more for the sake of
> clarity. The difference between the two is that in the I_UNSTABLE_PAGES
> case, the inode itself isn't actually dirty; it just contains pages that
> are not guaranteed to be on permanent storage until we commit.

And I_UNSTABLE_PAGES is necessary for calling commit_unstable_pages() :)

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-08 1:17 ` Wu Fengguang @ 2010-01-08 1:37 ` Trond Myklebust 2010-01-08 1:53 ` Wu Fengguang 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-08 1:37 UTC (permalink / raw) To: Wu Fengguang Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, 2010-01-08 at 09:17 +0800, Wu Fengguang wrote: > On Thu, Jan 07, 2010 at 11:10:22PM +0800, Trond Myklebust wrote: > > On Thu, 2010-01-07 at 22:56 +0800, Wu Fengguang wrote: > > > On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > > > > > > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > > > > index d171696..910be28 100644 > > > > > > --- a/fs/nfs/write.c > > > > > > +++ b/fs/nfs/write.c > > > > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > > > > spin_unlock(&inode->i_lock); > > > > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > > > > + mark_inode_unstable_pages(inode); > > > > > > > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > > > > > > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > > > > this patch is applied. We won't ever need it. > > > > > > > > If the VM or VFS is doing it, then they ought to be fixed: there is no > > > > reason to assume that all filesystems need to sync their inodes on > > > > i_size changes. > > > > > > Sorry, one more question. > > > > > > It seems to me that you are replacing > > > > > > I_DIRTY_DATASYNC => write_inode() > > > with > > > I_UNSTABLE_PAGES => commit_unstable_pages() > > > > > > Is that change for the sake of clarity? Or to fix some problem? > > > (This patch does fix some problems, but do they inherently require > > > the above change?) > > > > As I said previously, the write_inode() call is done _before_ you sync > > the dirty pages to the server, whereas commit_unstable_pages() wants to > > be done _after_ syncing. So the two are not the same, and we cannot > > replace commit_unstable_pages() with write_inode(). > > This is the ordering: > > 0 do_writepages() > 1 if (I_DIRTY_SYNC | I_DIRTY_DATASYNC) > 2 write_inode() > 3 if (wait) > 4 filemap_fdatawait() > 5 if (I_UNSTABLE_PAGES) > 6 commit_unstable_pages() > > The page is synced to NFS server in line 0. > > The only difference is write_inode() is called before filemap_fdatawait(), > while commit_unstable_pages() is called after it. > > Note that filemap_fdatawait() will only be called on WB_SYNC_ALL, so I > still cannot understand the difference.. The difference is precisely that... In the case of WB_SYNC_ALL we want the call to filemap_fdatawait() to occur before we call commit_unstable_pages(), so that we know that all the in-flight write rpc calls are done before we ask that they be committed to stable storage. In the case of WB_SYNC_NONE, there is no wait, and so we are forced to play games with heuristics and/or add the force_commit_unstable flag because we don't wait for the dirty pages to be cleaned. I don't like this, but those are the semantics that we've defined for WB_SYNC_NONE. Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-08 1:37 ` Trond Myklebust @ 2010-01-08 1:53 ` Wu Fengguang 0 siblings, 0 replies; 96+ messages in thread From: Wu Fengguang @ 2010-01-08 1:53 UTC (permalink / raw) To: Trond Myklebust Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, Jan 08, 2010 at 09:37:31AM +0800, Trond Myklebust wrote: > On Fri, 2010-01-08 at 09:17 +0800, Wu Fengguang wrote: > > On Thu, Jan 07, 2010 at 11:10:22PM +0800, Trond Myklebust wrote: > > > On Thu, 2010-01-07 at 22:56 +0800, Wu Fengguang wrote: > > > > On Thu, Jan 07, 2010 at 12:38:02PM +0800, Myklebust, Trond wrote: > > > > > > > > > > > diff --git a/fs/nfs/write.c b/fs/nfs/write.c > > > > > > > index d171696..910be28 100644 > > > > > > > --- a/fs/nfs/write.c > > > > > > > +++ b/fs/nfs/write.c > > > > > > > @@ -441,7 +441,7 @@ nfs_mark_request_commit(struct nfs_page *req) > > > > > > > spin_unlock(&inode->i_lock); > > > > > > > inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > > > > > > inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); > > > > > > > - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); > > > > > > > + mark_inode_unstable_pages(inode); > > > > > > > > > > > > Then we shall mark I_DIRTY_DATASYNC on other places that extend i_size. > > > > > > > > > > Why? The NFS client itself shouldn't ever set I_DIRTY_DATASYNC after > > > > > this patch is applied. We won't ever need it. > > > > > > > > > > If the VM or VFS is doing it, then they ought to be fixed: there is no > > > > > reason to assume that all filesystems need to sync their inodes on > > > > > i_size changes. > > > > > > > > Sorry, one more question. > > > > > > > > It seems to me that you are replacing > > > > > > > > I_DIRTY_DATASYNC => write_inode() > > > > with > > > > I_UNSTABLE_PAGES => commit_unstable_pages() > > > > > > > > Is that change for the sake of clarity? Or to fix some problem? > > > > (This patch does fix some problems, but do they inherently require > > > > the above change?) > > > > > > As I said previously, the write_inode() call is done _before_ you sync > > > the dirty pages to the server, whereas commit_unstable_pages() wants to > > > be done _after_ syncing. So the two are not the same, and we cannot > > > replace commit_unstable_pages() with write_inode(). > > > > This is the ordering: > > > > 0 do_writepages() > > 1 if (I_DIRTY_SYNC | I_DIRTY_DATASYNC) > > 2 write_inode() > > 3 if (wait) > > 4 filemap_fdatawait() > > 5 if (I_UNSTABLE_PAGES) > > 6 commit_unstable_pages() > > > > The page is synced to NFS server in line 0. > > > > The only difference is write_inode() is called before filemap_fdatawait(), > > while commit_unstable_pages() is called after it. > > > > Note that filemap_fdatawait() will only be called on WB_SYNC_ALL, so I > > still cannot understand the difference.. > > The difference is precisely that... Thanks, I got it. > In the case of WB_SYNC_ALL we want the call to filemap_fdatawait() to > occur before we call commit_unstable_pages(), so that we know that all > the in-flight write rpc calls are done before we ask that they be > committed to stable storage. That's good order for WB_SYNC_ALL. 
However this is optimizing a minor case, and what I care about is
WB_SYNC_NONE :)

> In the case of WB_SYNC_NONE, there is no wait, and so we are forced to
> play games with heuristics and/or add the force_commit_unstable flag
> because we don't wait for the dirty pages to be cleaned. I don't like
> this, but those are the semantics that we've defined for WB_SYNC_NONE.

For WB_SYNC_NONE we will now also wait for WRITE completion, with the
combination of the NFS_PAGE_TAG_LOCKED-based bail-out and redirty_tail().
This is retry based, so less elegant.

But that's not the whole story. The I_UNSTABLE_PAGES +
commit_unstable_pages() scheme seems elegant for WB_SYNC_ALL, however it
may break the pipeline for big files in a perfect loop:

	loop {
		WRITE 4MB
		COMMIT 4MB
	}

while the retry based WB_SYNC_ALL will keep backing off COMMITs because
do_writepages() keeps submitting new WRITEs. So its loop would be

	loop {
		WRITE 4MB
		<skip COMMIT>
		WRITE 4MB
		<skip COMMIT>
		WRITE 4MB
		<skip COMMIT>
		WRITE 4MB
		<skip COMMIT>
		...
		<redirty_tail timeout>
		COMMIT 400MB
	}

That can be improved by lifting the writeback chunk size from 4MB to
>=128MB.

Thanks,
Fengguang

^ permalink raw reply	[flat|nested] 96+ messages in thread
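The two COMMIT patterns contrasted above, as a toy model; 4MB and 400MB
are the example figures from the message, and the "batched" arm simply
assumes every per-chunk COMMIT gets skipped until the redirty_tail
timeout:

    #include <stdio.h>

    int main(void)
    {
        const int total_mb = 400, chunk_mb = 4;

        /* pipelined: COMMIT after every writeback chunk */
        int commits = 0;
        for (int done = 0; done < total_mb; done += chunk_mb)
            commits++;
        printf("pipelined : %d COMMITs of %d MB each\n", commits, chunk_mb);

        /* retry-based: COMMITs skipped while WRITEs keep arriving */
        printf("batched   : 1 COMMIT of %d MB after redirty_tail timeout\n",
               total_mb);
        return 0;
    }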
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
  2010-01-07 15:10 ` Trond Myklebust
  2010-01-08  1:17 ` Wu Fengguang
@ 2010-01-08  9:25 ` Christoph Hellwig
  2010-01-08 13:46 ` Trond Myklebust
  1 sibling, 1 reply; 96+ messages in thread
From: Christoph Hellwig @ 2010-01-08 9:25 UTC (permalink / raw)
To: Trond Myklebust
Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago,
    linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach,
    Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 10:10:22AM -0500, Trond Myklebust wrote:
> As I said previously, the write_inode() call is done _before_ you sync
> the dirty pages to the server, whereas commit_unstable_pages() wants to
> be done _after_ syncing. So the two are not the same, and we cannot
> replace commit_unstable_pages() with write_inode().

But that's more an accident of how this code was written. The right
order needs to be to write the pages first, then call write_inode. Most
modern filesystems have to work around this in their write_inode method
by waiting for the pages themselves. I already fixed the same ordering
issue in fsync, and the writeback code is next on the agenda.

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-08 9:25 ` Christoph Hellwig @ 2010-01-08 13:46 ` Trond Myklebust 2010-01-08 13:54 ` Christoph Hellwig 0 siblings, 1 reply; 96+ messages in thread From: Trond Myklebust @ 2010-01-08 13:46 UTC (permalink / raw) To: Christoph Hellwig Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, 2010-01-08 at 04:25 -0500, Christoph Hellwig wrote: > On Thu, Jan 07, 2010 at 10:10:22AM -0500, Trond Myklebust wrote: > > As I said previously, the write_inode() call is done _before_ you sync > > the dirty pages to the server, whereas commit_unstable_pages() wants to > > be done _after_ syncing. So the two are not the same, and we cannot > > replace commit_unstable_pages() with write_inode(). > > But that's more an accident of how this code was written. The right > order nees to be to write the pages first, then call write_inode. Most > modern filesystems have to work around this in their write_inode method > by waiting for the pages themselves. I already fixed the same ordering > issue in fsync, and the writeback code is next on the agenda. > Could we in that case replace write_inode() with something that takes a struct writeback_control? It is very useful to have full information about the write range and flags as it allows us to tweak the COMMIT RPC call. Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes
  2010-01-08 13:46 ` Trond Myklebust
@ 2010-01-08 13:54 ` Christoph Hellwig
  2010-01-08 14:15 ` Trond Myklebust
  0 siblings, 1 reply; 96+ messages in thread
From: Christoph Hellwig @ 2010-01-08 13:54 UTC (permalink / raw)
To: Trond Myklebust
Cc: Christoph Hellwig, Wu Fengguang, Peter Zijlstra, Jan Kara,
    Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach,
    Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Fri, Jan 08, 2010 at 08:46:46AM -0500, Trond Myklebust wrote:
> Could we in that case replace write_inode() with something that takes a
> struct writeback_control? It is very useful to have full information
> about the write range and flags as it allows us to tweak the COMMIT RPC
> call.

At this point I do not plan to change the write_inode interface. But
changing the ->write_inode operation to take a writeback control instead
of the sync flag should be a pretty easy change if you want to do it.

^ permalink raw reply	[flat|nested] 96+ messages in thread
* Re: [PATCH 1/6] VFS: Ensure that writeback_single_inode() commits unstable writes 2010-01-08 13:54 ` Christoph Hellwig @ 2010-01-08 14:15 ` Trond Myklebust 0 siblings, 0 replies; 96+ messages in thread From: Trond Myklebust @ 2010-01-08 14:15 UTC (permalink / raw) To: Christoph Hellwig Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org On Fri, 2010-01-08 at 08:54 -0500, Christoph Hellwig wrote: > On Fri, Jan 08, 2010 at 08:46:46AM -0500, Trond Myklebust wrote: > > Could we in that case replace write_inode() with something that takes a > > struct writeback_control? It is very useful to have full information > > about the write range and flags as it allows us to tweak the COMMIT RPC > > call. > > At this point I do not plan to change the write_inode interface. But > changing the ->write_inode operation to take a writeback control instead > of the sync flag should be a prety easy change if you want to do it. > OK. Please could you let me know when you're done with your changes so that I can adapt this patch? Cheers Trond ^ permalink raw reply [flat|nested] 96+ messages in thread
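What the floated interface change would look like from the filesystem's
side: ->write_inode() receiving a writeback_control rather than a bare
sync flag, so a COMMIT could be shaped by the write range. A sketch of
the proposal as discussed, not a merged signature; the field names follow
the wbc fields quoted elsewhere in the thread:

    #include <stdio.h>

    enum { WB_SYNC_NONE, WB_SYNC_ALL };

    struct writeback_control {
        int       sync_mode;
        long long range_start;
        long long range_end;
    };

    /* stand-in for what an NFS ->write_inode(inode, wbc) could do */
    static void nfs_write_inode_wbc(const struct writeback_control *wbc)
    {
        printf("COMMIT range [%lld, %lld], %s\n",
               wbc->range_start, wbc->range_end,
               wbc->sync_mode == WB_SYNC_ALL ? "waiting" : "async");
    }

    int main(void)
    {
        struct writeback_control wbc = {
            .sync_mode = WB_SYNC_ALL, .range_start = 0, .range_end = 4 << 20,
        };
        nfs_write_inode_wbc(&wbc);
        return 0;
    }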
* [PATCH 6/6] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
@ 2010-01-06 20:51 ` Trond Myklebust
6 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 978de7f..d6d8048 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1423,7 +1423,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
 		mark_inode_unstable_pages(inode);
 		return 0;
 	}
-	if (wbc->nonblocking)
+	if (wbc->nonblocking || wbc->for_background)
 		flags = 0;
 	ret = nfs_commit_inode(inode, flags);
 	if (ret > 0)
* Re: [PATCH 6/6] NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
@ 2010-01-07  2:32 ` Wu Fengguang
0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 2:32 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote:
> @@ -1423,7 +1423,7 @@ int nfs_commit_unstable_pages(struct address_space *mapping,
>  		mark_inode_unstable_pages(inode);
>  		return 0;
>  	}
> -	if (wbc->nonblocking)
> +	if (wbc->nonblocking || wbc->for_background)
>  		flags = 0;
>  	ret = nfs_commit_inode(inode, flags);
>  	if (ret > 0)

Acked-by: Wu Fengguang <fengguang.wu@intel.com>
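To make the one-line change above concrete: a simplified sketch of the
control flow it modifies. Only the nonblocking/for_background condition is
from the actual patch; the rest of the function body here is an
illustration that assumes FLUSH_SYNC is the default commit flag, so that
clearing it turns the COMMIT into a fire-and-forget RPC call.

#include <linux/fs.h>
#include <linux/writeback.h>
#include <linux/nfs_fs.h>

/* Illustrative sketch, not the real nfs_commit_unstable_pages() body. */
static int example_commit_unstable_pages(struct address_space *mapping,
					 struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	int flags = FLUSH_SYNC;	/* default: wait for the COMMIT reply */

	if (wbc->nonblocking || wbc->for_background)
		flags = 0;	/* asynchronous COMMIT for background writeback */
	return nfs_commit_inode(inode, flags);
}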
* [PATCH 4/6] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
@ 2010-01-06 20:51 ` Trond Myklebust
2010-01-07 1:56 ` Wu Fengguang
6 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

Speeds up the accounting in balance_dirty_pages() for non-nfs devices.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/nfs/client.c             |    1 +
 include/linux/backing-dev.h |    6 ++++++
 mm/page-writeback.c         |   16 +++++++++++-----
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ee77713..d0b060a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -890,6 +890,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
 	server->backing_dev_info.name = "nfs";
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+	server->backing_dev_info.capabilities |= BDI_CAP_ACCT_UNSTABLE;
 
 	if (server->wsize > max_rpc_payload)
 		server->wsize = max_rpc_payload;

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 42c3e2a..8b45166 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -232,6 +232,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_EXEC_MAP	0x00000040
 #define BDI_CAP_NO_ACCT_WB	0x00000080
 #define BDI_CAP_SWAP_BACKED	0x00000100
+#define BDI_CAP_ACCT_UNSTABLE	0x00000200
 
 #define BDI_CAP_VMFLAGS \
 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
@@ -311,6 +312,11 @@ static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
 	return bdi == &default_backing_dev_info;
 }
 
+static inline bool bdi_cap_account_unstable(struct backing_dev_info *bdi)
+{
+	return bdi->capabilities & BDI_CAP_ACCT_UNSTABLE;
+}
+
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index aa26b0f..d90a0db 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -273,8 +273,9 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
 		avail_dirty = 0;
 
 	avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
-		bdi_stat(bdi, BDI_UNSTABLE) +
 		bdi_stat(bdi, BDI_WRITEBACK);
+	if (bdi_cap_account_unstable(bdi))
+		avail_dirty += bdi_stat(bdi, BDI_UNSTABLE);
 
 	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
 }
@@ -512,8 +513,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 					nr_unstable_nfs;
 		nr_writeback = global_page_state(NR_WRITEBACK);
 
-		bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-				     bdi_stat(bdi, BDI_UNSTABLE);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
+		if (bdi_cap_account_unstable(bdi))
+			bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
 		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 
 		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -563,11 +565,15 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * deltas.
 		 */
 		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
-			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) +
-					     bdi_stat_sum(bdi, BDI_UNSTABLE);
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY);
+			if (bdi_cap_account_unstable(bdi))
+				bdi_nr_reclaimable += bdi_stat_sum(bdi, BDI_UNSTABLE);
 			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
 		} else if (bdi_nr_reclaimable) {
-			bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
-					     bdi_stat(bdi, BDI_UNSTABLE);
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY);
+			if (bdi_cap_account_unstable(bdi))
+				bdi_nr_reclaimable += bdi_stat(bdi, BDI_UNSTABLE);
 			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
* Re: [PATCH 4/6] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
2010-01-06 20:51 ` [PATCH 4/6] VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices Trond Myklebust
@ 2010-01-07 1:56 ` Wu Fengguang
0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 1:56 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Thu, Jan 07, 2010 at 04:51:10AM +0800, Trond Myklebust wrote:
> 	avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
> -		bdi_stat(bdi, BDI_UNSTABLE) +
> 		bdi_stat(bdi, BDI_WRITEBACK);
> +	if (bdi_cap_account_unstable(bdi))
> +		avail_dirty += bdi_stat(bdi, BDI_UNSTABLE);

It seems that not changing the bdi_stat()s makes more readable code,
otherwise looks OK to me.

Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
* [PATCH 3/6] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
@ 2010-01-06 20:51 ` Trond Myklebust
6 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 20:51 UTC (permalink / raw)
To: Wu Fengguang
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

From: Peter Zijlstra <peterz@infradead.org>

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/nfs/write.c              |    6 +++---
 include/linux/backing-dev.h |    3 ++-
 mm/backing-dev.c            |    6 ++++--
 mm/filemap.c                |    2 +-
 mm/page-writeback.c         |   16 ++++++++++------
 mm/truncate.c               |    2 +-
 6 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ee3daf4..978de7f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -440,7 +440,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 			NFS_PAGE_TAG_COMMIT);
 	spin_unlock(&inode->i_lock);
 	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+	inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_UNSTABLE);
 	mark_inode_unstable_pages(inode);
 }
 
@@ -451,7 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
 
 	if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
 		dec_zone_page_state(page, NR_UNSTABLE_NFS);
-		dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+		dec_bdi_stat(page->mapping->backing_dev_info, BDI_UNSTABLE);
 		return 1;
 	}
 	return 0;
@@ -1322,7 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
 		nfs_mark_request_commit(req);
 		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
-				BDI_RECLAIMABLE);
+				BDI_UNSTABLE);
 		nfs_clear_page_tag_locked(req);
 	}
 	return -ENOMEM;

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index fcbc26a..42c3e2a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -36,7 +36,8 @@ enum bdi_state {
 typedef int (congested_fn)(void *, int);
 
 enum bdi_stat_item {
-	BDI_RECLAIMABLE,
+	BDI_DIRTY,
+	BDI_UNSTABLE,
 	BDI_WRITEBACK,
 	NR_BDI_STAT_ITEMS
 };

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca03..88f3655 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -88,7 +88,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	seq_printf(m,
 		   "BdiWriteback:     %8lu kB\n"
-		   "BdiReclaimable:   %8lu kB\n"
+		   "BdiDirty:         %8lu kB\n"
+		   "BdiUnstable:      %8lu kB\n"
 		   "BdiDirtyThresh:   %8lu kB\n"
 		   "DirtyThresh:      %8lu kB\n"
 		   "BackgroundThresh: %8lu kB\n"
@@ -102,7 +103,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "wb_list:          %8u\n"
 		   "wb_cnt:           %8u\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
+		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTY)),
+		   (unsigned long) K(bdi_stat(bdi, BDI_UNSTABLE)),
 		   K(bdi_thresh), K(dirty_thresh), K(background_thresh),
 		   nr_wb, nr_dirty, nr_io, nr_more_io,
 		   !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,

diff --git a/mm/filemap.c b/mm/filemap.c
index 96ac6b0..458387d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -136,7 +136,7 @@ void __remove_from_page_cache(struct page *page)
 	 */
 	if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
 		dec_zone_page_state(page, NR_FILE_DIRTY);
-		dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+		dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
 	}
 }

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index ede5356..aa26b0f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -272,7 +272,8 @@ static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
 	else
 		avail_dirty = 0;
 
-	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+	avail_dirty += bdi_stat(bdi, BDI_DIRTY) +
+		bdi_stat(bdi, BDI_UNSTABLE) +
 		bdi_stat(bdi, BDI_WRITEBACK);
 
 	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
@@ -511,7 +512,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 					nr_unstable_nfs;
 		nr_writeback = global_page_state(NR_WRITEBACK);
 
-		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+				     bdi_stat(bdi, BDI_UNSTABLE);
 		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 
 		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
@@ -561,10 +563,12 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * deltas.
 		 */
 		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
-			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_DIRTY) +
+					     bdi_stat_sum(bdi, BDI_UNSTABLE);
 			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
 		} else if (bdi_nr_reclaimable) {
-			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_DIRTY) +
+					     bdi_stat(bdi, BDI_UNSTABLE);
 			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
@@ -1086,7 +1090,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 {
 	if (mapping_cap_account_dirty(mapping)) {
 		__inc_zone_page_state(page, NR_FILE_DIRTY);
-		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTY);
 		task_dirty_inc(current);
 		task_io_account_write(PAGE_CACHE_SIZE);
 	}
@@ -1262,7 +1266,7 @@ int clear_page_dirty_for_io(struct page *page)
 		if (TestClearPageDirty(page)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
 			dec_bdi_stat(mapping->backing_dev_info,
-					BDI_RECLAIMABLE);
+					BDI_DIRTY);
 			return 1;
 		}
 		return 0;

diff --git a/mm/truncate.c b/mm/truncate.c
index 342deee..b0ce8fb 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -75,7 +75,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		dec_zone_page_state(page, NR_FILE_DIRTY);
 		dec_bdi_stat(mapping->backing_dev_info,
-				BDI_RECLAIMABLE);
+				BDI_DIRTY);
 		if (account_size)
 			task_io_account_cancelled_write(account_size);
 	}
* Re: [PATCH 3/6] VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
@ 2010-01-07  1:48 ` Wu Fengguang
0 siblings, 0 replies; 96+ messages in thread
From: Wu Fengguang @ 2010-01-07 1:48 UTC (permalink / raw)
To: Trond Myklebust
Cc: Peter Zijlstra, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

> 	seq_printf(m,
> 		   "BdiWriteback:     %8lu kB\n"
> -		   "BdiReclaimable:   %8lu kB\n"
> +		   "BdiDirty:         %8lu kB\n"
> +		   "BdiUnstable:      %8lu kB\n"
> 		   "BdiDirtyThresh:   %8lu kB\n"
> 		   "DirtyThresh:      %8lu kB\n"
> 		   "BackgroundThresh: %8lu kB\n"

This also removes one synthetic concept ;) Thanks!

Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
* Re: [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads
@ 2010-01-06 21:44 ` Jan Kara
2010-01-06 22:03 ` Trond Myklebust
6 siblings, 1 reply; 96+ messages in thread
From: Jan Kara @ 2010-01-06 21:44 UTC (permalink / raw)
To: Trond Myklebust
Cc: Wu Fengguang, Peter Zijlstra, Jan Kara, Steve Rago,
    linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach,
    Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Wed 06-01-10 15:51:10, Trond Myklebust wrote:
> Peter Zijlstra (1):
>       VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
>
> Trond Myklebust (5):
>       NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
>       VM: Use per-bdi unstable accounting to improve use of wbc->force_commit
>       VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
>       VM/NFS: The VM must tell the filesystem when to free reclaimable pages
>       VFS: Ensure that writeback_single_inode() commits unstable writes

I think the series would be nicer if you made Peter's patch #2 and joined
your patches
  "VM/NFS: The VM must tell the filesystem when to free reclaimable pages"
and
  "VM: Use per-bdi unstable accounting to improve use of wbc->force_commit"

								Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
* Re: [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads
2010-01-06 21:44 ` [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads Jan Kara
@ 2010-01-06 22:03 ` Trond Myklebust
0 siblings, 0 replies; 96+ messages in thread
From: Trond Myklebust @ 2010-01-06 22:03 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Peter Zijlstra, Steve Rago,
    linux-nfs@vger.kernel.org, jens.axboe, Peter Staubach,
    Arjan van de Ven, Ingo Molnar, linux-fsdevel@vger.kernel.org

On Wed, 2010-01-06 at 22:44 +0100, Jan Kara wrote:
> On Wed 06-01-10 15:51:10, Trond Myklebust wrote:
> > Peter Zijlstra (1):
> >       VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
> >
> > Trond Myklebust (5):
> >       NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
> >       VM: Use per-bdi unstable accounting to improve use of wbc->force_commit
> >       VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
> >       VM/NFS: The VM must tell the filesystem when to free reclaimable pages
> >       VFS: Ensure that writeback_single_inode() commits unstable writes
> I think the series would be nicer if you made Peter's patch #2 and joined
> your patches
>   "VM/NFS: The VM must tell the filesystem when to free reclaimable pages"
> and
>   "VM: Use per-bdi unstable accounting to improve use of wbc->force_commit"
>
> Honza

Indeed, and if everyone is OK with Peter's patch, I'll do that.

Cheers
Trond
* Re: [PATCH 0/6] Re: [PATCH] improve the performance of large sequential write NFS workloads
2010-01-06 20:51 ` [PATCH 0/6] " Trond Myklebust
@ 2010-01-07  8:16 ` Peter Zijlstra
1 sibling, 0 replies; 96+ messages in thread
From: Peter Zijlstra @ 2010-01-07 8:16 UTC (permalink / raw)
To: Trond Myklebust
Cc: Wu Fengguang, Jan Kara, Steve Rago, linux-nfs@vger.kernel.org,
    jens.axboe, Peter Staubach, Arjan van de Ven, Ingo Molnar,
    linux-fsdevel@vger.kernel.org

On Wed, 2010-01-06 at 15:51 -0500, Trond Myklebust wrote:
> OK, here is the full series so far. I'm resending because I had to fix
> up a couple of BDI_UNSTABLE typos in Peter's patch...

Looks good and thanks for fixing things up!

Acked-by: Peter Zijlstra <peterz@infradead.org>

> Peter Zijlstra (1):
>       VM: Split out the accounting of unstable writes from BDI_RECLAIMABLE
>
> Trond Myklebust (5):
>       NFS: Run COMMIT as an asynchronous RPC call when wbc->for_background is set
>       VM: Use per-bdi unstable accounting to improve use of wbc->force_commit
>       VM: Don't call bdi_stat(BDI_UNSTABLE) on non-nfs backing-devices
>       VM/NFS: The VM must tell the filesystem when to free reclaimable pages
>       VFS: Ensure that writeback_single_inode() commits unstable writes
>
>  fs/fs-writeback.c           |   31 ++++++++++++++++++++++++++++++-
>  fs/nfs/client.c             |    1 +
>  fs/nfs/file.c               |    1 +
>  fs/nfs/inode.c              |   16 ----------------
>  fs/nfs/internal.h           |    3 ++-
>  fs/nfs/super.c              |    2 --
>  fs/nfs/write.c              |   39 +++++++++++++++++++++++++++++++++++----
>  include/linux/backing-dev.h |    9 ++++++++-
>  include/linux/fs.h          |    9 +++++++++
>  include/linux/writeback.h   |    5 +++++
>  mm/backing-dev.c            |    6 ++++--
>  mm/filemap.c                |    2 +-
>  mm/page-writeback.c         |   30 ++++++++++++++++++++++++------
>  mm/truncate.c               |    2 +-
>  14 files changed, 121 insertions(+), 35 deletions(-)
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-19 14:25 ` Steve Rago
2009-12-22 1:59 ` Wu Fengguang
@ 2009-12-22 12:25 ` Jan Kara
2009-12-22 16:20 ` Steve Rago
1 sibling, 2 replies; 96+ messages in thread
From: Jan Kara @ 2009-12-22 12:25 UTC (permalink / raw)
To: Steve Rago
Cc: Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

Hi,

> On Sat, 2009-12-19 at 20:20 +0800, Wu Fengguang wrote:
> > Hi Steve,
> >
> > // I should really read the NFS code, but maybe you can help us better
> > // understand the problem :)
> >
> > On Thu, Dec 17, 2009 at 04:17:57PM +0800, Peter Zijlstra wrote:
> > > On Wed, 2009-12-16 at 21:03 -0500, Steve Rago wrote:
> > > > Eager Writeback for NFS Clients
> > > > -------------------------------
> > > > Prevent applications that write large sequential streams of data (like backup, for example)
> > > > from entering into a memory pressure state, which degrades performance by falling back to
> > > > synchronous operations (both synchronous writes and additional commits).
> >
> > What exactly is the "memory pressure state" condition? What's the
> > code to do the "synchronous writes and additional commits" and maybe
> > how they are triggered?
>
> Memory pressure occurs when most of the client pages have been dirtied
> by an application (think backup server writing multi-gigabyte files that
> exceed the size of main memory). The system works harder to be able to
> free dirty pages so that they can be reused. For a local file system,
> this means writing the pages to disk. For NFS, however, the writes
> leave the pages in an "unstable" state until the server responds to a
> commit request. Generally speaking, commit processing is far more
> expensive than write processing on the server; both are done with the
> inode locked, but since the commit takes so long, all writes are
> blocked, which stalls the pipeline.

  I'm not sure I understand the problem you are trying to solve. So we
generate dirty pages on an NFS filesystem. At some point we reach the
dirty_threshold, so the writing process is throttled and forced to do some
writes. For NFS it takes a longer time before we can really free the pages
because the server has to acknowledge the write (BTW e.g. for optical
media like DVD-RW it also takes a long time to really write the data). Now
the problem you are trying to solve is that the system basically gets out
of free memory (so that it has to start doing synchronous writeback from
the allocator) because the writer manages to dirty the remaining free
memory before the submitted writes are acknowledged by the server?
  If that is so, then it might make sense to introduce also a per-bdi
equivalent of dirty_background_ratio so that we can start background
writeback of dirty data at different times for different backing devices -
that would make sense to me in general, not only for NFS.
  Another complementary piece to my above proposal would be something like
Fengguang's patches that actually don't let the process dirty too much
memory, by throttling it in balance_dirty_pages until the number of
unstable pages gets lower.

> > > > This is accomplished by preventing the client application from
> > > > dirtying pages faster than they can be written to the server:
> > > > clients write pages eagerly instead of lazily.
> >
> > We already have the balance_dirty_pages() based global throttling.
> > So what makes the performance difference in your proposed "per-inode" throttling?
> > balance_dirty_pages() does have much larger threshold than yours.
>
> I originally spent several months playing with the balance_dirty_pages
> algorithm. The main drawback is that it affects more than the inodes
> that the caller is writing and that the control of what to do is too
  Can you be more specific here please?

> coarse. My final changes (which worked well for 1Gb connections) were
> more heuristic than the changes in the patch -- I basically had to come
> up with alternate ways to write pages without generating commits on
> inodes. Doing this was distasteful, as I was adjusting generic system
> behavior for an NFS-only problem. Then a colleague found Peter
> Staubach's patch, which worked just as well in less code, and isolated
> the change to the NFS component, which is where it belongs.
  As I said above, the problem of slow writes happens also in other cases.
What's specific to NFS is that pages aren't in writeback state for long
but instead stay in the unstable state. But balance_dirty_pages should
handle that, and if it does not, it should be fixed.

> > > > The eager writeback is controlled by a sysctl: fs.nfs.nfs_max_woutstanding set to 0 disables
> > > > the feature. Otherwise it contains the maximum number of outstanding NFS writes that can be
> > > > in flight for a given file. This is used to block the application from dirtying more pages
> > > > until the writes are complete.
> >
> > What if we do heuristic write-behind for sequential NFS writes?
>
> Part of the patch does implement a heuristic write-behind. See where
> nfs_wb_eager() is called.
  I believe that if we had per-bdi dirty_background_ratio and set it low
for NFS's bdi, then the write-behind logic would not be needed
(essentially the flusher thread should submit the writes to the server
early).

								Honza
--
Jan Kara <jack@suse.cz>
SuSE CR Labs
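The throttling half of Jan's proposal can be pictured roughly as follows.
This is an illustration only, not code from the thread: it assumes the
per-bdi BDI_UNSTABLE counter that appears later in the series, and uses
congestion_wait() as the simplest way to wait for writeback progress.

#include <linux/backing-dev.h>
#include <linux/blkdev.h>

/*
 * Hypothetical throttle: stall a dirtying task until the server has
 * acknowledged (committed) enough unstable pages that the bdi is back
 * under its threshold.
 */
static void example_wait_for_unstable(struct backing_dev_info *bdi,
				      unsigned long bdi_thresh)
{
	while (bdi_stat(bdi, BDI_UNSTABLE) +
	       bdi_stat(bdi, BDI_WRITEBACK) > bdi_thresh)
		congestion_wait(BLK_RW_ASYNC, HZ / 10);
}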
* Re: [PATCH] improve the performance of large sequential write NFS workloads
@ 2009-12-22 12:38 ` Peter Zijlstra
2009-12-22 12:55 ` Jan Kara
0 siblings, 1 reply; 96+ messages in thread
From: Peter Zijlstra @ 2009-12-22 12:38 UTC (permalink / raw)
To: Jan Kara
Cc: Steve Rago, Wu Fengguang, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

On Tue, 2009-12-22 at 13:25 +0100, Jan Kara wrote:
> I believe that if we had per-bdi dirty_background_ratio and set it low
> for NFS's bdi,

There are two things there, I think:
 1) bdi_background
 2) different background ratios per bdi

1) could be 'trivially' done much like we do bdi_dirty in
get_dirty_limits().

2) I'm not at all convinced we want to go there.
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 12:38 ` Peter Zijlstra
@ 2009-12-22 12:55 ` Jan Kara
0 siblings, 0 replies; 96+ messages in thread
From: Jan Kara @ 2009-12-22 12:55 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Jan Kara, Steve Rago, Wu Fengguang, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

> On Tue, 2009-12-22 at 13:25 +0100, Jan Kara wrote:
> > I believe that if we had per-bdi dirty_background_ratio and set it low
> > for NFS's bdi,
>
> There are two things there, I think:
>  1) bdi_background
>  2) different background ratios per bdi
  Right.

> 1) could be 'trivially' done much like we do bdi_dirty in
> get_dirty_limits().
>
> 2) I'm not at all convinced we want to go there.
  Yeah. Doing 1) and playing with bdi->min_ratio and bdi->max_ratio
should be the first thing we should try...

								Honza
--
Jan Kara <jack@suse.cz>
SuSE CR Labs
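A sketch of what Peter's point 1) might look like: scale the global
background threshold by the bdi's writeout fraction, the same way
get_dirty_limits() scales the per-bdi dirty limit. bdi_writeout_fraction()
and do_div() exist in the 2.6.32-era tree (the former is static in
mm/page-writeback.c); the wrapping function itself is hypothetical, not a
posted patch.

#include <linux/backing-dev.h>
#include <asm/div64.h>

/*
 * Hypothetical per-bdi background threshold: take the global
 * background_thresh and scale it by this bdi's share of recent
 * writeout activity, as get_dirty_limits() does for the dirty limit.
 */
static unsigned long example_bdi_background(struct backing_dev_info *bdi,
					    unsigned long background_thresh)
{
	u64 bdi_bg = background_thresh;
	long numerator, denominator;

	bdi_writeout_fraction(bdi, &numerator, &denominator);
	bdi_bg *= numerator;
	do_div(bdi_bg, denominator);

	return (unsigned long)bdi_bg;
}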
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 12:25 ` Jan Kara
@ 2009-12-22 16:20 ` Steve Rago
2009-12-23 18:39 ` Jan Kara
1 sibling, 1 reply; 96+ messages in thread
From: Steve Rago @ 2009-12-22 16:20 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

On Tue, 2009-12-22 at 13:25 +0100, Jan Kara wrote:
> > I originally spent several months playing with the balance_dirty_pages
> > algorithm. The main drawback is that it affects more than the inodes
> > that the caller is writing and that the control of what to do is too
> Can you be more specific here please?

Sure; balance_dirty_pages() will schedule writeback by the flusher
thread once the number of dirty pages exceeds dirty_background_ratio.
The flusher thread calls writeback_inodes_wb() to flush all dirty inodes
associated with the bdi. Similarly, the process dirtying the pages will
call writeback_inodes_wbc() when its bdi threshold has been exceeded.
The first problem is that these functions process all dirty inodes with
the same backing device, which can lead to excess (duplicate) flushing
of the same inode. Second, there is no distinction between pages that
need to be committed and pages that have commits pending in
NR_UNSTABLE_NFS/BDI_RECLAIMABLE (a page that has a commit pending won't
be cleaned any faster by sending more commits). This tends to overstate
the amount of memory that can be cleaned, leading to additional commit
requests. Third, these functions generate a commit for each set of
writes they do, which might not be appropriate. For background writing,
you'd like to delay the commit as long as possible.

[snip]

> > Part of the patch does implement a heuristic write-behind. See where
> > nfs_wb_eager() is called.
> I believe that if we had per-bdi dirty_background_ratio and set it low
> for NFS's bdi, then the write-behind logic would not be needed
> (essentially the flusher thread should submit the writes to the server
> early).
>
> Honza

Maybe so, but you still need something to prevent the process that is
dirtying pages from continuing, because a process can always write to
memory faster than writing to disk/network, so the flusher won't be able
to keep up.

Steve
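Steve's second problem (commits needed vs. commits already pending) could
be tracked with a per-inode flag, along these lines. Purely illustrative:
the NFS_INO_COMMIT_PENDING bit and the helper are invented for this
sketch and appear in no posted patch.

#include <linux/nfs_fs.h>

/*
 * Hypothetical guard against redundant COMMITs: a page whose commit
 * is already in flight will not be cleaned any faster by sending
 * another one. The bit would be cleared in the commit completion path.
 */
#define NFS_INO_COMMIT_PENDING	12	/* invented bit number */

static int example_start_commit(struct inode *inode)
{
	struct nfs_inode *nfsi = NFS_I(inode);

	if (test_and_set_bit(NFS_INO_COMMIT_PENDING, &nfsi->flags))
		return 0;	/* a COMMIT is already outstanding */
	return nfs_commit_inode(inode, 0);
}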
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-22 16:20 ` Steve Rago
@ 2009-12-23 18:39 ` Jan Kara
0 siblings, 1 reply; 96+ messages in thread
From: Jan Kara @ 2009-12-23 18:39 UTC (permalink / raw)
To: Steve Rago
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

On Tue 22-12-09 11:20:15, Steve Rago wrote:
> On Tue, 2009-12-22 at 13:25 +0100, Jan Kara wrote:
> > > I originally spent several months playing with the balance_dirty_pages
> > > algorithm. The main drawback is that it affects more than the inodes
> > > that the caller is writing and that the control of what to do is too
> > Can you be more specific here please?
>
> Sure; balance_dirty_pages() will schedule writeback by the flusher
> thread once the number of dirty pages exceeds dirty_background_ratio.
> The flusher thread calls writeback_inodes_wb() to flush all dirty inodes
> associated with the bdi. Similarly, the process dirtying the pages will
> call writeback_inodes_wbc() when its bdi threshold has been exceeded.
> The first problem is that these functions process all dirty inodes with
> the same backing device, which can lead to excess (duplicate) flushing
> of the same inode. Second, there is no distinction between pages that
> need to be committed and pages that have commits pending in
> NR_UNSTABLE_NFS/BDI_RECLAIMABLE (a page that has a commit pending won't
> be cleaned any faster by sending more commits). This tends to overstate
> the amount of memory that can be cleaned, leading to additional commit
> requests. Third, these functions generate a commit for each set of
> writes they do, which might not be appropriate. For background writing,
> you'd like to delay the commit as long as possible.
  Ok, I get it. Thanks for the explanation. The problem with more writing
threads bites us also for ordinary SATA drives (the IO pattern and thus
throughput gets worse and worse the more threads do writes). The plan is
to let only the flusher thread do the IO, while a throttled thread in
balance_dirty_pages just waits for the flusher thread to do the work.
There were even patches for this floating around but I'm not sure what's
happened to them. So that part of the problem should be easy to solve.
  Another part is about sending commits - if we have just one thread doing
flushing, we have no problems with excessive commits for one inode. You're
right that we may want to avoid sending commits for background writeback
but until we send the commit, pages are just accumulating in the unstable
state, aren't they? So we might want to periodically send the commit for
the inode anyway to get rid of those pages. So from this point of view,
sending a commit after each writepages call does not seem like such a bad
idea - although it might be more appropriate to send it some time after
the writepages call, when we are not close to the dirty limit, so that the
server has more time to do more natural "unforced" writeback...

> > > Part of the patch does implement a heuristic write-behind. See where
> > > nfs_wb_eager() is called.
> > I believe that if we had per-bdi dirty_background_ratio and set it low
> > for NFS's bdi, then the write-behind logic would not be needed
> > (essentially the flusher thread should submit the writes to the server
> > early).
>
> Maybe so, but you still need something to prevent the process that is
> dirtying pages from continuing, because a process can always write to
> memory faster than writing to disk/network, so the flusher won't be able
> to keep up.
  Yes, I agree that part is needed. But Fengguang already had patches in
that direction if my memory serves me well.
  So to recap: If we block tasks in balance_dirty_pages until unstable
pages are committed and make just one thread do the writing, what else is
missing to make you happy? :)

								Honza
--
Jan Kara <jack@suse.cz>
SUSE Labs, CR
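Jan's "send the commit some time after the writepages call" idea can be
sketched as a simple per-inode rate limit. Illustrative only: the
example_last_commit field (which would live in struct nfs_inode) and the
interval are hypothetical.

#include <linux/jiffies.h>
#include <linux/nfs_fs.h>

#define EXAMPLE_COMMIT_INTERVAL	(5 * HZ)	/* arbitrary */

static unsigned long example_last_commit;	/* per-inode in practice */

/*
 * Rate-limit COMMITs so unstable pages still get flushed periodically
 * without forcing the server on every writepages call.
 */
static void example_periodic_commit(struct inode *inode)
{
	if (time_after(jiffies,
		       example_last_commit + EXAMPLE_COMMIT_INTERVAL)) {
		example_last_commit = jiffies;
		nfs_commit_inode(inode, 0);	/* asynchronous COMMIT */
	}
}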
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 18:39 ` Jan Kara
@ 2009-12-23 20:16 ` Steve Rago
2009-12-23 21:49 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Steve Rago @ 2009-12-23 20:16 UTC (permalink / raw)
To: Jan Kara
Cc: Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, Trond.Myklebust@netapp.com,
    jens.axboe, Peter Staubach

On Wed, 2009-12-23 at 19:39 +0100, Jan Kara wrote:
> On Tue 22-12-09 11:20:15, Steve Rago wrote:
> > Sure; balance_dirty_pages() will schedule writeback by the flusher
> > thread once the number of dirty pages exceeds dirty_background_ratio.
> > The flusher thread calls writeback_inodes_wb() to flush all dirty inodes
> > associated with the bdi. Similarly, the process dirtying the pages will
> > call writeback_inodes_wbc() when its bdi threshold has been exceeded.
> > The first problem is that these functions process all dirty inodes with
> > the same backing device, which can lead to excess (duplicate) flushing
> > of the same inode. Second, there is no distinction between pages that
> > need to be committed and pages that have commits pending in
> > NR_UNSTABLE_NFS/BDI_RECLAIMABLE (a page that has a commit pending won't
> > be cleaned any faster by sending more commits). This tends to overstate
> > the amount of memory that can be cleaned, leading to additional commit
> > requests. Third, these functions generate a commit for each set of
> > writes they do, which might not be appropriate. For background writing,
> > you'd like to delay the commit as long as possible.
> Ok, I get it. Thanks for the explanation. The problem with more writing
> threads bites us also for ordinary SATA drives (the IO pattern and thus
> throughput gets worse and worse the more threads do writes). The plan is
> to let only the flusher thread do the IO, while a throttled thread in
> balance_dirty_pages just waits for the flusher thread to do the work.
> There were even patches for this floating around but I'm not sure what's
> happened to them. So that part of the problem should be easy to solve.
> Another part is about sending commits - if we have just one thread doing
> flushing, we have no problems with excessive commits for one inode. You're
> right that we may want to avoid sending commits for background writeback
> but until we send the commit, pages are just accumulating in the unstable
> state, aren't they? So we might want to periodically send the commit for
> the inode anyway to get rid of those pages. So from this point of view,
> sending a commit after each writepages call does not seem like such a bad
> idea - although it might be more appropriate to send it some time after
> the writepages call, when we are not close to the dirty limit, so that the
> server has more time to do more natural "unforced" writeback...

When to send the commit is a complex question to answer. If you delay
it long enough, the server's flusher threads will have already done most
of the work for you, so commits can be cheap, but you don't have access
to the necessary information to figure this out. You can't delay it too
long, though, because the unstable pages on the client will grow too
large, creating memory pressure. I have a second patch, which I haven't
posted yet, that adds feedback piggy-backed on the NFS write response,
which allows the NFS client to free pages proactively. This greatly
reduces the need to send commit messages, but it extends the protocol
(in a backward-compatible manner), so it could be hard to convince
people to accept.

> > > > Part of the patch does implement a heuristic write-behind. See where
> > > > nfs_wb_eager() is called.
> > > I believe that if we had per-bdi dirty_background_ratio and set it low
> > > for NFS's bdi, then the write-behind logic would not be needed
> > > (essentially the flusher thread should submit the writes to the server
> > > early).
> >
> > Maybe so, but you still need something to prevent the process that is
> > dirtying pages from continuing, because a process can always write to
> > memory faster than writing to disk/network, so the flusher won't be able
> > to keep up.
> Yes, I agree that part is needed. But Fengguang already had patches in
> that direction if my memory serves me well.
> So to recap: If we block tasks in balance_dirty_pages until unstable
> pages are committed and make just one thread do the writing, what else is
> missing to make you happy? :)
> Honza

As long as the performance improves substantially, I'll be happy. Part
of the problem that isn't addressed by your summary is the synchronous
writes. With the eager writeback patch, these are removed [see the
short-circuit in wb_priority()]. I would have expected that change to be
controversial, but I've not heard any complaints (yet). If the process
or the bdi flusher is writing and committing regularly, then pages
should be recycled quickly and the change shouldn't matter, but I'd need
to run my systemtap scripts to make sure.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 20:16 ` Steve Rago
@ 2009-12-23 21:49 ` Trond Myklebust
2009-12-23 23:13 ` Steve Rago
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2009-12-23 21:49 UTC (permalink / raw)
To: Steve Rago
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach

On Wed, 2009-12-23 at 15:16 -0500, Steve Rago wrote:
> When to send the commit is a complex question to answer. If you delay
> it long enough, the server's flusher threads will have already done most
> of the work for you, so commits can be cheap, but you don't have access
> to the necessary information to figure this out. You can't delay it too
> long, though, because the unstable pages on the client will grow too
> large, creating memory pressure. I have a second patch, which I haven't
> posted yet, that adds feedback piggy-backed on the NFS write response,
> which allows the NFS client to free pages proactively. This greatly
> reduces the need to send commit messages, but it extends the protocol
> (in a backward-compatible manner), so it could be hard to convince
> people to accept.

There are only 2 cases when the client should send a COMMIT:
	1. When it hits a synchronisation point (i.e. when the user calls
	   f/sync(), or close(), or when the user sets/clears a file
	   lock).
	2. When memory pressure causes the VM to want to free up those
	   pages that are marked as clean but unstable.

We should never be sending COMMIT in any other situation, since that
would imply that the client somehow has better information on how to
manage dirty pages on the server than the server's own VM.

Cheers
Trond
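Trond's two-case rule can be written down directly; a sketch, with the
memory-pressure signal left abstract since how the VM communicates it is
exactly what the later patch series in this thread works out:

#include <linux/writeback.h>

/*
 * Sketch of the two-case rule. WB_SYNC_ALL covers the explicit
 * synchronisation points (f/sync(), close(), file locks); the
 * vm_wants_reclaim argument stands in for whatever mechanism the VM
 * uses to ask for clean-but-unstable pages back.
 */
static int example_should_send_commit(struct writeback_control *wbc,
				      int vm_wants_reclaim)
{
	if (wbc->sync_mode == WB_SYNC_ALL)	/* case 1: sync point */
		return 1;
	if (vm_wants_reclaim)			/* case 2: memory pressure */
		return 1;
	return 0;	/* otherwise leave it to the server's own VM */
}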
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 21:49 ` Trond Myklebust
@ 2009-12-23 23:13 ` Steve Rago
2009-12-23 23:44 ` Trond Myklebust
0 siblings, 1 reply; 96+ messages in thread
From: Steve Rago @ 2009-12-23 23:13 UTC (permalink / raw)
To: Trond Myklebust
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach

On Wed, 2009-12-23 at 22:49 +0100, Trond Myklebust wrote:
> > When to send the commit is a complex question to answer. If you delay
> > it long enough, the server's flusher threads will have already done most
> > of the work for you, so commits can be cheap, but you don't have access
> > to the necessary information to figure this out. You can't delay it too
> > long, though, because the unstable pages on the client will grow too
> > large, creating memory pressure. I have a second patch, which I haven't
> > posted yet, that adds feedback piggy-backed on the NFS write response,
> > which allows the NFS client to free pages proactively. This greatly
> > reduces the need to send commit messages, but it extends the protocol
> > (in a backward-compatible manner), so it could be hard to convince
> > people to accept.
>
> There are only 2 cases when the client should send a COMMIT:
> 	1. When it hits a synchronisation point (i.e. when the user calls
> 	   f/sync(), or close(), or when the user sets/clears a file
> 	   lock).
> 	2. When memory pressure causes the VM to want to free up those
> 	   pages that are marked as clean but unstable.
>
> We should never be sending COMMIT in any other situation, since that
> would imply that the client somehow has better information on how to
> manage dirty pages on the server than the server's own VM.
>
> Cheers
> Trond

#2 is the difficult one. If you wait for memory pressure, you could
have waited too long, because depending on the latency of the commit,
you could run into low-memory situations. Then mayhem ensues, the
oom-killer gets cranky (if you haven't disabled it), and stuff starts
failing and/or hanging. So you need to be careful about setting the
threshold for generating a commit so that the client doesn't run out of
memory before the server can respond.

Steve
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 23:13 ` Steve Rago
@ 2009-12-23 23:44 ` Trond Myklebust
2009-12-24 4:30 ` Steve Rago
0 siblings, 1 reply; 96+ messages in thread
From: Trond Myklebust @ 2009-12-23 23:44 UTC (permalink / raw)
To: Steve Rago
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach

On Wed, 2009-12-23 at 18:13 -0500, Steve Rago wrote:
> On Wed, 2009-12-23 at 22:49 +0100, Trond Myklebust wrote:
> > There are only 2 cases when the client should send a COMMIT:
> > 	1. When it hits a synchronisation point (i.e. when the user calls
> > 	   f/sync(), or close(), or when the user sets/clears a file
> > 	   lock).
> > 	2. When memory pressure causes the VM to want to free up those
> > 	   pages that are marked as clean but unstable.
> >
> > We should never be sending COMMIT in any other situation, since that
> > would imply that the client somehow has better information on how to
> > manage dirty pages on the server than the server's own VM.
> >
> > Cheers
> > Trond
>
> #2 is the difficult one. If you wait for memory pressure, you could
> have waited too long, because depending on the latency of the commit,
> you could run into low-memory situations. Then mayhem ensues, the
> oom-killer gets cranky (if you haven't disabled it), and stuff starts
> failing and/or hanging. So you need to be careful about setting the
> threshold for generating a commit so that the client doesn't run out of
> memory before the server can respond.

Right, but this is why we have limits on the total number of dirty pages
that can be kept in memory. The NFS unstable writes don't significantly
change that model, they just add an extra step: once all the dirty data
has been transmitted to the server, your COMMIT defines a
synchronisation point after which you know that the data you just sent
is all on disk. Given a reasonable NFS server implementation, it will
already have started the write out of that data, and so hopefully the
COMMIT operation itself will run reasonably quickly.

Any userland application with basic data integrity requirements will
have the same expectations. It will write out the data and then fsync()
at regular intervals. I've never heard of any expectations from
filesystem and VM designers that applications should be required to
fine-tune the length of those intervals in order to achieve decent
performance.

Cheers
Trond
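The userland pattern Trond describes looks like this in practice: stream
data and fsync() at regular intervals, so each COMMIT on the wire covers a
bounded amount of unstable data. The chunk size and interval below are
arbitrary illustrations.

#include <fcntl.h>
#include <unistd.h>

#define CHUNK		(1 << 20)	/* 1 MiB per write() */
#define SYNC_EVERY	256		/* fsync() every 256 MiB */

/* Write nchunks * CHUNK bytes, with a sync point every SYNC_EVERY chunks. */
static int stream_out(int fd, const char *buf, long nchunks)
{
	long i;

	for (i = 0; i < nchunks; i++) {
		if (write(fd, buf, CHUNK) != CHUNK)
			return -1;
		if ((i + 1) % SYNC_EVERY == 0 && fsync(fd) != 0)
			return -1;	/* synchronisation point */
	}
	return fsync(fd);		/* final commit before close() */
}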
* Re: [PATCH] improve the performance of large sequential write NFS workloads
2009-12-23 23:44 ` Trond Myklebust
@ 2009-12-24  4:30 ` Steve Rago
0 siblings, 0 replies; 96+ messages in thread
From: Steve Rago @ 2009-12-24 4:30 UTC (permalink / raw)
To: Trond Myklebust
Cc: Jan Kara, Wu Fengguang, Peter Zijlstra, linux-nfs@vger.kernel.org,
    linux-kernel@vger.kernel.org, jens.axboe, Peter Staubach

On Thu, 2009-12-24 at 00:44 +0100, Trond Myklebust wrote:
> > #2 is the difficult one. If you wait for memory pressure, you could
> > have waited too long, because depending on the latency of the commit,
> > you could run into low-memory situations. Then mayhem ensues, the
> > oom-killer gets cranky (if you haven't disabled it), and stuff starts
> > failing and/or hanging. So you need to be careful about setting the
> > threshold for generating a commit so that the client doesn't run out of
> > memory before the server can respond.
>
> Right, but this is why we have limits on the total number of dirty pages
> that can be kept in memory. The NFS unstable writes don't significantly
> change that model, they just add an extra step: once all the dirty data
> has been transmitted to the server, your COMMIT defines a
> synchronisation point after which you know that the data you just sent
> is all on disk. Given a reasonable NFS server implementation, it will
> already have started the write out of that data, and so hopefully the
> COMMIT operation itself will run reasonably quickly.

Right. The trick is to do this with the best performance possible.

> Any userland application with basic data integrity requirements will
> have the same expectations. It will write out the data and then fsync()
> at regular intervals. I've never heard of any expectations from
> filesystem and VM designers that applications should be required to
> fine-tune the length of those intervals in order to achieve decent
> performance.

Agreed, except that the more you call fsync(), the more you are stalling
the writing, so application designers must use fsync() judiciously.
Otherwise they'd just use synchronous writes. (Apologies if I sound like
Captain Obvious.)

Thanks,
Steve