linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] NFS: introduce writeback wait queue
@ 2009-10-04  3:01 Wu Fengguang
  2009-10-04  3:05 ` Wu Fengguang
  2009-10-05  7:10 ` [PATCH v2] " Wu Fengguang
  0 siblings, 2 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-10-04  3:01 UTC (permalink / raw)
  To: jens.axboe@oracle.com
  Cc: Trond Myklebust, Andrew Morton, linux-fsdevel, LKML, linux-nfs

The generic writeback routines are departing from congestion_wait()
in preference of get_request_wait(), aka. to wait on the block queues.

Introduce the missing writeback wait queue for NFS, otherwise its
writeback pages will grow out of control.

The SYNC writes can use the full queue space (2*nfs_congestion_kb), while
the ASYNC writes can only use half queue space. This way SYNC writes won't
be blocked by the ASYNC ones at all.

We'll be waiting inside the NFS_INO_FLUSHING lock, hence also be
blocking possible dirtiers.  This should not be a big problem.
And we should be able to obsolete the NFS_INO_FLUSHING with more
general writeback improvements in long term.

CC: Jens Axboe <jens.axboe@oracle.com> 
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/nfs/client.c           |    2 
 fs/nfs/write.c            |   73 +++++++++++++++++++++++++++++-------
 include/linux/nfs_fs_sb.h |    1 
 3 files changed, 62 insertions(+), 14 deletions(-)

--- linux.orig/fs/nfs/write.c	2009-10-04 08:47:16.000000000 +0800
+++ linux/fs/nfs/write.c	2009-10-04 10:55:32.000000000 +0800
@@ -189,24 +189,58 @@ static int wb_priority(struct writeback_
 
 int nfs_congestion_kb;
 
+/*
+ * SYNC requests will be blocked on NFS_SYNC_*_THRESH
+ * ASYNC requests will be blocked on NFS_CONGESTION_*_THRESH
+ */
+#define NFS_SYNC_WAIT_THRESH 	(nfs_congestion_kb >> (PAGE_SHIFT-11))
+#define NFS_SYNC_WAKEUP_THRESH	\
+	(NFS_SYNC_WAIT_THRESH - (NFS_SYNC_WAIT_THRESH >> 2))
+
 #define NFS_CONGESTION_ON_THRESH 	(nfs_congestion_kb >> (PAGE_SHIFT-10))
 #define NFS_CONGESTION_OFF_THRESH	\
 	(NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
 
-static int nfs_set_page_writeback(struct page *page)
+static void nfs_writeback_wait(struct page *page, struct writeback_control *wbc)
 {
-	int ret = test_set_page_writeback(page);
+	struct inode *inode = page->mapping->host;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	int is_sync = wbc->sync_mode == WB_SYNC_NONE;
+	DEFINE_WAIT(wait);
 
-	if (!ret) {
-		struct inode *inode = page->mapping->host;
-		struct nfs_server *nfss = NFS_SERVER(inode);
+	if (atomic_long_inc_return(&nfss->writeback) < NFS_CONGESTION_ON_THRESH)
+		return;
 
-		if (atomic_long_inc_return(&nfss->writeback) >
-				NFS_CONGESTION_ON_THRESH) {
-			set_bdi_congested(&nfss->backing_dev_info,
-						BLK_RW_ASYNC);
-		}
+	set_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
+	if (is_sync && atomic_long_read(&nfss->writeback) <
+	    NFS_SYNC_WAIT_THRESH)
+		return;
+
+	for (;;) {
+		prepare_to_wait_exclusive(&nfss->writeback_wait[is_sync], &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		io_schedule();
+
+		finish_wait(&nfss->writeback_wait[is_sync], &wait);
+
+		if (atomic_long_read(&nfss->writeback) <
+		    NFS_CONGESTION_OFF_THRESH)
+			break;
+		if (is_sync && atomic_long_read(&nfss->writeback) <
+		    NFS_SYNC_WAKEUP_THRESH)
+			break;
 	}
+}
+
+static int nfs_set_page_writeback(struct page *page, struct writeback_control *wbc)
+{
+	int ret = test_set_page_writeback(page);
+
+	if (!ret)
+		nfs_writeback_wait(page, wbc);
+
 	return ret;
 }
 
@@ -216,8 +250,18 @@ static void nfs_end_page_writeback(struc
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
-	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
-		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
+	if (atomic_long_dec_return(&nfss->writeback) < NFS_SYNC_WAKEUP_THRESH) {
+		if (waitqueue_active(&nfss->writeback_wait[1]))
+			wake_up(&nfss->writeback_wait[1]);
+		if (atomic_long_read(&nfss->writeback) <
+		    NFS_CONGESTION_OFF_THRESH) {
+			clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+			if (waitqueue_active(&nfss->writeback_wait[0]))
+				wake_up(&nfss->writeback_wait[0]);
+		}
+	}
+
 }
 
 static struct nfs_page *nfs_find_and_lock_request(struct page *page)
@@ -254,6 +298,7 @@ static struct nfs_page *nfs_find_and_loc
  * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
+				struct writeback_control *wbc,
 				struct page *page)
 {
 	struct nfs_page *req;
@@ -266,7 +311,7 @@ static int nfs_page_async_flush(struct n
 	if (IS_ERR(req))
 		goto out;
 
-	ret = nfs_set_page_writeback(page);
+	ret = nfs_set_page_writeback(page, wbc);
 	BUG_ON(ret != 0);
 	BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
 
@@ -286,7 +331,7 @@ static int nfs_do_writepage(struct page 
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
 	nfs_pageio_cond_complete(pgio, page->index);
-	return nfs_page_async_flush(pgio, page);
+	return nfs_page_async_flush(pgio, wbc, page);
 }
 
 /*
--- linux.orig/include/linux/nfs_fs_sb.h	2009-10-04 09:31:25.000000000 +0800
+++ linux/include/linux/nfs_fs_sb.h	2009-10-04 09:58:11.000000000 +0800
@@ -108,6 +108,7 @@ struct nfs_server {
 	struct nfs_iostats *	io_stats;	/* I/O statistics */
 	struct backing_dev_info	backing_dev_info;
 	atomic_long_t		writeback;	/* number of writeback pages */
+	wait_queue_head_t	writeback_wait[2];
 	int			flags;		/* various flags */
 	unsigned int		caps;		/* server capabilities */
 	unsigned int		rsize;		/* read size */
--- linux.orig/fs/nfs/client.c	2009-10-04 09:59:46.000000000 +0800
+++ linux/fs/nfs/client.c	2009-10-04 10:00:55.000000000 +0800
@@ -991,6 +991,8 @@ static struct nfs_server *nfs_alloc_serv
 	INIT_LIST_HEAD(&server->master_link);
 
 	atomic_set(&server->active, 0);
+	init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]);
+	init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]);
 
 	server->io_stats = nfs_alloc_iostats();
 	if (!server->io_stats) {

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] NFS: introduce writeback wait queue
  2009-10-04  3:01 [PATCH] NFS: introduce writeback wait queue Wu Fengguang
@ 2009-10-04  3:05 ` Wu Fengguang
  2009-10-05 11:00   ` Jens Axboe
  2009-10-05  7:10 ` [PATCH v2] " Wu Fengguang
  1 sibling, 1 reply; 11+ messages in thread
From: Wu Fengguang @ 2009-10-04  3:05 UTC (permalink / raw)
  To: jens.axboe@oracle.com
  Cc: Trond Myklebust, Andrew Morton, linux-fsdevel, LKML, linux-nfs

Hi Jens,

This is a bug fix for 2.6.32. Maybe other filesystems that are not
based on a block queue will have similar issues.

Thanks,
Fengguang


On Sun, Oct 04, 2009 at 11:01:53AM +0800, Wu Fengguang wrote:
> The generic writeback routines are departing from congestion_wait()
> in preference of get_request_wait(), aka. to wait on the block queues.
> 
> Introduce the missing writeback wait queue for NFS, otherwise its
> writeback pages will grow out of control.
> 
> The SYNC writes can use the full queue space (2*nfs_congestion_kb), while
> the ASYNC writes can only use half queue space. This way SYNC writes won't
> be blocked by the ASYNC ones at all.
> 
> We'll be waiting inside the NFS_INO_FLUSHING lock, hence also be
> blocking possible dirtiers.  This should not be a big problem.
> And we should be able to obsolete the NFS_INO_FLUSHING with more
> general writeback improvements in long term.
> 
> CC: Jens Axboe <jens.axboe@oracle.com> 
> CC: Trond Myklebust <Trond.Myklebust@netapp.com>
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> ---
>  fs/nfs/client.c           |    2 
>  fs/nfs/write.c            |   73 +++++++++++++++++++++++++++++-------
>  include/linux/nfs_fs_sb.h |    1 
>  3 files changed, 62 insertions(+), 14 deletions(-)
> 
> --- linux.orig/fs/nfs/write.c	2009-10-04 08:47:16.000000000 +0800
> +++ linux/fs/nfs/write.c	2009-10-04 10:55:32.000000000 +0800
> @@ -189,24 +189,58 @@ static int wb_priority(struct writeback_
>  
>  int nfs_congestion_kb;
>  
> +/*
> + * SYNC requests will be blocked on NFS_SYNC_*_THRESH
> + * ASYNC requests will be blocked on NFS_CONGESTION_*_THRESH
> + */
> +#define NFS_SYNC_WAIT_THRESH 	(nfs_congestion_kb >> (PAGE_SHIFT-11))
> +#define NFS_SYNC_WAKEUP_THRESH	\
> +	(NFS_SYNC_WAIT_THRESH - (NFS_SYNC_WAIT_THRESH >> 2))
> +
>  #define NFS_CONGESTION_ON_THRESH 	(nfs_congestion_kb >> (PAGE_SHIFT-10))
>  #define NFS_CONGESTION_OFF_THRESH	\
>  	(NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
>  
> -static int nfs_set_page_writeback(struct page *page)
> +static void nfs_writeback_wait(struct page *page, struct writeback_control *wbc)
>  {
> -	int ret = test_set_page_writeback(page);
> +	struct inode *inode = page->mapping->host;
> +	struct nfs_server *nfss = NFS_SERVER(inode);
> +	int is_sync = wbc->sync_mode == WB_SYNC_NONE;
> +	DEFINE_WAIT(wait);
>  
> -	if (!ret) {
> -		struct inode *inode = page->mapping->host;
> -		struct nfs_server *nfss = NFS_SERVER(inode);
> +	if (atomic_long_inc_return(&nfss->writeback) < NFS_CONGESTION_ON_THRESH)
> +		return;
>  
> -		if (atomic_long_inc_return(&nfss->writeback) >
> -				NFS_CONGESTION_ON_THRESH) {
> -			set_bdi_congested(&nfss->backing_dev_info,
> -						BLK_RW_ASYNC);
> -		}
> +	set_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
> +
> +	if (is_sync && atomic_long_read(&nfss->writeback) <
> +	    NFS_SYNC_WAIT_THRESH)
> +		return;
> +
> +	for (;;) {
> +		prepare_to_wait_exclusive(&nfss->writeback_wait[is_sync], &wait,
> +					  TASK_UNINTERRUPTIBLE);
> +
> +		io_schedule();
> +
> +		finish_wait(&nfss->writeback_wait[is_sync], &wait);
> +
> +		if (atomic_long_read(&nfss->writeback) <
> +		    NFS_CONGESTION_OFF_THRESH)
> +			break;
> +		if (is_sync && atomic_long_read(&nfss->writeback) <
> +		    NFS_SYNC_WAKEUP_THRESH)
> +			break;
>  	}
> +}
> +
> +static int nfs_set_page_writeback(struct page *page, struct writeback_control *wbc)
> +{
> +	int ret = test_set_page_writeback(page);
> +
> +	if (!ret)
> +		nfs_writeback_wait(page, wbc);
> +
>  	return ret;
>  }
>  
> @@ -216,8 +250,18 @@ static void nfs_end_page_writeback(struc
>  	struct nfs_server *nfss = NFS_SERVER(inode);
>  
>  	end_page_writeback(page);
> -	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
> -		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
> +
> +	if (atomic_long_dec_return(&nfss->writeback) < NFS_SYNC_WAKEUP_THRESH) {
> +		if (waitqueue_active(&nfss->writeback_wait[1]))
> +			wake_up(&nfss->writeback_wait[1]);
> +		if (atomic_long_read(&nfss->writeback) <
> +		    NFS_CONGESTION_OFF_THRESH) {
> +			clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
> +			if (waitqueue_active(&nfss->writeback_wait[0]))
> +				wake_up(&nfss->writeback_wait[0]);
> +		}
> +	}
> +
>  }
>  
>  static struct nfs_page *nfs_find_and_lock_request(struct page *page)
> @@ -254,6 +298,7 @@ static struct nfs_page *nfs_find_and_loc
>   * May return an error if the user signalled nfs_wait_on_request().
>   */
>  static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
> +				struct writeback_control *wbc,
>  				struct page *page)
>  {
>  	struct nfs_page *req;
> @@ -266,7 +311,7 @@ static int nfs_page_async_flush(struct n
>  	if (IS_ERR(req))
>  		goto out;
>  
> -	ret = nfs_set_page_writeback(page);
> +	ret = nfs_set_page_writeback(page, wbc);
>  	BUG_ON(ret != 0);
>  	BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
>  
> @@ -286,7 +331,7 @@ static int nfs_do_writepage(struct page 
>  	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
>  
>  	nfs_pageio_cond_complete(pgio, page->index);
> -	return nfs_page_async_flush(pgio, page);
> +	return nfs_page_async_flush(pgio, wbc, page);
>  }
>  
>  /*
> --- linux.orig/include/linux/nfs_fs_sb.h	2009-10-04 09:31:25.000000000 +0800
> +++ linux/include/linux/nfs_fs_sb.h	2009-10-04 09:58:11.000000000 +0800
> @@ -108,6 +108,7 @@ struct nfs_server {
>  	struct nfs_iostats *	io_stats;	/* I/O statistics */
>  	struct backing_dev_info	backing_dev_info;
>  	atomic_long_t		writeback;	/* number of writeback pages */
> +	wait_queue_head_t	writeback_wait[2];
>  	int			flags;		/* various flags */
>  	unsigned int		caps;		/* server capabilities */
>  	unsigned int		rsize;		/* read size */
> --- linux.orig/fs/nfs/client.c	2009-10-04 09:59:46.000000000 +0800
> +++ linux/fs/nfs/client.c	2009-10-04 10:00:55.000000000 +0800
> @@ -991,6 +991,8 @@ static struct nfs_server *nfs_alloc_serv
>  	INIT_LIST_HEAD(&server->master_link);
>  
>  	atomic_set(&server->active, 0);
> +	init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]);
> +	init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]);
>  
>  	server->io_stats = nfs_alloc_iostats();
>  	if (!server->io_stats) {

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2] NFS: introduce writeback wait queue
  2009-10-04  3:01 [PATCH] NFS: introduce writeback wait queue Wu Fengguang
  2009-10-04  3:05 ` Wu Fengguang
@ 2009-10-05  7:10 ` Wu Fengguang
  2009-10-05  7:35   ` Wu Fengguang
  2009-10-05 11:01   ` Myklebust, Trond
  1 sibling, 2 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-10-05  7:10 UTC (permalink / raw)
  To: jens.axboe-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
  Cc: Chris Mason, Trond Myklebust, Andrew Morton,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, LKML,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

Hi all,

This version makes two standalone functions for easier reuse.

Before patch, nr_writeback is near 1G on my 2GB laptop:

     nr_writeback         nr_dirty      nr_unstable
           203994                2           154469
           203994                2           154469

After patch, nr_writeback is limited to nfs_congestion_kb=42MB.

     nr_writeback         nr_dirty      nr_unstable
            11180            34195            11754
             9865            36821             8234
            10137            36695             9338

One minor problem I noticed is, NFS writeback is not very smooth.
This per 0.1s sampled trace shows that it can sometimes stuck for
up to 0.5s:

     nr_writeback         nr_dirty      nr_unstable
            11055            37408             9599
            10311            37315            10529
            10869            35920            11459
            10869            35920            11459
            10869            35920            11459
            10869            35920            11459
            10869            35920            11459
            10838            35891            10042
            10466            35891            10414
            10900            34744            11437
            10249            34744            12088
            10249            34744            12088
            10249            34744            12088
            10249            34744            12088
            10249            34744            12088
            10249            34744            12088
            10133            34743            10663
            10505            34743            11035
            10970            34991            11345
            10691            34991            11593
            10691            34991            11593
            10691            34991            11593
            10691            34991            11593
            10691            34991            11593

Trond, I guess nr_writeback/nr_unstable are decreased in async RPC
"complete" events. It is understandable that nr_dirty can sometimes
stuck on local waits, but the "local determined" nr_dirty and "remote
determined" nr_writeback/nr_unstable tend to stuck at the same time?
Did I miss something (that could be obvious to you)?

Thanks,
Fengguang
---
Subject: NFS: introduce writeback wait queue

The generic writeback routines are departing from congestion_wait()
in preference of get_request_wait(), aka. waiting on the block queues.

Introduce the missing writeback wait queue for NFS, otherwise its
writeback pages will grow out of control.

CC: Jens Axboe <jens.axboe-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org> 
CC: Chris Mason <chris.mason-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
CC: Trond Myklebust <Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Wu Fengguang <fengguang.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---

 fs/nfs/client.c           |    2 
 fs/nfs/write.c            |   86 ++++++++++++++++++++++++++++--------
 include/linux/nfs_fs_sb.h |    1 
 3 files changed, 72 insertions(+), 17 deletions(-)

--- linux.orig/fs/nfs/write.c	2009-10-05 13:27:20.000000000 +0800
+++ linux/fs/nfs/write.c	2009-10-05 14:48:39.000000000 +0800
@@ -189,24 +189,72 @@ static int wb_priority(struct writeback_
 
 int nfs_congestion_kb;
 
-#define NFS_CONGESTION_ON_THRESH 	(nfs_congestion_kb >> (PAGE_SHIFT-10))
-#define NFS_CONGESTION_OFF_THRESH	\
-	(NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
+/*
+ * SYNC requests will be blocked on (2*limit) and wakeup on (2*limit - limit/8)
+ * ASYNC requests will be blocked on (limit) and wakeup on (limit - limit/8)
+ * In this way SYNC writes will never be blocked by ASYNC ones.
+ */
 
-static int nfs_set_page_writeback(struct page *page)
+static void nfs_writeback_wait(atomic_long_t *nr, long limit, int is_sync,
+			       struct backing_dev_info *bdi,
+			       wait_queue_head_t *wqh)
 {
-	int ret = test_set_page_writeback(page);
+	DEFINE_WAIT(wait);
+	int hard_limit = limit * 2;
 
-	if (!ret) {
-		struct inode *inode = page->mapping->host;
-		struct nfs_server *nfss = NFS_SERVER(inode);
+	if (atomic_long_read(nr) <= limit)
+		return;
+
+	set_bdi_congested(bdi, BLK_RW_ASYNC);
 
-		if (atomic_long_inc_return(&nfss->writeback) >
-				NFS_CONGESTION_ON_THRESH) {
-			set_bdi_congested(&nfss->backing_dev_info,
-						BLK_RW_ASYNC);
+	if (is_sync && atomic_long_read(nr) <= hard_limit)
+		return;
+
+	for (;;) {
+		prepare_to_wait(&wqh[is_sync], &wait, TASK_UNINTERRUPTIBLE);
+
+		io_schedule();
+
+		if (atomic_long_read(nr) <= limit - limit/8)
+			break;
+		if (is_sync && atomic_long_read(nr) <= hard_limit - limit/8)
+			break;
+	}
+	finish_wait(&wqh[is_sync], &wait);
+}
+
+static void nfs_writeback_wakeup(long nr, long limit,
+				 struct backing_dev_info *bdi,
+				 wait_queue_head_t *wqh)
+{
+	int hard_limit = limit * 2;
+
+	if (nr < hard_limit - limit/8) {
+		if (waitqueue_active(&wqh[BLK_RW_SYNC]))
+			wake_up(&wqh[BLK_RW_SYNC]);
+		if (nr < limit - limit/8) {
+			clear_bdi_congested(bdi, BLK_RW_ASYNC);
+			if (waitqueue_active(&wqh[BLK_RW_ASYNC]))
+				wake_up(&wqh[BLK_RW_ASYNC]);
 		}
 	}
+}
+
+static int nfs_set_page_writeback(struct page *page,
+				  struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct nfs_server *nfss = NFS_SERVER(inode);
+	int ret = test_set_page_writeback(page);
+
+	if (!ret) {
+		atomic_long_inc(&nfss->writeback);
+		nfs_writeback_wait(&nfss->writeback,
+				   nfs_congestion_kb >> (PAGE_SHIFT-10),
+				   wbc->sync_mode == WB_SYNC_ALL,
+				   &nfss->backing_dev_info,
+				   nfss->writeback_wait);
+	}
 	return ret;
 }
 
@@ -216,8 +264,11 @@ static void nfs_end_page_writeback(struc
 	struct nfs_server *nfss = NFS_SERVER(inode);
 
 	end_page_writeback(page);
-	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
-		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+
+	nfs_writeback_wakeup(atomic_long_dec_return(&nfss->writeback),
+			     nfs_congestion_kb >> (PAGE_SHIFT-10),
+			     &nfss->backing_dev_info,
+			     nfss->writeback_wait);
 }
 
 static struct nfs_page *nfs_find_and_lock_request(struct page *page)
@@ -254,7 +305,8 @@ static struct nfs_page *nfs_find_and_loc
  * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-				struct page *page)
+				struct page *page,
+				struct writeback_control *wbc)
 {
 	struct nfs_page *req;
 	int ret = 0;
@@ -266,7 +318,7 @@ static int nfs_page_async_flush(struct n
 	if (IS_ERR(req))
 		goto out;
 
-	ret = nfs_set_page_writeback(page);
+	ret = nfs_set_page_writeback(page, wbc);
 	BUG_ON(ret != 0);
 	BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
 
@@ -286,7 +338,7 @@ static int nfs_do_writepage(struct page 
 	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
 	nfs_pageio_cond_complete(pgio, page->index);
-	return nfs_page_async_flush(pgio, page);
+	return nfs_page_async_flush(pgio, page, wbc);
 }
 
 /*
--- linux.orig/include/linux/nfs_fs_sb.h	2009-10-05 13:27:20.000000000 +0800
+++ linux/include/linux/nfs_fs_sb.h	2009-10-05 13:28:31.000000000 +0800
@@ -108,6 +108,7 @@ struct nfs_server {
 	struct nfs_iostats *	io_stats;	/* I/O statistics */
 	struct backing_dev_info	backing_dev_info;
 	atomic_long_t		writeback;	/* number of writeback pages */
+	wait_queue_head_t	writeback_wait[2];
 	int			flags;		/* various flags */
 	unsigned int		caps;		/* server capabilities */
 	unsigned int		rsize;		/* read size */
--- linux.orig/fs/nfs/client.c	2009-10-05 13:27:20.000000000 +0800
+++ linux/fs/nfs/client.c	2009-10-05 13:28:31.000000000 +0800
@@ -991,6 +991,8 @@ static struct nfs_server *nfs_alloc_serv
 	INIT_LIST_HEAD(&server->master_link);
 
 	atomic_set(&server->active, 0);
+	init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]);
+	init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]);
 
 	server->io_stats = nfs_alloc_iostats();
 	if (!server->io_stats) {
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] NFS: introduce writeback wait queue
  2009-10-05  7:10 ` [PATCH v2] " Wu Fengguang
@ 2009-10-05  7:35   ` Wu Fengguang
  2009-10-05  7:39     ` Wu Fengguang
  2009-10-05 11:01   ` Myklebust, Trond
  1 sibling, 1 reply; 11+ messages in thread
From: Wu Fengguang @ 2009-10-05  7:35 UTC (permalink / raw)
  To: jens.axboe@oracle.com
  Cc: Chris Mason, Trond Myklebust, Andrew Morton, linux-fsdevel, LKML,
	linux-nfs

On Mon, Oct 05, 2009 at 03:10:26PM +0800, Wu Fengguang wrote:
> Hi all,
> 
> This version makes two standalone functions for easier reuse.
> 
> Before patch, nr_writeback is near 1G on my 2GB laptop:
> 
>      nr_writeback         nr_dirty      nr_unstable
>            203994                2           154469
>            203994                2           154469

Sorry, I cannot reproduce the above trace on linux-next.  Maybe it's
one of my private patches' fault.

Trond, I see this trace on linux-next. There are no more dirty pages
when `cp' aborts after filling up the partition:

        cp: writing `/mnt/test/zero3': No space left on device

I noticed that since then nr_writeback is decreased very slowly
(~100 pages per second). Looks like an interesting behavior.

     nr_writeback         nr_dirty      nr_unstable
            41230            36284             8764
            41230            37307             7755
            40009            42812             3818
            32619            42812            11198
            32314            42812            11503
            31894            42812            11862
            31832            42812            11871
            31770            42812            11871
            31721            42812            11871
            31653            42812            11871
            40789            33754            11871
            40713            33754            11871
            40638            33754            11871
            40566            33754            11871
            43901            30313            11871
            74164                0            11871
            74062                0            11871
            73978                0            11871
            73904                0            11871
            73858                0            11871
            73798                0            11871
            73688                0            11871
            73580                0            11871
            73477                0            11871

Thanks,
Fengguang

> After patch, nr_writeback is limited to nfs_congestion_kb=42MB.
> 
>      nr_writeback         nr_dirty      nr_unstable
>             11180            34195            11754
>              9865            36821             8234
>             10137            36695             9338
> 
> One minor problem I noticed is, NFS writeback is not very smooth.
> This per 0.1s sampled trace shows that it can sometimes stuck for
> up to 0.5s:
> 
>      nr_writeback         nr_dirty      nr_unstable
>             11055            37408             9599
>             10311            37315            10529
>             10869            35920            11459
>             10869            35920            11459
>             10869            35920            11459
>             10869            35920            11459
>             10869            35920            11459
>             10838            35891            10042
>             10466            35891            10414
>             10900            34744            11437
>             10249            34744            12088
>             10249            34744            12088
>             10249            34744            12088
>             10249            34744            12088
>             10249            34744            12088
>             10249            34744            12088
>             10133            34743            10663
>             10505            34743            11035
>             10970            34991            11345
>             10691            34991            11593
>             10691            34991            11593
>             10691            34991            11593
>             10691            34991            11593
>             10691            34991            11593
> 
> Trond, I guess nr_writeback/nr_unstable are decreased in async RPC
> "complete" events. It is understandable that nr_dirty can sometimes
> stuck on local waits, but the "local determined" nr_dirty and "remote
> determined" nr_writeback/nr_unstable tend to stuck at the same time?
> Did I miss something (that could be obvious to you)?
> 
> Thanks,
> Fengguang
> ---
> Subject: NFS: introduce writeback wait queue
> 
> The generic writeback routines are departing from congestion_wait()
> in preference of get_request_wait(), aka. waiting on the block queues.
> 
> Introduce the missing writeback wait queue for NFS, otherwise its
> writeback pages will grow out of control.
> 
> CC: Jens Axboe <jens.axboe@oracle.com> 
> CC: Chris Mason <chris.mason@oracle.com>
> CC: Trond Myklebust <Trond.Myklebust@netapp.com>
> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
> ---
> 
>  fs/nfs/client.c           |    2 
>  fs/nfs/write.c            |   86 ++++++++++++++++++++++++++++--------
>  include/linux/nfs_fs_sb.h |    1 
>  3 files changed, 72 insertions(+), 17 deletions(-)
> 
> --- linux.orig/fs/nfs/write.c	2009-10-05 13:27:20.000000000 +0800
> +++ linux/fs/nfs/write.c	2009-10-05 14:48:39.000000000 +0800
> @@ -189,24 +189,72 @@ static int wb_priority(struct writeback_
>  
>  int nfs_congestion_kb;
>  
> -#define NFS_CONGESTION_ON_THRESH 	(nfs_congestion_kb >> (PAGE_SHIFT-10))
> -#define NFS_CONGESTION_OFF_THRESH	\
> -	(NFS_CONGESTION_ON_THRESH - (NFS_CONGESTION_ON_THRESH >> 2))
> +/*
> + * SYNC requests will be blocked on (2*limit) and wakeup on (2*limit - limit/8)
> + * ASYNC requests will be blocked on (limit) and wakeup on (limit - limit/8)
> + * In this way SYNC writes will never be blocked by ASYNC ones.
> + */
>  
> -static int nfs_set_page_writeback(struct page *page)
> +static void nfs_writeback_wait(atomic_long_t *nr, long limit, int is_sync,
> +			       struct backing_dev_info *bdi,
> +			       wait_queue_head_t *wqh)
>  {
> -	int ret = test_set_page_writeback(page);
> +	DEFINE_WAIT(wait);
> +	int hard_limit = limit * 2;
>  
> -	if (!ret) {
> -		struct inode *inode = page->mapping->host;
> -		struct nfs_server *nfss = NFS_SERVER(inode);
> +	if (atomic_long_read(nr) <= limit)
> +		return;
> +
> +	set_bdi_congested(bdi, BLK_RW_ASYNC);
>  
> -		if (atomic_long_inc_return(&nfss->writeback) >
> -				NFS_CONGESTION_ON_THRESH) {
> -			set_bdi_congested(&nfss->backing_dev_info,
> -						BLK_RW_ASYNC);
> +	if (is_sync && atomic_long_read(nr) <= hard_limit)
> +		return;
> +
> +	for (;;) {
> +		prepare_to_wait(&wqh[is_sync], &wait, TASK_UNINTERRUPTIBLE);
> +
> +		io_schedule();
> +
> +		if (atomic_long_read(nr) <= limit - limit/8)
> +			break;
> +		if (is_sync && atomic_long_read(nr) <= hard_limit - limit/8)
> +			break;
> +	}
> +	finish_wait(&wqh[is_sync], &wait);
> +}
> +
> +static void nfs_writeback_wakeup(long nr, long limit,
> +				 struct backing_dev_info *bdi,
> +				 wait_queue_head_t *wqh)
> +{
> +	int hard_limit = limit * 2;
> +
> +	if (nr < hard_limit - limit/8) {
> +		if (waitqueue_active(&wqh[BLK_RW_SYNC]))
> +			wake_up(&wqh[BLK_RW_SYNC]);
> +		if (nr < limit - limit/8) {
> +			clear_bdi_congested(bdi, BLK_RW_ASYNC);
> +			if (waitqueue_active(&wqh[BLK_RW_ASYNC]))
> +				wake_up(&wqh[BLK_RW_ASYNC]);
>  		}
>  	}
> +}
> +
> +static int nfs_set_page_writeback(struct page *page,
> +				  struct writeback_control *wbc)
> +{
> +	struct inode *inode = page->mapping->host;
> +	struct nfs_server *nfss = NFS_SERVER(inode);
> +	int ret = test_set_page_writeback(page);
> +
> +	if (!ret) {
> +		atomic_long_inc(&nfss->writeback);
> +		nfs_writeback_wait(&nfss->writeback,
> +				   nfs_congestion_kb >> (PAGE_SHIFT-10),
> +				   wbc->sync_mode == WB_SYNC_ALL,
> +				   &nfss->backing_dev_info,
> +				   nfss->writeback_wait);
> +	}
>  	return ret;
>  }
>  
> @@ -216,8 +264,11 @@ static void nfs_end_page_writeback(struc
>  	struct nfs_server *nfss = NFS_SERVER(inode);
>  
>  	end_page_writeback(page);
> -	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
> -		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
> +
> +	nfs_writeback_wakeup(atomic_long_dec_return(&nfss->writeback),
> +			     nfs_congestion_kb >> (PAGE_SHIFT-10),
> +			     &nfss->backing_dev_info,
> +			     nfss->writeback_wait);
>  }
>  
>  static struct nfs_page *nfs_find_and_lock_request(struct page *page)
> @@ -254,7 +305,8 @@ static struct nfs_page *nfs_find_and_loc
>   * May return an error if the user signalled nfs_wait_on_request().
>   */
>  static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
> -				struct page *page)
> +				struct page *page,
> +				struct writeback_control *wbc)
>  {
>  	struct nfs_page *req;
>  	int ret = 0;
> @@ -266,7 +318,7 @@ static int nfs_page_async_flush(struct n
>  	if (IS_ERR(req))
>  		goto out;
>  
> -	ret = nfs_set_page_writeback(page);
> +	ret = nfs_set_page_writeback(page, wbc);
>  	BUG_ON(ret != 0);
>  	BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
>  
> @@ -286,7 +338,7 @@ static int nfs_do_writepage(struct page 
>  	nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
>  
>  	nfs_pageio_cond_complete(pgio, page->index);
> -	return nfs_page_async_flush(pgio, page);
> +	return nfs_page_async_flush(pgio, page, wbc);
>  }
>  
>  /*
> --- linux.orig/include/linux/nfs_fs_sb.h	2009-10-05 13:27:20.000000000 +0800
> +++ linux/include/linux/nfs_fs_sb.h	2009-10-05 13:28:31.000000000 +0800
> @@ -108,6 +108,7 @@ struct nfs_server {
>  	struct nfs_iostats *	io_stats;	/* I/O statistics */
>  	struct backing_dev_info	backing_dev_info;
>  	atomic_long_t		writeback;	/* number of writeback pages */
> +	wait_queue_head_t	writeback_wait[2];
>  	int			flags;		/* various flags */
>  	unsigned int		caps;		/* server capabilities */
>  	unsigned int		rsize;		/* read size */
> --- linux.orig/fs/nfs/client.c	2009-10-05 13:27:20.000000000 +0800
> +++ linux/fs/nfs/client.c	2009-10-05 13:28:31.000000000 +0800
> @@ -991,6 +991,8 @@ static struct nfs_server *nfs_alloc_serv
>  	INIT_LIST_HEAD(&server->master_link);
>  
>  	atomic_set(&server->active, 0);
> +	init_waitqueue_head(&server->writeback_wait[BLK_RW_SYNC]);
> +	init_waitqueue_head(&server->writeback_wait[BLK_RW_ASYNC]);
>  
>  	server->io_stats = nfs_alloc_iostats();
>  	if (!server->io_stats) {

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] NFS: introduce writeback wait queue
  2009-10-05  7:35   ` Wu Fengguang
@ 2009-10-05  7:39     ` Wu Fengguang
  2009-10-05 10:55       ` Myklebust, Trond
  0 siblings, 1 reply; 11+ messages in thread
From: Wu Fengguang @ 2009-10-05  7:39 UTC (permalink / raw)
  To: jens.axboe-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org
  Cc: Chris Mason, Trond Myklebust, Andrew Morton,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, LKML,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

On Mon, Oct 05, 2009 at 03:35:51PM +0800, Wu Fengguang wrote:
> Trond, I see this trace on linux-next. There are no more dirty pages
> when `cp' aborts after filling up the partition:
> 
>         cp: writing `/mnt/test/zero3': No space left on device
> 
> I noticed that since then nr_writeback is decreased very slowly
> (~100 pages per second). Looks like an interesting behavior.

In the mean time, there are constant 7-8MB/s writes in the NFS server.
The network flow is much smaller ~400K/s. How can I debug this issue?

Thanks,
Fengguang

>      nr_writeback         nr_dirty      nr_unstable
>             41230            36284             8764
>             41230            37307             7755
>             40009            42812             3818
>             32619            42812            11198
>             32314            42812            11503
>             31894            42812            11862
>             31832            42812            11871
>             31770            42812            11871
>             31721            42812            11871
>             31653            42812            11871
>             40789            33754            11871
>             40713            33754            11871
>             40638            33754            11871
>             40566            33754            11871
>             43901            30313            11871
>             74164                0            11871
>             74062                0            11871
>             73978                0            11871
>             73904                0            11871
>             73858                0            11871
>             73798                0            11871
>             73688                0            11871
>             73580                0            11871
>             73477                0            11871
> 
> Thanks,
> Fengguang
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] NFS: introduce writeback wait queue
  2009-10-05  7:39     ` Wu Fengguang
@ 2009-10-05 10:55       ` Myklebust, Trond
       [not found]         ` <F047DA72-75B3-4447-84EB-7115C77ECBA3-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Myklebust, Trond @ 2009-10-05 10:55 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: jens.axboe, Chris Mason, Andrew Morton, linux-fsdevel, LKML,
	linux-nfs

On Oct 5, 2009, at 3:40, "Wu Fengguang" <fengguang.wu@intel.com> wrote:

> On Mon, Oct 05, 2009 at 03:35:51PM +0800, Wu Fengguang wrote:
>> Trond, I see this trace on linux-next. There are no more dirty pages
>> when `cp' aborts after filling up the partition:
>>
>>        cp: writing `/mnt/test/zero3': No space left on device
>>
>> I noticed that since then nr_writeback is decreased very slowly
>> (~100 pages per second). Looks like an interesting behavior.
>
> In the mean time, there are constant 7-8MB/s writes in the NFS server.
> The network flow is much smaller ~400K/s. How can I debug this issue?

Hi Fengguang

This is deliberate behaviour. When asynchronous writes start receiving  
errors, then we switch to synchronous write mode until the error  
condition clears.

The reason for doing so is firstly because some filesystems (XFS)  
perform very poorly under ENOSPC, and so it takes forever to write  
back pages (we don't want to cancel all writebacks for temporary  
conditions like ENOSPC). It also allows us to deliver the errors more  
promptly to the application.

Cheers
      Trond

>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] NFS: introduce writeback wait queue
  2009-10-04  3:05 ` Wu Fengguang
@ 2009-10-05 11:00   ` Jens Axboe
       [not found]     ` <20091005110010.GW26573-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org>
  0 siblings, 1 reply; 11+ messages in thread
From: Jens Axboe @ 2009-10-05 11:00 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: Trond Myklebust, Andrew Morton, linux-fsdevel, LKML, linux-nfs

On Sun, Oct 04 2009, Wu Fengguang wrote:
> Hi Jens,
> 
> This is a bug fix for 2.6.32. Maybe other not block-queue based
> filesystems will have similar issues ..

Not that I'm aware of, the NFS use is fairly special. Given that this is
purely in the realm of nfs/, I'll let Trond decide how to include and
push this (when a final patch is agreed upon).

Thanks for looking into this!

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] NFS: introduce writeback wait queue
  2009-10-05  7:10 ` [PATCH v2] " Wu Fengguang
  2009-10-05  7:35   ` Wu Fengguang
@ 2009-10-05 11:01   ` Myklebust, Trond
       [not found]     ` <86DD3A52-EE5C-4378-BEB6-6336E17CFCD5-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 11+ messages in thread
From: Myklebust, Trond @ 2009-10-05 11:01 UTC (permalink / raw)
  To: Wu Fengguang
  Cc: jens.axboe-QHcLZuEGTsvQT0dZR+AlfA, Chris Mason, Andrew Morton,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, LKML,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

On Oct 5, 2009, at 3:11, "Wu Fengguang" <fengguang.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:

> Hi all,
>
> This version makes two standalone functions for easier reuse.
>
> Before patch, nr_writeback is near 1G on my 2GB laptop:
>
>     nr_writeback         nr_dirty      nr_unstable
>           203994                2           154469
>           203994                2           154469
>
> After patch, nr_writeback is limited to nfs_congestion_kb=42MB.
>
>     nr_writeback         nr_dirty      nr_unstable
>            11180            34195            11754
>             9865            36821             8234
>            10137            36695             9338
>
> One minor problem I noticed is, NFS writeback is not very smooth.
> This per 0.1s sampled trace shows that it can sometimes stuck for
> up to 0.5s:
>
>     nr_writeback         nr_dirty      nr_unstable
>            11055            37408             9599
>            10311            37315            10529
>            10869            35920            11459
>            10869            35920            11459
>            10869            35920            11459
>            10869            35920            11459
>            10869            35920            11459
>            10838            35891            10042
>            10466            35891            10414
>            10900            34744            11437
>            10249            34744            12088
>            10249            34744            12088
>            10249            34744            12088
>            10249            34744            12088
>            10249            34744            12088
>            10249            34744            12088
>            10133            34743            10663
>            10505            34743            11035
>            10970            34991            11345
>            10691            34991            11593
>            10691            34991            11593
>            10691            34991            11593
>            10691            34991            11593
>            10691            34991            11593
>
> Trond, I guess nr_writeback/nr_unstable are decreased in async RPC
> "complete" events. It is understandable that nr_dirty can sometimes
> stuck on local waits, but the "local determined" nr_dirty and "remote
> determined" nr_writeback/nr_unstable tend to stuck at the same time?
> Did I miss something (that could be obvious to you)?

It looks (at 7am in the morning after getting up at 4:30am) as though  
the number of unstable pages is remaining constant, which would mean  
that we're sending a lot of COMMIT requests (see nfsstat). Since  
COMMIT is essentially an fsync call, it means that the server is going  
to be slower.

Cheers
     Trond
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] NFS: introduce writeback wait queue
       [not found]         ` <F047DA72-75B3-4447-84EB-7115C77ECBA3-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
@ 2009-10-05 13:08           ` Wu Fengguang
  0 siblings, 0 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-10-05 13:08 UTC (permalink / raw)
  To: Myklebust, Trond
  Cc: jens.axboe-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org, Chris Mason,
	Andrew Morton,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, LKML,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On Mon, Oct 05, 2009 at 06:55:54PM +0800, Myklebust, Trond wrote:
> On Oct 5, 2009, at 3:40, "Wu Fengguang" <fengguang.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> 
> > On Mon, Oct 05, 2009 at 03:35:51PM +0800, Wu Fengguang wrote:
> >> Trond, I see this trace on linux-next. There are no more dirty pages
> >> when `cp' aborts after filling up the partition:
> >>
> >>        cp: writing `/mnt/test/zero3': No space left on device
> >>
> >> I noticed that since then nr_writeback is decreased very slowly
> >> (~100 pages per second). Looks like an interesting behavior.
> >
> > In the mean time, there are constant 7-8MB/s writes in the NFS server.
> > The network flow is much smaller ~400K/s. How can I debug this issue?
> 
> Hi Fengguang
> 
> This is deliberate behaviour. When asynchronous writes start receiving  
> errors, then we switch to synchronous write mode until the error  
> condition clears.

Ah yes. After ENOSPC, with nfsstat I saw the client side write/commit
numbers remain constant, while the server side write number increases
~200 per-second, and commit number also remain static. When all client
side nr_writeback drops to 0, the server side write number also stops.

> The reason for doing so is firstly because some filesystems (XFS)  
> perform very poorly under ENOSPC, and so it takes forever to write  
> back pages (we don't want to cancel all writebacks for temporary  
> conditions like ENOSPC). It also allows us to deliver the errors more  
> promptly to the application.

Thanks,
Fengguang
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] NFS: introduce writeback wait queue
       [not found]     ` <86DD3A52-EE5C-4378-BEB6-6336E17CFCD5-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
@ 2009-10-05 13:51       ` Wu Fengguang
  0 siblings, 0 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-10-05 13:51 UTC (permalink / raw)
  To: Myklebust, Trond
  Cc: jens.axboe-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org, Chris Mason,
	Andrew Morton,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, LKML,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On Mon, Oct 05, 2009 at 07:01:10PM +0800, Myklebust, Trond wrote:
> On Oct 5, 2009, at 3:11, "Wu Fengguang" <fengguang.wu-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> 
> > Hi all,
> >
> > This version makes two standalone functions for easier reuse.
> >
> > Before patch, nr_writeback is near 1G on my 2GB laptop:
> >
> >     nr_writeback         nr_dirty      nr_unstable
> >           203994                2           154469
> >           203994                2           154469
> >
> > After patch, nr_writeback is limited to nfs_congestion_kb=42MB.
> >
> >     nr_writeback         nr_dirty      nr_unstable
> >            11180            34195            11754
> >             9865            36821             8234
> >            10137            36695             9338
> >
> > One minor problem I noticed is, NFS writeback is not very smooth.
> > This per 0.1s sampled trace shows that it can sometimes stuck for
> > up to 0.5s:
> >
> >     nr_writeback         nr_dirty      nr_unstable
> >            11055            37408             9599
> >            10311            37315            10529
> >            10869            35920            11459
> >            10869            35920            11459
> >            10869            35920            11459
> >            10869            35920            11459
> >            10869            35920            11459
> >            10838            35891            10042
> >            10466            35891            10414
> >            10900            34744            11437
> >            10249            34744            12088
> >            10249            34744            12088
> >            10249            34744            12088
> >            10249            34744            12088
> >            10249            34744            12088
> >            10249            34744            12088
> >            10133            34743            10663
> >            10505            34743            11035
> >            10970            34991            11345
> >            10691            34991            11593
> >            10691            34991            11593
> >            10691            34991            11593
> >            10691            34991            11593
> >            10691            34991            11593
> >
> > Trond, I guess nr_writeback/nr_unstable are decreased in async RPC
> > "complete" events. It is understandable that nr_dirty can sometimes
> > stuck on local waits, but the "local determined" nr_dirty and "remote
> > determined" nr_writeback/nr_unstable tend to stuck at the same time?
> > Did I miss something (that could be obvious to you)?
> 
> It looks (at 7am in the morning after getting up at 4:30am) as though  

Wow early bird!

> the number of unstable pages is remaining constant, which would mean  
> that we're sending a lot of COMMIT requests (see nfsstat). Since  
> COMMIT is essentially an fsync call, it means that the server is going  
> to be slower.

Here are the numbers:

Client nfs v3:
null         getattr      setattr      lookup       access       readlink
0         0% 18007     3% 67        0% 1752      0% 2499      0% 109       0%
read         write        create       mkdir        symlink      mknod
1742      0% 518695   95% 77        0% 0         0% 0         0% 2         0%
remove       rmdir        rename       link         readdir      readdirplus
104       0% 0         0% 30        0% 40        0% 15        0% 178       0%
fsstat       fsinfo       pathconf     commit
140       0% 2         0% 1         0% 2461      0%


Server nfs v3:
null         getattr      setattr      lookup       access       readlink
2         0% 18000     3% 67        0% 1752      0% 2495      0% 109       0%
read         write        create       mkdir        symlink      mknod
1742      0% 518695   95% 77        0% 0         0% 0         0% 2         0%
remove       rmdir        rename       link         readdir      readdirplus
104       0% 0         0% 30        0% 40        0% 15        0% 178       0%
fsstat       fsinfo       pathconf     commit
140       0% 3         0% 1         0% 2461      0%


I noticed that dd often sleeps in nfs_updatepage/nfs_wait_on_request,
is it because it's doing 512byte writes and thus have to wait on
subsequent in-page writes? I guess this may hurt performance on big
network latency?

[  268.020588] dd            D ffff880002735460  2608  3688   3534 0x00000000
[  268.020588]  ffff8800777c3a38 0000000000000046 0000000000000000 ffff880002735460
[  268.020588]  000000000000e388 ffff8800777c3fd8 ffff8800775346e0 ffffffff8192c8e0
[  268.020588]  ffff880077534a68 0000000000000082 00000000ffffc9fb ffff8800061d4758
[  268.020588] Call Trace:
[  268.020588]  [<ffffffff8109365d>] ? trace_hardirqs_on+0xd/0x10
[  268.020588]  [<ffffffff8123dc7a>] nfs_wait_bit_killable+0x2a/0x40
[  268.020588]  [<ffffffff81695a82>] __wait_on_bit+0x62/0x90
[  268.020588]  [<ffffffff8123dc50>] ? nfs_wait_bit_killable+0x0/0x40
[  268.020588]  [<ffffffff8123dc50>] ? nfs_wait_bit_killable+0x0/0x40
[  268.020588]  [<ffffffff81695b29>] out_of_line_wait_on_bit+0x79/0x90
[  268.020588]  [<ffffffff8107e2f0>] ? wake_bit_function+0x0/0x50
[  268.020588]  [<ffffffff81243aef>] nfs_wait_on_request+0x2f/0x40
[  268.020588]  [<ffffffff812490a6>] nfs_updatepage+0x2e6/0x540
[  268.020588]  [<ffffffff81239dc2>] nfs_write_end+0x62/0x2c0
[  268.020588]  [<ffffffff810fd469>] generic_file_buffered_write+0x179/0x2a0
[  268.020588]  [<ffffffff810935f5>] ? trace_hardirqs_on_caller+0x155/0x1b0
[  268.020588]  [<ffffffff810fd99d>] __generic_file_aio_write+0x26d/0x440
[  268.020588]  [<ffffffff810fdbbe>] ? generic_file_aio_write+0x4e/0xd0
[  268.020588]  [<ffffffff810fdbd4>] generic_file_aio_write+0x64/0xd0
[  268.020588]  [<ffffffff8123ae66>] nfs_file_write+0x136/0x210
[  268.020588]  [<ffffffff8114d1e9>] do_sync_write+0xf9/0x140
[  268.020588]  [<ffffffff8107e2b0>] ? autoremove_wake_function+0x0/0x40
[  268.020588]  [<ffffffff8111905c>] ? might_fault+0x5c/0xb0
[  268.020588]  [<ffffffff8114de3f>] vfs_write+0xcf/0x1c0
[  268.020588]  [<ffffffff8114e035>] sys_write+0x55/0x90
[  268.020588]  [<ffffffff8100c0b2>] system_call_fastpath+0x16/0x1b

Thanks,
Fengguang

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] NFS: introduce writeback wait queue
       [not found]     ` <20091005110010.GW26573-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org>
@ 2009-10-06  0:12       ` Wu Fengguang
  0 siblings, 0 replies; 11+ messages in thread
From: Wu Fengguang @ 2009-10-06  0:12 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Trond Myklebust, Andrew Morton,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, LKML,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On Mon, Oct 05, 2009 at 07:00:11PM +0800, Jens Axboe wrote:
> On Sun, Oct 04 2009, Wu Fengguang wrote:
> > Hi Jens,
> > 
> > This is a bug fix for 2.6.32. Maybe other not block-queue based
> > filesystems will have similar issues ..
> 
> Not that I'm aware of, the NFS use is fairly special. Given that this is

Sorry for the confusion. 2.6.32 is safe.  I tested NFS, fuse and cifs.
NFS writes will be throttled at the dirty limit, and fuse/cifs see
near zero nr_writeback/nr_dirty numbers during heavy write.

NFS and fuse do set the bdi congestion state and somehow expect it to
back off background flushing. This IO priority thing could be fixed for
the next merge window.

> purely in the realm of nfs/, I'll let Trond decide how to include and
> push this (when a final patch is agreed upon).
> 
> Thanks for looking into this!

Thanks,
Fengguang
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-10-06  0:12 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-10-04  3:01 [PATCH] NFS: introduce writeback wait queue Wu Fengguang
2009-10-04  3:05 ` Wu Fengguang
2009-10-05 11:00   ` Jens Axboe
     [not found]     ` <20091005110010.GW26573-tSWWG44O7X1aa/9Udqfwiw@public.gmane.org>
2009-10-06  0:12       ` Wu Fengguang
2009-10-05  7:10 ` [PATCH v2] " Wu Fengguang
2009-10-05  7:35   ` Wu Fengguang
2009-10-05  7:39     ` Wu Fengguang
2009-10-05 10:55       ` Myklebust, Trond
     [not found]         ` <F047DA72-75B3-4447-84EB-7115C77ECBA3-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
2009-10-05 13:08           ` Wu Fengguang
2009-10-05 11:01   ` Myklebust, Trond
     [not found]     ` <86DD3A52-EE5C-4378-BEB6-6336E17CFCD5-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
2009-10-05 13:51       ` Wu Fengguang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).