Linux NFS development
 help / color / mirror / Atom feed
* [PATCH 2.6.3] Add write throttling to NFS client
@ 2004-02-25  2:22 Shantanu Goel
  2004-02-26  3:34 ` Greg Banks
  0 siblings, 1 reply; 30+ messages in thread
From: Shantanu Goel @ 2004-02-25  2:22 UTC (permalink / raw)
  To: nfs

[-- Attachment #1: Type: text/plain, Size: 798 bytes --]

Hi,

I posted an earlier version of the attached patch to
the kernel mailing list a few days back but did not
receive any feedback.  Hopefully, I'll have better
luck here. ;-)

The stock NFS client does not regulate the # async
write requests causing other accesses to block in the
presence of streaming writes.  This patch adds such
support.  For instance, a single dd running in the
background writing to a file in my home directory
causes my X session to hang until dd exits.  With this
patch the session does not experience such hangs. 
Please test it out and let me know if you see anything
problems.  I'd like to see this integrated soon.

Thanks,
Shantanu

__________________________________
Do you Yahoo!?
Yahoo! Mail SpamGuard - Read only the mail you want.
http://antispam.yahoo.com/tools

[-- Attachment #2: nfs-write-throttle.patch --]
[-- Type: application/octet-stream, Size: 10596 bytes --]

Index: include/linux/nfs_page.h
===================================================================
--- orig/include/linux/nfs_page.h	(.../stock)	(revision 6)
+++ nfs-write-throttle/include/linux/nfs_page.h	(.../nfs-write-throttle)	(revision 6)
@@ -53,7 +53,8 @@
 extern	void nfs_list_add_request(struct nfs_page *, struct list_head *);
 
 extern	int nfs_scan_list(struct list_head *, struct list_head *,
-			  struct file *, unsigned long, unsigned int);
+			  struct file *, unsigned long, unsigned int,
+			  unsigned int, unsigned int);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
 				  unsigned int);
 extern  int nfs_wait_on_request(struct nfs_page *);
Index: include/linux/nfs_fs_sb.h
===================================================================
--- orig/include/linux/nfs_fs_sb.h	(.../stock)	(revision 6)
+++ nfs-write-throttle/include/linux/nfs_fs_sb.h	(.../nfs-write-throttle)	(revision 6)
@@ -28,6 +28,7 @@
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
 	struct sockaddr_in	addr;
+	atomic_t		wactive;	/* # active write requests */
 #ifdef CONFIG_NFS_V4
 	/* Our own IP address, as a null-terminated string.
 	 * This is used to generate the clientid, and the callback address.
Index: fs/nfs/inode.c
===================================================================
--- orig/fs/nfs/inode.c	(.../stock)	(revision 6)
+++ nfs-write-throttle/fs/nfs/inode.c	(.../nfs-write-throttle)	(revision 6)
@@ -364,6 +364,8 @@
 	if (sb->s_maxbytes > MAX_LFS_FILESIZE) 
 		sb->s_maxbytes = MAX_LFS_FILESIZE; 
 
+	atomic_set(&server->wactive, 0);
+
 	/* We're airborne Set socket buffersize */
 	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 	return 0;
Index: fs/nfs/pagelist.c
===================================================================
--- orig/fs/nfs/pagelist.c	(.../stock)	(revision 6)
+++ nfs-write-throttle/fs/nfs/pagelist.c	(.../nfs-write-throttle)	(revision 6)
@@ -249,6 +249,8 @@
  * @file: if set, ensure we match requests from this file
  * @idx_start: lower bound of page->index to scan
  * @npages: idx_start + npages sets the upper bound to scan.
+ * @nreq: if set, stop after this many coalesced requests.
+ * @rpages: if nreq is set, # pages per request.
  *
  * Moves elements from one of the inode request lists.
  * If the number of requests is set to 0, the entire address_space
@@ -259,18 +261,22 @@
 int
 nfs_scan_list(struct list_head *head, struct list_head *dst,
 	      struct file *file,
-	      unsigned long idx_start, unsigned int npages)
+	      unsigned long idx_start, unsigned int npages,
+	      unsigned int nreq, unsigned int rpages)
 {
 	struct list_head	*pos, *tmp;
-	struct nfs_page		*req;
+	struct nfs_page		*req, *prev;
 	unsigned long		idx_end;
-	int			res;
+	int			res, is_contig;
+	unsigned int		pages, nr;
 
 	res = 0;
 	if (npages == 0)
 		idx_end = ~0;
 	else
 		idx_end = idx_start + npages - 1;
+	nr = pages = 0;
+	prev = NULL;
 
 	list_for_each_safe(pos, tmp, head) {
 
@@ -284,11 +290,30 @@
 		if (req->wb_index > idx_end)
 			break;
 
+		is_contig = (nreq &&
+			     prev &&
+			     pages < rpages &&
+			     req->wb_pgbase == 0 &&
+			     prev->wb_pgbase + prev->wb_bytes == PAGE_CACHE_SIZE &&
+			     req->wb_index == prev->wb_index + 1 &&
+			     req->wb_cred == prev->wb_cred);
+
+		if (nreq && !is_contig && nr == nreq)
+			break;
+
 		if (!nfs_lock_request(req))
 			continue;
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, dst);
 		res++;
+
+		if (is_contig)
+			pages++;
+		else {
+			pages = 1;
+			nr++;
+		}
+		prev = req;
 	}
 	return res;
 }
Index: fs/nfs/write.c
===================================================================
--- orig/fs/nfs/write.c	(.../stock)	(revision 6)
+++ nfs-write-throttle/fs/nfs/write.c	(.../nfs-write-throttle)	(revision 6)
@@ -75,6 +75,8 @@
 					    struct page *,
 					    unsigned int, unsigned int);
 static void	nfs_strategy(struct inode *inode);
+static int	nfs_wait_on_requests(struct inode *, struct file *,
+				     unsigned long, unsigned int, unsigned int);
 
 static kmem_cache_t *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -125,6 +127,47 @@
 }
 
 /*
+ * Max # active write requests.
+ */
+#define ASYNC_REQ_LIMIT		(RPC_MAXREQS * 3 / 4)
+
+/*
+ * Max # queued dirty pages.
+ */
+#define ASYNC_QUEUE_LIMIT	(256)
+
+/*
+ * Count of active write requests.
+ */
+#define ACTIVE_CNT(inode)	atomic_read(&NFS_SERVER(inode)->wactive)
+#define LIMIT_EXCEEDED(inode)	(ACTIVE_CNT(inode) > ASYNC_REQ_LIMIT)
+#define WRITE_START(inode)	atomic_inc(&NFS_SERVER(inode)->wactive)
+#define WRITE_END(inode)	atomic_dec(&NFS_SERVER(inode)->wactive)
+
+/*
+ * Drain dirty queue if limits have been exceeded.
+ */
+static int queue_drain(struct inode *inode)
+{
+	int error;
+
+	do {
+		error = 0;
+		if (NFS_I(inode)->ndirty > ASYNC_QUEUE_LIMIT)
+			error = nfs_flush_file(inode, NULL, 0, 0, 0);
+		while (LIMIT_EXCEEDED(inode)) {
+			int err = nfs_wait_on_requests(inode, NULL, 0, 0, 1);
+			if (err <= 0) {
+				if (err < 0)
+					error = err;
+				break;
+			}
+		}
+	} while (error > 0);
+	return error;
+}
+
+/*
  * Write a page synchronously.
  * Offset is the data offset within the page.
  */
@@ -162,7 +205,9 @@
 			wdata.args.count = count;
 		wdata.args.offset = page_offset(page) + wdata.args.pgbase;
 
+		WRITE_START(inode);
 		result = NFS_PROTO(inode)->write(&wdata, file);
+		WRITE_END(inode);
 
 		if (result < 0) {
 			/* Must mark the page invalid after I/O error */
@@ -271,6 +316,8 @@
 	unlock_kernel();
 out:
 	unlock_page(page);
+	if (!err && !wbc->nonblocking)
+		err = queue_drain(inode);
 	if (inode_referenced)
 		iput(inode);
 	return err; 
@@ -282,20 +329,41 @@
 	struct inode *inode = mapping->host;
 	int is_sync = !wbc->nonblocking;
 	int err;
+	long npages = wbc->nr_to_write;
 
 	err = generic_writepages(mapping, wbc);
 	if (err)
 		goto out;
-	err = nfs_flush_file(inode, NULL, 0, 0, 0);
-	if (err < 0)
-		goto out;
-	if (wbc->sync_mode == WB_SYNC_HOLD)
-		goto out;
 	if (is_sync && wbc->sync_mode == WB_SYNC_ALL) {
 		err = nfs_wb_all(inode);
-	} else
+		goto out;
+	}
+	if (wbc->sync_mode != WB_SYNC_HOLD)
+		npages -= NFS_I(inode)->ncommit;
+	wbc->encountered_congestion = 0;
+	while (npages > 0 && !wbc->encountered_congestion) {
+		err = nfs_flush_file(inode, NULL, 0, 0, 0);
+		if (err < 0)
+			goto out;
+		if (err == 0)
+			break;
+		npages -= err;
+		while (LIMIT_EXCEEDED(inode)) {
+			if (wbc->nonblocking) {
+				wbc->encountered_congestion = 1;
+				break;
+			}
+			err = nfs_wait_on_requests(inode, NULL, 0, 0, 1);
+			if (err < 0)
+				goto out;
+			if (err == 0)
+				break;
+		}
+	}
+	if (wbc->sync_mode != WB_SYNC_HOLD)
 		nfs_commit_file(inode, NULL, 0, 0, 0);
 out:
+	wbc->nr_to_write = npages;
 	return err;
 }
 
@@ -421,7 +489,7 @@
  * Interruptible by signals only if mounted with intr flag.
  */
 static int
-nfs_wait_on_requests(struct inode *inode, struct file *file, unsigned long idx_start, unsigned int npages)
+nfs_wait_on_requests(struct inode *inode, struct file *file, unsigned long idx_start, unsigned int npages, unsigned int nreq)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *req;
@@ -455,6 +523,8 @@
 		spin_lock(&nfs_wreq_lock);
 		next = idx_start;
 		res++;
+		if (nreq && res == nreq)
+			break;
 	}
 	spin_unlock(&nfs_wreq_lock);
 	return res;
@@ -472,11 +542,11 @@
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages)
+nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages, unsigned int nreq)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int	res;
-	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
+	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages, nreq, NFS_SERVER(inode)->wpages);
 	nfsi->ndirty -= res;
 	sub_page_state(nr_dirty,res);
 	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
@@ -501,7 +571,7 @@
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int	res;
-	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages);
+	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages, 0, 0);
 	nfsi->ncommit -= res;
 	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -728,6 +798,7 @@
 		nfs_strategy(inode);
 	} else
 		nfs_unlock_request(req);
+	return queue_drain(inode);
 done:
         dprintk("NFS:      nfs_updatepage returns %d (isize %Ld)\n",
 			status, (long long)i_size_read(inode));
@@ -767,6 +838,8 @@
 
 	NFS_PROTO(inode)->write_setup(data, count, how);
 
+	WRITE_START(inode);
+
 	dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 		task->tk_pid,
 		inode->i_sb->s_id,
@@ -856,6 +929,8 @@
 	dprintk("NFS: %4d nfs_writeback_done (status %d)\n",
 		task->tk_pid, task->tk_status);
 
+	WRITE_END(data->inode);
+
 	/* We can't handle that yet but we check for it nevertheless */
 	if (resp->count < argp->count && task->tk_status >= 0) {
 		static unsigned long    complain;
@@ -1066,10 +1141,16 @@
 {
 	LIST_HEAD(head);
 	int			res,
+				nreq,
 				error = 0;
 
+	nreq = ASYNC_REQ_LIMIT - ACTIVE_CNT(inode);
+	if (nreq < 1)
+		nreq = 1;
+	else if (nreq > 4)
+		nreq = 4;
 	spin_lock(&nfs_wreq_lock);
-	res = nfs_scan_dirty(inode, &head, file, idx_start, npages);
+	res = nfs_scan_dirty(inode, &head, file, idx_start, npages, nreq);
 	spin_unlock(&nfs_wreq_lock);
 	if (res)
 		error = nfs_flush_list(&head, NFS_SERVER(inode)->wpages, how);
@@ -1106,6 +1187,10 @@
 	int	error,
 		wait;
 
+	error = queue_drain(inode);
+	if (error < 0)
+		return error;
+
 	wait = how & FLUSH_WAIT;
 	how &= ~FLUSH_WAIT;
 
@@ -1113,11 +1198,18 @@
 		inode = file->f_dentry->d_inode;
 
 	do {
-		error = 0;
-		if (wait)
-			error = nfs_wait_on_requests(inode, file, idx_start, npages);
-		if (error == 0)
-			error = nfs_flush_file(inode, file, idx_start, npages, how);
+		error = nfs_flush_file(inode, file, idx_start, npages, how);
+		if (error > 0 && LIMIT_EXCEEDED(inode)) {
+			int err;
+
+			do {
+				err = nfs_wait_on_requests(inode, file, idx_start, npages, 1);
+			} while (err > 0 && LIMIT_EXCEEDED(inode));
+			if (err < 0)
+				error = err;
+		}
+		if (error == 0 && wait)
+			error = nfs_wait_on_requests(inode, file, idx_start, npages, 0);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 		if (error == 0)
 			error = nfs_commit_file(inode, file, idx_start, npages, how);

^ permalink raw reply	[flat|nested] 30+ messages in thread
* Re: [PATCH 2.6.3] Add write throttling to NFS client
@ 2004-02-25  3:19 Shantanu Goel
  0 siblings, 0 replies; 30+ messages in thread
From: Shantanu Goel @ 2004-02-25  3:19 UTC (permalink / raw)
  To: nfs

[-- Attachment #1: Type: text/plain, Size: 1040 bytes --]

Apologies for replying to my own message.  I realized
there was a bug in the patch shortly after sending it.
Attached is a fixed one.

Thanks,
Shantanu
--- Shantanu Goel <sgoel01@yahoo.com> wrote:
> Hi,
> 
> I posted an earlier version of the attached patch to
> the kernel mailing list a few days back but did not
> receive any feedback.  Hopefully, I'll have better
> luck here. ;-)
> 
> The stock NFS client does not regulate the # async
> write requests causing other accesses to block in
> the
> presence of streaming writes.  This patch adds such
> support.  For instance, a single dd running in the
> background writing to a file in my home directory
> causes my X session to hang until dd exits.  With
> this
> patch the session does not experience such hangs. 
> Please test it out and let me know if you see
> anything
> problems.  I'd like to see this integrated soon.
> 
> Thanks,
> Shantanu


__________________________________
Do you Yahoo!?
Yahoo! Mail SpamGuard - Read only the mail you want.
http://antispam.yahoo.com/tools

[-- Attachment #2: nfs-write-throttle.patch --]
[-- Type: application/octet-stream, Size: 10654 bytes --]

Index: include/linux/nfs_page.h
===================================================================
--- orig/include/linux/nfs_page.h	(.../stock)	(revision 7)
+++ nfs-write-throttle/include/linux/nfs_page.h	(.../nfs-write-throttle)	(revision 7)
@@ -53,7 +53,8 @@
 extern	void nfs_list_add_request(struct nfs_page *, struct list_head *);
 
 extern	int nfs_scan_list(struct list_head *, struct list_head *,
-			  struct file *, unsigned long, unsigned int);
+			  struct file *, unsigned long, unsigned int,
+			  unsigned int, unsigned int);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
 				  unsigned int);
 extern  int nfs_wait_on_request(struct nfs_page *);
Index: include/linux/nfs_fs_sb.h
===================================================================
--- orig/include/linux/nfs_fs_sb.h	(.../stock)	(revision 7)
+++ nfs-write-throttle/include/linux/nfs_fs_sb.h	(.../nfs-write-throttle)	(revision 7)
@@ -28,6 +28,7 @@
 	char *			hostname;	/* remote hostname */
 	struct nfs_fh		fh;
 	struct sockaddr_in	addr;
+	atomic_t		wactive;	/* # active write requests */
 #ifdef CONFIG_NFS_V4
 	/* Our own IP address, as a null-terminated string.
 	 * This is used to generate the clientid, and the callback address.
Index: fs/nfs/inode.c
===================================================================
--- orig/fs/nfs/inode.c	(.../stock)	(revision 7)
+++ nfs-write-throttle/fs/nfs/inode.c	(.../nfs-write-throttle)	(revision 7)
@@ -364,6 +364,8 @@
 	if (sb->s_maxbytes > MAX_LFS_FILESIZE) 
 		sb->s_maxbytes = MAX_LFS_FILESIZE; 
 
+	atomic_set(&server->wactive, 0);
+
 	/* We're airborne Set socket buffersize */
 	rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
 	return 0;
Index: fs/nfs/pagelist.c
===================================================================
--- orig/fs/nfs/pagelist.c	(.../stock)	(revision 7)
+++ nfs-write-throttle/fs/nfs/pagelist.c	(.../nfs-write-throttle)	(revision 7)
@@ -249,6 +249,8 @@
  * @file: if set, ensure we match requests from this file
  * @idx_start: lower bound of page->index to scan
  * @npages: idx_start + npages sets the upper bound to scan.
+ * @nreq: if set, stop after this many coalesced requests.
+ * @rpages: if nreq is set, # pages per request.
  *
  * Moves elements from one of the inode request lists.
  * If the number of requests is set to 0, the entire address_space
@@ -259,18 +261,22 @@
 int
 nfs_scan_list(struct list_head *head, struct list_head *dst,
 	      struct file *file,
-	      unsigned long idx_start, unsigned int npages)
+	      unsigned long idx_start, unsigned int npages,
+	      unsigned int nreq, unsigned int rpages)
 {
 	struct list_head	*pos, *tmp;
-	struct nfs_page		*req;
+	struct nfs_page		*req, *prev;
 	unsigned long		idx_end;
-	int			res;
+	int			res, is_contig;
+	unsigned int		pages, nr;
 
 	res = 0;
 	if (npages == 0)
 		idx_end = ~0;
 	else
 		idx_end = idx_start + npages - 1;
+	nr = pages = 0;
+	prev = NULL;
 
 	list_for_each_safe(pos, tmp, head) {
 
@@ -286,9 +292,33 @@
 
 		if (!nfs_lock_request(req))
 			continue;
+
+		is_contig = (nreq &&
+			     prev &&
+			     pages < rpages &&
+			     req->wb_pgbase == 0 &&
+			     prev->wb_pgbase + prev->wb_bytes == PAGE_CACHE_SIZE &&
+			     req->wb_index == prev->wb_index + 1 &&
+			     req->wb_cred == prev->wb_cred);
+
+		if (!is_contig && nreq && nr == nreq) {
+			spin_unlock(&nfs_wreq_lock);
+			nfs_unlock_request(req);
+			spin_lock(&nfs_wreq_lock);
+			break;
+		}
+
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, dst);
 		res++;
+
+		if (is_contig)
+			pages++;
+		else {
+			pages = 1;
+			nr++;
+		}
+		prev = req;
 	}
 	return res;
 }
Index: fs/nfs/write.c
===================================================================
--- orig/fs/nfs/write.c	(.../stock)	(revision 7)
+++ nfs-write-throttle/fs/nfs/write.c	(.../nfs-write-throttle)	(revision 7)
@@ -75,6 +75,8 @@
 					    struct page *,
 					    unsigned int, unsigned int);
 static void	nfs_strategy(struct inode *inode);
+static int	nfs_wait_on_requests(struct inode *, struct file *,
+				     unsigned long, unsigned int, unsigned int);
 
 static kmem_cache_t *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -125,6 +127,47 @@
 }
 
 /*
+ * Max # active write requests.
+ */
+#define ASYNC_REQ_LIMIT		(RPC_MAXREQS * 3 / 4)
+
+/*
+ * Max # queued dirty pages.
+ */
+#define ASYNC_QUEUE_LIMIT	(256)
+
+/*
+ * Count of active write requests.
+ */
+#define ACTIVE_CNT(inode)	atomic_read(&NFS_SERVER(inode)->wactive)
+#define LIMIT_EXCEEDED(inode)	(ACTIVE_CNT(inode) > ASYNC_REQ_LIMIT)
+#define WRITE_START(inode)	atomic_inc(&NFS_SERVER(inode)->wactive)
+#define WRITE_END(inode)	atomic_dec(&NFS_SERVER(inode)->wactive)
+
+/*
+ * Drain dirty queue if limits have been exceeded.
+ */
+static int queue_drain(struct inode *inode)
+{
+	int error;
+
+	do {
+		error = 0;
+		if (NFS_I(inode)->ndirty > ASYNC_QUEUE_LIMIT)
+			error = nfs_flush_file(inode, NULL, 0, 0, 0);
+		while (LIMIT_EXCEEDED(inode)) {
+			int err = nfs_wait_on_requests(inode, NULL, 0, 0, 1);
+			if (err <= 0) {
+				if (err < 0)
+					error = err;
+				break;
+			}
+		}
+	} while (error > 0);
+	return error;
+}
+
+/*
  * Write a page synchronously.
  * Offset is the data offset within the page.
  */
@@ -162,7 +205,9 @@
 			wdata.args.count = count;
 		wdata.args.offset = page_offset(page) + wdata.args.pgbase;
 
+		WRITE_START(inode);
 		result = NFS_PROTO(inode)->write(&wdata, file);
+		WRITE_END(inode);
 
 		if (result < 0) {
 			/* Must mark the page invalid after I/O error */
@@ -271,6 +316,8 @@
 	unlock_kernel();
 out:
 	unlock_page(page);
+	if (!err && !wbc->nonblocking)
+		err = queue_drain(inode);
 	if (inode_referenced)
 		iput(inode);
 	return err; 
@@ -282,20 +329,41 @@
 	struct inode *inode = mapping->host;
 	int is_sync = !wbc->nonblocking;
 	int err;
+	long npages = wbc->nr_to_write;
 
 	err = generic_writepages(mapping, wbc);
 	if (err)
 		goto out;
-	err = nfs_flush_file(inode, NULL, 0, 0, 0);
-	if (err < 0)
-		goto out;
-	if (wbc->sync_mode == WB_SYNC_HOLD)
-		goto out;
 	if (is_sync && wbc->sync_mode == WB_SYNC_ALL) {
 		err = nfs_wb_all(inode);
-	} else
+		goto out;
+	}
+	if (wbc->sync_mode != WB_SYNC_HOLD)
+		npages -= NFS_I(inode)->ncommit;
+	wbc->encountered_congestion = 0;
+	while (npages > 0 && !wbc->encountered_congestion) {
+		err = nfs_flush_file(inode, NULL, 0, 0, 0);
+		if (err < 0)
+			goto out;
+		if (err == 0)
+			break;
+		npages -= err;
+		while (LIMIT_EXCEEDED(inode)) {
+			if (wbc->nonblocking) {
+				wbc->encountered_congestion = 1;
+				break;
+			}
+			err = nfs_wait_on_requests(inode, NULL, 0, 0, 1);
+			if (err < 0)
+				goto out;
+			if (err == 0)
+				break;
+		}
+	}
+	if (wbc->sync_mode != WB_SYNC_HOLD)
 		nfs_commit_file(inode, NULL, 0, 0, 0);
 out:
+	wbc->nr_to_write = npages;
 	return err;
 }
 
@@ -421,7 +489,7 @@
  * Interruptible by signals only if mounted with intr flag.
  */
 static int
-nfs_wait_on_requests(struct inode *inode, struct file *file, unsigned long idx_start, unsigned int npages)
+nfs_wait_on_requests(struct inode *inode, struct file *file, unsigned long idx_start, unsigned int npages, unsigned int nreq)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *req;
@@ -455,6 +523,8 @@
 		spin_lock(&nfs_wreq_lock);
 		next = idx_start;
 		res++;
+		if (nreq && res == nreq)
+			break;
 	}
 	spin_unlock(&nfs_wreq_lock);
 	return res;
@@ -472,11 +542,11 @@
  * The requests are *not* checked to ensure that they form a contiguous set.
  */
 static int
-nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages)
+nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages, unsigned int nreq)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int	res;
-	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
+	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages, nreq, NFS_SERVER(inode)->wpages);
 	nfsi->ndirty -= res;
 	sub_page_state(nr_dirty,res);
 	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
@@ -501,7 +571,7 @@
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int	res;
-	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages);
+	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages, 0, 0);
 	nfsi->ncommit -= res;
 	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
@@ -728,6 +798,7 @@
 		nfs_strategy(inode);
 	} else
 		nfs_unlock_request(req);
+	return queue_drain(inode);
 done:
         dprintk("NFS:      nfs_updatepage returns %d (isize %Ld)\n",
 			status, (long long)i_size_read(inode));
@@ -767,6 +838,8 @@
 
 	NFS_PROTO(inode)->write_setup(data, count, how);
 
+	WRITE_START(inode);
+
 	dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
 		task->tk_pid,
 		inode->i_sb->s_id,
@@ -856,6 +929,8 @@
 	dprintk("NFS: %4d nfs_writeback_done (status %d)\n",
 		task->tk_pid, task->tk_status);
 
+	WRITE_END(data->inode);
+
 	/* We can't handle that yet but we check for it nevertheless */
 	if (resp->count < argp->count && task->tk_status >= 0) {
 		static unsigned long    complain;
@@ -1066,10 +1141,16 @@
 {
 	LIST_HEAD(head);
 	int			res,
+				nreq,
 				error = 0;
 
+	nreq = ASYNC_REQ_LIMIT - ACTIVE_CNT(inode);
+	if (nreq < 1)
+		nreq = 1;
+	else if (nreq > 4)
+		nreq = 4;
 	spin_lock(&nfs_wreq_lock);
-	res = nfs_scan_dirty(inode, &head, file, idx_start, npages);
+	res = nfs_scan_dirty(inode, &head, file, idx_start, npages, nreq);
 	spin_unlock(&nfs_wreq_lock);
 	if (res)
 		error = nfs_flush_list(&head, NFS_SERVER(inode)->wpages, how);
@@ -1106,6 +1187,10 @@
 	int	error,
 		wait;
 
+	error = queue_drain(inode);
+	if (error < 0)
+		return error;
+
 	wait = how & FLUSH_WAIT;
 	how &= ~FLUSH_WAIT;
 
@@ -1113,11 +1198,18 @@
 		inode = file->f_dentry->d_inode;
 
 	do {
-		error = 0;
-		if (wait)
-			error = nfs_wait_on_requests(inode, file, idx_start, npages);
-		if (error == 0)
-			error = nfs_flush_file(inode, file, idx_start, npages, how);
+		error = nfs_flush_file(inode, file, idx_start, npages, how);
+		if (error > 0 && LIMIT_EXCEEDED(inode)) {
+			int err;
+
+			do {
+				err = nfs_wait_on_requests(inode, file, idx_start, npages, 1);
+			} while (err > 0 && LIMIT_EXCEEDED(inode));
+			if (err < 0)
+				error = err;
+		}
+		if (error == 0 && wait)
+			error = nfs_wait_on_requests(inode, file, idx_start, npages, 0);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 		if (error == 0)
 			error = nfs_commit_file(inode, file, idx_start, npages, how);

^ permalink raw reply	[flat|nested] 30+ messages in thread
* RE: [PATCH 2.6.3] Add write throttling to NFS client
@ 2004-02-26 17:22 Lever, Charles
  2004-02-26 19:18 ` Bogdan Costescu
  0 siblings, 1 reply; 30+ messages in thread
From: Lever, Charles @ 2004-02-26 17:22 UTC (permalink / raw)
  To: Bogdan Costescu, Olaf Kirch
  Cc: trond.myklebust, Greg Banks, ShantanuGoel, nfs

seems like we have three different kinds of writes.

1.  writes that need to go now because the VM needs to reclaim
    the dirty pages

2.  async writebehind writes

3.  writes generated by the synchronous path because of the
    "sync" mount option or O_SYNC open flag.

1 and 3 are usually a bounded number of writes and fairly
urgent, but 2 can be effectively unbounded, but not terribly
urgent at all.

likewise for reads, you have reads that are due to readahead,
and reads that are pushed out because of a sync_page (someone
is waiting for that very page).

rather than using something like the RPC_SWAPPER_TASK flag
to change queuing behavior in the RPC client, perhaps the
RPC client should have a two- (or more) level scheduler
queue.  place the requests that have to go now on the top
queue, and the readahead/writebehind tasks on the bottom.
have a mechanism for boosting the priority of bottom
queue requests that suddenly become important.

then the RPC client can visit requests on the top queue
first, and when that is empty, visit requests on the
bottom queue.  there is still the potential for some
unfairness here, so, after every 'n' top queue requests,
service a bottom queue request so that eventually all
bottom queue requests are completed even if there is
a steady flow of top queue requests.

this is a mechanism that has worked in the past, and
requires very little tuning (ie low maintenance).


> -----Original Message-----
> From: Bogdan Costescu [mailto:bogdan.costescu@iwr.uni-heidelberg.de]=20
> Sent: Thursday, February 26, 2004 8:25 AM
> To: Olaf Kirch
> Cc: trond.myklebust@fys.uio.no; Greg Banks; ShantanuGoel;=20
> nfs@lists.sourceforge.net
> Subject: Re: [NFS] [PATCH 2.6.3] Add write throttling to NFS client
>=20
>=20
> On Thu, 26 Feb 2004, Olaf Kirch wrote:
>=20
> > The general "unfairness" observed in writes is due to the=20
> fact that we=20
> > allow a writing process to dirty a large number of pages without=20
> > blocking for the actual IO.
>=20
> Yes, and I think that this is pretty similar to the situation=20
> encountered with block IO, where queued writes had to be serviced=20
> before later queued reads, giving an impression of unresponsivness.=20
> It's easier to accept a delayed write than a delayed read.
>=20
> > Applications doing lots of stats etc are punished so=20
> severely because=20
> > all these operations are synchronous.
>=20
> That's exactly the case for block IO reads.
>=20
> > BTW I think reading is much less of a problem. The number of=20
> > readaheads scheduled by generic_file_read is limited,
>=20
> On the other hand, based on the block IO experience, reads=20
> should have=20
> higher priority than writes :-)
>=20
> --=20
> Bogdan Costescu
>=20
> IWR - Interdisziplinaeres Zentrum fuer Wissenschaftliches=20
> Rechnen Universitaet Heidelberg, INF 368, D-69120 Heidelberg, GERMANY
> Telephone: +49 6221 54 8869, Telefax: +49 6221 54 8868
> E-mail: Bogdan.Costescu@IWR.Uni-Heidelberg.De
>=20
>=20
>=20
> -------------------------------------------------------
> SF.Net is sponsored by: Speed Start Your Linux Apps Now.
> Build and deploy apps & Web services for Linux with
> a free DVD software kit from IBM. Click Now!=20
> http://ads.osdn.com/?ad_id=3D1356&alloc_id=3D3438> &op=3Dclick
>=20
> _______________________________________________
>=20
> NFS maillist  -  NFS@lists.sourceforge.net=20
> https://lists.sourceforge.net/lists/listinfo/n> fs
>=20


-------------------------------------------------------
SF.Net is sponsored by: Speed Start Your Linux Apps Now.
Build and deploy apps & Web services for Linux with
a free DVD software kit from IBM. Click Now!
http://ads.osdn.com/?ad_id=1356&alloc_id=3438&op=click
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 30+ messages in thread
[parent not found: <20040301043316.69563.qmail@web12822.mail.yahoo.com>]
* RE: [PATCH 2.6.3] Add write throttling to NFS client
@ 2004-03-01  6:52 Lever, Charles
  0 siblings, 0 replies; 30+ messages in thread
From: Lever, Charles @ 2004-03-01  6:52 UTC (permalink / raw)
  To: trond.myklebust, Shantanu Goel; +Cc: nfs, Shantanu.Goel

> If you need to sort everything by pid (...and I'm not=20
> convinced that is
> such a great idea) then you can do it by adding 1 extra=20
> linked list in the
> struct rpc_task to act as the head of the pid list... No need for any
> fragile allocation of extra structures from inside
> __rpc_add_wait_queue().

i agree.

i'd rather keep __rpc_add_wait_queue and __rpc_remove_wait_queue
as simple as possible.  more complexity here will significantly
impact the per-op overhead, especially if we're holding the BKL
and/or other spinlocks when these are called.


-------------------------------------------------------
SF.Net is sponsored by: Speed Start Your Linux Apps Now.
Build and deploy apps & Web services for Linux with
a free DVD software kit from IBM. Click Now!
http://ads.osdn.com/?ad_id=1356&alloc_id=3438&op=click
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 30+ messages in thread
[parent not found: <20040301065511.29682.qmail@web12824.mail.yahoo.com>]
[parent not found: <20040301081456.37082.qmail@web12823.mail.yahoo.com>]
[parent not found: <20040303032213.58494.qmail@web12823.mail.yahoo.com>]
[parent not found: <20040303235442.31317.qmail@web12826.mail.yahoo.com>]
* RE: [PATCH 2.6.3] Add write throttling to NFS client
@ 2004-03-04  2:08 Lever, Charles
  2004-03-04  2:13 ` Trond Myklebust
  0 siblings, 1 reply; 30+ messages in thread
From: Lever, Charles @ 2004-03-04  2:08 UTC (permalink / raw)
  To: Shantanu Goel, Trond Myklebust
  Cc: Shantanu Goel, Bogdan Costescu, Olaf Kirch, Greg Banks, nfs

> BTW, there was a one line nfs_rename fix I sent a
> while back.  At the time 2.6.0 was in freeze so it
> couldn't be integrated.  The issue was the NFS client
> does not refresh attributes after a rename causing
> some programs to spew out bogus errors.  One such
> culprit is GNU tar.

that fix is appropriate for 2.4, but trond tells me
that his attribute cache fixes will address this issue
in 2.6.


-------------------------------------------------------
This SF.Net email is sponsored by: IBM Linux Tutorials
Free Linux tutorial presented by Daniel Robbins, President and CEO of
GenToo technologies. Learn everything from fundamentals to system
administration.http://ads.osdn.com/?ad_id=1470&alloc_id=3638&op=click
_______________________________________________
NFS maillist  -  NFS@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nfs

^ permalink raw reply	[flat|nested] 30+ messages in thread
[parent not found: <20040306021648.85080.qmail@web12824.mail.yahoo.com>]
[parent not found: <20040306035627.79190.qmail@web12822.mail.yahoo.com>]

end of thread, other threads:[~2004-03-06  4:22 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-02-25  2:22 [PATCH 2.6.3] Add write throttling to NFS client Shantanu Goel
2004-02-26  3:34 ` Greg Banks
2004-02-26  4:10   ` Shantanu Goel
2004-02-26  8:41   ` Olaf Kirch
2004-02-26 12:49     ` trond.myklebust
2004-02-26 13:20       ` Bogdan Costescu
2004-02-26 13:40         ` trond.myklebust
2004-02-26 14:50           ` Olaf Kirch
2004-02-26 16:24             ` Bogdan Costescu
2004-02-26 16:16           ` Bogdan Costescu
  -- strict thread matches above, loose matches on Subject: below --
2004-02-25  3:19 Shantanu Goel
2004-02-26 17:22 Lever, Charles
2004-02-26 19:18 ` Bogdan Costescu
     [not found] <20040301043316.69563.qmail@web12822.mail.yahoo.com>
2004-03-01  5:50 ` trond.myklebust
2004-03-01  6:52 Lever, Charles
     [not found] <20040301065511.29682.qmail@web12824.mail.yahoo.com>
2004-03-01  7:49 ` trond.myklebust
     [not found] <20040301081456.37082.qmail@web12823.mail.yahoo.com>
2004-03-01 17:38 ` trond.myklebust
     [not found]   ` <40437AE4.4030407@lehman.com>
2004-03-01 18:18     ` Bogdan Costescu
2004-03-01 18:58       ` trond.myklebust
     [not found]       ` <35345.207.214.87.84.1078167489.squirrel@webmail.uio.no>
2004-03-02  0:49         ` trond.myklebust
2004-03-01 18:48     ` trond.myklebust
     [not found]       ` <404387DD.5010205@lehman.com>
2004-03-01 19:03         ` trond.myklebust
     [not found] <20040303032213.58494.qmail@web12823.mail.yahoo.com>
     [not found] ` <40461CCC.5040609@lehman.com>
2004-03-03 18:35   ` Trond Myklebust
     [not found]     ` <40462D4C.1060006@lehman.com>
2004-03-03 23:27       ` Trond Myklebust
     [not found] <20040303235442.31317.qmail@web12826.mail.yahoo.com>
2004-03-04  0:18 ` Trond Myklebust
2004-03-05  2:22   ` Trond Myklebust
2004-03-04  2:08 Lever, Charles
2004-03-04  2:13 ` Trond Myklebust
     [not found] <20040306021648.85080.qmail@web12824.mail.yahoo.com>
2004-03-06  3:42 ` Trond Myklebust
     [not found] <20040306035627.79190.qmail@web12822.mail.yahoo.com>
2004-03-06  4:15 ` Trond Myklebust

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox