Linux NFS development
 help / color / mirror / Atom feed
From: Martin Knoblauch <knobi@knobisoft.de>
To: linux-nfs list <linux-nfs@vger.kernel.org>
Cc: linux-kernel@vger.kernel.org
Subject: [RFC][Resend] Make NFS-Client readahead tunable
Date: Wed, 17 Sep 2008 06:06:40 -0700 (PDT)	[thread overview]
Message-ID: <997439.5560.qm@web32601.mail.mud.yahoo.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 6902 bytes --]

Hi,

the following/attached patch works around an [obscure] problem when a 2.6 (not sure/caring about 2.4) NFS client accesses an "offline" file on a Sun/Solaris-10 NFS server when the underlying filesystem is of type SAM-FS. Happens with RHEL4/5 and mainline kernels. Frankly, it is not a Linux problem, but the chances for a short-/mid-term solution from Sun are very slim. So, being lazy, I would love to get this patch into Linux. If not, I just will have to maintain it for eternity out of tree.

The problem: SAM-FS is Sun's proprietary HSM filesystem. It stores meta-data and a relatively small amount of data "online" on disk and pushes old or infrequently used data to "offline" media like e.g. tape. This is completely transparent to the users. If the data for an "offline" file is needed, the so-called "stager daemon" copies it back from the offline medium. All of this works great most of the time. Now, if a Linux NFS client tries to read such an offline file, performance drops to "extremely slow". After lengthy investigation of tcp-dumps, mount options and procedures involving black cats at midnight, we found out that the readahead behaviour of the Linux NFS client causes the problem. Basically it seems to issue read requests up to 15*rsize to the server. In the case of the "offline" files, this behaviour causes heavy competition for the inode lock between the NFSD process and the stager daemon on the Solaris server.

- The real solution: fixing SAM-FS/NFSD interaction. Sun engineering acks the problem, but a solution will need time. Lots of it.
- The working solution: disable the client side readahead, or make it tunable. The patch does that by introducing an NFS module parameter "ra_factor" which can take values between 1 and 15 (default 15) and a tunable "/proc/sys/fs/nfs/nfs_ra_factor" with the same range and default.

Signed-off-by: Martin Knoblauch <knobi@knobisoft.de>

diff -urp linux-2.6.27-rc6-git4/fs/nfs/client.c linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/client.c
--- linux-2.6.27-rc6-git4/fs/nfs/client.c       2008-09-17 11:35:21.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/client.c        2008-09-17 11:55:18.000000000 +0200
@@ -722,6 +722,11 @@ error:
 }

 /*
+ * NFS Client Read-Ahead factor
+*/
+unsigned int nfs_ra_factor;
+
+/*
  * Load up the server record from information gained in an fsinfo record
  */
 static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
@@ -746,7 +751,11 @@ static void nfs_server_set_fsinfo(struct
                server->rsize = NFS_MAX_FILE_IO_SIZE;
        server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

-       server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+       dprintk("nfs_server_set_fsinfo: rsize, wsize, rpages, \
+               nfs_ra_factor, ra_pages: %d %d %d %d %d\n",
+               server->rsize,server->wsize,server->rpages,
+               nfs_ra_factor,server->rpages * nfs_ra_factor);
+       server->backing_dev_info.ra_pages = server->rpages * nfs_ra_factor;

        if (server->wsize > max_rpc_payload)
                server->wsize = max_rpc_payload;
diff -urp linux-2.6.27-rc6-git4/fs/nfs/inode.c linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/inode.c
--- linux-2.6.27-rc6-git4/fs/nfs/inode.c        2008-09-17 11:35:21.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/inode.c 2008-09-17 11:45:09.000000000 +0200
@@ -53,6 +53,8 @@

 /* Default is to see 64-bit inode numbers */
 static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+static unsigned int ra_factor __read_mostly = NFS_MAX_READAHEAD;
+

 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
@@ -1347,6 +1349,12 @@ static int __init init_nfs_fs(void)
 #endif
        if ((err = register_nfs_fs()) != 0)
                goto out;
+
+       if (ra_factor < 1 || ra_factor > NFS_MAX_READAHEAD)
+               nfs_ra_factor = NFS_MAX_READAHEAD;
+       else
+               nfs_ra_factor = ra_factor;
+
        return 0;
 out:
 #ifdef CONFIG_PROC_FS
@@ -1388,6 +1396,10 @@ static void __exit exit_nfs_fs(void)
 MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
 MODULE_LICENSE("GPL");
 module_param(enable_ino64, bool, 0644);
+MODULE_PARM_DESC(enable_ino64, "Enable 64-bit inode numbers (Default: 1)");
+module_param(ra_factor, uint, 0644);
+MODULE_PARM_DESC(ra_factor,
+       "Number of rsize read-ahead requests (Default/Max: 15, Min: 1)");

 module_init(init_nfs_fs)
 module_exit(exit_nfs_fs)
diff -urp linux-2.6.27-rc6-git4/fs/nfs/sysctl.c linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/sysctl.c
--- linux-2.6.27-rc6-git4/fs/nfs/sysctl.c       2008-07-13 23:51:29.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/sysctl.c        2008-09-17 11:45:09.000000000 +0200
@@ -14,9 +14,12 @@
 #include <linux/nfs_fs.h>

 #include "callback.h"
+#include "internal.h"

 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
+static const unsigned int min_nfs_ra_factor = 1;
+static const unsigned int max_nfs_ra_factor = NFS_MAX_READAHEAD;
 static struct ctl_table_header *nfs_callback_sysctl_table;

 static ctl_table nfs_cb_sysctls[] = {
@@ -58,6 +61,16 @@ static ctl_table nfs_cb_sysctls[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
+       {
+               .ctl_name = CTL_UNNUMBERED,
+               .procname = "nfs_ra_factor",
+               .data = &nfs_ra_factor,
+               .maxlen = sizeof(unsigned int),
+               .mode = 0644,
+               .proc_handler = &proc_dointvec_minmax,
+               .extra1 = (unsigned int *)&min_nfs_ra_factor,
+               .extra2 = (unsigned int *)&max_nfs_ra_factor,
+       },
        { .ctl_name = 0 }
 };

diff -urp linux-2.6.27-rc6-git4/include/linux/nfs_fs.h linux-2.6.27-rc6-git4-nfs_ra/include/linux/nfs_fs.h
--- linux-2.6.27-rc6-git4/include/linux/nfs_fs.h        2008-09-17 11:35:25.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/include/linux/nfs_fs.h 2008-09-17 11:45:09.000000000 +0200
@@ -464,6 +464,11 @@ extern int nfs_writeback_done(struct rpc
 extern void nfs_writedata_release(void *);

 /*
+ * linux/fs/nfs/client.c
+*/
+extern unsigned int nfs_ra_factor;
+
+/*
  * Try to write back everything synchronously (but check the
  * return value!)
  */
diff -urp linux-2.6.27-rc6-git4/Makefile linux-2.6.27-rc6-git4-nfs_ra/Makefile
--- linux-2.6.27-rc6-git4/Makefile      2008-09-17 11:35:56.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/Makefile       2008-09-17 11:45:09.000000000 +0200
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 27
-EXTRAVERSION = -rc6-git4
+EXTRAVERSION = -rc6-git4-nfs_ra
 NAME = Rotary Wombat

 # *DOCUMENTATION*



Cheers
Martin

------------------------------------------------------
Martin Knoblauch
email: k n o b i AT knobisoft DOT de
www:  http://www.knobisoft.de

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: nfs_ra-2.6.27-rc6-git4.diff --]
[-- Type: text/x-patch; name="nfs_ra-2.6.27-rc6-git4.diff", Size: 4467 bytes --]

diff -urp linux-2.6.27-rc6-git4/fs/nfs/client.c linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/client.c
--- linux-2.6.27-rc6-git4/fs/nfs/client.c	2008-09-17 11:35:21.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/client.c	2008-09-17 11:55:18.000000000 +0200
@@ -722,6 +722,11 @@ error:
 }
 
 /*
+ * NFS Client Read-Ahead factor
+*/
+unsigned int nfs_ra_factor;
+
+/*
  * Load up the server record from information gained in an fsinfo record
  */
 static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo)
@@ -746,7 +751,11 @@ static void nfs_server_set_fsinfo(struct
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
-	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
+	dprintk("nfs_server_set_fsinfo: rsize, wsize, rpages, \
+		nfs_ra_factor, ra_pages: %d %d %d %d %d\n",
+		server->rsize,server->wsize,server->rpages,
+		nfs_ra_factor,server->rpages * nfs_ra_factor);
+	server->backing_dev_info.ra_pages = server->rpages * nfs_ra_factor;
 
 	if (server->wsize > max_rpc_payload)
 		server->wsize = max_rpc_payload;
diff -urp linux-2.6.27-rc6-git4/fs/nfs/inode.c linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/inode.c
--- linux-2.6.27-rc6-git4/fs/nfs/inode.c	2008-09-17 11:35:21.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/inode.c	2008-09-17 11:45:09.000000000 +0200
@@ -53,6 +53,8 @@
 
 /* Default is to see 64-bit inode numbers */
 static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
+static unsigned int ra_factor __read_mostly = NFS_MAX_READAHEAD;
+
 
 static void nfs_invalidate_inode(struct inode *);
 static int nfs_update_inode(struct inode *, struct nfs_fattr *);
@@ -1347,6 +1349,12 @@ static int __init init_nfs_fs(void)
 #endif
 	if ((err = register_nfs_fs()) != 0)
 		goto out;
+
+	if (ra_factor < 1 || ra_factor > NFS_MAX_READAHEAD)
+		nfs_ra_factor = NFS_MAX_READAHEAD;
+	else
+		nfs_ra_factor = ra_factor;
+
 	return 0;
 out:
 #ifdef CONFIG_PROC_FS
@@ -1388,6 +1396,10 @@ static void __exit exit_nfs_fs(void)
 MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
 MODULE_LICENSE("GPL");
 module_param(enable_ino64, bool, 0644);
+MODULE_PARM_DESC(enable_ino64, "Enable 64-bit inode numbers (Default: 1)");
+module_param(ra_factor, uint, 0644);
+MODULE_PARM_DESC(ra_factor,
+	"Number of rsize read-ahead requests (Default/Max: 15, Min: 1)");
 
 module_init(init_nfs_fs)
 module_exit(exit_nfs_fs)
diff -urp linux-2.6.27-rc6-git4/fs/nfs/sysctl.c linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/sysctl.c
--- linux-2.6.27-rc6-git4/fs/nfs/sysctl.c	2008-07-13 23:51:29.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/fs/nfs/sysctl.c	2008-09-17 11:45:09.000000000 +0200
@@ -14,9 +14,12 @@
 #include <linux/nfs_fs.h>
 
 #include "callback.h"
+#include "internal.h"
 
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
+static const unsigned int min_nfs_ra_factor = 1;
+static const unsigned int max_nfs_ra_factor = NFS_MAX_READAHEAD;
 static struct ctl_table_header *nfs_callback_sysctl_table;
 
 static ctl_table nfs_cb_sysctls[] = {
@@ -58,6 +61,16 @@ static ctl_table nfs_cb_sysctls[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "nfs_ra_factor",
+		.data = &nfs_ra_factor,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec_minmax,
+		.extra1 = (unsigned int *)&min_nfs_ra_factor,
+		.extra2 = (unsigned int *)&max_nfs_ra_factor,
+	},
 	{ .ctl_name = 0 }
 };
 
diff -urp linux-2.6.27-rc6-git4/include/linux/nfs_fs.h linux-2.6.27-rc6-git4-nfs_ra/include/linux/nfs_fs.h
--- linux-2.6.27-rc6-git4/include/linux/nfs_fs.h	2008-09-17 11:35:25.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/include/linux/nfs_fs.h	2008-09-17 11:45:09.000000000 +0200
@@ -464,6 +464,11 @@ extern int nfs_writeback_done(struct rpc
 extern void nfs_writedata_release(void *);
 
 /*
+ * linux/fs/nfs/client.c
+*/
+extern unsigned int nfs_ra_factor;
+
+/*
  * Try to write back everything synchronously (but check the
  * return value!)
  */
diff -urp linux-2.6.27-rc6-git4/Makefile linux-2.6.27-rc6-git4-nfs_ra/Makefile
--- linux-2.6.27-rc6-git4/Makefile	2008-09-17 11:35:56.000000000 +0200
+++ linux-2.6.27-rc6-git4-nfs_ra/Makefile	2008-09-17 11:45:09.000000000 +0200
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 27
-EXTRAVERSION = -rc6-git4
+EXTRAVERSION = -rc6-git4-nfs_ra
 NAME = Rotary Wombat
 
 # *DOCUMENTATION*

             reply	other threads:[~2008-09-17 13:06 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-09-17 13:06 Martin Knoblauch [this message]
     [not found] ` <997439.5560.qm-VAEUvbQToQWvuULXzWHTWIglqE1Y4D90QQ4Iyu8u01E@public.gmane.org>
2008-09-17 13:21   ` [RFC][Resend] Make NFS-Client readahead tunable Jim Rees
2008-09-17 14:06   ` Peter Staubach
2008-09-17 15:41     ` Chuck Lever
2008-09-18  1:42   ` Greg Banks
     [not found]     ` <48D1B21E.3060509-cP1dWloDopni96+mSzHFpQC/G2K4zDHf@public.gmane.org>
2008-09-18  3:13       ` Andrew Morton
  -- strict thread matches above, loose matches on Subject: below --
2008-09-17 13:19 Michael Trimarchi
2008-09-17 13:25 Martin Knoblauch
     [not found] ` <995475.95604.qm-f6uctMgKLEavuULXzWHTWIglqE1Y4D90QQ4Iyu8u01E@public.gmane.org>
2008-09-17 15:31   ` Jim Rees
2008-09-17 13:27 Martin Knoblauch
2008-09-17 13:42 Michael Trimarchi
2008-09-17 16:03 Martin Knoblauch
2008-09-17 16:10 Martin Knoblauch
2008-09-17 16:15 Martin Knoblauch
2008-09-17 16:23 Martin Knoblauch
     [not found] ` <804604.40886.qm-f6uctMgKLEavuULXzWHTWIglqE1Y4D90QQ4Iyu8u01E@public.gmane.org>
2008-09-17 16:43   ` Chuck Lever
2008-09-17 17:01 Martin Knoblauch
2008-09-18  7:42 Martin Knoblauch
     [not found] ` <418380.19358.qm-1+WuAixcP4WvuULXzWHTWIglqE1Y4D90QQ4Iyu8u01E@public.gmane.org>
2008-09-18  8:18   ` Andrew Morton
2008-09-18  8:19 Martin Knoblauch
     [not found] ` <136998.55258.qm-RqHyxEpxwZuvuULXzWHTWIglqE1Y4D90QQ4Iyu8u01E@public.gmane.org>
2008-09-18  8:45   ` Greg Banks
2008-09-18  8:38 Martin Knoblauch
     [not found] ` <124712.40022.qm-n7KXdZBPtPqvuULXzWHTWIglqE1Y4D90QQ4Iyu8u01E@public.gmane.org>
2008-09-18  8:47   ` Andrew Morton
2008-09-18  8:57     ` Greg Banks
2008-09-18 13:20 ` Peter Zijlstra
2008-09-18  9:32 Martin Knoblauch
2008-09-18 11:53 Martin Knoblauch
     [not found] ` <688309.69831.qm-lSXk2nNw7cevuULXzWHTWIglqE1Y4D90QQ4Iyu8u01E@public.gmane.org>
2008-09-18 18:24   ` Chuck Lever
     [not found]     ` <76bd70e30809181124t78c0d574gaed5702095c02921-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2008-09-18 19:03       ` Peter Staubach
2008-09-21 12:50 Martin Knoblauch
     [not found] ` <968192.84087.qm-f6uctMgKLEavuULXzWHTWIglqE1Y4D90QQ4Iyu8u01E@public.gmane.org>
2008-09-21 13:53   ` Chuck Lever
2008-09-21 12:53 Martin Knoblauch

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=997439.5560.qm@web32601.mail.mud.yahoo.com \
    --to=knobi@knobisoft.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox