From: Wendy Cheng <wcheng@redhat.com>
To: NFS list <linux-nfs@vger.kernel.org>
Cc: cluster-devel@redhat.com
Subject: [PATCH 1/2] NLM failover unlock commands
Date: Mon, 07 Jan 2008 00:39:25 -0500 [thread overview]
Message-ID: <4781BB0D.90706@redhat.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 1105 bytes --]
We've implemented two new NFSD procfs files:
o /proc/fs/nfsd/unlock_ip
o /proc/fs/nfsd/unlock_filesystem
They allow an administrator or a user-mode script to release NLM
locks based on either a path name or a server in-bound IP address (IPv4
only for now), as follows:
shell> echo 10.1.1.2 > /proc/fs/nfsd/unlock_ip
shell> echo /mnt/sfs1 > /proc/fs/nfsd/unlock_filesystem
The expected use is in High Availability (HA) environments where NFS
servers are clustered together to provide either load balancing or
takeover upon server failure. Failover normally starts by moving a
floating IP address from serverA to serverB, using the following sequence:
ServerA:
1. Tear down the IP address
2. Unexport the path
3. Write IP to /proc/fs/nfsd/unlock_ip to unlock files
4. If unmount required,
write path name to /proc/fs/nfsd/unlock_filesystem, then unmount.
5. Signal peer to begin take-over.
For details, check out:
http://people.redhat.com/wcheng/Patches/NFS/NLM/004.txt
Acknowledgment goes to Neil Brown, who has offered support and
guidance during our prototype efforts.
-- Wendy
[-- Attachment #2: unlock_001.patch --]
[-- Type: text/x-patch, Size: 11591 bytes --]
Two new NFSD procfs files are added:
/proc/fs/nfsd/unlock_ip
/proc/fs/nfsd/unlock_filesystem
They allow an administrator or a user-mode script to release NLM locks
based on either a path name or a server in-bound IP address (IPv4 only
for now), as follows:
shell> echo 10.1.1.2 > /proc/fs/nfsd/unlock_ip
shell> echo /mnt/sfs1 > /proc/fs/nfsd/unlock_filesystem
Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
Signed-off-by: Lon Hohberger <lhh@redhat.com>
fs/lockd/svcsubs.c | 117 +++++++++++++++++++++++++++++++++++++++++++-
fs/nfsd/export.c | 20 +++++++
fs/nfsd/nfsctl.c | 60 ++++++++++++++++++++++
include/linux/lockd/bind.h | 2
include/linux/lockd/lockd.h | 14 ++++-
include/linux/nfsd/export.h | 12 ++++
6 files changed, 221 insertions(+), 4 deletions(-)
--- linux-o/include/linux/nfsd/export.h 2008-01-04 10:01:08.000000000 -0500
+++ linux/include/linux/nfsd/export.h 2008-01-06 15:33:13.000000000 -0500
@@ -138,6 +138,18 @@ int exp_rootfh(struct auth_domain *,
__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
__be32 nfserrno(int errno);
+/*
+ * Cluster failover support.
+ *
+ * NFSD_FO_VIP and NFSD_FO_PATH select what nfsd_fo_cmd() matches locks
+ * against: a server-side IPv4 address or a path name, respectively.
+ */
+
+#define NFSD_FO_VIP 0
+#define NFSD_FO_PATH 1
+
+/*
+ * Failover debug messages are compiled in but switched off by default.
+ * The switch has its own name instead of plain DEBUG: this is a shared
+ * header, and a generic DEBUG definition here would collide with files
+ * that define or test DEBUG themselves (e.g. with "#ifdef DEBUG").
+ * Define NFSD_FO_DEBUG to 1 before including this header to enable
+ * the messages.
+ */
+#ifndef NFSD_FO_DEBUG
+#define NFSD_FO_DEBUG 0
+#endif
+#define fo_printk(x...) ((void)(NFSD_FO_DEBUG && printk(x)))
+
+/*
+ * Run failover command @cmd (NFSD_FO_VIP or NFSD_FO_PATH) on @datap.
+ */
+int nfsd_fo_cmd(int cmd, char *datap, int grace_time);
+
+/* end of failover addition */
+
extern struct cache_detail svc_export_cache;
static inline void exp_put(struct svc_export *exp)
--- linux-o/fs/nfsd/nfsctl.c 2008-01-04 10:01:08.000000000 -0500
+++ linux/fs/nfsd/nfsctl.c 2008-01-06 15:27:34.000000000 -0500
@@ -52,6 +52,8 @@ enum {
NFSD_Getfs,
NFSD_List,
NFSD_Fh,
+ NFSD_FO_UnlockIP,
+ NFSD_FO_UnlockFS,
NFSD_Threads,
NFSD_Pool_Threads,
NFSD_Versions,
@@ -88,6 +90,9 @@ static ssize_t write_leasetime(struct fi
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
#endif
+static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
+static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
+
static ssize_t (*write_op[])(struct file *, char *, size_t) = {
[NFSD_Svc] = write_svc,
[NFSD_Add] = write_add,
@@ -97,6 +102,8 @@ static ssize_t (*write_op[])(struct file
[NFSD_Getfd] = write_getfd,
[NFSD_Getfs] = write_getfs,
[NFSD_Fh] = write_filehandle,
+ [NFSD_FO_UnlockIP] = failover_unlock_ip,
+ [NFSD_FO_UnlockFS] = failover_unlock_fs,
[NFSD_Threads] = write_threads,
[NFSD_Pool_Threads] = write_pool_threads,
[NFSD_Versions] = write_versions,
@@ -288,6 +295,56 @@ static ssize_t write_getfd(struct file *
return err;
}
+/* in_aton() yields a network-byte-order address, hence __be32. */
+extern __be32 in_aton(const char *str);
+
+/*
+ * Common write handler behind the unlock_ip and unlock_filesystem files.
+ *
+ * @where: NFSD_FO_VIP (buf holds a dotted-quad IPv4 address) or
+ *         NFSD_FO_PATH (buf holds a path name).
+ * @file:  file being written; not used here.
+ * @buf:   user-supplied text; modified in place (trailing newline is
+ *         stripped and the first word is unquoted by qword_get()).
+ * @size:  number of bytes in @buf.
+ *
+ * Returns the result of nfsd_fo_cmd(), or -EINVAL when the input is
+ * empty, cannot be parsed, or @where is not a known command.
+ */
+static
+ssize_t failover_parse(int where, struct file *file, char *buf, size_t size)
+{
+	char *fo_path, *mesg;
+	__be32 server_ip[4];
+
+	/* sanity check: size is unsigned, so only zero has to be rejected */
+	if (size == 0) {
+		fo_printk("nfsd fo buf size not correct\n");
+		return -EINVAL;
+	}
+	if (buf[size-1] == '\n')
+		buf[size-1] = 0;
+
+	/* get the string */
+	fo_printk("nfsd fo buf = %s\n", buf);
+
+	/* the word is decoded in place: source and destination are both buf */
+	fo_path = mesg = buf;
+	if (qword_get(&mesg, fo_path, size) < 0)
+		return -EINVAL;	/* errors must be negative; a positive value reads as a byte count */
+
+	fo_printk("fo_dev=%s\n", fo_path);
+
+	switch (where) {
+	case NFSD_FO_PATH:
+		/* the path string is handed on unchanged */
+		break;
+	case NFSD_FO_VIP:
+		/* hand on a pointer to the binary address instead of the text */
+		server_ip[0] = in_aton(fo_path);
+		fo_path = (char *) server_ip;
+		break;
+	default:
+		fo_printk("nfsd unknown fo cmd (%d)\n", where);
+		return -EINVAL;
+	}
+
+	return nfsd_fo_cmd(where, fo_path, 0);
+}
+
+/*
+ * Write handler registered for NFSD_FO_UnlockIP: forwards the written
+ * buffer to failover_parse() in NFSD_FO_VIP mode and returns its result.
+ */
+static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
+{
+	ssize_t ret = failover_parse(NFSD_FO_VIP, file, buf, size);
+
+	return ret;
+}
+
+/*
+ * Write handler registered for NFSD_FO_UnlockFS: forwards the written
+ * buffer to failover_parse() in NFSD_FO_PATH mode and returns its result.
+ */
+static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
+{
+	ssize_t ret = failover_parse(NFSD_FO_PATH, file, buf, size);
+
+	return ret;
+}
+
static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
{
/* request is:
@@ -646,6 +703,8 @@ static int nfsd_fill_super(struct super_
[NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_List] = {"exports", &exports_operations, S_IRUGO},
+ [NFSD_FO_UnlockIP] = {"unlock_ip", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_FO_UnlockFS] = {"unlock_filesystem", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
@@ -717,7 +776,6 @@ static void __exit exit_nfsd(void)
nfsd4_free_slabs();
unregister_filesystem(&nfsd_fs_type);
}
-
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
MODULE_LICENSE("GPL");
module_init(init_nfsd)
--- linux-o/fs/nfsd/export.c 2008-01-04 10:01:08.000000000 -0500
+++ linux/fs/nfsd/export.c 2008-01-06 15:14:55.000000000 -0500
@@ -1679,3 +1679,23 @@ nfsd_export_shutdown(void)
exp_writeunlock();
dprintk("nfsd: export shutdown complete.\n");
}
+
+/*
+ * Pass a failover command from nfsd on to lockd.
+ *
+ * @cmd:          NFSD_FO_PATH or another failover command.
+ * @datap:        for NFSD_FO_PATH, the path name to look up; for any
+ *                other command it is handed to lockd unchanged.
+ * @grace_period: forwarded to nlmsvc_fo_cmd() as is.
+ *
+ * For NFSD_FO_PATH the path is resolved first and lockd receives the
+ * resulting struct nameidata. Returns the path_lookup() error if the
+ * lookup fails, otherwise the result of nlmsvc_fo_cmd().
+ */
+int
+nfsd_fo_cmd(int cmd, char *datap, int grace_period)
+{
+	struct nameidata nd;
+	int rc;
+
+	if (cmd != NFSD_FO_PATH)
+		return nlmsvc_fo_cmd(cmd, (void *)datap, grace_period);
+
+	rc = path_lookup((const char *)datap, 0, &nd);
+	if (rc) {
+		fo_printk("nfsd: nfsd_fo path (%s) not found\n", datap);
+		return rc;
+	}
+	fo_printk("nfsd: nfsd_fo lookup path = (0x%p,0x%p)\n",
+		nd.mnt, nd.dentry);
+
+	rc = nlmsvc_fo_cmd(cmd, (void *) &nd, grace_period);
+
+	/*
+	 * Drop the dentry/vfsmount references taken by path_lookup().
+	 * Keeping them would pin the very filesystem this command is
+	 * meant to make unmountable.
+	 */
+	path_release(&nd);
+
+	return rc;
+}
--- linux-o/fs/lockd/svcsubs.c 2008-01-04 10:01:08.000000000 -0500
+++ linux/fs/lockd/svcsubs.c 2008-01-06 16:20:37.000000000 -0500
@@ -18,10 +18,11 @@
#include <linux/lockd/lockd.h>
#include <linux/lockd/share.h>
#include <linux/lockd/sm_inter.h>
+#include <linux/module.h>
+#include <linux/mount.h>
#define NLMDBG_FACILITY NLMDBG_SVCSUBS
-
/*
* Global file hash table
*/
@@ -87,7 +88,7 @@ nlm_lookup_file(struct svc_rqst *rqstp,
unsigned int hash;
__be32 nfserr;
- nlm_debug_print_fh("nlm_file_lookup", f);
+ nlm_debug_print_fh("nlm_lookup_file", f);
hash = file_hash(f);
@@ -123,6 +124,11 @@ nlm_lookup_file(struct svc_rqst *rqstp,
hlist_add_head(&file->f_list, &nlm_files[hash]);
+ /* fill in f_iaddr for nlm lock failover */
+ file->f_iaddr = rqstp->rq_daddr;
+ fo_printk("lockd: file->f_iaddr = %u.%u.%u.%u\n",
+ NIPQUAD(file->f_iaddr.addr.s_addr));
+
found:
dprintk("lockd: found file %p (count %d)\n", file, file->f_count);
*result = file;
@@ -194,12 +200,88 @@ again:
return 0;
}
+/*
+ * Decide whether @file is covered by the failover request in @datap
+ * (really a nlm_fo_cmd).
+ *
+ * NFSD_FO_VIP:  match when the file's recorded server address equals
+ *               the requested IPv4 address.
+ * NFSD_FO_PATH: match when the file lives on the same vfsmount as the
+ *               looked-up path.
+ *
+ * Returns 1 and bumps the request's stat counter on a match, 0 otherwise
+ * (including for an unknown command).
+ */
+static inline int
+nlmsvc_fo_unlock_match(void *datap, struct nlm_file *file)
+{
+	nlm_fo_cmd *req = (nlm_fo_cmd *) datap;
+	struct in_addr *want_ip;
+	struct nameidata *want_nd;
+	struct path *have;
+
+	fo_printk("nlm_fo_unlock_match cmd=%d\n", req->cmd);
+
+	switch (req->cmd) {
+	case NFSD_FO_VIP:
+		want_ip = (struct in_addr *) req->datap;
+		if (file->f_iaddr.addr.s_addr != want_ip->s_addr) {
+			fo_printk("lockd: fo ip no match %u.%u.%u.%u\n",
+				NIPQUAD(want_ip->s_addr));
+			return 0;
+		}
+		fo_printk("lockd: fo ip matches %u.%u.%u.%u\n",
+			NIPQUAD(file->f_iaddr.addr.s_addr));
+		break;
+
+	case NFSD_FO_PATH:
+		/* compare by vfsmount; the dentries are only logged */
+		have = &(file->f_file->f_path);
+		want_nd = (struct nameidata *) req->datap;
+		fo_printk("f_path->mnt (0x%p) f_path->dentry (0x%p)\n",
+			have->mnt, have->dentry);
+		fo_printk("fo_path (0x%p) fo_path->dentry (0x%p)\n",
+			want_nd->mnt, want_nd->dentry);
+		if (want_nd->mnt != have->mnt)
+			return 0;	/* not found */
+		break;
+
+	default:
+		fo_printk("nlmsvc_fo_unlock_match - unknown cmd\n");
+		return 0;	/* should never reach here */
+	}
+
+	fo_printk("nlmsvc_fo_unlock_match found file=0x%p\n", file);
+	req->stat++;
+	return 1;
+}
+
+/* To fit the logic into current lockd code structure, we add a
+ * little wrapper function here. It ignores both arguments and always
+ * reports a match; the real matching is done per file by
+ * nlmsvc_fo_unlock_match(), which nlm_inspect_file() calls when it is
+ * handed this function as its match callback.
+ */
+int nlmsvc_fo_match(struct nlm_host *dummy1, struct nlm_host *dummy2)
+{
+ return 1;
+}
+
/*
* Inspect a single file
*/
static inline int
nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match)
{
+ /* Cluster failover has timing constraints. There is a slight
+ * performance hit if nlm_fo_unlock_match() is implemented as
+ * a match fn (since it will be invoked for each block, share,
+ * and lock later when the lists are traversed). Instead, we
+ * add path-matching logic into the following unlikely clause.
+ * If matches, the dummy nlmsvc_fo_match will always return
+ * true.
+ */
+ dprintk("nlm_inspect_files: file=%p\n", file);
+ if (unlikely(match == nlmsvc_fo_match)) {
+ if (!nlmsvc_fo_unlock_match((void *)host, file))
+ return 0;
+ fo_printk("nlm_fo find lock file entry (0x%p)\n", file);
+ }
+
nlmsvc_traverse_blocks(host, file, match);
nlmsvc_traverse_shares(host, file, match);
return nlm_traverse_locks(host, file, match);
@@ -370,3 +452,34 @@ nlmsvc_invalidate_all(void)
*/
nlm_traverse_files(NULL, nlmsvc_is_client);
}
+
+/*
+ * Failover entry point for lockd, called from nfsd's nfsd_fo_cmd().
+ *
+ * Packs @cmd and @datap into a nlm_fo_cmd and walks the lockd file
+ * table with nlmsvc_fo_match as the match callback. The request is
+ * passed in the nlm_host slot of nlm_traverse_files(), so it is cast
+ * to that pointer type here. @grace_time is only logged; the request's
+ * gp field is always set to 0.
+ *
+ * Returns whatever nlm_traverse_files() returns.
+ */
+int
+nlmsvc_fo_cmd(int cmd, void *datap, int grace_time)
+{
+	nlm_fo_cmd request = {
+		.cmd	= cmd,
+		.stat	= 0,
+		.gp	= 0,
+		.datap	= datap,
+	};
+	int status;
+
+	fo_printk("lockd: nlmsvc_fo_cmd enter, cmd=%d, datap=0x%p, gp=%d\n",
+		cmd, datap, grace_time);
+
+	/* fo_start; a separate NFSD_FO_RESUME branch is expected to go here */
+	status = nlm_traverse_files((struct nlm_host*) &request,
+			nlmsvc_fo_match);
+	fo_printk("nlmsvc_fo_cmd rc=%d, stat=%d\n", status, request.stat);
+
+	return status;
+}
+
+EXPORT_SYMBOL(nlmsvc_fo_cmd);
--- linux-o/include/linux/lockd/bind.h 2008-01-04 10:01:08.000000000 -0500
+++ linux/include/linux/lockd/bind.h 2008-01-06 15:14:55.000000000 -0500
@@ -47,4 +47,6 @@ unsigned long get_nfs4_grace_period(void
static inline unsigned long get_nfs4_grace_period(void) {return 0;}
#endif
+extern int nlmsvc_fo_cmd(int cmd, void *datap, int grace_time);
+
#endif /* LINUX_LOCKD_BIND_H */
--- linux-o/include/linux/lockd/lockd.h 2008-01-04 10:01:08.000000000 -0500
+++ linux/include/linux/lockd/lockd.h 2008-01-06 15:14:55.000000000 -0500
@@ -39,7 +39,7 @@
struct nlm_host {
struct hlist_node h_hash; /* doubly linked list */
struct sockaddr_in h_addr; /* peer address */
- struct sockaddr_in h_saddr; /* our address (optional) */
+ struct sockaddr_in h_saddr; /* our address (optional) */
struct rpc_clnt * h_rpcclnt; /* RPC client to talk to peer */
char * h_name; /* remote hostname */
u32 h_version; /* interface version */
@@ -113,6 +113,7 @@ struct nlm_file {
unsigned int f_locks; /* guesstimate # of locks */
unsigned int f_count; /* reference count */
struct mutex f_mutex; /* avoid concurrent access */
+ union svc_addr_u f_iaddr; /* server ip for failover */
};
/*
@@ -214,6 +215,17 @@ void nlmsvc_mark_resources(void);
void nlmsvc_free_host_resources(struct nlm_host *);
void nlmsvc_invalidate_all(void);
+/* cluster failover support */
+
+/*
+ * One failover request, built by nlmsvc_fo_cmd() and handed to the
+ * file-table walk in place of a struct nlm_host pointer.
+ */
+typedef struct {
+ int cmd; /* NFSD_FO_VIP or NFSD_FO_PATH */
+ int stat; /* incremented once for every file that matches */
+ int gp; /* set to 0 by nlmsvc_fo_cmd(); not read in this patch */
+ void *datap; /* VIP: read as struct in_addr; PATH: read as struct nameidata */
+} nlm_fo_cmd;
+
+/* Release the locks selected by @cmd and @datap; see fs/lockd/svcsubs.c. */
+int nlmsvc_fo_cmd(int cmd, void *datap, int grace_time);
+
+
static __inline__ struct inode *
nlmsvc_file_inode(struct nlm_file *file)
{
next reply other threads:[~2008-01-07 5:39 UTC|newest]
Thread overview: 54+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-01-07 5:39 Wendy Cheng [this message]
2008-01-08 5:18 ` [PATCH 1/2] NLM failover unlock commands Neil Brown
2008-01-09 2:51 ` Wendy Cheng
2008-01-08 17:02 ` Christoph Hellwig
2008-01-08 17:49 ` Christoph Hellwig
2008-01-08 20:57 ` Wendy Cheng
2008-01-09 18:02 ` Christoph Hellwig
2008-01-10 7:59 ` Christoph Hellwig
2008-01-12 7:03 ` Wendy Cheng
2008-01-12 9:38 ` Christoph Hellwig
2008-01-14 23:07 ` J. Bruce Fields
2008-01-14 23:31 ` Neil Brown
[not found] ` <18315.61638.14133.308991-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2008-01-15 16:38 ` Chuck Lever
2008-01-22 22:53 ` J. Bruce Fields
2008-01-24 4:02 ` Neil Brown
2008-01-15 16:14 ` Wendy Cheng
2008-01-15 16:30 ` J. Bruce Fields
2008-01-14 23:52 ` Neil Brown
2008-01-15 20:17 ` Wendy Cheng
2008-01-15 20:50 ` Neil Brown
2008-01-15 20:56 ` Wendy Cheng
2008-01-15 22:48 ` Wendy Cheng
2008-01-16 4:19 ` Neil Brown
2008-01-17 15:10 ` J. Bruce Fields
2008-01-17 15:48 ` Wendy Cheng
2008-01-17 16:08 ` Wendy Cheng
2008-01-17 16:10 ` Wendy Cheng
2008-01-18 10:21 ` Frank van Maarseveen
2008-01-18 15:00 ` Wendy Cheng
2008-01-17 16:14 ` J. Bruce Fields
2008-01-17 16:17 ` Wendy Cheng
2008-01-17 16:21 ` J. Bruce Fields
2008-01-17 16:31 ` J. Bruce Fields
2008-01-17 16:31 ` Wendy Cheng
2008-01-17 16:40 ` J. Bruce Fields
2008-01-17 17:35 ` Frank Filz
2008-01-17 17:59 ` Wendy Cheng
2008-01-17 18:07 ` Wendy Cheng
2008-01-17 20:23 ` J. Bruce Fields
2008-01-18 10:03 ` Frank van Maarseveen
2008-01-18 14:56 ` Wendy Cheng
2008-01-24 16:00 ` J. Bruce Fields
2008-01-24 16:19 ` Peter Staubach
2008-01-24 16:39 ` J. Bruce Fields
2008-01-24 19:45 ` Wendy Cheng
2008-01-24 20:19 ` J. Bruce Fields
2008-01-24 21:06 ` Wendy Cheng
2008-01-24 21:40 ` J. Bruce Fields
2008-01-24 21:49 ` Wendy Cheng
2008-01-28 3:46 ` Felix Blyakher
2008-01-28 15:56 ` Wendy Cheng
2008-01-28 17:06 ` [Cluster-devel] " Felix Blyakher
2008-01-09 3:49 ` Wendy Cheng
2008-01-09 16:13 ` J. Bruce Fields
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4781BB0D.90706@redhat.com \
--to=wcheng@redhat.com \
--cc=cluster-devel@redhat.com \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox