From: Alexandros Batsakis <batsakis@netapp.com>
To: linux-nfs@vger.kernel.org
Cc: bhalevy@panasas.com, Alexandros Batsakis <batsakis@netapp.com>
Subject: [PATCH 7/8] pnfs-submit: forgetful client model
Date: Wed, 5 May 2010 10:00:57 -0700 [thread overview]
Message-ID: <1273078858-1923-8-git-send-email-batsakis@netapp.com> (raw)
In-Reply-To: <1273078858-1923-7-git-send-email-batsakis@netapp.com>
Forgetful client model:
If we receive a CB_LAYOUTRECALL
- we spawn a thread to handle the recall
(xxx: now only one recall can be active at a time, else NFS4ERR_DELAY)
- we check the stateid seqid
if it does not match we return NFS4ERR_DELAY
- we check for pending I/O
if there is we return NFS4ERR_DELAY
Else we return NO_MATCHING_LAYOUT.
Note that for whole file layouts there is no need to serialize LAYOUTGETs/LAYOUTRETURNs
For bulk layouts, if there is a layout active, we return NFS4_OK and we start
cleaning the layouts asynchronously. At the end we send a bulk LAYOUTRETURN.
Note that there is no need to prevent any new LAYOUTGETs explicitly as the server should reject them.
Signed-off-by: Alexandros Batsakis <batsakis@netapp.com>
---
fs/nfs/callback_proc.c | 146 ++++++++++++++++++++++++++++++++++--------------
fs/nfs/nfs4_fs.h | 1 +
fs/nfs/pnfs.c | 70 ++++++++++-------------
3 files changed, 136 insertions(+), 81 deletions(-)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 0400dbe..9a38725 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -129,6 +129,38 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf
#if defined(CONFIG_NFS_V4_1)
+static bool
+pnfs_is_next_layout_stateid(const struct pnfs_layout_type *lo,
+ const nfs4_stateid stateid)
+{
+ int seqlock;
+ bool res;
+ u32 oldseqid, newseqid;
+
+ do {
+ seqlock = read_seqbegin(&lo->seqlock);
+ oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid);
+ newseqid = be32_to_cpu(stateid.u.stateid.seqid);
+ res = !memcmp(lo->stateid.u.stateid.other,
+ stateid.u.stateid.other,
+ NFS4_STATEID_OTHER_SIZE);
+ if (res) { /* comparing layout stateids */
+ if (oldseqid == ~0)
+ res = (newseqid == 1);
+ else
+ res = (newseqid == oldseqid + 1);
+ } else { /* open stateid */
+ res = !memcmp(lo->stateid.u.data,
+ &zero_stateid,
+ NFS4_STATEID_SIZE);
+ if (res)
+ res = (newseqid == 1);
+ }
+ } while (read_seqretry(&lo->seqlock, seqlock));
+
+ return res;
+}
+
/*
* Retrieve an inode based on layout recall parameters
*
@@ -191,9 +223,10 @@ static int pnfs_recall_layout(void *data)
struct inode *inode, *ino;
struct nfs_client *clp;
struct cb_pnfs_layoutrecallargs rl;
+ struct nfs4_pnfs_layoutreturn *lrp;
struct recall_layout_threadargs *args =
(struct recall_layout_threadargs *)data;
- int status;
+ int status = 0;
daemonize("nfsv4-layoutreturn");
@@ -204,47 +237,59 @@ static int pnfs_recall_layout(void *data)
clp = args->clp;
inode = args->inode;
rl = *args->rl;
- args->result = 0;
- complete(&args->started);
- args = NULL;
- /* Note: args must not be used after this point!!! */
-
-/* FIXME: need barrier here:
- pause I/O to data servers
- pause layoutgets
- drain all outstanding writes to storage devices
- wait for any outstanding layoutreturns and layoutgets mentioned in
- cb_sequence.
- then return layouts, resume after layoutreturns complete
- */
/* support whole file layouts only */
rl.cbl_seg.offset = 0;
rl.cbl_seg.length = NFS4_MAX_UINT64;
if (rl.cbl_recall_type == RETURN_FILE) {
- status = pnfs_return_layout(inode, &rl.cbl_seg, &rl.cbl_stateid,
- RETURN_FILE, true);
+ if (pnfs_is_next_layout_stateid(&NFS_I(inode)->layout,
+ rl.cbl_stateid))
+ status = pnfs_return_layout(inode, &rl.cbl_seg,
+ &rl.cbl_stateid, RETURN_FILE,
+ false);
+ else
+ status = cpu_to_be32(NFS4ERR_DELAY);
if (status)
dprintk("%s RETURN_FILE error: %d\n", __func__, status);
+ else
+ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
+ args->result = status;
+ complete(&args->started);
goto out;
}
- /* FIXME: This loop is inefficient, running in O(|s_inodes|^2) */
+ status = cpu_to_be32(NFS4_OK);
+ args->result = status;
+ complete(&args->started);
+ args = NULL;
+
+ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */
while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) {
- /* XXX need to check status on pnfs_return_layout */
- pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, true);
+ /* FIXME: need to check status on pnfs_return_layout */
+ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false);
iput(ino);
}
+ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+ if (!lrp) {
+ dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n",
+ __func__);
+ goto out;
+ }
+
/* send final layoutreturn */
- status = pnfs_return_layout(inode, &rl.cbl_seg, NULL,
- rl.cbl_recall_type, true);
- if (status)
- printk(KERN_INFO "%s: ignoring pnfs_return_layout status=%d\n",
- __func__, status);
+ lrp->args.reclaim = 0;
+ lrp->args.layout_type = rl.cbl_layout_type;
+ lrp->args.return_type = rl.cbl_recall_type;
+ lrp->args.lseg = rl.cbl_seg;
+ lrp->args.inode = inode;
+ lrp->lo = NULL;
+ pnfs4_proc_layoutreturn(lrp, true);
+
out:
- iput(inode);
+ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
+ nfs_put_client(clp);
module_put_and_exit(0);
dprintk("%s: exit status %d\n", __func__, 0);
return 0;
@@ -262,15 +307,18 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
.rl = rl,
};
struct task_struct *t;
- int status;
-
- /* should have returned NFS4ERR_NOMATCHING_LAYOUT... */
- BUG_ON(inode == NULL);
+ int status = -EAGAIN;
dprintk("%s: -->\n", __func__);
+ /* FIXME: do not allow two concurrent layout recalls */
+ if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state))
+ return status;
+
init_completion(&data.started);
__module_get(THIS_MODULE);
+ if (!atomic_inc_not_zero(&clp->cl_count))
+ goto out_put_no_client;
t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout");
if (IS_ERR(t)) {
@@ -284,6 +332,9 @@ static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode,
wait_for_completion(&data.started);
return data.result;
out_module_put:
+ nfs_put_client(clp);
+out_put_no_client:
+ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state);
module_put(THIS_MODULE);
return status;
}
@@ -294,35 +345,46 @@ __be32 pnfs_cb_layoutrecall(struct cb_pnfs_layoutrecallargs *args,
struct nfs_client *clp;
struct inode *inode = NULL;
__be32 res;
+ int status;
unsigned int num_client = 0;
dprintk("%s: -->\n", __func__);
- res = htonl(NFS4ERR_INVAL);
- clp = nfs_find_client(args->cbl_addr, 4);
+ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+ clp = nfs_find_client(args->cbl_addr, 4);
if (clp == NULL) {
dprintk("%s: no client for addr %u.%u.%u.%u\n",
__func__, NIPQUAD(args->cbl_addr));
goto out;
}
- res = htonl(NFS4ERR_NOMATCHING_LAYOUT);
+ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT);
do {
struct nfs_client *prev = clp;
num_client++;
- inode = nfs_layoutrecall_find_inode(clp, args);
- if (inode != NULL) {
- if (PNFS_LD(&NFS_I(inode)->layout)->id ==
- args->cbl_layout_type) {
- /* Set up a helper thread to actually
- * return the delegation */
- res = pnfs_async_return_layout(clp, inode, args);
- if (res != 0)
- res = htonl(NFS4ERR_RESOURCE);
- break;
+ /* the callback must come from the MDS personality */
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS))
+ goto loop;
+ if (args->cbl_recall_type == RETURN_FILE) {
+ inode = nfs_layoutrecall_find_inode(clp, args);
+ if (inode != NULL) {
+ status = pnfs_async_return_layout(clp, inode,
+ args);
+ if (status == -EAGAIN)
+ res = cpu_to_be32(NFS4ERR_DELAY);
+ iput(inode);
}
+ } else { /* _ALL or _FSID */
+ /* we need the inode to get the nfs_server struct */
+ inode = nfs_layoutrecall_find_inode(clp, args);
+ if (!inode)
+ goto loop;
+ status = pnfs_async_return_layout(clp, inode, args);
+ if (status == -EAGAIN)
+ res = cpu_to_be32(NFS4ERR_DELAY);
iput(inode);
}
+loop:
clp = nfs_find_client_next(prev);
nfs_put_client(prev);
} while (clp != NULL);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 0abcd45..4e48aea 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -47,6 +47,7 @@ enum nfs4_client_state {
NFS4CLNT_SESSION_RESET,
NFS4CLNT_SESSION_DRAINING,
NFS4CLNT_RECALL_SLOT,
+ NFS4CLNT_LAYOUT_RECALL,
};
/*
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 546e2f4..3104730 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -710,6 +710,8 @@ return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
dprintk("--> %s\n", __func__);
+ BUG_ON(type != RETURN_FILE);
+
lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
if (lrp == NULL) {
if (lo && (type == RETURN_FILE))
@@ -746,13 +748,11 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
dprintk("--> %s type %d\n", __func__, type);
- if (range)
- arg = *range;
- else {
- arg.iomode = IOMODE_ANY;
- arg.offset = 0;
- arg.length = NFS4_MAX_UINT64;
- }
+
+ arg.iomode = range ? range->iomode : IOMODE_ANY;
+ arg.offset = 0;
+ arg.length = NFS4_MAX_UINT64;
+
if (type == RETURN_FILE) {
lo = get_lock_current_layout(nfsi);
if (lo && !has_layout_to_return(lo, &arg)) {
@@ -761,11 +761,7 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
}
if (!lo) {
dprintk("%s: no layout segments to return\n", __func__);
- /* must send the LAYOUTRETURN in response to recall */
- if (stateid)
- goto send_return;
- else
- goto out;
+ goto out;
}
/* unlock w/o put rebalanced by eventual call to
@@ -774,12 +770,23 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
unlock_current_layout(nfsi);
if (pnfs_return_layout_barrier(nfsi, &arg)) {
+ if (stateid) { /* callback */
+ status = -EAGAIN;
+ lock_current_layout(nfsi);
+ put_unlock_current_layout(lo);
+ goto out;
+ }
dprintk("%s: waiting\n", __func__);
wait_event(nfsi->lo_waitq,
- !pnfs_return_layout_barrier(nfsi, &arg));
+ !pnfs_return_layout_barrier(nfsi, &arg));
}
if (layoutcommit_needed(nfsi)) {
+ if (stateid && !wait) { /* callback */
+ dprintk("%s: layoutcommit pending\n", __func__);
+ status = -EAGAIN;
+ goto out;
+ }
status = pnfs_layoutcommit_inode(ino, wait);
if (status) {
dprintk("%s: layoutcommit failed, status=%d. "
@@ -788,9 +795,13 @@ _pnfs_return_layout(struct inode *ino, struct nfs4_pnfs_layout_segment *range,
status = 0;
}
}
+
+ if (stateid && wait)
+ status = return_layout(ino, &arg, stateid, type,
+ lo, wait);
+ else
+ pnfs_layout_release(lo, &arg);
}
-send_return:
- status = return_layout(ino, &arg, stateid, type, lo, wait);
out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
@@ -1069,7 +1080,7 @@ pnfs_update_layout(struct inode *ino,
struct nfs4_pnfs_layout_segment arg = {
.iomode = iomode,
.offset = 0,
- .length = ~0
+ .length = NFS4_MAX_UINT64,
};
struct nfs_inode *nfsi = NFS_I(ino);
struct pnfs_layout_type *lo;
@@ -1088,31 +1099,12 @@ pnfs_update_layout(struct inode *ino,
/* Check to see if the layout for the given range already exists */
lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
if (lseg && !lseg->valid) {
- unlock_current_layout(nfsi);
if (take_ref)
put_lseg(lseg);
- for (;;) {
- prepare_to_wait(&nfsi->lo_waitq, &__wait,
- TASK_KILLABLE);
- lock_current_layout(nfsi);
- lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref);
- if (!lseg || lseg->valid)
- break;
- dprintk("%s: invalid lseg %p ref %d\n", __func__,
- lseg, atomic_read(&lseg->kref.refcount)-1);
- if (take_ref)
- put_lseg(lseg);
- if (signal_pending(current)) {
- lseg = NULL;
- result = -ERESTARTSYS;
- break;
- }
- unlock_current_layout(nfsi);
- schedule();
- }
- finish_wait(&nfsi->lo_waitq, &__wait);
- if (result)
- goto out_put;
+
+ /* someone is cleaning the layout */
+ result = -EAGAIN;
+ goto out_put;
}
if (lseg) {
--
1.6.2.5
next prev parent reply other threads:[~2010-06-07 5:27 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-05-05 17:00 [PATCH 0/8] pnfs-submit: forgetful client v2 Alexandros Batsakis
2010-05-05 17:00 ` [PATCH 1/8] pnfs-submit: clean struct nfs_inode Alexandros Batsakis
2010-05-05 17:00 ` [PATCH 2/8] pnfs-submit: clean locking infrastructure Alexandros Batsakis
2010-05-05 17:00 ` [PATCH 3/8] pnfs-submit: remove lgetcount, lretcount (outstanding LAYOUTGETs/LAYOUTRETUNs) Alexandros Batsakis
2010-05-05 17:00 ` [PATCH 4/8] pnfs-submit: change stateid to be a union Alexandros Batsakis
2010-05-05 17:00 ` [PATCH 5/8] pnfs-submit: request whole file layouts only Alexandros Batsakis
2010-05-05 17:00 ` [PATCH 6/8] pnfs-submit: change layouts list to be similar to the other state list management Alexandros Batsakis
2010-05-05 17:00 ` Alexandros Batsakis [this message]
2010-05-05 17:00 ` [PATCH 8/8] pnfs-submit: support for cb_recall_any (layouts) Alexandros Batsakis
2010-06-07 14:34 ` [PATCH 2/8] pnfs-submit: clean locking infrastructure Fred Isaman
2010-06-07 8:52 ` [PATCH 0/8] pnfs-submit: forgetful client v2 Boaz Harrosh
2010-06-07 8:54 ` Boaz Harrosh
2010-06-07 15:38 ` Alexandros Batsakis
-- strict thread matches above, loose matches on Subject: below --
2010-05-17 17:56 [PATCH 0/8] pnfs-submit: Forgetful cleint and some layout cleanups Alexandros Batsakis
2010-05-17 17:56 ` [PATCH 1/8] pnfs-submit: clean struct nfs_inode Alexandros Batsakis
2010-05-17 17:56 ` [PATCH 2/8] pnfs-submit: clean locking infrastructure Alexandros Batsakis
2010-05-17 17:56 ` [PATCH 3/8] pnfs-submit: remove lgetcount, lretcount (outstanding LAYOUTGETs/LAYOUTRETUNs) Alexandros Batsakis
2010-05-17 17:56 ` [PATCH 4/8] pnfs-submit: change stateid to be a union Alexandros Batsakis
2010-05-17 17:56 ` [PATCH 5/8] pnfs-submit: request whole file layouts only Alexandros Batsakis
2010-05-17 17:56 ` [PATCH 6/8] pnfs-submit: change layouts list to be similar to the other state list management Alexandros Batsakis
2010-05-17 17:56 ` [PATCH 7/8] pnfs-submit: forgetful client model Alexandros Batsakis
2010-05-17 20:38 ` J. Bruce Fields
2010-05-18 0:06 ` Alexandros Batsakis
2010-05-18 14:16 ` J. Bruce Fields
2010-05-18 17:33 ` Alexandros Batsakis
2010-05-18 18:22 ` J. Bruce Fields
2010-05-26 9:20 ` Benny Halevy
2010-05-27 18:38 ` Batsakis, Alexandros
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1273078858-1923-8-git-send-email-batsakis@netapp.com \
--to=batsakis@netapp.com \
--cc=bhalevy@panasas.com \
--cc=linux-nfs@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).