From: "J. Bruce Fields" <bfields@fieldses.org>
To: Neil Brown <neilb@suse.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>,
linux-nfs@vger.kernel.org, Theodore Tso <tytso@mit.edu>
Subject: Re: Thoughts about cache consistency and directories in particular.
Date: Thu, 23 Apr 2009 17:34:36 -0400 [thread overview]
Message-ID: <20090423213436.GA1906@fieldses.org> (raw)
In-Reply-To: <20090421214349.GD27411@fieldses.org>
On Tue, Apr 21, 2009 at 05:43:49PM -0400, bfields wrote:
> There's also the pre-/post- op attributes to take care of. I'm thinking
> of applying something like the following.
Having heard no objections I'll go ahead and apply this, though I hope
we'll come up with a better solution for turning on the ext4 i_version
support in the future. Also:
> Actually, I did this and then realized: I'm using IS_I_VERSION(inode) to
> turn on use of the i_version as the change attribute on both files and
> directories. But it actually only makes a difference for files.
>
> So I guess I should just be using i_version as the change attribute
> unconditionally for directories? (Will that work on any filesystem?)
I'm still curious about this.
--b.
> commit 6bc1eaa4eb1bb8af1878be354962e6cedcaece60
> Author: J. Bruce Fields <bfields@citi.umich.edu>
> Date: Thu Apr 16 17:33:25 2009 -0400
>
> nfsd: support ext4 i_version
>
> ext4 supports a real NFSv4 change attribute, which is bumped whenever
> the ctime would be updated, including times when two updates arrive
> within a jiffy of each other. (Note that although ext4 has space for
> nanosecond-precision ctime, the real resolution is lower: it actually
> uses jiffies as the time-source.) This ensures clients will invalidate
> their caches when they need to.
>
> There is some fear that keeping the i_version up-to-date could have
> performance drawbacks, so for now it's turned on only by a mount option.
> We hope to do something better eventually.
>
> Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
> Cc: Theodore Tso <tytso@mit.edu>
>
> diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
> index 17d0dd9..01d4ec1 100644
> --- a/fs/nfsd/nfs3xdr.c
> +++ b/fs/nfsd/nfs3xdr.c
> @@ -272,6 +272,7 @@ void fill_post_wcc(struct svc_fh *fhp)
>
> err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry,
> &fhp->fh_post_attr);
> + fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version;
> if (err)
> fhp->fh_post_saved = 0;
> else
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 4a71fcd..12d36a7 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -1490,13 +1490,41 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
> memcpy(p, ptr, nbytes); \
> p += XDR_QUADLEN(nbytes); \
> }} while (0)
> -#define WRITECINFO(c) do { \
> - *p++ = htonl(c.atomic); \
> - *p++ = htonl(c.before_ctime_sec); \
> - *p++ = htonl(c.before_ctime_nsec); \
> - *p++ = htonl(c.after_ctime_sec); \
> - *p++ = htonl(c.after_ctime_nsec); \
> -} while (0)
> +
> +static void write32(__be32 **p, u32 n)
> +{
> + *(*p)++ = n;
> +}
> +
> +static void write64(__be32 **p, u64 n)
> +{
> + write32(p, (u32)(n >> 32));
> + write32(p, (u32)n);
> +}
> +
> +static void write_change(__be32 **p, struct kstat *stat, struct inode *inode)
> +{
> + if (IS_I_VERSION(inode)) {
> + write64(p, inode->i_version);
> + } else {
> + write32(p, stat->ctime.tv_sec);
> + write32(p, stat->ctime.tv_nsec);
> + }
> +}
> +
> +static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
> +{
> + write32(p, c->atomic);
> + if (c->change_supported) {
> + write64(p, c->before_change);
> + write64(p, c->after_change);
> + } else {
> + write32(p, c->before_ctime_sec);
> + write32(p, c->before_ctime_nsec);
> + write32(p, c->after_ctime_sec);
> + write32(p, c->after_ctime_nsec);
> + }
> +}
>
> #define RESERVE_SPACE(nbytes) do { \
> p = resp->p; \
> @@ -1849,16 +1877,9 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
> WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME);
> }
> if (bmval0 & FATTR4_WORD0_CHANGE) {
> - /*
> - * Note: This _must_ be consistent with the scheme for writing
> - * change_info, so any changes made here must be reflected there
> - * as well. (See xdr4.h:set_change_info() and the WRITECINFO()
> - * macro above.)
> - */
> if ((buflen -= 8) < 0)
> goto out_resource;
> - WRITE32(stat.ctime.tv_sec);
> - WRITE32(stat.ctime.tv_nsec);
> + write_change(&p, &stat, dentry->d_inode);
> }
> if (bmval0 & FATTR4_WORD0_SIZE) {
> if ((buflen -= 8) < 0)
> @@ -2364,7 +2385,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
>
> if (!nfserr) {
> RESERVE_SPACE(32);
> - WRITECINFO(create->cr_cinfo);
> + write_cinfo(&p, &create->cr_cinfo);
> WRITE32(2);
> WRITE32(create->cr_bmval[0]);
> WRITE32(create->cr_bmval[1]);
> @@ -2475,7 +2496,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
>
> if (!nfserr) {
> RESERVE_SPACE(20);
> - WRITECINFO(link->li_cinfo);
> + write_cinfo(&p, &link->li_cinfo);
> ADJUST_ARGS();
> }
> return nfserr;
> @@ -2493,7 +2514,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
>
> nfsd4_encode_stateid(resp, &open->op_stateid);
> RESERVE_SPACE(40);
> - WRITECINFO(open->op_cinfo);
> + write_cinfo(&p, &open->op_cinfo);
> WRITE32(open->op_rflags);
> WRITE32(2);
> WRITE32(open->op_bmval[0]);
> @@ -2771,7 +2792,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
>
> if (!nfserr) {
> RESERVE_SPACE(20);
> - WRITECINFO(remove->rm_cinfo);
> + write_cinfo(&p, &remove->rm_cinfo);
> ADJUST_ARGS();
> }
> return nfserr;
> @@ -2784,8 +2805,8 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
>
> if (!nfserr) {
> RESERVE_SPACE(40);
> - WRITECINFO(rename->rn_sinfo);
> - WRITECINFO(rename->rn_tinfo);
> + write_cinfo(&p, &rename->rn_sinfo);
> + write_cinfo(&p, &rename->rn_tinfo);
> ADJUST_ARGS();
> }
> return nfserr;
> diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h
> index afa1901..8f641c9 100644
> --- a/include/linux/nfsd/nfsfh.h
> +++ b/include/linux/nfsd/nfsfh.h
> @@ -151,9 +151,15 @@ typedef struct svc_fh {
> __u64 fh_pre_size; /* size before operation */
> struct timespec fh_pre_mtime; /* mtime before oper */
> struct timespec fh_pre_ctime; /* ctime before oper */
> + /*
> + * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode)
> + * to find out if it is valid.
> + */
> + u64 fh_pre_change;
>
> /* Post-op attributes saved in fh_unlock */
> struct kstat fh_post_attr; /* full attrs after operation */
> + u64 fh_post_change; /* nfsv4 change; see above */
> #endif /* CONFIG_NFSD_V3 */
>
> } svc_fh;
> @@ -298,6 +304,7 @@ fill_pre_wcc(struct svc_fh *fhp)
> fhp->fh_pre_mtime = inode->i_mtime;
> fhp->fh_pre_ctime = inode->i_ctime;
> fhp->fh_pre_size = inode->i_size;
> + fhp->fh_pre_change = inode->i_version;
> fhp->fh_pre_saved = 1;
> }
> }
> diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
> index f80d601..d0f050f 100644
> --- a/include/linux/nfsd/xdr4.h
> +++ b/include/linux/nfsd/xdr4.h
> @@ -64,10 +64,13 @@ static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
>
> struct nfsd4_change_info {
> u32 atomic;
> + bool change_supported;
> u32 before_ctime_sec;
> u32 before_ctime_nsec;
> + u64 before_change;
> u32 after_ctime_sec;
> u32 after_ctime_nsec;
> + u64 after_change;
> };
>
> struct nfsd4_access {
> @@ -503,10 +506,16 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
> {
> BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved);
> cinfo->atomic = 1;
> - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
> - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
> - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
> - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
> + cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
> + if (cinfo->change_supported) {
> + cinfo->before_change = fhp->fh_pre_change;
> + cinfo->after_change = fhp->fh_post_change;
> + } else {
> + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
> + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
> + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
> + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
> + }
> }
>
> int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
next prev parent reply other threads:[~2009-04-23 21:34 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-02-20 2:18 Thoughts about cache consistency and directories in particular Neil Brown
2009-02-20 18:23 ` J. Bruce Fields
2009-02-20 19:47 ` Neil Brown
[not found] ` <18847.2286.101191.989726-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2009-02-20 20:14 ` J. Bruce Fields
2009-02-20 21:04 ` Neil Brown
[not found] ` <18847.6886.50844.260910-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2009-04-21 21:43 ` J. Bruce Fields
2009-04-23 21:34 ` J. Bruce Fields [this message]
2009-04-23 21:52 ` Trond Myklebust
[not found] ` <1240523577.8583.13.camel-rJ7iovZKK19ZJLDQqaL3InhyD016LWXt@public.gmane.org>
2009-04-23 22:07 ` J. Bruce Fields
2009-04-23 22:24 ` Trond Myklebust
[not found] ` <18846.4842.625445.980681-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2009-02-20 18:56 ` Trond Myklebust
2009-02-20 19:52 ` Neil Brown
[not found] ` <18847.2578.480148.216735-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2009-02-20 20:32 ` Trond Myklebust
2009-02-20 21:06 ` Neil Brown
[not found] ` <18847.6988.418374.839185-wvvUuzkyo1EYVZTmpyfIwg@public.gmane.org>
2009-02-20 22:14 ` Trond Myklebust
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20090423213436.GA1906@fieldses.org \
--to=bfields@fieldses.org \
--cc=linux-nfs@vger.kernel.org \
--cc=neilb@suse.de \
--cc=trond.myklebust@fys.uio.no \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox