* [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-24 15:11 [PATCH v2 0/3] fs: multigrain timestamps Jeff Layton
@ 2023-04-24 15:11 ` Jeff Layton
2023-04-24 21:47 ` NeilBrown
` (2 more replies)
2023-04-24 15:11 ` [PATCH v2 2/3] shmem: mark for high-res timestamps on next update after getattr Jeff Layton
2023-04-24 15:11 ` [PATCH v2 3/3] xfs: mark the inode for high-res timestamp update in getattr Jeff Layton
2 siblings, 3 replies; 18+ messages in thread
From: Jeff Layton @ 2023-04-24 15:11 UTC (permalink / raw)
To: Alexander Viro, Christian Brauner, Darrick J. Wong, Hugh Dickins,
Andrew Morton, Dave Chinner, Chuck Lever
Cc: Jan Kara, Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs,
linux-mm, linux-nfs
The VFS always uses coarse-grained timestamp updates for filling out the
ctime and mtime after a change. This has the benefit of allowing
filesystems to optimize away a lot metaupdates, to around once per
jiffy, even when a file is under heavy writes.
Unfortunately, this has always been an issue when we're exporting via
NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
lot of exported filesystems don't properly support a change attribute
and are subject to the same problems with timestamp granularity. Other
applications have similar issues (e.g backup applications).
Switching to always using fine-grained timestamps would improve the
situation for NFS, but that becomes rather expensive, as the underlying
filesystem will have to log a lot more metadata updates.
What we need is a way to only use fine-grained timestamps when they are
being actively queried:
Whenever the mtime changes, the ctime must also change since we're
changing the metadata. When a superblock has a s_time_gran >1, we can
use the lowest-order bit of the inode->i_ctime as a flag to indicate
that the value has been queried. Then on the next write, we'll fetch a
fine-grained timestamp instead of the usual coarse-grained one.
We could enable this for any filesystem that has a s_time_gran >1, but
for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
to opt-in to this behavior.
It then adds a new current_ctime function that acts like the
current_time helper, but will conditionally grab fine-grained timestamps
when the flag is set in the current ctime. Also, there is a new
generic_fill_multigrain_cmtime for grabbing the c/mtime out of the inode
and atomically marking the ctime as queried.
Later patches will convert filesystems over to this new scheme.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
fs/inode.c | 57 +++++++++++++++++++++++++++++++++++++++---
fs/stat.c | 24 ++++++++++++++++++
include/linux/fs.h | 62 ++++++++++++++++++++++++++++++++--------------
3 files changed, 121 insertions(+), 22 deletions(-)
diff --git a/fs/inode.c b/fs/inode.c
index 4558dc2f1355..4bd11bdb46d4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2030,6 +2030,7 @@ EXPORT_SYMBOL(file_remove_privs);
static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
{
int sync_it = 0;
+ struct timespec64 ctime = inode->i_ctime;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
@@ -2038,7 +2039,9 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
if (!timespec64_equal(&inode->i_mtime, now))
sync_it = S_MTIME;
- if (!timespec64_equal(&inode->i_ctime, now))
+ if (is_multigrain_ts(inode))
+ ctime.tv_nsec &= ~I_CTIME_QUERIED;
+ if (!timespec64_equal(&ctime, now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2062,6 +2065,50 @@ static int __file_update_time(struct file *file, struct timespec64 *now,
return ret;
}
+/**
+ * current_ctime - Return FS time (possibly high-res)
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs, as suitable for a ctime/mtime change.
+ *
+ * For a multigrain timestamp, if the timestamp is flagged as having been
+ * QUERIED, then get a fine-grained timestamp.
+ */
+struct timespec64 current_ctime(struct inode *inode)
+{
+ struct timespec64 now;
+ long nsec = 0;
+ bool multigrain = is_multigrain_ts(inode);
+
+ if (multigrain) {
+ atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
+
+ nsec = atomic_long_fetch_and(~I_CTIME_QUERIED, pnsec);
+ }
+
+ if (nsec & I_CTIME_QUERIED) {
+ ktime_get_real_ts64(&now);
+ } else {
+ ktime_get_coarse_real_ts64(&now);
+
+ if (multigrain) {
+ /*
+ * If we've recently fetched a fine-grained timestamp
+ * then the coarse-grained one may be earlier than the
+ * existing one. Just keep the existing ctime if so.
+ */
+ struct timespec64 ctime = inode->i_ctime;
+
+ if (timespec64_compare(&ctime, &now) > 0)
+ now = ctime;
+ }
+ }
+
+ return timestamp_truncate(now, inode);
+}
+EXPORT_SYMBOL(current_ctime);
+
/**
* file_update_time - update mtime and ctime time
* @file: file accessed
@@ -2080,7 +2127,7 @@ int file_update_time(struct file *file)
{
int ret;
struct inode *inode = file_inode(file);
- struct timespec64 now = current_time(inode);
+ struct timespec64 now = current_ctime(inode);
ret = inode_needs_update_time(inode, &now);
if (ret <= 0)
@@ -2109,7 +2156,7 @@ static int file_modified_flags(struct file *file, int flags)
{
int ret;
struct inode *inode = file_inode(file);
- struct timespec64 now = current_time(inode);
+ struct timespec64 now = current_ctime(inode);
/*
* Clear the security bits if the process is not being run by root.
@@ -2419,9 +2466,11 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
t.tv_nsec = 0;
- /* Avoid division in the common cases 1 ns and 1 s. */
+ /* Avoid division in the common cases 1 ns, 2 ns and 1 s. */
if (gran == 1)
; /* nothing */
+ else if (gran == 2)
+ t.tv_nsec &= ~1L;
else if (gran == NSEC_PER_SEC)
t.tv_nsec = 0;
else if (gran > 1 && gran < NSEC_PER_SEC)
diff --git a/fs/stat.c b/fs/stat.c
index 7c238da22ef0..67b56daf9663 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -26,6 +26,30 @@
#include "internal.h"
#include "mount.h"
+/**
+ * generic_fill_multigrain_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
+ * @inode: inode from which to grab the c/mtime
+ * @stat: where to store the resulting values
+ *
+ * Given @inode, grab the ctime and mtime out if it and store the result
+ * in @stat. When fetching the value, flag it as queried so the next write
+ * will use a fine-grained timestamp.
+ */
+void generic_fill_multigrain_cmtime(struct inode *inode, struct kstat *stat)
+{
+ atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
+
+ stat->mtime = inode->i_mtime;
+ stat->ctime.tv_sec = inode->i_ctime.tv_sec;
+ /*
+ * Atomically set the QUERIED flag and fetch the new value with
+ * the flag masked off.
+ */
+ stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec)
+ & ~I_CTIME_QUERIED;
+}
+EXPORT_SYMBOL(generic_fill_multigrain_cmtime);
+
/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @idmap: idmap of the mount the inode was found from
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c85916e9f7db..e6dd3ce051ef 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1059,21 +1059,22 @@ extern int send_sigurg(struct fown_struct *fown);
* sb->s_flags. Note that these mirror the equivalent MS_* flags where
* represented in both.
*/
-#define SB_RDONLY 1 /* Mount read-only */
-#define SB_NOSUID 2 /* Ignore suid and sgid bits */
-#define SB_NODEV 4 /* Disallow access to device special files */
-#define SB_NOEXEC 8 /* Disallow program execution */
-#define SB_SYNCHRONOUS 16 /* Writes are synced at once */
-#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */
-#define SB_DIRSYNC 128 /* Directory modifications are synchronous */
-#define SB_NOATIME 1024 /* Do not update access times. */
-#define SB_NODIRATIME 2048 /* Do not update directory access times */
-#define SB_SILENT 32768
-#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
-#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */
-#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
-#define SB_I_VERSION (1<<23) /* Update inode I_version field */
-#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
+#define SB_RDONLY (1<<0) /* Mount read-only */
+#define SB_NOSUID (1<<1) /* Ignore suid and sgid bits */
+#define SB_NODEV (1<<2) /* Disallow access to device special files */
+#define SB_NOEXEC (1<<3) /* Disallow program execution */
+#define SB_SYNCHRONOUS (1<<4) /* Writes are synced at once */
+#define SB_MANDLOCK (1<<6) /* Allow mandatory locks on an FS */
+#define SB_DIRSYNC (1<<7) /* Directory modifications are synchronous */
+#define SB_NOATIME (1<<10) /* Do not update access times. */
+#define SB_NODIRATIME (1<<11) /* Do not update directory access times */
+#define SB_SILENT (1<<15)
+#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
+#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */
+#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
+#define SB_I_VERSION (1<<23) /* Update inode I_version field */
+#define SB_MULTIGRAIN_TS (1<<24) /* Use multigrain c/mtimes */
+#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
/* These sb flags are internal to the kernel */
#define SB_SUBMOUNT (1<<26)
@@ -1457,7 +1458,8 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
kgid_has_mapping(fs_userns, kgid);
}
-extern struct timespec64 current_time(struct inode *inode);
+struct timespec64 current_time(struct inode *inode);
+struct timespec64 current_ctime(struct inode *inode);
/*
* Snapshotting support.
@@ -2171,8 +2173,31 @@ enum file_time_flags {
S_VERSION = 8,
};
-extern bool atime_needs_update(const struct path *, struct inode *);
-extern void touch_atime(const struct path *);
+/*
+ * Multigrain timestamps
+ *
+ * Conditionally use fine-grained ctime and mtime timestamps
+ *
+ * When s_time_gran is >1, and SB_MULTIGRAIN_TS is set, use the lowest-order bit
+ * in the tv_nsec field as a flag to indicate that the value was recently queried
+ * and that the next update should use a fine-grained timestamp.
+ */
+#define I_CTIME_QUERIED 1L
+
+static inline bool is_multigrain_ts(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ /*
+ * Warn if someone sets SB_MULTIGRAIN_TS, but doesn't turn down the ts
+ * granularity.
+ */
+ return (sb->s_flags & SB_MULTIGRAIN_TS) &&
+ !WARN_ON_ONCE(sb->s_time_gran == 1);
+}
+
+bool atime_needs_update(const struct path *, struct inode *);
+void touch_atime(const struct path *);
int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
static inline void file_accessed(struct file *file)
@@ -2838,6 +2863,7 @@ extern void page_put_link(void *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
+void generic_fill_multigrain_cmtime(struct inode *inode, struct kstat *stat);
void generic_fillattr(struct mnt_idmap *, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
--
2.40.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-24 15:11 ` [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime Jeff Layton
@ 2023-04-24 21:47 ` NeilBrown
2023-04-24 22:30 ` Jeff Layton
2023-04-26 6:53 ` Christian Brauner
2023-04-26 7:07 ` Christian Brauner
2 siblings, 1 reply; 18+ messages in thread
From: NeilBrown @ 2023-04-24 21:47 UTC (permalink / raw)
To: Jeff Layton
Cc: Alexander Viro, Christian Brauner, Darrick J. Wong, Hugh Dickins,
Andrew Morton, Dave Chinner, Chuck Lever, Jan Kara,
Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs, linux-mm,
linux-nfs
On Tue, 25 Apr 2023, Jeff Layton wrote:
> The VFS always uses coarse-grained timestamp updates for filling out the
> ctime and mtime after a change. This has the benefit of allowing
> filesystems to optimize away a lot metaupdates, to around once per
> jiffy, even when a file is under heavy writes.
>
> Unfortunately, this has always been an issue when we're exporting via
> NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> lot of exported filesystems don't properly support a change attribute
> and are subject to the same problems with timestamp granularity. Other
> applications have similar issues (e.g backup applications).
>
> Switching to always using fine-grained timestamps would improve the
> situation for NFS, but that becomes rather expensive, as the underlying
> filesystem will have to log a lot more metadata updates.
>
> What we need is a way to only use fine-grained timestamps when they are
> being actively queried:
>
> Whenever the mtime changes, the ctime must also change since we're
> changing the metadata. When a superblock has a s_time_gran >1, we can
> use the lowest-order bit of the inode->i_ctime as a flag to indicate
> that the value has been queried. Then on the next write, we'll fetch a
> fine-grained timestamp instead of the usual coarse-grained one.
This assumes that any s_time_gran value greater then 1, is even. This is
currently true in practice (it is always a power of 10 I think).
But should we have a WARN_ON_ONCE() somewhere just in case?
>
> We could enable this for any filesystem that has a s_time_gran >1, but
> for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> to opt-in to this behavior.
>
> It then adds a new current_ctime function that acts like the
> current_time helper, but will conditionally grab fine-grained timestamps
> when the flag is set in the current ctime. Also, there is a new
> generic_fill_multigrain_cmtime for grabbing the c/mtime out of the inode
> and atomically marking the ctime as queried.
>
> Later patches will convert filesystems over to this new scheme.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
> fs/inode.c | 57 +++++++++++++++++++++++++++++++++++++++---
> fs/stat.c | 24 ++++++++++++++++++
> include/linux/fs.h | 62 ++++++++++++++++++++++++++++++++--------------
> 3 files changed, 121 insertions(+), 22 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 4558dc2f1355..4bd11bdb46d4 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -2030,6 +2030,7 @@ EXPORT_SYMBOL(file_remove_privs);
> static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> {
> int sync_it = 0;
> + struct timespec64 ctime = inode->i_ctime;
>
> /* First try to exhaust all avenues to not sync */
> if (IS_NOCMTIME(inode))
> @@ -2038,7 +2039,9 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> if (!timespec64_equal(&inode->i_mtime, now))
> sync_it = S_MTIME;
>
> - if (!timespec64_equal(&inode->i_ctime, now))
> + if (is_multigrain_ts(inode))
> + ctime.tv_nsec &= ~I_CTIME_QUERIED;
> + if (!timespec64_equal(&ctime, now))
> sync_it |= S_CTIME;
>
> if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
> @@ -2062,6 +2065,50 @@ static int __file_update_time(struct file *file, struct timespec64 *now,
> return ret;
> }
>
> +/**
> + * current_ctime - Return FS time (possibly high-res)
> + * @inode: inode.
> + *
> + * Return the current time truncated to the time granularity supported by
> + * the fs, as suitable for a ctime/mtime change.
> + *
> + * For a multigrain timestamp, if the timestamp is flagged as having been
> + * QUERIED, then get a fine-grained timestamp.
> + */
> +struct timespec64 current_ctime(struct inode *inode)
> +{
> + struct timespec64 now;
> + long nsec = 0;
> + bool multigrain = is_multigrain_ts(inode);
> +
> + if (multigrain) {
> + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> +
> + nsec = atomic_long_fetch_and(~I_CTIME_QUERIED, pnsec);
atomic_long_fetch_andnot(I_CTIME_QUERIED, pnsec) ??
> + }
> +
> + if (nsec & I_CTIME_QUERIED) {
> + ktime_get_real_ts64(&now);
> + } else {
> + ktime_get_coarse_real_ts64(&now);
> +
> + if (multigrain) {
> + /*
> + * If we've recently fetched a fine-grained timestamp
> + * then the coarse-grained one may be earlier than the
> + * existing one. Just keep the existing ctime if so.
> + */
> + struct timespec64 ctime = inode->i_ctime;
> +
> + if (timespec64_compare(&ctime, &now) > 0)
> + now = ctime;
I think this ctime could have the I_CTIME_QUERIED bit set. We probably
don't want that ??
> + }
> + }
> +
> + return timestamp_truncate(now, inode);
> +}
> +EXPORT_SYMBOL(current_ctime);
> +
> /**
> * file_update_time - update mtime and ctime time
> * @file: file accessed
> @@ -2080,7 +2127,7 @@ int file_update_time(struct file *file)
> {
> int ret;
> struct inode *inode = file_inode(file);
> - struct timespec64 now = current_time(inode);
> + struct timespec64 now = current_ctime(inode);
>
> ret = inode_needs_update_time(inode, &now);
> if (ret <= 0)
> @@ -2109,7 +2156,7 @@ static int file_modified_flags(struct file *file, int flags)
> {
> int ret;
> struct inode *inode = file_inode(file);
> - struct timespec64 now = current_time(inode);
> + struct timespec64 now = current_ctime(inode);
>
> /*
> * Clear the security bits if the process is not being run by root.
> @@ -2419,9 +2466,11 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
> if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
> t.tv_nsec = 0;
>
> - /* Avoid division in the common cases 1 ns and 1 s. */
> + /* Avoid division in the common cases 1 ns, 2 ns and 1 s. */
> if (gran == 1)
> ; /* nothing */
> + else if (gran == 2)
> + t.tv_nsec &= ~1L;
> else if (gran == NSEC_PER_SEC)
> t.tv_nsec = 0;
> else if (gran > 1 && gran < NSEC_PER_SEC)
> diff --git a/fs/stat.c b/fs/stat.c
> index 7c238da22ef0..67b56daf9663 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -26,6 +26,30 @@
> #include "internal.h"
> #include "mount.h"
>
> +/**
> + * generic_fill_multigrain_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
> + * @inode: inode from which to grab the c/mtime
> + * @stat: where to store the resulting values
> + *
> + * Given @inode, grab the ctime and mtime out if it and store the result
> + * in @stat. When fetching the value, flag it as queried so the next write
> + * will use a fine-grained timestamp.
> + */
> +void generic_fill_multigrain_cmtime(struct inode *inode, struct kstat *stat)
> +{
> + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> +
> + stat->mtime = inode->i_mtime;
> + stat->ctime.tv_sec = inode->i_ctime.tv_sec;
> + /*
> + * Atomically set the QUERIED flag and fetch the new value with
> + * the flag masked off.
> + */
> + stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec)
> + & ~I_CTIME_QUERIED;
> +}
> +EXPORT_SYMBOL(generic_fill_multigrain_cmtime);
> +
> /**
> * generic_fillattr - Fill in the basic attributes from the inode struct
> * @idmap: idmap of the mount the inode was found from
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c85916e9f7db..e6dd3ce051ef 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1059,21 +1059,22 @@ extern int send_sigurg(struct fown_struct *fown);
> * sb->s_flags. Note that these mirror the equivalent MS_* flags where
> * represented in both.
> */
> -#define SB_RDONLY 1 /* Mount read-only */
> -#define SB_NOSUID 2 /* Ignore suid and sgid bits */
> -#define SB_NODEV 4 /* Disallow access to device special files */
> -#define SB_NOEXEC 8 /* Disallow program execution */
> -#define SB_SYNCHRONOUS 16 /* Writes are synced at once */
> -#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */
> -#define SB_DIRSYNC 128 /* Directory modifications are synchronous */
> -#define SB_NOATIME 1024 /* Do not update access times. */
> -#define SB_NODIRATIME 2048 /* Do not update directory access times */
> -#define SB_SILENT 32768
> -#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
> -#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */
> -#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
> -#define SB_I_VERSION (1<<23) /* Update inode I_version field */
> -#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> +#define SB_RDONLY (1<<0) /* Mount read-only */
BIT(0) ???
> +#define SB_NOSUID (1<<1) /* Ignore suid and sgid bits */
BIT(1) ??
> +#define SB_NODEV (1<<2) /* Disallow access to device special files */
> +#define SB_NOEXEC (1<<3) /* Disallow program execution */
> +#define SB_SYNCHRONOUS (1<<4) /* Writes are synced at once */
> +#define SB_MANDLOCK (1<<6) /* Allow mandatory locks on an FS */
> +#define SB_DIRSYNC (1<<7) /* Directory modifications are synchronous */
> +#define SB_NOATIME (1<<10) /* Do not update access times. */
> +#define SB_NODIRATIME (1<<11) /* Do not update directory access times */
> +#define SB_SILENT (1<<15)
> +#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
> +#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */
> +#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
> +#define SB_I_VERSION (1<<23) /* Update inode I_version field */
> +#define SB_MULTIGRAIN_TS (1<<24) /* Use multigrain c/mtimes */
> +#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
>
> /* These sb flags are internal to the kernel */
> #define SB_SUBMOUNT (1<<26)
Why not align this one too?
> @@ -1457,7 +1458,8 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
> kgid_has_mapping(fs_userns, kgid);
> }
>
> -extern struct timespec64 current_time(struct inode *inode);
> +struct timespec64 current_time(struct inode *inode);
> +struct timespec64 current_ctime(struct inode *inode);
>
> /*
> * Snapshotting support.
> @@ -2171,8 +2173,31 @@ enum file_time_flags {
> S_VERSION = 8,
> };
>
> -extern bool atime_needs_update(const struct path *, struct inode *);
> -extern void touch_atime(const struct path *);
> +/*
> + * Multigrain timestamps
> + *
> + * Conditionally use fine-grained ctime and mtime timestamps
> + *
> + * When s_time_gran is >1, and SB_MULTIGRAIN_TS is set, use the lowest-order bit
> + * in the tv_nsec field as a flag to indicate that the value was recently queried
> + * and that the next update should use a fine-grained timestamp.
> + */
> +#define I_CTIME_QUERIED 1L
> +
> +static inline bool is_multigrain_ts(struct inode *inode)
> +{
> + struct super_block *sb = inode->i_sb;
> +
> + /*
> + * Warn if someone sets SB_MULTIGRAIN_TS, but doesn't turn down the ts
> + * granularity.
> + */
> + return (sb->s_flags & SB_MULTIGRAIN_TS) &&
> + !WARN_ON_ONCE(sb->s_time_gran == 1);
Maybe
!WARN_ON_ONCE(sb->s_time_gran & SB_MULTIGRAIN_TS);
??
> +}
> +
> +bool atime_needs_update(const struct path *, struct inode *);
> +void touch_atime(const struct path *);
> int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
>
> static inline void file_accessed(struct file *file)
> @@ -2838,6 +2863,7 @@ extern void page_put_link(void *);
> extern int page_symlink(struct inode *inode, const char *symname, int len);
> extern const struct inode_operations page_symlink_inode_operations;
> extern void kfree_link(void *);
> +void generic_fill_multigrain_cmtime(struct inode *inode, struct kstat *stat);
> void generic_fillattr(struct mnt_idmap *, struct inode *, struct kstat *);
> void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
> extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
> --
> 2.40.0
>
>
Looks generally sensible, thanks!
NeilBrown
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-24 21:47 ` NeilBrown
@ 2023-04-24 22:30 ` Jeff Layton
2023-04-24 22:40 ` NeilBrown
2023-04-26 6:41 ` Christian Brauner
0 siblings, 2 replies; 18+ messages in thread
From: Jeff Layton @ 2023-04-24 22:30 UTC (permalink / raw)
To: NeilBrown
Cc: Alexander Viro, Christian Brauner, Darrick J. Wong, Hugh Dickins,
Andrew Morton, Dave Chinner, Chuck Lever, Jan Kara,
Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs, linux-mm,
linux-nfs
On Tue, 2023-04-25 at 07:47 +1000, NeilBrown wrote:
> On Tue, 25 Apr 2023, Jeff Layton wrote:
> > The VFS always uses coarse-grained timestamp updates for filling out the
> > ctime and mtime after a change. This has the benefit of allowing
> > filesystems to optimize away a lot metaupdates, to around once per
> > jiffy, even when a file is under heavy writes.
> >
> > Unfortunately, this has always been an issue when we're exporting via
> > NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> > lot of exported filesystems don't properly support a change attribute
> > and are subject to the same problems with timestamp granularity. Other
> > applications have similar issues (e.g backup applications).
> >
> > Switching to always using fine-grained timestamps would improve the
> > situation for NFS, but that becomes rather expensive, as the underlying
> > filesystem will have to log a lot more metadata updates.
> >
> > What we need is a way to only use fine-grained timestamps when they are
> > being actively queried:
> >
> > Whenever the mtime changes, the ctime must also change since we're
> > changing the metadata. When a superblock has a s_time_gran >1, we can
> > use the lowest-order bit of the inode->i_ctime as a flag to indicate
> > that the value has been queried. Then on the next write, we'll fetch a
> > fine-grained timestamp instead of the usual coarse-grained one.
>
> This assumes that any s_time_gran value greater then 1, is even. This is
> currently true in practice (it is always a power of 10 I think).
> But should we have a WARN_ON_ONCE() somewhere just in case?
>
> >
> > We could enable this for any filesystem that has a s_time_gran >1, but
> > for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> > to opt-in to this behavior.
> >
> > It then adds a new current_ctime function that acts like the
> > current_time helper, but will conditionally grab fine-grained timestamps
> > when the flag is set in the current ctime. Also, there is a new
> > generic_fill_multigrain_cmtime for grabbing the c/mtime out of the inode
> > and atomically marking the ctime as queried.
> >
> > Later patches will convert filesystems over to this new scheme.
> >
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> > fs/inode.c | 57 +++++++++++++++++++++++++++++++++++++++---
> > fs/stat.c | 24 ++++++++++++++++++
> > include/linux/fs.h | 62 ++++++++++++++++++++++++++++++++--------------
> > 3 files changed, 121 insertions(+), 22 deletions(-)
> >
> > diff --git a/fs/inode.c b/fs/inode.c
> > index 4558dc2f1355..4bd11bdb46d4 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -2030,6 +2030,7 @@ EXPORT_SYMBOL(file_remove_privs);
> > static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> > {
> > int sync_it = 0;
> > + struct timespec64 ctime = inode->i_ctime;
> >
> > /* First try to exhaust all avenues to not sync */
> > if (IS_NOCMTIME(inode))
> > @@ -2038,7 +2039,9 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> > if (!timespec64_equal(&inode->i_mtime, now))
> > sync_it = S_MTIME;
> >
> > - if (!timespec64_equal(&inode->i_ctime, now))
> > + if (is_multigrain_ts(inode))
> > + ctime.tv_nsec &= ~I_CTIME_QUERIED;
> > + if (!timespec64_equal(&ctime, now))
> > sync_it |= S_CTIME;
> >
> > if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
> > @@ -2062,6 +2065,50 @@ static int __file_update_time(struct file *file, struct timespec64 *now,
> > return ret;
> > }
> >
> > +/**
> > + * current_ctime - Return FS time (possibly high-res)
> > + * @inode: inode.
> > + *
> > + * Return the current time truncated to the time granularity supported by
> > + * the fs, as suitable for a ctime/mtime change.
> > + *
> > + * For a multigrain timestamp, if the timestamp is flagged as having been
> > + * QUERIED, then get a fine-grained timestamp.
> > + */
> > +struct timespec64 current_ctime(struct inode *inode)
> > +{
> > + struct timespec64 now;
> > + long nsec = 0;
> > + bool multigrain = is_multigrain_ts(inode);
> > +
> > + if (multigrain) {
> > + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> > +
> > + nsec = atomic_long_fetch_and(~I_CTIME_QUERIED, pnsec);
>
> atomic_long_fetch_andnot(I_CTIME_QUERIED, pnsec) ??
>
I didn't realize that existed! Sure, I can make that change.
> > + }
> > +
> > + if (nsec & I_CTIME_QUERIED) {
> > + ktime_get_real_ts64(&now);
> > + } else {
> > + ktime_get_coarse_real_ts64(&now);
> > +
> > + if (multigrain) {
> > + /*
> > + * If we've recently fetched a fine-grained timestamp
> > + * then the coarse-grained one may be earlier than the
> > + * existing one. Just keep the existing ctime if so.
> > + */
> > + struct timespec64 ctime = inode->i_ctime;
> > +
> > + if (timespec64_compare(&ctime, &now) > 0)
> > + now = ctime;
>
> I think this ctime could have the I_CTIME_QUERIED bit set. We probably
> don't want that ??
>
>
The timestamp_truncate below will take care of it.
> > + }
> > + }
> > +
> > + return timestamp_truncate(now, inode);
> > +}
> > +EXPORT_SYMBOL(current_ctime);
> > +
> > /**
> > * file_update_time - update mtime and ctime time
> > * @file: file accessed
> > @@ -2080,7 +2127,7 @@ int file_update_time(struct file *file)
> > {
> > int ret;
> > struct inode *inode = file_inode(file);
> > - struct timespec64 now = current_time(inode);
> > + struct timespec64 now = current_ctime(inode);
> >
> > ret = inode_needs_update_time(inode, &now);
> > if (ret <= 0)
> > @@ -2109,7 +2156,7 @@ static int file_modified_flags(struct file *file, int flags)
> > {
> > int ret;
> > struct inode *inode = file_inode(file);
> > - struct timespec64 now = current_time(inode);
> > + struct timespec64 now = current_ctime(inode);
> >
> > /*
> > * Clear the security bits if the process is not being run by root.
> > @@ -2419,9 +2466,11 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
> > if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
> > t.tv_nsec = 0;
> >
> > - /* Avoid division in the common cases 1 ns and 1 s. */
> > + /* Avoid division in the common cases 1 ns, 2 ns and 1 s. */
> > if (gran == 1)
> > ; /* nothing */
> > + else if (gran == 2)
> > + t.tv_nsec &= ~1L;
> > else if (gran == NSEC_PER_SEC)
> > t.tv_nsec = 0;
> > else if (gran > 1 && gran < NSEC_PER_SEC)
> > diff --git a/fs/stat.c b/fs/stat.c
> > index 7c238da22ef0..67b56daf9663 100644
> > --- a/fs/stat.c
> > +++ b/fs/stat.c
> > @@ -26,6 +26,30 @@
> > #include "internal.h"
> > #include "mount.h"
> >
> > +/**
> > + * generic_fill_multigrain_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
> > + * @inode: inode from which to grab the c/mtime
> > + * @stat: where to store the resulting values
> > + *
> > + * Given @inode, grab the ctime and mtime out if it and store the result
> > + * in @stat. When fetching the value, flag it as queried so the next write
> > + * will use a fine-grained timestamp.
> > + */
> > +void generic_fill_multigrain_cmtime(struct inode *inode, struct kstat *stat)
> > +{
> > + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> > +
> > + stat->mtime = inode->i_mtime;
> > + stat->ctime.tv_sec = inode->i_ctime.tv_sec;
> > + /*
> > + * Atomically set the QUERIED flag and fetch the new value with
> > + * the flag masked off.
> > + */
> > + stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec)
> > + & ~I_CTIME_QUERIED;
> > +}
> > +EXPORT_SYMBOL(generic_fill_multigrain_cmtime);
> > +
> > /**
> > * generic_fillattr - Fill in the basic attributes from the inode struct
> > * @idmap: idmap of the mount the inode was found from
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index c85916e9f7db..e6dd3ce051ef 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -1059,21 +1059,22 @@ extern int send_sigurg(struct fown_struct *fown);
> > * sb->s_flags. Note that these mirror the equivalent MS_* flags where
> > * represented in both.
> > */
> > -#define SB_RDONLY 1 /* Mount read-only */
> > -#define SB_NOSUID 2 /* Ignore suid and sgid bits */
> > -#define SB_NODEV 4 /* Disallow access to device special files */
> > -#define SB_NOEXEC 8 /* Disallow program execution */
> > -#define SB_SYNCHRONOUS 16 /* Writes are synced at once */
> > -#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */
> > -#define SB_DIRSYNC 128 /* Directory modifications are synchronous */
> > -#define SB_NOATIME 1024 /* Do not update access times. */
> > -#define SB_NODIRATIME 2048 /* Do not update directory access times */
> > -#define SB_SILENT 32768
> > -#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
> > -#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */
> > -#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
> > -#define SB_I_VERSION (1<<23) /* Update inode I_version field */
> > -#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > +#define SB_RDONLY (1<<0) /* Mount read-only */
>
> BIT(0) ???
>
Even better. I'll revise it.
> > +#define SB_NOSUID (1<<1) /* Ignore suid and sgid bits */
>
> BIT(1) ??
>
> > +#define SB_NODEV (1<<2) /* Disallow access to device special files */
> > +#define SB_NOEXEC (1<<3) /* Disallow program execution */
> > +#define SB_SYNCHRONOUS (1<<4) /* Writes are synced at once */
> > +#define SB_MANDLOCK (1<<6) /* Allow mandatory locks on an FS */
> > +#define SB_DIRSYNC (1<<7) /* Directory modifications are synchronous */
> > +#define SB_NOATIME (1<<10) /* Do not update access times. */
> > +#define SB_NODIRATIME (1<<11) /* Do not update directory access times */
> > +#define SB_SILENT (1<<15)
> > +#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
> > +#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */
> > +#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
> > +#define SB_I_VERSION (1<<23) /* Update inode I_version field */
> > +#define SB_MULTIGRAIN_TS (1<<24) /* Use multigrain c/mtimes */
> > +#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> >
> > /* These sb flags are internal to the kernel */
> > #define SB_SUBMOUNT (1<<26)
>
> Why not align this one too?
>
Sure. I'll add that in for the next one.
> > @@ -1457,7 +1458,8 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
> > kgid_has_mapping(fs_userns, kgid);
> > }
> >
> > -extern struct timespec64 current_time(struct inode *inode);
> > +struct timespec64 current_time(struct inode *inode);
> > +struct timespec64 current_ctime(struct inode *inode);
> >
> > /*
> > * Snapshotting support.
> > @@ -2171,8 +2173,31 @@ enum file_time_flags {
> > S_VERSION = 8,
> > };
> >
> > -extern bool atime_needs_update(const struct path *, struct inode *);
> > -extern void touch_atime(const struct path *);
> > +/*
> > + * Multigrain timestamps
> > + *
> > + * Conditionally use fine-grained ctime and mtime timestamps
> > + *
> > + * When s_time_gran is >1, and SB_MULTIGRAIN_TS is set, use the lowest-order bit
> > + * in the tv_nsec field as a flag to indicate that the value was recently queried
> > + * and that the next update should use a fine-grained timestamp.
> > + */
> > +#define I_CTIME_QUERIED 1L
> > +
> > +static inline bool is_multigrain_ts(struct inode *inode)
> > +{
> > + struct super_block *sb = inode->i_sb;
> > +
> > + /*
> > + * Warn if someone sets SB_MULTIGRAIN_TS, but doesn't turn down the ts
> > + * granularity.
> > + */
> > + return (sb->s_flags & SB_MULTIGRAIN_TS) &&
> > + !WARN_ON_ONCE(sb->s_time_gran == 1);
>
> Maybe
> !WARN_ON_ONCE(sb->s_time_gran & SB_MULTIGRAIN_TS);
> ??
>
I'm not sure I understand what you mean here. We want to check whether
SB_MULTIGRAIN_TS is set in the flags, and that s_time_gran > 1. The
latter is required so that we have space for the I_CTIME_QUERIED flag.
If SB_MULTIGRAIN_TS is set, but the s_time_gran is too low, we want to
throw a warning (since something is clearly wrong).
> > +}
> > +
> > +bool atime_needs_update(const struct path *, struct inode *);
> > +void touch_atime(const struct path *);
> > int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
> >
> > static inline void file_accessed(struct file *file)
> > @@ -2838,6 +2863,7 @@ extern void page_put_link(void *);
> > extern int page_symlink(struct inode *inode, const char *symname, int len);
> > extern const struct inode_operations page_symlink_inode_operations;
> > extern void kfree_link(void *);
> > +void generic_fill_multigrain_cmtime(struct inode *inode, struct kstat *stat);
> > void generic_fillattr(struct mnt_idmap *, struct inode *, struct kstat *);
> > void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
> > extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
> > --
> > 2.40.0
> >
> >
>
>
> Looks generally sensible, thanks!
>
Thanks for taking a look! I think this has the potential to fix some
very long standing cache coherency issues in all NFS versions (v3 and
up).
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-24 22:30 ` Jeff Layton
@ 2023-04-24 22:40 ` NeilBrown
2023-04-25 17:45 ` Jeff Layton
2023-04-26 6:41 ` Christian Brauner
1 sibling, 1 reply; 18+ messages in thread
From: NeilBrown @ 2023-04-24 22:40 UTC (permalink / raw)
To: Jeff Layton
Cc: Alexander Viro, Christian Brauner, Darrick J. Wong, Hugh Dickins,
Andrew Morton, Dave Chinner, Chuck Lever, Jan Kara,
Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs, linux-mm,
linux-nfs
On Tue, 25 Apr 2023, Jeff Layton wrote:
> On Tue, 2023-04-25 at 07:47 +1000, NeilBrown wrote:
> > On Tue, 25 Apr 2023, Jeff Layton wrote:
> > > + /*
> > > + * Warn if someone sets SB_MULTIGRAIN_TS, but doesn't turn down the ts
> > > + * granularity.
> > > + */
> > > + return (sb->s_flags & SB_MULTIGRAIN_TS) &&
> > > + !WARN_ON_ONCE(sb->s_time_gran == 1);
> >
> > Maybe
> > !WARN_ON_ONCE(sb->s_time_gran & SB_MULTIGRAIN_TS);
> > ??
> >
>
> I'm not sure I understand what you mean here.
That's fair, as what I wrote didn't make any sense.
I meant to write:
!WARN_ON_ONCE(sb->s_time_gran & I_CTIME_QUERIED);
to make it explicit that s_time_gran must leave space for
I_CTIME_QUERIED to be set (as you write below). Specifically that
s_time_gran mustn't be odd.
> We want to check whether
> SB_MULTIGRAIN_TS is set in the flags, and that s_time_gran > 1. The
> latter is required so that we have space for the I_CTIME_QUERIED flag.
>
> If SB_MULTIGRAIN_TS is set, but the s_time_gran is too low, we want to
> throw a warning (since something is clearly wrong).
>
NeilBrown
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-24 22:40 ` NeilBrown
@ 2023-04-25 17:45 ` Jeff Layton
2023-04-25 17:56 ` Matthew Wilcox
0 siblings, 1 reply; 18+ messages in thread
From: Jeff Layton @ 2023-04-25 17:45 UTC (permalink / raw)
To: NeilBrown
Cc: Alexander Viro, Christian Brauner, Darrick J. Wong, Hugh Dickins,
Andrew Morton, Dave Chinner, Chuck Lever, Jan Kara,
Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs, linux-mm,
linux-nfs
On Tue, 2023-04-25 at 08:40 +1000, NeilBrown wrote:
> On Tue, 25 Apr 2023, Jeff Layton wrote:
> > On Tue, 2023-04-25 at 07:47 +1000, NeilBrown wrote:
> > > On Tue, 25 Apr 2023, Jeff Layton wrote:
> > > > + /*
> > > > + * Warn if someone sets SB_MULTIGRAIN_TS, but doesn't turn down the ts
> > > > + * granularity.
> > > > + */
> > > > + return (sb->s_flags & SB_MULTIGRAIN_TS) &&
> > > > + !WARN_ON_ONCE(sb->s_time_gran == 1);
> > >
> > > Maybe
> > > !WARN_ON_ONCE(sb->s_time_gran & SB_MULTIGRAIN_TS);
> > > ??
> > >
> >
> > I'm not sure I understand what you mean here.
>
> That's fair, as what I wrote didn't make any sense.
> I meant to write:
>
> !WARN_ON_ONCE(sb->s_time_gran & I_CTIME_QUERIED);
>
> to make it explicit that s_time_gran must leave space for
> I_CTIME_QUERIED to be set (as you write below). Specifically that
> s_time_gran mustn't be odd.
>
Erm...it may be an unpopular opinion, but I find that more confusing
than just ensuring that the s_time_gran > 1. I keep wondering if we
might want to carve out other low-order bits too for some later purpose,
at which point trying to check this using flags wouldn't work right. I
think I might just stick with what I have here, at least for now.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-25 17:45 ` Jeff Layton
@ 2023-04-25 17:56 ` Matthew Wilcox
2023-04-25 19:12 ` Jeff Layton
0 siblings, 1 reply; 18+ messages in thread
From: Matthew Wilcox @ 2023-04-25 17:56 UTC (permalink / raw)
To: Jeff Layton
Cc: NeilBrown, Alexander Viro, Christian Brauner, Darrick J. Wong,
Hugh Dickins, Andrew Morton, Dave Chinner, Chuck Lever, Jan Kara,
Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs, linux-mm,
linux-nfs
On Tue, Apr 25, 2023 at 01:45:19PM -0400, Jeff Layton wrote:
> Erm...it may be an unpopular opinion, but I find that more confusing
> than just ensuring that the s_time_gran > 1. I keep wondering if we
> might want to carve out other low-order bits too for some later purpose,
> at which point trying to check this using flags wouldn't work right. I
> think I might just stick with what I have here, at least for now.
But what if I set s_time_gran to 3 or 5? You'd really want a warning
about that.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-25 17:56 ` Matthew Wilcox
@ 2023-04-25 19:12 ` Jeff Layton
0 siblings, 0 replies; 18+ messages in thread
From: Jeff Layton @ 2023-04-25 19:12 UTC (permalink / raw)
To: Matthew Wilcox
Cc: NeilBrown, Alexander Viro, Christian Brauner, Darrick J. Wong,
Hugh Dickins, Andrew Morton, Dave Chinner, Chuck Lever, Jan Kara,
Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs, linux-mm,
linux-nfs
On Tue, 2023-04-25 at 18:56 +0100, Matthew Wilcox wrote:
> On Tue, Apr 25, 2023 at 01:45:19PM -0400, Jeff Layton wrote:
> > Erm...it may be an unpopular opinion, but I find that more confusing
> > than just ensuring that the s_time_gran > 1. I keep wondering if we
> > might want to carve out other low-order bits too for some later purpose,
> > at which point trying to check this using flags wouldn't work right. I
> > think I might just stick with what I have here, at least for now.
>
> But what if I set s_time_gran to 3 or 5? You'd really want a warning
> about that.
Ugh, I hadn't considered that. I don't see anyone that sets an odd
s_time_gran that isn't 1, but OK, good point. I'll change it.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-24 22:30 ` Jeff Layton
2023-04-24 22:40 ` NeilBrown
@ 2023-04-26 6:41 ` Christian Brauner
1 sibling, 0 replies; 18+ messages in thread
From: Christian Brauner @ 2023-04-26 6:41 UTC (permalink / raw)
To: Jeff Layton
Cc: NeilBrown, Alexander Viro, Darrick J. Wong, Hugh Dickins,
Andrew Morton, Dave Chinner, Chuck Lever, Jan Kara,
Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs, linux-mm,
linux-nfs
On Mon, Apr 24, 2023 at 06:30:45PM -0400, Jeff Layton wrote:
> On Tue, 2023-04-25 at 07:47 +1000, NeilBrown wrote:
> > On Tue, 25 Apr 2023, Jeff Layton wrote:
> > > The VFS always uses coarse-grained timestamp updates for filling out the
> > > ctime and mtime after a change. This has the benefit of allowing
> > > filesystems to optimize away a lot metaupdates, to around once per
> > > jiffy, even when a file is under heavy writes.
> > >
> > > Unfortunately, this has always been an issue when we're exporting via
> > > NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> > > lot of exported filesystems don't properly support a change attribute
> > > and are subject to the same problems with timestamp granularity. Other
> > > applications have similar issues (e.g backup applications).
> > >
> > > Switching to always using fine-grained timestamps would improve the
> > > situation for NFS, but that becomes rather expensive, as the underlying
> > > filesystem will have to log a lot more metadata updates.
> > >
> > > What we need is a way to only use fine-grained timestamps when they are
> > > being actively queried:
> > >
> > > Whenever the mtime changes, the ctime must also change since we're
> > > changing the metadata. When a superblock has a s_time_gran >1, we can
> > > use the lowest-order bit of the inode->i_ctime as a flag to indicate
> > > that the value has been queried. Then on the next write, we'll fetch a
> > > fine-grained timestamp instead of the usual coarse-grained one.
> >
> > This assumes that any s_time_gran value greater then 1, is even. This is
> > currently true in practice (it is always a power of 10 I think).
> > But should we have a WARN_ON_ONCE() somewhere just in case?
> >
> > >
> > > We could enable this for any filesystem that has a s_time_gran >1, but
> > > for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> > > to opt-in to this behavior.
> > >
> > > It then adds a new current_ctime function that acts like the
> > > current_time helper, but will conditionally grab fine-grained timestamps
> > > when the flag is set in the current ctime. Also, there is a new
> > > generic_fill_multigrain_cmtime for grabbing the c/mtime out of the inode
> > > and atomically marking the ctime as queried.
> > >
> > > Later patches will convert filesystems over to this new scheme.
> > >
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > > fs/inode.c | 57 +++++++++++++++++++++++++++++++++++++++---
> > > fs/stat.c | 24 ++++++++++++++++++
> > > include/linux/fs.h | 62 ++++++++++++++++++++++++++++++++--------------
> > > 3 files changed, 121 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/fs/inode.c b/fs/inode.c
> > > index 4558dc2f1355..4bd11bdb46d4 100644
> > > --- a/fs/inode.c
> > > +++ b/fs/inode.c
> > > @@ -2030,6 +2030,7 @@ EXPORT_SYMBOL(file_remove_privs);
> > > static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> > > {
> > > int sync_it = 0;
> > > + struct timespec64 ctime = inode->i_ctime;
> > >
> > > /* First try to exhaust all avenues to not sync */
> > > if (IS_NOCMTIME(inode))
> > > @@ -2038,7 +2039,9 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> > > if (!timespec64_equal(&inode->i_mtime, now))
> > > sync_it = S_MTIME;
> > >
> > > - if (!timespec64_equal(&inode->i_ctime, now))
> > > + if (is_multigrain_ts(inode))
> > > + ctime.tv_nsec &= ~I_CTIME_QUERIED;
> > > + if (!timespec64_equal(&ctime, now))
> > > sync_it |= S_CTIME;
> > >
> > > if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
> > > @@ -2062,6 +2065,50 @@ static int __file_update_time(struct file *file, struct timespec64 *now,
> > > return ret;
> > > }
> > >
> > > +/**
> > > + * current_ctime - Return FS time (possibly high-res)
> > > + * @inode: inode.
> > > + *
> > > + * Return the current time truncated to the time granularity supported by
> > > + * the fs, as suitable for a ctime/mtime change.
> > > + *
> > > + * For a multigrain timestamp, if the timestamp is flagged as having been
> > > + * QUERIED, then get a fine-grained timestamp.
> > > + */
> > > +struct timespec64 current_ctime(struct inode *inode)
> > > +{
> > > + struct timespec64 now;
> > > + long nsec = 0;
> > > + bool multigrain = is_multigrain_ts(inode);
> > > +
> > > + if (multigrain) {
> > > + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> > > +
> > > + nsec = atomic_long_fetch_and(~I_CTIME_QUERIED, pnsec);
> >
> > atomic_long_fetch_andnot(I_CTIME_QUERIED, pnsec) ??
> >
>
> I didn't realize that existed! Sure, I can make that change.
>
> > > + }
> > > +
> > > + if (nsec & I_CTIME_QUERIED) {
> > > + ktime_get_real_ts64(&now);
> > > + } else {
> > > + ktime_get_coarse_real_ts64(&now);
> > > +
> > > + if (multigrain) {
> > > + /*
> > > + * If we've recently fetched a fine-grained timestamp
> > > + * then the coarse-grained one may be earlier than the
> > > + * existing one. Just keep the existing ctime if so.
> > > + */
> > > + struct timespec64 ctime = inode->i_ctime;
> > > +
> > > + if (timespec64_compare(&ctime, &now) > 0)
> > > + now = ctime;
> >
> > I think this ctime could have the I_CTIME_QUERIED bit set. We probably
> > don't want that ??
> >
> >
>
> The timestamp_truncate below will take care of it.
>
> > > + }
> > > + }
> > > +
> > > + return timestamp_truncate(now, inode);
> > > +}
> > > +EXPORT_SYMBOL(current_ctime);
> > > +
> > > /**
> > > * file_update_time - update mtime and ctime time
> > > * @file: file accessed
> > > @@ -2080,7 +2127,7 @@ int file_update_time(struct file *file)
> > > {
> > > int ret;
> > > struct inode *inode = file_inode(file);
> > > - struct timespec64 now = current_time(inode);
> > > + struct timespec64 now = current_ctime(inode);
> > >
> > > ret = inode_needs_update_time(inode, &now);
> > > if (ret <= 0)
> > > @@ -2109,7 +2156,7 @@ static int file_modified_flags(struct file *file, int flags)
> > > {
> > > int ret;
> > > struct inode *inode = file_inode(file);
> > > - struct timespec64 now = current_time(inode);
> > > + struct timespec64 now = current_ctime(inode);
> > >
> > > /*
> > > * Clear the security bits if the process is not being run by root.
> > > @@ -2419,9 +2466,11 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
> > > if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
> > > t.tv_nsec = 0;
> > >
> > > - /* Avoid division in the common cases 1 ns and 1 s. */
> > > + /* Avoid division in the common cases 1 ns, 2 ns and 1 s. */
> > > if (gran == 1)
> > > ; /* nothing */
> > > + else if (gran == 2)
> > > + t.tv_nsec &= ~1L;
> > > else if (gran == NSEC_PER_SEC)
> > > t.tv_nsec = 0;
> > > else if (gran > 1 && gran < NSEC_PER_SEC)
> > > diff --git a/fs/stat.c b/fs/stat.c
> > > index 7c238da22ef0..67b56daf9663 100644
> > > --- a/fs/stat.c
> > > +++ b/fs/stat.c
> > > @@ -26,6 +26,30 @@
> > > #include "internal.h"
> > > #include "mount.h"
> > >
> > > +/**
> > > + * generic_fill_multigrain_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
> > > + * @inode: inode from which to grab the c/mtime
> > > + * @stat: where to store the resulting values
> > > + *
> > > + * Given @inode, grab the ctime and mtime out if it and store the result
> > > + * in @stat. When fetching the value, flag it as queried so the next write
> > > + * will use a fine-grained timestamp.
> > > + */
> > > +void generic_fill_multigrain_cmtime(struct inode *inode, struct kstat *stat)
> > > +{
> > > + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> > > +
> > > + stat->mtime = inode->i_mtime;
> > > + stat->ctime.tv_sec = inode->i_ctime.tv_sec;
> > > + /*
> > > + * Atomically set the QUERIED flag and fetch the new value with
> > > + * the flag masked off.
> > > + */
> > > + stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec)
> > > + & ~I_CTIME_QUERIED;
> > > +}
> > > +EXPORT_SYMBOL(generic_fill_multigrain_cmtime);
> > > +
> > > /**
> > > * generic_fillattr - Fill in the basic attributes from the inode struct
> > > * @idmap: idmap of the mount the inode was found from
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index c85916e9f7db..e6dd3ce051ef 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -1059,21 +1059,22 @@ extern int send_sigurg(struct fown_struct *fown);
> > > * sb->s_flags. Note that these mirror the equivalent MS_* flags where
> > > * represented in both.
> > > */
> > > -#define SB_RDONLY 1 /* Mount read-only */
> > > -#define SB_NOSUID 2 /* Ignore suid and sgid bits */
> > > -#define SB_NODEV 4 /* Disallow access to device special files */
> > > -#define SB_NOEXEC 8 /* Disallow program execution */
> > > -#define SB_SYNCHRONOUS 16 /* Writes are synced at once */
> > > -#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */
> > > -#define SB_DIRSYNC 128 /* Directory modifications are synchronous */
> > > -#define SB_NOATIME 1024 /* Do not update access times. */
> > > -#define SB_NODIRATIME 2048 /* Do not update directory access times */
> > > -#define SB_SILENT 32768
> > > -#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
> > > -#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */
> > > -#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
> > > -#define SB_I_VERSION (1<<23) /* Update inode I_version field */
> > > -#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > > +#define SB_RDONLY (1<<0) /* Mount read-only */
> >
> > BIT(0) ???
> >
>
> Even better. I'll revise it.
Please split this and the alignment stuff below into a preparatory
cleanup patch.
>
> > > +#define SB_NOSUID (1<<1) /* Ignore suid and sgid bits */
> >
> > BIT(1) ??
> >
> > > +#define SB_NODEV (1<<2) /* Disallow access to device special files */
> > > +#define SB_NOEXEC (1<<3) /* Disallow program execution */
> > > +#define SB_SYNCHRONOUS (1<<4) /* Writes are synced at once */
> > > +#define SB_MANDLOCK (1<<6) /* Allow mandatory locks on an FS */
> > > +#define SB_DIRSYNC (1<<7) /* Directory modifications are synchronous */
> > > +#define SB_NOATIME (1<<10) /* Do not update access times. */
> > > +#define SB_NODIRATIME (1<<11) /* Do not update directory access times */
> > > +#define SB_SILENT (1<<15)
> > > +#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */
> > > +#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */
> > > +#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */
> > > +#define SB_I_VERSION (1<<23) /* Update inode I_version field */
> > > +#define SB_MULTIGRAIN_TS (1<<24) /* Use multigrain c/mtimes */
> > > +#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */
> > >
> > > /* These sb flags are internal to the kernel */
> > > #define SB_SUBMOUNT (1<<26)
> >
> > Why not align this one too?
> >
>
> Sure. I'll add that in for the next one.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-24 15:11 ` [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime Jeff Layton
2023-04-24 21:47 ` NeilBrown
@ 2023-04-26 6:53 ` Christian Brauner
2023-04-26 9:46 ` Jeff Layton
2023-04-26 7:07 ` Christian Brauner
2 siblings, 1 reply; 18+ messages in thread
From: Christian Brauner @ 2023-04-26 6:53 UTC (permalink / raw)
To: Jeff Layton
Cc: Alexander Viro, Darrick J. Wong, Hugh Dickins, Andrew Morton,
Dave Chinner, Chuck Lever, Jan Kara, Amir Goldstein,
linux-fsdevel, linux-kernel, linux-xfs, linux-mm, linux-nfs
On Mon, Apr 24, 2023 at 11:11:02AM -0400, Jeff Layton wrote:
> The VFS always uses coarse-grained timestamp updates for filling out the
> ctime and mtime after a change. This has the benefit of allowing
> filesystems to optimize away a lot metaupdates, to around once per
> jiffy, even when a file is under heavy writes.
>
> Unfortunately, this has always been an issue when we're exporting via
> NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> lot of exported filesystems don't properly support a change attribute
> and are subject to the same problems with timestamp granularity. Other
> applications have similar issues (e.g backup applications).
>
> Switching to always using fine-grained timestamps would improve the
> situation for NFS, but that becomes rather expensive, as the underlying
> filesystem will have to log a lot more metadata updates.
>
> What we need is a way to only use fine-grained timestamps when they are
> being actively queried:
>
> Whenever the mtime changes, the ctime must also change since we're
> changing the metadata. When a superblock has a s_time_gran >1, we can
> use the lowest-order bit of the inode->i_ctime as a flag to indicate
> that the value has been queried. Then on the next write, we'll fetch a
> fine-grained timestamp instead of the usual coarse-grained one.
>
> We could enable this for any filesystem that has a s_time_gran >1, but
> for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> to opt-in to this behavior.
>
> It then adds a new current_ctime function that acts like the
> current_time helper, but will conditionally grab fine-grained timestamps
> when the flag is set in the current ctime. Also, there is a new
> generic_fill_multigrain_cmtime for grabbing the c/mtime out of the inode
> and atomically marking the ctime as queried.
>
> Later patches will convert filesystems over to this new scheme.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
> fs/inode.c | 57 +++++++++++++++++++++++++++++++++++++++---
> fs/stat.c | 24 ++++++++++++++++++
> include/linux/fs.h | 62 ++++++++++++++++++++++++++++++++--------------
> 3 files changed, 121 insertions(+), 22 deletions(-)
>
> diff --git a/fs/inode.c b/fs/inode.c
> index 4558dc2f1355..4bd11bdb46d4 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -2030,6 +2030,7 @@ EXPORT_SYMBOL(file_remove_privs);
> static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> {
> int sync_it = 0;
> + struct timespec64 ctime = inode->i_ctime;
>
> /* First try to exhaust all avenues to not sync */
> if (IS_NOCMTIME(inode))
> @@ -2038,7 +2039,9 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> if (!timespec64_equal(&inode->i_mtime, now))
> sync_it = S_MTIME;
>
> - if (!timespec64_equal(&inode->i_ctime, now))
> + if (is_multigrain_ts(inode))
> + ctime.tv_nsec &= ~I_CTIME_QUERIED;
> + if (!timespec64_equal(&ctime, now))
> sync_it |= S_CTIME;
>
> if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
> @@ -2062,6 +2065,50 @@ static int __file_update_time(struct file *file, struct timespec64 *now,
> return ret;
> }
>
> +/**
> + * current_ctime - Return FS time (possibly high-res)
> + * @inode: inode.
> + *
> + * Return the current time truncated to the time granularity supported by
> + * the fs, as suitable for a ctime/mtime change.
> + *
> + * For a multigrain timestamp, if the timestamp is flagged as having been
> + * QUERIED, then get a fine-grained timestamp.
> + */
> +struct timespec64 current_ctime(struct inode *inode)
> +{
> + struct timespec64 now;
> + long nsec = 0;
> + bool multigrain = is_multigrain_ts(inode);
> +
> + if (multigrain) {
> + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> +
> + nsec = atomic_long_fetch_and(~I_CTIME_QUERIED, pnsec);
> + }
> +
> + if (nsec & I_CTIME_QUERIED) {
> + ktime_get_real_ts64(&now);
> + } else {
> + ktime_get_coarse_real_ts64(&now);
> +
> + if (multigrain) {
> + /*
> + * If we've recently fetched a fine-grained timestamp
> + * then the coarse-grained one may be earlier than the
> + * existing one. Just keep the existing ctime if so.
> + */
> + struct timespec64 ctime = inode->i_ctime;
> +
> + if (timespec64_compare(&ctime, &now) > 0)
> + now = ctime;
> + }
> + }
> +
> + return timestamp_truncate(now, inode);
> +}
> +EXPORT_SYMBOL(current_ctime);
> +
> /**
> * file_update_time - update mtime and ctime time
> * @file: file accessed
> @@ -2080,7 +2127,7 @@ int file_update_time(struct file *file)
> {
> int ret;
> struct inode *inode = file_inode(file);
> - struct timespec64 now = current_time(inode);
> + struct timespec64 now = current_ctime(inode);
>
> ret = inode_needs_update_time(inode, &now);
> if (ret <= 0)
> @@ -2109,7 +2156,7 @@ static int file_modified_flags(struct file *file, int flags)
> {
> int ret;
> struct inode *inode = file_inode(file);
> - struct timespec64 now = current_time(inode);
> + struct timespec64 now = current_ctime(inode);
>
> /*
> * Clear the security bits if the process is not being run by root.
> @@ -2419,9 +2466,11 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
> if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
> t.tv_nsec = 0;
>
> - /* Avoid division in the common cases 1 ns and 1 s. */
> + /* Avoid division in the common cases 1 ns, 2 ns and 1 s. */
> if (gran == 1)
> ; /* nothing */
> + else if (gran == 2)
> + t.tv_nsec &= ~1L;
Is that trying to mask off I_CTIME_QUERIED?
If so, can we please use that constant as raw constants tend to be
confusing in the long run.
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-26 6:53 ` Christian Brauner
@ 2023-04-26 9:46 ` Jeff Layton
2023-04-27 9:44 ` Christian Brauner
0 siblings, 1 reply; 18+ messages in thread
From: Jeff Layton @ 2023-04-26 9:46 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexander Viro, Darrick J. Wong, Hugh Dickins, Andrew Morton,
Dave Chinner, Chuck Lever, Jan Kara, Amir Goldstein,
linux-fsdevel, linux-kernel, linux-xfs, linux-mm, linux-nfs
On Wed, 2023-04-26 at 08:53 +0200, Christian Brauner wrote:
> On Mon, Apr 24, 2023 at 11:11:02AM -0400, Jeff Layton wrote:
> > The VFS always uses coarse-grained timestamp updates for filling out the
> > ctime and mtime after a change. This has the benefit of allowing
> > filesystems to optimize away a lot metaupdates, to around once per
> > jiffy, even when a file is under heavy writes.
> >
> > Unfortunately, this has always been an issue when we're exporting via
> > NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> > lot of exported filesystems don't properly support a change attribute
> > and are subject to the same problems with timestamp granularity. Other
> > applications have similar issues (e.g backup applications).
> >
> > Switching to always using fine-grained timestamps would improve the
> > situation for NFS, but that becomes rather expensive, as the underlying
> > filesystem will have to log a lot more metadata updates.
> >
> > What we need is a way to only use fine-grained timestamps when they are
> > being actively queried:
> >
> > Whenever the mtime changes, the ctime must also change since we're
> > changing the metadata. When a superblock has a s_time_gran >1, we can
> > use the lowest-order bit of the inode->i_ctime as a flag to indicate
> > that the value has been queried. Then on the next write, we'll fetch a
> > fine-grained timestamp instead of the usual coarse-grained one.
> >
> > We could enable this for any filesystem that has a s_time_gran >1, but
> > for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> > to opt-in to this behavior.
> >
> > It then adds a new current_ctime function that acts like the
> > current_time helper, but will conditionally grab fine-grained timestamps
> > when the flag is set in the current ctime. Also, there is a new
> > generic_fill_multigrain_cmtime for grabbing the c/mtime out of the inode
> > and atomically marking the ctime as queried.
> >
> > Later patches will convert filesystems over to this new scheme.
> >
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> > fs/inode.c | 57 +++++++++++++++++++++++++++++++++++++++---
> > fs/stat.c | 24 ++++++++++++++++++
> > include/linux/fs.h | 62 ++++++++++++++++++++++++++++++++--------------
> > 3 files changed, 121 insertions(+), 22 deletions(-)
> >
> > diff --git a/fs/inode.c b/fs/inode.c
> > index 4558dc2f1355..4bd11bdb46d4 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -2030,6 +2030,7 @@ EXPORT_SYMBOL(file_remove_privs);
> > static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> > {
> > int sync_it = 0;
> > + struct timespec64 ctime = inode->i_ctime;
> >
> > /* First try to exhaust all avenues to not sync */
> > if (IS_NOCMTIME(inode))
> > @@ -2038,7 +2039,9 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> > if (!timespec64_equal(&inode->i_mtime, now))
> > sync_it = S_MTIME;
> >
> > - if (!timespec64_equal(&inode->i_ctime, now))
> > + if (is_multigrain_ts(inode))
> > + ctime.tv_nsec &= ~I_CTIME_QUERIED;
> > + if (!timespec64_equal(&ctime, now))
> > sync_it |= S_CTIME;
> >
> > if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
> > @@ -2062,6 +2065,50 @@ static int __file_update_time(struct file *file, struct timespec64 *now,
> > return ret;
> > }
> >
> > +/**
> > + * current_ctime - Return FS time (possibly high-res)
> > + * @inode: inode.
> > + *
> > + * Return the current time truncated to the time granularity supported by
> > + * the fs, as suitable for a ctime/mtime change.
> > + *
> > + * For a multigrain timestamp, if the timestamp is flagged as having been
> > + * QUERIED, then get a fine-grained timestamp.
> > + */
> > +struct timespec64 current_ctime(struct inode *inode)
> > +{
> > + struct timespec64 now;
> > + long nsec = 0;
> > + bool multigrain = is_multigrain_ts(inode);
> > +
> > + if (multigrain) {
> > + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> > +
> > + nsec = atomic_long_fetch_and(~I_CTIME_QUERIED, pnsec);
> > + }
> > +
> > + if (nsec & I_CTIME_QUERIED) {
> > + ktime_get_real_ts64(&now);
> > + } else {
> > + ktime_get_coarse_real_ts64(&now);
> > +
> > + if (multigrain) {
> > + /*
> > + * If we've recently fetched a fine-grained timestamp
> > + * then the coarse-grained one may be earlier than the
> > + * existing one. Just keep the existing ctime if so.
> > + */
> > + struct timespec64 ctime = inode->i_ctime;
> > +
> > + if (timespec64_compare(&ctime, &now) > 0)
> > + now = ctime;
> > + }
> > + }
> > +
> > + return timestamp_truncate(now, inode);
> > +}
> > +EXPORT_SYMBOL(current_ctime);
> > +
> > /**
> > * file_update_time - update mtime and ctime time
> > * @file: file accessed
> > @@ -2080,7 +2127,7 @@ int file_update_time(struct file *file)
> > {
> > int ret;
> > struct inode *inode = file_inode(file);
> > - struct timespec64 now = current_time(inode);
> > + struct timespec64 now = current_ctime(inode);
> >
> > ret = inode_needs_update_time(inode, &now);
> > if (ret <= 0)
> > @@ -2109,7 +2156,7 @@ static int file_modified_flags(struct file *file, int flags)
> > {
> > int ret;
> > struct inode *inode = file_inode(file);
> > - struct timespec64 now = current_time(inode);
> > + struct timespec64 now = current_ctime(inode);
> >
> > /*
> > * Clear the security bits if the process is not being run by root.
> > @@ -2419,9 +2466,11 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
> > if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
> > t.tv_nsec = 0;
> >
> > - /* Avoid division in the common cases 1 ns and 1 s. */
> > + /* Avoid division in the common cases 1 ns, 2 ns and 1 s. */
> > if (gran == 1)
> > ; /* nothing */
> > + else if (gran == 2)
> > + t.tv_nsec &= ~1L;
>
> Is that trying to mask off I_CTIME_QUERIED?
> If so, can we please use that constant as raw constants tend to be
> confusing in the long run.
Sort of. In principle you could set s_time_gran to 2 without setting
SB_MULTIGRAIN_TS. In that case, would it be correct to use the flag
there?
In any case, I can certainly make it use that constant though if that's
what you'd prefer.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-26 9:46 ` Jeff Layton
@ 2023-04-27 9:44 ` Christian Brauner
0 siblings, 0 replies; 18+ messages in thread
From: Christian Brauner @ 2023-04-27 9:44 UTC (permalink / raw)
To: Jeff Layton
Cc: Alexander Viro, Darrick J. Wong, Hugh Dickins, Andrew Morton,
Dave Chinner, Chuck Lever, Jan Kara, Amir Goldstein,
linux-fsdevel, linux-kernel, linux-xfs, linux-mm, linux-nfs
On Wed, Apr 26, 2023 at 05:46:25AM -0400, Jeff Layton wrote:
> On Wed, 2023-04-26 at 08:53 +0200, Christian Brauner wrote:
> > On Mon, Apr 24, 2023 at 11:11:02AM -0400, Jeff Layton wrote:
> > > The VFS always uses coarse-grained timestamp updates for filling out the
> > > ctime and mtime after a change. This has the benefit of allowing
> > > filesystems to optimize away a lot metaupdates, to around once per
> > > jiffy, even when a file is under heavy writes.
> > >
> > > Unfortunately, this has always been an issue when we're exporting via
> > > NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> > > lot of exported filesystems don't properly support a change attribute
> > > and are subject to the same problems with timestamp granularity. Other
> > > applications have similar issues (e.g backup applications).
> > >
> > > Switching to always using fine-grained timestamps would improve the
> > > situation for NFS, but that becomes rather expensive, as the underlying
> > > filesystem will have to log a lot more metadata updates.
> > >
> > > What we need is a way to only use fine-grained timestamps when they are
> > > being actively queried:
> > >
> > > Whenever the mtime changes, the ctime must also change since we're
> > > changing the metadata. When a superblock has a s_time_gran >1, we can
> > > use the lowest-order bit of the inode->i_ctime as a flag to indicate
> > > that the value has been queried. Then on the next write, we'll fetch a
> > > fine-grained timestamp instead of the usual coarse-grained one.
> > >
> > > We could enable this for any filesystem that has a s_time_gran >1, but
> > > for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> > > to opt-in to this behavior.
> > >
> > > It then adds a new current_ctime function that acts like the
> > > current_time helper, but will conditionally grab fine-grained timestamps
> > > when the flag is set in the current ctime. Also, there is a new
> > > generic_fill_multigrain_cmtime for grabbing the c/mtime out of the inode
> > > and atomically marking the ctime as queried.
> > >
> > > Later patches will convert filesystems over to this new scheme.
> > >
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > > fs/inode.c | 57 +++++++++++++++++++++++++++++++++++++++---
> > > fs/stat.c | 24 ++++++++++++++++++
> > > include/linux/fs.h | 62 ++++++++++++++++++++++++++++++++--------------
> > > 3 files changed, 121 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/fs/inode.c b/fs/inode.c
> > > index 4558dc2f1355..4bd11bdb46d4 100644
> > > --- a/fs/inode.c
> > > +++ b/fs/inode.c
> > > @@ -2030,6 +2030,7 @@ EXPORT_SYMBOL(file_remove_privs);
> > > static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> > > {
> > > int sync_it = 0;
> > > + struct timespec64 ctime = inode->i_ctime;
> > >
> > > /* First try to exhaust all avenues to not sync */
> > > if (IS_NOCMTIME(inode))
> > > @@ -2038,7 +2039,9 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
> > > if (!timespec64_equal(&inode->i_mtime, now))
> > > sync_it = S_MTIME;
> > >
> > > - if (!timespec64_equal(&inode->i_ctime, now))
> > > + if (is_multigrain_ts(inode))
> > > + ctime.tv_nsec &= ~I_CTIME_QUERIED;
> > > + if (!timespec64_equal(&ctime, now))
> > > sync_it |= S_CTIME;
> > >
> > > if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
> > > @@ -2062,6 +2065,50 @@ static int __file_update_time(struct file *file, struct timespec64 *now,
> > > return ret;
> > > }
> > >
> > > +/**
> > > + * current_ctime - Return FS time (possibly high-res)
> > > + * @inode: inode.
> > > + *
> > > + * Return the current time truncated to the time granularity supported by
> > > + * the fs, as suitable for a ctime/mtime change.
> > > + *
> > > + * For a multigrain timestamp, if the timestamp is flagged as having been
> > > + * QUERIED, then get a fine-grained timestamp.
> > > + */
> > > +struct timespec64 current_ctime(struct inode *inode)
> > > +{
> > > + struct timespec64 now;
> > > + long nsec = 0;
> > > + bool multigrain = is_multigrain_ts(inode);
> > > +
> > > + if (multigrain) {
> > > + atomic_long_t *pnsec = (atomic_long_t *)&inode->i_ctime.tv_nsec;
> > > +
> > > + nsec = atomic_long_fetch_and(~I_CTIME_QUERIED, pnsec);
> > > + }
> > > +
> > > + if (nsec & I_CTIME_QUERIED) {
> > > + ktime_get_real_ts64(&now);
> > > + } else {
> > > + ktime_get_coarse_real_ts64(&now);
> > > +
> > > + if (multigrain) {
> > > + /*
> > > + * If we've recently fetched a fine-grained timestamp
> > > + * then the coarse-grained one may be earlier than the
> > > + * existing one. Just keep the existing ctime if so.
> > > + */
> > > + struct timespec64 ctime = inode->i_ctime;
> > > +
> > > + if (timespec64_compare(&ctime, &now) > 0)
> > > + now = ctime;
> > > + }
> > > + }
> > > +
> > > + return timestamp_truncate(now, inode);
> > > +}
> > > +EXPORT_SYMBOL(current_ctime);
> > > +
> > > /**
> > > * file_update_time - update mtime and ctime time
> > > * @file: file accessed
> > > @@ -2080,7 +2127,7 @@ int file_update_time(struct file *file)
> > > {
> > > int ret;
> > > struct inode *inode = file_inode(file);
> > > - struct timespec64 now = current_time(inode);
> > > + struct timespec64 now = current_ctime(inode);
> > >
> > > ret = inode_needs_update_time(inode, &now);
> > > if (ret <= 0)
> > > @@ -2109,7 +2156,7 @@ static int file_modified_flags(struct file *file, int flags)
> > > {
> > > int ret;
> > > struct inode *inode = file_inode(file);
> > > - struct timespec64 now = current_time(inode);
> > > + struct timespec64 now = current_ctime(inode);
> > >
> > > /*
> > > * Clear the security bits if the process is not being run by root.
> > > @@ -2419,9 +2466,11 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
> > > if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min))
> > > t.tv_nsec = 0;
> > >
> > > - /* Avoid division in the common cases 1 ns and 1 s. */
> > > + /* Avoid division in the common cases 1 ns, 2 ns and 1 s. */
> > > if (gran == 1)
> > > ; /* nothing */
> > > + else if (gran == 2)
> > > + t.tv_nsec &= ~1L;
> >
> > Is that trying to mask off I_CTIME_QUERIED?
> > If so, can we please use that constant as raw constants tend to be
> > confusing in the long run.
>
> Sort of. In principle you could set s_time_gran to 2 without setting
> SB_MULTIGRAIN_TS. In that case, would it be correct to use the flag
> there?
Fair, then maybe just leave a comment in there. My main worry is going
back to this later and then staring at this trying to remember what's
happening...
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-24 15:11 ` [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime Jeff Layton
2023-04-24 21:47 ` NeilBrown
2023-04-26 6:53 ` Christian Brauner
@ 2023-04-26 7:07 ` Christian Brauner
2023-04-26 9:48 ` Jeff Layton
2 siblings, 1 reply; 18+ messages in thread
From: Christian Brauner @ 2023-04-26 7:07 UTC (permalink / raw)
To: Jeff Layton
Cc: Alexander Viro, Darrick J. Wong, Hugh Dickins, Andrew Morton,
Dave Chinner, Chuck Lever, Jan Kara, Amir Goldstein,
linux-fsdevel, linux-kernel, linux-xfs, linux-mm, linux-nfs
On Mon, Apr 24, 2023 at 11:11:02AM -0400, Jeff Layton wrote:
> The VFS always uses coarse-grained timestamp updates for filling out the
> ctime and mtime after a change. This has the benefit of allowing
> filesystems to optimize away a lot metaupdates, to around once per
> jiffy, even when a file is under heavy writes.
>
> Unfortunately, this has always been an issue when we're exporting via
> NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> lot of exported filesystems don't properly support a change attribute
> and are subject to the same problems with timestamp granularity. Other
> applications have similar issues (e.g backup applications).
>
> Switching to always using fine-grained timestamps would improve the
> situation for NFS, but that becomes rather expensive, as the underlying
> filesystem will have to log a lot more metadata updates.
>
> What we need is a way to only use fine-grained timestamps when they are
> being actively queried:
>
> Whenever the mtime changes, the ctime must also change since we're
> changing the metadata. When a superblock has a s_time_gran >1, we can
> use the lowest-order bit of the inode->i_ctime as a flag to indicate
> that the value has been queried. Then on the next write, we'll fetch a
> fine-grained timestamp instead of the usual coarse-grained one.
>
> We could enable this for any filesystem that has a s_time_gran >1, but
> for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> to opt-in to this behavior.
Hm, the patch raises the flag in s_flags. Please at least move this to
s_iflags as SB_I_MULTIGRAIN and treat this as an internal flag. There's
no need to give the impression that this will become a mount option.
Also, this looks like it's a filesystem property not a superblock
property as the granularity isn't changeable. So shouldn't this be an
FS_* flag instead?
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-26 7:07 ` Christian Brauner
@ 2023-04-26 9:48 ` Jeff Layton
2023-04-27 9:51 ` Christian Brauner
0 siblings, 1 reply; 18+ messages in thread
From: Jeff Layton @ 2023-04-26 9:48 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexander Viro, Darrick J. Wong, Hugh Dickins, Andrew Morton,
Dave Chinner, Chuck Lever, Jan Kara, Amir Goldstein,
linux-fsdevel, linux-kernel, linux-xfs, linux-mm, linux-nfs
On Wed, 2023-04-26 at 09:07 +0200, Christian Brauner wrote:
> On Mon, Apr 24, 2023 at 11:11:02AM -0400, Jeff Layton wrote:
> > The VFS always uses coarse-grained timestamp updates for filling out the
> > ctime and mtime after a change. This has the benefit of allowing
> > filesystems to optimize away a lot metaupdates, to around once per
> > jiffy, even when a file is under heavy writes.
> >
> > Unfortunately, this has always been an issue when we're exporting via
> > NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> > lot of exported filesystems don't properly support a change attribute
> > and are subject to the same problems with timestamp granularity. Other
> > applications have similar issues (e.g backup applications).
> >
> > Switching to always using fine-grained timestamps would improve the
> > situation for NFS, but that becomes rather expensive, as the underlying
> > filesystem will have to log a lot more metadata updates.
> >
> > What we need is a way to only use fine-grained timestamps when they are
> > being actively queried:
> >
> > Whenever the mtime changes, the ctime must also change since we're
> > changing the metadata. When a superblock has a s_time_gran >1, we can
> > use the lowest-order bit of the inode->i_ctime as a flag to indicate
> > that the value has been queried. Then on the next write, we'll fetch a
> > fine-grained timestamp instead of the usual coarse-grained one.
> >
> > We could enable this for any filesystem that has a s_time_gran >1, but
> > for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> > to opt-in to this behavior.
>
> Hm, the patch raises the flag in s_flags. Please at least move this to
> s_iflags as SB_I_MULTIGRAIN and treat this as an internal flag. There's
> no need to give the impression that this will become a mount option.
>
> Also, this looks like it's a filesystem property not a superblock
> property as the granularity isn't changeable. So shouldn't this be an
> FS_* flag instead?
It could be a per-sb thing if there was some filesystem that wanted to
do that, but I'm hoping that most will not want to do that.
My initial patches for this actually did use a FS_* flag, but I figured
that was one more pointer to chase when you wanted to check the flag.
I can change it back to that though. Let me know what you'd prefer.
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-26 9:48 ` Jeff Layton
@ 2023-04-27 9:51 ` Christian Brauner
2023-04-27 9:57 ` Jeff Layton
0 siblings, 1 reply; 18+ messages in thread
From: Christian Brauner @ 2023-04-27 9:51 UTC (permalink / raw)
To: Jeff Layton
Cc: Alexander Viro, Darrick J. Wong, Hugh Dickins, Andrew Morton,
Dave Chinner, Chuck Lever, Jan Kara, Amir Goldstein,
linux-fsdevel, linux-kernel, linux-xfs, linux-mm, linux-nfs
On Wed, Apr 26, 2023 at 05:48:38AM -0400, Jeff Layton wrote:
> On Wed, 2023-04-26 at 09:07 +0200, Christian Brauner wrote:
> > On Mon, Apr 24, 2023 at 11:11:02AM -0400, Jeff Layton wrote:
> > > The VFS always uses coarse-grained timestamp updates for filling out the
> > > ctime and mtime after a change. This has the benefit of allowing
> > > filesystems to optimize away a lot metaupdates, to around once per
> > > jiffy, even when a file is under heavy writes.
> > >
> > > Unfortunately, this has always been an issue when we're exporting via
> > > NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> > > lot of exported filesystems don't properly support a change attribute
> > > and are subject to the same problems with timestamp granularity. Other
> > > applications have similar issues (e.g backup applications).
> > >
> > > Switching to always using fine-grained timestamps would improve the
> > > situation for NFS, but that becomes rather expensive, as the underlying
> > > filesystem will have to log a lot more metadata updates.
> > >
> > > What we need is a way to only use fine-grained timestamps when they are
> > > being actively queried:
> > >
> > > Whenever the mtime changes, the ctime must also change since we're
> > > changing the metadata. When a superblock has a s_time_gran >1, we can
> > > use the lowest-order bit of the inode->i_ctime as a flag to indicate
> > > that the value has been queried. Then on the next write, we'll fetch a
> > > fine-grained timestamp instead of the usual coarse-grained one.
> > >
> > > We could enable this for any filesystem that has a s_time_gran >1, but
> > > for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> > > to opt-in to this behavior.
> >
> > Hm, the patch raises the flag in s_flags. Please at least move this to
> > s_iflags as SB_I_MULTIGRAIN and treat this as an internal flag. There's
> > no need to give the impression that this will become a mount option.
> >
> > Also, this looks like it's a filesystem property not a superblock
> > property as the granularity isn't changeable. So shouldn't this be an
> > FS_* flag instead?
>
> It could be a per-sb thing if there was some filesystem that wanted to
> do that, but I'm hoping that most will not want to do that.
Yeah, I'd really hope this isn't an sb thing.
>
> My initial patches for this actually did use a FS_* flag, but I figured
Oh, I might've just missed that.
> that was one more pointer to chase when you wanted to check the flag.
Hm, unless you have reasons to think that it would be noticable in terms
of perf I'd rather do the correct thing and have it be an FS_* flag.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime
2023-04-27 9:51 ` Christian Brauner
@ 2023-04-27 9:57 ` Jeff Layton
0 siblings, 0 replies; 18+ messages in thread
From: Jeff Layton @ 2023-04-27 9:57 UTC (permalink / raw)
To: Christian Brauner
Cc: Alexander Viro, Darrick J. Wong, Hugh Dickins, Andrew Morton,
Dave Chinner, Chuck Lever, Jan Kara, Amir Goldstein,
linux-fsdevel, linux-kernel, linux-xfs, linux-mm, linux-nfs
On Thu, 2023-04-27 at 11:51 +0200, Christian Brauner wrote:
> On Wed, Apr 26, 2023 at 05:48:38AM -0400, Jeff Layton wrote:
> > On Wed, 2023-04-26 at 09:07 +0200, Christian Brauner wrote:
> > > On Mon, Apr 24, 2023 at 11:11:02AM -0400, Jeff Layton wrote:
> > > > The VFS always uses coarse-grained timestamp updates for filling out the
> > > > ctime and mtime after a change. This has the benefit of allowing
> > > > filesystems to optimize away a lot metaupdates, to around once per
> > > > jiffy, even when a file is under heavy writes.
> > > >
> > > > Unfortunately, this has always been an issue when we're exporting via
> > > > NFSv3, which relies on timestamps to validate caches. Even with NFSv4, a
> > > > lot of exported filesystems don't properly support a change attribute
> > > > and are subject to the same problems with timestamp granularity. Other
> > > > applications have similar issues (e.g backup applications).
> > > >
> > > > Switching to always using fine-grained timestamps would improve the
> > > > situation for NFS, but that becomes rather expensive, as the underlying
> > > > filesystem will have to log a lot more metadata updates.
> > > >
> > > > What we need is a way to only use fine-grained timestamps when they are
> > > > being actively queried:
> > > >
> > > > Whenever the mtime changes, the ctime must also change since we're
> > > > changing the metadata. When a superblock has a s_time_gran >1, we can
> > > > use the lowest-order bit of the inode->i_ctime as a flag to indicate
> > > > that the value has been queried. Then on the next write, we'll fetch a
> > > > fine-grained timestamp instead of the usual coarse-grained one.
> > > >
> > > > We could enable this for any filesystem that has a s_time_gran >1, but
> > > > for now, this patch adds a new SB_MULTIGRAIN_TS flag to allow filesystems
> > > > to opt-in to this behavior.
> > >
> > > Hm, the patch raises the flag in s_flags. Please at least move this to
> > > s_iflags as SB_I_MULTIGRAIN and treat this as an internal flag. There's
> > > no need to give the impression that this will become a mount option.
> > >
> > > Also, this looks like it's a filesystem property not a superblock
> > > property as the granularity isn't changeable. So shouldn't this be an
> > > FS_* flag instead?
> >
> > It could be a per-sb thing if there was some filesystem that wanted to
> > do that, but I'm hoping that most will not want to do that.
>
> Yeah, I'd really hope this isn't an sb thing.
>
> >
> > My initial patches for this actually did use a FS_* flag, but I figured
>
> Oh, I might've just missed that.
>
Sorry, I didn't actually post that set. But I did go with a FS_* flag
before I made it a SB_* flag.
> > that was one more pointer to chase when you wanted to check the flag.
>
> Hm, unless you have reasons to think that it would be noticable in terms
> of perf I'd rather do the correct thing and have it be an FS_* flag.
Sure. I'll make the switch before the next posting.
Thanks for the review!
--
Jeff Layton <jlayton@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH v2 2/3] shmem: mark for high-res timestamps on next update after getattr
2023-04-24 15:11 [PATCH v2 0/3] fs: multigrain timestamps Jeff Layton
2023-04-24 15:11 ` [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime Jeff Layton
@ 2023-04-24 15:11 ` Jeff Layton
2023-04-24 15:11 ` [PATCH v2 3/3] xfs: mark the inode for high-res timestamp update in getattr Jeff Layton
2 siblings, 0 replies; 18+ messages in thread
From: Jeff Layton @ 2023-04-24 15:11 UTC (permalink / raw)
To: Alexander Viro, Christian Brauner, Darrick J. Wong, Hugh Dickins,
Andrew Morton, Dave Chinner, Chuck Lever
Cc: Jan Kara, Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs,
linux-mm, linux-nfs
Change the s_time_gran for tmpfs to 2, to make it eligible to use the
low order bit as a flag. Switch to the current_ctime helper instead
of current_time.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
mm/shmem.c | 29 +++++++++++++++++------------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 448f393d8ab2..497ffa29d172 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1039,7 +1039,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_ctime = inode->i_mtime = current_ctime(inode);
inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -1070,6 +1070,11 @@ static int shmem_getattr(struct mnt_idmap *idmap,
if (shmem_is_huge(inode, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
+ if (request_mask & (STATX_CTIME|STATX_MTIME))
+ generic_fill_multigrain_cmtime(inode, stat);
+ else
+ stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
+
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = info->i_crtime.tv_sec;
@@ -1136,7 +1141,7 @@ static int shmem_setattr(struct mnt_idmap *idmap,
if (attr->ia_valid & ATTR_MODE)
error = posix_acl_chmod(idmap, dentry, inode->i_mode);
if (!error && update_ctime) {
- inode->i_ctime = current_time(inode);
+ inode->i_ctime = current_ctime(inode);
if (update_mtime)
inode->i_mtime = inode->i_ctime;
inode_inc_iversion(inode);
@@ -2361,7 +2366,7 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
inode->i_ino = ino;
inode_init_owner(idmap, inode, dir, mode);
inode->i_blocks = 0;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode->i_ctime = current_ctime(inode);
inode->i_generation = get_random_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
@@ -2940,7 +2945,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime = current_ctime(dir);
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
@@ -3016,7 +3021,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
}
dir->i_size += BOGO_DIRENT_SIZE;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_ctime(inode);
inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
@@ -3034,7 +3039,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
shmem_free_inode(inode->i_sb);
dir->i_size -= BOGO_DIRENT_SIZE;
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = current_ctime(inode);
inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
@@ -3124,7 +3129,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
new_dir->i_size += BOGO_DIRENT_SIZE;
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
- inode->i_ctime = current_time(old_dir);
+ inode->i_ctime = current_ctime(old_dir);
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
return 0;
@@ -3178,7 +3183,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_ctime = dir->i_mtime = current_ctime(dir);
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
@@ -3250,7 +3255,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap,
(fa->flags & SHMEM_FL_USER_MODIFIABLE);
shmem_set_inode_flags(inode, info->fsflags);
- inode->i_ctime = current_time(inode);
+ inode->i_ctime = current_ctime(inode);
inode_inc_iversion(inode);
return 0;
}
@@ -3320,7 +3325,7 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
name = xattr_full_name(handler, name);
err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
if (!err) {
- inode->i_ctime = current_time(inode);
+ inode->i_ctime = current_ctime(inode);
inode_inc_iversion(inode);
}
return err;
@@ -3786,7 +3791,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC | SB_I_VERSION;
+ sb->s_flags |= SB_NOSEC | SB_I_VERSION | SB_MULTIGRAIN_TS;
#else
sb->s_flags |= SB_NOUSER;
#endif
@@ -3816,7 +3821,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
- sb->s_time_gran = 1;
+ sb->s_time_gran = 2;
#ifdef CONFIG_TMPFS_XATTR
sb->s_xattr = shmem_xattr_handlers;
#endif
--
2.40.0
^ permalink raw reply related [flat|nested] 18+ messages in thread* [PATCH v2 3/3] xfs: mark the inode for high-res timestamp update in getattr
2023-04-24 15:11 [PATCH v2 0/3] fs: multigrain timestamps Jeff Layton
2023-04-24 15:11 ` [PATCH v2 1/3] fs: add infrastructure for multigrain inode i_m/ctime Jeff Layton
2023-04-24 15:11 ` [PATCH v2 2/3] shmem: mark for high-res timestamps on next update after getattr Jeff Layton
@ 2023-04-24 15:11 ` Jeff Layton
2 siblings, 0 replies; 18+ messages in thread
From: Jeff Layton @ 2023-04-24 15:11 UTC (permalink / raw)
To: Alexander Viro, Christian Brauner, Darrick J. Wong, Hugh Dickins,
Andrew Morton, Dave Chinner, Chuck Lever
Cc: Jan Kara, Amir Goldstein, linux-fsdevel, linux-kernel, linux-xfs,
linux-mm, linux-nfs
When the mtime or ctime is being queried via getattr, ensure that we
mark the inode for a high-res timestamp update on the next pass. Also,
switch to current_cmtime for other c/mtime updates.
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
fs/xfs/libxfs/xfs_trans_inode.c | 2 +-
fs/xfs/xfs_acl.c | 2 +-
fs/xfs/xfs_inode.c | 2 +-
fs/xfs/xfs_inode_item.c | 2 +-
fs/xfs/xfs_iops.c | 9 ++++++---
fs/xfs/xfs_super.c | 5 ++++-
6 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 8b5547073379..c08be3aa3339 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -63,7 +63,7 @@ xfs_trans_ichgtime(
ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- tv = current_time(inode);
+ tv = current_ctime(inode);
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 791db7d9c849..85353e6e9004 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -233,7 +233,7 @@ xfs_acl_set_mode(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode->i_ctime = current_ctime(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (xfs_has_wsync(mp))
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5808abab786c..ac299c1a9838 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -843,7 +843,7 @@ xfs_init_new_inode(
ip->i_df.if_nextents = 0;
ASSERT(ip->i_nblocks == 0);
- tv = current_time(inode);
+ tv = current_ctime(inode);
inode->i_mtime = tv;
inode->i_atime = tv;
inode->i_ctime = tv;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index ca2941ab6cbc..dc33f495f4fa 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -381,7 +381,7 @@ xfs_inode_to_log_dinode(
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
- to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, timestamp_truncate(inode->i_ctime, inode));
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 24718adb3c16..b0490e46e825 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -573,8 +573,11 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
- stat->mtime = inode->i_mtime;
- stat->ctime = inode->i_ctime;
+ if (request_mask & (STATX_MTIME|STATX_CTIME))
+ generic_fill_multigrain_cmtime(inode, stat);
+ else
+ stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
+
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
if (xfs_has_v3inodes(mp)) {
@@ -917,7 +920,7 @@ xfs_setattr_size(
if (newsize != oldsize &&
!(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
iattr->ia_ctime = iattr->ia_mtime =
- current_time(inode);
+ current_ctime(inode);
iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 4f814f9e12ab..ee9d0b43bf15 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1620,7 +1620,7 @@ xfs_fs_fill_super(
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_max_links = XFS_MAXLINK;
- sb->s_time_gran = 1;
+ sb->s_time_gran = 2;
if (xfs_has_bigtime(mp)) {
sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN);
sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX);
@@ -1633,6 +1633,9 @@ xfs_fs_fill_super(
set_posix_acl_flag(sb);
+ /* Enable multigrain timestamps */
+ sb->s_flags |= SB_MULTIGRAIN_TS;
+
/* version 5 superblocks support inode version counters. */
if (xfs_has_crc(mp))
sb->s_flags |= SB_I_VERSION;
--
2.40.0
^ permalink raw reply related [flat|nested] 18+ messages in thread