* [PATCH v4 0/3] shmemfs stable directory offsets
@ 2023-06-26 18:21 Chuck Lever
2023-06-26 18:21 ` [PATCH v4 1/3] libfs: Add directory operations for stable offsets Chuck Lever
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Chuck Lever @ 2023-06-26 18:21 UTC (permalink / raw)
To: viro, brauner, hughd, akpm
Cc: Chuck Lever, Jeff Layton, jlayton, linux-mm, linux-fsdevel
The following series is for continued discussion of the need for
and implementation of stable directory offsets for shmemfs/tmpfs.
Changes since v3:
- Rebased on v6.4
- Fixed error handling bugs
Changes since v2:
- Move bulk of stable offset support into fs/libfs.c
- Replace xa_find_after with xas_find_next for efficiency
Changes since v1:
- Break the single patch up into a series
Changes since RFC:
- Destroy xarray in shmem_destroy_inode() instead of free_in_core_inode()
- A few cosmetic updates
---
Chuck Lever (3):
libfs: Add directory operations for stable offsets
shmem: Refactor shmem_symlink()
shmem: stable directory offsets
fs/dcache.c | 1 +
fs/libfs.c | 185 +++++++++++++++++++++++++++++++++++++++++
include/linux/dcache.h | 1 +
include/linux/fs.h | 9 ++
mm/shmem.c | 69 +++++++++++----
5 files changed, 250 insertions(+), 15 deletions(-)
--
Chuck Lever
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v4 1/3] libfs: Add directory operations for stable offsets
2023-06-26 18:21 [PATCH v4 0/3] shmemfs stable directory offsets Chuck Lever
@ 2023-06-26 18:21 ` Chuck Lever
2023-06-27 6:44 ` Christoph Hellwig
2023-06-26 18:21 ` [PATCH v4 2/3] shmem: Refactor shmem_symlink() Chuck Lever
2023-06-26 18:21 ` [PATCH v4 3/3] shmem: stable directory offsets Chuck Lever
2 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2023-06-26 18:21 UTC (permalink / raw)
To: viro, brauner, hughd, akpm; +Cc: Chuck Lever, jlayton, linux-mm, linux-fsdevel
From: Chuck Lever <chuck.lever@oracle.com>
Create a vector of directory operations in fs/libfs.c that handles
directory seeks and readdir via stable offsets instead of the
current cursor-based mechanism.
For the moment these are unused.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
fs/dcache.c | 1
fs/libfs.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/dcache.h | 1
include/linux/fs.h | 9 ++
4 files changed, 196 insertions(+)
diff --git a/fs/dcache.c b/fs/dcache.c
index 52e6d5fdab6b..9c9a801f3b33 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1813,6 +1813,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_sb = sb;
dentry->d_op = NULL;
dentry->d_fsdata = NULL;
+ dentry->d_offset = 0;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
diff --git a/fs/libfs.c b/fs/libfs.c
index 89cf614a3271..07317bbe1668 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -239,6 +239,191 @@ const struct inode_operations simple_dir_inode_operations = {
};
EXPORT_SYMBOL(simple_dir_inode_operations);
+/**
+ * stable_offset_init - initialize a parent directory
+ * @dir: parent directory to be initialized
+ *
+ */
+void stable_offset_init(struct inode *dir)
+{
+ xa_init_flags(&dir->i_doff_map, XA_FLAGS_ALLOC1);
+ dir->i_next_offset = 0;
+}
+EXPORT_SYMBOL(stable_offset_init);
+
+/**
+ * stable_offset_add - Add an entry to a directory's stable offset map
+ * @dir: parent directory being modified
+ * @dentry: new dentry being added
+ *
+ * Returns zero on success. Otherwise, a negative errno value is returned.
+ */
+int stable_offset_add(struct inode *dir, struct dentry *dentry)
+{
+ struct xa_limit limit = XA_LIMIT(2, U32_MAX);
+ u32 offset = 0;
+ int ret;
+
+ if (dentry->d_offset)
+ return -EBUSY;
+
+ ret = xa_alloc_cyclic(&dir->i_doff_map, &offset, dentry, limit,
+ &dir->i_next_offset, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+
+ dentry->d_offset = offset;
+ return 0;
+}
+EXPORT_SYMBOL(stable_offset_add);
+
+/**
+ * stable_offset_remove - Remove an entry to a directory's stable offset map
+ * @dir: parent directory being modified
+ * @dentry: dentry being removed
+ *
+ */
+void stable_offset_remove(struct inode *dir, struct dentry *dentry)
+{
+ if (!dentry->d_offset)
+ return;
+
+ xa_erase(&dir->i_doff_map, dentry->d_offset);
+ dentry->d_offset = 0;
+}
+EXPORT_SYMBOL(stable_offset_remove);
+
+/**
+ * stable_offset_destroy - Release offset map
+ * @dir: parent directory that is about to be destroyed
+ *
+ * During fs teardown (eg. umount), a directory's offset map might still
+ * contain entries. xa_destroy() cleans out anything that remains.
+ */
+void stable_offset_destroy(struct inode *dir)
+{
+ xa_destroy(&dir->i_doff_map);
+}
+EXPORT_SYMBOL(stable_offset_destroy);
+
+/**
+ * stable_dir_llseek - Advance the read position of a directory descriptor
+ * @file: an open directory whose position is to be updated
+ * @offset: a byte offset
+ * @whence: enumerator describing the starting position for this update
+ *
+ * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
+ *
+ * Returns the updated read position if successful; otherwise a
+ * negative errno is returned and the read position remains unchanged.
+ */
+static loff_t stable_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ fallthrough;
+ case SEEK_SET:
+ if (offset >= 0)
+ break;
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ return vfs_setpos(file, offset, U32_MAX);
+}
+
+static struct dentry *stable_find_next(struct xa_state *xas)
+{
+ struct dentry *child, *found = NULL;
+
+ rcu_read_lock();
+ child = xas_next_entry(xas, U32_MAX);
+ if (!child)
+ goto out;
+ spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
+ if (simple_positive(child))
+ found = dget_dlock(child);
+ spin_unlock(&child->d_lock);
+out:
+ rcu_read_unlock();
+ return found;
+}
+
+static bool stable_dir_emit(struct dir_context *ctx, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len,
+ dentry->d_offset, inode->i_ino,
+ fs_umode_to_dtype(inode->i_mode));
+}
+
+static void stable_iterate_dir(struct dentry *dir, struct dir_context *ctx)
+{
+ XA_STATE(xas, &((d_inode(dir))->i_doff_map), ctx->pos);
+ struct dentry *dentry;
+
+ while (true) {
+ spin_lock(&dir->d_lock);
+ dentry = stable_find_next(&xas);
+ spin_unlock(&dir->d_lock);
+ if (!dentry)
+ break;
+
+ if (!stable_dir_emit(ctx, dentry)) {
+ dput(dentry);
+ break;
+ }
+
+ dput(dentry);
+ ctx->pos = xas.xa_index + 1;
+ }
+}
+
+/**
+ * stable_readdir - Emit entries starting at offset @ctx->pos
+ * @file: an open directory to iterate over
+ * @ctx: directory iteration context
+ *
+ * Caller must hold @file's i_rwsem to prevent insertion or removal of
+ * entries during this call.
+ *
+ * On entry, @ctx->pos contains an offset that represents the first entry
+ * to be read from the directory.
+ *
+ * The operation continues until there are no more entries to read, or
+ * until the ctx->actor indicates there is no more space in the caller's
+ * output buffer.
+ *
+ * On return, @ctx->pos contains an offset that will read the next entry
+ * in this directory when shmem_readdir() is called again with @ctx.
+ *
+ * Return values:
+ * %0 - Complete
+ */
+static int stable_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct dentry *dir = file->f_path.dentry;
+
+ lockdep_assert_held(&d_inode(dir)->i_rwsem);
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ stable_iterate_dir(dir, ctx);
+ return 0;
+}
+
+const struct file_operations stable_dir_operations = {
+ .llseek = stable_dir_llseek,
+ .iterate_shared = stable_readdir,
+ .read = generic_read_dir,
+ .fsync = noop_fsync,
+};
+EXPORT_SYMBOL(stable_dir_operations);
+
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
struct dentry *child = NULL;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 6b351e009f59..579ce1800efe 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -96,6 +96,7 @@ struct dentry {
struct super_block *d_sb; /* The root of the dentry tree */
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* fs-specific data */
+ u32 d_offset; /* directory offset in parent */
union {
struct list_head d_lru; /* LRU list */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 133f0640fb24..3fc2c04ed8ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -719,6 +719,10 @@ struct inode {
#endif
void *i_private; /* fs or device private pointer */
+
+ /* simplefs stable directory offset tracking */
+ struct xarray i_doff_map;
+ u32 i_next_offset;
} __randomize_layout;
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);
@@ -2924,6 +2928,10 @@ extern int simple_rename(struct mnt_idmap *, struct inode *,
unsigned int);
extern void simple_recursive_removal(struct dentry *,
void (*callback)(struct dentry *));
+extern void stable_offset_init(struct inode *dir);
+extern int stable_offset_add(struct inode *dir, struct dentry *dentry);
+extern void stable_offset_remove(struct inode *dir, struct dentry *dentry);
+extern void stable_offset_destroy(struct inode *dir);
extern int noop_fsync(struct file *, loff_t, loff_t, int);
extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int simple_empty(struct dentry *);
@@ -2939,6 +2947,7 @@ extern const struct dentry_operations simple_dentry_operations;
extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
extern const struct file_operations simple_dir_operations;
+extern const struct file_operations stable_dir_operations;
extern const struct inode_operations simple_dir_inode_operations;
extern void make_empty_dir_inode(struct inode *inode);
extern bool is_empty_dir_inode(struct inode *inode);
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v4 2/3] shmem: Refactor shmem_symlink()
2023-06-26 18:21 [PATCH v4 0/3] shmemfs stable directory offsets Chuck Lever
2023-06-26 18:21 ` [PATCH v4 1/3] libfs: Add directory operations for stable offsets Chuck Lever
@ 2023-06-26 18:21 ` Chuck Lever
2023-06-27 6:44 ` Christoph Hellwig
2023-06-26 18:21 ` [PATCH v4 3/3] shmem: stable directory offsets Chuck Lever
2 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2023-06-26 18:21 UTC (permalink / raw)
To: viro, brauner, hughd, akpm
Cc: Jeff Layton, Chuck Lever, jlayton, linux-mm, linux-fsdevel
From: Chuck Lever <chuck.lever@oracle.com>
De-duplicate the error handling paths. No change in behavior is
expected.
Suggested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
mm/shmem.c | 19 +++++++++----------
1 file changed, 9 insertions(+), 10 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index e40a08c5c6d7..721f9fd064aa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3161,26 +3161,22 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
- if (error && error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
inode->i_link = kmemdup(symname, len, GFP_KERNEL);
if (!inode->i_link) {
- iput(inode);
- return -ENOMEM;
+ error = -ENOMEM;
+ goto out_iput;
}
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
- if (error) {
- iput(inode);
- return error;
- }
+ if (error)
+ goto out_iput;
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
memcpy(folio_address(folio), symname, len);
@@ -3195,6 +3191,9 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
dget(dentry);
return 0;
+out_iput:
+ iput(inode);
+ return error;
}
static void shmem_put_link(void *arg)
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v4 3/3] shmem: stable directory offsets
2023-06-26 18:21 [PATCH v4 0/3] shmemfs stable directory offsets Chuck Lever
2023-06-26 18:21 ` [PATCH v4 1/3] libfs: Add directory operations for stable offsets Chuck Lever
2023-06-26 18:21 ` [PATCH v4 2/3] shmem: Refactor shmem_symlink() Chuck Lever
@ 2023-06-26 18:21 ` Chuck Lever
2023-06-27 14:06 ` Bernd Schubert
2 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever @ 2023-06-26 18:21 UTC (permalink / raw)
To: viro, brauner, hughd, akpm; +Cc: Chuck Lever, jlayton, linux-mm, linux-fsdevel
From: Chuck Lever <chuck.lever@oracle.com>
The current cursor-based directory offset mechanism doesn't work
when a tmpfs filesystem is exported via NFS. This is because NFS
clients do not open directories. Each server-side READDIR operation
has to open the directory, read it, then close it. The cursor state
for that directory, being associated strictly with the opened
struct file, is thus discarded after each NFS READDIR operation.
Directory offsets are cached not only by NFS clients, but also by
user space libraries on those clients. Essentially there is no way
to invalidate those caches when directory offsets have changed on
an NFS server after the offset-to-dentry mapping changes. Thus the
whole application stack depends on unchanging directory offsets.
The solution we've come up with is to make the directory offset for
each file in a tmpfs filesystem stable for the life of the directory
entry it represents.
shmem_readdir() and shmem_dir_llseek() now use an xarray to map each
directory offset (an loff_t integer) to the memory address of a
struct dentry.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
mm/shmem.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 47 insertions(+), 7 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 721f9fd064aa..89012f3583b1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2410,7 +2410,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
/* Some things misbehave if size == 0 on a directory */
inode->i_size = 2 * BOGO_DIRENT_SIZE;
inode->i_op = &shmem_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
+ inode->i_fop = &stable_dir_operations;
+ stable_offset_init(inode);
break;
case S_IFLNK:
/*
@@ -2950,7 +2951,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
if (error && error != -EOPNOTSUPP)
goto out_iput;
- error = 0;
+ error = stable_offset_add(dir, dentry);
+ if (error)
+ goto out_iput;
+
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
inode_inc_iversion(dir);
@@ -3027,6 +3031,13 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
goto out;
}
+ ret = stable_offset_add(dir, dentry);
+ if (ret) {
+ if (inode->i_nlink)
+ shmem_free_inode(inode->i_sb);
+ goto out;
+ }
+
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
inode_inc_iversion(dir);
@@ -3045,6 +3056,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
shmem_free_inode(inode->i_sb);
+ stable_offset_remove(dir, dentry);
+
dir->i_size -= BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
inode_inc_iversion(dir);
@@ -3103,24 +3116,41 @@ static int shmem_rename2(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
+ int error;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
- if (flags & RENAME_EXCHANGE)
+ if (flags & RENAME_EXCHANGE) {
+ error = stable_offset_add(new_dir, old_dentry);
+ if (error)
+ return error;
+ error = stable_offset_add(old_dir, new_dentry);
+ if (error) {
+ stable_offset_remove(new_dir, old_dentry);
+ return error;
+ }
+ stable_offset_remove(old_dir, old_dentry);
+ stable_offset_remove(new_dir, new_dentry);
+
+ /* Always returns zero */
return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ }
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
- int error;
-
error = shmem_whiteout(idmap, old_dir, old_dentry);
if (error)
return error;
}
+ stable_offset_remove(old_dir, old_dentry);
+ error = stable_offset_add(new_dir, old_dentry);
+ if (error)
+ return error;
+
if (d_really_is_positive(new_dentry)) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs) {
@@ -3164,19 +3194,23 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
if (error && error != -EOPNOTSUPP)
goto out_iput;
+ error = stable_offset_add(dir, dentry);
+ if (error)
+ goto out_iput;
+
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
inode->i_link = kmemdup(symname, len, GFP_KERNEL);
if (!inode->i_link) {
error = -ENOMEM;
- goto out_iput;
+ goto out_remove_offset;
}
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
if (error)
- goto out_iput;
+ goto out_remove_offset;
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
memcpy(folio_address(folio), symname, len);
@@ -3185,12 +3219,16 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
folio_unlock(folio);
folio_put(folio);
}
+
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
return 0;
+
+out_remove_offset:
+ stable_offset_remove(dir, dentry);
out_iput:
iput(inode);
return error;
@@ -3920,6 +3958,8 @@ static void shmem_destroy_inode(struct inode *inode)
{
if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
+ if (S_ISDIR(inode->i_mode))
+ stable_offset_destroy(inode);
}
static void shmem_init_inode(void *foo)
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/3] libfs: Add directory operations for stable offsets
2023-06-26 18:21 ` [PATCH v4 1/3] libfs: Add directory operations for stable offsets Chuck Lever
@ 2023-06-27 6:44 ` Christoph Hellwig
2023-06-27 8:52 ` Christian Brauner
0 siblings, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2023-06-27 6:44 UTC (permalink / raw)
To: Chuck Lever
Cc: viro, brauner, hughd, akpm, Chuck Lever, jlayton, linux-mm,
linux-fsdevel
> + * @dir: parent directory to be initialized
> + *
> + */
> +void stable_offset_init(struct inode *dir)
> +{
> + xa_init_flags(&dir->i_doff_map, XA_FLAGS_ALLOC1);
> + dir->i_next_offset = 0;
> +}
> +EXPORT_SYMBOL(stable_offset_init);
If this is exported I'd much prefer a EXPORT_SYMBOL_GPL. But the only
user so far is shmfs, which can't be modular anyway, so I think we can
drop the exports.
> --- a/include/linux/dcache.h
> +++ b/include/linux/dcache.h
> @@ -96,6 +96,7 @@ struct dentry {
> struct super_block *d_sb; /* The root of the dentry tree */
> unsigned long d_time; /* used by d_revalidate */
> void *d_fsdata; /* fs-specific data */
> + u32 d_offset; /* directory offset in parent */
>
> union {
> struct list_head d_lru; /* LRU list */
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 133f0640fb24..3fc2c04ed8ff 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -719,6 +719,10 @@ struct inode {
> #endif
>
> void *i_private; /* fs or device private pointer */
> +
> + /* simplefs stable directory offset tracking */
> + struct xarray i_doff_map;
> + u32 i_next_offset;
We can't just increase the size of the dentry and inode for everyone
for something that doesn't make any sense for normal file systems.
This needs to move out into structures allocated by the file system
and embedded into or used as the private dentry/inode data.
> +extern void stable_offset_init(struct inode *dir);
> +extern int stable_offset_add(struct inode *dir, struct dentry *dentry);
> +extern void stable_offset_remove(struct inode *dir, struct dentry *dentry);
> +extern void stable_offset_destroy(struct inode *dir);
Please drop all the pointless externs for function prototypes.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 2/3] shmem: Refactor shmem_symlink()
2023-06-26 18:21 ` [PATCH v4 2/3] shmem: Refactor shmem_symlink() Chuck Lever
@ 2023-06-27 6:44 ` Christoph Hellwig
0 siblings, 0 replies; 12+ messages in thread
From: Christoph Hellwig @ 2023-06-27 6:44 UTC (permalink / raw)
To: Chuck Lever
Cc: viro, brauner, hughd, akpm, Jeff Layton, Chuck Lever, jlayton,
linux-mm, linux-fsdevel
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/3] libfs: Add directory operations for stable offsets
2023-06-27 6:44 ` Christoph Hellwig
@ 2023-06-27 8:52 ` Christian Brauner
2023-06-27 14:04 ` Chuck Lever III
0 siblings, 1 reply; 12+ messages in thread
From: Christian Brauner @ 2023-06-27 8:52 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Chuck Lever, viro, hughd, akpm, Chuck Lever, jlayton, linux-mm,
linux-fsdevel
On Mon, Jun 26, 2023 at 11:44:15PM -0700, Christoph Hellwig wrote:
> > + * @dir: parent directory to be initialized
> > + *
> > + */
> > +void stable_offset_init(struct inode *dir)
> > +{
> > + xa_init_flags(&dir->i_doff_map, XA_FLAGS_ALLOC1);
> > + dir->i_next_offset = 0;
> > +}
> > +EXPORT_SYMBOL(stable_offset_init);
>
> If this is exported I'd much prefer a EXPORT_SYMBOL_GPL. But the only
> user so far is shmfs, which can't be modular anyway, so I think we can
> drop the exports.
>
> > --- a/include/linux/dcache.h
> > +++ b/include/linux/dcache.h
> > @@ -96,6 +96,7 @@ struct dentry {
> > struct super_block *d_sb; /* The root of the dentry tree */
> > unsigned long d_time; /* used by d_revalidate */
> > void *d_fsdata; /* fs-specific data */
> > + u32 d_offset; /* directory offset in parent */
> >
> > union {
> > struct list_head d_lru; /* LRU list */
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index 133f0640fb24..3fc2c04ed8ff 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -719,6 +719,10 @@ struct inode {
> > #endif
> >
> > void *i_private; /* fs or device private pointer */
> > +
> > + /* simplefs stable directory offset tracking */
> > + struct xarray i_doff_map;
> > + u32 i_next_offset;
>
> We can't just increase the size of the dentry and inode for everyone
> for something that doesn't make any sense for normal file systems.
> This needs to move out into structures allocated by the file system
> and embedded into or used as the private dentry/inode data.
I agree. I prefer if this could be done on a per filesystem basis as
well. Especially since, this is currently only useful for a single
filesystem.
We've tried to be very conservative in increasing inode and dentry size
and we should continue with that.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/3] libfs: Add directory operations for stable offsets
2023-06-27 8:52 ` Christian Brauner
@ 2023-06-27 14:04 ` Chuck Lever III
2023-06-27 14:19 ` Jeff Layton
0 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever III @ 2023-06-27 14:04 UTC (permalink / raw)
To: Christian Brauner
Cc: Christoph Hellwig, Chuck Lever, Al Viro, Hugh Dickins,
Andrew Morton, Jeff Layton, linux-mm,
linux-fsdevel@vger.kernel.org
> On Jun 27, 2023, at 4:52 AM, Christian Brauner <brauner@kernel.org> wrote:
>
> On Mon, Jun 26, 2023 at 11:44:15PM -0700, Christoph Hellwig wrote:
>>> + * @dir: parent directory to be initialized
>>> + *
>>> + */
>>> +void stable_offset_init(struct inode *dir)
>>> +{
>>> + xa_init_flags(&dir->i_doff_map, XA_FLAGS_ALLOC1);
>>> + dir->i_next_offset = 0;
>>> +}
>>> +EXPORT_SYMBOL(stable_offset_init);
>>
>> If this is exported I'd much prefer a EXPORT_SYMBOL_GPL. But the only
>> user so far is shmfs, which can't be modular anyway, so I think we can
>> drop the exports.
>>
>>> --- a/include/linux/dcache.h
>>> +++ b/include/linux/dcache.h
>>> @@ -96,6 +96,7 @@ struct dentry {
>>> struct super_block *d_sb; /* The root of the dentry tree */
>>> unsigned long d_time; /* used by d_revalidate */
>>> void *d_fsdata; /* fs-specific data */
>>> + u32 d_offset; /* directory offset in parent */
>>>
>>> union {
>>> struct list_head d_lru; /* LRU list */
>>> diff --git a/include/linux/fs.h b/include/linux/fs.h
>>> index 133f0640fb24..3fc2c04ed8ff 100644
>>> --- a/include/linux/fs.h
>>> +++ b/include/linux/fs.h
>>> @@ -719,6 +719,10 @@ struct inode {
>>> #endif
>>>
>>> void *i_private; /* fs or device private pointer */
>>> +
>>> + /* simplefs stable directory offset tracking */
>>> + struct xarray i_doff_map;
>>> + u32 i_next_offset;
>>
>> We can't just increase the size of the dentry and inode for everyone
>> for something that doesn't make any sense for normal file systems.
>> This needs to move out into structures allocated by the file system
>> and embedded into or used as the private dentry/inode data.
>
> I agree. I prefer if this could be done on a per filesystem basis as
> well. Especially since, this is currently only useful for a single
> filesystem.
>
> We've tried to be very conservative in increasing inode and dentry size
> and we should continue with that.
I had thought we were going to adopt the stable offset mechanism
in more than just shmemfs. That's why we're putting it in libfs.c
after all. So this was not going to be "just for shmemfs" in the
long run.
That said, I can move the stable-offset-related inode fields back
into the shmemfs private inode struct, as it was in v2 of this
series.
For d_offset, I was (ab)using the d_fsdata field by casting the
offset value as a pointer. That's kind of ugly. Any suggestions?
--
Chuck Lever
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 3/3] shmem: stable directory offsets
2023-06-26 18:21 ` [PATCH v4 3/3] shmem: stable directory offsets Chuck Lever
@ 2023-06-27 14:06 ` Bernd Schubert
2023-06-27 14:11 ` Chuck Lever III
0 siblings, 1 reply; 12+ messages in thread
From: Bernd Schubert @ 2023-06-27 14:06 UTC (permalink / raw)
To: Chuck Lever, viro, brauner, hughd, akpm
Cc: Chuck Lever, jlayton, linux-mm, linux-fsdevel
On 6/26/23 20:21, Chuck Lever wrote:
> From: Chuck Lever <chuck.lever@oracle.com>
>
> The current cursor-based directory offset mechanism doesn't work
> when a tmpfs filesystem is exported via NFS. This is because NFS
> clients do not open directories. Each server-side READDIR operation
> has to open the directory, read it, then close it. The cursor state
> for that directory, being associated strictly with the opened
> struct file, is thus discarded after each NFS READDIR operation.
>
> Directory offsets are cached not only by NFS clients, but also by
> user space libraries on those clients. Essentially there is no way
> to invalidate those caches when directory offsets have changed on
> an NFS server after the offset-to-dentry mapping changes. Thus the
> whole application stack depends on unchanging directory offsets.
>
> The solution we've come up with is to make the directory offset for
> each file in a tmpfs filesystem stable for the life of the directory
> entry it represents.
>
> shmem_readdir() and shmem_dir_llseek() now use an xarray to map each
> directory offset (an loff_t integer) to the memory address of a
> struct dentry.
>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> ---
> mm/shmem.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 47 insertions(+), 7 deletions(-)
>
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 721f9fd064aa..89012f3583b1 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -2410,7 +2410,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
> /* Some things misbehave if size == 0 on a directory */
> inode->i_size = 2 * BOGO_DIRENT_SIZE;
> inode->i_op = &shmem_dir_inode_operations;
> - inode->i_fop = &simple_dir_operations;
> + inode->i_fop = &stable_dir_operations;
> + stable_offset_init(inode);
> break;
> case S_IFLNK:
> /*
> @@ -2950,7 +2951,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
> if (error && error != -EOPNOTSUPP)
> goto out_iput;
>
> - error = 0;
> + error = stable_offset_add(dir, dentry);
> + if (error)
> + goto out_iput;
> +
> dir->i_size += BOGO_DIRENT_SIZE;
> dir->i_ctime = dir->i_mtime = current_time(dir);
> inode_inc_iversion(dir);
> @@ -3027,6 +3031,13 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
> goto out;
> }
>
> + ret = stable_offset_add(dir, dentry);
> + if (ret) {
> + if (inode->i_nlink)
> + shmem_free_inode(inode->i_sb);
> + goto out;
> + }
> +
> dir->i_size += BOGO_DIRENT_SIZE;
> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
> inode_inc_iversion(dir);
> @@ -3045,6 +3056,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
> if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
> shmem_free_inode(inode->i_sb);
>
> + stable_offset_remove(dir, dentry);
> +
> dir->i_size -= BOGO_DIRENT_SIZE;
> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
> inode_inc_iversion(dir);
> @@ -3103,24 +3116,41 @@ static int shmem_rename2(struct mnt_idmap *idmap,
> {
> struct inode *inode = d_inode(old_dentry);
> int they_are_dirs = S_ISDIR(inode->i_mode);
> + int error;
>
> if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
> return -EINVAL;
>
> - if (flags & RENAME_EXCHANGE)
> + if (flags & RENAME_EXCHANGE) {
> + error = stable_offset_add(new_dir, old_dentry);
> + if (error)
> + return error;
Won't this fail in stable_offset_add() with -EBUSY, because
dentry->d_offset is set?
> + error = stable_offset_add(old_dir, new_dentry);
> + if (error) {
> + stable_offset_remove(new_dir, old_dentry);
> + return error;
> + }
> + stable_offset_remove(old_dir, old_dentry);
> + stable_offset_remove(new_dir, new_dentry);
Assuming stable_offset_add() would have succeeded (somehow), old_dentry
and new_dentry would have gotten reset their dentry->d_offset?
> +
> + /* Always returns zero */
> return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
> + }
Hmm, let's assume simple_rename_exchange() fails, stable entries are now
the other way around?
I actually start to wonder if we need to introduce error injection for
RENAME_EXCHANGE, I think it the most complex part in the series.
>
> if (!simple_empty(new_dentry))
> return -ENOTEMPTY;
>
> if (flags & RENAME_WHITEOUT) {
> - int error;
> -
> error = shmem_whiteout(idmap, old_dir, old_dentry);
> if (error)
> return error;
> }
>
> + stable_offset_remove(old_dir, old_dentry);
> + error = stable_offset_add(new_dir, old_dentry);
> + if (error)
> + return error;
> +
> if (d_really_is_positive(new_dentry)) {
> (void) shmem_unlink(new_dir, new_dentry);
> if (they_are_dirs) {
> @@ -3164,19 +3194,23 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
> if (error && error != -EOPNOTSUPP)
> goto out_iput;
>
> + error = stable_offset_add(dir, dentry);
> + if (error)
> + goto out_iput;
> +
> inode->i_size = len-1;
> if (len <= SHORT_SYMLINK_LEN) {
> inode->i_link = kmemdup(symname, len, GFP_KERNEL);
> if (!inode->i_link) {
> error = -ENOMEM;
> - goto out_iput;
> + goto out_remove_offset;
> }
> inode->i_op = &shmem_short_symlink_operations;
> } else {
> inode_nohighmem(inode);
> error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
> if (error)
> - goto out_iput;
> + goto out_remove_offset;
> inode->i_mapping->a_ops = &shmem_aops;
> inode->i_op = &shmem_symlink_inode_operations;
> memcpy(folio_address(folio), symname, len);
> @@ -3185,12 +3219,16 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
> folio_unlock(folio);
> folio_put(folio);
> }
> +
> dir->i_size += BOGO_DIRENT_SIZE;
> dir->i_ctime = dir->i_mtime = current_time(dir);
> inode_inc_iversion(dir);
> d_instantiate(dentry, inode);
> dget(dentry);
> return 0;
> +
> +out_remove_offset:
> + stable_offset_remove(dir, dentry);
> out_iput:
> iput(inode);
> return error;
> @@ -3920,6 +3958,8 @@ static void shmem_destroy_inode(struct inode *inode)
> {
> if (S_ISREG(inode->i_mode))
> mpol_free_shared_policy(&SHMEM_I(inode)->policy);
> + if (S_ISDIR(inode->i_mode))
> + stable_offset_destroy(inode);
> }
>
> static void shmem_init_inode(void *foo)
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 3/3] shmem: stable directory offsets
2023-06-27 14:06 ` Bernd Schubert
@ 2023-06-27 14:11 ` Chuck Lever III
2023-06-27 14:48 ` Bernd Schubert
0 siblings, 1 reply; 12+ messages in thread
From: Chuck Lever III @ 2023-06-27 14:11 UTC (permalink / raw)
To: Bernd Schubert
Cc: Chuck Lever, Al Viro, brauner@kernel.org, hughd@google.com,
Andrew Morton, Jeff Layton, linux-mm@kvack.org,
linux-fsdevel@vger.kernel.org
> On Jun 27, 2023, at 10:06 AM, Bernd Schubert <bernd.schubert@fastmail.fm> wrote:
>
>
>
> On 6/26/23 20:21, Chuck Lever wrote:
>> From: Chuck Lever <chuck.lever@oracle.com>
>> The current cursor-based directory offset mechanism doesn't work
>> when a tmpfs filesystem is exported via NFS. This is because NFS
>> clients do not open directories. Each server-side READDIR operation
>> has to open the directory, read it, then close it. The cursor state
>> for that directory, being associated strictly with the opened
>> struct file, is thus discarded after each NFS READDIR operation.
>> Directory offsets are cached not only by NFS clients, but also by
>> user space libraries on those clients. Essentially there is no way
>> to invalidate those caches when directory offsets have changed on
>> an NFS server after the offset-to-dentry mapping changes. Thus the
>> whole application stack depends on unchanging directory offsets.
>> The solution we've come up with is to make the directory offset for
>> each file in a tmpfs filesystem stable for the life of the directory
>> entry it represents.
>> shmem_readdir() and shmem_dir_llseek() now use an xarray to map each
>> directory offset (an loff_t integer) to the memory address of a
>> struct dentry.
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>> ---
>> mm/shmem.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++-------
>> 1 file changed, 47 insertions(+), 7 deletions(-)
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index 721f9fd064aa..89012f3583b1 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -2410,7 +2410,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
>> /* Some things misbehave if size == 0 on a directory */
>> inode->i_size = 2 * BOGO_DIRENT_SIZE;
>> inode->i_op = &shmem_dir_inode_operations;
>> - inode->i_fop = &simple_dir_operations;
>> + inode->i_fop = &stable_dir_operations;
>> + stable_offset_init(inode);
>> break;
>> case S_IFLNK:
>> /*
>> @@ -2950,7 +2951,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
>> if (error && error != -EOPNOTSUPP)
>> goto out_iput;
>> - error = 0;
>> + error = stable_offset_add(dir, dentry);
>> + if (error)
>> + goto out_iput;
>> +
>> dir->i_size += BOGO_DIRENT_SIZE;
>> dir->i_ctime = dir->i_mtime = current_time(dir);
>> inode_inc_iversion(dir);
>> @@ -3027,6 +3031,13 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
>> goto out;
>> }
>> + ret = stable_offset_add(dir, dentry);
>> + if (ret) {
>> + if (inode->i_nlink)
>> + shmem_free_inode(inode->i_sb);
>> + goto out;
>> + }
>> +
>> dir->i_size += BOGO_DIRENT_SIZE;
>> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
>> inode_inc_iversion(dir);
>> @@ -3045,6 +3056,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
>> if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
>> shmem_free_inode(inode->i_sb);
>> + stable_offset_remove(dir, dentry);
>> +
>> dir->i_size -= BOGO_DIRENT_SIZE;
>> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
>> inode_inc_iversion(dir);
>> @@ -3103,24 +3116,41 @@ static int shmem_rename2(struct mnt_idmap *idmap,
>> {
>> struct inode *inode = d_inode(old_dentry);
>> int they_are_dirs = S_ISDIR(inode->i_mode);
>> + int error;
>> if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
>> return -EINVAL;
>> - if (flags & RENAME_EXCHANGE)
>> + if (flags & RENAME_EXCHANGE) {
>> + error = stable_offset_add(new_dir, old_dentry);
>> + if (error)
>> + return error;
>
> Won't this fail in stable_offset_add() with -EBUSY, because dentry->d_offset is set?
>
>> + error = stable_offset_add(old_dir, new_dentry);
>> + if (error) {
>> + stable_offset_remove(new_dir, old_dentry);
>> + return error;
>> + }
>> + stable_offset_remove(old_dir, old_dentry);
>> + stable_offset_remove(new_dir, new_dentry);
>
> Assuming stable_offset_add() would have succeeded (somehow), old_dentry and new_dentry would have gotten reset their dentry->d_offset?
Probably. I can have another look.
>> +
>> + /* Always returns zero */
>> return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
>> + }
>
> Hmm, let's assume simple_rename_exchange() fails, stable entries are now the other way around?
Today it never fails. We can add an assertion here.
Otherwise cleaning up after a simple_rename_exchange() failure is going to be even more hairy.
> I actually start to wonder if we need to introduce error injection for RENAME_EXCHANGE, I think it the most complex part in the series.
And it's the least frequently used mode of rename.
>> if (!simple_empty(new_dentry))
>> return -ENOTEMPTY;
>> if (flags & RENAME_WHITEOUT) {
>> - int error;
>> -
>> error = shmem_whiteout(idmap, old_dir, old_dentry);
>> if (error)
>> return error;
>> }
>> + stable_offset_remove(old_dir, old_dentry);
>> + error = stable_offset_add(new_dir, old_dentry);
>> + if (error)
>> + return error;
>> +
>> if (d_really_is_positive(new_dentry)) {
>> (void) shmem_unlink(new_dir, new_dentry);
>> if (they_are_dirs) {
>> @@ -3164,19 +3194,23 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
>> if (error && error != -EOPNOTSUPP)
>> goto out_iput;
>> + error = stable_offset_add(dir, dentry);
>> + if (error)
>> + goto out_iput;
>> +
>> inode->i_size = len-1;
>> if (len <= SHORT_SYMLINK_LEN) {
>> inode->i_link = kmemdup(symname, len, GFP_KERNEL);
>> if (!inode->i_link) {
>> error = -ENOMEM;
>> - goto out_iput;
>> + goto out_remove_offset;
>> }
>> inode->i_op = &shmem_short_symlink_operations;
>> } else {
>> inode_nohighmem(inode);
>> error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
>> if (error)
>> - goto out_iput;
>> + goto out_remove_offset;
>> inode->i_mapping->a_ops = &shmem_aops;
>> inode->i_op = &shmem_symlink_inode_operations;
>> memcpy(folio_address(folio), symname, len);
>> @@ -3185,12 +3219,16 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
>> folio_unlock(folio);
>> folio_put(folio);
>> }
>> +
>> dir->i_size += BOGO_DIRENT_SIZE;
>> dir->i_ctime = dir->i_mtime = current_time(dir);
>> inode_inc_iversion(dir);
>> d_instantiate(dentry, inode);
>> dget(dentry);
>> return 0;
>> +
>> +out_remove_offset:
>> + stable_offset_remove(dir, dentry);
>> out_iput:
>> iput(inode);
>> return error;
>> @@ -3920,6 +3958,8 @@ static void shmem_destroy_inode(struct inode *inode)
>> {
>> if (S_ISREG(inode->i_mode))
>> mpol_free_shared_policy(&SHMEM_I(inode)->policy);
>> + if (S_ISDIR(inode->i_mode))
>> + stable_offset_destroy(inode);
>> }
>> static void shmem_init_inode(void *foo)
--
Chuck Lever
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 1/3] libfs: Add directory operations for stable offsets
2023-06-27 14:04 ` Chuck Lever III
@ 2023-06-27 14:19 ` Jeff Layton
0 siblings, 0 replies; 12+ messages in thread
From: Jeff Layton @ 2023-06-27 14:19 UTC (permalink / raw)
To: Chuck Lever III, Christian Brauner
Cc: Christoph Hellwig, Chuck Lever, Al Viro, Hugh Dickins,
Andrew Morton, linux-mm, linux-fsdevel@vger.kernel.org
On Tue, 2023-06-27 at 14:04 +0000, Chuck Lever III wrote:
>
> > On Jun 27, 2023, at 4:52 AM, Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Mon, Jun 26, 2023 at 11:44:15PM -0700, Christoph Hellwig wrote:
> > > > + * @dir: parent directory to be initialized
> > > > + *
> > > > + */
> > > > +void stable_offset_init(struct inode *dir)
> > > > +{
> > > > + xa_init_flags(&dir->i_doff_map, XA_FLAGS_ALLOC1);
> > > > + dir->i_next_offset = 0;
> > > > +}
> > > > +EXPORT_SYMBOL(stable_offset_init);
> > >
> > > If this is exported I'd much prefer a EXPORT_SYMBOL_GPL. But the only
> > > user so far is shmfs, which can't be modular anyway, so I think we can
> > > drop the exports.
> > >
> > > > --- a/include/linux/dcache.h
> > > > +++ b/include/linux/dcache.h
> > > > @@ -96,6 +96,7 @@ struct dentry {
> > > > struct super_block *d_sb; /* The root of the dentry tree */
> > > > unsigned long d_time; /* used by d_revalidate */
> > > > void *d_fsdata; /* fs-specific data */
> > > > + u32 d_offset; /* directory offset in parent */
> > > >
> > > > union {
> > > > struct list_head d_lru; /* LRU list */
> > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > index 133f0640fb24..3fc2c04ed8ff 100644
> > > > --- a/include/linux/fs.h
> > > > +++ b/include/linux/fs.h
> > > > @@ -719,6 +719,10 @@ struct inode {
> > > > #endif
> > > >
> > > > void *i_private; /* fs or device private pointer */
> > > > +
> > > > + /* simplefs stable directory offset tracking */
> > > > + struct xarray i_doff_map;
> > > > + u32 i_next_offset;
> > >
> > > We can't just increase the size of the dentry and inode for everyone
> > > for something that doesn't make any sense for normal file systems.
> > > This needs to move out into structures allocated by the file system
> > > and embedded into or used as the private dentry/inode data.
> >
> > I agree. I prefer if this could be done on a per filesystem basis as
> > well. Especially since, this is currently only useful for a single
> > filesystem.
> >
> > We've tried to be very conservative in increasing inode and dentry size
> > and we should continue with that.
>
> I had thought we were going to adopt the stable offset mechanism
> in more than just shmemfs. That's why we're putting it in libfs.c
> after all. So this was not going to be "just for shmemfs" in the
> long run.
>
> That said, I can move the stable-offset-related inode fields back
> into the shmemfs private inode struct, as it was in v2 of this
> series.
>
I too think that would be best. The libfs helpers will probably need to
take an extra pointer to the relevant fields in the inode and dentry,
but that's not too hard to do.
> For d_offset, I was (ab)using the d_fsdata field by casting the
> offset value as a pointer. That's kind of ugly. Any suggestions?
>
>
Absolutely nothing wrong with interpreting that as an integer. It's a
void * which means that anything that wants to dereference it better
know what it is ahead of time anyway.
--
Jeff Layton <jlayton@redhat.com>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4 3/3] shmem: stable directory offsets
2023-06-27 14:11 ` Chuck Lever III
@ 2023-06-27 14:48 ` Bernd Schubert
0 siblings, 0 replies; 12+ messages in thread
From: Bernd Schubert @ 2023-06-27 14:48 UTC (permalink / raw)
To: Chuck Lever III
Cc: Chuck Lever, Al Viro, brauner@kernel.org, hughd@google.com,
Andrew Morton, Jeff Layton, linux-mm@kvack.org,
linux-fsdevel@vger.kernel.org
On 6/27/23 16:11, Chuck Lever III wrote:
>
>
>> On Jun 27, 2023, at 10:06 AM, Bernd Schubert <bernd.schubert@fastmail.fm> wrote:
>>
>>
>>
>> On 6/26/23 20:21, Chuck Lever wrote:
>>> From: Chuck Lever <chuck.lever@oracle.com>
>>> The current cursor-based directory offset mechanism doesn't work
>>> when a tmpfs filesystem is exported via NFS. This is because NFS
>>> clients do not open directories. Each server-side READDIR operation
>>> has to open the directory, read it, then close it. The cursor state
>>> for that directory, being associated strictly with the opened
>>> struct file, is thus discarded after each NFS READDIR operation.
>>> Directory offsets are cached not only by NFS clients, but also by
>>> user space libraries on those clients. Essentially there is no way
>>> to invalidate those caches when directory offsets have changed on
>>> an NFS server after the offset-to-dentry mapping changes. Thus the
>>> whole application stack depends on unchanging directory offsets.
>>> The solution we've come up with is to make the directory offset for
>>> each file in a tmpfs filesystem stable for the life of the directory
>>> entry it represents.
>>> shmem_readdir() and shmem_dir_llseek() now use an xarray to map each
>>> directory offset (an loff_t integer) to the memory address of a
>>> struct dentry.
>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> ---
>>> mm/shmem.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++-------
>>> 1 file changed, 47 insertions(+), 7 deletions(-)
>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>> index 721f9fd064aa..89012f3583b1 100644
>>> --- a/mm/shmem.c
>>> +++ b/mm/shmem.c
>>> @@ -2410,7 +2410,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
>>> /* Some things misbehave if size == 0 on a directory */
>>> inode->i_size = 2 * BOGO_DIRENT_SIZE;
>>> inode->i_op = &shmem_dir_inode_operations;
>>> - inode->i_fop = &simple_dir_operations;
>>> + inode->i_fop = &stable_dir_operations;
>>> + stable_offset_init(inode);
>>> break;
>>> case S_IFLNK:
>>> /*
>>> @@ -2950,7 +2951,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
>>> if (error && error != -EOPNOTSUPP)
>>> goto out_iput;
>>> - error = 0;
>>> + error = stable_offset_add(dir, dentry);
>>> + if (error)
>>> + goto out_iput;
>>> +
>>> dir->i_size += BOGO_DIRENT_SIZE;
>>> dir->i_ctime = dir->i_mtime = current_time(dir);
>>> inode_inc_iversion(dir);
>>> @@ -3027,6 +3031,13 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
>>> goto out;
>>> }
>>> + ret = stable_offset_add(dir, dentry);
>>> + if (ret) {
>>> + if (inode->i_nlink)
>>> + shmem_free_inode(inode->i_sb);
>>> + goto out;
>>> + }
>>> +
>>> dir->i_size += BOGO_DIRENT_SIZE;
>>> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
>>> inode_inc_iversion(dir);
>>> @@ -3045,6 +3056,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
>>> if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
>>> shmem_free_inode(inode->i_sb);
>>> + stable_offset_remove(dir, dentry);
>>> +
>>> dir->i_size -= BOGO_DIRENT_SIZE;
>>> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
>>> inode_inc_iversion(dir);
>>> @@ -3103,24 +3116,41 @@ static int shmem_rename2(struct mnt_idmap *idmap,
>>> {
>>> struct inode *inode = d_inode(old_dentry);
>>> int they_are_dirs = S_ISDIR(inode->i_mode);
>>> + int error;
>>> if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
>>> return -EINVAL;
>>> - if (flags & RENAME_EXCHANGE)
>>> + if (flags & RENAME_EXCHANGE) {
>>> + error = stable_offset_add(new_dir, old_dentry);
>>> + if (error)
>>> + return error;
>>
>> Won't this fail in stable_offset_add() with -EBUSY, because dentry->d_offset is set?
>>
>>> + error = stable_offset_add(old_dir, new_dentry);
>>> + if (error) {
>>> + stable_offset_remove(new_dir, old_dentry);
>>> + return error;
>>> + }
>>> + stable_offset_remove(old_dir, old_dentry);
>>> + stable_offset_remove(new_dir, new_dentry);
>>
>> Assuming stable_offset_add() would have succeeded (somehow), old_dentry and new_dentry would have gotten reset their dentry->d_offset?
>
> Probably. I can have another look.
>
>
>>> +
>>> + /* Always returns zero */
>>> return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
>>> + }
>>
>> Hmm, let's assume simple_rename_exchange() fails, stable entries are now the other way around?
>
> Today it never fails. We can add an assertion here.
>
> Otherwise cleaning up after a simple_rename_exchange() failure is going to be even more hair >
>
Hmm, then add in the commit message that error handling for
RENAME_EXCHANGE is not perfect?
>> I actually start to wonder if we need to introduce error injection for RENAME_EXCHANGE, I think it the most complex part in the series.
>
> And it's the least frequently used mode of rename.
>
>
>>> if (!simple_empty(new_dentry))
>>> return -ENOTEMPTY;
>>> if (flags & RENAME_WHITEOUT) {
>>> - int error;
>>> -
>>> error = shmem_whiteout(idmap, old_dir, old_dentry);
>>> if (error)
>>> return error;
>>> }
>>> + stable_offset_remove(old_dir, old_dentry);
>>> + error = stable_offset_add(new_dir, old_dentry);
>>> + if (error)
>>> + return error;
>>> +
>>> if (d_really_is_positive(new_dentry)) {
>>> (void) shmem_unlink(new_dir, new_dentry);
>>> if (they_are_dirs) {
>>> @@ -3164,19 +3194,23 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
>>> if (error && error != -EOPNOTSUPP)
>>> goto out_iput;
>>> + error = stable_offset_add(dir, dentry);
>>> + if (error)
>>> + goto out_iput;
>>> +
>>> inode->i_size = len-1;
>>> if (len <= SHORT_SYMLINK_LEN) {
>>> inode->i_link = kmemdup(symname, len, GFP_KERNEL);
>>> if (!inode->i_link) {
>>> error = -ENOMEM;
>>> - goto out_iput;
>>> + goto out_remove_offset;
>>> }
>>> inode->i_op = &shmem_short_symlink_operations;
>>> } else {
>>> inode_nohighmem(inode);
>>> error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
>>> if (error)
>>> - goto out_iput;
>>> + goto out_remove_offset;
>>> inode->i_mapping->a_ops = &shmem_aops;
>>> inode->i_op = &shmem_symlink_inode_operations;
>>> memcpy(folio_address(folio), symname, len);
>>> @@ -3185,12 +3219,16 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
>>> folio_unlock(folio);
>>> folio_put(folio);
>>> }
>>> +
>>> dir->i_size += BOGO_DIRENT_SIZE;
>>> dir->i_ctime = dir->i_mtime = current_time(dir);
>>> inode_inc_iversion(dir);
>>> d_instantiate(dentry, inode);
>>> dget(dentry);
>>> return 0;
>>> +
>>> +out_remove_offset:
>>> + stable_offset_remove(dir, dentry);
>>> out_iput:
>>> iput(inode);
>>> return error;
>>> @@ -3920,6 +3958,8 @@ static void shmem_destroy_inode(struct inode *inode)
>>> {
>>> if (S_ISREG(inode->i_mode))
>>> mpol_free_shared_policy(&SHMEM_I(inode)->policy);
>>> + if (S_ISDIR(inode->i_mode))
>>> + stable_offset_destroy(inode);
>>> }
>>> static void shmem_init_inode(void *foo)
>
>
> --
> Chuck Lever
>
>
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2023-06-27 14:48 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-06-26 18:21 [PATCH v4 0/3] shmemfs stable directory offsets Chuck Lever
2023-06-26 18:21 ` [PATCH v4 1/3] libfs: Add directory operations for stable offsets Chuck Lever
2023-06-27 6:44 ` Christoph Hellwig
2023-06-27 8:52 ` Christian Brauner
2023-06-27 14:04 ` Chuck Lever III
2023-06-27 14:19 ` Jeff Layton
2023-06-26 18:21 ` [PATCH v4 2/3] shmem: Refactor shmem_symlink() Chuck Lever
2023-06-27 6:44 ` Christoph Hellwig
2023-06-26 18:21 ` [PATCH v4 3/3] shmem: stable directory offsets Chuck Lever
2023-06-27 14:06 ` Bernd Schubert
2023-06-27 14:11 ` Chuck Lever III
2023-06-27 14:48 ` Bernd Schubert
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).