* [PATCH v4 0/2] ceph: fix client race conditions with stale r_parent
@ 2025-08-12 9:57 Alex Markuze
2025-08-12 9:57 ` [PATCH v4 1/2] ceph: fix client race condition validating r_parent before applying state Alex Markuze
2025-08-12 9:57 ` [PATCH v4 2/2] ceph: fix client race condition where r_parent becomes stale before sending message Alex Markuze
0 siblings, 2 replies; 5+ messages in thread
From: Alex Markuze @ 2025-08-12 9:57 UTC (permalink / raw)
To: ceph-devel; +Cc: linux-kernel, Slava.Dubeyko, idryomov, Alex Markuze
This patch series addresses client-side race conditions in the Ceph filesystem
where the cached parent directory inode (r_parent) can become stale during
concurrent operations like rename, leading to incorrect state application.
The first patch adds validation during reply processing to ensure the cached
parent directory inode matches the directory info in MDS replies. It refactors
the path building API to use a structured approach and prevents applying state
changes to incorrect directory inodes.
The second patch addresses cases where r_parent becomes stale between request
creation and message sending when the parent directory's i_rwsem is not locked.
It validates that r_parent matches the encoded parent inode and updates to the
correct inode if a mismatch is detected, with appropriate warnings for this
Alex Markuze (2):
ceph: fix client race condition validating r_parent before applying
state
ceph: fix client race condition where r_parent becomes stale before
sending message
fs/ceph/debugfs.c | 14 ++--
fs/ceph/dir.c | 17 ++---
fs/ceph/file.c | 24 +++----
fs/ceph/inode.c | 59 ++++++++++++++--
fs/ceph/mds_client.c | 165 ++++++++++++++++++++++++++-----------------
fs/ceph/mds_client.h | 18 +++--
6 files changed, 189 insertions(+), 108 deletions(-)
--
2.34.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v4 1/2] ceph: fix client race condition validating r_parent before applying state
2025-08-12 9:57 [PATCH v4 0/2] ceph: fix client race conditions with stale r_parent Alex Markuze
@ 2025-08-12 9:57 ` Alex Markuze
2025-08-13 17:56 ` Viacheslav Dubeyko
2025-08-12 9:57 ` [PATCH v4 2/2] ceph: fix client race condition where r_parent becomes stale before sending message Alex Markuze
1 sibling, 1 reply; 5+ messages in thread
From: Alex Markuze @ 2025-08-12 9:57 UTC (permalink / raw)
To: ceph-devel; +Cc: linux-kernel, Slava.Dubeyko, idryomov, Alex Markuze
Add validation to ensure the cached parent directory inode matches the
directory info in MDS replies. This prevents client-side race conditions
where concurrent operations (e.g. rename) cause r_parent to become stale
between request initiation and reply processing, which could lead to
applying state changes to incorrect directory inodes.
Signed-off-by: Alex Markuze <amarkuze@redhat.com>
---
fs/ceph/debugfs.c | 14 ++--
fs/ceph/dir.c | 17 ++---
fs/ceph/file.c | 24 +++----
fs/ceph/inode.c | 7 +-
fs/ceph/mds_client.c | 165 ++++++++++++++++++++++++++-----------------
fs/ceph/mds_client.h | 18 +++--
6 files changed, 139 insertions(+), 106 deletions(-)
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdd404fc8112..f3fe786b4143 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct rb_node *rp;
- int pathlen = 0;
- u64 pathbase;
char *path;
mutex_lock(&mdsc->mutex);
@@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p)
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
} else if (req->r_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_dentry->d_lock);
@@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_dentry,
path ? path : "");
spin_unlock(&req->r_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
@@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p)
}
if (req->r_old_dentry) {
- path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
- &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
if (IS_ERR(path))
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
@@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
req->r_old_dentry,
path ? path : "");
spin_unlock(&req->r_old_dentry->d_lock);
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
if (req->r_ino2.ino)
seq_printf(s, " #%llx/%s", req->r_ino2.ino,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1535b6540e9d..08c3129cd25d 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
/* If op failed, mark everyone involved for errors */
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
/* mark error on parent + clear complete */
mapping_set_error(req->r_parent->i_mapping, result);
@@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_old_inode->i_mapping, result);
pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
}
out:
iput(req->r_old_inode);
@@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
int err = -EROFS;
int op;
char *path;
- int pathlen;
- u64 pathbase;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index c92ef1fc5b8a..d222dd192fec 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
int flags, fmode, wanted;
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
int mask = MAY_READ;
@@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file)
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
@@ -613,15 +612,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- int pathlen = 0;
- u64 base = 0;
- char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
- &base, 0);
+ struct ceph_path_info path_info = {0};
+ char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
pr_warn_client(cl,
"async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path_info(&path_info);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
@@ -789,8 +786,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
int mask;
int err;
char *path;
- int pathlen;
- u64 pathbase;
doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
dir, ceph_vinop(dir), dentry, dentry,
@@ -812,7 +807,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (!dn) {
try_async = false;
} else {
- path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
if (IS_ERR(path)) {
try_async = false;
err = 0;
@@ -824,7 +820,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
mask |= MAY_WRITE;
err = ceph_mds_check_access(mdsc, path, mask);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dn);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4365d17ce6d7..36981950a368 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2502,22 +2502,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
int truncate_retry = 20; /* The RMW will take around 50ms */
struct dentry *dentry;
char *path;
- int pathlen;
- u64 pathbase;
bool do_sync = false;
dentry = d_find_alias(inode);
if (!dentry) {
do_sync = true;
} else {
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
+ struct ceph_path_info path_info;
+ path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
if (IS_ERR(path)) {
do_sync = true;
err = 0;
} else {
err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
}
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
dput(dentry);
/* For none EACCES cases will let the MDS do the mds auth check */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 9eed6d73a508..f31278859119 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2729,7 +2729,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
* foo/.snap/bar -> foo//bar
*/
char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- int *plen, u64 *pbase, int for_wire)
+ struct ceph_path_info *path_info, int for_wire)
{
struct ceph_client *cl = mdsc->fsc->client;
struct dentry *cur;
@@ -2839,16 +2839,30 @@ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
return ERR_PTR(-ENAMETOOLONG);
}
- *pbase = base;
- *plen = PATH_MAX - 1 - pos;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
+ path_info->vino.ino = base;
+ path_info->pathlen = PATH_MAX - 1 - pos;
+ path_info->path = path + pos;
+ path_info->freepath = true;
+
+ /* Set snap from dentry if available */
+ if (d_inode(dentry))
+ path_info->vino.snap = ceph_snap(d_inode(dentry));
+ else
+ path_info->vino.snap = CEPH_NOSNAP;
+
doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
- base, *plen, path + pos);
+ base, PATH_MAX - 1 - pos, path + pos);
return path + pos;
}
+
+
static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
- struct inode *dir, const char **ppath, int *ppathlen,
- u64 *pino, bool *pfreepath, bool parent_locked)
+ struct inode *dir, struct ceph_path_info *path_info,
+ bool parent_locked)
{
char *path;
@@ -2857,41 +2871,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
dir = d_inode_rcu(dentry->d_parent);
if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
!IS_ENCRYPTED(dir)) {
- *pino = ceph_ino(dir);
+ path_info->vino.ino = ceph_ino(dir);
+ path_info->vino.snap = ceph_snap(dir);
rcu_read_unlock();
- *ppath = dentry->d_name.name;
- *ppathlen = dentry->d_name.len;
+ path_info->path = dentry->d_name.name;
+ path_info->pathlen = dentry->d_name.len;
+ path_info->freepath = false;
return 0;
}
rcu_read_unlock();
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap handling.
+ */
return 0;
}
-static int build_inode_path(struct inode *inode,
- const char **ppath, int *ppathlen, u64 *pino,
- bool *pfreepath)
+static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct dentry *dentry;
char *path;
if (ceph_snap(inode) == CEPH_NOSNAP) {
- *pino = ceph_ino(inode);
- *ppathlen = 0;
+ path_info->vino.ino = ceph_ino(inode);
+ path_info->vino.snap = ceph_snap(inode);
+ path_info->pathlen = 0;
+ path_info->freepath = false;
return 0;
}
dentry = d_find_alias(inode);
- path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
+ path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
dput(dentry);
if (IS_ERR(path))
return PTR_ERR(path);
- *ppath = path;
- *pfreepath = true;
+ /*
+ * ceph_mdsc_build_path already fills path_info, including snap from dentry.
+ * Override with inode's snap since that's what this function is for.
+ */
+ path_info->vino.snap = ceph_snap(inode);
return 0;
}
@@ -2901,26 +2921,31 @@ static int build_inode_path(struct inode *inode,
*/
static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
struct dentry *rdentry, struct inode *rdiri,
- const char *rpath, u64 rino, const char **ppath,
- int *pathlen, u64 *ino, bool *freepath,
+ const char *rpath, u64 rino, struct ceph_path_info *path_info,
bool parent_locked)
{
struct ceph_client *cl = mdsc->fsc->client;
int r = 0;
+ /* Initialize the output structure */
+ memset(path_info, 0, sizeof(*path_info));
+
if (rinode) {
- r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
+ r = build_inode_path(rinode, path_info);
doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
ceph_snap(rinode));
} else if (rdentry) {
- r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
- freepath, parent_locked);
- doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
+ r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
+ doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
+ path_info->pathlen, path_info->path);
} else if (rpath || rino) {
- *ino = rino;
- *ppath = rpath;
- *pathlen = rpath ? strlen(rpath) : 0;
- doutc(cl, " path %.*s\n", *pathlen, rpath);
+ path_info->vino.ino = rino;
+ path_info->vino.snap = CEPH_NOSNAP;
+ path_info->path = rpath;
+ path_info->pathlen = rpath ? strlen(rpath) : 0;
+ path_info->freepath = false;
+
+ doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
}
return r;
@@ -2997,11 +3022,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *msg;
struct ceph_mds_request_head_legacy *lhead;
- const char *path1 = NULL;
- const char *path2 = NULL;
- u64 ino1 = 0, ino2 = 0;
- int pathlen1 = 0, pathlen2 = 0;
- bool freepath1 = false, freepath2 = false;
+ struct ceph_path_info path_info1 = {0};
+ struct ceph_path_info path_info2 = {0};
struct dentry *old_dentry = NULL;
int len;
u16 releases;
@@ -3011,25 +3033,42 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
u16 request_head_version = mds_supported_head_version(session);
kuid_t caller_fsuid = req->r_cred->fsuid;
kgid_t caller_fsgid = req->r_cred->fsgid;
+ bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
- ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
- req->r_parent, req->r_path1, req->r_ino1.ino,
- &path1, &pathlen1, &ino1, &freepath1,
- test_bit(CEPH_MDS_R_PARENT_LOCKED,
- &req->r_req_flags));
+ ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
+ req->r_parent, req->r_path1, req->r_ino1.ino,
+ &path_info1, parent_locked);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out;
}
+ /*
+ * When the parent directory's i_rwsem is *not* locked, req->r_parent may
+ * have become stale (e.g. after a concurrent rename) between the time the
+ * dentry was looked up and now. If we detect that the stored r_parent
+ * does not match the inode number we just encoded for the request, switch
+ * to the correct inode so that the MDS receives a valid parent reference.
+ */
+ if (!parent_locked &&
+ req->r_parent && path_info1.vino.ino && ceph_ino(req->r_parent) != path_info1.vino.ino) {
+ struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
+ if (!IS_ERR(correct_dir)) {
+ WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
+ ceph_ino(req->r_parent), path_info1.vino.ino);
+ iput(req->r_parent);
+ req->r_parent = correct_dir;
+ }
+ }
+
/* If r_old_dentry is set, then assume that its parent is locked */
if (req->r_old_dentry &&
!(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
old_dentry = req->r_old_dentry;
ret = set_request_path_attr(mdsc, NULL, old_dentry,
- req->r_old_dentry_dir,
- req->r_path2, req->r_ino2.ino,
- &path2, &pathlen2, &ino2, &freepath2, true);
+ req->r_old_dentry_dir,
+ req->r_path2, req->r_ino2.ino,
+ &path_info2, true);
if (ret < 0) {
msg = ERR_PTR(ret);
goto out_free1;
@@ -3060,7 +3099,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
/* filepaths */
len += 2 * (1 + sizeof(u32) + sizeof(u64));
- len += pathlen1 + pathlen2;
+ len += path_info1.pathlen + path_info2.pathlen;
/* cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -3068,9 +3107,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
if (req->r_dentry_drop)
- len += pathlen1;
+ len += path_info1.pathlen;
if (req->r_old_dentry_drop)
- len += pathlen2;
+ len += path_info2.pathlen;
/* MClientRequest tail */
@@ -3183,8 +3222,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
lhead->ino = cpu_to_le64(req->r_deleg_ino);
lhead->args = req->r_args;
- ceph_encode_filepath(&p, end, ino1, path1);
- ceph_encode_filepath(&p, end, ino2, path2);
+ ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
+ ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
/* make note of release offset, in case we need to replay */
req->r_request_release_offset = p - msg->front.iov_base;
@@ -3247,11 +3286,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.data_off = cpu_to_le16(0);
out_free2:
- if (freepath2)
- ceph_mdsc_free_path((char *)path2, pathlen2);
+ ceph_mdsc_free_path_info(&path_info2);
out_free1:
- if (freepath1)
- ceph_mdsc_free_path((char *)path1, pathlen1);
+ ceph_mdsc_free_path_info(&path_info1);
out:
return msg;
out_err:
@@ -4608,24 +4645,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
struct ceph_pagelist *pagelist = recon_state->pagelist;
struct dentry *dentry;
struct ceph_cap *cap;
- char *path;
- int pathlen = 0, err;
- u64 pathbase;
+ struct ceph_path_info path_info = {0};
+ int err;
u64 snap_follows;
dentry = d_find_primary(inode);
if (dentry) {
/* set pathbase to parent dir when msg_version >= 2 */
- path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
+ char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
recon_state->msg_version >= 2);
dput(dentry);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out_err;
}
- } else {
- path = NULL;
- pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
@@ -4658,7 +4691,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v2.pathbase = cpu_to_le64(pathbase);
+ rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
rec.v2.flock_len = (__force __le32)
((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
} else {
@@ -4673,7 +4706,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
ts = inode_get_atime(inode);
ceph_encode_timespec64(&rec.v1.atime, &ts);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
- rec.v1.pathbase = cpu_to_le64(pathbase);
+ rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
}
if (list_empty(&ci->i_cap_snaps)) {
@@ -4735,7 +4768,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
sizeof(struct ceph_filelock);
rec.v2.flock_len = cpu_to_le32(struct_len);
- struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
+ struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
if (struct_v >= 2)
struct_len += sizeof(u64); /* snap_follows */
@@ -4759,7 +4792,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
ceph_pagelist_encode_8(pagelist, 1);
ceph_pagelist_encode_32(pagelist, struct_len);
}
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
ceph_locks_to_pagelist(flocks, pagelist,
num_fcntl_locks, num_flock_locks);
@@ -4770,17 +4803,17 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
} else {
err = ceph_pagelist_reserve(pagelist,
sizeof(u64) + sizeof(u32) +
- pathlen + sizeof(rec.v1));
+ path_info.pathlen + sizeof(rec.v1));
if (err)
goto out_err;
ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
- ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
}
out_err:
- ceph_mdsc_free_path(path, pathlen);
+ ceph_mdsc_free_path_info(&path_info);
if (!err)
recon_state->nr_caps++;
return err;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 3e2a6fa7c19a..0428a5eaf28c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
-static inline void ceph_mdsc_free_path(char *path, int len)
+/*
+ * Structure to group path-related output parameters for build_*_path functions
+ */
+struct ceph_path_info {
+ const char *path;
+ int pathlen;
+ struct ceph_vino vino;
+ bool freepath;
+};
+
+static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
{
- if (!IS_ERR_OR_NULL(path))
- __putname(path - (PATH_MAX - 1 - len));
+ if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
+ __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
}
extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
- struct dentry *dentry, int *plen, u64 *base,
+ struct dentry *dentry, struct ceph_path_info *path_info,
int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v4 2/2] ceph: fix client race condition where r_parent becomes stale before sending message
2025-08-12 9:57 [PATCH v4 0/2] ceph: fix client race conditions with stale r_parent Alex Markuze
2025-08-12 9:57 ` [PATCH v4 1/2] ceph: fix client race condition validating r_parent before applying state Alex Markuze
@ 2025-08-12 9:57 ` Alex Markuze
2025-08-13 17:56 ` Viacheslav Dubeyko
1 sibling, 1 reply; 5+ messages in thread
From: Alex Markuze @ 2025-08-12 9:57 UTC (permalink / raw)
To: ceph-devel; +Cc: linux-kernel, Slava.Dubeyko, idryomov, Alex Markuze
When the parent directory's i_rwsem is not locked, req->r_parent may become
stale due to concurrent operations (e.g. rename) between dentry lookup and
message creation. Validate that r_parent matches the encoded parent inode
and update to the correct inode if a mismatch is detected.
Signed-off-by: Alex Markuze <amarkuze@redhat.com>
---
fs/ceph/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 50 insertions(+), 2 deletions(-)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 36981950a368..42110d133f15 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -56,6 +56,51 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
+/*
+ * Check if the parent inode matches the vino from directory reply info
+ */
+static inline bool ceph_vino_matches_parent(struct inode *parent, struct ceph_vino vino)
+{
+ return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
+}
+
+/*
+ * Validate that the directory inode referenced by @req->r_parent matches the
+ * inode number and snapshot id contained in the reply's directory record. If
+ * they do not match – which can theoretically happen if the parent dentry was
+ * moved between the time the request was issued and the reply arrived – fall
+ * back to looking up the correct inode in the inode cache.
+ *
+ * A reference is *always* returned. Callers that receive a different inode
+ * than the original @parent are responsible for dropping the extra reference
+ * once the reply has been processed.
+ */
+static struct inode *ceph_get_reply_dir(struct super_block *sb,
+ struct inode *parent,
+ struct ceph_mds_reply_info_parsed *rinfo)
+{
+ struct ceph_vino vino;
+
+ if (unlikely(!rinfo->diri.in))
+ return parent; /* nothing to compare against */
+
+ /* If we didn't have a cached parent inode to begin with, just bail out. */
+ if (!parent)
+ return NULL;
+
+ vino.ino = le64_to_cpu(rinfo->diri.in->ino);
+ vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
+
+ if (likely(ceph_vino_matches_parent(parent, vino)))
+ return parent; /* matches – use the original reference */
+
+ /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
+ WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
+ ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
+
+ return ceph_get_inode(sb, vino, NULL);
+}
+
/**
* ceph_new_inode - allocate a new inode in advance of an expected create
* @dir: parent directory for new inode
@@ -1548,8 +1593,11 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
}
if (rinfo->head->is_dentry) {
- struct inode *dir = req->r_parent;
-
+ /*
+ * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
+ * so we need to get the correct inode
+ */
+ struct inode *dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
if (dir) {
err = ceph_fill_inode(dir, NULL, &rinfo->diri,
rinfo->dirfrag, session, -1,
--
2.34.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v4 1/2] ceph: fix client race condition validating r_parent before applying state
2025-08-12 9:57 ` [PATCH v4 1/2] ceph: fix client race condition validating r_parent before applying state Alex Markuze
@ 2025-08-13 17:56 ` Viacheslav Dubeyko
0 siblings, 0 replies; 5+ messages in thread
From: Viacheslav Dubeyko @ 2025-08-13 17:56 UTC (permalink / raw)
To: Alex Markuze, ceph-devel@vger.kernel.org
Cc: idryomov@gmail.com, linux-kernel@vger.kernel.org
On Tue, 2025-08-12 at 09:57 +0000, Alex Markuze wrote:
> Add validation to ensure the cached parent directory inode matches the
> directory info in MDS replies. This prevents client-side race conditions
> where concurrent operations (e.g. rename) cause r_parent to become stale
> between request initiation and reply processing, which could lead to
> applying state changes to incorrect directory inodes.
>
> Signed-off-by: Alex Markuze <amarkuze@redhat.com>
> ---
> fs/ceph/debugfs.c | 14 ++--
> fs/ceph/dir.c | 17 ++---
> fs/ceph/file.c | 24 +++----
> fs/ceph/inode.c | 7 +-
> fs/ceph/mds_client.c | 165 ++++++++++++++++++++++++++-----------------
> fs/ceph/mds_client.h | 18 +++--
> 6 files changed, 139 insertions(+), 106 deletions(-)
>
Looks good.
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Thanks,
Slava.
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index fdd404fc8112..f3fe786b4143 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -55,8 +55,6 @@ static int mdsc_show(struct seq_file *s, void *p)
> struct ceph_mds_client *mdsc = fsc->mdsc;
> struct ceph_mds_request *req;
> struct rb_node *rp;
> - int pathlen = 0;
> - u64 pathbase;
> char *path;
>
> mutex_lock(&mdsc->mutex);
> @@ -81,8 +79,8 @@ static int mdsc_show(struct seq_file *s, void *p)
> if (req->r_inode) {
> seq_printf(s, " #%llx", ceph_ino(req->r_inode));
> } else if (req->r_dentry) {
> - path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
> - &pathbase, 0);
> + struct ceph_path_info path_info;
> + path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
> if (IS_ERR(path))
> path = NULL;
> spin_lock(&req->r_dentry->d_lock);
> @@ -91,7 +89,7 @@ static int mdsc_show(struct seq_file *s, void *p)
> req->r_dentry,
> path ? path : "");
> spin_unlock(&req->r_dentry->d_lock);
> - ceph_mdsc_free_path(path, pathlen);
> + ceph_mdsc_free_path_info(&path_info);
> } else if (req->r_path1) {
> seq_printf(s, " #%llx/%s", req->r_ino1.ino,
> req->r_path1);
> @@ -100,8 +98,8 @@ static int mdsc_show(struct seq_file *s, void *p)
> }
>
> if (req->r_old_dentry) {
> - path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &pathlen,
> - &pathbase, 0);
> + struct ceph_path_info path_info;
> + path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0);
> if (IS_ERR(path))
> path = NULL;
> spin_lock(&req->r_old_dentry->d_lock);
> @@ -111,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
> req->r_old_dentry,
> path ? path : "");
> spin_unlock(&req->r_old_dentry->d_lock);
> - ceph_mdsc_free_path(path, pathlen);
> + ceph_mdsc_free_path_info(&path_info);
> } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
> if (req->r_ino2.ino)
> seq_printf(s, " #%llx/%s", req->r_ino2.ino,
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 1535b6540e9d..08c3129cd25d 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -1271,10 +1271,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
>
> /* If op failed, mark everyone involved for errors */
> if (result) {
> - int pathlen = 0;
> - u64 base = 0;
> - char *path = ceph_mdsc_build_path(mdsc, dentry, &pathlen,
> - &base, 0);
> + struct ceph_path_info path_info = {0};
> + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
>
> /* mark error on parent + clear complete */
> mapping_set_error(req->r_parent->i_mapping, result);
> @@ -1288,8 +1286,8 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> mapping_set_error(req->r_old_inode->i_mapping, result);
>
> pr_warn_client(cl, "failure path=(%llx)%s result=%d!\n",
> - base, IS_ERR(path) ? "<<bad>>" : path, result);
> - ceph_mdsc_free_path(path, pathlen);
> + path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
> + ceph_mdsc_free_path_info(&path_info);
> }
> out:
> iput(req->r_old_inode);
> @@ -1347,8 +1345,6 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> int err = -EROFS;
> int op;
> char *path;
> - int pathlen;
> - u64 pathbase;
>
> if (ceph_snap(dir) == CEPH_SNAPDIR) {
> /* rmdir .snap/foo is RMSNAP */
> @@ -1367,14 +1363,15 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> if (!dn) {
> try_async = false;
> } else {
> - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
> + struct ceph_path_info path_info;
> + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
> if (IS_ERR(path)) {
> try_async = false;
> err = 0;
> } else {
> err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
> }
> - ceph_mdsc_free_path(path, pathlen);
> + ceph_mdsc_free_path_info(&path_info);
> dput(dn);
>
> /* For none EACCES cases will let the MDS do the mds auth check */
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index c92ef1fc5b8a..d222dd192fec 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -368,8 +368,6 @@ int ceph_open(struct inode *inode, struct file *file)
> int flags, fmode, wanted;
> struct dentry *dentry;
> char *path;
> - int pathlen;
> - u64 pathbase;
> bool do_sync = false;
> int mask = MAY_READ;
>
> @@ -399,14 +397,15 @@ int ceph_open(struct inode *inode, struct file *file)
> if (!dentry) {
> do_sync = true;
> } else {
> - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
> + struct ceph_path_info path_info;
> + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
> if (IS_ERR(path)) {
> do_sync = true;
> err = 0;
> } else {
> err = ceph_mds_check_access(mdsc, path, mask);
> }
> - ceph_mdsc_free_path(path, pathlen);
> + ceph_mdsc_free_path_info(&path_info);
> dput(dentry);
>
> /* For none EACCES cases will let the MDS do the mds auth check */
> @@ -613,15 +612,13 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
> mapping_set_error(req->r_parent->i_mapping, result);
>
> if (result) {
> - int pathlen = 0;
> - u64 base = 0;
> - char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &pathlen,
> - &base, 0);
> + struct ceph_path_info path_info = {0};
> + char *path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0);
>
> pr_warn_client(cl,
> "async create failure path=(%llx)%s result=%d!\n",
> - base, IS_ERR(path) ? "<<bad>>" : path, result);
> - ceph_mdsc_free_path(path, pathlen);
> + path_info.vino.ino, IS_ERR(path) ? "<<bad>>" : path, result);
> + ceph_mdsc_free_path_info(&path_info);
>
> ceph_dir_clear_complete(req->r_parent);
> if (!d_unhashed(dentry))
> @@ -789,8 +786,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> int mask;
> int err;
> char *path;
> - int pathlen;
> - u64 pathbase;
>
> doutc(cl, "%p %llx.%llx dentry %p '%pd' %s flags %d mode 0%o\n",
> dir, ceph_vinop(dir), dentry, dentry,
> @@ -812,7 +807,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> if (!dn) {
> try_async = false;
> } else {
> - path = ceph_mdsc_build_path(mdsc, dn, &pathlen, &pathbase, 0);
> + struct ceph_path_info path_info;
> + path = ceph_mdsc_build_path(mdsc, dn, &path_info, 0);
> if (IS_ERR(path)) {
> try_async = false;
> err = 0;
> @@ -824,7 +820,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> mask |= MAY_WRITE;
> err = ceph_mds_check_access(mdsc, path, mask);
> }
> - ceph_mdsc_free_path(path, pathlen);
> + ceph_mdsc_free_path_info(&path_info);
> dput(dn);
>
> /* For none EACCES cases will let the MDS do the mds auth check */
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 4365d17ce6d7..36981950a368 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -2502,22 +2502,21 @@ int __ceph_setattr(struct mnt_idmap *idmap, struct inode *inode,
> int truncate_retry = 20; /* The RMW will take around 50ms */
> struct dentry *dentry;
> char *path;
> - int pathlen;
> - u64 pathbase;
> bool do_sync = false;
>
> dentry = d_find_alias(inode);
> if (!dentry) {
> do_sync = true;
> } else {
> - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase, 0);
> + struct ceph_path_info path_info;
> + path = ceph_mdsc_build_path(mdsc, dentry, &path_info, 0);
> if (IS_ERR(path)) {
> do_sync = true;
> err = 0;
> } else {
> err = ceph_mds_check_access(mdsc, path, MAY_WRITE);
> }
> - ceph_mdsc_free_path(path, pathlen);
> + ceph_mdsc_free_path_info(&path_info);
> dput(dentry);
>
> /* For none EACCES cases will let the MDS do the mds auth check */
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 9eed6d73a508..f31278859119 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2729,7 +2729,7 @@ static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
> * foo/.snap/bar -> foo//bar
> */
> char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
> - int *plen, u64 *pbase, int for_wire)
> + struct ceph_path_info *path_info, int for_wire)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> struct dentry *cur;
> @@ -2839,16 +2839,30 @@ char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
> return ERR_PTR(-ENAMETOOLONG);
> }
>
> - *pbase = base;
> - *plen = PATH_MAX - 1 - pos;
> + /* Initialize the output structure */
> + memset(path_info, 0, sizeof(*path_info));
> +
> + path_info->vino.ino = base;
> + path_info->pathlen = PATH_MAX - 1 - pos;
> + path_info->path = path + pos;
> + path_info->freepath = true;
> +
> + /* Set snap from dentry if available */
> + if (d_inode(dentry))
> + path_info->vino.snap = ceph_snap(d_inode(dentry));
> + else
> + path_info->vino.snap = CEPH_NOSNAP;
> +
> doutc(cl, "on %p %d built %llx '%.*s'\n", dentry, d_count(dentry),
> - base, *plen, path + pos);
> + base, PATH_MAX - 1 - pos, path + pos);
> return path + pos;
> }
>
> +
> +
> static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry,
> - struct inode *dir, const char **ppath, int *ppathlen,
> - u64 *pino, bool *pfreepath, bool parent_locked)
> + struct inode *dir, struct ceph_path_info *path_info,
> + bool parent_locked)
> {
> char *path;
>
> @@ -2857,41 +2871,47 @@ static int build_dentry_path(struct ceph_mds_client *mdsc, struct dentry *dentry
> dir = d_inode_rcu(dentry->d_parent);
> if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
> !IS_ENCRYPTED(dir)) {
> - *pino = ceph_ino(dir);
> + path_info->vino.ino = ceph_ino(dir);
> + path_info->vino.snap = ceph_snap(dir);
> rcu_read_unlock();
> - *ppath = dentry->d_name.name;
> - *ppathlen = dentry->d_name.len;
> + path_info->path = dentry->d_name.name;
> + path_info->pathlen = dentry->d_name.len;
> + path_info->freepath = false;
> return 0;
> }
> rcu_read_unlock();
> - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
> + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
> if (IS_ERR(path))
> return PTR_ERR(path);
> - *ppath = path;
> - *pfreepath = true;
> + /*
> + * ceph_mdsc_build_path already fills path_info, including snap handling.
> + */
> return 0;
> }
>
> -static int build_inode_path(struct inode *inode,
> - const char **ppath, int *ppathlen, u64 *pino,
> - bool *pfreepath)
> +static int build_inode_path(struct inode *inode, struct ceph_path_info *path_info)
> {
> struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
> struct dentry *dentry;
> char *path;
>
> if (ceph_snap(inode) == CEPH_NOSNAP) {
> - *pino = ceph_ino(inode);
> - *ppathlen = 0;
> + path_info->vino.ino = ceph_ino(inode);
> + path_info->vino.snap = ceph_snap(inode);
> + path_info->pathlen = 0;
> + path_info->freepath = false;
> return 0;
> }
> dentry = d_find_alias(inode);
> - path = ceph_mdsc_build_path(mdsc, dentry, ppathlen, pino, 1);
> + path = ceph_mdsc_build_path(mdsc, dentry, path_info, 1);
> dput(dentry);
> if (IS_ERR(path))
> return PTR_ERR(path);
> - *ppath = path;
> - *pfreepath = true;
> + /*
> + * ceph_mdsc_build_path already fills path_info, including snap from dentry.
> + * Override with inode's snap since that's what this function is for.
> + */
> + path_info->vino.snap = ceph_snap(inode);
> return 0;
> }
>
> @@ -2901,26 +2921,31 @@ static int build_inode_path(struct inode *inode,
> */
> static int set_request_path_attr(struct ceph_mds_client *mdsc, struct inode *rinode,
> struct dentry *rdentry, struct inode *rdiri,
> - const char *rpath, u64 rino, const char **ppath,
> - int *pathlen, u64 *ino, bool *freepath,
> + const char *rpath, u64 rino, struct ceph_path_info *path_info,
> bool parent_locked)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> int r = 0;
>
> + /* Initialize the output structure */
> + memset(path_info, 0, sizeof(*path_info));
> +
> if (rinode) {
> - r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
> + r = build_inode_path(rinode, path_info);
> doutc(cl, " inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
> ceph_snap(rinode));
> } else if (rdentry) {
> - r = build_dentry_path(mdsc, rdentry, rdiri, ppath, pathlen, ino,
> - freepath, parent_locked);
> - doutc(cl, " dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath);
> + r = build_dentry_path(mdsc, rdentry, rdiri, path_info, parent_locked);
> + doutc(cl, " dentry %p %llx/%.*s\n", rdentry, path_info->vino.ino,
> + path_info->pathlen, path_info->path);
> } else if (rpath || rino) {
> - *ino = rino;
> - *ppath = rpath;
> - *pathlen = rpath ? strlen(rpath) : 0;
> - doutc(cl, " path %.*s\n", *pathlen, rpath);
> + path_info->vino.ino = rino;
> + path_info->vino.snap = CEPH_NOSNAP;
> + path_info->path = rpath;
> + path_info->pathlen = rpath ? strlen(rpath) : 0;
> + path_info->freepath = false;
> +
> + doutc(cl, " path %.*s\n", path_info->pathlen, rpath);
> }
>
> return r;
> @@ -2997,11 +3022,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> struct ceph_client *cl = mdsc->fsc->client;
> struct ceph_msg *msg;
> struct ceph_mds_request_head_legacy *lhead;
> - const char *path1 = NULL;
> - const char *path2 = NULL;
> - u64 ino1 = 0, ino2 = 0;
> - int pathlen1 = 0, pathlen2 = 0;
> - bool freepath1 = false, freepath2 = false;
> + struct ceph_path_info path_info1 = {0};
> + struct ceph_path_info path_info2 = {0};
> struct dentry *old_dentry = NULL;
> int len;
> u16 releases;
> @@ -3011,25 +3033,42 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> u16 request_head_version = mds_supported_head_version(session);
> kuid_t caller_fsuid = req->r_cred->fsuid;
> kgid_t caller_fsgid = req->r_cred->fsgid;
> + bool parent_locked = test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
>
> - ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> - req->r_parent, req->r_path1, req->r_ino1.ino,
> - &path1, &pathlen1, &ino1, &freepath1,
> - test_bit(CEPH_MDS_R_PARENT_LOCKED,
> - &req->r_req_flags));
> + ret = set_request_path_attr(mdsc, req->r_inode, req->r_dentry,
> + req->r_parent, req->r_path1, req->r_ino1.ino,
> + &path_info1, parent_locked);
> if (ret < 0) {
> msg = ERR_PTR(ret);
> goto out;
> }
>
> + /*
> + * When the parent directory's i_rwsem is *not* locked, req->r_parent may
> + * have become stale (e.g. after a concurrent rename) between the time the
> + * dentry was looked up and now. If we detect that the stored r_parent
> + * does not match the inode number we just encoded for the request, switch
> + * to the correct inode so that the MDS receives a valid parent reference.
> + */
> + if (!parent_locked &&
> + req->r_parent && path_info1.vino.ino && ceph_ino(req->r_parent) != path_info1.vino.ino) {
> + struct inode *correct_dir = ceph_get_inode(mdsc->fsc->sb, path_info1.vino, NULL);
> + if (!IS_ERR(correct_dir)) {
> + WARN_ONCE(1, "ceph: r_parent mismatch (had %llx wanted %llx) - updating\n",
> + ceph_ino(req->r_parent), path_info1.vino.ino);
> + iput(req->r_parent);
> + req->r_parent = correct_dir;
> + }
> + }
> +
> /* If r_old_dentry is set, then assume that its parent is locked */
> if (req->r_old_dentry &&
> !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
> old_dentry = req->r_old_dentry;
> ret = set_request_path_attr(mdsc, NULL, old_dentry,
> - req->r_old_dentry_dir,
> - req->r_path2, req->r_ino2.ino,
> - &path2, &pathlen2, &ino2, &freepath2, true);
> + req->r_old_dentry_dir,
> + req->r_path2, req->r_ino2.ino,
> + &path_info2, true);
> if (ret < 0) {
> msg = ERR_PTR(ret);
> goto out_free1;
> @@ -3060,7 +3099,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
>
> /* filepaths */
> len += 2 * (1 + sizeof(u32) + sizeof(u64));
> - len += pathlen1 + pathlen2;
> + len += path_info1.pathlen + path_info2.pathlen;
>
> /* cap releases */
> len += sizeof(struct ceph_mds_request_release) *
> @@ -3068,9 +3107,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
>
> if (req->r_dentry_drop)
> - len += pathlen1;
> + len += path_info1.pathlen;
> if (req->r_old_dentry_drop)
> - len += pathlen2;
> + len += path_info2.pathlen;
>
> /* MClientRequest tail */
>
> @@ -3183,8 +3222,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> lhead->ino = cpu_to_le64(req->r_deleg_ino);
> lhead->args = req->r_args;
>
> - ceph_encode_filepath(&p, end, ino1, path1);
> - ceph_encode_filepath(&p, end, ino2, path2);
> + ceph_encode_filepath(&p, end, path_info1.vino.ino, path_info1.path);
> + ceph_encode_filepath(&p, end, path_info2.vino.ino, path_info2.path);
>
> /* make note of release offset, in case we need to replay */
> req->r_request_release_offset = p - msg->front.iov_base;
> @@ -3247,11 +3286,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
> msg->hdr.data_off = cpu_to_le16(0);
>
> out_free2:
> - if (freepath2)
> - ceph_mdsc_free_path((char *)path2, pathlen2);
> + ceph_mdsc_free_path_info(&path_info2);
> out_free1:
> - if (freepath1)
> - ceph_mdsc_free_path((char *)path1, pathlen1);
> + ceph_mdsc_free_path_info(&path_info1);
> out:
> return msg;
> out_err:
> @@ -4608,24 +4645,20 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
> struct ceph_pagelist *pagelist = recon_state->pagelist;
> struct dentry *dentry;
> struct ceph_cap *cap;
> - char *path;
> - int pathlen = 0, err;
> - u64 pathbase;
> + struct ceph_path_info path_info = {0};
> + int err;
> u64 snap_follows;
>
> dentry = d_find_primary(inode);
> if (dentry) {
> /* set pathbase to parent dir when msg_version >= 2 */
> - path = ceph_mdsc_build_path(mdsc, dentry, &pathlen, &pathbase,
> + char *path = ceph_mdsc_build_path(mdsc, dentry, &path_info,
> recon_state->msg_version >= 2);
> dput(dentry);
> if (IS_ERR(path)) {
> err = PTR_ERR(path);
> goto out_err;
> }
> - } else {
> - path = NULL;
> - pathbase = 0;
> }
>
> spin_lock(&ci->i_ceph_lock);
> @@ -4658,7 +4691,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
> rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
> rec.v2.issued = cpu_to_le32(cap->issued);
> rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> - rec.v2.pathbase = cpu_to_le64(pathbase);
> + rec.v2.pathbase = cpu_to_le64(path_info.vino.ino);
> rec.v2.flock_len = (__force __le32)
> ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
> } else {
> @@ -4673,7 +4706,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
> ts = inode_get_atime(inode);
> ceph_encode_timespec64(&rec.v1.atime, &ts);
> rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
> - rec.v1.pathbase = cpu_to_le64(pathbase);
> + rec.v1.pathbase = cpu_to_le64(path_info.vino.ino);
> }
>
> if (list_empty(&ci->i_cap_snaps)) {
> @@ -4735,7 +4768,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
> sizeof(struct ceph_filelock);
> rec.v2.flock_len = cpu_to_le32(struct_len);
>
> - struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
> + struct_len += sizeof(u32) + path_info.pathlen + sizeof(rec.v2);
>
> if (struct_v >= 2)
> struct_len += sizeof(u64); /* snap_follows */
> @@ -4759,7 +4792,7 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
> ceph_pagelist_encode_8(pagelist, 1);
> ceph_pagelist_encode_32(pagelist, struct_len);
> }
> - ceph_pagelist_encode_string(pagelist, path, pathlen);
> + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
> ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
> ceph_locks_to_pagelist(flocks, pagelist,
> num_fcntl_locks, num_flock_locks);
> @@ -4770,17 +4803,17 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
> } else {
> err = ceph_pagelist_reserve(pagelist,
> sizeof(u64) + sizeof(u32) +
> - pathlen + sizeof(rec.v1));
> + path_info.pathlen + sizeof(rec.v1));
> if (err)
> goto out_err;
>
> ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
> - ceph_pagelist_encode_string(pagelist, path, pathlen);
> + ceph_pagelist_encode_string(pagelist, (char *)path_info.path, path_info.pathlen);
> ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
> }
>
> out_err:
> - ceph_mdsc_free_path(path, pathlen);
> + ceph_mdsc_free_path_info(&path_info);
> if (!err)
> recon_state->nr_caps++;
> return err;
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 3e2a6fa7c19a..0428a5eaf28c 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -617,14 +617,24 @@ extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath,
>
> extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
>
> -static inline void ceph_mdsc_free_path(char *path, int len)
> +/*
> + * Structure to group path-related output parameters for build_*_path functions
> + */
> +struct ceph_path_info {
> + const char *path;
> + int pathlen;
> + struct ceph_vino vino;
> + bool freepath;
> +};
> +
> +static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info)
> {
> - if (!IS_ERR_OR_NULL(path))
> - __putname(path - (PATH_MAX - 1 - len));
> + if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path))
> + __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen));
> }
>
> extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc,
> - struct dentry *dentry, int *plen, u64 *base,
> + struct dentry *dentry, struct ceph_path_info *path_info,
> int for_wire);
>
> extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4 2/2] ceph: fix client race condition where r_parent becomes stale before sending message
2025-08-12 9:57 ` [PATCH v4 2/2] ceph: fix client race condition where r_parent becomes stale before sending message Alex Markuze
@ 2025-08-13 17:56 ` Viacheslav Dubeyko
0 siblings, 0 replies; 5+ messages in thread
From: Viacheslav Dubeyko @ 2025-08-13 17:56 UTC (permalink / raw)
To: Alex Markuze, ceph-devel@vger.kernel.org
Cc: idryomov@gmail.com, linux-kernel@vger.kernel.org
On Tue, 2025-08-12 at 09:57 +0000, Alex Markuze wrote:
> When the parent directory's i_rwsem is not locked, req->r_parent may become
> stale due to concurrent operations (e.g. rename) between dentry lookup and
> message creation. Validate that r_parent matches the encoded parent inode
> and update to the correct inode if a mismatch is detected.
>
> Signed-off-by: Alex Markuze <amarkuze@redhat.com>
> ---
> fs/ceph/inode.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 50 insertions(+), 2 deletions(-)
>
Looks good.
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Thanks,
Slava.
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 36981950a368..42110d133f15 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -56,6 +56,51 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
> return 0;
> }
>
> +/*
> + * Check if the parent inode matches the vino from directory reply info
> + */
> +static inline bool ceph_vino_matches_parent(struct inode *parent, struct ceph_vino vino)
> +{
> + return ceph_ino(parent) == vino.ino && ceph_snap(parent) == vino.snap;
> +}
> +
> +/*
> + * Validate that the directory inode referenced by @req->r_parent matches the
> + * inode number and snapshot id contained in the reply's directory record. If
> + * they do not match – which can theoretically happen if the parent dentry was
> + * moved between the time the request was issued and the reply arrived – fall
> + * back to looking up the correct inode in the inode cache.
> + *
> + * A reference is *always* returned. Callers that receive a different inode
> + * than the original @parent are responsible for dropping the extra reference
> + * once the reply has been processed.
> + */
> +static struct inode *ceph_get_reply_dir(struct super_block *sb,
> + struct inode *parent,
> + struct ceph_mds_reply_info_parsed *rinfo)
> +{
> + struct ceph_vino vino;
> +
> + if (unlikely(!rinfo->diri.in))
> + return parent; /* nothing to compare against */
> +
> + /* If we didn't have a cached parent inode to begin with, just bail out. */
> + if (!parent)
> + return NULL;
> +
> + vino.ino = le64_to_cpu(rinfo->diri.in->ino);
> + vino.snap = le64_to_cpu(rinfo->diri.in->snapid);
> +
> + if (likely(ceph_vino_matches_parent(parent, vino)))
> + return parent; /* matches – use the original reference */
> +
> + /* Mismatch – this should be rare. Emit a WARN and obtain the correct inode. */
> + WARN_ONCE(1, "ceph: reply dir mismatch (parent valid %llx.%llx reply %llx.%llx)\n",
> + ceph_ino(parent), ceph_snap(parent), vino.ino, vino.snap);
> +
> + return ceph_get_inode(sb, vino, NULL);
> +}
> +
> /**
> * ceph_new_inode - allocate a new inode in advance of an expected create
> * @dir: parent directory for new inode
> @@ -1548,8 +1593,11 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
> }
>
> if (rinfo->head->is_dentry) {
> - struct inode *dir = req->r_parent;
> -
> + /*
> + * r_parent may be stale, in cases when R_PARENT_LOCKED is not set,
> + * so we need to get the correct inode
> + */
> + struct inode *dir = ceph_get_reply_dir(sb, req->r_parent, rinfo);
> if (dir) {
> err = ceph_fill_inode(dir, NULL, &rinfo->diri,
> rinfo->dirfrag, session, -1,
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2025-08-13 17:56 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-12 9:57 [PATCH v4 0/2] ceph: fix client race conditions with stale r_parent Alex Markuze
2025-08-12 9:57 ` [PATCH v4 1/2] ceph: fix client race condition validating r_parent before applying state Alex Markuze
2025-08-13 17:56 ` Viacheslav Dubeyko
2025-08-12 9:57 ` [PATCH v4 2/2] ceph: fix client race condition where r_parent becomes stale before sending message Alex Markuze
2025-08-13 17:56 ` Viacheslav Dubeyko
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).