* [PATCH 1/4] vfs: add d_prune dentry operation
From: Sage Weil @ 2011-03-22 0:48 UTC
To: linux-fsdevel, viro; +Cc: linux-kernel, ceph-devel, Sage Weil
This adds a d_prune dentry operation that is called by the VFS prior to
pruning (i.e. unhashing and killing) a hashed dentry from the dcache. This
will be used by Ceph to maintain a flag indicating whether the complete
contents of a directory are contained in the dcache, allowing it to satisfy
lookups and readdir without additional server communication.
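For illustration only (not part of this patch): a filesystem that wants the
notification just fills in the new op and lets d_set_d_op() set
DCACHE_OP_PRUNE for it. The examplefs_* names below are made up:

static void examplefs_d_prune(struct dentry *dentry)
{
	/*
	 * Called with dentry->d_lock held, while the dentry is still
	 * hashed.  Record that something is about to vanish from the
	 * dcache, e.g. by marking the parent's cached contents stale.
	 */
	examplefs_mark_dir_incomplete(dentry->d_parent);	/* hypothetical */
}

static const struct dentry_operations examplefs_dentry_ops = {
	.d_prune	= examplefs_d_prune,
};

d_set_d_op(dentry, &examplefs_dentry_ops) would then be called from
->lookup() (or the ops installed once via sb->s_d_op), exactly as for the
existing d_ops.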
Signed-off-by: Sage Weil <sage@newdream.net>
---
Documentation/filesystems/Locking | 1 +
fs/dcache.c | 8 ++++++++
include/linux/dcache.h | 3 +++
3 files changed, 12 insertions(+), 0 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 4471a41..30de5ef 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -29,6 +29,7 @@ d_hash no no no maybe
d_compare: yes no no maybe
d_delete: no yes no no
d_release: no no yes no
+d_prune: no yes no no
d_iput: no no yes no
d_dname: no no no no
d_automount: no no yes no
diff --git a/fs/dcache.c b/fs/dcache.c
index 611ffe9..c54b85e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -690,6 +690,8 @@ static void try_prune_one_dentry(struct dentry *dentry)
spin_unlock(&dentry->d_lock);
return;
}
+ if (dentry->d_flags & DCACHE_OP_PRUNE)
+ dentry->d_op->d_prune(dentry);
dentry = dentry_kill(dentry, 1);
}
}
@@ -896,6 +898,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
/* detach this root from the system */
spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_OP_PRUNE)
+ dentry->d_op->d_prune(dentry);
dentry_lru_del(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
@@ -912,6 +916,8 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
d_u.d_child) {
spin_lock_nested(&loop->d_lock,
DENTRY_D_LOCK_NESTED);
+ if (loop->d_flags & DCACHE_OP_PRUNE)
+ loop->d_op->d_prune(loop);
dentry_lru_del(loop);
__d_drop(loop);
spin_unlock(&loop->d_lock);
@@ -1375,6 +1381,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
dentry->d_flags |= DCACHE_OP_REVALIDATE;
if (op->d_delete)
dentry->d_flags |= DCACHE_OP_DELETE;
+ if (op->d_prune)
+ dentry->d_flags |= DCACHE_OP_PRUNE;
}
EXPORT_SYMBOL(d_set_d_op);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index f958c19..1e83bd8 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -165,6 +165,7 @@ struct dentry_operations {
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *);
void (*d_release)(struct dentry *);
+ void (*d_prune)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
@@ -219,6 +220,8 @@ struct dentry_operations {
#define DCACHE_MANAGED_DENTRY \
(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
+#define DCACHE_OP_PRUNE 0x80000
+
extern seqlock_t rename_lock;
static inline int dname_external(struct dentry *dentry)
--
1.7.0
* [PATCH 2/4] ceph: clear parent D_COMPLETE flag on dentry prune
From: Sage Weil @ 2011-03-22 0:48 UTC
To: linux-fsdevel, viro; +Cc: linux-kernel, ceph-devel, Sage Weil
When the VFS prunes a dentry from the cache, clear the D_COMPLETE flag
on the parent dentry. Do this for the live and snapshotted namespaces. Do
not bother for the .snap dir contents, since we do not cache that.
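For illustration only (not part of the patch): ->flags is a plain unsigned
long manipulated with the atomic bitops, so ceph_d_prune() can clear the bit
while holding nothing but the pruned child's d_lock, and a reader is simply a
test_bit(); the example_* helper below is made up:

static bool example_dir_dentry_is_complete(struct dentry *dir_dentry)
{
	struct ceph_dentry_info *di = ceph_dentry(dir_dentry);

	/* d_fsdata (and thus di) can be NULL before ceph_init_dentry() */
	return di && test_bit(CEPH_D_COMPLETE, &di->flags);
}

The real readers, and the caps rules that make the answer meaningful, come in
the later patches.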
Signed-off-by: Sage Weil <sage@newdream.net>
---
fs/ceph/dir.c | 28 ++++++++++++++++++++++++++++
fs/ceph/super.h | 13 +++++++++++++
2 files changed, 41 insertions(+), 0 deletions(-)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a867a3..e2005fa 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1053,7 +1053,33 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
return 1;
}
+/*
+ * When the VFS prunes a dentry from the cache, we need to clear the
+ * complete flag on the parent directory.
+ *
+ * Called under dentry->d_lock.
+ */
+static void ceph_d_prune(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+
+ dout("d_release %p\n", dentry);
+
+ /* do we have a valid parent? */
+ if (!dentry->d_parent || IS_ROOT(dentry))
+ return;
+
+ /* if we are not hashed, we don't affect D_COMPLETE */
+ if (d_unhashed(dentry))
+ return;
+ /*
+ * we hold d_lock, so d_parent is stable, and d_fsdata is never
+ * cleared until d_release
+ */
+ di = ceph_dentry(dentry->d_parent);
+ clear_bit(CEPH_D_COMPLETE, &di->flags);
+}
/*
* read() on a dir. This weird interface hack only works if mounted
@@ -1259,6 +1285,7 @@ const struct inode_operations ceph_dir_iops = {
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
.d_release = ceph_d_release,
+ .d_prune = ceph_d_prune,
};
const struct dentry_operations ceph_snapdir_dentry_ops = {
@@ -1268,4 +1295,5 @@ const struct dentry_operations ceph_snapdir_dentry_ops = {
const struct dentry_operations ceph_snap_dentry_ops = {
.d_release = ceph_d_release,
+ .d_prune = ceph_d_prune,
};
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 619fe71..b6bf098 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -201,6 +201,7 @@ struct ceph_inode_xattr {
* Ceph dentry state
*/
struct ceph_dentry_info {
+ unsigned long flags;
struct ceph_mds_session *lease_session;
u32 lease_gen, lease_shared_gen;
u32 lease_seq;
@@ -211,6 +212,18 @@ struct ceph_dentry_info {
u64 offset;
};
+/*
+ * dentry flags
+ *
+ * The locking for D_COMPLETE is a bit odd:
+ * - we can clear it at almost any time (see ceph_d_prune)
+ * - it is only meaningful if:
+ * - we hold dir inode i_lock
+ * - we hold dir FILE_SHARED caps
+ * - the dentry D_COMPLETE is set
+ */
+#define CEPH_D_COMPLETE 1 /* if set, d_u.d_subdirs is complete directory */
+
struct ceph_inode_xattrs_info {
/*
* (still encoded) xattr blob. we avoid the overhead of parsing
--
1.7.0
* [PATCH 3/4] ceph: compensate for dentry_unhash() calls in vfs_rmdir() and vfs_rename_dir()
From: Sage Weil @ 2011-03-22 0:48 UTC
To: linux-fsdevel, viro; +Cc: linux-kernel, ceph-devel, Sage Weil
The VFS unhashes the victim dentry (via dentry_unhash()) before calling the
->rmdir and ->rename (directory) inode operations, and it does so without
invoking ->d_prune, defeating exactly the notification that op is meant to
provide: telling the fs before a hashed dentry is unhashed. Because the
unhash happens under the directory i_mutex and is thus serialized against
->lookup() and ->readdir(), we can safely compensate by clearing the
directory's D_COMPLETE flag in our ->rmdir and ->rename handlers.
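(On success the victim is gone on the MDS as well, so the cached directory
contents stay complete; only a failed rmdir/rename leaves a server-side entry
whose dentry has been unhashed locally, which is why the clearing below sits
in the error paths.) For illustration only, the generic shape of the
compensation, with made-up examplefs_* names:

static int examplefs_rmdir(struct inode *dir, struct dentry *dentry)
{
	int err = examplefs_do_rmdir(dir, dentry);	/* hypothetical */

	if (err) {
		/*
		 * vfs_rmdir() already ran dentry_unhash() on the victim
		 * without calling ->d_prune, and the entry still exists,
		 * so the parent can no longer be trusted as complete.
		 */
		examplefs_clear_dir_complete(dir);	/* hypothetical */
	}
	return err;
}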
Signed-off-by: Sage Weil <sage@newdream.net>
---
fs/ceph/dir.c | 23 ++++++++++++++++++-----
1 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e2005fa..2072f62 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -825,7 +825,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct ceph_mds_request *req;
int err = -EROFS;
- int op;
+ int op = 0;
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* rmdir .snap/foo is RMSNAP */
@@ -855,6 +855,9 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
d_delete(dentry);
ceph_mdsc_put_request(req);
out:
+ if (err && op != CEPH_MDS_OP_UNLINK)
+ /* vfs_rmdir calls dentry_unhash(), grr */
+ ceph_dir_clear_complete(dir);
return err;
}
@@ -864,18 +867,24 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
+ int is_dir = new_dentry->d_inode &&
+ (new_dentry->d_inode->i_mode & S_IFMT) == S_IFDIR;
int err;
+ err = -EXDEV;
if (ceph_snap(old_dir) != ceph_snap(new_dir))
- return -EXDEV;
+ goto out;
+ err = -EROFS;
if (ceph_snap(old_dir) != CEPH_NOSNAP ||
ceph_snap(new_dir) != CEPH_NOSNAP)
- return -EROFS;
+ goto out;
dout("rename dir %p dentry %p to dir %p dentry %p\n",
old_dir, old_dentry, new_dir, new_dentry);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
req->r_dentry = dget(new_dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
@@ -906,6 +915,10 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
ceph_invalidate_dentry_lease(new_dentry);
}
ceph_mdsc_put_request(req);
+out:
+ if (err && is_dir)
+ /* vfs_rename_dir calls dentry_unhash(), grr */
+ ceph_dir_clear_complete(new_dir);
return err;
}
--
1.7.0
* [PATCH 4/4] ceph: use new D_COMPLETE dentry flag
From: Sage Weil @ 2011-03-22 0:48 UTC
To: linux-fsdevel, viro; +Cc: linux-kernel, ceph-devel, Sage Weil
We used to use a flag on the directory inode to track whether the dcache
contents for a directory were a complete cached copy. Switch to a dentry
flag CEPH_D_COMPLETE that is safely updated by ->d_prune().
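For illustration only (not part of the patch): the new helpers hide the
inode-to-dentry hop, so call sites just swap the old i_ceph_flags test for
ceph_dir_test_complete(). The flag is only trusted together with FILE_SHARED
caps and under the dir's i_lock; a made-up helper showing the intended caller
pattern:

static bool example_can_trust_dcache(struct inode *dir)
{
	struct ceph_inode_info *ci = ceph_inode(dir);

	/* caller holds dir->i_lock */
	return ceph_dir_test_complete(dir) &&
	       __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
}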
Signed-off-by: Sage Weil <sage@newdream.net>
---
fs/ceph/caps.c | 8 ++----
fs/ceph/dir.c | 59 ++++++++++++++++++++++++++++++++++++++++++-------
fs/ceph/inode.c | 9 +++----
fs/ceph/mds_client.c | 6 ++--
fs/ceph/super.h | 10 ++++++-
5 files changed, 68 insertions(+), 24 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6b61ded..77ae202 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -487,17 +487,15 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
ci->i_rdcache_gen++;
/*
- * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+ * if we are newly issued FILE_SHARED, clear D_COMPLETE; we
* don't know what happened to this directory while we didn't
* have the cap.
*/
if ((issued & CEPH_CAP_FILE_SHARED) &&
(had & CEPH_CAP_FILE_SHARED) == 0) {
ci->i_shared_gen++;
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->vfs_inode);
- ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
- }
+ if (S_ISDIR(ci->vfs_inode.i_mode))
+ ceph_dir_clear_complete(&ci->vfs_inode);
}
}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 2072f62..7dd6877 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -90,7 +90,7 @@ static unsigned fpos_off(loff_t p)
* falling back to a "normal" sync readdir if any dentries in the dir
* are dropped.
*
- * I_COMPLETE tells indicates we have all dentries in the dir. It is
+ * D_COMPLETE indicates we have all dentries in the dir. It is
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
@@ -181,8 +181,8 @@ more:
filp->f_pos++;
/* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
- dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+ if (!ceph_dir_test_complete(dir)) {
+ dout(" lost D_COMPLETE on %p; falling back to mds\n", dir);
err = -EAGAIN;
goto out;
}
@@ -267,7 +267,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
if ((filp->f_pos == 2 || fi->dentry) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ ceph_dir_test_complete(inode) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
spin_unlock(&inode->i_lock);
err = __dcache_readdir(filp, dirent, filldir);
@@ -332,7 +332,7 @@ more:
if (!req->r_did_prepopulate) {
dout("readdir !did_prepopulate");
- fi->dir_release_count--; /* preclude I_COMPLETE */
+ fi->dir_release_count--; /* preclude D_COMPLETE */
}
/* note next offset and last dentry name */
@@ -411,8 +411,7 @@ more:
*/
spin_lock(&inode->i_lock);
if (ci->i_release_count == fi->dir_release_count) {
- dout(" marking %p complete\n", inode);
- /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
+ ceph_dir_set_complete(inode);
ci->i_max_offset = filp->f_pos;
}
spin_unlock(&inode->i_lock);
@@ -582,7 +581,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
fsc->mount_options->snapdir_name,
dentry->d_name.len) &&
!is_root_ceph_dentry(dir, dentry) &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ ceph_dir_test_complete(dir) &&
(__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
spin_unlock(&dir->i_lock);
dout(" dir %p complete, -ENOENT\n", dir);
@@ -906,7 +905,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
/* d_move screws up d_subdirs order */
- ceph_i_clear(new_dir, CEPH_I_COMPLETE);
+ ceph_dir_clear_complete(new_dir);
d_move(old_dentry, new_dentry);
@@ -1067,6 +1066,48 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
}
/*
+ * Set/clear/test dir complete flag on the dir's dentry.
+ */
+static struct dentry * __d_find_any_alias(struct inode *inode)
+{
+ struct dentry *alias;
+
+ if (list_empty(&inode->i_dentry))
+ return NULL;
+ alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
+ return alias;
+}
+
+void ceph_dir_set_complete(struct inode *inode)
+{
+ struct dentry *dentry = __d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry)) {
+ dout(" marking %p (%p) complete\n", inode, dentry);
+ set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+}
+
+void ceph_dir_clear_complete(struct inode *inode)
+{
+ struct dentry *dentry = __d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry)) {
+ dout(" marking %p (%p) NOT complete\n", inode, dentry);
+ clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ }
+}
+
+bool ceph_dir_test_complete(struct inode *inode)
+{
+ struct dentry *dentry = __d_find_any_alias(inode);
+
+ if (dentry && ceph_dentry(dentry))
+ return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+ return false;
+}
+
+/*
* When the VFS prunes a dentry from the cache, we need to clear the
* complete flag on the parent directory.
*
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b54c97d..dc5829a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -712,9 +712,8 @@ static int fill_inode(struct inode *inode,
ceph_snap(inode) == CEPH_NOSNAP &&
(le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
- (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
- dout(" marking %p complete (empty)\n", inode);
- /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
+ !ceph_dir_test_complete(inode)) {
+ ceph_dir_set_complete(inode);
ci->i_max_offset = 2;
}
break;
@@ -850,7 +849,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
di = ceph_dentry(dn);
spin_lock(&inode->i_lock);
- if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+ if (!ceph_dir_test_complete(inode)) {
spin_unlock(&inode->i_lock);
return;
}
@@ -1052,7 +1051,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
* d_move() puts the renamed dentry at the end of
* d_subdirs. We need to assign it an appropriate
* directory offset so we can behave when holding
- * I_COMPLETE.
+ * D_COMPLETE.
*/
ceph_set_dentry_offset(req->r_old_dentry);
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a1ee8fa..22673b9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1974,7 +1974,7 @@ out:
}
/*
- * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * Invalidate dir D_COMPLETE, dentry lease state on an aborted MDS
* namespace request.
*/
void ceph_invalidate_dir_request(struct ceph_mds_request *req)
@@ -1982,9 +1982,9 @@ void ceph_invalidate_dir_request(struct ceph_mds_request *req)
struct inode *inode = req->r_locked_dir;
struct ceph_inode_info *ci = ceph_inode(inode);
- dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
+ dout("invalidate_dir_request %p (D_COMPLETE, lease(s))\n", inode);
spin_lock(&inode->i_lock);
- ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ ceph_dir_clear_complete(inode);
ci->i_release_count++;
spin_unlock(&inode->i_lock);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b6bf098..6bcf0f5 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -262,7 +262,7 @@ struct ceph_inode_info {
struct timespec i_rctime;
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
+ u64 i_max_offset; /* largest readdir offset, set with D_COMPLETE */
struct rb_root i_fragtree;
struct mutex i_fragtree_mutex;
@@ -426,7 +426,6 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
/*
* Ceph inode.
*/
-#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
#define CEPH_I_NODELAY 4 /* do not delay cap release */
#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
@@ -484,6 +483,13 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
}
/*
+ * set/clear directory D_COMPLETE flag
+ */
+void ceph_dir_set_complete(struct inode *inode);
+void ceph_dir_clear_complete(struct inode *inode);
+bool ceph_dir_test_complete(struct inode *inode);
+
+/*
* caps helpers
*/
static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
--
1.7.0