[PATCH 0/2] afs: avoid unhash/rehash

Linux filesystem development
 help / color / mirror / Atom feed

* [PATCH 0/2] afs: avoid unhash/rehash
@ 2026-07-02  1:58 NeilBrown
  2026-07-02  1:58 ` [PATCH 1/2] afs: use d_time instead of d_fsdata NeilBrown
  2026-07-02  1:58 ` [PATCH 2/2] afs: don't unhash/rehash dentries during unlink/rename NeilBrown
  0 siblings, 2 replies; 8+ messages in thread
From: NeilBrown @ 2026-07-02  1:58 UTC (permalink / raw)
  To: David Howells, Marc Dionne; +Cc: linux-afs, linux-fsdevel

afs (like nfs and others) needs to block "open" requests while
processing an unlink or rename because if the target is to be unlinked
on the server, then the open must fail or create a new file, which needs
to wait for the unlink to complete.

It does this by unhashing the dentry and later rehashing it.  This will
cause problems for proposed changes to locking which will lock the
dentry rather than the parent directory.  Unhashing will effectively
unlock the name.

The second patch here addresses this by using ->d_fsdata to mark a
dentry as being busy in unlink/rename and waiting in d_revalidate for
that mark to be removed.  This is the same approach that NFS has used
for a while.

The first patch frees up ->d_fsdata which is currently otherwised use.
It is being used for a purpose that ->d_time is more suitable for so the
first patch switches to use d_time.

Thanks,
NeilBrown

 [PATCH 1/2] afs: use d_time instead of d_fsdata
 [PATCH 2/2] afs: don't unhash/rehash dentries during unlink/rename

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/2] afs: use d_time instead of d_fsdata
  2026-07-02  1:58 [PATCH 0/2] afs: avoid unhash/rehash NeilBrown
@ 2026-07-02  1:58 ` NeilBrown
  2026-07-02  8:54   ` David Howells
  2026-07-02  9:09   ` David Howells
  2026-07-02  1:58 ` [PATCH 2/2] afs: don't unhash/rehash dentries during unlink/rename NeilBrown
  1 sibling, 2 replies; 8+ messages in thread
From: NeilBrown @ 2026-07-02  1:58 UTC (permalink / raw)
  To: David Howells, Marc Dionne; +Cc: linux-afs, linux-fsdevel

From: NeilBrown <neil@brown.name>

afs uses ->d_fsdata to store version information for the parent
directory.  ->d_time is arguably a better field to store this
information as the version is like a time stamp, and ->d_time is an
unsigned long, while ->d_fsdata is a void *.

This will leave ->d_fsdata free for a different use ...  which
admittedly is also not a void*, but is certainly not at all a time.

Interestingly the value stored in ->d_time or d_fsdata is u64 which does
not fit in "unsigned long" or "void *" on 32 bit hosts.  Maybe that
doesn't matter.

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/afs/dir.c      | 18 +++++++++---------
 fs/afs/internal.h |  8 ++++----
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 498b99ccdf0e..7ff84b410097 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -795,7 +795,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
 		afs_dir_iterate(dir, &cookie->ctx, NULL, &data_version);
 	}
 
-	dentry->d_fsdata = (void *)(unsigned long)data_version;
+	dentry->d_time = (unsigned long)data_version;
 
 	/* Check to see if we already have an inode for the primary fid. */
 	inode = ilookup5(dir->i_sb, cookie->fids[1].vnode,
@@ -882,9 +882,9 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry)
 	}
 
 	if (op->file[0].scb.have_status)
-		dentry->d_fsdata = (void *)(unsigned long)op->file[0].scb.status.data_version;
+		dentry->d_time = (unsigned long)op->file[0].scb.status.data_version;
 	else
-		dentry->d_fsdata = (void *)(unsigned long)op->file[0].dv_before;
+		dentry->d_time = (unsigned long)op->file[0].dv_before;
 	ret = afs_put_operation(op);
 out:
 	kfree(cookie);
@@ -997,7 +997,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	_debug("splice %p", dentry->d_inode);
 	d = d_splice_alias(inode, dentry);
 	if (!IS_ERR_OR_NULL(d)) {
-		d->d_fsdata = dentry->d_fsdata;
+		d->d_time = dentry->d_time;
 		trace_afs_lookup(dvnode, &d->d_name, &fid);
 	} else {
 		trace_afs_lookup(dvnode, &dentry->d_name, &fid);
@@ -1027,7 +1027,7 @@ static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry)
 	 * version.
 	 */
 	dir_version = (long)READ_ONCE(dvnode->status.data_version);
-	de_version = (long)READ_ONCE(dentry->d_fsdata);
+	de_version = (long)READ_ONCE(dentry->d_time);
 	if (de_version != dir_version) {
 		dir_version = (long)READ_ONCE(dvnode->invalid_before);
 		if (de_version - dir_version < 0)
@@ -1087,7 +1087,7 @@ static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name,
 	 * version.
 	 */
 	dir_version = dir->status.data_version;
-	de_version = (long)dentry->d_fsdata;
+	de_version = (long)dentry->d_time;
 	if (de_version == (long)dir_version)
 		goto out_valid_noupdate;
 
@@ -1148,7 +1148,7 @@ static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name,
 	}
 
 out_valid:
-	dentry->d_fsdata = (void *)(unsigned long)dir_version;
+	dentry->d_time = (unsigned long)dir_version;
 out_valid_noupdate:
 	key_put(key);
 	_leave(" = 1 [valid]");
@@ -1935,7 +1935,7 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 		spin_unlock(&new_inode->i_lock);
 	}
 
-	/* Now we can update d_fsdata on the dentries to reflect their
+	/* Now we can update d_time on the dentries to reflect their
 	 * new parent's data_version.
 	 */
 	afs_update_dentry_version(op, new_dvp, op->dentry);
@@ -2171,7 +2171,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	}
 
 	/* This bit is potentially nasty as there's a potential race with
-	 * afs_d_revalidate{,_rcu}().  We have to change d_fsdata on the dentry
+	 * afs_d_revalidate{,_rcu}().  We have to change d_time_ on the dentry
 	 * to reflect it's new parent's new data_version after the op, but
 	 * d_revalidate may see old_dentry between the op having taken place
 	 * and the version being updated.
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 0b72a8566299..a22027935482 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1764,17 +1764,17 @@ static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
 }
 
 /*
- * Note that a dentry got changed.  We need to set d_fsdata to the data version
+ * Note that a dentry got changed.  We need to set d_time to the data version
  * number derived from the result of the operation.  It doesn't matter if
- * d_fsdata goes backwards as we'll just revalidate.
+ * d_time goes backwards as we'll just revalidate.
  */
 static inline void afs_update_dentry_version(struct afs_operation *op,
 					     struct afs_vnode_param *dir_vp,
 					     struct dentry *dentry)
 {
 	if (!op->cumul_error.error)
-		dentry->d_fsdata =
-			(void *)(unsigned long)dir_vp->scb.status.data_version;
+		dentry->d_time =
+			(unsigned long)dir_vp->scb.status.data_version;
 }
 
 /*
-- 
2.50.0.107.gf914562f5916.dirty


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] afs: use d_time instead of d_fsdata
  2026-07-02  1:58 ` [PATCH 1/2] afs: use d_time instead of d_fsdata NeilBrown
@ 2026-07-02  8:54   ` David Howells
  2026-07-02 10:12     ` NeilBrown
  2026-07-02  9:09   ` David Howells
  1 sibling, 1 reply; 8+ messages in thread
From: David Howells @ 2026-07-02  8:54 UTC (permalink / raw)
  To: NeilBrown; +Cc: dhowells, Marc Dionne, linux-afs, linux-fsdevel

NeilBrown <neilb@ownmail.net> wrote:

> afs uses ->d_fsdata to store version information for the parent
> directory.  ->d_time is arguably a better field to store this
> information as the version is like a time stamp, and ->d_time is an
> unsigned long, while ->d_fsdata is a void *.

It wasn't clear that ->d_time was available for private use by the filesystem.

> This will leave ->d_fsdata free for a different use ...  which
> admittedly is also not a void*, but is certainly not at all a time.

Are you going to use ->d_fsdata for something generic?

> Interestingly the value stored in ->d_time or d_fsdata is u64 which does
> not fit in "unsigned long" or "void *" on 32 bit hosts.  Maybe that
> doesn't matter.

I know.  I just have to hope that someone triggers revalidation on a dirent
before 4 billion changes have been made to the directory on a 32-bit machine.
I could use both d_time and d_fsdata on 32-bit, I suppose.  I really don't
want to allocate an 8-byte blob for each dentry.

Maybe I should switch to a single-bit flag and "invalidate" all the dentries
attached to a directory if I detect a jump in the directory's version number
due to a third-party modification (the version is monotonically incremented
for each modification committed).

I'm not sure how to do a mass invalidation without risking deadlock, though,
because I'd quite likely be starting from a random dirent in a directory and
then have to walk through all the other dirents of the directory.  I suppose I
could lock the directory (probably using dvnode->validate_lock) and then walk
through d_children.

David

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] afs: use d_time instead of d_fsdata
  2026-07-02  8:54   ` David Howells
@ 2026-07-02 10:12     ` NeilBrown
  0 siblings, 0 replies; 8+ messages in thread
From: NeilBrown @ 2026-07-02 10:12 UTC (permalink / raw)
  To: David Howells; +Cc: dhowells, Marc Dionne, linux-afs, linux-fsdevel

On Thu, 02 Jul 2026, David Howells wrote:
> NeilBrown <neilb@ownmail.net> wrote:
> 
> > afs uses ->d_fsdata to store version information for the parent
> > directory.  ->d_time is arguably a better field to store this
> > information as the version is like a time stamp, and ->d_time is an
> > unsigned long, while ->d_fsdata is a void *.
> 
> It wasn't clear that ->d_time was available for private use by the filesystem.

Fair enough - it isn't clearly documented.
5 different filesystems use it, no common VFS code touches it.

> 
> > This will leave ->d_fsdata free for a different use ...  which
> > admittedly is also not a void*, but is certainly not at all a time.
> 
> Are you going to use ->d_fsdata for something generic?

I only need 1 bit - a flag to say that the dentry is being removed on
the server.  I've occasionally thought of asking for a DCACHE_PRIVATE
flag but ->d_fsdata is just as good.

> 
> > Interestingly the value stored in ->d_time or d_fsdata is u64 which does
> > not fit in "unsigned long" or "void *" on 32 bit hosts.  Maybe that
> > doesn't matter.
> 
> I know.  I just have to hope that someone triggers revalidation on a dirent
> before 4 billion changes have been made to the directory on a 32-bit machine.
> I could use both d_time and d_fsdata on 32-bit, I suppose.  I really don't
> want to allocate an 8-byte blob for each dentry.

I only want 1 bit of d_fsdata, and 60 bits is likely plenty for a
version number.  So that would be the easiest approach.

> 
> Maybe I should switch to a single-bit flag and "invalidate" all the dentries
> attached to a directory if I detect a jump in the directory's version number
> due to a third-party modification (the version is monotonically incremented
> for each modification committed).
> 
> I'm not sure how to do a mass invalidation without risking deadlock, though,
> because I'd quite likely be starting from a random dirent in a directory and
> then have to walk through all the other dirents of the directory.  I suppose I
> could lock the directory (probably using dvnode->validate_lock) and then walk
> through d_children.

Walking d_children could work, but that list can be long so you want to
allow scheduling points.
I haven't actually tried this, but I think you can allocate a
DCACHE_CURSOR dentry on the stack and insert it in the list whenever you
want to drop ->d_lock and schedule.

Maybe not worth the effort though - 32-bit is a vanishing breed.

NeilBrown

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] afs: use d_time instead of d_fsdata
  2026-07-02  1:58 ` [PATCH 1/2] afs: use d_time instead of d_fsdata NeilBrown
  2026-07-02  8:54   ` David Howells
@ 2026-07-02  9:09   ` David Howells
  2026-07-02 10:27     ` NeilBrown
  1 sibling, 1 reply; 8+ messages in thread
From: David Howells @ 2026-07-02  9:09 UTC (permalink / raw)
  To: NeilBrown; +Cc: dhowells, Marc Dionne, linux-afs, linux-fsdevel

NeilBrown <neilb@ownmail.net> wrote:

> afs uses ->d_fsdata to store version information for the parent
> directory.  ->d_time is arguably a better field to store this
> information as the version is like a time stamp, and ->d_time is an
> unsigned long, while ->d_fsdata is a void *.

Might it be better to rename ->d_time to ->d_version, since that would seem to
be more in line with how it's used?

> Interestingly the value stored in ->d_time or d_fsdata is u64 which does
> not fit in "unsigned long" or "void *" on 32 bit hosts.  Maybe that
> doesn't matter.

Hmmm...  It looks like NFS may have a bug here:

static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry)
{
	if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) &&
	    d_really_is_negative(dentry))
		return dentry->d_time == inode_peek_iversion_raw(dir);
	return nfs_verify_change_attribute(dir, dentry->d_time);
}

dentry->d_time may be 32-bits, but inode_peek_iversion_raw() is always 64 bits
and i_version is set to the 64-bit fattr->change_attr.

David


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] afs: use d_time instead of d_fsdata
  2026-07-02  9:09   ` David Howells
@ 2026-07-02 10:27     ` NeilBrown
  2026-07-02 10:53       ` David Howells
  0 siblings, 1 reply; 8+ messages in thread
From: NeilBrown @ 2026-07-02 10:27 UTC (permalink / raw)
  To: David Howells; +Cc: dhowells, Marc Dionne, linux-afs, linux-fsdevel

On Thu, 02 Jul 2026, David Howells wrote:
> NeilBrown <neilb@ownmail.net> wrote:
> 
> > afs uses ->d_fsdata to store version information for the parent
> > directory.  ->d_time is arguably a better field to store this
> > information as the version is like a time stamp, and ->d_time is an
> > unsigned long, while ->d_fsdata is a void *.
> 
> Might it be better to rename ->d_time to ->d_version, since that would seem to
> be more in line with how it's used?

Maybe.
 fuse and kernfs store an incrementing version number, like afs
 cifs and vboxsf store jiffies
 nfs stores whatever the server provides, which might be a counter or
   might be a timestamp.

So d_version is arguably slightly better than d_time.

> 
> > Interestingly the value stored in ->d_time or d_fsdata is u64 which does
> > not fit in "unsigned long" or "void *" on 32 bit hosts.  Maybe that
> > doesn't matter.
> 
> Hmmm...  It looks like NFS may have a bug here:
> 
> static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry)
> {
> 	if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) &&
> 	    d_really_is_negative(dentry))
> 		return dentry->d_time == inode_peek_iversion_raw(dir);
> 	return nfs_verify_change_attribute(dir, dentry->d_time);
> }
> 
> dentry->d_time may be 32-bits, but inode_peek_iversion_raw() is always 64 bits
> and i_version is set to the 64-bit fattr->change_attr.

I wonder if there would be any appetite for making d_time (or d_version)
always 64bit, much like i_ino was recently changed to u64.

NeilBrown

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] afs: use d_time instead of d_fsdata
  2026-07-02 10:27     ` NeilBrown
@ 2026-07-02 10:53       ` David Howells
  0 siblings, 0 replies; 8+ messages in thread
From: David Howells @ 2026-07-02 10:53 UTC (permalink / raw)
  To: NeilBrown, Alexander Viro; +Cc: dhowells, Marc Dionne, linux-afs, linux-fsdevel

NeilBrown <neilb@ownmail.net> wrote:

> > > Interestingly the value stored in ->d_time or d_fsdata is u64 which does
> > > not fit in "unsigned long" or "void *" on 32 bit hosts.  Maybe that
> > > doesn't matter.
> > 
> > Hmmm...  It looks like NFS may have a bug here:
> > 
> > static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry)
> > {
> > 	if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) &&
> > 	    d_really_is_negative(dentry))
> > 		return dentry->d_time == inode_peek_iversion_raw(dir);
> > 	return nfs_verify_change_attribute(dir, dentry->d_time);
> > }
> > 
> > dentry->d_time may be 32-bits, but inode_peek_iversion_raw() is always 64 bits
> > and i_version is set to the 64-bit fattr->change_attr.
> 
> I wonder if there would be any appetite for making d_time (or d_version)
> always 64bit, much like i_ino was recently changed to u64.

I suspect Al wouldn't be happy about that.  He counts the bits in struct
dentry very carefully.

David


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 2/2] afs: don't unhash/rehash dentries during unlink/rename
  2026-07-02  1:58 [PATCH 0/2] afs: avoid unhash/rehash NeilBrown
  2026-07-02  1:58 ` [PATCH 1/2] afs: use d_time instead of d_fsdata NeilBrown
@ 2026-07-02  1:58 ` NeilBrown
  1 sibling, 0 replies; 8+ messages in thread
From: NeilBrown @ 2026-07-02  1:58 UTC (permalink / raw)
  To: David Howells, Marc Dionne; +Cc: linux-afs, linux-fsdevel

From: NeilBrown <neil@brown.name>

afs needs to block lookup of dentries during unlink and rename.
There are two reasons:
1/ If the target is to be removed, not silly-renamed, the subsequent
   opens cannot be allowed as the file won't exist on the server.
2/ If the rename source is being moved between directories a lookup,
   particularly d_revalidate, might change ->d_time asynchronously
   with rename changing ->d_time with possible incorrect results.

afs current unhashes the dentry to force a lookup which will wait on the
directory lock, and rehashes afterwards.  This is incompatible with
proposed changed to directory locking which will require a dentry to
remain hashed throughout rename/unlink/etc operations.

This patch copies a mechanism developed for NFS.  ->d_fsdata which is
currently unused is now set to a non-NULL value when lookups must be
blocked.  d_revalidate checks for this value, and waits for it to become
NULL.

->d_lock is used to ensure d_revalidate never updates ->d_time while
->d_fsdata is set.

Signed-off-by: NeilBrown <neil@brown.name>
---
 fs/afs/afs.h      |  7 ++++++
 fs/afs/dir.c      | 64 +++++++++++++++++++++++++++++------------------
 fs/afs/internal.h |  5 +---
 3 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index ec3db00bd081..019e77b08458 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -26,6 +26,13 @@ typedef u64			afs_volid_t;
 typedef u64			afs_vnodeid_t;
 typedef u64			afs_dataversion_t;
 
+/* This is stored in ->d_fsdata to stop d_revalidate looking at,
+ * and possibly changing, ->d_time on a dentry which is being moved
+ * between directories, and to block lookup for dentry that is
+ * being removed without silly-rename.
+ */
+#define AFS_FSDATA_BLOCKED ((void*)1)
+
 typedef enum {
 	AFSVL_RWVOL,			/* read/write volume */
 	AFSVL_ROVOL,			/* read-only volume */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 7ff84b410097..e9aaf66218ea 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1021,6 +1021,10 @@ static int afs_d_revalidate_rcu(struct afs_vnode *dvnode, struct dentry *dentry)
 	if (!afs_check_validity(dvnode))
 		return -ECHILD;
 
+	/* A rename/unlink is pending */
+	if (dentry->d_fsdata)
+		return -ECHILD;
+
 	/* We only need to invalidate a dentry if the server's copy changed
 	 * behind our back.  If we made the change, it's no problem.  Note that
 	 * on a 32-bit system, we only have 32 bits in the dentry to store the
@@ -1056,6 +1060,10 @@ static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name,
 	if (flags & LOOKUP_RCU)
 		return afs_d_revalidate_rcu(dir, dentry);
 
+	/* Wait for rename/unlink to complete */
+wait_for_rename:
+	wait_var_event(&dentry->d_fsdata, dentry->d_fsdata == NULL);
+
 	if (d_really_is_positive(dentry)) {
 		vnode = AFS_FS_I(d_inode(dentry));
 		_enter("{v={%llx:%llu} n=%pd fl=%lx},",
@@ -1148,7 +1156,13 @@ static int afs_d_revalidate(struct inode *parent_dir, const struct qstr *name,
 	}
 
 out_valid:
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_fsdata) {
+		spin_unlock(&dentry->d_lock);
+		goto wait_for_rename;
+	}
 	dentry->d_time = (unsigned long)dir_version;
+	spin_unlock(&dentry->d_lock);
 out_valid_noupdate:
 	key_put(key);
 	_leave(" = 1 [valid]");
@@ -1523,8 +1537,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op)
 static void afs_unlink_put(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
-	if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT)
-		d_rehash(op->dentry);
+	store_release_wake_up(&op->dentry->d_fsdata, NULL);
 }
 
 static const struct afs_operation_ops afs_unlink_operation = {
@@ -1578,11 +1591,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
 		afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key));
 		goto error;
 	}
-	if (!d_unhashed(dentry)) {
-		/* Prevent a race with RCU lookup. */
-		__d_drop(dentry);
-		op->unlink.need_rehash = true;
-	}
+	dentry->d_fsdata = AFS_FSDATA_BLOCKED;
 	spin_unlock(&dentry->d_lock);
 
 	op->file[1].vnode = vnode;
@@ -1889,9 +1898,10 @@ static void afs_rename_edit_dir(struct afs_operation *op)
 
 	_enter("op=%08x", op->debug_id);
 
-	if (op->rename.rehash) {
-		d_rehash(op->rename.rehash);
-		op->rename.rehash = NULL;
+	if (op->rename.unblock) {
+		/* Rename has finished, so unlocks lookups to target */
+		store_release_wake_up(&op->rename.unblock->d_fsdata, NULL);
+		op->rename.unblock = NULL;
 	}
 
 	fscache_begin_write_operation(&orig_cres, afs_vnode_cache(orig_dvnode));
@@ -1974,6 +1984,9 @@ static void afs_rename_exchange_edit_dir(struct afs_operation *op)
 
 		d_exchange(old_dentry, new_dentry);
 		up_write(&orig_dvnode->validate_lock);
+	/* dentry has been moved, so d_validate can safely proceed */
+	store_release_wake_up(&old_dentry->d_fsdata, NULL);
+
 	} else {
 		down_write(&orig_dvnode->validate_lock);
 		if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
@@ -2013,11 +2026,10 @@ static void afs_rename_exchange_edit_dir(struct afs_operation *op)
 static void afs_rename_put(struct afs_operation *op)
 {
 	_enter("op=%08x", op->debug_id);
-	if (op->rename.rehash)
-		d_rehash(op->rename.rehash);
+	if (op->rename.unblock)
+		store_release_wake_up(&op->rename.unblock->d_fsdata, NULL);
+	store_release_wake_up(&op->dentry->d_fsdata, NULL);
 	dput(op->rename.tmp);
-	if (afs_op_error(op))
-		d_rehash(op->dentry);
 }
 
 static const struct afs_operation_ops afs_rename_operation = {
@@ -2125,7 +2137,6 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		op->ops		= &afs_rename_noreplace_operation;
 	} else if (flags & RENAME_EXCHANGE) {
 		op->ops		= &afs_rename_exchange_operation;
-		d_drop(new_dentry);
 	} else {
 		/* If we might displace the target, we might need to do silly
 		 * rename.
@@ -2139,14 +2150,12 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		 */
 		if (d_is_positive(new_dentry) && !d_is_dir(new_dentry)) {
 			/* To prevent any new references to the target during
-			 * the rename, we unhash the dentry in advance.
+			 * the rename, we set d_fsdata which afs_d_revalidate will wait for.
+			 * d_lock ensures d_count() and ->d_fsdata are consistent.
 			 */
-			if (!d_unhashed(new_dentry)) {
-				d_drop(new_dentry);
-				op->rename.rehash = new_dentry;
-			}
-
+			spin_lock(&new_dentry->d_lock);
 			if (d_count(new_dentry) > 2) {
+				spin_unlock(&new_dentry->d_lock);
 				/* copy the target dentry's name */
 				op->rename.tmp = d_alloc(new_dentry->d_parent,
 							 &new_dentry->d_name);
@@ -2164,8 +2173,12 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 				}
 
 				op->dentry_2 = op->rename.tmp;
-				op->rename.rehash = NULL;
 				op->rename.new_negative = true;
+			} else {
+				/* Block any lookups to target until the rename completes */
+				new_dentry->d_fsdata = AFS_FSDATA_BLOCKED;
+				op->rename.unblock = new_dentry;
+				spin_unlock(&new_dentry->d_lock);
 			}
 		}
 	}
@@ -2176,10 +2189,11 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	 * d_revalidate may see old_dentry between the op having taken place
 	 * and the version being updated.
 	 *
-	 * So drop the old_dentry for now to make other threads go through
-	 * lookup instead - which we hold a lock against.
+	 * So block revalidate on the old_dentry until the rename completes.
 	 */
-	d_drop(old_dentry);
+	spin_lock(&old_dentry->d_lock);
+	old_dentry->d_fsdata = AFS_FSDATA_BLOCKED;
+	spin_unlock(&old_dentry->d_lock);
 
 	ret = afs_do_sync_operation(op);
 	if (ret == -ENOTSUPP)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a22027935482..a0afe552ae64 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -900,10 +900,7 @@ struct afs_operation {
 			struct afs_symlink *symlink;
 		} create;
 		struct {
-			bool	need_rehash;
-		} unlink;
-		struct {
-			struct dentry	*rehash;
+			struct dentry	*unblock;
 			struct dentry	*tmp;
 			unsigned int	rename_flags;
 			bool		new_negative;
-- 
2.50.0.107.gf914562f5916.dirty


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-07-02 10:53 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-07-02  1:58 [PATCH 0/2] afs: avoid unhash/rehash NeilBrown
2026-07-02  1:58 ` [PATCH 1/2] afs: use d_time instead of d_fsdata NeilBrown
2026-07-02  8:54   ` David Howells
2026-07-02 10:12     ` NeilBrown
2026-07-02  9:09   ` David Howells
2026-07-02 10:27     ` NeilBrown
2026-07-02 10:53       ` David Howells
2026-07-02  1:58 ` [PATCH 2/2] afs: don't unhash/rehash dentries during unlink/rename NeilBrown

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox