linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Al Viro <viro@ZenIV.linux.org.uk>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux Kernel Mailing List <linux-kernel@vger.kernel.org>,
	linux-fsdevel <linux-fsdevel@vger.kernel.org>,
	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
	Mikhail Efremov <sem@altlinux.org>
Subject: Re: [PATCH v2] vfs: Don't exchange "short" filenames unconditionally.
Date: Mon, 29 Sep 2014 17:27:14 +0100	[thread overview]
Message-ID: <20140929162714.GH7996@ZenIV.linux.org.uk> (raw)
In-Reply-To: <CA+55aFyveyOABqssxLinsmHkBg9=95TwEGC1ZT1POe+RmTbj+A@mail.gmail.com>

On Mon, Sep 29, 2014 at 09:07:00AM -0700, Linus Torvalds wrote:
> On Mon, Sep 29, 2014 at 8:59 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> >
> > Now RCU lookup starts.  And on another CPU we move the first dentry over
> > the third one.  copy_name() is called, it decrements the refcount down
> > to 1 (__d_free() hasn't happened yet) and doesn't schedule any freeing.
> 
> Ahh. If we were to do *all* of the copy-name name freeing under RCU,
> we'd be ok. But we don't. We do the refcount decrement immediately
> (and have to, if we want to have the rcu/refcount union). Yeah, good
> call, although I find that doubvle rcu grace period for the common
> case to be very annoying.

See below (or in followup to what you were replying to); it *is* trivial to
avoid double grace periods there.  Look: in free_dentry() we know if we want
the name freed.  So let's have __d_free() for "just free dentry" and
__d_free_ext() for "free dentry and name".  No looking at refcount in the
latter - just choose which one to call_rcu() when free_dentry() does decrement.
It's equivalent from the grace period POV to doing kfree_rcu(), but avoids
double call_rcu(); it just that in case we'd want kfree_rcu() we
schedule the call of a function that frees both.

What we get in free_dentry() is:
	* external name not shared: refcount driven to 0, RCU-delayed
call of "free dentry, free ext name"
	* external name still shared: refcount positive after decrement,
no freeing ext name
	* no external name: no ext name to free
In the last two cases we do what dentry_free() used to do, except that now
__d_free() doesn't even look for ext name.  Just frees the dentry.  If
it never had been hashed - directly called, otherwise - via call_rcu().

Does that look OK for you?

diff --git a/fs/dcache.c b/fs/dcache.c
index cb25a1a..0cf5aff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -235,20 +235,44 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
 	return dentry_string_cmp(cs, ct, tcount);
 }
 
+struct ext_name {
+	union {
+		atomic_t count;
+		struct rcu_head head;
+	};
+	unsigned char name[];
+};
+
+static inline struct ext_name *ext_name(struct dentry *dentry)
+{
+	if (likely(!dname_external(dentry)))
+		return NULL;
+	return container_of(dentry->d_name.name, struct ext_name, name[0]);
+}
+
 static void __d_free(struct rcu_head *head)
 {
 	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
 
 	WARN_ON(!hlist_unhashed(&dentry->d_alias));
-	if (dname_external(dentry))
-		kfree(dentry->d_name.name);
+	kmem_cache_free(dentry_cache, dentry); 
+}
+
+static void __d_free_ext(struct rcu_head *head)
+{
+	struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
+	WARN_ON(!hlist_unhashed(&dentry->d_alias));
+	kfree(ext_name(dentry));
 	kmem_cache_free(dentry_cache, dentry); 
 }
 
 static void dentry_free(struct dentry *dentry)
 {
+	struct ext_name *p = ext_name(dentry);
+	if (unlikely(p) && likely(atomic_dec_and_test(&p->count)))
+		call_rcu(&dentry->d_u.d_rcu, __d_free_ext);
 	/* if dentry was never visible to RCU, immediate free is OK */
-	if (!(dentry->d_flags & DCACHE_RCUACCESS))
+	else if (!(dentry->d_flags & DCACHE_RCUACCESS))
 		__d_free(&dentry->d_u.d_rcu);
 	else
 		call_rcu(&dentry->d_u.d_rcu, __d_free);
@@ -1438,11 +1462,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	 */
 	dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
 	if (name->len > DNAME_INLINE_LEN-1) {
-		dname = kmalloc(name->len + 1, GFP_KERNEL);
-		if (!dname) {
+		size_t size = offsetof(struct ext_name, name) + name->len + 1;
+		struct ext_name *p = kmalloc(size, GFP_KERNEL);
+		if (!p) {
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
 		}
+		atomic_set(&p->count, 1);
+		dname = p->name;
 	} else  {
 		dname = dentry->d_iname;
 	}	
@@ -2372,11 +2399,10 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
 }
 EXPORT_SYMBOL(dentry_update_name_case);
 
-static void switch_names(struct dentry *dentry, struct dentry *target,
-			 bool exchange)
+static void swap_names(struct dentry *dentry, struct dentry *target)
 {
-	if (dname_external(target)) {
-		if (dname_external(dentry)) {
+	if (unlikely(dname_external(target))) {
+		if (unlikely(dname_external(dentry))) {
 			/*
 			 * Both external: swap the pointers
 			 */
@@ -2392,7 +2418,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target,
 			target->d_name.name = target->d_iname;
 		}
 	} else {
-		if (dname_external(dentry)) {
+		if (unlikely(dname_external(dentry))) {
 			/*
 			 * dentry:external, target:internal.  Give dentry's
 			 * storage to target and make dentry internal
@@ -2407,12 +2433,6 @@ static void switch_names(struct dentry *dentry, struct dentry *target,
 			 */
 			unsigned int i;
 			BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
-			if (!exchange) {
-				memcpy(dentry->d_iname, target->d_name.name,
-						target->d_name.len + 1);
-				dentry->d_name.hash_len = target->d_name.hash_len;
-				return;
-			}
 			for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
 				swap(((long *) &dentry->d_iname)[i],
 				     ((long *) &target->d_iname)[i]);
@@ -2422,6 +2442,22 @@ static void switch_names(struct dentry *dentry, struct dentry *target,
 	swap(dentry->d_name.hash_len, target->d_name.hash_len);
 }
 
+static void copy_name(struct dentry *dentry, struct dentry *target)
+{
+	struct ext_name *old_name = ext_name(dentry);
+	if (unlikely(dname_external(target))) {
+		atomic_inc(&ext_name(target)->count);
+		dentry->d_name = target->d_name;
+	} else {
+		memcpy(dentry->d_iname, target->d_name.name,
+				target->d_name.len + 1);
+		dentry->d_name.name = dentry->d_iname;
+		dentry->d_name.hash_len = target->d_name.hash_len;
+	}
+	if (unlikely(old_name) && likely(atomic_dec_and_test(&old_name->count)))
+		kfree_rcu(old_name, head);
+}
+
 static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
 {
 	/*
@@ -2518,7 +2554,10 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	}
 
 	/* Switch the names.. */
-	switch_names(dentry, target, exchange);
+	if (exchange)
+		swap_names(dentry, target);
+	else
+		copy_name(dentry, target);
 
 	/* ... and switch them in the tree */
 	if (IS_ROOT(dentry)) {

  reply	other threads:[~2014-09-29 16:27 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-09-24 18:14 [PATCH v2] vfs: Don't exchange "short" filenames unconditionally Mikhail Efremov
2014-09-24 18:55 ` Al Viro
2014-09-24 19:20   ` Linus Torvalds
2014-09-24 19:27     ` Linus Torvalds
2014-09-24 20:18     ` Al Viro
2014-09-25  4:46       ` Al Viro
2014-09-26 16:44         ` Al Viro
2014-09-27  4:45           ` Al Viro
2014-09-27 17:56             ` Linus Torvalds
2014-09-27 18:31               ` Al Viro
2014-09-27 19:16                 ` Al Viro
2014-09-27 19:37                   ` Linus Torvalds
2014-09-27 19:39                     ` Linus Torvalds
2014-09-27 19:49                       ` Al Viro
2014-09-27 19:55                         ` Linus Torvalds
2014-09-27 21:48                           ` [git pull] vfs.git for 3.17-rc7 Al Viro
2014-09-27 19:45                   ` [PATCH v2] vfs: Don't exchange "short" filenames unconditionally Al Viro
2014-09-28  7:47                   ` Al Viro
2014-09-28 18:05                     ` Al Viro
2014-09-28 21:51                       ` Al Viro
2014-09-29  1:06                         ` [PATCH] missing data dependency barrier in prepend_name() Al Viro
2014-09-29 15:15                       ` [PATCH v2] vfs: Don't exchange "short" filenames unconditionally Linus Torvalds
2014-09-29 15:59                         ` Al Viro
2014-09-29 16:07                           ` Linus Torvalds
2014-09-29 16:27                             ` Al Viro [this message]
2014-09-29 17:54                               ` Linus Torvalds
2014-09-29 19:04                                 ` Al Viro
2014-09-29 20:45                                   ` Linus Torvalds
2014-09-29 18:42                       ` Paul E. McKenney
2014-10-01  0:16                         ` Al Viro
2014-10-02  5:38                           ` Paul E. McKenney
2014-10-02 10:35                           ` Chuck Ebbert
2014-10-03  2:11                             ` Paul E. McKenney
2014-09-29 13:16                     ` Paul E. McKenney
2014-09-29 15:04                       ` Al Viro
2014-09-28 15:01               ` Mikhail Efremov
2014-09-26 20:23         ` Al Viro

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20140929162714.GH7996@ZenIV.linux.org.uk \
    --to=viro@zeniv.linux.org.uk \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=paulmck@linux.vnet.ibm.com \
    --cc=sem@altlinux.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).