From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <linux-kernel-owner@vger.kernel.org>
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S1754549AbbDMTFx (ORCPT <rfc822;w@1wt.eu>);
	Mon, 13 Apr 2015 15:05:53 -0400
Received: from zeniv.linux.org.uk ([195.92.253.2]:54227 "EHLO
	ZenIV.linux.org.uk" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
	with ESMTP id S1751524AbbDMTFv (ORCPT
	<rfc822;linux-kernel@vger.kernel.org>);
	Mon, 13 Apr 2015 15:05:51 -0400
Date: Mon, 13 Apr 2015 20:05:48 +0100
From: Al Viro <viro@ZenIV.linux.org.uk>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Subject: [RFC] new locking primitive (pulled from fs_pin)
Message-ID: <20150413190548.GQ889@ZenIV.linux.org.uk>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
User-Agent: Mutt/1.5.21 (2010-09-15)
Sender: linux-kernel-owner@vger.kernel.org
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org

New structure.  Intended use: embed into an object that will always
be freed with RCU delay.

Initialize with init_kill_once(&object->kill_once).

Use by grabbing rcu_read_lock(), finding the object, then
if (needs_killing(&object->kill_once)) {
	// do whatever actions you want, including
	// removal of references from the places
	// where they could be found
	finished_killing(&object->kill_once);
	// arrange for RCU-delayed freeing
}

If several threads attempt that, only the first one will
see needs_killing() return true *and* everyone else will
wait in needs_killing() until the first one is past
finished_killing().  Note that they might end up returning
after the object gets freed - needs_killing() itself is very
careful about dereferencing and its caller MUST NOT touch
the object after getting false from needs_killing().

needs_killing() must be called with rcu_read_lock() held and
drops it in all cases.

This thing used to be the locking side of fs/fs_pin.c, but
IMO it might make sense to separate it from fs_pin completely -
it could be useful elsewhere...

Comments (and especially suggestions on better names) are welcome...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index b06c987..b124faf 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -12,10 +12,7 @@ void pin_remove(struct fs_pin *pin)
 	hlist_del(&pin->m_list);
 	hlist_del(&pin->s_list);
 	spin_unlock(&pin_lock);
-	spin_lock_irq(&pin->wait.lock);
-	pin->done = 1;
-	wake_up_locked(&pin->wait);
-	spin_unlock_irq(&pin->wait.lock);
+	finished_killing(&pin->head);
 }
 
 void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
@@ -34,43 +31,12 @@ void pin_insert(struct fs_pin *pin, struct vfsmount *m)
 
 void pin_kill(struct fs_pin *p)
 {
-	wait_queue_t wait;
-
 	if (!p) {
 		rcu_read_unlock();
 		return;
 	}
-	init_wait(&wait);
-	spin_lock_irq(&p->wait.lock);
-	if (likely(!p->done)) {
-		p->done = -1;
-		spin_unlock_irq(&p->wait.lock);
-		rcu_read_unlock();
+	if (needs_killing(&p->head))
 		p->kill(p);
-		return;
-	}
-	if (p->done > 0) {
-		spin_unlock_irq(&p->wait.lock);
-		rcu_read_unlock();
-		return;
-	}
-	__add_wait_queue(&p->wait, &wait);
-	while (1) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		spin_unlock_irq(&p->wait.lock);
-		rcu_read_unlock();
-		schedule();
-		rcu_read_lock();
-		if (likely(list_empty(&wait.task_list)))
-			break;
-		/* OK, we know p couldn't have been freed yet */
-		spin_lock_irq(&p->wait.lock);
-		if (p->done > 0) {
-			spin_unlock_irq(&p->wait.lock);
-			break;
-		}
-	}
-	rcu_read_unlock();
 }
 
 void mnt_pin_kill(struct mount *m)
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index 9dc4e03..f65daad 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -1,8 +1,7 @@
-#include <linux/wait.h>
+#include <linux/kill_once.h>
 
 struct fs_pin {
-	wait_queue_head_t	wait;
-	int			done;
+	struct kill_once	head;
 	struct hlist_node	s_list;
 	struct hlist_node	m_list;
 	void (*kill)(struct fs_pin *);
@@ -12,7 +11,7 @@ struct vfsmount;
 
 static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
 {
-	init_waitqueue_head(&p->wait);
+	init_kill_once(&p->head);
 	p->kill = kill;
 }
 
diff --git a/include/linux/kill_once.h b/include/linux/kill_once.h
new file mode 100644
index 0000000..03a3717
--- /dev/null
+++ b/include/linux/kill_once.h
@@ -0,0 +1,28 @@
+#include <linux/wait.h>
+
+/*
+Intended use:
+	rcu_read_lock();
+	p = <....>
+	if (needs_killing(p)) {
+		kill it
+		finished_killing(p);
+		arrange for rcu-delayed freeing
+	} else {
+		we are guaranteed that it is an ex-parrot
+	}
+*/
+
+struct kill_once {
+	wait_queue_head_t	wait;
+	int			done;
+};
+
+static inline void init_kill_once(struct kill_once *p)
+{
+	init_waitqueue_head(&p->wait);
+	p->done = 0;
+}
+
+bool needs_killing(struct kill_once *);
+void finished_killing(struct kill_once *);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index de7a416..c404207 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
 
-obj-y += mutex.o semaphore.o rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o kill_once.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/kill_once.c b/kernel/locking/kill_once.c
new file mode 100644
index 0000000..f59ad4b
--- /dev/null
+++ b/kernel/locking/kill_once.c
@@ -0,0 +1,59 @@
+#include <linux/sched.h>
+#include <linux/kill_once.h>
+
+void finished_killing(struct kill_once *p)
+{
+	spin_lock_irq(&p->wait.lock);
+	p->done = 1;
+	wake_up_locked(&p->wait);
+	spin_unlock_irq(&p->wait.lock);
+}
+
+bool needs_killing(struct kill_once *p)
+{
+	wait_queue_t wait;
+
+	init_wait(&wait);
+	spin_lock_irq(&p->wait.lock);
+	if (likely(!p->done)) {
+		p->done = -1;
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		return true;
+	}
+	if (p->done > 0) {
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		return false;
+	}
+	__add_wait_queue(&p->wait, &wait);
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&p->wait.lock);
+		rcu_read_unlock();
+		schedule();
+		rcu_read_lock();
+		/*
+		 * racy, but we are OK with false negatives -
+		 * if we observe anything other than an empty
+		 * wait.task_list after taking rcu_read_lock(),
+		 * we know that the RCU grace period started after
+		 * finished_killing() couldn't have ended yet and
+		 * dereferencing p is still safe.
+		 */
+		if (likely(list_empty(&wait.task_list)))
+			break;
+		/*
+		 * OK, we know p couldn't have been freed yet and
+		 * can take that spinlock safely
+		 */
+		spin_lock_irq(&p->wait.lock);
+		/* now we can check p->done */
+		if (p->done > 0) {
+			spin_unlock_irq(&p->wait.lock);
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return false;
+}