linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: axboe@kernel.dk, vgoyal@redhat.com
Cc: ctalbott@google.com, rni@google.com,
	linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 28/36] blkcg: use double locking instead of RCU for blkg synchronization
Date: Tue, 21 Feb 2012 17:46:55 -0800	[thread overview]
Message-ID: <1329875223-5102-29-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1329875223-5102-1-git-send-email-tj@kernel.org>

blkgs are chained from both blkcgs and request_queues and thus
subjected to two locks - blkcg->lock and q->queue_lock.  As both blkcg
and q can go away anytime, locking during removal is tricky.  It's
currently solved by wrapping removal inside RCU, which makes the
synchronization complex.  There are three locks to worry about - the
outer RCU, q lock and blkcg lock, and it leads to nasty subtle
complications like conditional synchronize_rcu() on queue exit paths.

For all other paths, blkcg lock is naturally nested inside q lock and
the only exception is blkcg removal path, which is a very cold path
and can be implemented as clumsy but conceptually-simple reverse
double lock dancing.

This patch updates blkg removal path such that blkgs are removed while
holding both q and blkcg locks, which is trivial for request queue
exit path - blkg_destroy_all().  The blkcg removal path,
blkiocg_pre_destroy(), implements reverse double lock dancing
essentially identical to ioc_release_fn().

This simplifies blkg locking - no half-dead blkgs to worry about.  Now
unnecessary RCU annotations will be removed by the next patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
---
 block/blk-cgroup.c |  136 +++++++++++++++++++--------------------------------
 block/blk-cgroup.h |    4 --
 block/cfq.h        |   10 ----
 3 files changed, 51 insertions(+), 99 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ce2dd15..aee71ef 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -620,32 +620,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
-static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	hlist_del_init_rcu(&blkg->blkcg_node);
-}
-
-/*
- * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
- * indicating that blk_group was unhashed by the time we got to it.
- */
-int blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	struct blkio_cgroup *blkcg = blkg->blkcg;
-	unsigned long flags;
-	int ret = 1;
-
-	spin_lock_irqsave(&blkcg->lock, flags);
-	if (!hlist_unhashed(&blkg->blkcg_node)) {
-		__blkiocg_del_blkio_group(blkg);
-		ret = 0;
-	}
-	spin_unlock_irqrestore(&blkcg->lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
-
 /* called under rcu_read_lock(). */
 struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				struct request_queue *q)
@@ -663,12 +637,16 @@ EXPORT_SYMBOL_GPL(blkg_lookup);
 static void blkg_destroy(struct blkio_group *blkg)
 {
 	struct request_queue *q = blkg->q;
+	struct blkio_cgroup *blkcg = blkg->blkcg;
 
 	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&blkcg->lock);
 
 	/* Something wrong if we are trying to remove same group twice */
 	WARN_ON_ONCE(list_empty(&blkg->q_node));
+	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
 	list_del_init(&blkg->q_node);
+	hlist_del_init_rcu(&blkg->blkcg_node);
 
 	WARN_ON_ONCE(q->nr_blkgs <= 0);
 	q->nr_blkgs--;
@@ -712,47 +690,35 @@ static void update_root_blkg(struct request_queue *q, enum blkio_policy_id plid)
 	pol->ops.blkio_init_group_fn(blkg);
 }
 
+/**
+ * blkg_destroy_all - destroy all blkgs associated with a request_queue
+ * @q: request_queue of interest
+ * @destroy_root: whether to destroy root blkg or not
+ *
+ * Destroy blkgs associated with @q.  If @destroy_root is %true, all are
+ * destroyed; otherwise, root blkg is left alone.
+ */
 void blkg_destroy_all(struct request_queue *q, bool destroy_root)
 {
 	struct blkio_group *blkg, *n;
 	int i;
 
-	while (true) {
-		bool done = true;
-
-		spin_lock_irq(q->queue_lock);
-
-		list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
-			/* skip root? */
-			if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
-				continue;
-
-			/*
-			 * If cgroup removal path got to blk_group first
-			 * and removed it from cgroup list, then it will
-			 * take care of destroying cfqg also.
-			 */
-			if (!blkiocg_del_blkio_group(blkg))
-				blkg_destroy(blkg);
-			else
-				done = false;
-		}
+	spin_lock_irq(q->queue_lock);
 
-		spin_unlock_irq(q->queue_lock);
+	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+		struct blkio_cgroup *blkcg = blkg->blkcg;
 
-		/*
-		 * Group list may not be empty if we raced cgroup removal
-		 * and lost.  cgroup removal is guaranteed to make forward
-		 * progress and retrying after a while is enough.  This
-		 * ugliness is scheduled to be removed after locking
-		 * update.
-		 */
-		if (done)
-			break;
+		/* skip root? */
+		if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
+			continue;
 
-		msleep(10);	/* just some random duration I like */
+		spin_lock(&blkcg->lock);
+		blkg_destroy(blkg);
+		spin_unlock(&blkcg->lock);
 	}
 
+	spin_unlock_irq(q->queue_lock);
+
 	for (i = 0; i < BLKIO_NR_POLICIES; i++)
 		update_root_blkg(q, i);
 }
@@ -1590,45 +1556,45 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
 				ARRAY_SIZE(blkio_files));
 }
 
+/**
+ * blkiocg_pre_destroy - cgroup pre_destroy callback
+ * @subsys: cgroup subsys
+ * @cgroup: cgroup of interest
+ *
+ * This function is called when @cgroup is about to go away and responsible
+ * for shooting down all blkgs associated with @cgroup.  blkgs should be
+ * removed while holding both q and blkcg locks.  As blkcg lock is nested
+ * inside q lock, this function performs reverse double lock dancing.
+ *
+ * This is the blkcg counterpart of ioc_release_fn().
+ */
 static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
 			       struct cgroup *cgroup)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
-	unsigned long flags;
-	struct blkio_group *blkg;
-	struct request_queue *q;
 
 	rcu_read_lock();
+	spin_lock_irq(&blkcg->lock);
 
-	do {
-		spin_lock_irqsave(&blkcg->lock, flags);
+	while (!hlist_empty(&blkcg->blkg_list)) {
+		struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
+						struct blkio_group, blkcg_node);
+		struct request_queue *q = rcu_dereference(blkg->q);
 
-		if (hlist_empty(&blkcg->blkg_list)) {
-			spin_unlock_irqrestore(&blkcg->lock, flags);
-			break;
+		if (spin_trylock(q->queue_lock)) {
+			blkg_destroy(blkg);
+			spin_unlock(q->queue_lock);
+		} else {
+			spin_unlock_irq(&blkcg->lock);
+			rcu_read_unlock();
+			cpu_relax();
+			rcu_read_lock();
+			spin_lock(&blkcg->lock);
 		}
+	}
 
-		blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
-					blkcg_node);
-		q = rcu_dereference(blkg->q);
-		__blkiocg_del_blkio_group(blkg);
-
-		spin_unlock_irqrestore(&blkcg->lock, flags);
-
-		/*
-		 * This blkio_group is being unlinked as associated cgroup is
-		 * going away. Let all the IO controlling policies know about
-		 * this event.
-		 */
-		spin_lock(&blkio_list_lock);
-		spin_lock_irqsave(q->queue_lock, flags);
-		blkg_destroy(blkg);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-		spin_unlock(&blkio_list_lock);
-	} while (1);
-
+	spin_unlock_irq(&blkcg->lock);
 	rcu_read_unlock();
-
 	return 0;
 }
 
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 88b2c3b..bebc442 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -376,7 +376,6 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg,
 extern struct blkio_cgroup blkio_root_cgroup;
 extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
 extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
-extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 				       struct request_queue *q);
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
@@ -412,9 +411,6 @@ cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
 static inline struct blkio_cgroup *
 task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
 
-static inline int
-blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
-
 static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
 					      void *key) { return NULL; }
 static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
diff --git a/block/cfq.h b/block/cfq.h
index 5584e1b..c8b15ef 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -79,11 +79,6 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 					direction, sync);
 }
 
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	return blkiocg_del_blkio_group(blkg);
-}
-
 #else /* CFQ_GROUP_IOSCHED */
 static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol,
@@ -119,10 +114,5 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
 			struct blkio_policy_type *pol, uint64_t start_time,
 			uint64_t io_start_time, bool direction, bool sync) { }
 
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
-	return 0;
-}
-
 #endif /* CFQ_GROUP_IOSCHED */
 #endif
-- 
1.7.7.3


  parent reply	other threads:[~2012-02-22  1:48 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-02-22  1:46 [PATCHSET] blkcg: accumulated blkcg updates Tejun Heo
2012-02-22  1:46 ` [PATCH 01/36] block: blk-throttle should be drained regardless of q->elevator Tejun Heo
2012-02-22  1:46 ` [PATCH 02/36] blkcg: make CONFIG_BLK_CGROUP bool Tejun Heo
2012-02-22  1:46 ` [PATCH 03/36] cfq: don't register propio policy if !CONFIG_CFQ_GROUP_IOSCHED Tejun Heo
2012-02-22  1:46 ` [PATCH 04/36] elevator: clear auxiliary data earlier during elevator switch Tejun Heo
2012-02-22  1:46 ` [PATCH 05/36] elevator: make elevator_init_fn() return 0/-errno Tejun Heo
2012-02-22  1:46 ` [PATCH 06/36] block: implement blk_queue_bypass_start/end() Tejun Heo
2012-02-22  1:46 ` [PATCH 07/36] block: extend queue bypassing to cover blkcg policies Tejun Heo
2012-02-22  1:46 ` [PATCH 08/36] blkcg: shoot down blkio_groups on elevator switch Tejun Heo
2012-02-22  1:46 ` [PATCH 09/36] blkcg: move rcu_read_lock() outside of blkio_group get functions Tejun Heo
2012-02-22  1:46 ` [PATCH 10/36] blkcg: update blkg get functions take blkio_cgroup as parameter Tejun Heo
2012-02-22  1:46 ` [PATCH 11/36] blkcg: use q and plid instead of opaque void * for blkio_group association Tejun Heo
2012-02-22  1:46 ` [PATCH 12/36] blkcg: add blkio_policy[] array and allow one policy per policy ID Tejun Heo
2012-02-22  1:46 ` [PATCH 13/36] blkcg: use the usual get blkg path for root blkio_group Tejun Heo
2012-02-22  1:46 ` [PATCH 14/36] blkcg: factor out blkio_group creation Tejun Heo
2012-02-22  1:46 ` [PATCH 15/36] blkcg: don't allow or retain configuration of missing devices Tejun Heo
2012-02-22  1:46 ` [PATCH 16/36] blkcg: kill blkio_policy_node Tejun Heo
2012-02-22  1:46 ` [PATCH 17/36] blkcg: kill the mind-bending blkg->dev Tejun Heo
2012-02-22  1:46 ` [PATCH 18/36] blkcg: let blkio_group point to blkio_cgroup directly Tejun Heo
2012-02-22  1:46 ` [PATCH 19/36] blkcg: add blkcg_{init|drain|exit}_queue() Tejun Heo
2012-02-22  1:46 ` [PATCH 20/36] blkcg: clear all request_queues on blkcg policy [un]registrations Tejun Heo
2012-02-22  1:46 ` [PATCH 21/36] blkcg: let blkcg core handle policy private data allocation Tejun Heo
2012-02-22  1:46 ` [PATCH 22/36] blkcg: move refcnt to blkcg core Tejun Heo
2012-02-22  1:46 ` [PATCH 23/36] blkcg: make blkg->pd an array and move configuration and stats into it Tejun Heo
2012-02-22  1:46 ` [PATCH 24/36] blkcg: don't use blkg->plid in stat related functions Tejun Heo
2012-02-22  1:46 ` [PATCH 25/36] blkcg: move per-queue blkg list heads and counters to queue and blkg Tejun Heo
2012-02-22  1:46 ` [PATCH 26/36] blkcg: let blkcg core manage per-queue blkg list and counter Tejun Heo
2012-02-22  1:46 ` [PATCH 27/36] blkcg: unify blkg's for blkcg policies Tejun Heo
2012-03-05 21:01   ` [PATCH UPDATED " Tejun Heo
2012-02-22  1:46 ` Tejun Heo [this message]
2012-02-22  1:46 ` [PATCH 29/36] blkcg: drop unnecessary RCU locking Tejun Heo
2012-02-23 18:51   ` [PATCH UPDATED " Tejun Heo
2012-02-22  1:46 ` [PATCH 30/36] block: restructure get_request() Tejun Heo
2012-02-22  1:46 ` [PATCH 31/36] block: interface update for ioc/icq creation functions Tejun Heo
2012-02-22  1:46 ` [PATCH 32/36] block: ioc_task_link() can't fail Tejun Heo
2012-02-22  1:47 ` [PATCH 33/36] block: add io_context->active_ref Tejun Heo
2012-02-22 18:47   ` Vivek Goyal
2012-02-22 19:13     ` Tejun Heo
2012-02-23 18:20       ` Vivek Goyal
2012-02-22  1:47 ` [PATCH 34/36] block: implement bio_associate_current() Tejun Heo
2012-02-22 13:45   ` Jeff Moyer
2012-02-22 19:07     ` Tejun Heo
2012-02-22 19:33       ` Jeff Moyer
2012-02-22 19:37         ` Vivek Goyal
2012-02-22 19:41           ` Jeff Moyer
2012-02-22  1:47 ` [PATCH 35/36] block: make block cgroup policies follow bio task association Tejun Heo
2012-02-22  1:47 ` [PATCH 36/36] block: make blk-throttle preserve the issuing task on delayed bios Tejun Heo
2012-02-22 19:34 ` [PATCHSET] blkcg: accumulated blkcg updates Vivek Goyal
2012-02-22 22:04   ` Tejun Heo
2012-03-05 20:59 ` [PATCH 17.5] blkcg: skip blkg printing if q isn't associated with disk Tejun Heo
2012-03-05 21:07 ` [PATCHSET] blkcg: accumulated blkcg updates Tejun Heo
2012-03-05 21:08   ` Tejun Heo
2012-03-06 15:07     ` Vivek Goyal
2012-03-06 16:24       ` Vivek Goyal
2012-03-06 18:39         ` Vivek Goyal
2012-03-06 19:02           ` Vivek Goyal
2012-03-08  0:06             ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1329875223-5102-29-git-send-email-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=ctalbott@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=rni@google.com \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).