From: Tejun Heo <tj@kernel.org>
To: axboe@kernel.dk, vgoyal@redhat.com
Cc: ctalbott@google.com, rni@google.com,
linux-kernel@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 28/36] blkcg: use double locking instead of RCU for blkg synchronization
Date: Tue, 21 Feb 2012 17:46:55 -0800 [thread overview]
Message-ID: <1329875223-5102-29-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1329875223-5102-1-git-send-email-tj@kernel.org>
blkgs are chained from both blkcgs and request_queues and thus
subjected to two locks - blkcg->lock and q->queue_lock. As both blkcg
and q can go away anytime, locking during removal is tricky. It's
currently solved by wrapping removal inside RCU, which makes the
synchronization complex. There are three locks to worry about - the
outer RCU, q lock and blkcg lock, and it leads to nasty subtle
complications like conditional synchronize_rcu() on queue exit paths.
For all other paths, blkcg lock is naturally nested inside q lock and
the only exception is blkcg removal path, which is a very cold path
and can be implemented as clumsy but conceptually-simple reverse
double lock dancing.
This patch updates blkg removal path such that blkgs are removed while
holding both q and blkcg locks, which is trivial for request queue
exit path - blkg_destroy_all(). The blkcg removal path,
blkiocg_pre_destroy(), implements reverse double lock dancing
essentially identical to ioc_release_fn().
This simplifies blkg locking - no half-dead blkgs to worry about. The
now-unnecessary RCU annotations will be removed by the next patch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
---
block/blk-cgroup.c | 136 +++++++++++++++++++--------------------------------
block/blk-cgroup.h | 4 --
block/cfq.h | 10 ----
3 files changed, 51 insertions(+), 99 deletions(-)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ce2dd15..aee71ef 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -620,32 +620,6 @@ out:
}
EXPORT_SYMBOL_GPL(blkg_lookup_create);
-static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
- hlist_del_init_rcu(&blkg->blkcg_node);
-}
-
-/*
- * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
- * indicating that blk_group was unhashed by the time we got to it.
- */
-int blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
- struct blkio_cgroup *blkcg = blkg->blkcg;
- unsigned long flags;
- int ret = 1;
-
- spin_lock_irqsave(&blkcg->lock, flags);
- if (!hlist_unhashed(&blkg->blkcg_node)) {
- __blkiocg_del_blkio_group(blkg);
- ret = 0;
- }
- spin_unlock_irqrestore(&blkcg->lock, flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group);
-
/* called under rcu_read_lock(). */
struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
struct request_queue *q)
@@ -663,12 +637,16 @@ EXPORT_SYMBOL_GPL(blkg_lookup);
static void blkg_destroy(struct blkio_group *blkg)
{
struct request_queue *q = blkg->q;
+ struct blkio_cgroup *blkcg = blkg->blkcg;
lockdep_assert_held(q->queue_lock);
+ lockdep_assert_held(&blkcg->lock);
/* Something wrong if we are trying to remove same group twice */
WARN_ON_ONCE(list_empty(&blkg->q_node));
+ WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
list_del_init(&blkg->q_node);
+ hlist_del_init_rcu(&blkg->blkcg_node);
WARN_ON_ONCE(q->nr_blkgs <= 0);
q->nr_blkgs--;
@@ -712,47 +690,35 @@ static void update_root_blkg(struct request_queue *q, enum blkio_policy_id plid)
pol->ops.blkio_init_group_fn(blkg);
}
+/**
+ * blkg_destroy_all - destroy all blkgs associated with a request_queue
+ * @q: request_queue of interest
+ * @destroy_root: whether to destroy root blkg or not
+ *
+ * Destroy blkgs associated with @q. If @destroy_root is %true, all are
+ * destroyed; otherwise, root blkg is left alone.
+ *
+ * Locking: acquires @q->queue_lock with IRQs disabled for the whole walk
+ * and nests each blkg's blkcg->lock inside it around blkg_destroy(), so
+ * every blkg is removed while both locks are held.
+ */
void blkg_destroy_all(struct request_queue *q, bool destroy_root)
{
struct blkio_group *blkg, *n;
int i;
- while (true) {
- bool done = true;
-
- spin_lock_irq(q->queue_lock);
-
- list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
- /* skip root? */
- if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
- continue;
-
- /*
- * If cgroup removal path got to blk_group first
- * and removed it from cgroup list, then it will
- * take care of destroying cfqg also.
- */
- if (!blkiocg_del_blkio_group(blkg))
- blkg_destroy(blkg);
- else
- done = false;
- }
+ spin_lock_irq(q->queue_lock);
- spin_unlock_irq(q->queue_lock);
+ /* _safe iteration: blkg_destroy() unlinks blkg from q->blkg_list */
+ list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+ struct blkio_cgroup *blkcg = blkg->blkcg;
- /*
- * Group list may not be empty if we raced cgroup removal
- * and lost. cgroup removal is guaranteed to make forward
- * progress and retrying after a while is enough. This
- * ugliness is scheduled to be removed after locking
- * update.
- */
- if (done)
- break;
+ /* skip root? */
+ if (!destroy_root && blkg->blkcg == &blkio_root_cgroup)
+ continue;
- msleep(10); /* just some random duration I like */
+ /* blkcg lock nests inside q lock, so it can be taken directly */
+ spin_lock(&blkcg->lock);
+ blkg_destroy(blkg);
+ spin_unlock(&blkcg->lock);
}
+ spin_unlock_irq(q->queue_lock);
+
for (i = 0; i < BLKIO_NR_POLICIES; i++)
update_root_blkg(q, i);
}
@@ -1590,45 +1556,45 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
ARRAY_SIZE(blkio_files));
}
+/**
+ * blkiocg_pre_destroy - cgroup pre_destroy callback
+ * @subsys: cgroup subsys
+ * @cgroup: cgroup of interest
+ *
+ * This function is called when @cgroup is about to go away and responsible
+ * for shooting down all blkgs associated with @cgroup. blkgs should be
+ * removed while holding both q and blkcg locks. As blkcg lock is nested
+ * inside q lock, this function performs reverse double lock dancing.
+ *
+ * This is the blkcg counterpart of ioc_release_fn().
+ *
+ * Always returns 0.
+ */
static int blkiocg_pre_destroy(struct cgroup_subsys *subsys,
struct cgroup *cgroup)
{
struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
- unsigned long flags;
- struct blkio_group *blkg;
- struct request_queue *q;
rcu_read_lock();
+ spin_lock_irq(&blkcg->lock);
- do {
- spin_lock_irqsave(&blkcg->lock, flags);
+ while (!hlist_empty(&blkcg->blkg_list)) {
+ struct blkio_group *blkg = hlist_entry(blkcg->blkg_list.first,
+ struct blkio_group, blkcg_node);
+ struct request_queue *q = rcu_dereference(blkg->q);
- if (hlist_empty(&blkcg->blkg_list)) {
- spin_unlock_irqrestore(&blkcg->lock, flags);
- break;
+ /*
+ * Lock order is q lock then blkcg lock. blkcg lock is already
+ * held here, so the q lock may only be tried, never waited for.
+ */
+ if (spin_trylock(q->queue_lock)) {
+ blkg_destroy(blkg);
+ spin_unlock(q->queue_lock);
+ } else {
+ /* back off completely, then retry from the list head */
+ spin_unlock_irq(&blkcg->lock);
+ rcu_read_unlock();
+ cpu_relax();
+ rcu_read_lock();
+ /*
+ * Retake with IRQs disabled: the rest of the loop and the
+ * final spin_unlock_irq() rely on the state established by
+ * the initial spin_lock_irq().
+ */
+ spin_lock_irq(&blkcg->lock);
}
+ }
- blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
- blkcg_node);
- q = rcu_dereference(blkg->q);
- __blkiocg_del_blkio_group(blkg);
-
- spin_unlock_irqrestore(&blkcg->lock, flags);
-
- /*
- * This blkio_group is being unlinked as associated cgroup is
- * going away. Let all the IO controlling policies know about
- * this event.
- */
- spin_lock(&blkio_list_lock);
- spin_lock_irqsave(q->queue_lock, flags);
- blkg_destroy(blkg);
- spin_unlock_irqrestore(q->queue_lock, flags);
- spin_unlock(&blkio_list_lock);
- } while (1);
-
+ spin_unlock_irq(&blkcg->lock);
rcu_read_unlock();
-
return 0;
}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 88b2c3b..bebc442 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -376,7 +376,6 @@ static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg,
extern struct blkio_cgroup blkio_root_cgroup;
extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup);
extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk);
-extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
extern struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
struct request_queue *q);
struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
@@ -412,9 +411,6 @@ cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
static inline struct blkio_cgroup *
task_blkio_cgroup(struct task_struct *tsk) { return NULL; }
-static inline int
-blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
-
static inline struct blkio_group *blkg_lookup(struct blkio_cgroup *blkcg,
void *key) { return NULL; }
static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
diff --git a/block/cfq.h b/block/cfq.h
index 5584e1b..c8b15ef 100644
--- a/block/cfq.h
+++ b/block/cfq.h
@@ -79,11 +79,6 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
direction, sync);
}
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
- return blkiocg_del_blkio_group(blkg);
-}
-
#else /* CFQ_GROUP_IOSCHED */
static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg,
struct blkio_policy_type *pol,
@@ -119,10 +114,5 @@ static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg,
struct blkio_policy_type *pol, uint64_t start_time,
uint64_t io_start_time, bool direction, bool sync) { }
-static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg)
-{
- return 0;
-}
-
#endif /* CFQ_GROUP_IOSCHED */
#endif
--
1.7.7.3
next prev parent reply other threads:[~2012-02-22 1:48 UTC|newest]
Thread overview: 58+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-02-22 1:46 [PATCHSET] blkcg: accumulated blkcg updates Tejun Heo
2012-02-22 1:46 ` [PATCH 01/36] block: blk-throttle should be drained regardless of q->elevator Tejun Heo
2012-02-22 1:46 ` [PATCH 02/36] blkcg: make CONFIG_BLK_CGROUP bool Tejun Heo
2012-02-22 1:46 ` [PATCH 03/36] cfq: don't register propio policy if !CONFIG_CFQ_GROUP_IOSCHED Tejun Heo
2012-02-22 1:46 ` [PATCH 04/36] elevator: clear auxiliary data earlier during elevator switch Tejun Heo
2012-02-22 1:46 ` [PATCH 05/36] elevator: make elevator_init_fn() return 0/-errno Tejun Heo
2012-02-22 1:46 ` [PATCH 06/36] block: implement blk_queue_bypass_start/end() Tejun Heo
2012-02-22 1:46 ` [PATCH 07/36] block: extend queue bypassing to cover blkcg policies Tejun Heo
2012-02-22 1:46 ` [PATCH 08/36] blkcg: shoot down blkio_groups on elevator switch Tejun Heo
2012-02-22 1:46 ` [PATCH 09/36] blkcg: move rcu_read_lock() outside of blkio_group get functions Tejun Heo
2012-02-22 1:46 ` [PATCH 10/36] blkcg: update blkg get functions take blkio_cgroup as parameter Tejun Heo
2012-02-22 1:46 ` [PATCH 11/36] blkcg: use q and plid instead of opaque void * for blkio_group association Tejun Heo
2012-02-22 1:46 ` [PATCH 12/36] blkcg: add blkio_policy[] array and allow one policy per policy ID Tejun Heo
2012-02-22 1:46 ` [PATCH 13/36] blkcg: use the usual get blkg path for root blkio_group Tejun Heo
2012-02-22 1:46 ` [PATCH 14/36] blkcg: factor out blkio_group creation Tejun Heo
2012-02-22 1:46 ` [PATCH 15/36] blkcg: don't allow or retain configuration of missing devices Tejun Heo
2012-02-22 1:46 ` [PATCH 16/36] blkcg: kill blkio_policy_node Tejun Heo
2012-02-22 1:46 ` [PATCH 17/36] blkcg: kill the mind-bending blkg->dev Tejun Heo
2012-02-22 1:46 ` [PATCH 18/36] blkcg: let blkio_group point to blkio_cgroup directly Tejun Heo
2012-02-22 1:46 ` [PATCH 19/36] blkcg: add blkcg_{init|drain|exit}_queue() Tejun Heo
2012-02-22 1:46 ` [PATCH 20/36] blkcg: clear all request_queues on blkcg policy [un]registrations Tejun Heo
2012-02-22 1:46 ` [PATCH 21/36] blkcg: let blkcg core handle policy private data allocation Tejun Heo
2012-02-22 1:46 ` [PATCH 22/36] blkcg: move refcnt to blkcg core Tejun Heo
2012-02-22 1:46 ` [PATCH 23/36] blkcg: make blkg->pd an array and move configuration and stats into it Tejun Heo
2012-02-22 1:46 ` [PATCH 24/36] blkcg: don't use blkg->plid in stat related functions Tejun Heo
2012-02-22 1:46 ` [PATCH 25/36] blkcg: move per-queue blkg list heads and counters to queue and blkg Tejun Heo
2012-02-22 1:46 ` [PATCH 26/36] blkcg: let blkcg core manage per-queue blkg list and counter Tejun Heo
2012-02-22 1:46 ` [PATCH 27/36] blkcg: unify blkg's for blkcg policies Tejun Heo
2012-03-05 21:01 ` [PATCH UPDATED " Tejun Heo
2012-02-22 1:46 ` Tejun Heo [this message]
2012-02-22 1:46 ` [PATCH 29/36] blkcg: drop unnecessary RCU locking Tejun Heo
2012-02-23 18:51 ` [PATCH UPDATED " Tejun Heo
2012-02-22 1:46 ` [PATCH 30/36] block: restructure get_request() Tejun Heo
2012-02-22 1:46 ` [PATCH 31/36] block: interface update for ioc/icq creation functions Tejun Heo
2012-02-22 1:46 ` [PATCH 32/36] block: ioc_task_link() can't fail Tejun Heo
2012-02-22 1:47 ` [PATCH 33/36] block: add io_context->active_ref Tejun Heo
2012-02-22 18:47 ` Vivek Goyal
2012-02-22 19:13 ` Tejun Heo
2012-02-23 18:20 ` Vivek Goyal
2012-02-22 1:47 ` [PATCH 34/36] block: implement bio_associate_current() Tejun Heo
2012-02-22 13:45 ` Jeff Moyer
2012-02-22 19:07 ` Tejun Heo
2012-02-22 19:33 ` Jeff Moyer
2012-02-22 19:37 ` Vivek Goyal
2012-02-22 19:41 ` Jeff Moyer
2012-02-22 1:47 ` [PATCH 35/36] block: make block cgroup policies follow bio task association Tejun Heo
2012-02-22 1:47 ` [PATCH 36/36] block: make blk-throttle preserve the issuing task on delayed bios Tejun Heo
2012-02-22 19:34 ` [PATCHSET] blkcg: accumulated blkcg updates Vivek Goyal
2012-02-22 22:04 ` Tejun Heo
2012-03-05 20:59 ` [PATCH 17.5] blkcg: skip blkg printing if q isn't associated with disk Tejun Heo
2012-03-05 21:07 ` [PATCHSET] blkcg: accumulated blkcg updates Tejun Heo
2012-03-05 21:08 ` Tejun Heo
2012-03-06 15:07 ` Vivek Goyal
2012-03-06 16:24 ` Vivek Goyal
2012-03-06 18:39 ` Vivek Goyal
2012-03-06 18:39 ` Vivek Goyal
2012-03-06 19:02 ` Vivek Goyal
2012-03-08 0:06 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1329875223-5102-29-git-send-email-tj@kernel.org \
--to=tj@kernel.org \
--cc=axboe@kernel.dk \
--cc=ctalbott@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=rni@google.com \
--cc=vgoyal@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.