[PATCH v2] blk-throttle: fix race between blkcg_bio_issue_check and cgroup

public inbox for cgroups@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH v2] blk-throttle: fix race between blkcg_bio_issue_check and cgroup_rmdir
@ 2018-02-07  8:40 Joseph Qi
       [not found] ` <6f136c90-faa9-4bc0-b02f-3a112b4d8360-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 6+ messages in thread
From: Joseph Qi @ 2018-02-07  8:40 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo; +Cc: xuejiufei, Caspar Zhang, linux-block, cgroups

We've triggered a WARNING in blk_throtl_bio when throttling writeback
io, which complains blkg->refcnt is already 0 when calling blkg_get, and
then kernel crashes with invalid page request.
After investigating this issue, we've found there is a race between
blkcg_bio_issue_check and cgroup_rmdir. The race is described below.

writeback kworker
  blkcg_bio_issue_check
    rcu_read_lock
    blkg_lookup
    <<< *race window*
    blk_throtl_bio
      spin_lock_irq(q->queue_lock)
      spin_unlock_irq(q->queue_lock)
    rcu_read_unlock

cgroup_rmdir
  cgroup_destroy_locked
    kill_css
      css_killed_ref_fn
        css_killed_work_fn
          offline_css
            blkcg_css_offline
              spin_trylock(q->queue_lock)
              blkg_destroy
              spin_unlock(q->queue_lock)

Since rcu can only prevent blkg from releasing when it is being used,
the blkg->refcnt can be decreased to 0 during blkg_destroy and schedule
blkg release.
Then trying to blkg_get in blk_throtl_bio will complains the WARNING.
And then the corresponding blkg_put will schedule blkg release again,
which result in double free.
This race is introduced by commit ae1188963611 ("blkcg: consolidate blkg
creation in blkcg_bio_issue_check()"). Before this commit, it will lookup
first and then try to lookup/create again with queue_lock. So revive
this logic to fix the race.

v2: fix a potential NULL pointer dereference when stats

Fixes: ae1188963611 ("blkcg: consolidate blkg creation in blkcg_bio_issue_check()")
Reported-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: stable@vger.kernel.org
---
 block/blk-cgroup.c         |  2 +-
 block/blk-throttle.c       | 30 ++++++++++++++++++++++++++----
 block/cfq-iosched.c        | 33 +++++++++++++++++++++++----------
 include/linux/blk-cgroup.h | 27 +++++----------------------
 4 files changed, 55 insertions(+), 37 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4117524..b1d22e5 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -162,7 +162,6 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 
 /*
  * If @new_blkg is %NULL, this function tries to allocate a new one as
@@ -306,6 +305,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 			return blkg;
 	}
 }
+EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c5a1316..decdd76 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2143,26 +2143,41 @@ static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
 #endif
 }
 
-bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+bool blk_throtl_bio(struct request_queue *q, struct blkcg *blkcg,
 		    struct bio *bio)
 {
+	struct throtl_data *td = q->td;
 	struct throtl_qnode *qn = NULL;
-	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
+	struct throtl_grp *tg;
+	struct blkcg_gq *blkg;
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
 	bool throttled = false;
-	struct throtl_data *td = tg->td;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
+	/*
+	 * If a group has no rules, just update the dispatch stats in lockless
+	 * manner and return.
+	 */
+	blkg = blkg_lookup(blkcg, q);
+	tg = blkg_to_tg(blkg);
+	if (tg && !tg->has_rules[rw])
+		goto out;
+
 	/* see throtl_charge_bio() */
-	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
+	if (bio_flagged(bio, BIO_THROTTLED))
 		goto out;
 
 	spin_lock_irq(q->queue_lock);
 
 	throtl_update_latency_buckets(td);
 
+	blkg = blkg_lookup_create(blkcg, q);
+	if (IS_ERR(blkg))
+		blkg = q->root_blkg;
+	tg = blkg_to_tg(blkg);
+
 	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
@@ -2253,6 +2268,13 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	if (throttled || !td->track_bio_latency)
 		bio->bi_issue_stat.stat |= SKIP_LATENCY;
 #endif
+	if (!throttled) {
+		blkg = blkg ?: q->root_blkg;
+		blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
+				bio->bi_iter.bi_size);
+		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
+	}
+
 	return throttled;
 }
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 9f342ef..60f53b5 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1674,15 +1674,28 @@ static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
 	cfqg_stats_reset(&cfqg->stats);
 }
 
-static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
-					 struct blkcg *blkcg)
+/*
+ * Search for the cfq group current task belongs to. request_queue lock must
+ * be held.
+ */
+static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
+						struct blkcg *blkcg)
 {
-	struct blkcg_gq *blkg;
+	struct request_queue *q = cfqd->queue;
+	struct cfq_group *cfqg = NULL;
 
-	blkg = blkg_lookup(blkcg, cfqd->queue);
-	if (likely(blkg))
-		return blkg_to_cfqg(blkg);
-	return NULL;
+	/* avoid lookup for the common case where there's no blkcg */
+	if (blkcg == &blkcg_root) {
+		cfqg = cfqd->root_group;
+	} else {
+		struct blkcg_gq *blkg;
+
+		blkg = blkg_lookup_create(blkcg, q);
+		if (!IS_ERR(blkg))
+			cfqg = blkg_to_cfqg(blkg);
+	}
+
+	return cfqg;
 }
 
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
@@ -2178,8 +2191,8 @@ static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
 };
 
 #else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
-					 struct blkcg *blkcg)
+static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
+						struct blkcg *blkcg)
 {
 	return cfqd->root_group;
 }
@@ -3814,7 +3827,7 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	struct cfq_group *cfqg;
 
 	rcu_read_lock();
-	cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+	cfqg = cfq_lookup_create_cfqg(cfqd, bio_blkcg(bio));
 	if (!cfqg) {
 		cfqq = &cfqd->oom_cfqq;
 		goto out;
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 69bea82..e667841 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -428,8 +428,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
 	 * or if either the blkcg or queue is going away.  Fall back to
 	 * root_rl in such cases.
 	 */
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(!blkg))
+	blkg = blkg_lookup_create(blkcg, q);
+	if (unlikely(IS_ERR(blkg)))
 		goto root_rl;
 
 	blkg_get(blkg);
@@ -672,10 +672,10 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING
-extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+extern bool blk_throtl_bio(struct request_queue *q, struct blkcg *blkcg,
 			   struct bio *bio);
 #else
-static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg *blkcg,
 				  struct bio *bio) { return false; }
 #endif
 
@@ -683,7 +683,6 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio)
 {
 	struct blkcg *blkcg;
-	struct blkcg_gq *blkg;
 	bool throtl = false;
 
 	rcu_read_lock();
@@ -692,23 +691,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	/* associate blkcg if bio hasn't attached one */
 	bio_associate_blkcg(bio, &blkcg->css);
 
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(!blkg)) {
-		spin_lock_irq(q->queue_lock);
-		blkg = blkg_lookup_create(blkcg, q);
-		if (IS_ERR(blkg))
-			blkg = NULL;
-		spin_unlock_irq(q->queue_lock);
-	}
-
-	throtl = blk_throtl_bio(q, blkg, bio);
-
-	if (!throtl) {
-		blkg = blkg ?: q->root_blkg;
-		blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
-				bio->bi_iter.bi_size);
-		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
-	}
+	throtl = blk_throtl_bio(q, blkcg, bio);
 
 	rcu_read_unlock();
 	return !throtl;
-- 
1.9.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] blk-throttle: fix race between blkcg_bio_issue_check and cgroup_rmdir
       [not found] ` <6f136c90-faa9-4bc0-b02f-3a112b4d8360-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
@ 2018-02-07 21:38   ` Tejun Heo
       [not found]     ` <20180207213811.GF695913-4dN5La/x3IkLX0oZNxdnEQ2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
  0 siblings, 1 reply; 6+ messages in thread
From: Tejun Heo @ 2018-02-07 21:38 UTC (permalink / raw)
  To: Joseph Qi
  Cc: Jens Axboe, xuejiufei, Caspar Zhang, linux-block,
	cgroups-u79uwXL29TY76Z2rM5mHXA

Hello, Joseph.

On Wed, Feb 07, 2018 at 04:40:02PM +0800, Joseph Qi wrote:
> writeback kworker
>   blkcg_bio_issue_check
>     rcu_read_lock
>     blkg_lookup
>     <<< *race window*
>     blk_throtl_bio
>       spin_lock_irq(q->queue_lock)
>       spin_unlock_irq(q->queue_lock)
>     rcu_read_unlock
> 
> cgroup_rmdir
>   cgroup_destroy_locked
>     kill_css
>       css_killed_ref_fn
>         css_killed_work_fn
>           offline_css
>             blkcg_css_offline
>               spin_trylock(q->queue_lock)
>               blkg_destroy
>               spin_unlock(q->queue_lock)

Ah, right.  Thanks for spotting the bug.

> Since rcu can only prevent blkg from releasing when it is being used,
> the blkg->refcnt can be decreased to 0 during blkg_destroy and schedule
> blkg release.
> Then trying to blkg_get in blk_throtl_bio will complains the WARNING.
> And then the corresponding blkg_put will schedule blkg release again,
> which result in double free.
> This race is introduced by commit ae1188963611 ("blkcg: consolidate blkg
> creation in blkcg_bio_issue_check()"). Before this commit, it will lookup
> first and then try to lookup/create again with queue_lock. So revive
> this logic to fix the race.

The change seems a bit drastic to me.  Can't we do something like the
following instead?

blk_throtl_bio()
{
	... non throttled cases ...

	/* out-of-limit, queue to @tg */

	/*
	 * We can look up and retry but the race window is tiny here.
	 * Just letting it through should be good enough.
	 */
	if (!css_tryget(blkcg->css))
		goto out;

	... actual queueing ...
	css_put(blkcg->css);
	...
}

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] blk-throttle: fix race between blkcg_bio_issue_check and cgroup_rmdir
       [not found]     ` <20180207213811.GF695913-4dN5La/x3IkLX0oZNxdnEQ2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
@ 2018-02-08  2:29       ` Joseph Qi
       [not found]         ` <b590caed-1423-4776-966d-cd9e346a8ea1-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 6+ messages in thread
From: Joseph Qi @ 2018-02-08  2:29 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jens Axboe, xuejiufei, Caspar Zhang, linux-block,
	cgroups-u79uwXL29TY76Z2rM5mHXA

Hi Tejun,
Thanks very much for reviewing this patch.

On 18/2/8 05:38, Tejun Heo wrote:
> Hello, Joseph.
> 
> On Wed, Feb 07, 2018 at 04:40:02PM +0800, Joseph Qi wrote:
>> writeback kworker
>>   blkcg_bio_issue_check
>>     rcu_read_lock
>>     blkg_lookup
>>     <<< *race window*
>>     blk_throtl_bio
>>       spin_lock_irq(q->queue_lock)
>>       spin_unlock_irq(q->queue_lock)
>>     rcu_read_unlock
>>
>> cgroup_rmdir
>>   cgroup_destroy_locked
>>     kill_css
>>       css_killed_ref_fn
>>         css_killed_work_fn
>>           offline_css
>>             blkcg_css_offline
>>               spin_trylock(q->queue_lock)
>>               blkg_destroy
>>               spin_unlock(q->queue_lock)
> 
> Ah, right.  Thanks for spotting the bug.
> 
>> Since rcu can only prevent blkg from releasing when it is being used,
>> the blkg->refcnt can be decreased to 0 during blkg_destroy and schedule
>> blkg release.
>> Then trying to blkg_get in blk_throtl_bio will complains the WARNING.
>> And then the corresponding blkg_put will schedule blkg release again,
>> which result in double free.
>> This race is introduced by commit ae1188963611 ("blkcg: consolidate blkg
>> creation in blkcg_bio_issue_check()"). Before this commit, it will lookup
>> first and then try to lookup/create again with queue_lock. So revive
>> this logic to fix the race.
> 
> The change seems a bit drastic to me.  Can't we do something like the
> following instead?
> 
> blk_throtl_bio()
> {
> 	... non throttled cases ...
> 
> 	/* out-of-limit, queue to @tg */
> 
> 	/*
> 	 * We can look up and retry but the race window is tiny here.
> 	 * Just letting it through should be good enough.
> 	 */
> 	if (!css_tryget(blkcg->css))
> 		goto out;
> 
> 	... actual queueing ...
> 	css_put(blkcg->css);
> 	...
> }
So you mean checking css->refcnt to prevent the further use of
blkg_get? I think it makes sense.
IMO, we should use css_tryget_online instead, and rightly after taking
queue_lock. Because there may be more use of blkg_get in blk_throtl_bio
in the futher. Actually it already has two now. One is in
blk_throtl_assoc_bio, and the other is in throtl_qnode_add_bio.
What do you think of this?

Thanks,
Joseph

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] blk-throttle: fix race between blkcg_bio_issue_check and cgroup_rmdir
       [not found]         ` <b590caed-1423-4776-966d-cd9e346a8ea1-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
@ 2018-02-08 15:23           ` Tejun Heo
  2018-02-09  2:15             ` Joseph Qi
  0 siblings, 1 reply; 6+ messages in thread
From: Tejun Heo @ 2018-02-08 15:23 UTC (permalink / raw)
  To: Joseph Qi
  Cc: Jens Axboe, xuejiufei, Caspar Zhang, linux-block,
	cgroups-u79uwXL29TY76Z2rM5mHXA

Hello, Joseph.

On Thu, Feb 08, 2018 at 10:29:43AM +0800, Joseph Qi wrote:
> So you mean checking css->refcnt to prevent the further use of
> blkg_get? I think it makes sense.

Yes.

> IMO, we should use css_tryget_online instead, and rightly after taking

Not really.  An offline css still can have a vast amount of IO to
drain and write out.

> queue_lock. Because there may be more use of blkg_get in blk_throtl_bio
> in the futher. Actually it already has two now. One is in
> blk_throtl_assoc_bio, and the other is in throtl_qnode_add_bio.
> What do you think of this?

As long as we avoid making the bypass paths expensive, whatever makes
the code easier to read and maintain would be better.  css ref ops are
extremely cheap anyway.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] blk-throttle: fix race between blkcg_bio_issue_check and cgroup_rmdir
  2018-02-08 15:23           ` Tejun Heo
@ 2018-02-09  2:15             ` Joseph Qi
       [not found]               ` <aac95b90-786d-95bf-b93d-87ecca79f846-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 6+ messages in thread
From: Joseph Qi @ 2018-02-09  2:15 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, xuejiufei, Caspar Zhang, linux-block, cgroups

Hi Tejun,

On 18/2/8 23:23, Tejun Heo wrote:
> Hello, Joseph.
> 
> On Thu, Feb 08, 2018 at 10:29:43AM +0800, Joseph Qi wrote:
>> So you mean checking css->refcnt to prevent the further use of
>> blkg_get? I think it makes sense.
> 
> Yes.
> 
>> IMO, we should use css_tryget_online instead, and rightly after taking
> 
> Not really.  An offline css still can have a vast amount of IO to
> drain and write out.
> 
IIUC, we have to identify it is in blkcg_css_offline now which will
blkg_put. Since percpu_ref_kill_and_confirm in kill_css will set flag
__PERCPU_REF_DEAD, so we can use this to avoid the race. IOW, if
__PERCPU_REF_DEAD is set now, we know blkcg css is in offline and
continue access blkg may risk double free. Thus we choose to skip these
ios.
I don't get how css_tryget works since it doesn't care the flag
__PERCPU_REF_DEAD. Also css_tryget can't prevent blkcg_css from
offlining since the race happens blkcg_css_offline is in progress.
Am I missing something here?

Thanks,
Joseph

>> queue_lock. Because there may be more use of blkg_get in blk_throtl_bio
>> in the futher. Actually it already has two now. One is in
>> blk_throtl_assoc_bio, and the other is in throtl_qnode_add_bio.
>> What do you think of this?
> 
> As long as we avoid making the bypass paths expensive, whatever makes
> the code easier to read and maintain would be better.  css ref ops are
> extremely cheap anyway.
> 
> Thanks.
> 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] blk-throttle: fix race between blkcg_bio_issue_check and cgroup_rmdir
       [not found]               ` <aac95b90-786d-95bf-b93d-87ecca79f846-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
@ 2018-02-12 17:11                 ` Tejun Heo
  0 siblings, 0 replies; 6+ messages in thread
From: Tejun Heo @ 2018-02-12 17:11 UTC (permalink / raw)
  To: Joseph Qi
  Cc: Jens Axboe, xuejiufei, Caspar Zhang, linux-block,
	cgroups-u79uwXL29TY76Z2rM5mHXA

Hello, Joseph.

On Fri, Feb 09, 2018 at 10:15:19AM +0800, Joseph Qi wrote:
> IIUC, we have to identify it is in blkcg_css_offline now which will
> blkg_put. Since percpu_ref_kill_and_confirm in kill_css will set flag
> __PERCPU_REF_DEAD, so we can use this to avoid the race. IOW, if
> __PERCPU_REF_DEAD is set now, we know blkcg css is in offline and
> continue access blkg may risk double free. Thus we choose to skip these
> ios.
> I don't get how css_tryget works since it doesn't care the flag
> __PERCPU_REF_DEAD. Also css_tryget can't prevent blkcg_css from
> offlining since the race happens blkcg_css_offline is in progress.
> Am I missing something here?

Once marked dead, the ref is in atomic mode and css_tryget() would hit
the atomic counter.  Here, we don't care about the offlining and
draining.  A draining memcg can still have a lot of memory to be
written back attached to it and we don't want punt all of them to the
root cgroup.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2018-02-12 17:11 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-02-07  8:40 [PATCH v2] blk-throttle: fix race between blkcg_bio_issue_check and cgroup_rmdir Joseph Qi
     [not found] ` <6f136c90-faa9-4bc0-b02f-3a112b4d8360-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
2018-02-07 21:38   ` Tejun Heo
     [not found]     ` <20180207213811.GF695913-4dN5La/x3IkLX0oZNxdnEQ2O0Ztt9esIQQ4Iyu8u01E@public.gmane.org>
2018-02-08  2:29       ` Joseph Qi
     [not found]         ` <b590caed-1423-4776-966d-cd9e346a8ea1-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
2018-02-08 15:23           ` Tejun Heo
2018-02-09  2:15             ` Joseph Qi
     [not found]               ` <aac95b90-786d-95bf-b93d-87ecca79f846-KPsoFbNs7GizrGE5bRqYAgC/G2K4zDHf@public.gmane.org>
2018-02-12 17:11                 ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox