Re: v4.20-rc6: Sporadic use-after-free in bt_iter()

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Jens Axboe <axboe@kernel.dk>
To: Bart Van Assche <bvanassche@acm.org>,
	"jianchao.wang" <jianchao.w.wang@oracle.com>,
	"linux-block@vger.kernel.org" <linux-block@vger.kernel.org>
Subject: Re: v4.20-rc6: Sporadic use-after-free in bt_iter()
Date: Thu, 20 Dec 2018 14:44:08 -0700	[thread overview]
Message-ID: <60b4819c-4c19-a4e3-41f3-e21b0544c9a4@kernel.dk> (raw)
In-Reply-To: <1545342043.185366.523.camel@acm.org>

On 12/20/18 2:40 PM, Bart Van Assche wrote:
> On Thu, 2018-12-20 at 14:34 -0700, Jens Axboe wrote:
>> Yeah, I don't think it's bullet proof either, it just closes the gap.
>> I'm fine with fiddling with the tag iteration. On top of what I sent, we
>> could have tag iteration hold the RCU read lock, and then we just need
>> to ensure that the tags are freed with RCU.
> 
> Do you mean using call_rcu() to free tags? Would that require to add a
> struct rcu_head to every request? Would it be acceptable to increase the
> size of struct request with an rcu_head? Additionally, could that reduce
> the queue depth if the time between grace periods is larger than the time
> between I/O submissions?

No no, the requests are out of full pages, we just need one per blk_mq_tags.
Something like the below... And then the tag iteration needs to grab the
RCU read lock to prevent the page freeing from happening.

This should essentially be free, as rcu read lock for tag iteration is
basically a no-op.

Need to handle the flush request on top of this as well.


diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2089c6c62f44..c39b58391ae8 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -228,13 +228,15 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
+	if (tags->rqs[bitnr].queue != hctx->queue)
+		return true;
 
 	/*
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assigning ->rqs[].
 	 */
-	if (rq && rq->q == hctx->queue)
+	rq = tags->rqs[bitnr].rq;
+	if (rq)
 		return iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
@@ -263,7 +265,9 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
+	rcu_read_lock();
 	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
+	rcu_read_unlock();
 }
 
 struct bt_tags_iter_data {
@@ -287,7 +291,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * We can hit rq == NULL here, because the tagging functions
 	 * test and set the bit before assining ->rqs[].
 	 */
-	rq = tags->rqs[bitnr];
+	rq = tags->rqs[bitnr].rq;
 	if (rq && blk_mq_request_started(rq))
 		return iter_data->fn(rq, iter_data->data, reserved);
 
@@ -317,8 +321,11 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
 		.reserved = reserved,
 	};
 
-	if (tags->rqs)
+	if (tags->rqs) {
+		rcu_read_lock();
 		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
+		rcu_read_unlock();
+	}
 }
 
 /**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..bdd1bfc08c21 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -4,6 +4,11 @@
 
 #include "blk-mq.h"
 
+struct rq_tag_entry {
+	struct request_queue *queue;
+	struct request *rq;
+};
+
 /*
  * Tag address space map.
  */
@@ -16,9 +21,11 @@ struct blk_mq_tags {
 	struct sbitmap_queue bitmap_tags;
 	struct sbitmap_queue breserved_tags;
 
-	struct request **rqs;
+	struct rq_tag_entry *rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
+
+	struct rcu_work rcu_work;
 };
 
 
@@ -78,7 +85,8 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
 static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
 		unsigned int tag, struct request *rq)
 {
-	hctx->tags->rqs[tag] = rq;
+	hctx->tags->rqs[tag].queue = hctx->queue;
+	hctx->tags->rqs[tag].rq = rq;
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2de972857496..4decd1e7d2d9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -298,13 +298,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		rq->tag = -1;
 		rq->internal_tag = tag;
 	} else {
-		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
+		struct blk_mq_hw_ctx *hctx = data->hctx;
+
+		if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
 			rq_flags = RQF_MQ_INFLIGHT;
-			atomic_inc(&data->hctx->nr_active);
+			atomic_inc(&hctx->nr_active);
 		}
 		rq->tag = tag;
 		rq->internal_tag = -1;
-		data->hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 	/* csd/requeue_work/fifo_time is initialized before use */
@@ -797,8 +800,8 @@ EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	if (tag < tags->nr_tags) {
-		prefetch(tags->rqs[tag]);
-		return tags->rqs[tag];
+		prefetch(tags->rqs[tag].rq);
+		return tags->rqs[tag].rq;
 	}
 
 	return NULL;
@@ -809,10 +812,11 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
 			       void *priv, bool reserved)
 {
 	/*
-	 * If we find a request that is inflight and the queue matches,
-	 * we know the queue is busy. Return false to stop the iteration.
+	 * We're only called here if the queue matches. If the rq state is
+	 * inflight, we know the queue is busy. Return false to stop the
+	 * iteration.
 	 */
-	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+	if (rq->state == MQ_RQ_IN_FLIGHT) {
 		bool *busy = priv;
 
 		*busy = true;
@@ -1049,11 +1053,14 @@ bool blk_mq_get_driver_tag(struct request *rq)
 	shared = blk_mq_tag_busy(data.hctx);
 	rq->tag = blk_mq_get_tag(&data);
 	if (rq->tag >= 0) {
+		struct blk_mq_hw_ctx *hctx = data.hctx;
+
 		if (shared) {
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
 			atomic_inc(&data.hctx->nr_active);
 		}
-		data.hctx->tags->rqs[rq->tag] = rq;
+		hctx->tags->rqs[rq->tag].queue = hctx->queue;
+		hctx->tags->rqs[rq->tag].rq = rq;
 	}
 
 done:
@@ -2020,11 +2027,27 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	return cookie;
 }
 
-void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-		     unsigned int hctx_idx)
+static void blk_mq_rcu_free_pages(struct work_struct *work)
 {
+	struct blk_mq_tags *tags = container_of(to_rcu_work(work),
+						struct blk_mq_tags, rcu_work);
 	struct page *page;
 
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
+		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
+		__free_pages(page, page->private);
+	}
+}
+
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx)
+{
 	if (tags->rqs && set->ops->exit_request) {
 		int i;
 
@@ -2038,16 +2061,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		}
 	}
 
-	while (!list_empty(&tags->page_list)) {
-		page = list_first_entry(&tags->page_list, struct page, lru);
-		list_del_init(&page->lru);
-		/*
-		 * Remove kmemleak object previously allocated in
-		 * blk_mq_init_rq_map().
-		 */
-		kmemleak_free(page_address(page));
-		__free_pages(page, page->private);
-	}
+	/* Punt to RCU free, so we don't race with tag iteration */
+	INIT_RCU_WORK(&tags->rcu_work, blk_mq_rcu_free_pages);
+	queue_rcu_work(system_wq, &tags->rcu_work);
 }
 
 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
@@ -2077,7 +2093,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	if (!tags)
 		return NULL;
 
-	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
+	tags->rqs = kcalloc_node(nr_tags, sizeof(struct rq_tag_entry),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 node);
 	if (!tags->rqs) {

-- 
Jens Axboe

next prev parent reply	other threads:[~2018-12-20 21:44 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-19 23:24 v4.20-rc6: Sporadic use-after-free in bt_iter() Bart Van Assche
2018-12-19 23:27 ` Jens Axboe
2018-12-20  0:16   ` Bart Van Assche
2018-12-20  3:17     ` Jens Axboe
2018-12-20  3:24       ` jianchao.wang
2018-12-20  4:19         ` Jens Axboe
2018-12-20  4:32           ` jianchao.wang
2018-12-20  4:48             ` Jens Axboe
2018-12-20  5:03               ` jianchao.wang
2018-12-20 13:02                 ` Jens Axboe
2018-12-20 13:07                   ` Jens Axboe
2018-12-20 18:01                     ` Bart Van Assche
2018-12-20 18:21                       ` Jens Axboe
2018-12-20 18:33                         ` Jens Axboe
2018-12-20 20:56                           ` Bart Van Assche
2018-12-20 21:00                             ` Jens Axboe
2018-12-20 21:23                               ` Bart Van Assche
2018-12-20 21:26                                 ` Jens Axboe
2018-12-20 21:31                                   ` Bart Van Assche
2018-12-20 21:34                                     ` Jens Axboe
2018-12-20 21:40                                       ` Bart Van Assche
2018-12-20 21:44                                         ` Jens Axboe [this message]
2018-12-20 21:48                                           ` Jens Axboe
2018-12-20 22:19                                             ` Bart Van Assche
2018-12-20 22:23                                               ` Jens Axboe
2018-12-20 22:33                                                 ` Jens Axboe
2018-12-20 22:47                                                   ` Jens Axboe
2018-12-20 22:50                                                     ` Jens Axboe
2019-02-14 23:36                                                       ` Bart Van Assche
2019-02-15 18:29                                                         ` Evan Green
2019-02-19 16:48                                                           ` Bart Van Assche
2019-02-21 20:54                                                             ` Evan Green
2019-02-15  2:57                                                       ` jianchao.wang
2018-12-20  4:06 ` Ming Lei

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:2089c6c62f4 dfblob:c39b58391ae dfblob:61deab0b5a5
dfblob:bdd1bfc08c2 dfblob:2de97285749 dfblob:4decd1e7d2d )
 OR (
bs:"Re: v4.20-rc6: Sporadic use-after-free in bt_iter()" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=60b4819c-4c19-a4e3-41f3-e21b0544c9a4@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=jianchao.w.wang@oracle.com \
    --cc=linux-block@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.