* [PATCH v2 1/3] bcache: avoid invalidating buckets in use
@ 2024-11-19 7:40 mingzhe.zou
2024-11-19 7:40 ` [PATCH v2 2/3] bcache: fix io error during cache read race mingzhe.zou
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: mingzhe.zou @ 2024-11-19 7:40 UTC (permalink / raw)
To: colyli; +Cc: linux-bcache, dongsheng.yang, zoumingzhe
From: Mingzhe Zou <mingzhe.zou@easystack.cn>
If the bucket was reused while our bio was in flight, we might
have read the wrong data. Currently, we will reread the data from
the backing device. This not only reduces performance, but also
makes the process more complex.
When the bucket is in use, we hope not to reclaim it.
Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
drivers/md/bcache/alloc.c | 32 +++++++++++++++++++++++---------
drivers/md/bcache/bcache.h | 3 ++-
2 files changed, 25 insertions(+), 10 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index da50f6661bae..18441aa74229 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -134,25 +134,41 @@ bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
!atomic_read(&b->pin) && can_inc_bucket_gen(b));
}
-void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
+bool __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
lockdep_assert_held(&ca->set->bucket_lock);
BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
+ if (!spin_trylock(&b->lock))
+ return false;
+
+ /*
+ * If the bucket was reused while read bio was in flight, it will
+ * reread the data from the backing device. This will increase latency
+ * and cause other errors. When b->pin is not 0, do not invalidate
+ * the bucket.
+ */
+
+ if (atomic_inc_return(&b->pin) > 1) {
+ atomic_dec(&b->pin);
+ spin_unlock(&b->lock);
+ return false;
+ }
+
if (GC_SECTORS_USED(b))
trace_bcache_invalidate(ca, b - ca->buckets);
bch_inc_gen(ca, b);
b->prio = INITIAL_PRIO;
- atomic_inc(&b->pin);
b->reclaimable_in_gc = 0;
+ spin_unlock(&b->lock);
+ return true;
}
static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
{
- __bch_invalidate_one_bucket(ca, b);
-
- fifo_push(&ca->free_inc, b - ca->buckets);
+ if (bch_can_invalidate_bucket(ca, b) && __bch_invalidate_one_bucket(ca, b))
+ fifo_push(&ca->free_inc, b - ca->buckets);
}
/*
@@ -253,8 +269,7 @@ static void invalidate_buckets_fifo(struct cache *ca)
b = ca->buckets + ca->fifo_last_bucket++;
- if (bch_can_invalidate_bucket(ca, b))
- bch_invalidate_one_bucket(ca, b);
+ bch_invalidate_one_bucket(ca, b);
if (++checked >= ca->sb.nbuckets) {
ca->invalidate_needs_gc = 1;
@@ -279,8 +294,7 @@ static void invalidate_buckets_random(struct cache *ca)
b = ca->buckets + n;
- if (bch_can_invalidate_bucket(ca, b))
- bch_invalidate_one_bucket(ca, b);
+ bch_invalidate_one_bucket(ca, b);
if (++checked >= ca->sb.nbuckets / 2) {
ca->invalidate_needs_gc = 1;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 785b0d9008fa..fc7f10c5f222 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -196,6 +196,7 @@
struct bucket {
atomic_t pin;
+ spinlock_t lock;
uint16_t prio;
uint8_t gen;
uint8_t last_gc; /* Most out of date gen in the btree */
@@ -981,7 +982,7 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b);
void bch_rescale_priorities(struct cache_set *c, int sectors);
bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b);
-void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b);
+bool __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b);
void __bch_bucket_free(struct cache *ca, struct bucket *b);
void bch_bucket_free(struct cache_set *c, struct bkey *k);
--
2.34.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v2 2/3] bcache: fix io error during cache read race
2024-11-19 7:40 [PATCH v2 1/3] bcache: avoid invalidating buckets in use mingzhe.zou
@ 2024-11-19 7:40 ` mingzhe.zou
2024-12-20 5:20 ` 邹明哲
2024-11-19 7:40 ` [PATCH v2 3/3] bcache: remove unused parameters mingzhe.zou
2025-01-22 4:42 ` [PATCH v2 1/3] bcache: avoid invalidating buckets in use Coly Li
2 siblings, 1 reply; 7+ messages in thread
From: mingzhe.zou @ 2024-11-19 7:40 UTC (permalink / raw)
To: colyli; +Cc: linux-bcache, dongsheng.yang, zoumingzhe
From: Mingzhe Zou <mingzhe.zou@easystack.cn>
In our production environment, bcache returned IO_ERROR(errno=-5).
These errors always happen during 1M read IO under high pressure
and without any message log. When the error occurred, we stopped
all reading and writing and used 1M read IO to read the entire disk
without any errors. Later we found that cache_read_races of cache_set
is non-zero.
If a large (1M) read bio is split into two or more bios, when one bio
reads dirty data, s->read_dirty_data will be set to true and remain.
If the bucket was reused while our subsequent read bio was in flight,
the read will be unrecoverable (cannot read data from backing).
This patch increases the count for bucket->pin to prevent the bucket
from being reclaimed and reused.
Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
drivers/md/bcache/request.c | 39 ++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index af345dc6fde1..6c41957138e5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -502,12 +502,8 @@ static void bch_cache_read_endio(struct bio *bio)
struct closure *cl = bio->bi_private;
struct search *s = container_of(cl, struct search, cl);
- /*
- * If the bucket was reused while our bio was in flight, we might have
- * read the wrong data. Set s->error but not error so it doesn't get
- * counted against the cache device, but we'll still reread the data
- * from the backing device.
- */
+ BUG_ON(ptr_stale(s->iop.c, &b->key, 0)); // bucket should not be reused
+ atomic_dec(&PTR_BUCKET(s->iop.c, &b->key, 0)->pin);
if (bio->bi_status)
s->iop.status = bio->bi_status;
@@ -520,6 +516,8 @@ static void bch_cache_read_endio(struct bio *bio)
bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
}
+static void backing_request_endio(struct bio *bio);
+
/*
* Read from a single key, handling the initial cache miss if the key starts in
* the middle of the bio
@@ -529,7 +527,6 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
struct search *s = container_of(op, struct search, op);
struct bio *n, *bio = &s->bio.bio;
struct bkey *bio_key;
- unsigned int ptr;
if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
return MAP_CONTINUE;
@@ -553,20 +550,36 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
if (!KEY_SIZE(k))
return MAP_CONTINUE;
- /* XXX: figure out best pointer - for multiple cache devices */
- ptr = 0;
+ /*
+ * If the bucket was reused while our bio was in flight, we might have
+ * read the wrong data. Set s->cache_read_races and reread the data
+ * from the backing device.
+ */
+ spin_lock(&PTR_BUCKET(b->c, k, 0)->lock);
+ if (ptr_stale(s->iop.c, k, 0)) {
+ spin_unlock(&PTR_BUCKET(b->c, k, 0)->lock);
+ atomic_long_inc(&s->iop.c->cache_read_races);
+ pr_warn("%pU cache read race count: %lu", s->iop.c->sb.set_uuid,
+ atomic_long_read(&s->iop.c->cache_read_races));
- PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+ n->bi_end_io = backing_request_endio;
+ n->bi_private = &s->cl;
+
+ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, n, &s->cl);
+ return n == bio ? MAP_DONE : MAP_CONTINUE;
+ }
+ atomic_inc(&PTR_BUCKET(s->iop.c, k, 0)->pin);
+ spin_unlock(&PTR_BUCKET(b->c, k, 0)->lock);
- if (KEY_DIRTY(k))
- s->read_dirty_data = true;
+ PTR_BUCKET(b->c, k, 0)->prio = INITIAL_PRIO;
n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
KEY_OFFSET(k) - bio->bi_iter.bi_sector),
GFP_NOIO, &s->d->bio_split);
bio_key = &container_of(n, struct bbio, bio)->key;
- bch_bkey_copy_single_ptr(bio_key, k, ptr);
+ bch_bkey_copy_single_ptr(bio_key, k, 0);
bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
--
2.34.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* [PATCH v2 3/3] bcache: remove unused parameters
2024-11-19 7:40 [PATCH v2 1/3] bcache: avoid invalidating buckets in use mingzhe.zou
2024-11-19 7:40 ` [PATCH v2 2/3] bcache: fix io error during cache read race mingzhe.zou
@ 2024-11-19 7:40 ` mingzhe.zou
2025-01-22 4:42 ` [PATCH v2 1/3] bcache: avoid invalidating buckets in use Coly Li
2 siblings, 0 replies; 7+ messages in thread
From: mingzhe.zou @ 2024-11-19 7:40 UTC (permalink / raw)
To: colyli; +Cc: linux-bcache, dongsheng.yang, zoumingzhe
From: Mingzhe Zou <mingzhe.zou@easystack.cn>
We have prevented the bucket in use from being reclaimed and reused.
So, search->recoverable and search->read_dirty_data are unused.
Moreover, we do not need to consider that the bucket is reused during
cache reading.
Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
drivers/md/bcache/request.c | 45 +------------------------------------
1 file changed, 1 insertion(+), 44 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 6c41957138e5..d9f0e1f08121 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -484,9 +484,7 @@ struct search {
struct bcache_device *d;
unsigned int insert_bio_sectors;
- unsigned int recoverable:1;
unsigned int write:1;
- unsigned int read_dirty_data:1;
unsigned int cache_missed:1;
struct block_device *orig_bdev;
@@ -507,11 +505,6 @@ static void bch_cache_read_endio(struct bio *bio)
if (bio->bi_status)
s->iop.status = bio->bi_status;
- else if (!KEY_DIRTY(&b->key) &&
- ptr_stale(s->iop.c, &b->key, 0)) {
- atomic_long_inc(&s->iop.c->cache_read_races);
- s->iop.status = BLK_STS_IOERR;
- }
bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
}
@@ -606,7 +599,6 @@ static CLOSURE_CALLBACK(cache_lookup)
{
closure_type(s, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
- struct cached_dev *dc;
int ret;
bch_btree_op_init(&s->op, -1);
@@ -630,12 +622,6 @@ static CLOSURE_CALLBACK(cache_lookup)
*/
if (ret < 0) {
BUG_ON(ret == -EINTR);
- if (s->d && s->d->c &&
- !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
- dc = container_of(s->d, struct cached_dev, disk);
- if (dc && atomic_read(&dc->has_dirty))
- s->recoverable = false;
- }
if (!s->iop.status)
s->iop.status = BLK_STS_IOERR;
}
@@ -651,10 +637,7 @@ static void request_endio(struct bio *bio)
if (bio->bi_status) {
struct search *s = container_of(cl, struct search, cl);
-
s->iop.status = bio->bi_status;
- /* Only cache read errors are recoverable */
- s->recoverable = false;
}
bio_put(bio);
@@ -684,7 +667,6 @@ static void backing_request_endio(struct bio *bio)
/* set to orig_bio->bi_status in bio_complete() */
s->iop.status = bio->bi_status;
}
- s->recoverable = false;
/* should count I/O error for backing device here */
bch_count_backing_io_errors(dc, bio);
}
@@ -755,9 +737,7 @@ static inline struct search *search_alloc(struct bio *bio,
s->cache_miss = NULL;
s->cache_missed = 0;
s->d = d;
- s->recoverable = 1;
s->write = op_is_write(bio_op(bio));
- s->read_dirty_data = 0;
/* Count on the bcache device */
s->orig_bdev = orig_bdev;
s->start_time = start_time;
@@ -802,29 +782,6 @@ static CLOSURE_CALLBACK(cached_dev_read_error_done)
static CLOSURE_CALLBACK(cached_dev_read_error)
{
- closure_type(s, struct search, cl);
- struct bio *bio = &s->bio.bio;
-
- /*
- * If read request hit dirty data (s->read_dirty_data is true),
- * then recovery a failed read request from cached device may
- * get a stale data back. So read failure recovery is only
- * permitted when read request hit clean data in cache device,
- * or when cache read race happened.
- */
- if (s->recoverable && !s->read_dirty_data) {
- /* Retry from the backing device: */
- trace_bcache_read_retry(s->orig_bio);
-
- s->iop.status = 0;
- do_bio_hook(s, s->orig_bio, backing_request_endio);
-
- /* XXX: invalidate cache */
-
- /* I/O request sent to backing device */
- closure_bio_submit(s->iop.c, bio, cl);
- }
-
continue_at(cl, cached_dev_read_error_done, NULL);
}
@@ -870,7 +827,7 @@ static CLOSURE_CALLBACK(cached_dev_read_done)
s->cache_miss = NULL;
}
- if (verify(dc) && s->recoverable && !s->read_dirty_data)
+ if (verify(dc))
bch_data_verify(dc, s->orig_bio);
closure_get(&dc->disk.cl);
--
2.34.1
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH v2 2/3] bcache: fix io error during cache read race
2024-11-19 7:40 ` [PATCH v2 2/3] bcache: fix io error during cache read race mingzhe.zou
@ 2024-12-20 5:20 ` 邹明哲
2024-12-21 16:17 ` [PATCH " colyli
0 siblings, 1 reply; 7+ messages in thread
From: 邹明哲 @ 2024-12-20 5:20 UTC (permalink / raw)
To: colyli; +Cc: linux-bcache, zoumingzhe
Hi, Coly:
Our users have reported this issue to us in their production environment!
Please review these patches and provide feedback.
Thank you very much.
mingzhe
Original:
From:mingzhe.zou<mingzhe.zou@easystack.cn>
Date:2024-11-19 15:40:30(中国 (GMT+08:00))
To:colyli<colyli@suse.de>
Cc:linux-bcache<linux-bcache@vger.kernel.org> , dongsheng.yang<dongsheng.yang@easystack.cn> , zoumingzhe<zoumingzhe@qq.com>
Subject:[PATCH v2 2/3] bcache: fix io error during cache read race
From: Mingzhe Zou <mingzhe.zou@easystack.cn>
In our production environment, bcache returned IO_ERROR(errno=-5).
These errors always happen during 1M read IO under high pressure
and without any message log. When the error occurred, we stopped
all reading and writing and used 1M read IO to read the entire disk
without any errors. Later we found that cache_read_races of cache_set
is non-zero.
If a large (1M) read bio is split into two or more bios, when one bio
reads dirty data, s->read_dirty_data will be set to true and remain.
If the bucket was reused while our subsequent read bio was in flight,
the read will be unrecoverable(cannot read data from backing).
This patch increases the count for bucket->pin to prevent the bucket
from being reclaimed and reused.
Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
drivers/md/bcache/request.c | 39 ++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index af345dc6fde1..6c41957138e5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -502,12 +502,8 @@ static void bch_cache_read_endio(struct bio *bio)
struct closure *cl = bio->bi_private;
struct search *s = container_of(cl, struct search, cl);
- /*
- * If the bucket was reused while our bio was in flight, we might have
- * read the wrong data. Set s->error but not error so it doesn't get
- * counted against the cache device, but we'll still reread the data
- * from the backing device.
- */
+ BUG_ON(ptr_stale(s->iop.c, &b->key, 0)); // bucket should not be reused
+ atomic_dec(&PTR_BUCKET(s->iop.c, &b->key, 0)->pin);
if (bio->bi_status)
s->iop.status = bio->bi_status;
@@ -520,6 +516,8 @@ static void bch_cache_read_endio(struct bio *bio)
bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
}
+static void backing_request_endio(struct bio *bio);
+
/*
* Read from a single key, handling the initial cache miss if the key starts in
* the middle of the bio
@@ -529,7 +527,6 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
struct search *s = container_of(op, struct search, op);
struct bio *n, *bio = &s->bio.bio;
struct bkey *bio_key;
- unsigned int ptr;
if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
return MAP_CONTINUE;
@@ -553,20 +550,36 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
if (!KEY_SIZE(k))
return MAP_CONTINUE;
- /* XXX: figure out best pointer - for multiple cache devices */
- ptr = 0;
+ /*
+ * If the bucket was reused while our bio was in flight, we might have
+ * read the wrong data. Set s->cache_read_races and reread the data
+ * from the backing device.
+ */
+ spin_lock(&PTR_BUCKET(b->c, k, 0)->lock);
+ if (ptr_stale(s->iop.c, k, 0)) {
+ spin_unlock(&PTR_BUCKET(b->c, k, 0)->lock);
+ atomic_long_inc(&s->iop.c->cache_read_races);
+ pr_warn("%pU cache read race count: %lu", s->iop.c->sb.set_uuid,
+ atomic_long_read(&s->iop.c->cache_read_races));
- PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
+ n->bi_end_io = backing_request_endio;
+ n->bi_private = &s->cl;
+
+ /* I/O request sent to backing device */
+ closure_bio_submit(s->iop.c, n, &s->cl);
+ return n == bio ? MAP_DONE : MAP_CONTINUE;
+ }
+ atomic_inc(&PTR_BUCKET(s->iop.c, k, 0)->pin);
+ spin_unlock(&PTR_BUCKET(b->c, k, 0)->lock);
- if (KEY_DIRTY(k))
- s->read_dirty_data = true;
+ PTR_BUCKET(b->c, k, 0)->prio = INITIAL_PRIO;
n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
KEY_OFFSET(k) - bio->bi_iter.bi_sector),
GFP_NOIO, &s->d->bio_split);
bio_key = &container_of(n, struct bbio, bio)->key;
- bch_bkey_copy_single_ptr(bio_key, k, ptr);
+ bch_bkey_copy_single_ptr(bio_key, k, 0);
bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);
--
2.34.1
</mingzhe.zou@easystack.cn></mingzhe.zou@easystack.cn></zoumingzhe@qq.com></dongsheng.yang@easystack.cn></linux-bcache@vger.kernel.org></colyli@suse.de></mingzhe.zou@easystack.cn>
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH v2 2/3] bcache: fix io error during cache read race
2024-12-20 5:20 ` 邹明哲
@ 2024-12-21 16:17 ` colyli
2024-12-24 2:38 ` 邹明哲
0 siblings, 1 reply; 7+ messages in thread
From: colyli @ 2024-12-21 16:17 UTC (permalink / raw)
To: 邹明哲; +Cc: linux-bcache, zoumingzhe
在 2024-12-20 13:20,邹明哲 写道:
> Hi, Coly:
>
> Our users have reported this issue to us in their generation
> environment!
>
> Please review these patches and provide feedback.
>
> Thank you very much.
Hi Mingzhe,
Yes, it is planned for next week; I will start to look at this series.
BTW, I don't see change log from the v1 to v2 series. Can I assume the
v2 series fix warning reported by kernel test robot?
Thanks.
Coly Li
>
> mingzhe
>
> Original:
> From:mingzhe.zou<mingzhe.zou@easystack.cn>
> Date:2024-11-19 15:40:30(中国 (GMT+08:00))
> To:colyli<colyli@suse.de>
> Cc:linux-bcache<linux-bcache@vger.kernel.org> ,
> dongsheng.yang<dongsheng.yang@easystack.cn> ,
> zoumingzhe<zoumingzhe@qq.com>
> Subject:[PATCH v2 2/3] bcache: fix io error during cache read race
> From: Mingzhe Zou <mingzhe.zou@easystack.cn>
>
> In our production environment, bcache returned IO_ERROR(errno=-5).
> These errors always happen during 1M read IO under high pressure
> and without any message log. When the error occurred, we stopped
> all reading and writing and used 1M read IO to read the entire disk
> without any errors. Later we found that cache_read_races of cache_set
> is non-zero.
>
> If a large (1M) read bio is split into two or more bios, when one bio
> reads dirty data, s->read_dirty_data will be set to true and remain.
> If the bucket was reused while our subsequent read bio was in flight,
> the read will be unrecoverable(cannot read data from backing).
>
> This patch increases the count for bucket->pin to prevent the bucket
> from being reclaimed and reused.
>
> Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
> ---
> drivers/md/bcache/request.c | 39 ++++++++++++++++++++++++-------------
> 1 file changed, 26 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index af345dc6fde1..6c41957138e5 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -502,12 +502,8 @@ static void bch_cache_read_endio(struct bio *bio)
> struct closure *cl = bio->bi_private;
> struct search *s = container_of(cl, struct search, cl);
>
> - /*
> - * If the bucket was reused while our bio was in flight, we might
> have
> - * read the wrong data. Set s->error but not error so it doesn't
> get
> - * counted against the cache device, but we'll still reread the data
> - * from the backing device.
> - */
> + BUG_ON(ptr_stale(s->iop.c, &b->key, 0)); // bucket should
> not be reused
> + atomic_dec(&PTR_BUCKET(s->iop.c, &b->key, 0)->pin);
>
> if (bio->bi_status)
> s->iop.status = bio->bi_status;
> @@ -520,6 +516,8 @@ static void bch_cache_read_endio(struct bio *bio)
> bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from
> cache");
> }
>
> +static void backing_request_endio(struct bio *bio);
> +
> /*
> * Read from a single key, handling the initial cache miss if the key
> starts in
> * the middle of the bio
> @@ -529,7 +527,6 @@ static int cache_lookup_fn(struct btree_op *op,
> struct btree *b, struct bkey *k)
> struct search *s = container_of(op, struct search, op);
> struct bio *n, *bio = &s->bio.bio;
> struct bkey *bio_key;
> - unsigned int ptr;
>
> if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector,
> 0)) <= 0)
> return MAP_CONTINUE;
> @@ -553,20 +550,36 @@ static int cache_lookup_fn(struct btree_op *op,
> struct btree *b, struct bkey *k)
> if (!KEY_SIZE(k))
> return MAP_CONTINUE;
>
> - /* XXX: figure out best pointer - for multiple cache devices */
> - ptr = 0;
> + /*
> + * If the bucket was reused while our bio was in flight, we might
> have
> + * read the wrong data. Set s->cache_read_races and reread the
> data
> + * from the backing device.
> + */
> + spin_lock(&PTR_BUCKET(b->c, k, 0)->lock);
> + if (ptr_stale(s->iop.c, k, 0)) {
> + spin_unlock(&PTR_BUCKET(b->c, k, 0)->lock);
> + atomic_long_inc(&s->iop.c->cache_read_races);
> + pr_warn("%pU cache read race count: %lu",
> s->iop.c->sb.set_uuid,
> + atomic_long_read(&s->iop.c->cache_read_races));
>
> - PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
> + n->bi_end_io = backing_request_endio;
> + n->bi_private = &s->cl;
> +
> + /* I/O request sent to backing device */
> + closure_bio_submit(s->iop.c, n, &s->cl);
> + return n == bio ? MAP_DONE : MAP_CONTINUE;
> + }
> + atomic_inc(&PTR_BUCKET(s->iop.c, k, 0)->pin);
> + spin_unlock(&PTR_BUCKET(b->c, k, 0)->lock);
>
> - if (KEY_DIRTY(k))
> - s->read_dirty_data = true;
> + PTR_BUCKET(b->c, k, 0)->prio = INITIAL_PRIO;
>
> n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
> KEY_OFFSET(k) - bio->bi_iter.bi_sector),
> GFP_NOIO, &s->d->bio_split);
>
> bio_key = &container_of(n, struct bbio, bio)->key;
> - bch_bkey_copy_single_ptr(bio_key, k, ptr);
> + bch_bkey_copy_single_ptr(bio_key, k, 0);
>
> bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0),
> bio_key);
> bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0),
> bio_key);
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2 2/3] bcache: fix io error during cache read race
2024-12-21 16:17 ` [PATCH " colyli
@ 2024-12-24 2:38 ` 邹明哲
0 siblings, 0 replies; 7+ messages in thread
From: 邹明哲 @ 2024-12-24 2:38 UTC (permalink / raw)
To: colyli; +Cc: linux-bcache, zoumingzhe
Hi Coly Li
Yes, it mainly fixes the warning reported by the kernel test robot.
mingzhe
Original:
From:colyli <colyli@suse.de>
Date:2024-12-22 00:17:06(中国 (GMT+08:00))
To:邹明哲<mingzhe.zou@easystack.cn>
Cc:linux-bcache <linux-bcache@vger.kernel.org> , zoumingzhe <zoumingzhe@qq.com>
Subject:Re: [PATCH v2 2/3] bcache: fix io error during cache read race
在 2024-12-20 13:20,邹明哲 写道:
> Hi, Coly:
>
> Our users have reported this issue to us in their generation
> environment!
>
> Please review these patches and provide feedback.
>
> Thank you very much.
Hi Mingzhe,
Yes, it is planned for next week; I will start to look at this series.
BTW, I don't see change log from the v1 to v2 series. Can I assume the
v2 series fix warning reported by kernel test robot?
Thanks.
Coly Li
>
> mingzhe
>
> Original:
> From:mingzhe.zou<mingzhe.zou@easystack.cn>
> Date:2024-11-19 15:40:30(中国 (GMT+08:00))
> To:colyli<colyli@suse.de>
> Cc:linux-bcache<linux-bcache@vger.kernel.org> ,
> dongsheng.yang<dongsheng.yang@easystack.cn> ,
> zoumingzhe<zoumingzhe@qq.com>
> Subject:[PATCH v2 2/3] bcache: fix io error during cache read race
> From: Mingzhe Zou <mingzhe.zou@easystack.cn>
>
> In our production environment, bcache returned IO_ERROR(errno=-5).
> These errors always happen during 1M read IO under high pressure
> and without any message log. When the error occurred, we stopped
> all reading and writing and used 1M read IO to read the entire disk
> without any errors. Later we found that cache_read_races of cache_set
> is non-zero.
>
> If a large (1M) read bio is split into two or more bios, when one bio
> reads dirty data, s->read_dirty_data will be set to true and remain.
> If the bucket was reused while our subsequent read bio was in flight,
> the read will be unrecoverable(cannot read data from backing).
>
> This patch increases the count for bucket->pin to prevent the bucket
> from being reclaimed and reused.
>
> Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
> ---
> drivers/md/bcache/request.c | 39 ++++++++++++++++++++++++-------------
> 1 file changed, 26 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index af345dc6fde1..6c41957138e5 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -502,12 +502,8 @@ static void bch_cache_read_endio(struct bio *bio)
> struct closure *cl = bio->bi_private;
> struct search *s = container_of(cl, struct search, cl);
>
> - /*
> - * If the bucket was reused while our bio was in flight, we might
> have
> - * read the wrong data. Set s->error but not error so it doesn't
> get
> - * counted against the cache device, but we'll still reread the data
> - * from the backing device.
> - */
> + BUG_ON(ptr_stale(s->iop.c, &b->key, 0)); // bucket should
> not be reused
> + atomic_dec(&PTR_BUCKET(s->iop.c, &b->key, 0)->pin);
>
> if (bio->bi_status)
> s->iop.status = bio->bi_status;
> @@ -520,6 +516,8 @@ static void bch_cache_read_endio(struct bio *bio)
> bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from
> cache");
> }
>
> +static void backing_request_endio(struct bio *bio);
> +
> /*
> * Read from a single key, handling the initial cache miss if the key
> starts in
> * the middle of the bio
> @@ -529,7 +527,6 @@ static int cache_lookup_fn(struct btree_op *op,
> struct btree *b, struct bkey *k)
> struct search *s = container_of(op, struct search, op);
> struct bio *n, *bio = &s->bio.bio;
> struct bkey *bio_key;
> - unsigned int ptr;
>
> if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector,
> 0)) <= 0)
> return MAP_CONTINUE;
> @@ -553,20 +550,36 @@ static int cache_lookup_fn(struct btree_op *op,
> struct btree *b, struct bkey *k)
> if (!KEY_SIZE(k))
> return MAP_CONTINUE;
>
> - /* XXX: figure out best pointer - for multiple cache devices */
> - ptr = 0;
> + /*
> + * If the bucket was reused while our bio was in flight, we might
> have
> + * read the wrong data. Set s->cache_read_races and reread the
> data
> + * from the backing device.
> + */
> + spin_lock(&PTR_BUCKET(b->c, k, 0)->lock);
> + if (ptr_stale(s->iop.c, k, 0)) {
> + spin_unlock(&PTR_BUCKET(b->c, k, 0)->lock);
> + atomic_long_inc(&s->iop.c->cache_read_races);
> + pr_warn("%pU cache read race count: %lu",
> s->iop.c->sb.set_uuid,
> + atomic_long_read(&s->iop.c->cache_read_races));
>
> - PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;
> + n->bi_end_io = backing_request_endio;
> + n->bi_private = &s->cl;
> +
> + /* I/O request sent to backing device */
> + closure_bio_submit(s->iop.c, n, &s->cl);
> + return n == bio ? MAP_DONE : MAP_CONTINUE;
> + }
> + atomic_inc(&PTR_BUCKET(s->iop.c, k, 0)->pin);
> + spin_unlock(&PTR_BUCKET(b->c, k, 0)->lock);
>
> - if (KEY_DIRTY(k))
> - s->read_dirty_data = true;
> + PTR_BUCKET(b->c, k, 0)->prio = INITIAL_PRIO;
>
> n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
> KEY_OFFSET(k) - bio->bi_iter.bi_sector),
> GFP_NOIO, &s->d->bio_split);
>
> bio_key = &container_of(n, struct bbio, bio)->key;
> - bch_bkey_copy_single_ptr(bio_key, k, ptr);
> + bch_bkey_copy_single_ptr(bio_key, k, 0);
>
> bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0),
> bio_key);
> bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0),
> bio_key);
</mingzhe.zou@easystack.cn></mingzhe.zou@easystack.cn></zoumingzhe@qq.com></dongsheng.yang@easystack.cn></linux-bcache@vger.kernel.org></colyli@suse.de></mingzhe.zou@easystack.cn></zoumingzhe@qq.com></linux-bcache@vger.kernel.org></mingzhe.zou@easystack.cn></colyli@suse.de>
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2 1/3] bcache: avoid invalidating buckets in use
2024-11-19 7:40 [PATCH v2 1/3] bcache: avoid invalidating buckets in use mingzhe.zou
2024-11-19 7:40 ` [PATCH v2 2/3] bcache: fix io error during cache read race mingzhe.zou
2024-11-19 7:40 ` [PATCH v2 3/3] bcache: remove unused parameters mingzhe.zou
@ 2025-01-22 4:42 ` Coly Li
2 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2025-01-22 4:42 UTC (permalink / raw)
To: mingzhe.zou; +Cc: linux-bcache, dongsheng.yang, zoumingzhe
On Tue, Nov 19, 2024 at 03:40:29PM +0800, mingzhe.zou@easystack.cn wrote:
> From: Mingzhe Zou <mingzhe.zou@easystack.cn>
>
> If the bucket was reused while our bio was in flight, we might
> have read the wrong data. Currently, we will reread the data from
> the backing device. This not only reduces performance, but also
> makes the process more complex.
>
> When the bucket is in use, we hope not to reclaim it.
>
No please don't do this.
This is the essential design of bcache buckets: concurrent access
is expected and permitted. Locking the bucket and prohibiting a detectable
race is a huge change to the whole code logic, and may introduce potential
issues which I am not able to identify immediately.
I know the upper layer code or application should retry when
an I/O error code is received, but in most cases it is very hard to
modify the deployed software. Maybe a feasible fix is to set the error
code to AGAIN, and before returning to the upper layer code, if the error
code is AGAIN rather than ERROR, re-submit the bio again from a worker.
Anyway, this is just a quite rough idea; I have not thought it over
carefully.
Thanks.
Coly Li
> Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
> ---
> drivers/md/bcache/alloc.c | 32 +++++++++++++++++++++++---------
> drivers/md/bcache/bcache.h | 3 ++-
> 2 files changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
> index da50f6661bae..18441aa74229 100644
> --- a/drivers/md/bcache/alloc.c
> +++ b/drivers/md/bcache/alloc.c
> @@ -134,25 +134,41 @@ bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
> !atomic_read(&b->pin) && can_inc_bucket_gen(b));
> }
>
> -void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
> +bool __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
> {
> lockdep_assert_held(&ca->set->bucket_lock);
> BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE);
>
> + if (!spin_trylock(&b->lock))
> + return false;
> +
> + /*
> + * If the bucket was reused while read bio was in flight, it will
> + * reread the data from the backing device. This will increase latency
> + * and cause other errors. When b->pin is not 0, do not invalidate
> + * the bucket.
> + */
> +
> + if (atomic_inc_return(&b->pin) > 1) {
> + atomic_dec(&b->pin);
> + spin_unlock(&b->lock);
> + return false;
> + }
> +
> if (GC_SECTORS_USED(b))
> trace_bcache_invalidate(ca, b - ca->buckets);
>
> bch_inc_gen(ca, b);
> b->prio = INITIAL_PRIO;
> - atomic_inc(&b->pin);
> b->reclaimable_in_gc = 0;
> + spin_unlock(&b->lock);
> + return true;
> }
>
> static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
> {
> - __bch_invalidate_one_bucket(ca, b);
> -
> - fifo_push(&ca->free_inc, b - ca->buckets);
> + if (bch_can_invalidate_bucket(ca, b) && __bch_invalidate_one_bucket(ca, b))
> + fifo_push(&ca->free_inc, b - ca->buckets);
> }
>
> /*
> @@ -253,8 +269,7 @@ static void invalidate_buckets_fifo(struct cache *ca)
>
> b = ca->buckets + ca->fifo_last_bucket++;
>
> - if (bch_can_invalidate_bucket(ca, b))
> - bch_invalidate_one_bucket(ca, b);
> + bch_invalidate_one_bucket(ca, b);
>
> if (++checked >= ca->sb.nbuckets) {
> ca->invalidate_needs_gc = 1;
> @@ -279,8 +294,7 @@ static void invalidate_buckets_random(struct cache *ca)
>
> b = ca->buckets + n;
>
> - if (bch_can_invalidate_bucket(ca, b))
> - bch_invalidate_one_bucket(ca, b);
> + bch_invalidate_one_bucket(ca, b);
>
> if (++checked >= ca->sb.nbuckets / 2) {
> ca->invalidate_needs_gc = 1;
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 785b0d9008fa..fc7f10c5f222 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -196,6 +196,7 @@
>
> struct bucket {
> atomic_t pin;
> + spinlock_t lock;
> uint16_t prio;
> uint8_t gen;
> uint8_t last_gc; /* Most out of date gen in the btree */
> @@ -981,7 +982,7 @@ uint8_t bch_inc_gen(struct cache *ca, struct bucket *b);
> void bch_rescale_priorities(struct cache_set *c, int sectors);
>
> bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b);
> -void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b);
> +bool __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b);
>
> void __bch_bucket_free(struct cache *ca, struct bucket *b);
> void bch_bucket_free(struct cache_set *c, struct bkey *k);
> --
> 2.34.1
>
>
--
Coly Li
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2025-01-22 4:43 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-11-19 7:40 [PATCH v2 1/3] bcache: avoid invalidating buckets in use mingzhe.zou
2024-11-19 7:40 ` [PATCH v2 2/3] bcache: fix io error during cache read race mingzhe.zou
2024-12-20 5:20 ` 邹明哲
2024-12-21 16:17 ` [PATCH " colyli
2024-12-24 2:38 ` 邹明哲
2024-11-19 7:40 ` [PATCH v2 3/3] bcache: remove unused parameters mingzhe.zou
2025-01-22 4:42 ` [PATCH v2 1/3] bcache: avoid invalidating buckets in use Coly Li
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox